From be1b7109352111cbf0c2dc0ffd2a90a9f53aeeb5 Mon Sep 17 00:00:00 2001
From: Alex Crichton <alex@alexcrichton.com>
Date: Fri, 22 Dec 2017 11:31:28 -0800
Subject: [PATCH] Verify Intel intrinsics against upstream definitions

This commit adds a new crate for testing that the intrinsics listed in this
crate do indeed match the upstream definition of each intrinsic. A
pre-downloaded XML description of all Intel intrinsics is checked in which is
then parsed in the `stdsimd-verify` crate to verify that everything we write
down is matched against the upstream definitions.

Currently the checks are pretty loose to get this compiling but a few intrinsics
were fixed as a result of this. For example:

* `_mm256_extract_epi8` - AVX2 intrinsic erroneously listed under AVX
* `_mm256_extract_epi16` - AVX2 intrinsic erroneously listed under AVX
* `_mm256_extract_epi32` - AVX2 intrinsic erroneously listed under AVX
* `_mm256_extract_epi64` - AVX2 intrinsic erroneously listed under AVX
* `_mm_tzcnt_32` - erroneously had `u32` in the name
* `_mm_tzcnt_64` - erroneously had `u64` in the name
* `_mm_cvtsi64_si128` - erroneously available on 32-bit platforms
* `_mm_cvtsi64x_si128` - erroneously available on 32-bit platforms
* `_mm_cvtsi128_si64` - erroneously available on 32-bit platforms
* `_mm_cvtsi128_si64x` - erroneously available on 32-bit platforms
* `_mm_extract_epi64` - erroneously available on 32-bit platforms
* `_mm_insert_epi64` - erroneously available on 32-bit platforms
* `_mm256_extract_epi16` - erroneously returned i32 instead of i16
* `_mm256_extract_epi8` - erroneously returned i32 instead of i8
* `_mm_shuffle_ps` - the mask argument was erroneously i32 instead of u32
* `_popcnt32` - the signedness of the argument and return were flipped
* `_popcnt64` - the signedness of the argument was flipped and the argument
  was too large bit-wise
* `_mm_tzcnt_32` - the return value's sign was flipped
* `_mm_tzcnt_64` - the return value's sign was flipped
* A good number of intrinsics used `imm8: i8` or `imm8: u8` instead of `imm8:
  i32` which Intel was using. (we were also internally inconsistent)
* A number of intrinsics working with `__m64` were instead working with i64/u64,
  so they're now corrected to operate with the vector types instead.

Currently the verifications performed are:

* Each name in Rust is defined in the XML document
* The arguments/return values all agree.
* The CPUID features listed in the XML document are all enabled in Rust as well.

The type matching right now is pretty loose and has a lot of questionable
changes. Future commits will touch these up to be more strict and require closer
adherence with Intel's own types. Otherwise types like `i32x8` (or any integers
with 256 bits) all match up to `__m256i` right now, although this may want to
change in the future.

Finally we're also not testing the instruction listed in the XML right now.
There's a huge number of discrepancies between the instruction listed in the XML
and the instruction listed in `assert_instr`, and those'll need to be taken care
of in a future commit.

Closes #240
---
 .travis.yml                       |      2 +
 Cargo.toml                        |      1 +
 ci/run.sh                         |      4 +-
 coresimd/src/x86/i586/abm.rs      |     16 +-
 coresimd/src/x86/i586/avx.rs      |    165 +-
 coresimd/src/x86/i586/avx2.rs     |    146 +-
 coresimd/src/x86/i586/bmi.rs      |     29 +-
 coresimd/src/x86/i586/bmi2.rs     |      4 +-
 coresimd/src/x86/i586/sse.rs      |     43 +-
 coresimd/src/x86/i586/sse2.rs     |      7 +-
 coresimd/src/x86/i586/sse41.rs    |     20 +-
 coresimd/src/x86/i586/sse42.rs    |     60 +-
 coresimd/src/x86/i586/xsave.rs    |     14 +-
 coresimd/src/x86/i686/mmx.rs      |      2 +-
 coresimd/src/x86/i686/sse.rs      |     16 +-
 coresimd/src/x86/i686/sse2.rs     |     66 +-
 coresimd/src/x86/i686/sse41.rs    |     44 -
 coresimd/src/x86/i686/ssse3.rs    |      2 +-
 coresimd/src/x86/x86_64/mod.rs    |      3 +
 coresimd/src/x86/x86_64/sse2.rs   |     46 +
 coresimd/src/x86/x86_64/sse41.rs  |     49 +
 coresimd/src/x86/x86_64/xsave.rs  |     12 +-
 stdsimd-verify/.gitattributes     |      1 +
 stdsimd-verify/Cargo.toml         |     18 +
 stdsimd-verify/build.rs           |     24 +
 stdsimd-verify/src/lib.rs         |    248 +
 stdsimd-verify/tests/x86-intel.rs |    295 +
 stdsimd-verify/x86-intel.xml      | 134861 +++++++++++++++++++++++++++
 28 files changed, 135819 insertions(+), 379 deletions(-)
 create mode 100644 coresimd/src/x86/x86_64/sse41.rs
 create mode 100644 stdsimd-verify/.gitattributes
 create mode 100644 stdsimd-verify/Cargo.toml
 create mode 100644 stdsimd-verify/build.rs
 create mode 100644 stdsimd-verify/src/lib.rs
 create mode 100644 stdsimd-verify/tests/x86-intel.rs
 create mode 100644 stdsimd-verify/x86-intel.xml

diff --git a/.travis.yml b/.travis.yml
index 117f56f173..b0d424b637 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -24,6 +24,8 @@ matrix:
     - env: DOCUMENTATION
       install: true
       script: ci/dox.sh
+    - script: cargo test --manifest-path stdsimd-verify/Cargo.toml
+      install: true
     - env: RUSTFMT=On TARGET=x86_64-unknown-linux-gnu NO_ADD=1
       script: |
         cargo install rustfmt-nightly --force
diff --git a/Cargo.toml b/Cargo.toml
index b64b2f8e67..93ce1939e4 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -12,6 +12,7 @@ categories = ["hardware-support"]
 license = "MIT/Apache-2.0"
 
 [workspace]
+members = ["stdsimd-verify"]
 
 [badges]
 travis-ci = { repository = "BurntSushi/stdsimd" }
diff --git a/ci/run.sh b/ci/run.sh
index 51734e5fce..337b9e8176 100755
--- a/ci/run.sh
+++ b/ci/run.sh
@@ -22,7 +22,9 @@ echo "FEATURES=${FEATURES}"
 echo "OBJDUMP=${OBJDUMP}"
 
 cargo_test() {
-    cmd="cargo test --all --target=$TARGET --features $FEATURES --verbose $1 -- --nocapture $2"
+    cmd="cargo test --target=$TARGET --features $FEATURES $1"
+    cmd="$cmd -p coresimd -p stdsimd"
+    cmd="$cmd -- $2"
     $cmd
 }
 
diff --git a/coresimd/src/x86/i586/abm.rs b/coresimd/src/x86/i586/abm.rs
index 52dc991a84..2ca2cc1cc3 100644
--- a/coresimd/src/x86/i586/abm.rs
+++ b/coresimd/src/x86/i586/abm.rs
@@ -44,16 +44,16 @@ pub unsafe fn _lzcnt_u64(x: u64) -> u64 {
 #[inline(always)]
 #[target_feature = "+popcnt"]
 #[cfg_attr(test, assert_instr(popcnt))]
-pub unsafe fn _popcnt32(x: u32) -> u32 {
-    x.count_ones()
+pub unsafe fn _popcnt32(x: i32) -> i32 {
+    x.count_ones() as i32
 }
 
 /// Counts the bits that are set.
 #[inline(always)]
 #[target_feature = "+popcnt"]
 #[cfg_attr(test, assert_instr(popcnt))]
-pub unsafe fn _popcnt64(x: u64) -> u64 {
-    x.count_ones() as u64
+pub unsafe fn _popcnt64(x: i64) -> i32 {
+    x.count_ones() as i32
 }
 
 #[cfg(test)]
@@ -64,21 +64,21 @@ mod tests {
 
     #[simd_test = "lzcnt"]
     unsafe fn _lzcnt_u32() {
-        assert_eq!(abm::_lzcnt_u32(0b0101_1010u32), 25u32);
+        assert_eq!(abm::_lzcnt_u32(0b0101_1010), 25);
     }
 
     #[simd_test = "lzcnt"]
     unsafe fn _lzcnt_u64() {
-        assert_eq!(abm::_lzcnt_u64(0b0101_1010u64), 57u64);
+        assert_eq!(abm::_lzcnt_u64(0b0101_1010), 57);
     }
 
     #[simd_test = "popcnt"]
     unsafe fn _popcnt32() {
-        assert_eq!(abm::_popcnt32(0b0101_1010u32), 4);
+        assert_eq!(abm::_popcnt32(0b0101_1010), 4);
     }
 
     #[simd_test = "popcnt"]
     unsafe fn _popcnt64() {
-        assert_eq!(abm::_popcnt64(0b0101_1010u64), 4);
+        assert_eq!(abm::_popcnt64(0b0101_1010), 4);
     }
 }
diff --git a/coresimd/src/x86/i586/avx.rs b/coresimd/src/x86/i586/avx.rs
index 6ea508bb88..efa842d9e2 100644
--- a/coresimd/src/x86/i586/avx.rs
+++ b/coresimd/src/x86/i586/avx.rs
@@ -607,69 +607,69 @@ pub unsafe fn _mm256_xor_ps(a: f32x8, b: f32x8) -> f32x8 {
 }
 
 /// Equal (ordered, non-signaling)
-pub const _CMP_EQ_OQ: u8 = 0x00;
+pub const _CMP_EQ_OQ: i32 = 0x00;
 /// Less-than (ordered, signaling)
-pub const _CMP_LT_OS: u8 = 0x01;
+pub const _CMP_LT_OS: i32 = 0x01;
 /// Less-than-or-equal (ordered, signaling)
-pub const _CMP_LE_OS: u8 = 0x02;
+pub const _CMP_LE_OS: i32 = 0x02;
 /// Unordered (non-signaling)
-pub const _CMP_UNORD_Q: u8 = 0x03;
+pub const _CMP_UNORD_Q: i32 = 0x03;
 /// Not-equal (unordered, non-signaling)
-pub const _CMP_NEQ_UQ: u8 = 0x04;
+pub const _CMP_NEQ_UQ: i32 = 0x04;
 /// Not-less-than (unordered, signaling)
-pub const _CMP_NLT_US: u8 = 0x05;
+pub const _CMP_NLT_US: i32 = 0x05;
 /// Not-less-than-or-equal (unordered, signaling)
-pub const _CMP_NLE_US: u8 = 0x06;
+pub const _CMP_NLE_US: i32 = 0x06;
 /// Ordered (non-signaling)
-pub const _CMP_ORD_Q: u8 = 0x07;
+pub const _CMP_ORD_Q: i32 = 0x07;
 /// Equal (unordered, non-signaling)
-pub const _CMP_EQ_UQ: u8 = 0x08;
+pub const _CMP_EQ_UQ: i32 = 0x08;
 /// Not-greater-than-or-equal (unordered, signaling)
-pub const _CMP_NGE_US: u8 = 0x09;
+pub const _CMP_NGE_US: i32 = 0x09;
 /// Not-greater-than (unordered, signaling)
-pub const _CMP_NGT_US: u8 = 0x0a;
+pub const _CMP_NGT_US: i32 = 0x0a;
 /// False (ordered, non-signaling)
-pub const _CMP_FALSE_OQ: u8 = 0x0b;
+pub const _CMP_FALSE_OQ: i32 = 0x0b;
 /// Not-equal (ordered, non-signaling)
-pub const _CMP_NEQ_OQ: u8 = 0x0c;
+pub const _CMP_NEQ_OQ: i32 = 0x0c;
 /// Greater-than-or-equal (ordered, signaling)
-pub const _CMP_GE_OS: u8 = 0x0d;
+pub const _CMP_GE_OS: i32 = 0x0d;
 /// Greater-than (ordered, signaling)
-pub const _CMP_GT_OS: u8 = 0x0e;
+pub const _CMP_GT_OS: i32 = 0x0e;
 /// True (unordered, non-signaling)
-pub const _CMP_TRUE_UQ: u8 = 0x0f;
+pub const _CMP_TRUE_UQ: i32 = 0x0f;
 /// Equal (ordered, signaling)
-pub const _CMP_EQ_OS: u8 = 0x10;
+pub const _CMP_EQ_OS: i32 = 0x10;
 /// Less-than (ordered, non-signaling)
-pub const _CMP_LT_OQ: u8 = 0x11;
+pub const _CMP_LT_OQ: i32 = 0x11;
 /// Less-than-or-equal (ordered, non-signaling)
-pub const _CMP_LE_OQ: u8 = 0x12;
+pub const _CMP_LE_OQ: i32 = 0x12;
 /// Unordered (signaling)
-pub const _CMP_UNORD_S: u8 = 0x13;
+pub const _CMP_UNORD_S: i32 = 0x13;
 /// Not-equal (unordered, signaling)
-pub const _CMP_NEQ_US: u8 = 0x14;
+pub const _CMP_NEQ_US: i32 = 0x14;
 /// Not-less-than (unordered, non-signaling)
-pub const _CMP_NLT_UQ: u8 = 0x15;
+pub const _CMP_NLT_UQ: i32 = 0x15;
 /// Not-less-than-or-equal (unordered, non-signaling)
-pub const _CMP_NLE_UQ: u8 = 0x16;
+pub const _CMP_NLE_UQ: i32 = 0x16;
 /// Ordered (signaling)
-pub const _CMP_ORD_S: u8 = 0x17;
+pub const _CMP_ORD_S: i32 = 0x17;
 /// Equal (unordered, signaling)
-pub const _CMP_EQ_US: u8 = 0x18;
+pub const _CMP_EQ_US: i32 = 0x18;
 /// Not-greater-than-or-equal (unordered, non-signaling)
-pub const _CMP_NGE_UQ: u8 = 0x19;
+pub const _CMP_NGE_UQ: i32 = 0x19;
 /// Not-greater-than (unordered, non-signaling)
-pub const _CMP_NGT_UQ: u8 = 0x1a;
+pub const _CMP_NGT_UQ: i32 = 0x1a;
 /// False (ordered, signaling)
-pub const _CMP_FALSE_OS: u8 = 0x1b;
+pub const _CMP_FALSE_OS: i32 = 0x1b;
 /// Not-equal (ordered, signaling)
-pub const _CMP_NEQ_OS: u8 = 0x1c;
+pub const _CMP_NEQ_OS: i32 = 0x1c;
 /// Greater-than-or-equal (ordered, non-signaling)
-pub const _CMP_GE_OQ: u8 = 0x1d;
+pub const _CMP_GE_OQ: i32 = 0x1d;
 /// Greater-than (ordered, non-signaling)
-pub const _CMP_GT_OQ: u8 = 0x1e;
+pub const _CMP_GT_OQ: i32 = 0x1e;
 /// True (unordered, signaling)
-pub const _CMP_TRUE_US: u8 = 0x1f;
+pub const _CMP_TRUE_US: i32 = 0x1f;
 
 /// Compare packed double-precision (64-bit) floating-point
 /// elements in `a` and `b` based on the comparison operand
@@ -677,7 +677,7 @@ pub const _CMP_TRUE_US: u8 = 0x1f;
 #[inline(always)]
 #[target_feature = "+avx,+sse2"]
 #[cfg_attr(test, assert_instr(vcmpeqpd, imm8 = 0))] // TODO Validate vcmppd
-pub unsafe fn _mm_cmp_pd(a: f64x2, b: f64x2, imm8: u8) -> f64x2 {
+pub unsafe fn _mm_cmp_pd(a: f64x2, b: f64x2, imm8: i32) -> f64x2 {
     macro_rules! call {
         ($imm8:expr) => { vcmppd(a, b, $imm8) }
     }
@@ -690,7 +690,7 @@ pub unsafe fn _mm_cmp_pd(a: f64x2, b: f64x2, imm8: u8) -> f64x2 {
 #[inline(always)]
 #[target_feature = "+avx"]
 #[cfg_attr(test, assert_instr(vcmpeqpd, imm8 = 0))] // TODO Validate vcmppd
-pub unsafe fn _mm256_cmp_pd(a: f64x4, b: f64x4, imm8: u8) -> f64x4 {
+pub unsafe fn _mm256_cmp_pd(a: f64x4, b: f64x4, imm8: i32) -> f64x4 {
     macro_rules! call {
         ($imm8:expr) => { vcmppd256(a, b, $imm8) }
     }
@@ -703,7 +703,7 @@ pub unsafe fn _mm256_cmp_pd(a: f64x4, b: f64x4, imm8: u8) -> f64x4 {
 #[inline(always)]
 #[target_feature = "+avx,+sse"]
 #[cfg_attr(test, assert_instr(vcmpeqps, imm8 = 0))] // TODO Validate vcmpps
-pub unsafe fn _mm_cmp_ps(a: f32x4, b: f32x4, imm8: u8) -> f32x4 {
+pub unsafe fn _mm_cmp_ps(a: f32x4, b: f32x4, imm8: i32) -> f32x4 {
     macro_rules! call {
         ($imm8:expr) => { vcmpps(a, b, $imm8) }
     }
@@ -716,7 +716,7 @@ pub unsafe fn _mm_cmp_ps(a: f32x4, b: f32x4, imm8: u8) -> f32x4 {
 #[inline(always)]
 #[target_feature = "+avx"]
 #[cfg_attr(test, assert_instr(vcmpeqps, imm8 = 0))] // TODO Validate vcmpps
-pub unsafe fn _mm256_cmp_ps(a: f32x8, b: f32x8, imm8: u8) -> f32x8 {
+pub unsafe fn _mm256_cmp_ps(a: f32x8, b: f32x8, imm8: i32) -> f32x8 {
     macro_rules! call {
         ($imm8:expr) => { vcmpps256(a, b, $imm8) }
     }
@@ -731,7 +731,7 @@ pub unsafe fn _mm256_cmp_ps(a: f32x8, b: f32x8, imm8: u8) -> f32x8 {
 #[inline(always)]
 #[target_feature = "+avx,+sse2"]
 #[cfg_attr(test, assert_instr(vcmpeqsd, imm8 = 0))] // TODO Validate vcmpsd
-pub unsafe fn _mm_cmp_sd(a: f64x2, b: f64x2, imm8: u8) -> f64x2 {
+pub unsafe fn _mm_cmp_sd(a: f64x2, b: f64x2, imm8: i32) -> f64x2 {
     macro_rules! call {
         ($imm8:expr) => { vcmpsd(a, b, $imm8) }
     }
@@ -746,7 +746,7 @@ pub unsafe fn _mm_cmp_sd(a: f64x2, b: f64x2, imm8: u8) -> f64x2 {
 #[inline(always)]
 #[target_feature = "+avx,+sse"]
 #[cfg_attr(test, assert_instr(vcmpeqss, imm8 = 0))] // TODO Validate vcmpss
-pub unsafe fn _mm_cmp_ss(a: f32x4, b: f32x4, imm8: u8) -> f32x4 {
+pub unsafe fn _mm_cmp_ss(a: f32x4, b: f32x4, imm8: i32) -> f32x4 {
     macro_rules! call {
         ($imm8:expr) => { vcmpss(a, b, $imm8) }
     }
@@ -862,48 +862,6 @@ pub unsafe fn _mm256_extractf128_si256(a: __m256i, imm8: i32) -> __m128i {
     __m128i::from(dst)
 }
 
-/// Extract an 8-bit integer from `a`, selected with `imm8`. Returns a 32-bit
-/// integer containing the zero-extended integer data.
-///
-/// See [LLVM commit D20468][https://reviews.llvm.org/D20468].
-#[inline(always)]
-#[target_feature = "+avx"]
-// This intrinsic has no corresponding instruction.
-pub unsafe fn _mm256_extract_epi8(a: i8x32, imm8: i32) -> i32 {
-    let imm8 = (imm8 & 31) as u32;
-    (a.extract_unchecked(imm8) as i32) & 0xFF
-}
-
-/// Extract a 16-bit integer from `a`, selected with `imm8`. Returns a 32-bit
-/// integer containing the zero-extended integer data.
-///
-/// See [LLVM commit D20468][https://reviews.llvm.org/D20468].
-#[inline(always)]
-#[target_feature = "+avx"]
-// This intrinsic has no corresponding instruction.
-pub unsafe fn _mm256_extract_epi16(a: i16x16, imm8: i32) -> i32 {
-    let imm8 = (imm8 & 15) as u32;
-    (a.extract_unchecked(imm8) as i32) & 0xFFFF
-}
-
-/// Extract a 32-bit integer from `a`, selected with `imm8`.
-#[inline(always)]
-#[target_feature = "+avx"]
-// This intrinsic has no corresponding instruction.
-pub unsafe fn _mm256_extract_epi32(a: i32x8, imm8: i32) -> i32 {
-    let imm8 = (imm8 & 7) as u32;
-    a.extract_unchecked(imm8)
-}
-
-/// Extract a 64-bit integer from `a`, selected with `imm8`.
-#[inline(always)]
-#[target_feature = "+avx"]
-// This intrinsic has no corresponding instruction.
-pub unsafe fn _mm256_extract_epi64(a: i64x4, imm8: i32) -> i64 {
-    let imm8 = (imm8 & 3) as u32;
-    a.extract_unchecked(imm8)
-}
-
 /// Zero the contents of all XMM or YMM registers.
 #[inline(always)]
 #[target_feature = "+avx"]
@@ -1138,7 +1096,7 @@ pub unsafe fn _mm_permute_pd(a: f64x2, imm8: i32) -> f64x2 {
 #[inline(always)]
 #[target_feature = "+avx"]
 #[cfg_attr(test, assert_instr(vperm2f128, imm8 = 0x5))]
-pub unsafe fn _mm256_permute2f128_ps(a: f32x8, b: f32x8, imm8: i8) -> f32x8 {
+pub unsafe fn _mm256_permute2f128_ps(a: f32x8, b: f32x8, imm8: i32) -> f32x8 {
     macro_rules! call {
         ($imm8:expr) => { vperm2f128ps256(a, b, $imm8) }
     }
@@ -1150,7 +1108,7 @@ pub unsafe fn _mm256_permute2f128_ps(a: f32x8, b: f32x8, imm8: i8) -> f32x8 {
 #[inline(always)]
 #[target_feature = "+avx"]
 #[cfg_attr(test, assert_instr(vperm2f128, imm8 = 0x31))]
-pub unsafe fn _mm256_permute2f128_pd(a: f64x4, b: f64x4, imm8: i8) -> f64x4 {
+pub unsafe fn _mm256_permute2f128_pd(a: f64x4, b: f64x4, imm8: i32) -> f64x4 {
     macro_rules! call {
         ($imm8:expr) => { vperm2f128pd256(a, b, $imm8) }
     }
@@ -1163,7 +1121,7 @@ pub unsafe fn _mm256_permute2f128_pd(a: f64x4, b: f64x4, imm8: i8) -> f64x4 {
 #[target_feature = "+avx"]
 #[cfg_attr(test, assert_instr(vperm2f128, imm8 = 0x31))]
 pub unsafe fn _mm256_permute2f128_si256(
-    a: i32x8, b: i32x8, imm8: i8
+    a: i32x8, b: i32x8, imm8: i32
 ) -> i32x8 {
     macro_rules! call {
         ($imm8:expr) => { vperm2f128si256(a, b, $imm8) }
@@ -3146,47 +3104,6 @@ mod tests {
         assert_eq!(r, __m128i::from(e));
     }
 
-    #[simd_test = "avx"]
-    unsafe fn _mm256_extract_epi8() {
-        #[cfg_attr(rustfmt, rustfmt_skip)]
-        let a = i8x32::new(
-            -1, 1, 2, 3, 4, 5, 6, 7,
-            8, 9, 10, 11, 12, 13, 14, 15,
-            16, 17, 18, 19, 20, 21, 22, 23,
-            24, 25, 26, 27, 28, 29, 30, 31
-        );
-        let r1 = avx::_mm256_extract_epi8(a, 0);
-        let r2 = avx::_mm256_extract_epi8(a, 35);
-        assert_eq!(r1, 0xFF);
-        assert_eq!(r2, 3);
-    }
-
-    #[simd_test = "avx"]
-    unsafe fn _mm256_extract_epi16() {
-        let a =
-            i16x16::new(-1, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
-        let r1 = avx::_mm256_extract_epi16(a, 0);
-        let r2 = avx::_mm256_extract_epi16(a, 19);
-        assert_eq!(r1, 0xFFFF);
-        assert_eq!(r2, 3);
-    }
-
-    #[simd_test = "avx"]
-    unsafe fn _mm256_extract_epi32() {
-        let a = i32x8::new(-1, 1, 2, 3, 4, 5, 6, 7);
-        let r1 = avx::_mm256_extract_epi32(a, 0);
-        let r2 = avx::_mm256_extract_epi32(a, 11);
-        assert_eq!(r1, -1);
-        assert_eq!(r2, 3);
-    }
-
-    #[simd_test = "avx"]
-    unsafe fn _mm256_extract_epi64() {
-        let a = i64x4::new(0, 1, 2, 3);
-        let r = avx::_mm256_extract_epi64(a, 3);
-        assert_eq!(r, 3);
-    }
-
     #[simd_test = "avx"]
     unsafe fn _mm256_zeroall() {
         avx::_mm256_zeroall();
diff --git a/coresimd/src/x86/i586/avx2.rs b/coresimd/src/x86/i586/avx2.rs
index 52d61bf2fd..31d996750b 100644
--- a/coresimd/src/x86/i586/avx2.rs
+++ b/coresimd/src/x86/i586/avx2.rs
@@ -713,7 +713,7 @@ pub unsafe fn _mm256_hsubs_epi16(a: i16x16, b: i16x16) -> i16x16 {
 #[target_feature = "+avx2"]
 #[cfg_attr(test, assert_instr(vpgatherdd, scale = 1))]
 pub unsafe fn _mm_i32gather_epi32(
-    slice: *const i32, offsets: i32x4, scale: i8
+    slice: *const i32, offsets: i32x4, scale: i32
 ) -> i32x4 {
     macro_rules! call {
         ($imm8:expr) => (pgatherdd(i32x4::splat(0), slice as *const i8, offsets, i32x4::splat(-1), $imm8))
@@ -729,7 +729,7 @@ pub unsafe fn _mm_i32gather_epi32(
 #[target_feature = "+avx2"]
 #[cfg_attr(test, assert_instr(vpgatherdd, scale = 1))]
 pub unsafe fn _mm_mask_i32gather_epi32(
-    src: i32x4, slice: *const i32, offsets: i32x4, mask: i32x4, scale: i8
+    src: i32x4, slice: *const i32, offsets: i32x4, mask: i32x4, scale: i32
 ) -> i32x4 {
     macro_rules! call {
         ($imm8:expr) => (pgatherdd(src, slice as *const i8, offsets, mask, $imm8))
@@ -744,7 +744,7 @@ pub unsafe fn _mm_mask_i32gather_epi32(
 #[target_feature = "+avx2"]
 #[cfg_attr(test, assert_instr(vpgatherdd, scale = 1))]
 pub unsafe fn _mm256_i32gather_epi32(
-    slice: *const i32, offsets: i32x8, scale: i8
+    slice: *const i32, offsets: i32x8, scale: i32
 ) -> i32x8 {
     macro_rules! call {
         ($imm8:expr) => (vpgatherdd(i32x8::splat(0), slice as *const i8, offsets, i32x8::splat(-1), $imm8))
@@ -760,7 +760,7 @@ pub unsafe fn _mm256_i32gather_epi32(
 #[target_feature = "+avx2"]
 #[cfg_attr(test, assert_instr(vpgatherdd, scale = 1))]
 pub unsafe fn _mm256_mask_i32gather_epi32(
-    src: i32x8, slice: *const i32, offsets: i32x8, mask: i32x8, scale: i8
+    src: i32x8, slice: *const i32, offsets: i32x8, mask: i32x8, scale: i32
 ) -> i32x8 {
     macro_rules! call {
         ($imm8:expr) => (vpgatherdd(src, slice as *const i8, offsets, mask, $imm8))
@@ -775,7 +775,7 @@ pub unsafe fn _mm256_mask_i32gather_epi32(
 #[target_feature = "+avx2"]
 #[cfg_attr(test, assert_instr(vgatherdps, scale = 1))]
 pub unsafe fn _mm_i32gather_ps(
-    slice: *const f32, offsets: i32x4, scale: i8
+    slice: *const f32, offsets: i32x4, scale: i32
 ) -> f32x4 {
     macro_rules! call {
         ($imm8:expr) => (pgatherdps(f32x4::splat(0.0), slice as *const i8, offsets, f32x4::splat(-1.0), $imm8))
@@ -791,7 +791,7 @@ pub unsafe fn _mm_i32gather_ps(
 #[target_feature = "+avx2"]
 #[cfg_attr(test, assert_instr(vgatherdps, scale = 1))]
 pub unsafe fn _mm_mask_i32gather_ps(
-    src: f32x4, slice: *const f32, offsets: i32x4, mask: f32x4, scale: i8
+    src: f32x4, slice: *const f32, offsets: i32x4, mask: f32x4, scale: i32
 ) -> f32x4 {
     macro_rules! call {
         ($imm8:expr) => (pgatherdps(src, slice as *const i8, offsets, mask, $imm8))
@@ -806,7 +806,7 @@ pub unsafe fn _mm_mask_i32gather_ps(
 #[target_feature = "+avx2"]
 #[cfg_attr(test, assert_instr(vgatherdps, scale = 1))]
 pub unsafe fn _mm256_i32gather_ps(
-    slice: *const f32, offsets: i32x8, scale: i8
+    slice: *const f32, offsets: i32x8, scale: i32
 ) -> f32x8 {
     macro_rules! call {
         ($imm8:expr) => (vpgatherdps(f32x8::splat(0.0), slice as *const i8, offsets, f32x8::splat(-1.0), $imm8))
@@ -822,7 +822,7 @@ pub unsafe fn _mm256_i32gather_ps(
 #[target_feature = "+avx2"]
 #[cfg_attr(test, assert_instr(vgatherdps, scale = 1))]
 pub unsafe fn _mm256_mask_i32gather_ps(
-    src: f32x8, slice: *const f32, offsets: i32x8, mask: f32x8, scale: i8
+    src: f32x8, slice: *const f32, offsets: i32x8, mask: f32x8, scale: i32
 ) -> f32x8 {
     macro_rules! call {
         ($imm8:expr) => (vpgatherdps(src, slice as *const i8, offsets, mask, $imm8))
@@ -837,7 +837,7 @@ pub unsafe fn _mm256_mask_i32gather_ps(
 #[target_feature = "+avx2"]
 #[cfg_attr(test, assert_instr(vpgatherdq, scale = 1))]
 pub unsafe fn _mm_i32gather_epi64(
-    slice: *const i64, offsets: i32x4, scale: i8
+    slice: *const i64, offsets: i32x4, scale: i32
 ) -> i64x2 {
     macro_rules! call {
         ($imm8:expr) => (pgatherdq(i64x2::splat(0), slice as *const i8, offsets, i64x2::splat(-1), $imm8))
@@ -853,7 +853,7 @@ pub unsafe fn _mm_i32gather_epi64(
 #[target_feature = "+avx2"]
 #[cfg_attr(test, assert_instr(vpgatherdq, scale = 1))]
 pub unsafe fn _mm_mask_i32gather_epi64(
-    src: i64x2, slice: *const i64, offsets: i32x4, mask: i64x2, scale: i8
+    src: i64x2, slice: *const i64, offsets: i32x4, mask: i64x2, scale: i32
 ) -> i64x2 {
     macro_rules! call {
         ($imm8:expr) => (pgatherdq(src, slice as *const i8, offsets, mask, $imm8))
@@ -868,7 +868,7 @@ pub unsafe fn _mm_mask_i32gather_epi64(
 #[target_feature = "+avx2"]
 #[cfg_attr(test, assert_instr(vpgatherdq, scale = 1))]
 pub unsafe fn _mm256_i32gather_epi64(
-    slice: *const i64, offsets: i32x4, scale: i8
+    slice: *const i64, offsets: i32x4, scale: i32
 ) -> i64x4 {
     macro_rules! call {
         ($imm8:expr) => (vpgatherdq(i64x4::splat(0), slice as *const i8, offsets, i64x4::splat(-1), $imm8))
@@ -884,7 +884,7 @@ pub unsafe fn _mm256_i32gather_epi64(
 #[target_feature = "+avx2"]
 #[cfg_attr(test, assert_instr(vpgatherdq, scale = 1))]
 pub unsafe fn _mm256_mask_i32gather_epi64(
-    src: i64x4, slice: *const i64, offsets: i32x4, mask: i64x4, scale: i8
+    src: i64x4, slice: *const i64, offsets: i32x4, mask: i64x4, scale: i32
 ) -> i64x4 {
     macro_rules! call {
         ($imm8:expr) => (vpgatherdq(src, slice as *const i8, offsets, mask, $imm8))
@@ -899,7 +899,7 @@ pub unsafe fn _mm256_mask_i32gather_epi64(
 #[target_feature = "+avx2"]
 #[cfg_attr(test, assert_instr(vgatherdpd, scale = 1))]
 pub unsafe fn _mm_i32gather_pd(
-    slice: *const f64, offsets: i32x4, scale: i8
+    slice: *const f64, offsets: i32x4, scale: i32
 ) -> f64x2 {
     macro_rules! call {
         ($imm8:expr) => (pgatherdpd(f64x2::splat(0.0), slice as *const i8, offsets, f64x2::splat(-1.0), $imm8))
@@ -915,7 +915,7 @@ pub unsafe fn _mm_i32gather_pd(
 #[target_feature = "+avx2"]
 #[cfg_attr(test, assert_instr(vgatherdpd, scale = 1))]
 pub unsafe fn _mm_mask_i32gather_pd(
-    src: f64x2, slice: *const f64, offsets: i32x4, mask: f64x2, scale: i8
+    src: f64x2, slice: *const f64, offsets: i32x4, mask: f64x2, scale: i32
 ) -> f64x2 {
     macro_rules! call {
         ($imm8:expr) => (pgatherdpd(src, slice as *const i8, offsets, mask, $imm8))
@@ -930,7 +930,7 @@ pub unsafe fn _mm_mask_i32gather_pd(
 #[target_feature = "+avx2"]
 #[cfg_attr(test, assert_instr(vgatherdpd, scale = 1))]
 pub unsafe fn _mm256_i32gather_pd(
-    slice: *const f64, offsets: i32x4, scale: i8
+    slice: *const f64, offsets: i32x4, scale: i32
 ) -> f64x4 {
     macro_rules! call {
         ($imm8:expr) => (vpgatherdpd(f64x4::splat(0.0), slice as *const i8, offsets, f64x4::splat(-1.0), $imm8))
@@ -946,7 +946,7 @@ pub unsafe fn _mm256_i32gather_pd(
 #[target_feature = "+avx2"]
 #[cfg_attr(test, assert_instr(vgatherdpd, scale = 1))]
 pub unsafe fn _mm256_mask_i32gather_pd(
-    src: f64x4, slice: *const f64, offsets: i32x4, mask: f64x4, scale: i8
+    src: f64x4, slice: *const f64, offsets: i32x4, mask: f64x4, scale: i32
 ) -> f64x4 {
     macro_rules! call {
         ($imm8:expr) => (vpgatherdpd(src, slice as *const i8, offsets, mask, $imm8))
@@ -961,7 +961,7 @@ pub unsafe fn _mm256_mask_i32gather_pd(
 #[target_feature = "+avx2"]
 #[cfg_attr(test, assert_instr(vpgatherqd, scale = 1))]
 pub unsafe fn _mm_i64gather_epi32(
-    slice: *const i32, offsets: i64x2, scale: i8
+    slice: *const i32, offsets: i64x2, scale: i32
 ) -> i32x4 {
     macro_rules! call {
         ($imm8:expr) => (pgatherqd(i32x4::splat(0), slice as *const i8, offsets, i32x4::splat(-1), $imm8))
@@ -977,7 +977,7 @@ pub unsafe fn _mm_i64gather_epi32(
 #[target_feature = "+avx2"]
 #[cfg_attr(test, assert_instr(vpgatherqd, scale = 1))]
 pub unsafe fn _mm_mask_i64gather_epi32(
-    src: i32x4, slice: *const i32, offsets: i64x2, mask: i32x4, scale: i8
+    src: i32x4, slice: *const i32, offsets: i64x2, mask: i32x4, scale: i32
 ) -> i32x4 {
     macro_rules! call {
         ($imm8:expr) => (pgatherqd(src, slice as *const i8, offsets, mask, $imm8))
@@ -992,7 +992,7 @@ pub unsafe fn _mm_mask_i64gather_epi32(
 #[target_feature = "+avx2"]
 #[cfg_attr(test, assert_instr(vpgatherqd, scale = 1))]
 pub unsafe fn _mm256_i64gather_epi32(
-    slice: *const i32, offsets: i64x4, scale: i8
+    slice: *const i32, offsets: i64x4, scale: i32
 ) -> i32x4 {
     macro_rules! call {
         ($imm8:expr) => (vpgatherqd(i32x4::splat(0), slice as *const i8, offsets, i32x4::splat(-1), $imm8))
@@ -1008,7 +1008,7 @@ pub unsafe fn _mm256_i64gather_epi32(
 #[target_feature = "+avx2"]
 #[cfg_attr(test, assert_instr(vpgatherqd, scale = 1))]
 pub unsafe fn _mm256_mask_i64gather_epi32(
-    src: i32x4, slice: *const i32, offsets: i64x4, mask: i32x4, scale: i8
+    src: i32x4, slice: *const i32, offsets: i64x4, mask: i32x4, scale: i32
 ) -> i32x4 {
     macro_rules! call {
         ($imm8:expr) => (vpgatherqd(src, slice as *const i8, offsets, mask, $imm8))
@@ -1023,7 +1023,7 @@ pub unsafe fn _mm256_mask_i64gather_epi32(
 #[target_feature = "+avx2"]
 #[cfg_attr(test, assert_instr(vgatherqps, scale = 1))]
 pub unsafe fn _mm_i64gather_ps(
-    slice: *const f32, offsets: i64x2, scale: i8
+    slice: *const f32, offsets: i64x2, scale: i32
 ) -> f32x4 {
     macro_rules! call {
         ($imm8:expr) => (pgatherqps(f32x4::splat(0.0), slice as *const i8, offsets, f32x4::splat(-1.0), $imm8))
@@ -1039,7 +1039,7 @@ pub unsafe fn _mm_i64gather_ps(
 #[target_feature = "+avx2"]
 #[cfg_attr(test, assert_instr(vgatherqps, scale = 1))]
 pub unsafe fn _mm_mask_i64gather_ps(
-    src: f32x4, slice: *const f32, offsets: i64x2, mask: f32x4, scale: i8
+    src: f32x4, slice: *const f32, offsets: i64x2, mask: f32x4, scale: i32
 ) -> f32x4 {
     macro_rules! call {
         ($imm8:expr) => (pgatherqps(src, slice as *const i8, offsets, mask, $imm8))
@@ -1054,7 +1054,7 @@ pub unsafe fn _mm_mask_i64gather_ps(
 #[target_feature = "+avx2"]
 #[cfg_attr(test, assert_instr(vgatherqps, scale = 1))]
 pub unsafe fn _mm256_i64gather_ps(
-    slice: *const f32, offsets: i64x4, scale: i8
+    slice: *const f32, offsets: i64x4, scale: i32
 ) -> f32x4 {
     macro_rules! call {
         ($imm8:expr) => (vpgatherqps(f32x4::splat(0.0), slice as *const i8, offsets, f32x4::splat(-1.0), $imm8))
@@ -1070,7 +1070,7 @@ pub unsafe fn _mm256_i64gather_ps(
 #[target_feature = "+avx2"]
 #[cfg_attr(test, assert_instr(vgatherqps, scale = 1))]
 pub unsafe fn _mm256_mask_i64gather_ps(
-    src: f32x4, slice: *const f32, offsets: i64x4, mask: f32x4, scale: i8
+    src: f32x4, slice: *const f32, offsets: i64x4, mask: f32x4, scale: i32
 ) -> f32x4 {
     macro_rules! call {
         ($imm8:expr) => (vpgatherqps(src, slice as *const i8, offsets, mask, $imm8))
@@ -1085,7 +1085,7 @@ pub unsafe fn _mm256_mask_i64gather_ps(
 #[target_feature = "+avx2"]
 #[cfg_attr(test, assert_instr(vpgatherqq, scale = 1))]
 pub unsafe fn _mm_i64gather_epi64(
-    slice: *const i64, offsets: i64x2, scale: i8
+    slice: *const i64, offsets: i64x2, scale: i32
 ) -> i64x2 {
     macro_rules! call {
         ($imm8:expr) => (pgatherqq(i64x2::splat(0), slice as *const i8, offsets, i64x2::splat(-1), $imm8))
@@ -1101,7 +1101,7 @@ pub unsafe fn _mm_i64gather_epi64(
 #[target_feature = "+avx2"]
 #[cfg_attr(test, assert_instr(vpgatherqq, scale = 1))]
 pub unsafe fn _mm_mask_i64gather_epi64(
-    src: i64x2, slice: *const i64, offsets: i64x2, mask: i64x2, scale: i8
+    src: i64x2, slice: *const i64, offsets: i64x2, mask: i64x2, scale: i32
 ) -> i64x2 {
     macro_rules! call {
         ($imm8:expr) => (pgatherqq(src, slice as *const i8, offsets, mask, $imm8))
@@ -1116,7 +1116,7 @@ pub unsafe fn _mm_mask_i64gather_epi64(
 #[target_feature = "+avx2"]
 #[cfg_attr(test, assert_instr(vpgatherqq, scale = 1))]
 pub unsafe fn _mm256_i64gather_epi64(
-    slice: *const i64, offsets: i64x4, scale: i8
+    slice: *const i64, offsets: i64x4, scale: i32
 ) -> i64x4 {
     macro_rules! call {
         ($imm8:expr) => (vpgatherqq(i64x4::splat(0), slice as *const i8, offsets, i64x4::splat(-1), $imm8))
@@ -1132,7 +1132,7 @@ pub unsafe fn _mm256_i64gather_epi64(
 #[target_feature = "+avx2"]
 #[cfg_attr(test, assert_instr(vpgatherqq, scale = 1))]
 pub unsafe fn _mm256_mask_i64gather_epi64(
-    src: i64x4, slice: *const i64, offsets: i64x4, mask: i64x4, scale: i8
+    src: i64x4, slice: *const i64, offsets: i64x4, mask: i64x4, scale: i32
 ) -> i64x4 {
     macro_rules! call {
         ($imm8:expr) => (vpgatherqq(src, slice as *const i8, offsets, mask, $imm8))
@@ -1147,7 +1147,7 @@ pub unsafe fn _mm256_mask_i64gather_epi64(
 #[target_feature = "+avx2"]
 #[cfg_attr(test, assert_instr(vgatherqpd, scale = 1))]
 pub unsafe fn _mm_i64gather_pd(
-    slice: *const f64, offsets: i64x2, scale: i8
+    slice: *const f64, offsets: i64x2, scale: i32
 ) -> f64x2 {
     macro_rules! call {
         ($imm8:expr) => (pgatherqpd(f64x2::splat(0.0), slice as *const i8, offsets, f64x2::splat(-1.0), $imm8))
@@ -1163,7 +1163,7 @@ pub unsafe fn _mm_i64gather_pd(
 #[target_feature = "+avx2"]
 #[cfg_attr(test, assert_instr(vgatherqpd, scale = 1))]
 pub unsafe fn _mm_mask_i64gather_pd(
-    src: f64x2, slice: *const f64, offsets: i64x2, mask: f64x2, scale: i8
+    src: f64x2, slice: *const f64, offsets: i64x2, mask: f64x2, scale: i32
 ) -> f64x2 {
     macro_rules! call {
         ($imm8:expr) => (pgatherqpd(src, slice as *const i8, offsets, mask, $imm8))
@@ -1178,7 +1178,7 @@ pub unsafe fn _mm_mask_i64gather_pd(
 #[target_feature = "+avx2"]
 #[cfg_attr(test, assert_instr(vgatherqpd, scale = 1))]
 pub unsafe fn _mm256_i64gather_pd(
-    slice: *const f64, offsets: i64x4, scale: i8
+    slice: *const f64, offsets: i64x4, scale: i32
 ) -> f64x4 {
     macro_rules! call {
         ($imm8:expr) => (vpgatherqpd(f64x4::splat(0.0), slice as *const i8, offsets, f64x4::splat(-1.0), $imm8))
@@ -1194,7 +1194,7 @@ pub unsafe fn _mm256_i64gather_pd(
 #[target_feature = "+avx2"]
 #[cfg_attr(test, assert_instr(vgatherqpd, scale = 1))]
 pub unsafe fn _mm256_mask_i64gather_pd(
-    src: f64x4, slice: *const f64, offsets: i64x4, mask: f64x4, scale: i8
+    src: f64x4, slice: *const f64, offsets: i64x4, mask: f64x4, scale: i32
 ) -> f64x4 {
     macro_rules! call {
         ($imm8:expr) => (vpgatherqpd(src, slice as *const i8, offsets, mask, $imm8))
@@ -2656,6 +2656,48 @@ pub unsafe fn _mm256_xor_si256(a: __m256i, b: __m256i) -> __m256i {
     __m256i::from(i8x32::from(a) ^ i8x32::from(b))
 }
 
+/// Extract an 8-bit integer from `a`, selected with `imm8`. Only the low
+/// five bits of `imm8` are used as the lane index; the lane is returned as-is.
+///
+/// See [LLVM commit D20468](https://reviews.llvm.org/D20468).
+#[inline(always)]
+#[target_feature = "+avx2"]
+// This intrinsic has no corresponding instruction.
+pub unsafe fn _mm256_extract_epi8(a: i8x32, imm8: i32) -> i8 {
+    let imm8 = (imm8 & 31) as u32;
+    a.extract_unchecked(imm8)
+}
+
+/// Extract a 16-bit integer from `a`, selected with `imm8`. Only the low
+/// four bits of `imm8` are used as the lane index; the lane is returned as-is.
+///
+/// See [LLVM commit D20468](https://reviews.llvm.org/D20468).
+#[inline(always)]
+#[target_feature = "+avx2"]
+// This intrinsic has no corresponding instruction.
+pub unsafe fn _mm256_extract_epi16(a: i16x16, imm8: i32) -> i16 {
+    let imm8 = (imm8 & 15) as u32;
+    a.extract_unchecked(imm8)
+}
+
+/// Extract a 32-bit integer from `a`, selected by the low 3 bits of `imm8`.
+#[inline(always)]
+#[target_feature = "+avx2"]
+// This intrinsic has no corresponding instruction.
+pub unsafe fn _mm256_extract_epi32(a: i32x8, imm8: i32) -> i32 {
+    let imm8 = (imm8 & 7) as u32;
+    a.extract_unchecked(imm8)
+}
+
+/// Extract a 64-bit integer from `a`, selected by the low 2 bits of `imm8`.
+#[inline(always)]
+#[target_feature = "+avx2"]
+// This intrinsic has no corresponding instruction.
+pub unsafe fn _mm256_extract_epi64(a: i64x4, imm8: i32) -> i64 {
+    let imm8 = (imm8 & 3) as u32;
+    a.extract_unchecked(imm8)
+}
+
 #[allow(improper_ctypes)]
 extern "C" {
     #[link_name = "llvm.x86.avx2.pabs.b"]
@@ -4923,4 +4965,44 @@ mod tests {
         assert_eq!(r, f64x4::new(0.0, 16.0, 64.0, 256.0));
     }
 
+    #[simd_test = "avx2"]
+    unsafe fn _mm256_extract_epi8() {
+        #[cfg_attr(rustfmt, rustfmt_skip)]
+        let a = i8x32::new(
+            -1, 1, 2, 3, 4, 5, 6, 7,
+            8, 9, 10, 11, 12, 13, 14, 15,
+            16, 17, 18, 19, 20, 21, 22, 23,
+            24, 25, 26, 27, 28, 29, 30, 31
+        );
+        let r1 = avx2::_mm256_extract_epi8(a, 0);
+        let r2 = avx2::_mm256_extract_epi8(a, 35); // 35 & 31 == 3
+        assert_eq!(r1, -1);
+        assert_eq!(r2, 3);
+    }
+
+    #[simd_test = "avx2"]
+    unsafe fn _mm256_extract_epi16() {
+        let a =
+            i16x16::new(-1, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
+        let r1 = avx2::_mm256_extract_epi16(a, 0);
+        let r2 = avx2::_mm256_extract_epi16(a, 19); // 19 & 15 == 3
+        assert_eq!(r1, -1);
+        assert_eq!(r2, 3);
+    }
+
+    #[simd_test = "avx2"]
+    unsafe fn _mm256_extract_epi32() {
+        let a = i32x8::new(-1, 1, 2, 3, 4, 5, 6, 7);
+        let r1 = avx2::_mm256_extract_epi32(a, 0);
+        let r2 = avx2::_mm256_extract_epi32(a, 11); // 11 & 7 == 3
+        assert_eq!(r1, -1);
+        assert_eq!(r2, 3);
+    }
+
+    #[simd_test = "avx2"]
+    unsafe fn _mm256_extract_epi64() {
+        let a = i64x4::new(0, 1, 2, 3);
+        let r = avx2::_mm256_extract_epi64(a, 3); // index 3 is the last lane
+        assert_eq!(r, 3);
+    }
 }
diff --git a/coresimd/src/x86/i586/bmi.rs b/coresimd/src/x86/i586/bmi.rs
index 5f00a7c67f..f51a6d2f24 100644
--- a/coresimd/src/x86/i586/bmi.rs
+++ b/coresimd/src/x86/i586/bmi.rs
@@ -27,8 +27,8 @@ pub unsafe fn _bextr_u32(a: u32, start: u32, len: u32) -> u32 {
 #[target_feature = "+bmi"]
 #[cfg_attr(test, assert_instr(bextr))]
 #[cfg(not(target_arch = "x86"))]
-pub unsafe fn _bextr_u64(a: u64, start: u64, len: u64) -> u64 {
-    _bextr2_u64(a, (start & 0xff_u64) | ((len & 0xff_u64) << 8_u64))
+pub unsafe fn _bextr_u64(a: u64, start: u32, len: u32) -> u64 {
+    _bextr2_u64(a, ((start & 0xff) | ((len & 0xff) << 8)) as u64)
 }
 
 /// Extracts bits of `a` specified by `control` into
@@ -127,16 +127,6 @@ pub unsafe fn _blsr_u64(x: u64) -> u64 {
     x & (x.wrapping_sub(1))
 }
 
-/// Counts the number of trailing least significant zero bits.
-///
-/// When the source operand is 0, it returns its size in bits.
-#[inline(always)]
-#[target_feature = "+bmi"]
-#[cfg_attr(test, assert_instr(tzcnt))]
-pub unsafe fn _tzcnt_u16(x: u16) -> u16 {
-    x.trailing_zeros() as u16
-}
-
 /// Counts the number of trailing least significant zero bits.
 ///
 /// When the source operand is 0, it returns its size in bits.
@@ -163,8 +153,8 @@ pub unsafe fn _tzcnt_u64(x: u64) -> u64 {
 #[inline(always)]
 #[target_feature = "+bmi"]
 #[cfg_attr(test, assert_instr(tzcnt))]
-pub unsafe fn _mm_tzcnt_u32(x: u32) -> u32 {
-    x.trailing_zeros()
+pub unsafe fn _mm_tzcnt_32(x: u32) -> i32 {
+    x.trailing_zeros() as i32
 }
 
 /// Counts the number of trailing least significant zero bits.
@@ -173,8 +163,8 @@ pub unsafe fn _mm_tzcnt_u32(x: u32) -> u32 {
 #[inline(always)]
 #[target_feature = "+bmi"]
 #[cfg_attr(test, assert_instr(tzcnt))]
-pub unsafe fn _mm_tzcnt_u64(x: u64) -> u64 {
-    x.trailing_zeros() as u64
+pub unsafe fn _mm_tzcnt_64(x: u64) -> i64 {
+    x.trailing_zeros() as i64
 }
 
 #[allow(dead_code)]
@@ -290,13 +280,6 @@ mod tests {
         assert_eq!(r, 0b0010_0000u64);
     }
 
-    #[simd_test = "bmi"]
-    unsafe fn _tzcnt_u16() {
-        assert_eq!(bmi::_tzcnt_u16(0b0000_0001u16), 0u16);
-        assert_eq!(bmi::_tzcnt_u16(0b0000_0000u16), 16u16);
-        assert_eq!(bmi::_tzcnt_u16(0b1001_0000u16), 4u16);
-    }
-
     #[simd_test = "bmi"]
     unsafe fn _tzcnt_u32() {
         assert_eq!(bmi::_tzcnt_u32(0b0000_0001u32), 0u32);
diff --git a/coresimd/src/x86/i586/bmi2.rs b/coresimd/src/x86/i586/bmi2.rs
index f32778063a..fca7ef4c2d 100644
--- a/coresimd/src/x86/i586/bmi2.rs
+++ b/coresimd/src/x86/i586/bmi2.rs
@@ -55,8 +55,8 @@ pub unsafe fn _bzhi_u32(a: u32, index: u32) -> u32 {
 #[target_feature = "+bmi2"]
 #[cfg_attr(test, assert_instr(bzhi))]
 #[cfg(not(target_arch = "x86"))]
-pub unsafe fn _bzhi_u64(a: u64, index: u64) -> u64 {
-    x86_bmi2_bzhi_64(a, index)
+pub unsafe fn _bzhi_u64(a: u64, index: u32) -> u64 {
+    x86_bmi2_bzhi_64(a, index as u64)
 }
 
 /// Scatter contiguous low order bits of `a` to the result at the positions
diff --git a/coresimd/src/x86/i586/sse.rs b/coresimd/src/x86/i586/sse.rs
index 45f2d0ef70..7427515416 100644
--- a/coresimd/src/x86/i586/sse.rs
+++ b/coresimd/src/x86/i586/sse.rs
@@ -5,7 +5,7 @@ use core::ptr;
 
 use simd_llvm::simd_shuffle4;
 use v128::*;
-use v64::f32x2;
+use v64::*;
 
 #[cfg(test)]
 use stdsimd_test::assert_instr;
@@ -764,7 +764,7 @@ pub unsafe fn _mm_setzero_ps() -> f32x4 {
 #[inline(always)]
 #[target_feature = "+sse"]
 #[cfg_attr(test, assert_instr(shufps, mask = 3))]
-pub unsafe fn _mm_shuffle_ps(a: f32x4, b: f32x4, mask: i32) -> f32x4 {
+pub unsafe fn _mm_shuffle_ps(a: f32x4, b: f32x4, mask: u32) -> f32x4 {
     let mask = (mask & 0xFF) as u8;
 
     macro_rules! shuffle_done {
@@ -884,7 +884,7 @@ pub unsafe fn _mm_movemask_ps(a: f32x4) -> i32 {
 /// let a = f32x4::new(1.0, 2.0, 3.0, 4.0);
 /// let data: [f32; 4] = [5.0, 6.0, 7.0, 8.0];
 ///
-/// let r = unsafe { _mm_loadh_pi(a, data[..].as_ptr()) };
+/// let r = unsafe { _mm_loadh_pi(a, data[..].as_ptr() as *const _) };
 ///
 /// assert_eq!(r, f32x4::new(1.0, 2.0, 5.0, 6.0));
 /// #
@@ -906,7 +906,7 @@ pub unsafe fn _mm_movemask_ps(a: f32x4) -> i32 {
            assert_instr(unpcklps))]
 // TODO: This function is actually not limited to floats, but that's what
 // what matches the C type most closely: (__m128, *const __m64) -> __m128
-pub unsafe fn _mm_loadh_pi(a: f32x4, p: *const f32) -> f32x4 {
+pub unsafe fn _mm_loadh_pi(a: f32x4, p: *const u8x8) -> f32x4 {
     let q = p as *const f32x2;
     let b: f32x2 = *q;
     let bb = simd_shuffle4(b, b, [0, 1, 0, 1]);
@@ -936,7 +936,7 @@ pub unsafe fn _mm_loadh_pi(a: f32x4, p: *const f32) -> f32x4 {
 /// let a = f32x4::new(1.0, 2.0, 3.0, 4.0);
 /// let data: [f32; 4] = [5.0, 6.0, 7.0, 8.0];
 ///
-/// let r = unsafe { _mm_loadl_pi(a, data[..].as_ptr()) };
+/// let r = unsafe { _mm_loadl_pi(a, data[..].as_ptr() as *const _) };
 ///
 /// assert_eq!(r, f32x4::new(5.0, 6.0, 3.0, 4.0));
 /// #
@@ -957,7 +957,7 @@ pub unsafe fn _mm_loadh_pi(a: f32x4, p: *const f32) -> f32x4 {
 #[cfg_attr(all(test, target_arch = "x86", not(target_feature = "sse2")),
            assert_instr(movss))]
 // TODO: Like _mm_loadh_pi, this also isn't limited to floats.
-pub unsafe fn _mm_loadl_pi(a: f32x4, p: *const f32) -> f32x4 {
+pub unsafe fn _mm_loadl_pi(a: f32x4, p: *const u8x8) -> f32x4 {
     let q = p as *const f32x2;
     let b: f32x2 = *q;
     let bb = simd_shuffle4(b, b, [0, 1, 0, 1]);
@@ -1070,14 +1070,14 @@ pub unsafe fn _mm_loadr_ps(p: *const f32) -> f32x4 {
 // On i586 (no SSE2) it just generates plain MOV instructions.
 #[cfg_attr(all(test, any(target_arch = "x86_64", target_feature = "sse2")),
            assert_instr(movhpd))]
-pub unsafe fn _mm_storeh_pi(p: *mut u64, a: f32x4) {
+pub unsafe fn _mm_storeh_pi(p: *mut u8x8, a: f32x4) {
     #[cfg(target_arch = "x86")]
     {
         // If this is a `f64x2` then on i586, LLVM generates fldl & fstpl which
         // is just silly
         let a64: u64x2 = mem::transmute(a);
         let a_hi = a64.extract(1);
-        *p = a_hi;
+        *p = mem::transmute(a_hi);
     }
     #[cfg(target_arch = "x86_64")]
     {
@@ -1103,14 +1103,14 @@ pub unsafe fn _mm_storeh_pi(p: *mut u64, a: f32x4) {
 #[cfg_attr(all(test, any(target_arch = "x86_64", target_feature = "sse2"),
                target_family = "windows"),
            assert_instr(movsd))]
-pub unsafe fn _mm_storel_pi(p: *mut u64, a: f32x4) {
+pub unsafe fn _mm_storel_pi(p: *mut u8x8, a: f32x4) {
     #[cfg(target_arch = "x86")]
     {
         // Same as for _mm_storeh_pi: i586 code gen would use floating point
         // stack.
         let a64: u64x2 = mem::transmute(a);
         let a_hi = a64.extract(0);
-        *p = a_hi;
+        *p = mem::transmute(a_hi);
     }
     #[cfg(target_arch = "x86_64")]
     {
@@ -1688,18 +1688,15 @@ pub unsafe fn _mm_stream_ps(mem_addr: *mut f32, a: f32x4) {
 /// memory hint.
 #[inline(always)]
 #[target_feature = "+sse"]
-// generates movnti on i686 and x86_64 but just a mov on i586
-#[cfg_attr(all(test,
-               any(target_arch = "x86_64",
-                   all(target_arch = "x86", target_feature = "sse2"))),
-           assert_instr(movnti))]
-pub unsafe fn _mm_stream_pi(mem_addr: *mut i64, a: i64) {
+// #[cfg_attr(test, assert_instr(movntq))] // FIXME: llvm codegens wrong instr
+pub unsafe fn _mm_stream_pi(mem_addr: *mut i8x8, a: i8x8) {
     ::core::intrinsics::nontemporal_store(mem_addr, a);
 }
 
 #[cfg(test)]
 mod tests {
     use v128::*;
+    use v64::*;
     use x86::i586::sse;
     use stdsimd_test::simd_test;
     use test::black_box; // Used to inhibit constant-folding.
@@ -2971,7 +2968,7 @@ mod tests {
         let a = f32x4::new(1.0, 2.0, 3.0, 4.0);
         let x: [f32; 4] = [5.0, 6.0, 7.0, 8.0];
         let p = x[..].as_ptr();
-        let r = sse::_mm_loadh_pi(a, p);
+        let r = sse::_mm_loadh_pi(a, p as *const _);
         assert_eq!(r, f32x4::new(1.0, 2.0, 5.0, 6.0));
     }
 
@@ -2980,7 +2977,7 @@ mod tests {
         let a = f32x4::new(1.0, 2.0, 3.0, 4.0);
         let x: [f32; 4] = [5.0, 6.0, 7.0, 8.0];
         let p = x[..].as_ptr();
-        let r = sse::_mm_loadl_pi(a, p);
+        let r = sse::_mm_loadl_pi(a, p as *const _);
         assert_eq!(r, f32x4::new(5.0, 6.0, 3.0, 4.0));
     }
 
@@ -3052,7 +3049,7 @@ mod tests {
     unsafe fn _mm_storeh_pi() {
         let mut vals = [0.0f32; 8];
         let a = f32x4::new(1.0, 2.0, 3.0, 4.0);
-        sse::_mm_storeh_pi(vals.as_mut_ptr() as *mut f32 as *mut u64, a);
+        sse::_mm_storeh_pi(vals.as_mut_ptr() as *mut _, a);
 
         assert_eq!(vals[0], 3.0);
         assert_eq!(vals[1], 4.0);
@@ -3063,7 +3060,7 @@ mod tests {
     unsafe fn _mm_storel_pi() {
         let mut vals = [0.0f32; 8];
         let a = f32x4::new(1.0, 2.0, 3.0, 4.0);
-        sse::_mm_storel_pi(vals.as_mut_ptr() as *mut f32 as *mut u64, a);
+        sse::_mm_storel_pi(vals.as_mut_ptr() as *mut _, a);
 
         assert_eq!(vals[0], 1.0);
         assert_eq!(vals[1], 2.0);
@@ -3297,9 +3294,9 @@ mod tests {
 
     #[simd_test = "sse"]
     unsafe fn _mm_stream_pi() {
-        let a: i64 = 7;
-        let mut mem = ::std::boxed::Box::<i64>::new(-1);
-        sse::_mm_stream_pi(&mut *mem as *mut i64, a);
+        let a = i8x8::new(0, 0, 0, 0, 0, 0, 0, 7);
+        let mut mem = ::std::boxed::Box::<i8x8>::new(i8x8::splat(1));
+        sse::_mm_stream_pi(&mut *mem as *mut _ as *mut _, a);
         assert_eq!(a, *mem);
     }
 }
diff --git a/coresimd/src/x86/i586/sse2.rs b/coresimd/src/x86/i586/sse2.rs
index d9d05cd9e4..9ba19adb87 100644
--- a/coresimd/src/x86/i586/sse2.rs
+++ b/coresimd/src/x86/i586/sse2.rs
@@ -2804,9 +2804,9 @@ mod tests {
         let b =
             i8x16::new(15, 14, 2, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
         let r = sse2::_mm_cmpeq_epi8(a, b);
+        #[cfg_attr(rustfmt, rustfmt_skip)]
         assert_eq!(
             r,
-            #[cfg_attr(rustfmt, rustfmt_skip)]
             i8x16::new(
                 0, 0, 0xFFu8 as i8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
             )
@@ -3090,9 +3090,9 @@ mod tests {
         let a = i16x8::new(0x80, -0x81, 0, 0, 0, 0, 0, 0);
         let b = i16x8::new(0, 0, 0, 0, 0, 0, -0x81, 0x80);
         let r = sse2::_mm_packs_epi16(a, b);
+        #[cfg_attr(rustfmt, rustfmt_skip)]
         assert_eq!(
             r,
-            #[cfg_attr(rustfmt, rustfmt_skip)]
             i8x16::new(
                 0x7F, -0x80, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -0x80, 0x7F
             )
@@ -3139,7 +3139,8 @@ mod tests {
 
     #[simd_test = "sse2"]
     unsafe fn _mm_movemask_epi8() {
-        let a = i8x16::from(#[cfg_attr(rustfmt, rustfmt_skip)] u8x16::new(
+        #[cfg_attr(rustfmt, rustfmt_skip)]
+        let a = i8x16::from(u8x16::new(
             0b1000_0000, 0b0, 0b1000_0000, 0b01, 0b0101, 0b1111_0000, 0, 0,
                 0, 0, 0b1111_0000, 0b0101, 0b01, 0b1000_0000, 0b0, 0b1000_0000, ));
         let r = sse2::_mm_movemask_epi8(a);
diff --git a/coresimd/src/x86/i586/sse41.rs b/coresimd/src/x86/i586/sse41.rs
index d3fb5135bf..60f972f4d0 100644
--- a/coresimd/src/x86/i586/sse41.rs
+++ b/coresimd/src/x86/i586/sse41.rs
@@ -61,7 +61,7 @@ pub unsafe fn _mm_blendv_epi8(a: i8x16, b: i8x16, mask: i8x16) -> i8x16 {
 #[inline(always)]
 #[target_feature = "+sse4.1"]
 #[cfg_attr(test, assert_instr(pblendw, imm8 = 0xF0))]
-pub unsafe fn _mm_blend_epi16(a: i16x8, b: i16x8, imm8: u8) -> i16x8 {
+pub unsafe fn _mm_blend_epi16(a: i16x8, b: i16x8, imm8: i32) -> i16x8 {
     macro_rules! call {
         ($imm8:expr) => { pblendw(a, b, $imm8) }
     }
@@ -91,7 +91,7 @@ pub unsafe fn _mm_blendv_ps(a: f32x4, b: f32x4, mask: f32x4) -> f32x4 {
 #[inline(always)]
 #[target_feature = "+sse4.1"]
 #[cfg_attr(test, assert_instr(blendpd, imm2 = 0b10))]
-pub unsafe fn _mm_blend_pd(a: f64x2, b: f64x2, imm2: u8) -> f64x2 {
+pub unsafe fn _mm_blend_pd(a: f64x2, b: f64x2, imm2: i32) -> f64x2 {
     macro_rules! call {
         ($imm2:expr) => { blendpd(a, b, $imm2) }
     }
@@ -103,7 +103,7 @@ pub unsafe fn _mm_blend_pd(a: f64x2, b: f64x2, imm2: u8) -> f64x2 {
 #[inline(always)]
 #[target_feature = "+sse4.1"]
 #[cfg_attr(test, assert_instr(blendps, imm4 = 0b0101))]
-pub unsafe fn _mm_blend_ps(a: f32x4, b: f32x4, imm4: u8) -> f32x4 {
+pub unsafe fn _mm_blend_ps(a: f32x4, b: f32x4, imm4: i32) -> f32x4 {
     macro_rules! call {
         ($imm4:expr) => { blendps(a, b, $imm4) }
     }
@@ -116,7 +116,7 @@ pub unsafe fn _mm_blend_ps(a: f32x4, b: f32x4, imm4: u8) -> f32x4 {
 #[target_feature = "+sse4.1"]
 // TODO: Add test for Windows
 #[cfg_attr(all(test, not(windows)), assert_instr(extractps, imm8 = 0))]
-pub unsafe fn _mm_extract_ps(a: f32x4, imm8: u8) -> i32 {
+pub unsafe fn _mm_extract_ps(a: f32x4, imm8: i32) -> i32 {
     mem::transmute(a.extract(imm8 as u32 & 0b11))
 }
 
@@ -167,7 +167,7 @@ pub unsafe fn _mm_extract_epi32(a: i32x4, imm8: i32) -> i32 {
 #[inline(always)]
 #[target_feature = "+sse4.1"]
 #[cfg_attr(test, assert_instr(insertps, imm8 = 0b1010))]
-pub unsafe fn _mm_insert_ps(a: f32x4, b: f32x4, imm8: u8) -> f32x4 {
+pub unsafe fn _mm_insert_ps(a: f32x4, b: f32x4, imm8: i32) -> f32x4 {
     macro_rules! call {
         ($imm8:expr) => { insertps(a, b, $imm8) }
     }
@@ -179,7 +179,7 @@ pub unsafe fn _mm_insert_ps(a: f32x4, b: f32x4, imm8: u8) -> f32x4 {
 #[inline(always)]
 #[target_feature = "+sse4.1"]
 #[cfg_attr(test, assert_instr(pinsrb, imm8 = 0))]
-pub unsafe fn _mm_insert_epi8(a: i8x16, i: i8, imm8: u8) -> i8x16 {
+pub unsafe fn _mm_insert_epi8(a: i8x16, i: i8, imm8: i32) -> i8x16 {
     a.replace((imm8 & 0b1111) as u32, i)
 }
 
@@ -188,7 +188,7 @@ pub unsafe fn _mm_insert_epi8(a: i8x16, i: i8, imm8: u8) -> i8x16 {
 #[inline(always)]
 #[target_feature = "+sse4.1"]
 #[cfg_attr(test, assert_instr(pinsrd, imm8 = 0))]
-pub unsafe fn _mm_insert_epi32(a: i32x4, i: i32, imm8: u8) -> i32x4 {
+pub unsafe fn _mm_insert_epi32(a: i32x4, i: i32, imm8: i32) -> i32x4 {
     a.replace((imm8 & 0b11) as u32, i)
 }
 
@@ -391,7 +391,7 @@ pub unsafe fn _mm_cvtepu32_epi64(a: u32x4) -> i64x2 {
 #[inline(always)]
 #[target_feature = "+sse4.1"]
 #[cfg_attr(test, assert_instr(dppd, imm8 = 0))]
-pub unsafe fn _mm_dp_pd(a: f64x2, b: f64x2, imm8: u8) -> f64x2 {
+pub unsafe fn _mm_dp_pd(a: f64x2, b: f64x2, imm8: i32) -> f64x2 {
     macro_rules! call {
         ($imm8:expr) => { dppd(a, b, $imm8) }
     }
@@ -408,7 +408,7 @@ pub unsafe fn _mm_dp_pd(a: f64x2, b: f64x2, imm8: u8) -> f64x2 {
 #[inline(always)]
 #[target_feature = "+sse4.1"]
 #[cfg_attr(test, assert_instr(dpps, imm8 = 0))]
-pub unsafe fn _mm_dp_ps(a: f32x4, b: f32x4, imm8: u8) -> f32x4 {
+pub unsafe fn _mm_dp_ps(a: f32x4, b: f32x4, imm8: i32) -> f32x4 {
     macro_rules! call {
         ($imm8:expr) => { dpps(a, b, $imm8) }
     }
@@ -705,7 +705,7 @@ pub unsafe fn _mm_mullo_epi32(a: i32x4, b: i32x4) -> i32x4 {
 #[inline(always)]
 #[target_feature = "+sse4.1"]
 #[cfg_attr(test, assert_instr(mpsadbw, imm8 = 0))]
-pub unsafe fn _mm_mpsadbw_epu8(a: u8x16, b: u8x16, imm8: u8) -> u16x8 {
+pub unsafe fn _mm_mpsadbw_epu8(a: u8x16, b: u8x16, imm8: i32) -> u16x8 {
     macro_rules! call {
         ($imm8:expr) => { mpsadbw(a, b, $imm8) }
     }
diff --git a/coresimd/src/x86/i586/sse42.rs b/coresimd/src/x86/i586/sse42.rs
index caf2e36267..2e9b7dec38 100644
--- a/coresimd/src/x86/i586/sse42.rs
+++ b/coresimd/src/x86/i586/sse42.rs
@@ -8,49 +8,49 @@ use stdsimd_test::assert_instr;
 use v128::*;
 
 /// String contains unsigned 8-bit characters *(Default)*
-pub const _SIDD_UBYTE_OPS: i8 = 0b0000_0000;
+pub const _SIDD_UBYTE_OPS: i32 = 0b0000_0000;
 /// String contains unsigned 16-bit characters
-pub const _SIDD_UWORD_OPS: i8 = 0b0000_0001;
+pub const _SIDD_UWORD_OPS: i32 = 0b0000_0001;
 /// String contains signed 8-bit characters
-pub const _SIDD_SBYTE_OPS: i8 = 0b0000_0010;
+pub const _SIDD_SBYTE_OPS: i32 = 0b0000_0010;
-/// String contains unsigned 16-bit characters
+/// String contains signed 16-bit characters
-pub const _SIDD_SWORD_OPS: i8 = 0b0000_0011;
+pub const _SIDD_SWORD_OPS: i32 = 0b0000_0011;
 
 /// For each character in `a`, find if it is in `b` *(Default)*
-pub const _SIDD_CMP_EQUAL_ANY: i8 = 0b0000_0000;
+pub const _SIDD_CMP_EQUAL_ANY: i32 = 0b0000_0000;
 /// For each character in `a`, determine if
 /// `b[0] <= c <= b[1] or b[1] <= c <= b[2]...`
-pub const _SIDD_CMP_RANGES: i8 = 0b0000_0100;
+pub const _SIDD_CMP_RANGES: i32 = 0b0000_0100;
 /// The strings defined by `a` and `b` are equal
-pub const _SIDD_CMP_EQUAL_EACH: i8 = 0b0000_1000;
+pub const _SIDD_CMP_EQUAL_EACH: i32 = 0b0000_1000;
 /// Search for the defined substring in the target
-pub const _SIDD_CMP_EQUAL_ORDERED: i8 = 0b0000_1100;
+pub const _SIDD_CMP_EQUAL_ORDERED: i32 = 0b0000_1100;
 
 /// Do not negate results *(Default)*
-pub const _SIDD_POSITIVE_POLARITY: i8 = 0b0000_0000;
+pub const _SIDD_POSITIVE_POLARITY: i32 = 0b0000_0000;
 /// Negate results
-pub const _SIDD_NEGATIVE_POLARITY: i8 = 0b0001_0000;
+pub const _SIDD_NEGATIVE_POLARITY: i32 = 0b0001_0000;
 /// Do not negate results before the end of the string
-pub const _SIDD_MASKED_POSITIVE_POLARITY: i8 = 0b0010_0000;
+pub const _SIDD_MASKED_POSITIVE_POLARITY: i32 = 0b0010_0000;
 /// Negate results only before the end of the string
-pub const _SIDD_MASKED_NEGATIVE_POLARITY: i8 = 0b0011_0000;
+pub const _SIDD_MASKED_NEGATIVE_POLARITY: i32 = 0b0011_0000;
 
 /// **Index only**: return the least significant bit *(Default)*
-pub const _SIDD_LEAST_SIGNIFICANT: i8 = 0b0000_0000;
+pub const _SIDD_LEAST_SIGNIFICANT: i32 = 0b0000_0000;
 /// **Index only**: return the most significant bit
-pub const _SIDD_MOST_SIGNIFICANT: i8 = 0b0100_0000;
+pub const _SIDD_MOST_SIGNIFICANT: i32 = 0b0100_0000;
 
 /// **Mask only**: return the bit mask
-pub const _SIDD_BIT_MASK: i8 = 0b0000_0000;
+pub const _SIDD_BIT_MASK: i32 = 0b0000_0000;
 /// **Mask only**: return the byte mask
-pub const _SIDD_UNIT_MASK: i8 = 0b0100_0000;
+pub const _SIDD_UNIT_MASK: i32 = 0b0100_0000;
 
 /// Compare packed strings with implicit lengths in `a` and `b` using the
 /// control in `imm8`, and return the generated mask.
 #[inline(always)]
 #[target_feature = "+sse4.2"]
 #[cfg_attr(test, assert_instr(pcmpistrm, imm8 = 0))]
-pub unsafe fn _mm_cmpistrm(a: __m128i, b: __m128i, imm8: i8) -> __m128i {
+pub unsafe fn _mm_cmpistrm(a: __m128i, b: __m128i, imm8: i32) -> __m128i {
     macro_rules! call {
         ($imm8:expr) => { __m128i::from(pcmpistrm128(i8x16::from(a), i8x16::from(b), $imm8)) }
     }
@@ -270,7 +270,7 @@ pub unsafe fn _mm_cmpistrm(a: __m128i, b: __m128i, imm8: i8) -> __m128i {
 #[inline(always)]
 #[target_feature = "+sse4.2"]
 #[cfg_attr(test, assert_instr(pcmpistri, imm8 = 0))]
-pub unsafe fn _mm_cmpistri(a: __m128i, b: __m128i, imm8: i8) -> i32 {
+pub unsafe fn _mm_cmpistri(a: __m128i, b: __m128i, imm8: i32) -> i32 {
     macro_rules! call {
         ($imm8:expr) => { pcmpistri128(i8x16::from(a), i8x16::from(b), $imm8) }
     }
@@ -283,7 +283,7 @@ pub unsafe fn _mm_cmpistri(a: __m128i, b: __m128i, imm8: i8) -> i32 {
 #[inline(always)]
 #[target_feature = "+sse4.2"]
 #[cfg_attr(test, assert_instr(pcmpistri, imm8 = 0))]
-pub unsafe fn _mm_cmpistrz(a: __m128i, b: __m128i, imm8: i8) -> i32 {
+pub unsafe fn _mm_cmpistrz(a: __m128i, b: __m128i, imm8: i32) -> i32 {
     macro_rules! call {
         ($imm8:expr) => { pcmpistriz128(i8x16::from(a),
                                         i8x16::from(b),
@@ -298,7 +298,7 @@ pub unsafe fn _mm_cmpistrz(a: __m128i, b: __m128i, imm8: i8) -> i32 {
 #[inline(always)]
 #[target_feature = "+sse4.2"]
 #[cfg_attr(test, assert_instr(pcmpistri, imm8 = 0))]
-pub unsafe fn _mm_cmpistrc(a: __m128i, b: __m128i, imm8: i8) -> i32 {
+pub unsafe fn _mm_cmpistrc(a: __m128i, b: __m128i, imm8: i32) -> i32 {
     macro_rules! call {
         ($imm8:expr) => { pcmpistric128(i8x16::from(a), i8x16::from(b), $imm8) }
     }
@@ -311,7 +311,7 @@ pub unsafe fn _mm_cmpistrc(a: __m128i, b: __m128i, imm8: i8) -> i32 {
 #[inline(always)]
 #[target_feature = "+sse4.2"]
 #[cfg_attr(test, assert_instr(pcmpistri, imm8 = 0))]
-pub unsafe fn _mm_cmpistrs(a: __m128i, b: __m128i, imm8: i8) -> i32 {
+pub unsafe fn _mm_cmpistrs(a: __m128i, b: __m128i, imm8: i32) -> i32 {
     macro_rules! call {
         ($imm8:expr) => { pcmpistris128(i8x16::from(a), i8x16::from(b), $imm8) }
     }
@@ -323,7 +323,7 @@ pub unsafe fn _mm_cmpistrs(a: __m128i, b: __m128i, imm8: i8) -> i32 {
 #[inline(always)]
 #[target_feature = "+sse4.2"]
 #[cfg_attr(test, assert_instr(pcmpistri, imm8 = 0))]
-pub unsafe fn _mm_cmpistro(a: __m128i, b: __m128i, imm8: i8) -> i32 {
+pub unsafe fn _mm_cmpistro(a: __m128i, b: __m128i, imm8: i32) -> i32 {
     macro_rules! call {
         ($imm8:expr) => { pcmpistrio128(i8x16::from(a), i8x16::from(b), $imm8) }
     }
@@ -336,7 +336,7 @@ pub unsafe fn _mm_cmpistro(a: __m128i, b: __m128i, imm8: i8) -> i32 {
 #[inline(always)]
 #[target_feature = "+sse4.2"]
 #[cfg_attr(test, assert_instr(pcmpistri, imm8 = 0))]
-pub unsafe fn _mm_cmpistra(a: __m128i, b: __m128i, imm8: i8) -> i32 {
+pub unsafe fn _mm_cmpistra(a: __m128i, b: __m128i, imm8: i32) -> i32 {
     macro_rules! call {
         ($imm8:expr) => { pcmpistria128(i8x16::from(a), i8x16::from(b), $imm8) }
     }
@@ -349,7 +349,7 @@ pub unsafe fn _mm_cmpistra(a: __m128i, b: __m128i, imm8: i8) -> i32 {
 #[target_feature = "+sse4.2"]
 #[cfg_attr(test, assert_instr(pcmpestrm, imm8 = 0))]
 pub unsafe fn _mm_cmpestrm(
-    a: __m128i, la: i32, b: __m128i, lb: i32, imm8: i8
+    a: __m128i, la: i32, b: __m128i, lb: i32, imm8: i32
 ) -> __m128i {
     macro_rules! call {
         ($imm8:expr) => { __m128i::from(pcmpestrm128(i8x16::from(a), la,
@@ -445,7 +445,7 @@ pub unsafe fn _mm_cmpestrm(
 #[target_feature = "+sse4.2"]
 #[cfg_attr(test, assert_instr(pcmpestri, imm8 = 0))]
 pub unsafe fn _mm_cmpestri(
-    a: __m128i, la: i32, b: __m128i, lb: i32, imm8: i8
+    a: __m128i, la: i32, b: __m128i, lb: i32, imm8: i32
 ) -> i32 {
     macro_rules! call {
         ($imm8:expr) => { pcmpestri128(i8x16::from(a), la, i8x16::from(b), lb, $imm8) }
@@ -460,7 +460,7 @@ pub unsafe fn _mm_cmpestri(
 #[target_feature = "+sse4.2"]
 #[cfg_attr(test, assert_instr(pcmpestri, imm8 = 0))]
 pub unsafe fn _mm_cmpestrz(
-    a: __m128i, la: i32, b: __m128i, lb: i32, imm8: i8
+    a: __m128i, la: i32, b: __m128i, lb: i32, imm8: i32
 ) -> i32 {
     macro_rules! call {
         ($imm8:expr) => { pcmpestriz128(i8x16::from(a), la, i8x16::from(b), lb, $imm8) }
@@ -475,7 +475,7 @@ pub unsafe fn _mm_cmpestrz(
 #[target_feature = "+sse4.2"]
 #[cfg_attr(test, assert_instr(pcmpestri, imm8 = 0))]
 pub unsafe fn _mm_cmpestrc(
-    a: __m128i, la: i32, b: __m128i, lb: i32, imm8: i8
+    a: __m128i, la: i32, b: __m128i, lb: i32, imm8: i32
 ) -> i32 {
     macro_rules! call {
         ($imm8:expr) => { pcmpestric128(i8x16::from(a), la, i8x16::from(b), lb, $imm8) }
@@ -490,7 +490,7 @@ pub unsafe fn _mm_cmpestrc(
 #[target_feature = "+sse4.2"]
 #[cfg_attr(test, assert_instr(pcmpestri, imm8 = 0))]
 pub unsafe fn _mm_cmpestrs(
-    a: __m128i, la: i32, b: __m128i, lb: i32, imm8: i8
+    a: __m128i, la: i32, b: __m128i, lb: i32, imm8: i32
 ) -> i32 {
     macro_rules! call {
         ($imm8:expr) => { pcmpestris128(i8x16::from(a), la, i8x16::from(b), lb, $imm8) }
@@ -505,7 +505,7 @@ pub unsafe fn _mm_cmpestrs(
 #[target_feature = "+sse4.2"]
 #[cfg_attr(test, assert_instr(pcmpestri, imm8 = 0))]
 pub unsafe fn _mm_cmpestro(
-    a: __m128i, la: i32, b: __m128i, lb: i32, imm8: i8
+    a: __m128i, la: i32, b: __m128i, lb: i32, imm8: i32
 ) -> i32 {
     macro_rules! call {
         ($imm8:expr) => { pcmpestrio128(i8x16::from(a), la, i8x16::from(b), lb, $imm8) }
@@ -521,7 +521,7 @@ pub unsafe fn _mm_cmpestro(
 #[target_feature = "+sse4.2"]
 #[cfg_attr(test, assert_instr(pcmpestri, imm8 = 0))]
 pub unsafe fn _mm_cmpestra(
-    a: __m128i, la: i32, b: __m128i, lb: i32, imm8: i8
+    a: __m128i, la: i32, b: __m128i, lb: i32, imm8: i32
 ) -> i32 {
     macro_rules! call {
         ($imm8:expr) => { pcmpestria128(i8x16::from(a), la, i8x16::from(b), lb, $imm8) }
diff --git a/coresimd/src/x86/i586/xsave.rs b/coresimd/src/x86/i586/xsave.rs
index 8317c3ebd1..15e1f8b8ad 100644
--- a/coresimd/src/x86/i586/xsave.rs
+++ b/coresimd/src/x86/i586/xsave.rs
@@ -36,7 +36,7 @@ extern "C" {
 #[inline(always)]
 #[target_feature = "+xsave"]
 #[cfg_attr(test, assert_instr(xsave))]
-pub unsafe fn _xsave(mem_addr: *mut u8, save_mask: u64) -> () {
+pub unsafe fn _xsave(mem_addr: *mut u8, save_mask: u64) {
     xsave(mem_addr, (save_mask >> 32) as u32, save_mask as u32);
 }
 
@@ -49,7 +49,7 @@ pub unsafe fn _xsave(mem_addr: *mut u8, save_mask: u64) -> () {
 #[inline(always)]
 #[target_feature = "+xsave"]
 #[cfg_attr(test, assert_instr(xrstor))]
-pub unsafe fn _xrstor(mem_addr: *const u8, rs_mask: u64) -> () {
+pub unsafe fn _xrstor(mem_addr: *const u8, rs_mask: u64) {
     xrstor(mem_addr, (rs_mask >> 32) as u32, rs_mask as u32);
 }
 
@@ -65,7 +65,7 @@ const _XCR_XFEATURE_ENABLED_MASK: u32 = 0;
 #[inline(always)]
 #[target_feature = "+xsave"]
 #[cfg_attr(test, assert_instr(xsetbv))]
-pub unsafe fn _xsetbv(a: u32, val: u64) -> () {
+pub unsafe fn _xsetbv(a: u32, val: u64) {
     xsetbv(a, (val >> 32) as u32, val as u32);
 }
 
@@ -88,7 +88,7 @@ pub unsafe fn _xgetbv(xcr_no: u32) -> u64 {
 #[inline(always)]
 #[target_feature = "+xsave,+xsaveopt"]
 #[cfg_attr(test, assert_instr(xsaveopt))]
-pub unsafe fn _xsaveopt(mem_addr: *mut u8, save_mask: u64) -> () {
+pub unsafe fn _xsaveopt(mem_addr: *mut u8, save_mask: u64) {
     xsaveopt(mem_addr, (save_mask >> 32) as u32, save_mask as u32);
 }
 
@@ -101,7 +101,7 @@ pub unsafe fn _xsaveopt(mem_addr: *mut u8, save_mask: u64) -> () {
 #[inline(always)]
 #[target_feature = "+xsave,+xsavec"]
 #[cfg_attr(test, assert_instr(xsavec))]
-pub unsafe fn _xsavec(mem_addr: *mut u8, save_mask: u64) -> () {
+pub unsafe fn _xsavec(mem_addr: *mut u8, save_mask: u64) {
     xsavec(mem_addr, (save_mask >> 32) as u32, save_mask as u32);
 }
 
@@ -115,7 +115,7 @@ pub unsafe fn _xsavec(mem_addr: *mut u8, save_mask: u64) -> () {
 #[inline(always)]
 #[target_feature = "+xsave,+xsaves"]
 #[cfg_attr(test, assert_instr(xsaves))]
-pub unsafe fn _xsaves(mem_addr: *mut u8, save_mask: u64) -> () {
+pub unsafe fn _xsaves(mem_addr: *mut u8, save_mask: u64) {
     xsaves(mem_addr, (save_mask >> 32) as u32, save_mask as u32);
 }
 
@@ -131,7 +131,7 @@ pub unsafe fn _xsaves(mem_addr: *mut u8, save_mask: u64) -> () {
 #[inline(always)]
 #[target_feature = "+xsave,+xsaves"]
 #[cfg_attr(test, assert_instr(xrstors))]
-pub unsafe fn _xrstors(mem_addr: *const u8, rs_mask: u64) -> () {
+pub unsafe fn _xrstors(mem_addr: *const u8, rs_mask: u64) {
     xrstors(mem_addr, (rs_mask >> 32) as u32, rs_mask as u32);
 }
 
diff --git a/coresimd/src/x86/i686/mmx.rs b/coresimd/src/x86/i686/mmx.rs
index d4aa35b20a..df9cb98336 100644
--- a/coresimd/src/x86/i686/mmx.rs
+++ b/coresimd/src/x86/i686/mmx.rs
@@ -8,7 +8,7 @@
 //!
 //! [intel64_ref]: http://www.intel.de/content/dam/www/public/us/en/documents/manuals/64-ia-32-architectures-software-developer-instruction-set-reference-manual-325383.pdf
 
-use v64::{__m64, i16x4, i32x2, i8x8};
+use v64::*;
 use core::mem;
 
 #[cfg(test)]
diff --git a/coresimd/src/x86/i686/sse.rs b/coresimd/src/x86/i686/sse.rs
index a5bd78b285..c63613543a 100644
--- a/coresimd/src/x86/i686/sse.rs
+++ b/coresimd/src/x86/i686/sse.rs
@@ -1,7 +1,7 @@
 //! `i686` Streaming SIMD Extensions (SSE)
 
 use v128::f32x4;
-use v64::{__m64, i16x4, i32x2, i8x8, u16x4, u8x8};
+use v64::*;
 use core::mem;
 use x86::i586;
 use x86::i686::mmx;
@@ -184,7 +184,7 @@ pub unsafe fn _m_pavgw(a: u16x4, b: u16x4) -> u16x4 {
 #[inline(always)]
 #[target_feature = "+sse"]
 #[cfg_attr(test, assert_instr(psadbw))]
-pub unsafe fn _mm_sad_pu8(a: u8x8, b: u8x8) -> u64 {
+pub unsafe fn _mm_sad_pu8(a: u8x8, b: u8x8) -> u16x4 {
     mem::transmute(psadbw(mem::transmute(a), mem::transmute(b)))
 }
 
@@ -195,8 +195,8 @@ pub unsafe fn _mm_sad_pu8(a: u8x8, b: u8x8) -> u64 {
 #[inline(always)]
 #[target_feature = "+sse"]
 #[cfg_attr(test, assert_instr(psadbw))]
-pub unsafe fn _m_psadbw(a: u8x8, b: u8x8) -> u64 {
+pub unsafe fn _m_psadbw(a: u8x8, b: u8x8) -> u16x4 {
     _mm_sad_pu8(a, b)
 }
 
 /// Converts two elements of a 64-bit vector of [2 x i32] into two
@@ -330,7 +330,7 @@ pub unsafe fn _m_pmovmskb(a: i16x4) -> i32 {
 #[inline(always)]
 #[target_feature = "+sse"]
 #[cfg_attr(test, assert_instr(pshufw, imm8 = 0))]
-pub unsafe fn _mm_shuffle_pi16(a: i16x4, imm8: i8) -> i16x4 {
+pub unsafe fn _mm_shuffle_pi16(a: i16x4, imm8: i32) -> i16x4 {
     macro_rules! call {
         ($imm8:expr) => { mem::transmute(pshufw(mem::transmute(a), $imm8)) }
     }
@@ -342,7 +342,7 @@ pub unsafe fn _mm_shuffle_pi16(a: i16x4, imm8: i8) -> i16x4 {
 #[inline(always)]
 #[target_feature = "+sse"]
 #[cfg_attr(test, assert_instr(pshufw, imm8 = 0))]
-pub unsafe fn _m_pshufw(a: i16x4, imm8: i8) -> i16x4 {
+pub unsafe fn _m_pshufw(a: i16x4, imm8: i32) -> i16x4 {
     _mm_shuffle_pi16(a, imm8)
 }
 
@@ -485,10 +485,10 @@ mod tests {
         let a = u8x8::new(255, 254, 253, 252, 1, 2, 3, 4);
         let b = u8x8::new(0, 0, 0, 0, 2, 1, 2, 1);
         let r = sse::_mm_sad_pu8(a, b);
-        assert_eq!(r, 1020);
+        assert_eq!(r, u16x4::new(1020, 0, 0, 0));
 
         let r = sse::_m_psadbw(a, b);
-        assert_eq!(r, 1020);
+        assert_eq!(r, u16x4::new(1020, 0, 0, 0));
     }
 
     #[simd_test = "sse"]
diff --git a/coresimd/src/x86/i686/sse2.rs b/coresimd/src/x86/i686/sse2.rs
index c1b52db446..6a2d4f3fb3 100644
--- a/coresimd/src/x86/i686/sse2.rs
+++ b/coresimd/src/x86/i686/sse2.rs
@@ -2,7 +2,7 @@
 
 use core::mem;
 use v128::*;
-use v64::{__m64, i32x2};
+use v64::*;
 
 #[cfg(test)]
 use stdsimd_test::assert_instr;
@@ -25,56 +25,22 @@ pub unsafe fn _mm_cvtsi64x_sd(a: f64x2, b: i64) -> f64x2 {
     _mm_cvtsi64_sd(a, b)
 }
 
-/// Return a vector whose lowest element is `a` and all higher elements are
-/// `0`.
-#[inline(always)]
-#[target_feature = "+sse2"]
-// no particular instruction to test
-pub unsafe fn _mm_cvtsi64_si128(a: i64) -> i64x2 {
-    i64x2::new(a, 0)
-}
-
-/// Return a vector whose lowest element is `a` and all higher elements are
-/// `0`.
-#[inline(always)]
-#[target_feature = "+sse2"]
-// no particular instruction to test
-pub unsafe fn _mm_cvtsi64x_si128(a: i64) -> i64x2 {
-    _mm_cvtsi64_si128(a)
-}
-
-/// Return the lowest element of `a`.
-#[inline(always)]
-#[target_feature = "+sse2"]
-// no particular instruction to test
-pub unsafe fn _mm_cvtsi128_si64(a: i64x2) -> i64 {
-    a.extract(0)
-}
-
-/// Return the lowest element of `a`.
-#[inline(always)]
-#[target_feature = "+sse2"]
-// no particular instruction to test
-pub unsafe fn _mm_cvtsi128_si64x(a: i64x2) -> i64 {
-    _mm_cvtsi128_si64(a)
-}
-
 /// Returns the lower 64 bits of a 128-bit integer vector as a 64-bit
 /// integer.
 #[inline(always)]
 #[target_feature = "+sse2"]
-// no particular instruction to test
-pub unsafe fn _mm_movepi64_pi64(a: i64x2) -> i64 {
-    a.extract(0)
+// #[cfg_attr(test, assert_instr(movdq2q))] // FIXME: llvm codegens wrong instr?
+pub unsafe fn _mm_movepi64_pi64(a: i64x2) -> i8x8 {
+    mem::transmute(a.extract(0))
 }
 
 /// Moves the 64-bit operand to a 128-bit integer vector, zeroing the
 /// upper bits.
 #[inline(always)]
 #[target_feature = "+sse2"]
-// #[cfg_attr(test, assert_instr(movq2dq))] FIXME
-pub unsafe fn _mm_movpi64_epi64(a: i64) -> i64x2 {
-    i64x2::new(a, 0)
+// #[cfg_attr(test, assert_instr(movq2dq))] // FIXME: llvm codegens wrong instr?
+pub unsafe fn _mm_movpi64_epi64(a: i8x8) -> i64x2 {
+    i64x2::new(mem::transmute(a), 0)
 }
 
 /// Converts the two double-precision floating-point elements of a
@@ -112,7 +78,7 @@ mod tests {
     use stdsimd_test::simd_test;
 
     use v128::*;
-    use v64::i32x2;
+    use v64::*;
     use x86::i686::sse2;
 
     #[simd_test = "sse2"]
@@ -122,27 +88,15 @@ mod tests {
         assert_eq!(r, f64x2::new(5.0, 3.5));
     }
 
-    #[simd_test = "sse2"]
-    unsafe fn _mm_cvtsi64_si128() {
-        let r = sse2::_mm_cvtsi64_si128(5);
-        assert_eq!(r, i64x2::new(5, 0));
-    }
-
-    #[simd_test = "sse2"]
-    unsafe fn _mm_cvtsi128_si64() {
-        let r = sse2::_mm_cvtsi128_si64(i64x2::new(5, 0));
-        assert_eq!(r, 5);
-    }
-
     #[simd_test = "sse2"]
     unsafe fn _mm_movepi64_pi64() {
         let r = sse2::_mm_movepi64_pi64(i64x2::new(5, 0));
-        assert_eq!(r, 5);
+        assert_eq!(r, i8x8::new(5, 0, 0, 0, 0, 0, 0, 0));
     }
 
     #[simd_test = "sse2"]
     unsafe fn _mm_movpi64_epi64() {
-        let r = sse2::_mm_movpi64_epi64(5);
+        let r = sse2::_mm_movpi64_epi64(i8x8::new(5, 0, 0, 0, 0, 0, 0, 0));
         assert_eq!(r, i64x2::new(5, 0));
     }
 
diff --git a/coresimd/src/x86/i686/sse41.rs b/coresimd/src/x86/i686/sse41.rs
index 28c2d78f7f..9a9810b916 100644
--- a/coresimd/src/x86/i686/sse41.rs
+++ b/coresimd/src/x86/i686/sse41.rs
@@ -15,31 +15,6 @@ extern "C" {
     fn ptestnzc(a: i64x2, mask: i64x2) -> i32;
 }
 
-/// Extract an 64-bit integer from `a` selected with `imm8`
-#[inline(always)]
-#[target_feature = "+sse4.1"]
-// TODO: Add test for Windows
-#[cfg_attr(all(test, not(windows), target_arch = "x86_64"),
-           assert_instr(pextrq, imm8 = 1))]
-// On x86 this emits 2 pextrd instructions
-#[cfg_attr(all(test, not(windows), target_arch = "x86"),
-           assert_instr(pextrd, imm8 = 1))]
-pub unsafe fn _mm_extract_epi64(a: i64x2, imm8: i32) -> i64 {
-    let imm8 = (imm8 & 1) as u32;
-    a.extract_unchecked(imm8)
-}
-
-/// Return a copy of `a` with the 64-bit integer from `i` inserted at a
-/// location specified by `imm8`.
-#[inline(always)]
-#[target_feature = "+sse4.1"]
-#[cfg_attr(all(test, target_arch = "x86_64"), assert_instr(pinsrq, imm8 = 0))]
-// On x86 this emits 2 pinsrd instructions
-#[cfg_attr(all(test, target_arch = "x86"), assert_instr(pinsrd, imm8 = 0))]
-pub unsafe fn _mm_insert_epi64(a: i64x2, i: i64, imm8: u8) -> i64x2 {
-    a.replace((imm8 & 0b1) as u32, i)
-}
-
 /// Tests whether the specified bits in a 128-bit integer vector are all
 /// zeros.
 ///
@@ -165,25 +140,6 @@ mod tests {
     use x86::i686::sse41;
     use v128::*;
 
-    #[simd_test = "sse4.1"]
-    unsafe fn _mm_extract_epi64() {
-        let a = i64x2::new(0, 1);
-        let r = sse41::_mm_extract_epi64(a, 1);
-        assert_eq!(r, 1);
-        let r = sse41::_mm_extract_epi64(a, 3);
-        assert_eq!(r, 1);
-    }
-
-    #[simd_test = "sse4.1"]
-    unsafe fn _mm_insert_epi64() {
-        let a = i64x2::splat(0);
-        let e = i64x2::splat(0).replace(1, 32);
-        let r = sse41::_mm_insert_epi64(a, 32, 1);
-        assert_eq!(r, e);
-        let r = sse41::_mm_insert_epi64(a, 32, 3);
-        assert_eq!(r, e);
-    }
-
     #[simd_test = "sse4.1"]
     unsafe fn _mm_testz_si128() {
         let a = i8x16::splat(1);
diff --git a/coresimd/src/x86/i686/ssse3.rs b/coresimd/src/x86/i686/ssse3.rs
index d3eea79af2..e117ef441c 100644
--- a/coresimd/src/x86/i686/ssse3.rs
+++ b/coresimd/src/x86/i686/ssse3.rs
@@ -47,7 +47,7 @@ pub unsafe fn _mm_shuffle_pi8(a: u8x8, b: u8x8) -> u8x8 {
 #[inline(always)]
 #[target_feature = "+ssse3"]
 #[cfg_attr(test, assert_instr(palignr, n = 15))]
-pub unsafe fn _mm_alignr_pi8(a: u8x8, b: u8x8, n: u8) -> u8x8 {
+pub unsafe fn _mm_alignr_pi8(a: u8x8, b: u8x8, n: i32) -> u8x8 {
     macro_rules! call {
         ($imm8:expr) => {
             mem::transmute(palignrb(mem::transmute(a), mem::transmute(b), $imm8))
diff --git a/coresimd/src/x86/x86_64/mod.rs b/coresimd/src/x86/x86_64/mod.rs
index b5456b71ee..7225e7bf05 100644
--- a/coresimd/src/x86/x86_64/mod.rs
+++ b/coresimd/src/x86/x86_64/mod.rs
@@ -9,6 +9,9 @@ pub use self::sse::*;
 mod sse2;
 pub use self::sse2::*;
 
+mod sse41;
+pub use self::sse41::*;
+
 mod sse42;
 pub use self::sse42::*;
 
diff --git a/coresimd/src/x86/x86_64/sse2.rs b/coresimd/src/x86/x86_64/sse2.rs
index 25ab58c11d..b5c1fe4207 100644
--- a/coresimd/src/x86/x86_64/sse2.rs
+++ b/coresimd/src/x86/x86_64/sse2.rs
@@ -58,6 +58,40 @@ pub unsafe fn _mm_stream_si64(mem_addr: *mut i64, a: i64) {
     ::core::intrinsics::nontemporal_store(mem_addr, a);
 }
 
+/// Return a vector whose lowest element is `a` and all higher elements are
+/// `0`.
+#[inline(always)]
+#[target_feature = "+sse2"]
+#[cfg_attr(all(test, not(windows)), assert_instr(movq))]
+pub unsafe fn _mm_cvtsi64_si128(a: i64) -> i64x2 {
+    i64x2::new(a, 0)
+}
+
+/// Return a vector whose lowest element is `a` and all higher elements are
+/// `0`.
+#[inline(always)]
+#[target_feature = "+sse2"]
+#[cfg_attr(all(test, not(windows)), assert_instr(movq))]
+pub unsafe fn _mm_cvtsi64x_si128(a: i64) -> i64x2 {
+    _mm_cvtsi64_si128(a)
+}
+
+/// Return the lowest element of `a`.
+#[inline(always)]
+#[target_feature = "+sse2"]
+#[cfg_attr(all(test, not(windows)), assert_instr(movq))]
+pub unsafe fn _mm_cvtsi128_si64(a: i64x2) -> i64 {
+    a.extract(0)
+}
+
+/// Return the lowest element of `a`.
+#[inline(always)]
+#[target_feature = "+sse2"]
+#[cfg_attr(all(test, not(windows)), assert_instr(movq))]
+pub unsafe fn _mm_cvtsi128_si64x(a: i64x2) -> i64 {
+    _mm_cvtsi128_si64(a)
+}
+
 #[cfg(test)]
 mod tests {
     use stdsimd_test::simd_test;
@@ -107,4 +141,16 @@ mod tests {
         sse2::_mm_stream_si64(&mut *mem as *mut i64, a);
         assert_eq!(a, *mem);
     }
+
+    #[simd_test = "sse2"]
+    unsafe fn _mm_cvtsi64_si128() {
+        let r = sse2::_mm_cvtsi64_si128(5);
+        assert_eq!(r, i64x2::new(5, 0));
+    }
+
+    #[simd_test = "sse2"]
+    unsafe fn _mm_cvtsi128_si64() {
+        let r = sse2::_mm_cvtsi128_si64(i64x2::new(5, 0));
+        assert_eq!(r, 5);
+    }
 }
diff --git a/coresimd/src/x86/x86_64/sse41.rs b/coresimd/src/x86/x86_64/sse41.rs
new file mode 100644
index 0000000000..b9f0bd946f
--- /dev/null
+++ b/coresimd/src/x86/x86_64/sse41.rs
@@ -0,0 +1,49 @@
+use v128::*;
+
+#[cfg(test)]
+use stdsimd_test::assert_instr;
+
+/// Extract a 64-bit integer from `a` selected with `imm8` (only bit 0 is used)
+#[inline(always)]
+#[target_feature = "+sse4.1"]
+// TODO: Add test for Windows
+#[cfg_attr(all(test, not(windows)), assert_instr(pextrq, imm8 = 1))]
+pub unsafe fn _mm_extract_epi64(a: i64x2, imm8: i32) -> i64 {
+    let imm8 = (imm8 & 1) as u32;
+    a.extract_unchecked(imm8)
+}
+
+/// Return a copy of `a` with the 64-bit integer `i` inserted at the
+/// location specified by `imm8` (only the lowest bit of `imm8` is used).
+#[inline(always)]
+#[target_feature = "+sse4.1"]
+#[cfg_attr(all(test, target_arch = "x86_64"), assert_instr(pinsrq, imm8 = 0))]
+pub unsafe fn _mm_insert_epi64(a: i64x2, i: i64, imm8: i32) -> i64x2 {
+    a.replace((imm8 & 0b1) as u32, i)
+}
+
+#[cfg(test)]
+mod tests {
+    use stdsimd_test::simd_test;
+    use x86::x86_64::sse41;
+    use v128::*;
+
+    #[simd_test = "sse4.1"]
+    unsafe fn _mm_extract_epi64() {
+        let a = i64x2::new(0, 1);
+        let r = sse41::_mm_extract_epi64(a, 1);
+        assert_eq!(r, 1);
+        let r = sse41::_mm_extract_epi64(a, 3);
+        assert_eq!(r, 1);
+    }
+
+    #[simd_test = "sse4.1"]
+    unsafe fn _mm_insert_epi64() {
+        let a = i64x2::splat(0);
+        let e = i64x2::splat(0).replace(1, 32);
+        let r = sse41::_mm_insert_epi64(a, 32, 1);
+        assert_eq!(r, e);
+        let r = sse41::_mm_insert_epi64(a, 32, 3);
+        assert_eq!(r, e);
+    }
+}
diff --git a/coresimd/src/x86/x86_64/xsave.rs b/coresimd/src/x86/x86_64/xsave.rs
index 03e6df59a1..6f8eaa6446 100644
--- a/coresimd/src/x86/x86_64/xsave.rs
+++ b/coresimd/src/x86/x86_64/xsave.rs
@@ -32,7 +32,7 @@ extern "C" {
 #[inline(always)]
 #[target_feature = "+xsave"]
 #[cfg_attr(test, assert_instr(xsave64))]
-pub unsafe fn _xsave64(mem_addr: *mut u8, save_mask: u64) -> () {
+pub unsafe fn _xsave64(mem_addr: *mut u8, save_mask: u64) {
     xsave64(mem_addr, (save_mask >> 32) as u32, save_mask as u32);
 }
 
@@ -45,7 +45,7 @@ pub unsafe fn _xsave64(mem_addr: *mut u8, save_mask: u64) -> () {
 #[inline(always)]
 #[target_feature = "+xsave"]
 #[cfg_attr(test, assert_instr(xrstor64))]
-pub unsafe fn _xrstor64(mem_addr: *const u8, rs_mask: u64) -> () {
+pub unsafe fn _xrstor64(mem_addr: *const u8, rs_mask: u64) {
     xrstor64(mem_addr, (rs_mask >> 32) as u32, rs_mask as u32);
 }
 
@@ -59,7 +59,7 @@ pub unsafe fn _xrstor64(mem_addr: *const u8, rs_mask: u64) -> () {
 #[inline(always)]
 #[target_feature = "+xsave,+xsaveopt"]
 #[cfg_attr(test, assert_instr(xsaveopt64))]
-pub unsafe fn _xsaveopt64(mem_addr: *mut u8, save_mask: u64) -> () {
+pub unsafe fn _xsaveopt64(mem_addr: *mut u8, save_mask: u64) {
     xsaveopt64(mem_addr, (save_mask >> 32) as u32, save_mask as u32);
 }
 
@@ -72,7 +72,7 @@ pub unsafe fn _xsaveopt64(mem_addr: *mut u8, save_mask: u64) -> () {
 #[inline(always)]
 #[target_feature = "+xsave,+xsavec"]
 #[cfg_attr(test, assert_instr(xsavec64))]
-pub unsafe fn _xsavec64(mem_addr: *mut u8, save_mask: u64) -> () {
+pub unsafe fn _xsavec64(mem_addr: *mut u8, save_mask: u64) {
     xsavec64(mem_addr, (save_mask >> 32) as u32, save_mask as u32);
 }
 
@@ -86,7 +86,7 @@ pub unsafe fn _xsavec64(mem_addr: *mut u8, save_mask: u64) -> () {
 #[inline(always)]
 #[target_feature = "+xsave,+xsaves"]
 #[cfg_attr(test, assert_instr(xsaves64))]
-pub unsafe fn _xsaves64(mem_addr: *mut u8, save_mask: u64) -> () {
+pub unsafe fn _xsaves64(mem_addr: *mut u8, save_mask: u64) {
     xsaves64(mem_addr, (save_mask >> 32) as u32, save_mask as u32);
 }
 
@@ -102,7 +102,7 @@ pub unsafe fn _xsaves64(mem_addr: *mut u8, save_mask: u64) -> () {
 #[inline(always)]
 #[target_feature = "+xsave,+xsaves"]
 #[cfg_attr(test, assert_instr(xrstors64))]
-pub unsafe fn _xrstors64(mem_addr: *const u8, rs_mask: u64) -> () {
+pub unsafe fn _xrstors64(mem_addr: *const u8, rs_mask: u64) {
     xrstors64(mem_addr, (rs_mask >> 32) as u32, rs_mask as u32);
 }
 
diff --git a/stdsimd-verify/.gitattributes b/stdsimd-verify/.gitattributes
new file mode 100644
index 0000000000..621fdea6f7
--- /dev/null
+++ b/stdsimd-verify/.gitattributes
@@ -0,0 +1 @@
+*.xml binary
diff --git a/stdsimd-verify/Cargo.toml b/stdsimd-verify/Cargo.toml
new file mode 100644
index 0000000000..9ac46a2e09
--- /dev/null
+++ b/stdsimd-verify/Cargo.toml
@@ -0,0 +1,18 @@
+[package]
+name = "stdsimd-verify"
+version = "0.1.0"
+authors = ["Alex Crichton <alex@alexcrichton.com>"]
+
+[dependencies]
+proc-macro2 = { version = "0.1", features = ["unstable"] }
+quote = { git = 'https://github.com/dtolnay/quote' }
+syn = { git = 'https://github.com/dtolnay/syn', features = ["full"] }
+synom = { git = 'https://github.com/dtolnay/syn' }
+
+[lib]
+proc-macro = true
+
+[dev-dependencies]
+serde = "1.0"
+serde_derive = "1.0"
+serde-xml-rs = "0.2"
diff --git a/stdsimd-verify/build.rs b/stdsimd-verify/build.rs
new file mode 100644
index 0000000000..3273777679
--- /dev/null
+++ b/stdsimd-verify/build.rs
@@ -0,0 +1,24 @@
+use std::path::Path;
+
+fn main() {
+    let dir = Path::new(env!("CARGO_MANIFEST_DIR"));
+    let root = dir.parent().unwrap();
+    let root = root.join("coresimd/src/x86");
+    walk(&root);
+}
+
+/// Recursively registers `root` and every `.rs` file beneath it with Cargo so
+/// that this build script is re-run when the scanned sources change.
+fn walk(root: &Path) {
+    // Watching the directory itself also catches files being added or removed.
+    println!("cargo:rerun-if-changed={}", root.display());
+    for file in root.read_dir().unwrap() {
+        let file = file.unwrap();
+        let path = file.path();
+        if file.file_type().unwrap().is_dir() {
+            walk(&path);
+        } else if path.extension().and_then(|s| s.to_str()) == Some("rs") {
+            println!("cargo:rerun-if-changed={}", path.display());
+        }
+    }
+}
diff --git a/stdsimd-verify/src/lib.rs b/stdsimd-verify/src/lib.rs
new file mode 100644
index 0000000000..c4b55b0162
--- /dev/null
+++ b/stdsimd-verify/src/lib.rs
@@ -0,0 +1,248 @@
+#![feature(proc_macro)]
+
+extern crate proc_macro;
+extern crate proc_macro2;
+extern crate syn;
+#[macro_use]
+extern crate quote;
+
+use std::path::Path;
+use std::fs::File;
+use std::io::Read;
+
+use proc_macro::TokenStream;
+use quote::Tokens;
+
+macro_rules! my_quote {
+    ($($t:tt)*) => (quote_spanned!(proc_macro2::Span::call_site(), $($t)*))
+}
+
+#[proc_macro]
+pub fn x86_functions(input: TokenStream) -> TokenStream {
+    let dir = Path::new(env!("CARGO_MANIFEST_DIR"));
+    let root = dir.parent().unwrap();
+    let root = root.join("coresimd/src/x86");
+
+    let mut files = Vec::new();
+    walk(&root, &mut files);
+
+    let mut functions = Vec::new();
+    for file in files {
+        for item in file.items {
+            match item {
+                syn::Item::Fn(f) => functions.push(f),
+                _ => {}
+            }
+        }
+    }
+
+    functions.retain(|f| {
+        match f.vis {
+            syn::Visibility::Public(_) => {}
+            _ => return false,
+        }
+        match f.unsafety {
+            syn::Unsafety::Unsafe(_) => {}
+            _ => return false,
+        }
+        f.attrs.iter()
+            .filter_map(|a| a.meta_item())
+            .any(|a| {
+                match a {
+                    syn::MetaItem::NameValue(i) => i.ident == "target_feature",
+                    _ => false,
+                }
+            })
+    });
+
+    let input = proc_macro2::TokenStream::from(input);
+
+    let functions = functions.iter()
+        .filter(|f| {
+            !f.ident.sym.as_str().starts_with("_mulx")
+        })
+        .map(|f| {
+            let name = f.ident;
+            // println!("{}", name);
+            let mut arguments = Vec::new();
+            for input in f.decl.inputs.iter().map(|s| s.into_item()) {
+                let ty = match *input {
+                    syn::FnArg::Captured(ref c) => &c.ty,
+                    _ => panic!("invalid argument on {}", name),
+                };
+                arguments.push(to_type(ty));
+            }
+            let ret = match f.decl.output {
+                syn::ReturnType::Default => my_quote! { None },
+                syn::ReturnType::Type(ref t, _) => {
+                    let ty = to_type(t);
+                    my_quote! { Some(#ty) }
+                }
+            };
+            let instrs = find_instrs(&f.attrs);
+            let target_feature = find_target_feature(f.ident, &f.attrs);
+            my_quote! {
+                Function {
+                    name: stringify!(#name),
+                    arguments: &[#(#arguments),*],
+                    ret: #ret,
+                    target_feature: #target_feature,
+                    instrs: &[#(stringify!(#instrs)),*],
+                }
+            }
+        })
+        .collect::<Vec<_>>();
+
+    let ret = my_quote! { #input: &[Function] = &[#(#functions),*]; };
+    // println!("{}", ret);
+    ret.into()
+}
+
+/// Maps the Rust type `t` to tokens naming the matching `Type` static (or a
+/// `Type::Ptr` wrapper for pointers/references); panics on anything else.
+fn to_type(t: &syn::Type) -> Tokens {
+    match *t {
+        syn::Type::Path(ref p) => match extract_path_ident(&p.path).sym.as_str() {
+            "__m128i" => my_quote! { &I8x16 },
+            "__m256i" => my_quote! { &I8x32 },
+            "__m64" => my_quote! { &I8x8 },
+            "bool" => my_quote! { &BOOL },
+            "f32" => my_quote! { &F32 },
+            "f32x4" => my_quote! { &F32x4 },
+            "f32x8" => my_quote! { &F32x8 },
+            "f64" => my_quote! { &F64 },
+            "f64x2" => my_quote! { &F64x2 },
+            "f64x4" => my_quote! { &F64x4 },
+            "i16" => my_quote! { &I16 },
+            "i16x16" => my_quote! { &I16x16 },
+            "i16x4" => my_quote! { &I16x4 },
+            "i16x8" => my_quote! { &I16x8 },
+            "i32" => my_quote! { &I32 },
+            "i32x2" => my_quote! { &I32x2 },
+            "i32x4" => my_quote! { &I32x4 },
+            "i32x8" => my_quote! { &I32x8 },
+            "i64" => my_quote! { &I64 },
+            "i64x2" => my_quote! { &I64x2 },
+            "i64x4" => my_quote! { &I64x4 },
+            "i8" => my_quote! { &I8 },
+            "i8x16" => my_quote! { &I8x16 },
+            "i8x32" => my_quote! { &I8x32 },
+            "i8x8" => my_quote! { &I8x8 },
+            "u16x4" => my_quote! { &U16x4 },
+            "u16x8" => my_quote! { &U16x8 },
+            "u32" => my_quote! { &U32 },
+            "u32x2" => my_quote! { &U32x2 },
+            "u32x4" => my_quote! { &U32x4 },
+            "u32x8" => my_quote! { &U32x8 },
+            "u64" => my_quote! { &U64 },
+            "u64x2" => my_quote! { &U64x2 },
+            "u64x4" => my_quote! { &U64x4 },
+            "u8" => my_quote! { &U8 },
+            "u16" => my_quote! { &U16 },
+            "u8x16" => my_quote! { &U8x16 },
+            "u8x32" => my_quote! { &U8x32 },
+            "u16x16" => my_quote! { &U16x16 },
+            "u8x8" => my_quote! { &U8x8 },
+            s => panic!("unsupported type: {}", s),
+        },
+        syn::Type::Ptr(syn::TypePtr { ref ty, .. }) |
+        syn::Type::Reference(syn::TypeReference { ref ty, .. }) => {
+            let tokens = to_type(&ty.ty);
+            my_quote! { &Type::Ptr(#tokens) }
+        }
+        syn::Type::Slice(_) => panic!("unsupported slice"),
+        syn::Type::Array(_) => panic!("unsupported array"),
+        syn::Type::Tup(_) => panic!("unsupported tup"),
+        _ => panic!("unsupported type"),
+    }
+}
+
+fn extract_path_ident(path: &syn::Path) -> syn::Ident {
+    if path.leading_colon.is_some() {
+        panic!("unsupported leading colon in path")
+    }
+    if path.segments.len() != 1 {
+        panic!("unsupported path that needs name resolution")
+    }
+    match path.segments.get(0).item().arguments {
+        syn::PathArguments::None => {}
+        _ => panic!("unsupported path that has path arguments")
+    }
+    path.segments.get(0).item().ident
+}
+
+fn walk(root: &Path, files: &mut Vec<syn::File>) {
+    for file in root.read_dir().unwrap() {
+        let file = file.unwrap();
+        if file.file_type().unwrap().is_dir() {
+            walk(&file.path(), files);
+            continue
+        }
+        let path = file.path();
+        if path.extension().and_then(|s| s.to_str()) != Some("rs") {
+            continue
+        }
+
+        let mut contents = String::new();
+        File::open(&path).unwrap().read_to_string(&mut contents).unwrap();
+
+        files.push(syn::parse_str::<syn::File>(&contents).expect("failed to parse"));
+    }
+}
+
+fn find_instrs(attrs: &[syn::Attribute]) -> Vec<syn::Ident> {
+    attrs.iter()
+        .filter_map(|a| a.meta_item())
+        .filter_map(|a| {
+            match a {
+                syn::MetaItem::List(i) => {
+                    if i.ident == "cfg_attr" {
+                        Some(i.nested.into_vec())
+                    } else {
+                        None
+                    }
+                }
+                _ => None,
+            }
+        })
+        .filter_map(|list| list.into_iter().nth(1))
+        .filter_map(|nested| {
+            match nested {
+                syn::NestedMetaItem::MetaItem(syn::MetaItem::List(i)) => {
+                    if i.ident == "assert_instr" {
+                        Some(i.nested.into_vec())
+                    } else {
+                        None
+                    }
+                }
+                _ => None,
+            }
+        })
+        .filter_map(|list| list.into_iter().next())
+        .filter_map(|nested| {
+            match nested {
+                syn::NestedMetaItem::MetaItem(syn::MetaItem::Term(i)) => Some(i),
+                _ => None,
+            }
+        })
+        .collect()
+}
+
+/// Returns the literal of the first `#[target_feature = "..."]` name-value
+/// attribute found in `attrs`. Panics, naming the function `name`, when no
+/// such attribute is present.
+fn find_target_feature(name: syn::Ident, attrs: &[syn::Attribute]) -> syn::Lit {
+    attrs.iter()
+        .filter_map(|a| a.meta_item())
+        .filter_map(|a| {
+            match a {
+                syn::MetaItem::NameValue(i) => {
+                    if i.ident == "target_feature" { Some(i.lit) } else { None }
+                }
+                _ => None,
+            }
+        })
+        .next()
+        // Build the message lazily so nothing is formatted on the success path.
+        .unwrap_or_else(|| panic!("failed to find target_feature for {}", name))
+}
diff --git a/stdsimd-verify/tests/x86-intel.rs b/stdsimd-verify/tests/x86-intel.rs
new file mode 100644
index 0000000000..82425ea261
--- /dev/null
+++ b/stdsimd-verify/tests/x86-intel.rs
@@ -0,0 +1,295 @@
+#![feature(proc_macro)]
+#![allow(bad_style)]
+
+#[macro_use]
+extern crate serde_derive;
+extern crate serde_xml_rs;
+extern crate stdsimd_verify;
+
+use std::collections::HashMap;
+
+use stdsimd_verify::x86_functions;
+
+struct Function {
+    name: &'static str,
+    arguments: &'static [&'static Type],
+    ret: Option<&'static Type>,
+    target_feature: &'static str,
+    instrs: &'static [&'static str],
+}
+
+static BOOL: Type = Type::Bool;
+static F32: Type = Type::PrimFloat(32);
+static F32x4: Type = Type::Float(32, 4);
+static F32x8: Type = Type::Float(32, 8);
+static F64: Type = Type::PrimFloat(64);
+static F64x2: Type = Type::Float(64, 2);
+static F64x4: Type = Type::Float(64, 4);
+static I16: Type = Type::PrimSigned(16);
+static I16x16: Type = Type::Signed(16, 16);
+static I16x4: Type = Type::Signed(16, 4);
+static I16x8: Type = Type::Signed(16, 8);
+static I32: Type = Type::PrimSigned(32);
+static I32x2: Type = Type::Signed(32, 2);
+static I32x4: Type = Type::Signed(32, 4);
+static I32x8: Type = Type::Signed(32, 8);
+static I64: Type = Type::PrimSigned(64);
+static I64x2: Type = Type::Signed(64, 2);
+static I64x4: Type = Type::Signed(64, 4);
+static I8: Type = Type::PrimSigned(8);
+static I8x16: Type = Type::Signed(8, 16);
+static I8x32: Type = Type::Signed(8, 32);
+static I8x8: Type = Type::Signed(8, 8);
+static U16: Type = Type::PrimUnsigned(16);
+static U16x16: Type = Type::Unsigned(16, 16);
+static U16x4: Type = Type::Unsigned(16, 4);
+static U16x8: Type = Type::Unsigned(16, 8);
+static U32: Type = Type::PrimUnsigned(32);
+static U32x2: Type = Type::Unsigned(32, 2);
+static U32x4: Type = Type::Unsigned(32, 4);
+static U32x8: Type = Type::Unsigned(32, 8);
+static U64: Type = Type::PrimUnsigned(64);
+static U64x2: Type = Type::Unsigned(64, 2);
+static U64x4: Type = Type::Unsigned(64, 4);
+static U8: Type = Type::PrimUnsigned(8);
+static U8x16: Type = Type::Unsigned(8, 16);
+static U8x32: Type = Type::Unsigned(8, 32);
+static U8x8: Type = Type::Unsigned(8, 8);
+
+#[derive(Debug)]
+enum Type {
+    Float(u8, u8),
+    PrimFloat(u8),
+    PrimSigned(u8),
+    PrimUnsigned(u8),
+    Ptr(&'static Type),
+    Signed(u8, u8),
+    Unsigned(u8, u8),
+    Bool,
+}
+
+x86_functions!(static FUNCTIONS);
+
+#[derive(Deserialize)]
+struct Data {
+    #[serde(rename = "intrinsic", default)]
+    intrinsics: Vec<Intrinsic>,
+}
+
+#[derive(Deserialize)]
+struct Intrinsic {
+    rettype: String,
+    name: String,
+    tech: String,
+    #[serde(rename = "CPUID", default)]
+    cpuid: Vec<String>,
+    #[serde(rename = "parameter", default)]
+    parameters: Vec<Parameter>,
+    #[serde(default)]
+    instruction: Vec<Instruction>,
+}
+
+#[derive(Deserialize)]
+struct Parameter {
+    #[serde(rename = "type")]
+    type_: String,
+}
+
+#[derive(Deserialize)]
+struct Instruction {
+    name: String,
+}
+
+#[test]
+fn verify_all_signatures() {
+    let xml = include_bytes!("../x86-intel.xml");
+    let xml = &xml[..];
+    let data: Data = serde_xml_rs::deserialize(xml).expect("failed to deserialize xml");
+    let mut map = HashMap::new();
+    for intrinsic in data.intrinsics.iter() {
+        // This intrinsic has multiple definitions in the XML, so just ignore it.
+        if intrinsic.name == "_mm_prefetch" {
+            continue
+        }
+
+        // These'll need to get added eventually, but right now they have some
+        // duplicate names in the XML which we're not dealing with yet
+        if intrinsic.tech == "AVX-512" {
+            continue
+        }
+
+        assert!(map.insert(&intrinsic.name[..], intrinsic).is_none());
+    }
+
+    for rust in FUNCTIONS {
+        // This was ignored above, we ignore it here as well.
+        if rust.name == "_mm_prefetch" {
+            continue
+        }
+
+        // these are all AMD-specific intrinsics
+        if rust.target_feature.contains("sse4a") ||
+            rust.target_feature.contains("tbm") {
+            continue
+        }
+
+        let intel = match map.get(rust.name) {
+            Some(i) => i,
+            None => panic!("missing intel definition for {}", rust.name),
+        };
+
+        // Verify that all `#[target_feature]` annotations are correct, ensuring
+        // that we've actually enabled the right instruction set for this
+        // intrinsic.
+        assert!(intel.cpuid.len() > 0, "missing cpuid for {}", rust.name);
+        for cpuid in intel.cpuid.iter() {
+            // this is needed by _xsave and probably some related intrinsics,
+            // but let's just skip it for now.
+            if *cpuid == "XSS" {
+                continue
+            }
+
+            let cpuid = cpuid
+                .chars()
+                .flat_map(|c| c.to_lowercase())
+                .collect::<String>();
+
+            // Normalize `bmi1` to `bmi` as apparently that's what we're calling
+            // it.
+            let cpuid = if cpuid == "bmi1" {
+                String::from("bmi")
+            } else {
+                cpuid
+            };
+
+            assert!(rust.target_feature.contains(&cpuid),
+                    "intel cpuid `{}` not in `{}` for {}",
+                    cpuid,
+                    rust.target_feature,
+                    rust.name);
+        }
+
+        // TODO: we should test this, but it generates too many failures right
+        // now
+        if false {
+            if rust.instrs.len() == 0 {
+                assert_eq!(intel.instruction.len(), 0,
+                           "instruction not listed for {}", rust.name);
+
+            // If intel doesn't list any instructions and we do then don't
+            // bother trying to look for instructions in intel, we've just got
+            // some extra assertions on our end.
+            } else if intel.instruction.len() > 0 {
+                for instr in rust.instrs.iter() {
+                    assert!(intel.instruction.iter().any(|a| a.name.starts_with(instr)),
+                            "intel failed to list `{}` as an instruction for `{}`",
+                            instr, rust.name);
+                }
+            }
+        }
+
+        // Make sure we've got the right return type.
+        match rust.ret {
+            Some(t) => equate(t, &intel.rettype, &rust.name),
+            None => {
+                assert!(intel.rettype == "" || intel.rettype == "void",
+                        "{} returns `{}` with intel, void in rust",
+                        rust.name, intel.rettype);
+            }
+        }
+
+        // If there's no arguments on Rust's side intel may list one "void"
+        // argument, so handle that here.
+        if rust.arguments.len() == 0 {
+            if intel.parameters.len() == 1 {
+                assert_eq!(intel.parameters[0].type_, "void");
+                continue
+            }
+        }
+
+        // Otherwise we want all parameters to be exactly the same
+        assert_eq!(rust.arguments.len(), intel.parameters.len(),
+                   "wrong number of arguments on {}", rust.name);
+        for (a, b) in intel.parameters.iter().zip(rust.arguments) {
+            equate(b, &a.type_, &intel.name);
+        }
+    }
+}
+
+/// Checks that the Rust type `t` is an accepted match for the Intel C type
+/// string `intel`; panics with a message naming `intrinsic` when it is not.
+fn equate(t: &Type, intel: &str, intrinsic: &str) {
+    // Normalize pointers: drop the space before `*` and a trailing ` const`.
+    let intel = intel.replace(" *", "*");
+    let intel = intel.replace(" const*", "*");
+    match (t, &intel[..]) {
+        (&Type::PrimFloat(32), "float") => {}
+        (&Type::PrimFloat(64), "double") => {}
+        (&Type::PrimSigned(16), "__int16") => {}
+        (&Type::PrimSigned(16), "short") => {}
+        (&Type::PrimSigned(32), "__int32") => {}
+        (&Type::PrimSigned(32), "const int") => {}
+        (&Type::PrimSigned(32), "int") => {}
+        (&Type::PrimSigned(64), "__int64") => {}
+        (&Type::PrimSigned(64), "long long") => {}
+        (&Type::PrimSigned(8), "__int8") => {}
+        (&Type::PrimSigned(8), "char") => {}
+        (&Type::PrimUnsigned(16), "unsigned short") => {}
+        (&Type::PrimUnsigned(32), "unsigned int") => {}
+        (&Type::PrimUnsigned(64), "unsigned __int64") => {}
+        (&Type::PrimUnsigned(8), "unsigned char") => {}
+
+        (&Type::Ptr(&Type::PrimFloat(32)), "float const*") => {}
+        (&Type::Ptr(&Type::PrimFloat(32)), "float*") => {}
+        (&Type::Ptr(&Type::PrimFloat(64)), "double*") => {}
+        (&Type::Ptr(&Type::PrimSigned(32)), "int*") => {}
+        (&Type::Ptr(&Type::PrimSigned(64)), "__int64*") => {}
+        (&Type::Ptr(&Type::PrimSigned(8)), "char*") => {}
+        (&Type::Ptr(&Type::PrimUnsigned(8)), "const void*") => {}
+        (&Type::Ptr(&Type::PrimUnsigned(8)), "void*") => {}
+
+        // Widths are multiplied as `u32`: a 256-bit product overflows `u8`.
+        (&Type::Signed(a, b), "__m256i") |
+        (&Type::Unsigned(a, b), "__m256i") if (a as u32) * (b as u32) == 256 => {}
+
+        (&Type::Signed(a, b), "__m128i") |
+        (&Type::Unsigned(a, b), "__m128i") if (a as u32) * (b as u32) == 128 => {}
+
+        (&Type::Ptr(&Type::Signed(a, b)), "__m128i*") |
+        (&Type::Ptr(&Type::Unsigned(a, b)), "__m128i*") if (a as u32) * (b as u32) == 128 => {}
+        (&Type::Ptr(&Type::Signed(a, b)), "__m256i*") |
+        (&Type::Ptr(&Type::Unsigned(a, b)), "__m256i*") if (a as u32) * (b as u32) == 256 => {}
+
+        (&Type::Signed(a, b), "__m64") |
+        (&Type::Unsigned(a, b), "__m64") if (a as u32) * (b as u32) == 64 => {}
+        (&Type::Ptr(&Type::Signed(a, b)), "__m64*") |
+        (&Type::Ptr(&Type::Unsigned(a, b)), "__m64*") if (a as u32) * (b as u32) == 64 => {}
+
+        (&Type::Float(32, 4), "__m128") => {}
+        (&Type::Float(32, 8), "__m256") => {}
+        (&Type::Float(64, 2), "__m128d") => {}
+        (&Type::Float(64, 4), "__m256d") => {}
+
+        (&Type::Ptr(&Type::Float(32, 4)), "__m128*") => {}
+        (&Type::Ptr(&Type::Float(64, 2)), "__m128d*") => {}
+
+        // These two intrinsics return a 16-bit element but in Intel's
+        // intrinsics they're listed as returning an `int`.
+        (&Type::PrimSigned(16), "int") if intrinsic == "_mm_extract_pi16" => {}
+        (&Type::PrimSigned(16), "int") if intrinsic == "_m_pextrw" => {}
+
+        // This intrinsic takes an `i8` to get inserted into an i8 vector, but
+        // Intel says the argument is i32...
+        (&Type::PrimSigned(8), "int") if intrinsic == "_mm_insert_epi8" => {}
+
+        // This is a macro (?) in C which seems to mutate its arguments, but that
+        // means that we're taking pointers to arguments in rust as we're not
+        // exposing it as a macro.
+        (&Type::Ptr(&Type::Float(32, 4)), "__m128") if intrinsic == "_MM_TRANSPOSE4_PS" => {}
+
+        // TODO: these seem suspicious...
+        (&Type::Bool, "int") => {}
+
+        _ => panic!("failed to equate: `{}` and {:?} for {}", intel, t, intrinsic),
+    }
+}
diff --git a/stdsimd-verify/x86-intel.xml b/stdsimd-verify/x86-intel.xml
new file mode 100644
index 0000000000..c22a3adaec
--- /dev/null
+++ b/stdsimd-verify/x86-intel.xml
@@ -0,0 +1,134861 @@
+<intrinsics_list version='3.4' date='09/07/2017'>
+<intrinsic tech='MMX' rettype='__m64' name='_m_from_int64'>
+	<type>Integer</type>
+	<CPUID>MMX</CPUID>
+	<category>Convert</category>
+	<parameter varname='a' type='__int64'/>
+	<description>Copy 64-bit integer "a" to "dst".</description>
+	<operation>
+dst[63:0] := a[63:0]
+	</operation>
+	<instruction name='movq' form='mm, r64'/>
+	<header>mmintrin.h</header>
+</intrinsic>
+<intrinsic tech='MMX' rettype='__int64' name='_m_to_int64'>
+	<type>Integer</type>
+	<CPUID>MMX</CPUID>
+	<category>Convert</category>
+	<parameter varname='a' type='__m64'/>
+	<description>Copy 64-bit integer "a" to "dst".</description>
+	<operation>
+dst[63:0] := a[63:0]
+	</operation>
+	<instruction name='movq' form='r64, mm'/>
+	<header>mmintrin.h</header>
+</intrinsic>
+<intrinsic tech='MMX' rettype='void' name='_m_empty'>
+	<CPUID>MMX</CPUID>
+	<category>General Support</category>
+	<parameter varname='' type='void'/>
+	<description>Empty the MMX state, which marks the x87 FPU registers as available for use by x87 instructions. This instruction must be used at the end of all MMX technology procedures.</description>
+	<instruction name='emms' form=''/>
+	<header>mmintrin.h</header>
+</intrinsic>
+<intrinsic tech='MMX' rettype='__m64' name='_m_from_int'>
+	<type>Integer</type>
+	<CPUID>MMX</CPUID>
+	<category>Convert</category>
+	<parameter varname='a' type='int'/>
+	<description>Copy 32-bit integer "a" to the lower element of "dst", and zero the upper element of "dst".</description>
+	<operation>
+dst[31:0] := a[31:0]
+dst[63:32] := 0
+	</operation>
+	<instruction name='movd' form='mm, r32'/>
+	<header>mmintrin.h</header>
+</intrinsic>
+<intrinsic tech='MMX' rettype='int' name='_m_to_int'>
+	<type>Integer</type>
+	<CPUID>MMX</CPUID>
+	<category>Convert</category>
+	<parameter varname='a' type='__m64'/>
+	<description>Copy the lower 32-bit integer in "a" to "dst".</description>
+	<operation>
+dst[31:0] := a[31:0]
+	</operation>
+	<instruction name='movd' form='r32, mm'/>
+	<header>mmintrin.h</header>
+</intrinsic>
+<intrinsic tech='MMX' rettype='__m64' name='_m_packsswb'>
+	<type>Integer</type>
+	<CPUID>MMX</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname='a' type='__m64'/>
+	<parameter varname='b' type='__m64'/>
+	<description>Convert packed 16-bit integers from "a" and "b" to packed 8-bit integers using signed saturation, and store the results in "dst".
+	</description>
+	<operation>
+dst[7:0] := Saturate_Int16_To_Int8 (a[15:0])
+dst[15:8] := Saturate_Int16_To_Int8 (a[31:16])
+dst[23:16] := Saturate_Int16_To_Int8 (a[47:32])
+dst[31:24] := Saturate_Int16_To_Int8 (a[63:48])
+dst[39:32] := Saturate_Int16_To_Int8 (b[15:0])
+dst[47:40] := Saturate_Int16_To_Int8 (b[31:16])
+dst[55:48] := Saturate_Int16_To_Int8 (b[47:32])
+dst[63:56] := Saturate_Int16_To_Int8 (b[63:48])
+	</operation>
+	<instruction name='packsswb' form='mm, mm'/>
+	<header>mmintrin.h</header>
+</intrinsic>
+<intrinsic tech='MMX' rettype='__m64' name='_m_packssdw'>
+	<type>Integer</type>
+	<CPUID>MMX</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname='a' type='__m64'/>
+	<parameter varname='b' type='__m64'/>
+	<description>Convert packed 32-bit integers from "a" and "b" to packed 16-bit integers using signed saturation, and store the results in "dst".</description>
+	<operation>
+dst[15:0] := Saturate_Int32_To_Int16 (a[31:0])
+dst[31:16] := Saturate_Int32_To_Int16 (a[63:32])
+dst[47:32] := Saturate_Int32_To_Int16 (b[31:0])
+dst[63:48] := Saturate_Int32_To_Int16 (b[63:32])
+	</operation>
+	<instruction name='packssdw' form='mm, mm'/>
+	<header>mmintrin.h</header>
+</intrinsic>
+<intrinsic tech='MMX' rettype='__m64' name='_m_packuswb'>
+	<type>Integer</type>
+	<CPUID>MMX</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname='a' type='__m64'/>
+	<parameter varname='b' type='__m64'/>
+	<description>Convert packed 16-bit integers from "a" and "b" to packed 8-bit integers using unsigned saturation, and store the results in "dst".</description>
+	<operation>
+dst[7:0] := Saturate_Int16_To_UnsignedInt8 (a[15:0])
+dst[15:8] := Saturate_Int16_To_UnsignedInt8 (a[31:16])
+dst[23:16] := Saturate_Int16_To_UnsignedInt8 (a[47:32])
+dst[31:24] := Saturate_Int16_To_UnsignedInt8 (a[63:48])
+dst[39:32] := Saturate_Int16_To_UnsignedInt8 (b[15:0])
+dst[47:40] := Saturate_Int16_To_UnsignedInt8 (b[31:16])
+dst[55:48] := Saturate_Int16_To_UnsignedInt8 (b[47:32])
+dst[63:56] := Saturate_Int16_To_UnsignedInt8 (b[63:48])
+	</operation>
+	<instruction name='packuswb' form='mm, mm'/>
+	<header>mmintrin.h</header>
+</intrinsic>
+<intrinsic tech='MMX' rettype='__m64' name='_m_punpckhbw'>
+	<type>Integer</type>
+	<CPUID>MMX</CPUID>
+	<category>Swizzle</category>
+	<parameter varname='a' type='__m64'/>
+	<parameter varname='b' type='__m64'/>
+	<description>Unpack and interleave 8-bit integers from the high half of "a" and "b", and store the results in "dst". </description>
+	<operation>
+INTERLEAVE_HIGH_BYTES(src1[63:0], src2[63:0]){
+	dst[7:0] := src1[39:32]
+	dst[15:8] := src2[39:32] 
+	dst[23:16] := src1[47:40]
+	dst[31:24] := src2[47:40]
+	dst[39:32] := src1[55:48]
+	dst[47:40] := src2[55:48]
+	dst[55:48] := src1[63:56]
+	dst[63:56] := src2[63:56]
+	RETURN dst[63:0]
+}	
+	
+dst[63:0] := INTERLEAVE_HIGH_BYTES(a[63:0], b[63:0])
+	</operation>
+	<instruction name='punpckhbw' form='mm, mm'/>
+	<header>mmintrin.h</header>
+</intrinsic>
+<intrinsic tech='MMX' rettype='__m64' name='_m_punpckhwd'>
+	<type>Integer</type>
+	<CPUID>MMX</CPUID>
+	<category>Swizzle</category>
+	<parameter varname='a' type='__m64'/>
+	<parameter varname='b' type='__m64'/>
+	<description>Unpack and interleave 16-bit integers from the high half of "a" and "b", and store the results in "dst". </description>
+	<operation>
+INTERLEAVE_HIGH_WORDS(src1[63:0], src2[63:0]){
+	dst[15:0] := src1[47:32]
+	dst[31:16] := src2[47:32]
+	dst[47:32] := src1[63:48]
+	dst[63:48] := src2[63:48]
+	RETURN dst[63:0]
+}
+
+dst[63:0] := INTERLEAVE_HIGH_WORDS(a[63:0], b[63:0])
+	</operation>
+	<instruction name='punpckhwd' form='mm, mm'/>
+	<header>mmintrin.h</header>
+</intrinsic>
+<intrinsic tech='MMX' rettype='__m64' name='_m_punpckhdq'>
+	<type>Integer</type>
+	<CPUID>MMX</CPUID>
+	<category>Swizzle</category>
+	<parameter varname='a' type='__m64'/>
+	<parameter varname='b' type='__m64'/>
+	<description>Unpack and interleave 32-bit integers from the high half of "a" and "b", and store the results in "dst".</description>
+	<operation>
+dst[31:0] := a[63:32]
+dst[63:32] := b[63:32]
+	</operation>
+	<instruction name='punpckhdq' form='mm, mm'/>
+	<header>mmintrin.h</header>
+</intrinsic>
+<intrinsic tech='MMX' rettype='__m64' name='_m_punpcklbw'>
+	<type>Integer</type>
+	<CPUID>MMX</CPUID>
+	<category>Swizzle</category>
+	<parameter varname='a' type='__m64'/>
+	<parameter varname='b' type='__m64'/>
+	<description>Unpack and interleave 8-bit integers from the low half of "a" and "b", and store the results in "dst". </description>
+	<operation>
+INTERLEAVE_BYTES(src1[63:0], src2[63:0]){
+	dst[7:0] := src1[7:0] 
+	dst[15:8] := src2[7:0] 
+	dst[23:16] := src1[15:8] 
+	dst[31:24] := src2[15:8] 
+	dst[39:32] := src1[23:16] 
+	dst[47:40] := src2[23:16] 
+	dst[55:48] := src1[31:24] 
+	dst[63:56] := src2[31:24] 
+	RETURN dst[63:0]
+}	
+
+dst[63:0] := INTERLEAVE_BYTES(a[63:0], b[63:0])
+	</operation>
+	<instruction name='punpcklbw' form='mm, mm'/>
+	<header>mmintrin.h</header>
+</intrinsic>
+<intrinsic tech='MMX' rettype='__m64' name='_m_punpcklwd'>
+	<type>Integer</type>
+	<CPUID>MMX</CPUID>
+	<category>Swizzle</category>
+	<parameter varname='a' type='__m64'/>
+	<parameter varname='b' type='__m64'/>
+	<description>Unpack and interleave 16-bit integers from the low half of "a" and "b", and store the results in "dst". </description>
+	<operation>
+INTERLEAVE_WORDS(src1[63:0], src2[63:0]){
+	dst[15:0] := src1[15:0] 
+	dst[31:16] := src2[15:0] 
+	dst[47:32] := src1[31:16] 
+	dst[63:48] := src2[31:16] 
+	RETURN dst[63:0]
+}	
+
+dst[63:0] := INTERLEAVE_WORDS(a[63:0], b[63:0])
+	</operation>
+	<instruction name='punpcklwd' form='mm, mm'/>
+	<header>mmintrin.h</header>
+</intrinsic>
+<intrinsic tech='MMX' rettype='__m64' name='_m_punpckldq'>
+	<type>Integer</type>
+	<CPUID>MMX</CPUID>
+	<category>Swizzle</category>
+	<parameter varname='a' type='__m64'/>
+	<parameter varname='b' type='__m64'/>
+	<description>Unpack and interleave 32-bit integers from the low half of "a" and "b", and store the results in "dst". </description>
+	<operation>
+dst[31:0] := a[31:0]
+dst[63:32] := b[31:0]
+	</operation>
+	<instruction name='punpckldq' form='mm, mm'/>
+	<header>mmintrin.h</header>
+</intrinsic>
+<intrinsic tech='MMX' rettype='__m64' name='_m_paddb'>
+	<type>Integer</type>
+	<CPUID>MMX</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname='a' type='__m64'/>
+	<parameter varname='b' type='__m64'/>
+	<description>Add packed 8-bit integers in "a" and "b", and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*8
+	dst[i+7:i] := a[i+7:i] + b[i+7:i]
+ENDFOR
+	</operation>
+	<instruction name='paddb' form='mm, mm'/>
+	<header>mmintrin.h</header>
+</intrinsic>
+<intrinsic tech='MMX' rettype='__m64' name='_m_paddw'>
+	<type>Integer</type>
+	<CPUID>MMX</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname='a' type='__m64'/>
+	<parameter varname='b' type='__m64'/>
+	<description>Add packed 16-bit integers in "a" and "b", and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 3
+	i := j*16
+	dst[i+15:i] := a[i+15:i] + b[i+15:i]
+ENDFOR
+	</operation>
+	<instruction name='paddw' form='mm, mm'/>
+	<header>mmintrin.h</header>
+</intrinsic>
+<intrinsic tech='MMX' rettype='__m64' name='_m_paddd'>
+	<type>Integer</type>
+	<CPUID>MMX</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname='a' type='__m64'/>
+	<parameter varname='b' type='__m64'/>
+	<description>Add packed 32-bit integers in "a" and "b", and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 1
+	i := j*32
+	dst[i+31:i] := a[i+31:i] + b[i+31:i]
+ENDFOR
+	</operation>
+	<instruction name='paddd' form='mm, mm'/>
+	<header>mmintrin.h</header>
+</intrinsic>
+<intrinsic tech='MMX' rettype='__m64' name='_m_paddsb'>
+	<type>Integer</type>
+	<CPUID>MMX</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname='a' type='__m64'/>
+	<parameter varname='b' type='__m64'/>
+	<description>Add packed 8-bit integers in "a" and "b" using saturation, and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*8
+	dst[i+7:i] := Saturate_To_Int8( a[i+7:i] + b[i+7:i] )
+ENDFOR
+	</operation>
+	<instruction name='paddsb' form='mm, mm'/>
+	<header>mmintrin.h</header>
+</intrinsic>
+<intrinsic tech='MMX' rettype='__m64' name='_m_paddsw'>
+	<type>Integer</type>
+	<CPUID>MMX</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname='a' type='__m64'/>
+	<parameter varname='b' type='__m64'/>
+	<description>Add packed 16-bit integers in "a" and "b" using saturation, and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 3
+	i := j*16
+	dst[i+15:i] := Saturate_To_Int16( a[i+15:i] + b[i+15:i] )
+ENDFOR
+	</operation>
+	<instruction name='paddsw' form='mm, mm'/>
+	<header>mmintrin.h</header>
+</intrinsic>
+<intrinsic tech='MMX' rettype='__m64' name='_m_paddusb'>
+	<type>Integer</type>
+	<CPUID>MMX</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname='a' type='__m64'/>
+	<parameter varname='b' type='__m64'/>
+	<description>Add packed unsigned 8-bit integers in "a" and "b" using saturation, and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*8
+	dst[i+7:i] := Saturate_To_UnsignedInt8( a[i+7:i] + b[i+7:i] )
+ENDFOR
+	</operation>
+	<instruction name='paddusb' form='mm, mm'/>
+	<header>mmintrin.h</header>
+</intrinsic>
+<intrinsic tech='MMX' rettype='__m64' name='_m_paddusw'>
+	<type>Integer</type>
+	<CPUID>MMX</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname='a' type='__m64'/>
+	<parameter varname='b' type='__m64'/>
+	<description>Add packed unsigned 16-bit integers in "a" and "b" using saturation, and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 3
+	i := j*16
+	dst[i+15:i] := Saturate_To_UnsignedInt16( a[i+15:i] + b[i+15:i] )
+ENDFOR
+	</operation>
+	<instruction name='paddusw' form='mm, mm'/>
+	<header>mmintrin.h</header>
+</intrinsic>
+<intrinsic tech='MMX' rettype='__m64' name='_m_psubb'>
+	<type>Floating Point</type>
+	<type>Integer</type>
+	<CPUID>MMX</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname='a' type='__m64'/>
+	<parameter varname='b' type='__m64'/>
+	<description>Subtract packed 8-bit integers in "b" from packed 8-bit integers in "a", and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*8
+	dst[i+7:i] := a[i+7:i] - b[i+7:i]
+ENDFOR
+	</operation>
+	<instruction name='psubb' form='mm, mm'/>
+	<header>mmintrin.h</header>
+</intrinsic>
+<intrinsic tech='MMX' rettype='__m64' name='_m_psubw'>
+	<type>Floating Point</type>
+	<type>Integer</type>
+	<CPUID>MMX</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname='a' type='__m64'/>
+	<parameter varname='b' type='__m64'/>
+	<description>Subtract packed 16-bit integers in "b" from packed 16-bit integers in "a", and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 3
+	i := j*16
+	dst[i+15:i] := a[i+15:i] - b[i+15:i]
+ENDFOR
+	</operation>
+	<instruction name='psubw' form='mm, mm'/>
+	<header>mmintrin.h</header>
+</intrinsic>
+<intrinsic tech='MMX' rettype='__m64' name='_m_psubd'>
+	<type>Floating Point</type>
+	<type>Integer</type>
+	<CPUID>MMX</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname='a' type='__m64'/>
+	<parameter varname='b' type='__m64'/>
+	<description>Subtract packed 32-bit integers in "b" from packed 32-bit integers in "a", and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 1
+	i := j*32
+	dst[i+31:i] := a[i+31:i] - b[i+31:i]
+ENDFOR
+	</operation>
+	<instruction name='psubd' form='mm, mm'/>
+	<header>mmintrin.h</header>
+</intrinsic>
+<intrinsic tech='MMX' rettype='__m64' name='_m_psubsb'>
+	<type>Floating Point</type>
+	<type>Integer</type>
+	<CPUID>MMX</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname='a' type='__m64'/>
+	<parameter varname='b' type='__m64'/>
+	<description>Subtract packed 8-bit integers in "b" from packed 8-bit integers in "a" using saturation, and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*8
+	dst[i+7:i] := Saturate_To_Int8(a[i+7:i] - b[i+7:i])	
+ENDFOR
+	</operation>
+	<instruction name='psubsb' form='mm, mm'/>
+	<header>mmintrin.h</header>
+</intrinsic>
+<intrinsic tech='MMX' rettype='__m64' name='_m_psubsw'>
+	<type>Floating Point</type>
+	<type>Integer</type>
+	<CPUID>MMX</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname='a' type='__m64'/>
+	<parameter varname='b' type='__m64'/>
+	<description>Subtract packed 16-bit integers in "b" from packed 16-bit integers in "a" using saturation, and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 3
+	i := j*16
+	dst[i+15:i] := Saturate_To_Int16(a[i+15:i] - b[i+15:i])
+ENDFOR
+	</operation>
+	<instruction name='psubsw' form='mm, mm'/>
+	<header>mmintrin.h</header>
+</intrinsic>
+<intrinsic tech='MMX' rettype='__m64' name='_m_psubusb'>
+	<type>Floating Point</type>
+	<type>Integer</type>
+	<CPUID>MMX</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname='a' type='__m64'/>
+	<parameter varname='b' type='__m64'/>
+	<description>Subtract packed unsigned 8-bit integers in "b" from packed unsigned 8-bit integers in "a" using saturation, and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*8
+	dst[i+7:i] := Saturate_To_UnsignedInt8(a[i+7:i] - b[i+7:i])	
+ENDFOR
+	</operation>
+	<instruction name='psubusb' form='mm, mm'/>
+	<header>mmintrin.h</header>
+</intrinsic>
+<intrinsic tech='MMX' rettype='__m64' name='_m_psubusw'>
+	<type>Floating Point</type>
+	<type>Integer</type>
+	<CPUID>MMX</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname='a' type='__m64'/>
+	<parameter varname='b' type='__m64'/>
+	<description>Subtract packed unsigned 16-bit integers in "b" from packed unsigned 16-bit integers in "a" using saturation, and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 3
+	i := j*16
+	dst[i+15:i] := Saturate_To_UnsignedInt16(a[i+15:i] - b[i+15:i])	
+ENDFOR
+	</operation>
+	<instruction name='psubusw' form='mm, mm'/>
+	<header>mmintrin.h</header>
+</intrinsic>
+<intrinsic tech='MMX' rettype='__m64' name='_m_pmaddwd'>
+	<type>Integer</type>
+	<CPUID>MMX</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname='a' type='__m64'/>
+	<parameter varname='b' type='__m64'/>
+	<description>Multiply packed 16-bit integers in "a" and "b", producing intermediate 32-bit integers. Horizontally add adjacent pairs of intermediate 32-bit integers, and pack the results in "dst".
+	</description>
+	<operation>
+FOR j := 0 to 1
+	i := j*32
+	dst[i+31:i] := a[i+31:i+16]*b[i+31:i+16] + a[i+15:i]*b[i+15:i]
+ENDFOR
+	</operation>
+	<instruction name='pmaddwd' form='mm, mm'/>
+	<header>mmintrin.h</header>
+</intrinsic>
+<intrinsic tech='MMX' rettype='__m64' name='_m_pmulhw'>
+	<type>Integer</type>
+	<CPUID>MMX</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname='a' type='__m64'/>
+	<parameter varname='b' type='__m64'/>
+	<description>Multiply the packed 16-bit integers in "a" and "b", producing intermediate 32-bit integers, and store the high 16 bits of the intermediate integers in "dst". </description>
+	<operation>
+FOR j := 0 to 3
+	i := j*16
+	tmp[31:0] := a[i+15:i] * b[i+15:i]
+	dst[i+15:i] := tmp[31:16]
+ENDFOR
+	</operation>
+	<instruction name='pmulhw' form='mm, mm'/>
+	<header>mmintrin.h</header>
+</intrinsic>
+<intrinsic tech='MMX' rettype='__m64' name='_m_pmullw'>
+	<type>Integer</type>
+	<CPUID>MMX</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname='a' type='__m64'/>
+	<parameter varname='b' type='__m64'/>
+	<description>Multiply the packed 16-bit integers in "a" and "b", producing intermediate 32-bit integers, and store the low 16 bits of the intermediate integers in "dst". </description>
+	<operation>
+FOR j := 0 to 3
+	i := j*16
+	tmp[31:0] := a[i+15:i] * b[i+15:i]
+	dst[i+15:i] := tmp[15:0]
+ENDFOR
+	</operation>
+	<instruction name='pmullw' form='mm, mm'/>
+	<header>mmintrin.h</header>
+</intrinsic>
+<intrinsic tech='MMX' rettype='__m64' name='_m_psllw'>
+	<type>Floating Point</type>
+	<type>Integer</type>
+	<CPUID>MMX</CPUID>
+	<category>Shift</category>
+	<parameter varname='a' type='__m64'/>
+	<parameter varname='count' type='__m64'/>
+	<description>Shift packed 16-bit integers in "a" left by "count" while shifting in zeros, and store the results in "dst". </description>
+	<operation>
+FOR j := 0 to 3
+	i := j*16
+	IF count[63:0] &gt; 15
+		dst[i+15:i] := 0
+	ELSE
+		dst[i+15:i] := ZeroExtend(a[i+15:i] &lt;&lt; count[63:0])
+	FI
+ENDFOR
+	</operation>
+	<instruction name='psllw' form='mm, mm'/>
+	<header>mmintrin.h</header>
+</intrinsic>
+<intrinsic tech='MMX' rettype='__m64' name='_m_psllwi'>
+	<type>Floating Point</type>
+	<type>Integer</type>
+	<CPUID>MMX</CPUID>
+	<category>Shift</category>
+	<parameter varname='a' type='__m64'/>
+	<parameter varname="imm8" type='int'/>
+	<description>Shift packed 16-bit integers in "a" left by "imm8" while shifting in zeros, and store the results in "dst". </description>
+	<operation>
+FOR j := 0 to 3
+	i := j*16
+	IF imm8[7:0] &gt; 15
+		dst[i+15:i] := 0
+	ELSE
+		dst[i+15:i] := ZeroExtend(a[i+15:i] &lt;&lt; imm8[7:0])
+	FI
+ENDFOR
+	</operation>
+	<instruction name='psllw' form='mm, imm'/>
+	<header>mmintrin.h</header>
+</intrinsic>
+<intrinsic tech='MMX' rettype='__m64' name='_m_pslld'>
+	<type>Floating Point</type>
+	<type>Integer</type>
+	<CPUID>MMX</CPUID>
+	<category>Shift</category>
+	<parameter varname='a' type='__m64'/>
+	<parameter varname='count' type='__m64'/>
+	<description>Shift packed 32-bit integers in "a" left by "count" while shifting in zeros, and store the results in "dst". </description>
+	<operation>
+FOR j := 0 to 1
+	i := j*32
+	IF count[63:0] &gt; 31
+		dst[i+31:i] := 0
+	ELSE
+		dst[i+31:i] := ZeroExtend(a[i+31:i] &lt;&lt; count[63:0])
+	FI
+ENDFOR
+	</operation>
+	<instruction name='pslld' form='mm, mm'/>
+	<header>mmintrin.h</header>
+</intrinsic>
+<intrinsic tech='MMX' rettype='__m64' name='_m_pslldi'>
+	<type>Floating Point</type>
+	<type>Integer</type>
+	<CPUID>MMX</CPUID>
+	<category>Shift</category>
+	<parameter varname='a' type='__m64'/>
+	<parameter varname="imm8" type='int'/>
+	<description>Shift packed 32-bit integers in "a" left by "imm8" while shifting in zeros, and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 1
+	i := j*32
+	IF imm8[7:0] &gt; 31
+		dst[i+31:i] := 0
+	ELSE
+		dst[i+31:i] := ZeroExtend(a[i+31:i] &lt;&lt; imm8[7:0])
+	FI
+ENDFOR
+	</operation>
+	<instruction name='pslld' form='mm, imm'/>
+	<header>mmintrin.h</header>
+</intrinsic>
+<intrinsic tech='MMX' rettype='__m64' name='_m_psllq'>
+	<type>Floating Point</type>
+	<type>Integer</type>
+	<CPUID>MMX</CPUID>
+	<category>Shift</category>
+	<parameter varname='a' type='__m64'/>
+	<parameter varname='count' type='__m64'/>
+	<description>Shift 64-bit integer "a" left by "count" while shifting in zeros, and store the result in "dst". </description>
+	<operation>
+IF count[63:0] &gt; 63
+	dst[63:0] := 0
+ELSE
+	dst[63:0] := ZeroExtend(a[63:0] &lt;&lt; count[63:0])
+FI
+	</operation>
+	<instruction name='psllq' form='mm, mm'/>
+	<header>mmintrin.h</header>
+</intrinsic>
+<intrinsic tech='MMX' rettype='__m64' name='_m_psllqi'>
+	<type>Floating Point</type>
+	<type>Integer</type>
+	<CPUID>MMX</CPUID>
+	<category>Shift</category>
+	<parameter varname='a' type='__m64'/>
+	<parameter varname="imm8" type='int'/>
+	<description>Shift 64-bit integer "a" left by "imm8" while shifting in zeros, and store the result in "dst". </description>
+	<operation>
+IF imm8[7:0] &gt; 63
+	dst[63:0] := 0
+ELSE
+	dst[63:0] := ZeroExtend(a[63:0] &lt;&lt; imm8[7:0])
+FI
+	</operation>
+	<instruction name='psllq' form='mm, imm'/>
+	<header>mmintrin.h</header>
+</intrinsic>
+<intrinsic tech='MMX' rettype='__m64' name='_m_psraw'>
+	<type>Floating Point</type>
+	<type>Integer</type>
+	<CPUID>MMX</CPUID>
+	<category>Shift</category>
+	<parameter varname='a' type='__m64'/>
+	<parameter varname='count' type='__m64'/>
+	<description>Shift packed 16-bit integers in "a" right by "count" while shifting in sign bits, and store the results in "dst". </description>
+	<operation>
+FOR j := 0 to 3
+	i := j*16
+	IF count[63:0] &gt; 15
+		dst[i+15:i] := SignBit
+	ELSE
+		dst[i+15:i] := SignExtend(a[i+15:i] &gt;&gt; count[63:0])
+	FI
+ENDFOR
+	</operation>
+	<instruction name='psraw' form='mm, mm'/>
+	<header>mmintrin.h</header>
+</intrinsic>
+<intrinsic tech='MMX' rettype='__m64' name='_m_psrawi'>
+	<type>Floating Point</type>
+	<type>Integer</type>
+	<CPUID>MMX</CPUID>
+	<category>Shift</category>
+	<parameter varname='a' type='__m64'/>
+	<parameter varname="imm8" type='int'/>
+	<description>Shift packed 16-bit integers in "a" right by "imm8" while shifting in sign bits, and store the results in "dst". </description>
+	<operation>
+FOR j := 0 to 3
+	i := j*16
+	IF imm8[7:0] &gt; 15
+		dst[i+15:i] := SignBit
+	ELSE
+		dst[i+15:i] := SignExtend(a[i+15:i] &gt;&gt; imm8[7:0])
+	FI
+ENDFOR
+	</operation>
+	<instruction name='psraw' form='mm, imm'/>
+	<header>mmintrin.h</header>
+</intrinsic>
+<intrinsic tech='MMX' rettype='__m64' name='_m_psrad'>
+	<type>Floating Point</type>
+	<type>Integer</type>
+	<CPUID>MMX</CPUID>
+	<category>Shift</category>
+	<parameter varname='a' type='__m64'/>
+	<parameter varname='count' type='__m64'/>
+	<description>Shift packed 32-bit integers in "a" right by "count" while shifting in sign bits, and store the results in "dst". </description>
+	<operation>
+FOR j := 0 to 1
+	i := j*32
+	IF count[63:0] &gt; 31
+		dst[i+31:i] := SignBit
+	ELSE
+		dst[i+31:i] := SignExtend(a[i+31:i] &gt;&gt; count[63:0])
+	FI
+ENDFOR
+	</operation>
+	<instruction name='psrad' form='mm, mm'/>
+	<header>mmintrin.h</header>
+</intrinsic>
+<intrinsic tech='MMX' rettype='__m64' name='_m_psradi'>
+	<type>Floating Point</type>
+	<type>Integer</type>
+	<CPUID>MMX</CPUID>
+	<category>Shift</category>
+	<parameter varname='a' type='__m64'/>
+	<parameter varname="imm8" type='int'/>
+	<description>Shift packed 32-bit integers in "a" right by "imm8" while shifting in sign bits, and store the results in "dst". </description>
+	<operation>
+FOR j := 0 to 1
+	i := j*32
+	IF imm8[7:0] &gt; 31
+		dst[i+31:i] := SignBit
+	ELSE
+		dst[i+31:i] := SignExtend(a[i+31:i] &gt;&gt; imm8[7:0])
+	FI
+ENDFOR
+	</operation>
+	<instruction name='psrad' form='mm, imm'/>
+	<header>mmintrin.h</header>
+</intrinsic>
+<intrinsic tech='MMX' rettype='__m64' name='_m_psrlw'>
+	<type>Floating Point</type>
+	<type>Integer</type>
+	<CPUID>MMX</CPUID>
+	<category>Shift</category>
+	<parameter varname='a' type='__m64'/>
+	<parameter varname='count' type='__m64'/>
+	<description>Shift packed 16-bit integers in "a" right by "count" while shifting in zeros, and store the results in "dst". </description>
+	<operation>
+FOR j := 0 to 3
+	i := j*16
+	IF count[63:0] &gt; 15
+		dst[i+15:i] := 0
+	ELSE
+		dst[i+15:i] := ZeroExtend(a[i+15:i] &gt;&gt; count[63:0])
+	FI
+ENDFOR
+	</operation>
+	<instruction name='psrlw' form='mm, mm'/>
+	<header>mmintrin.h</header>
+</intrinsic>
+<intrinsic tech='MMX' rettype='__m64' name='_m_psrlwi'>
+	<type>Floating Point</type>
+	<type>Integer</type>
+	<CPUID>MMX</CPUID>
+	<category>Shift</category>
+	<parameter varname='a' type='__m64'/>
+	<parameter varname="imm8" type='int'/>
+	<description>Shift packed 16-bit integers in "a" right by "imm8" while shifting in zeros, and store the results in "dst". </description>
+	<operation>
+FOR j := 0 to 3
+	i := j*16
+	IF imm8[7:0] &gt; 15
+		dst[i+15:i] := 0
+	ELSE
+		dst[i+15:i] := ZeroExtend(a[i+15:i] &gt;&gt; imm8[7:0])
+	FI
+ENDFOR
+	</operation>
+	<instruction name='psrlw' form='mm, imm'/>
+	<header>mmintrin.h</header>
+</intrinsic>
+<intrinsic tech='MMX' rettype='__m64' name='_m_psrld'>
+	<type>Floating Point</type>
+	<type>Integer</type>
+	<CPUID>MMX</CPUID>
+	<category>Shift</category>
+	<parameter varname='a' type='__m64'/>
+	<parameter varname='count' type='__m64'/>
+	<description>Shift packed 32-bit integers in "a" right by "count" while shifting in zeros, and store the results in "dst". </description>
+	<operation>
+FOR j := 0 to 1
+	i := j*32
+	IF count[63:0] &gt; 31
+		dst[i+31:i] := 0
+	ELSE
+		dst[i+31:i] := ZeroExtend(a[i+31:i] &gt;&gt; count[63:0])
+	FI
+ENDFOR
+	</operation>
+	<instruction name='psrld' form='mm, mm'/>
+	<header>mmintrin.h</header>
+</intrinsic>
+<intrinsic tech='MMX' rettype='__m64' name='_m_psrldi'>
+	<type>Floating Point</type>
+	<type>Integer</type>
+	<CPUID>MMX</CPUID>
+	<category>Shift</category>
+	<parameter varname='a' type='__m64'/>
+	<parameter varname="imm8" type='int'/>
+	<description>Shift packed 32-bit integers in "a" right by "imm8" while shifting in zeros, and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 1
+	i := j*32
+	IF imm8[7:0] &gt; 31
+		dst[i+31:i] := 0
+	ELSE
+		dst[i+31:i] := ZeroExtend(a[i+31:i] &gt;&gt; imm8[7:0])
+	FI
+ENDFOR
+	</operation>
+	<instruction name='psrld' form='mm, imm'/>
+	<header>mmintrin.h</header>
+</intrinsic>
+<intrinsic tech='MMX' rettype='__m64' name='_m_psrlq'>
+	<type>Floating Point</type>
+	<type>Integer</type>
+	<CPUID>MMX</CPUID>
+	<category>Shift</category>
+	<parameter varname='a' type='__m64'/>
+	<parameter varname='count' type='__m64'/>
+	<description>Shift 64-bit integer "a" right by "count" while shifting in zeros, and store the result in "dst". </description>
+	<operation>
+IF count[63:0] &gt; 63
+	dst[63:0] := 0
+ELSE
+	dst[63:0] := ZeroExtend(a[63:0] &gt;&gt; count[63:0])
+FI
+	</operation>
+	<instruction name='psrlq' form='mm, mm'/>
+	<header>mmintrin.h</header>
+</intrinsic>
+<intrinsic tech='MMX' rettype='__m64' name='_m_psrlqi'>
+	<type>Floating Point</type>
+	<type>Integer</type>
+	<CPUID>MMX</CPUID>
+	<category>Shift</category>
+	<parameter varname='a' type='__m64'/>
+	<parameter varname="imm8" type='int'/>
+	<description>Shift 64-bit integer "a" right by "imm8" while shifting in zeros, and store the result in "dst". </description>
+	<operation>
+IF imm8[7:0] &gt; 63
+	dst[63:0] := 0
+ELSE
+	dst[63:0] := ZeroExtend(a[63:0] &gt;&gt; imm8[7:0])
+FI
+	</operation>
+	<instruction name='psrlq' form='mm, imm'/>
+	<header>mmintrin.h</header>
+</intrinsic>
+<intrinsic tech='MMX' rettype='__m64' name='_m_pand'>
+	<type>Integer</type>
+	<CPUID>MMX</CPUID>
+	<category>Logical</category>
+	<parameter varname='a' type='__m64'/>
+	<parameter varname='b' type='__m64'/>
+	<description>Compute the bitwise AND of 64 bits (representing integer data) in "a" and "b", and store the result in "dst".</description>
+	<operation>
+dst[63:0] := (a[63:0] AND b[63:0])
+	</operation>
+	<instruction name='pand' form='mm, mm'/>
+	<header>mmintrin.h</header>
+</intrinsic>
+<intrinsic tech='MMX' rettype='__m64' name='_m_pandn'>
+	<type>Integer</type>
+	<CPUID>MMX</CPUID>
+	<category>Logical</category>
+	<parameter varname='a' type='__m64'/>
+	<parameter varname='b' type='__m64'/>
+	<description>Compute the bitwise NOT of 64 bits (representing integer data) in "a" and then AND with "b", and store the result in "dst".</description>
+	<operation>
+dst[63:0] := ((NOT a[63:0]) AND b[63:0])
+	</operation>
+	<instruction name='pandn' form='mm, mm'/>
+	<header>mmintrin.h</header>
+</intrinsic>
+<intrinsic tech='MMX' rettype='__m64' name='_m_por'>
+	<type>Integer</type>
+	<CPUID>MMX</CPUID>
+	<category>Logical</category>
+	<parameter varname='a' type='__m64'/>
+	<parameter varname='b' type='__m64'/>
+	<description>Compute the bitwise OR of 64 bits (representing integer data) in "a" and "b", and store the result in "dst".</description>
+	<operation>
+dst[63:0] := (a[63:0] OR b[63:0])
+	</operation>
+	<instruction name='por' form='mm, mm'/>
+	<header>mmintrin.h</header>
+</intrinsic>
+<intrinsic tech='MMX' rettype='__m64' name='_m_pxor'>
+	<type>Integer</type>
+	<CPUID>MMX</CPUID>
+	<category>Logical</category>
+	<parameter varname='a' type='__m64'/>
+	<parameter varname='b' type='__m64'/>
+	<description>Compute the bitwise XOR of 64 bits (representing integer data) in "a" and "b", and store the result in "dst".</description>
+	<operation>
+dst[63:0] := (a[63:0] XOR b[63:0])
+	</operation>
+	<instruction name='pxor' form='mm, mm'/>
+	<header>mmintrin.h</header>
+</intrinsic>
+<intrinsic tech='MMX' rettype='__m64' name='_m_pcmpeqb'>
+	<type>Integer</type>
+	<CPUID>MMX</CPUID>
+	<category>Compare</category>
+	<parameter varname='a' type='__m64'/>
+	<parameter varname='b' type='__m64'/>
+	<description>Compare packed 8-bit integers in "a" and "b" for equality, and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*8
+	dst[i+7:i] := ( a[i+7:i] == b[i+7:i] ) ? 0xFF : 0
+ENDFOR
+	</operation>
+	<instruction name='pcmpeqb' form='mm, mm'/>
+	<header>mmintrin.h</header>
+</intrinsic>
+<intrinsic tech='MMX' rettype='__m64' name='_m_pcmpeqw'>
+	<type>Integer</type>
+	<CPUID>MMX</CPUID>
+	<category>Compare</category>
+	<parameter varname='a' type='__m64'/>
+	<parameter varname='b' type='__m64'/>
+	<description>Compare packed 16-bit integers in "a" and "b" for equality, and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 3
+	i := j*16
+	dst[i+15:i] := ( a[i+15:i] == b[i+15:i] ) ? 0xFFFF : 0
+ENDFOR
+	</operation>
+	<instruction name='pcmpeqw' form='mm, mm'/>
+	<header>mmintrin.h</header>
+</intrinsic>
+<intrinsic tech='MMX' rettype='__m64' name='_m_pcmpeqd'>
+	<type>Integer</type>
+	<CPUID>MMX</CPUID>
+	<category>Compare</category>
+	<parameter varname='a' type='__m64'/>
+	<parameter varname='b' type='__m64'/>
+	<description>Compare packed 32-bit integers in "a" and "b" for equality, and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 1
+	i := j*32
+	dst[i+31:i] := ( a[i+31:i] == b[i+31:i] ) ? 0xFFFFFFFF : 0
+ENDFOR
+	</operation>
+	<instruction name='pcmpeqd' form='mm, mm'/>
+	<header>mmintrin.h</header>
+</intrinsic>
+<intrinsic tech='MMX' rettype='__m64' name='_m_pcmpgtb'>
+	<type>Integer</type>
+	<CPUID>MMX</CPUID>
+	<category>Compare</category>
+	<parameter varname='a' type='__m64'/>
+	<parameter varname='b' type='__m64'/>
+	<description>Compare packed 8-bit integers in "a" and "b" for greater-than, and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*8
+	dst[i+7:i] := ( a[i+7:i] &gt; b[i+7:i] ) ? 0xFF : 0
+ENDFOR
+	</operation>
+	<instruction name='pcmpgtb' form='mm, mm'/>
+	<header>mmintrin.h</header>
+</intrinsic>
+<intrinsic tech='MMX' rettype='__m64' name='_m_pcmpgtw'>
+	<type>Integer</type>
+	<CPUID>MMX</CPUID>
+	<category>Compare</category>
+	<parameter varname='a' type='__m64'/>
+	<parameter varname='b' type='__m64'/>
+	<description>Compare packed 16-bit integers in "a" and "b" for greater-than, and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 3
+	i := j*16
+	dst[i+15:i] := ( a[i+15:i] &gt; b[i+15:i] ) ? 0xFFFF : 0
+ENDFOR
+	</operation>
+	<instruction name='pcmpgtw' form='mm, mm'/>
+	<header>mmintrin.h</header>
+</intrinsic>
+<intrinsic tech='MMX' rettype='__m64' name='_m_pcmpgtd'>
+	<type>Integer</type>
+	<CPUID>MMX</CPUID>
+	<category>Compare</category>
+	<parameter varname='a' type='__m64'/>
+	<parameter varname='b' type='__m64'/>
+	<description>Compare packed 32-bit integers in "a" and "b" for greater-than, and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 1
+	i := j*32
+	dst[i+31:i] := ( a[i+31:i] &gt; b[i+31:i] ) ? 0xFFFFFFFF : 0
+ENDFOR
+	</operation>
+	<instruction name='pcmpgtd' form='mm, mm'/>
+	<header>mmintrin.h</header>
+</intrinsic>
+<intrinsic tech='MMX' rettype='void' name='_mm_empty'>
+	<CPUID>MMX</CPUID>
+	<category>General Support</category>
+	<parameter varname='' type='void'/>
+	<description>Empty the MMX state, which marks the x87 FPU registers as available for use by x87 instructions. This instruction must be used at the end of all MMX technology procedures.</description>
+	<instruction name='emms' form=''/>
+	<header>mmintrin.h</header>
+</intrinsic>
+<intrinsic tech='MMX' rettype='__m64' name='_mm_add_pi8'>
+	<type>Integer</type>
+	<CPUID>MMX</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname='a' type='__m64'/>
+	<parameter varname='b' type='__m64'/>
+	<description>Add packed 8-bit integers in "a" and "b", and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*8
+	dst[i+7:i] := a[i+7:i] + b[i+7:i]
+ENDFOR
+	</operation>
+	<instruction name='paddb' form='mm, mm'/>
+	<header>mmintrin.h</header>
+</intrinsic>
+<intrinsic tech='MMX' rettype='__m64' name='_mm_add_pi16'>
+	<type>Integer</type>
+	<CPUID>MMX</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname='a' type='__m64'/>
+	<parameter varname='b' type='__m64'/>
+	<description>Add packed 16-bit integers in "a" and "b", and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 3
+	i := j*16
+	dst[i+15:i] := a[i+15:i] + b[i+15:i]
+ENDFOR
+	</operation>
+	<instruction name='paddw' form='mm, mm'/>
+	<header>mmintrin.h</header>
+</intrinsic>
+<intrinsic tech='MMX' rettype='__m64' name='_mm_add_pi32'>
+	<type>Integer</type>
+	<CPUID>MMX</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname='a' type='__m64'/>
+	<parameter varname='b' type='__m64'/>
+	<description>Add packed 32-bit integers in "a" and "b", and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 1
+	i := j*32
+	dst[i+31:i] := a[i+31:i] + b[i+31:i]
+ENDFOR
+	</operation>
+	<instruction name='paddd' form='mm, mm'/>
+	<header>mmintrin.h</header>
+</intrinsic>
+<intrinsic tech='MMX' rettype='__m64' name='_mm_adds_pi8'>
+	<type>Integer</type>
+	<CPUID>MMX</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname='a' type='__m64'/>
+	<parameter varname='b' type='__m64'/>
+	<description>Add packed 8-bit integers in "a" and "b" using saturation, and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*8
+	dst[i+7:i] := Saturate_To_Int8( a[i+7:i] + b[i+7:i] )
+ENDFOR
+	</operation>
+	<instruction name='paddsb' form='mm, mm'/>
+	<header>mmintrin.h</header>
+</intrinsic>
+<intrinsic tech='MMX' rettype='__m64' name='_mm_adds_pi16'>
+	<type>Integer</type>
+	<CPUID>MMX</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname='a' type='__m64'/>
+	<parameter varname='b' type='__m64'/>
+	<description>Add packed 16-bit integers in "a" and "b" using saturation, and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 3
+	i := j*16
+	dst[i+15:i] := Saturate_To_Int16( a[i+15:i] + b[i+15:i] )
+ENDFOR
+	</operation>
+	<instruction name='paddsw' form='mm, mm'/>
+	<header>mmintrin.h</header>
+</intrinsic>
+<intrinsic tech='MMX' rettype='__m64' name='_mm_adds_pu8'>
+	<type>Integer</type>
+	<CPUID>MMX</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname='a' type='__m64'/>
+	<parameter varname='b' type='__m64'/>
+	<description>Add packed unsigned 8-bit integers in "a" and "b" using saturation, and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*8
+	dst[i+7:i] := Saturate_To_UnsignedInt8( a[i+7:i] + b[i+7:i] )
+ENDFOR
+	</operation>
+	<instruction name='paddusb' form='mm, mm'/>
+	<header>mmintrin.h</header>
+</intrinsic>
+<intrinsic tech='MMX' rettype='__m64' name='_mm_adds_pu16'>
+	<type>Integer</type>
+	<CPUID>MMX</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname='a' type='__m64'/>
+	<parameter varname='b' type='__m64'/>
+	<description>Add packed unsigned 16-bit integers in "a" and "b" using saturation, and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 3
+	i := j*16
+	dst[i+15:i] := Saturate_To_UnsignedInt16( a[i+15:i] + b[i+15:i] )
+ENDFOR
+	</operation>
+	<instruction name='paddusw' form='mm, mm'/>
+	<header>mmintrin.h</header>
+</intrinsic>
+<intrinsic tech='MMX' rettype='__m64' name='_mm_sub_pi8'>
+	<type>Integer</type>
+	<CPUID>MMX</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname='a' type='__m64'/>
+	<parameter varname='b' type='__m64'/>
+	<description>Subtract packed 8-bit integers in "b" from packed 8-bit integers in "a", and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*8
+	dst[i+7:i] := a[i+7:i] - b[i+7:i]
+ENDFOR
+	</operation>
+	<instruction name='psubb' form='mm, mm'/>
+	<header>mmintrin.h</header>
+</intrinsic>
+<intrinsic tech='MMX' rettype='__m64' name='_mm_sub_pi16'>
+	<type>Integer</type>
+	<CPUID>MMX</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname='a' type='__m64'/>
+	<parameter varname='b' type='__m64'/>
+	<description>Subtract packed 16-bit integers in "b" from packed 16-bit integers in "a", and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 3
+	i := j*16
+	dst[i+15:i] := a[i+15:i] - b[i+15:i]
+ENDFOR
+	</operation>
+	<instruction name='psubw' form='mm, mm'/>
+	<header>mmintrin.h</header>
+</intrinsic>
+<intrinsic tech='MMX' rettype='__m64' name='_mm_sub_pi32'>
+	<type>Integer</type>
+	<CPUID>MMX</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname='a' type='__m64'/>
+	<parameter varname='b' type='__m64'/>
+	<description>Subtract packed 32-bit integers in "b" from packed 32-bit integers in "a", and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 1
+	i := j*32
+	dst[i+31:i] := a[i+31:i] - b[i+31:i]
+ENDFOR
+	</operation>
+	<instruction name='psubd' form='mm, mm'/>
+	<header>mmintrin.h</header>
+</intrinsic>
+<intrinsic tech='MMX' rettype='__m64' name='_mm_subs_pi8'>
+	<type>Integer</type>
+	<CPUID>MMX</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname='a' type='__m64'/>
+	<parameter varname='b' type='__m64'/>
+	<description>Subtract packed 8-bit integers in "b" from packed 8-bit integers in "a" using saturation, and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*8
+	dst[i+7:i] := Saturate_To_Int8(a[i+7:i] - b[i+7:i])	
+ENDFOR
+	</operation>
+	<instruction name='psubsb' form='mm, mm'/>
+	<header>mmintrin.h</header>
+</intrinsic>
+<intrinsic tech='MMX' rettype='__m64' name='_mm_subs_pi16'>
+	<type>Integer</type>
+	<CPUID>MMX</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname='a' type='__m64'/>
+	<parameter varname='b' type='__m64'/>
+	<description>Subtract packed 16-bit integers in "b" from packed 16-bit integers in "a" using saturation, and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 3
+	i := j*16
+	dst[i+15:i] := Saturate_To_Int16(a[i+15:i] - b[i+15:i])
+ENDFOR
+	</operation>
+	<instruction name='psubsw' form='mm, mm'/>
+	<header>mmintrin.h</header>
+</intrinsic>
+<intrinsic tech='MMX' rettype='__m64' name='_mm_subs_pu8'>
+	<type>Integer</type>
+	<CPUID>MMX</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname='a' type='__m64'/>
+	<parameter varname='b' type='__m64'/>
+	<description>Subtract packed unsigned 8-bit integers in "b" from packed unsigned 8-bit integers in "a" using saturation, and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*8
+	dst[i+7:i] := Saturate_To_UnsignedInt8(a[i+7:i] - b[i+7:i])	
+ENDFOR
+	</operation>
+	<instruction name='psubusb' form='mm, mm'/>
+	<header>mmintrin.h</header>
+</intrinsic>
+<intrinsic tech='MMX' rettype='__m64' name='_mm_subs_pu16'>
+	<type>Integer</type>
+	<CPUID>MMX</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname='a' type='__m64'/>
+	<parameter varname='b' type='__m64'/>
+	<description>Subtract packed unsigned 16-bit integers in "b" from packed unsigned 16-bit integers in "a" using saturation, and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 3
+	i := j*16
+	dst[i+15:i] := Saturate_To_UnsignedInt16(a[i+15:i] - b[i+15:i])	
+ENDFOR
+	</operation>
+	<instruction name='psubusw' form='mm, mm'/>
+	<header>mmintrin.h</header>
+</intrinsic>
+<intrinsic tech='MMX' rettype='__m64' name='_mm_madd_pi16'>
+	<type>Integer</type>
+	<CPUID>MMX</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname='a' type='__m64'/>
+	<parameter varname='b' type='__m64'/>
+	<description>Multiply packed 16-bit integers in "a" and "b", producing intermediate 32-bit integers. Horizontally add adjacent pairs of intermediate 32-bit integers, and pack the results in "dst".
+	</description>
+	<operation>
+FOR j := 0 to 1
+	i := j*32
+	dst[i+31:i] := a[i+31:i+16]*b[i+31:i+16] + a[i+15:i]*b[i+15:i]
+ENDFOR
+	</operation>
+	<instruction name='pmaddwd' form='mm, mm'/>
+	<header>mmintrin.h</header>
+</intrinsic>
+<intrinsic tech='MMX' rettype='__m64' name='_mm_mulhi_pi16'>
+	<type>Integer</type>
+	<CPUID>MMX</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname='a' type='__m64'/>
+	<parameter varname='b' type='__m64'/>
+	<description>Multiply the packed 16-bit integers in "a" and "b", producing intermediate 32-bit integers, and store the high 16 bits of the intermediate integers in "dst". </description>
+	<operation>
+FOR j := 0 to 3
+	i := j*16
+	tmp[31:0] := a[i+15:i] * b[i+15:i]
+	dst[i+15:i] := tmp[31:16]
+ENDFOR
+	</operation>
+	<instruction name='pmulhw' form='mm, mm'/>
+	<header>mmintrin.h</header>
+</intrinsic>
+<intrinsic tech='MMX' rettype='__m64' name='_mm_mullo_pi16'>
+	<type>Integer</type>
+	<CPUID>MMX</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname='a' type='__m64'/>
+	<parameter varname='b' type='__m64'/>
+	<description>Multiply the packed 16-bit integers in "a" and "b", producing intermediate 32-bit integers, and store the low 16 bits of the intermediate integers in "dst". </description>
+	<operation>
+FOR j := 0 to 3
+	i := j*16
+	tmp[31:0] := a[i+15:i] * b[i+15:i]
+	dst[i+15:i] := tmp[15:0]
+ENDFOR
+	</operation>
+	<instruction name='pmullw' form='mm, mm'/>
+	<header>mmintrin.h</header>
+</intrinsic>
+<intrinsic tech='MMX' rettype='__m64' name='_mm_sll_pi16'>
+	<type>Integer</type>
+	<CPUID>MMX</CPUID>
+	<category>Shift</category>
+	<parameter varname='a' type='__m64'/>
+	<parameter varname='count' type='__m64'/>
+	<description>Shift packed 16-bit integers in "a" left by "count" while shifting in zeros, and store the results in "dst". </description>
+	<operation>
+FOR j := 0 to 3
+	i := j*16
+	IF count[63:0] &gt; 15
+		dst[i+15:i] := 0
+	ELSE
+		dst[i+15:i] := ZeroExtend(a[i+15:i] &lt;&lt; count[63:0])
+	FI
+ENDFOR
+	</operation>
+	<instruction name='psllw' form='mm, mm'/>
+	<header>mmintrin.h</header>
+</intrinsic>
+<intrinsic tech='MMX' rettype='__m64' name='_mm_slli_pi16'>
+	<type>Integer</type>
+	<CPUID>MMX</CPUID>
+	<category>Shift</category>
+	<parameter varname='a' type='__m64'/>
+	<parameter varname="imm8" type='int'/>
+	<description>Shift packed 16-bit integers in "a" left by "imm8" while shifting in zeros, and store the results in "dst". </description>
+	<operation>
+FOR j := 0 to 3
+	i := j*16
+	IF imm8[7:0] &gt; 15
+		dst[i+15:i] := 0
+	ELSE
+		dst[i+15:i] := ZeroExtend(a[i+15:i] &lt;&lt; imm8[7:0])
+	FI
+ENDFOR
+	</operation>
+	<instruction name='psllw' form='mm, imm'/>
+	<header>mmintrin.h</header>
+</intrinsic>
+<intrinsic tech='MMX' rettype='__m64' name='_mm_sll_pi32'>
+	<type>Integer</type>
+	<CPUID>MMX</CPUID>
+	<category>Shift</category>
+	<parameter varname='a' type='__m64'/>
+	<parameter varname='count' type='__m64'/>
+	<description>Shift packed 32-bit integers in "a" left by "count" while shifting in zeros, and store the results in "dst". </description>
+	<operation>
+FOR j := 0 to 1
+	i := j*32
+	IF count[63:0] &gt; 31
+		dst[i+31:i] := 0
+	ELSE
+		dst[i+31:i] := ZeroExtend(a[i+31:i] &lt;&lt; count[63:0])
+	FI
+ENDFOR
+	</operation>
+	<instruction name='pslld' form='mm, mm'/>
+	<header>mmintrin.h</header>
+</intrinsic>
+<intrinsic tech='MMX' rettype='__m64' name='_mm_slli_pi32'>
+	<type>Integer</type>
+	<CPUID>MMX</CPUID>
+	<category>Shift</category>
+	<parameter varname='a' type='__m64'/>
+	<parameter varname="imm8" type='int'/>
+	<description>Shift packed 32-bit integers in "a" left by "imm8" while shifting in zeros, and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 1
+	i := j*32
+	IF imm8[7:0] &gt; 31
+		dst[i+31:i] := 0
+	ELSE
+		dst[i+31:i] := ZeroExtend(a[i+31:i] &lt;&lt; imm8[7:0])
+	FI
+ENDFOR
+	</operation>
+	<instruction name='pslld' form='mm, imm'/>
+	<header>mmintrin.h</header>
+</intrinsic>
+<intrinsic tech='MMX' rettype='__m64' name='_mm_sll_si64'>
+	<type>Integer</type>
+	<CPUID>MMX</CPUID>
+	<category>Shift</category>
+	<parameter varname='a' type='__m64'/>
+	<parameter varname='count' type='__m64'/>
+	<description>Shift 64-bit integer "a" left by "count" while shifting in zeros, and store the result in "dst". </description>
+	<operation>
+IF count[63:0] &gt; 63
+	dst[63:0] := 0
+ELSE
+	dst[63:0] := ZeroExtend(a[63:0] &lt;&lt; count[63:0])
+FI
+	</operation>
+	<instruction name='psllq' form='mm, mm'/>
+	<header>mmintrin.h</header>
+</intrinsic>
+<intrinsic tech='MMX' rettype='__m64' name='_mm_slli_si64'>
+	<type>Integer</type>
+	<CPUID>MMX</CPUID>
+	<category>Shift</category>
+	<parameter varname='a' type='__m64'/>
+	<parameter varname="imm8" type='int'/>
+	<description>Shift 64-bit integer "a" left by "imm8" while shifting in zeros, and store the result in "dst". </description>
+	<operation>
+IF imm8[7:0] &gt; 63
+	dst[63:0] := 0
+ELSE
+	dst[63:0] := ZeroExtend(a[63:0] &lt;&lt; imm8[7:0])
+FI
+	</operation>
+	<instruction name='psllq' form='mm, imm'/>
+	<header>mmintrin.h</header>
+</intrinsic>
+<intrinsic tech='MMX' rettype='__m64' name='_mm_sra_pi16'>
+	<type>Integer</type>
+	<CPUID>MMX</CPUID>
+	<category>Shift</category>
+	<parameter varname='a' type='__m64'/>
+	<parameter varname='count' type='__m64'/>
+	<description>Shift packed 16-bit integers in "a" right by "count" while shifting in sign bits, and store the results in "dst". </description>
+	<operation>
+FOR j := 0 to 3
+	i := j*16
+	IF count[63:0] &gt; 15
+		dst[i+15:i] := SignBit
+	ELSE
+		dst[i+15:i] := SignExtend(a[i+15:i] &gt;&gt; count[63:0])
+	FI
+ENDFOR
+	</operation>
+	<instruction name='psraw' form='mm, mm'/>
+	<header>mmintrin.h</header>
+</intrinsic>
+<intrinsic tech='MMX' rettype='__m64' name='_mm_srai_pi16'>
+	<type>Integer</type>
+	<CPUID>MMX</CPUID>
+	<category>Shift</category>
+	<parameter varname='a' type='__m64'/>
+	<parameter varname="imm8" type='int'/>
+	<description>Shift packed 16-bit integers in "a" right by "imm8" while shifting in sign bits, and store the results in "dst". </description>
+	<operation>
+FOR j := 0 to 3
+	i := j*16
+	IF imm8[7:0] &gt; 15
+		dst[i+15:i] := SignBit
+	ELSE
+		dst[i+15:i] := SignExtend(a[i+15:i] &gt;&gt; imm8[7:0])
+	FI
+ENDFOR
+	</operation>
+	<instruction name='psraw' form='mm, imm'/>
+	<header>mmintrin.h</header>
+</intrinsic>
+<intrinsic tech='MMX' rettype='__m64' name='_mm_sra_pi32'>
+	<type>Integer</type>
+	<CPUID>MMX</CPUID>
+	<category>Shift</category>
+	<parameter varname='a' type='__m64'/>
+	<parameter varname='count' type='__m64'/>
+	<description>Shift packed 32-bit integers in "a" right by "count" while shifting in sign bits, and store the results in "dst". </description>
+	<operation>
+FOR j := 0 to 1
+	i := j*32
+	IF count[63:0] &gt; 31
+		dst[i+31:i] := SignBit
+	ELSE
+		dst[i+31:i] := SignExtend(a[i+31:i] &gt;&gt; count[63:0])
+	FI
+ENDFOR
+	</operation>
+	<instruction name='psrad' form='mm, mm'/>
+	<header>mmintrin.h</header>
+</intrinsic>
+<intrinsic tech='MMX' rettype='__m64' name='_mm_srai_pi32'>
+	<type>Integer</type>
+	<CPUID>MMX</CPUID>
+	<category>Shift</category>
+	<parameter varname='a' type='__m64'/>
+	<parameter varname="imm8" type='int'/>
+	<description>Shift packed 32-bit integers in "a" right by "imm8" while shifting in sign bits, and store the results in "dst". </description>
+	<operation>
+FOR j := 0 to 1
+	i := j*32
+	IF imm8[7:0] &gt; 31
+		dst[i+31:i] := SignBit
+	ELSE
+		dst[i+31:i] := SignExtend(a[i+31:i] &gt;&gt; imm8[7:0])
+	FI
+ENDFOR
+	</operation>
+	<instruction name='psrad' form='mm, imm'/>
+	<header>mmintrin.h</header>
+</intrinsic>
+<intrinsic tech='MMX' rettype='__m64' name='_mm_srl_pi16'>
+	<type>Integer</type>
+	<CPUID>MMX</CPUID>
+	<category>Shift</category>
+	<parameter varname='a' type='__m64'/>
+	<parameter varname='count' type='__m64'/>
+	<description>Shift packed 16-bit integers in "a" right by "count" while shifting in zeros, and store the results in "dst". </description>
+	<operation>
+FOR j := 0 to 3
+	i := j*16
+	IF count[63:0] &gt; 15
+		dst[i+15:i] := 0
+	ELSE
+		dst[i+15:i] := ZeroExtend(a[i+15:i] &gt;&gt; count[63:0])
+	FI
+ENDFOR
+	</operation>
+	<instruction name='psrlw' form='mm, mm'/>
+	<header>mmintrin.h</header>
+</intrinsic>
+<intrinsic tech='MMX' rettype='__m64' name='_mm_srli_pi16'>
+	<type>Integer</type>
+	<CPUID>MMX</CPUID>
+	<category>Shift</category>
+	<parameter varname='a' type='__m64'/>
+	<parameter varname="imm8" type='int'/>
+	<description>Shift packed 16-bit integers in "a" right by "imm8" while shifting in zeros, and store the results in "dst". </description>
+	<operation>
+FOR j := 0 to 3
+	i := j*16
+	IF imm8[7:0] &gt; 15
+		dst[i+15:i] := 0
+	ELSE
+		dst[i+15:i] := ZeroExtend(a[i+15:i] &gt;&gt; imm8[7:0])
+	FI
+ENDFOR
+	</operation>
+	<instruction name='psrlw' form='mm, imm'/>
+	<header>mmintrin.h</header>
+</intrinsic>
+<intrinsic tech='MMX' rettype='__m64' name='_mm_srl_pi32'>
+	<type>Integer</type>
+	<CPUID>MMX</CPUID>
+	<category>Shift</category>
+	<parameter varname='a' type='__m64'/>
+	<parameter varname='count' type='__m64'/>
+	<description>Shift packed 32-bit integers in "a" right by "count" while shifting in zeros, and store the results in "dst". </description>
+	<operation>
+FOR j := 0 to 1
+	i := j*32
+	IF count[63:0] &gt; 31
+		dst[i+31:i] := 0
+	ELSE
+		dst[i+31:i] := ZeroExtend(a[i+31:i] &gt;&gt; count[63:0])
+	FI
+ENDFOR
+	</operation>
+	<instruction name='psrld' form='mm, mm'/>
+	<header>mmintrin.h</header>
+</intrinsic>
+<intrinsic tech='MMX' rettype='__m64' name='_mm_srli_pi32'>
+	<type>Integer</type>
+	<CPUID>MMX</CPUID>
+	<category>Shift</category>
+	<parameter varname='a' type='__m64'/>
+	<parameter varname="imm8" type='int'/>
+	<description>Shift packed 32-bit integers in "a" right by "imm8" while shifting in zeros, and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 1
+	i := j*32
+	IF imm8[7:0] &gt; 31
+		dst[i+31:i] := 0
+	ELSE
+		dst[i+31:i] := ZeroExtend(a[i+31:i] &gt;&gt; imm8[7:0])
+	FI
+ENDFOR
+	</operation>
+	<instruction name='psrld' form='mm, imm'/>
+	<header>mmintrin.h</header>
+</intrinsic>
+<intrinsic tech='MMX' rettype='__m64' name='_mm_srl_si64'>
+	<type>Integer</type>
+	<CPUID>MMX</CPUID>
+	<category>Shift</category>
+	<parameter varname='a' type='__m64'/>
+	<parameter varname='count' type='__m64'/>
+	<description>Shift 64-bit integer "a" right by "count" while shifting in zeros, and store the result in "dst". </description>
+	<operation>
+IF count[63:0] &gt; 63
+	dst[63:0] := 0
+ELSE
+	dst[63:0] := ZeroExtend(a[63:0] &gt;&gt; count[63:0])
+FI
+	</operation>
+	<instruction name='psrlq' form='mm, mm'/>
+	<header>mmintrin.h</header>
+</intrinsic>
+<intrinsic tech='MMX' rettype='__m64' name='_mm_srli_si64'>
+	<type>Integer</type>
+	<CPUID>MMX</CPUID>
+	<category>Shift</category>
+	<parameter varname='a' type='__m64'/>
+	<parameter varname="imm8" type='int'/>
+	<description>Shift 64-bit integer "a" right by "imm8" while shifting in zeros, and store the result in "dst". </description>
+	<operation>
+IF imm8[7:0] &gt; 63
+	dst[63:0] := 0
+ELSE
+	dst[63:0] := ZeroExtend(a[63:0] &gt;&gt; imm8[7:0])
+FI
+	</operation>
+	<instruction name='psrlq' form='mm, imm'/>
+	<header>mmintrin.h</header>
+</intrinsic>
+<intrinsic tech='MMX' rettype='__m64' name='_mm_and_si64'>
+	<type>Integer</type>
+	<CPUID>MMX</CPUID>
+	<category>Logical</category>
+	<parameter varname='a' type='__m64'/>
+	<parameter varname='b' type='__m64'/>
+	<description>Compute the bitwise AND of 64 bits (representing integer data) in "a" and "b", and store the result in "dst".</description>
+	<operation>
+dst[63:0] := (a[63:0] AND b[63:0])
+	</operation>
+	<instruction name='pand' form='mm, mm'/>
+	<header>mmintrin.h</header>
+</intrinsic>
+<intrinsic tech='MMX' rettype='__m64' name='_mm_andnot_si64'>
+	<type>Integer</type>
+	<CPUID>MMX</CPUID>
+	<category>Logical</category>
+	<parameter varname='a' type='__m64'/>
+	<parameter varname='b' type='__m64'/>
+	<description>Compute the bitwise NOT of 64 bits (representing integer data) in "a" and then AND with "b", and store the result in "dst".</description>
+	<operation>
+dst[63:0] := ((NOT a[63:0]) AND b[63:0])
+	</operation>
+	<instruction name='pandn' form='mm, mm'/>
+	<header>mmintrin.h</header>
+</intrinsic>
+<intrinsic tech='MMX' rettype='__m64' name='_mm_or_si64'>
+	<type>Integer</type>
+	<CPUID>MMX</CPUID>
+	<category>Logical</category>
+	<parameter varname='a' type='__m64'/>
+	<parameter varname='b' type='__m64'/>
+	<description>Compute the bitwise OR of 64 bits (representing integer data) in "a" and "b", and store the result in "dst".</description>
+	<operation>
+dst[63:0] := (a[63:0] OR b[63:0])
+	</operation>
+	<instruction name='por' form='mm, mm'/>
+	<header>mmintrin.h</header>
+</intrinsic>
+<intrinsic tech='MMX' rettype='__m64' name='_mm_xor_si64'>
+	<type>Integer</type>
+	<CPUID>MMX</CPUID>
+	<category>Logical</category>
+	<parameter varname='a' type='__m64'/>
+	<parameter varname='b' type='__m64'/>
+	<description>Compute the bitwise XOR of 64 bits (representing integer data) in "a" and "b", and store the result in "dst".</description>
+	<operation>
+dst[63:0] := (a[63:0] XOR b[63:0])
+	</operation>
+	<instruction name='pxor' form='mm, mm'/>
+	<header>mmintrin.h</header>
+</intrinsic>
+<intrinsic tech='MMX' rettype='__m64' name='_mm_cmpeq_pi8'>
+	<type>Integer</type>
+	<CPUID>MMX</CPUID>
+	<category>Compare</category>
+	<parameter varname='a' type='__m64'/>
+	<parameter varname='b' type='__m64'/>
+	<description>Compare packed 8-bit integers in "a" and "b" for equality, and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*8
+	dst[i+7:i] := ( a[i+7:i] == b[i+7:i] ) ? 0xFF : 0
+ENDFOR
+	</operation>
+	<instruction name='pcmpeqb' form='mm, mm'/>
+	<header>mmintrin.h</header>
+</intrinsic>
+<intrinsic tech='MMX' rettype='__m64' name='_mm_cmpeq_pi16'>
+	<type>Integer</type>
+	<CPUID>MMX</CPUID>
+	<category>Compare</category>
+	<parameter varname='a' type='__m64'/>
+	<parameter varname='b' type='__m64'/>
+	<description>Compare packed 16-bit integers in "a" and "b" for equality, and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 3
+	i := j*16
+	dst[i+15:i] := ( a[i+15:i] == b[i+15:i] ) ? 0xFFFF : 0
+ENDFOR
+	</operation>
+	<instruction name='pcmpeqw' form='mm, mm'/>
+	<header>mmintrin.h</header>
+</intrinsic>
+<intrinsic tech='MMX' rettype='__m64' name='_mm_cmpeq_pi32'>
+	<type>Integer</type>
+	<CPUID>MMX</CPUID>
+	<category>Compare</category>
+	<parameter varname='a' type='__m64'/>
+	<parameter varname='b' type='__m64'/>
+	<description>Compare packed 32-bit integers in "a" and "b" for equality, and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 1
+	i := j*32
+	dst[i+31:i] := ( a[i+31:i] == b[i+31:i] ) ? 0xFFFFFFFF : 0
+ENDFOR
+	</operation>
+	<instruction name='pcmpeqd' form='mm, mm'/>
+	<header>mmintrin.h</header>
+</intrinsic>
+<intrinsic tech='MMX' rettype='__m64' name='_mm_cmpgt_pi8'>
+	<type>Integer</type>
+	<CPUID>MMX</CPUID>
+	<category>Compare</category>
+	<parameter varname='a' type='__m64'/>
+	<parameter varname='b' type='__m64'/>
+	<description>Compare packed 8-bit integers in "a" and "b" for greater-than, and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*8
+	dst[i+7:i] := ( a[i+7:i] &gt; b[i+7:i] ) ? 0xFF : 0
+ENDFOR
+	</operation>
+	<instruction name='pcmpgtb' form='mm, mm'/>
+	<header>mmintrin.h</header>
+</intrinsic>
+<intrinsic tech='MMX' rettype='__m64' name='_mm_cmpgt_pi16'>
+	<type>Integer</type>
+	<CPUID>MMX</CPUID>
+	<category>Compare</category>
+	<parameter varname='a' type='__m64'/>
+	<parameter varname='b' type='__m64'/>
+	<description>Compare packed 16-bit integers in "a" and "b" for greater-than, and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 3
+	i := j*16
+	dst[i+15:i] := ( a[i+15:i] &gt; b[i+15:i] ) ? 0xFFFF : 0
+ENDFOR
+	</operation>
+	<instruction name='pcmpgtw' form='mm, mm'/>
+	<header>mmintrin.h</header>
+</intrinsic>
+<intrinsic tech='MMX' rettype='__m64' name='_mm_cmpgt_pi32'>
+	<type>Integer</type>
+	<CPUID>MMX</CPUID>
+	<category>Compare</category>
+	<parameter varname='a' type='__m64'/>
+	<parameter varname='b' type='__m64'/>
+	<description>Compare packed 32-bit integers in "a" and "b" for greater-than, and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 1
+	i := j*32
+	dst[i+31:i] := ( a[i+31:i] &gt; b[i+31:i] ) ? 0xFFFFFFFF : 0
+ENDFOR
+	</operation>
+	<instruction name='pcmpgtd' form='mm, mm'/>
+	<header>mmintrin.h</header>
+</intrinsic>
+<intrinsic tech='MMX' rettype='__m64' name='_mm_cvtsi32_si64'>
+	<type>Integer</type>
+	<CPUID>MMX</CPUID>
+	<category>Convert</category>
+	<parameter varname='a' type='int'/>
+	<description>Copy 32-bit integer "a" to the lower element of "dst", and zero the upper element of "dst".</description>
+	<operation>
+dst[31:0] := a[31:0]
+dst[63:32] := 0
+	</operation>
+	<instruction name='movd' form='mm, r32'/>
+	<header>mmintrin.h</header>
+</intrinsic>
+<intrinsic tech='MMX' rettype='int' name='_mm_cvtsi64_si32'>
+	<type>Integer</type>
+	<CPUID>MMX</CPUID>
+	<category>Convert</category>
+	<parameter varname='a' type='__m64'/>
+	<description>Copy the lower 32-bit integer in "a" to "dst".</description>
+	<operation>
+dst[31:0] := a[31:0]
+	</operation>
+	<instruction name='movd' form='r32, mm'/>
+	<header>mmintrin.h</header>
+</intrinsic>
+<intrinsic tech='MMX' rettype='__int64' name='_mm_cvtm64_si64'>
+	<type>Integer</type>
+	<CPUID>MMX</CPUID>
+	<category>Convert</category>
+	<parameter varname='a' type='__m64'/>
+	<description>Copy 64-bit integer "a" to "dst".</description>
+	<operation>
+dst[63:0] := a[63:0]
+	</operation>
+	<instruction name='movq' form='r64, mm'/>
+	<header>mmintrin.h</header>
+</intrinsic>
+<intrinsic tech='MMX' rettype='__m64' name='_mm_cvtsi64_m64'>
+	<type>Integer</type>
+	<CPUID>MMX</CPUID>
+	<category>Convert</category>
+	<parameter varname='a' type='__int64'/>
+	<description>Copy 64-bit integer "a" to "dst".</description>
+	<operation>
+dst[63:0] := a[63:0]
+	</operation>
+	<instruction name='movq' form='mm, r64'/>
+	<header>mmintrin.h</header>
+</intrinsic>
+<intrinsic tech='MMX' rettype='__m64' name='_mm_setzero_si64'>
+	<type>Integer</type>
+	<CPUID>MMX</CPUID>
+	<category>Set</category>
+	<parameter varname='' type='void'/>
+	<description>Return vector of type __m64 with all elements set to zero.</description>
+	<operation>
+dst[MAX:0] := 0
+	</operation>
+	<instruction name='pxor' form='mm, mm'/>
+	<header>mmintrin.h</header>
+</intrinsic>
+<intrinsic tech='MMX' sequence='true' rettype='__m64' name='_mm_set_pi32'>
+	<type>Integer</type>
+	<CPUID>MMX</CPUID>
+	<category>Set</category>
+	<parameter varname='e1' type='int'/>
+	<parameter varname='e0' type='int'/>
+	<description>Set packed 32-bit integers in "dst" with the supplied values.</description>
+	<operation>
+dst[31:0] := e0
+dst[63:32] := e1
+	</operation>
+	<header>mmintrin.h</header>
+</intrinsic>
+<intrinsic tech='MMX' sequence='true' rettype='__m64' name='_mm_set_pi16'>
+	<type>Integer</type>
+	<CPUID>MMX</CPUID>
+	<category>Set</category>
+	<parameter varname='e3' type='short'/>
+	<parameter varname='e2' type='short'/>
+	<parameter varname='e1' type='short'/>
+	<parameter varname='e0' type='short'/>
+	<description>Set packed 16-bit integers in "dst" with the supplied values.</description>
+	<operation>
+dst[15:0] := e0
+dst[31:16] := e1
+dst[47:32] := e2
+dst[63:48] := e3
+	</operation>
+	<header>mmintrin.h</header>
+</intrinsic>
+<intrinsic tech='MMX' sequence='true' rettype='__m64' name='_mm_set_pi8'>
+	<type>Integer</type>
+	<CPUID>MMX</CPUID>
+	<category>Set</category>
+	<parameter varname='e7' type='char'/>
+	<parameter varname='e6' type='char'/>
+	<parameter varname='e5' type='char'/>
+	<parameter varname='e4' type='char'/>
+	<parameter varname='e3' type='char'/>
+	<parameter varname='e2' type='char'/>
+	<parameter varname='e1' type='char'/>
+	<parameter varname='e0' type='char'/>
+	<description>Set packed 8-bit integers in "dst" with the supplied values.</description>
+	<operation>
+dst[7:0] := e0
+dst[15:8] := e1
+dst[23:16] := e2
+dst[31:24] := e3
+dst[39:32] := e4
+dst[47:40] := e5
+dst[55:48] := e6
+dst[63:56] := e7
+	</operation>
+	<header>mmintrin.h</header>
+</intrinsic>
+<intrinsic tech='MMX' sequence='true' rettype='__m64' name='_mm_set1_pi32'>
+	<type>Integer</type>
+	<CPUID>MMX</CPUID>
+	<category>Set</category>
+	<parameter varname='a' type='int'/>
+	<description>Broadcast 32-bit integer "a" to all elements of "dst".</description>
+	<operation>
+FOR j := 0 to 1
+	i := j*32
+	dst[i+31:i] := a[31:0]
+ENDFOR
+	</operation>
+	<header>mmintrin.h</header>
+</intrinsic>
+<intrinsic tech='MMX' sequence='true' rettype='__m64' name='_mm_set1_pi16'>
+	<type>Integer</type>
+	<CPUID>MMX</CPUID>
+	<category>Set</category>
+	<parameter varname='a' type='short'/>
+	<description>Broadcast 16-bit integer "a" to all elements of "dst".</description>
+	<operation>
+FOR j := 0 to 3
+	i := j*16
+	dst[i+15:i] := a[15:0]
+ENDFOR
+	</operation>
+	<header>mmintrin.h</header>
+</intrinsic>
+<intrinsic tech='MMX' sequence='true' rettype='__m64' name='_mm_set1_pi8'>
+	<type>Integer</type>
+	<CPUID>MMX</CPUID>
+	<category>Set</category>
+	<parameter varname='a' type='char'/>
+	<description>Broadcast 8-bit integer "a" to all elements of "dst".</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*8
+	dst[i+7:i] := a[7:0]
+ENDFOR
+	</operation>
+	<header>mmintrin.h</header>
+</intrinsic>
+<intrinsic tech='MMX' sequence='true' rettype='__m64' name='_mm_setr_pi32'>
+	<type>Integer</type>
+	<CPUID>MMX</CPUID>
+	<category>Set</category>
+	<parameter varname='e1' type='int'/>
+	<parameter varname='e0' type='int'/>
+	<description>Set packed 32-bit integers in "dst" with the supplied values in reverse order.</description>
+	<operation>
+dst[31:0] := e1
+dst[63:32] := e0
+	</operation>
+	<header>mmintrin.h</header>
+</intrinsic>
+<intrinsic tech='MMX' sequence='true' rettype='__m64' name='_mm_setr_pi16'>
+	<type>Integer</type>
+	<CPUID>MMX</CPUID>
+	<category>Set</category>
+	<parameter varname='e3' type='short'/>
+	<parameter varname='e2' type='short'/>
+	<parameter varname='e1' type='short'/>
+	<parameter varname='e0' type='short'/>
+	<description>Set packed 16-bit integers in "dst" with the supplied values in reverse order.</description>
+	<operation>
+dst[15:0] := e3
+dst[31:16] := e2
+dst[47:32] := e1
+dst[63:48] := e0
+	</operation>
+	<header>mmintrin.h</header>
+</intrinsic>
+<intrinsic tech='MMX' sequence='true' rettype='__m64' name='_mm_setr_pi8'>
+	<type>Integer</type>
+	<CPUID>MMX</CPUID>
+	<category>Set</category>
+	<parameter varname='e7' type='char'/>
+	<parameter varname='e6' type='char'/>
+	<parameter varname='e5' type='char'/>
+	<parameter varname='e4' type='char'/>
+	<parameter varname='e3' type='char'/>
+	<parameter varname='e2' type='char'/>
+	<parameter varname='e1' type='char'/>
+	<parameter varname='e0' type='char'/>
+	<description>Set packed 8-bit integers in "dst" with the supplied values in reverse order.</description>
+	<operation>
+dst[7:0] := e7
+dst[15:8] := e6
+dst[23:16] := e5
+dst[31:24] := e4
+dst[39:32] := e3
+dst[47:40] := e2
+dst[55:48] := e1
+dst[63:56] := e0
+	</operation>
+	<header>mmintrin.h</header>
+</intrinsic>
+<intrinsic tech='MMX' rettype='__m64' name='_mm_packs_pi16'>
+	<type>Integer</type>
+	<CPUID>MMX</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname='a' type='__m64'/>
+	<parameter varname='b' type='__m64'/>
+	<description>Convert packed 16-bit integers from "a" and "b" to packed 8-bit integers using signed saturation, and store the results in "dst".
+	</description>
+	<operation>
+dst[7:0] := Saturate_Int16_To_Int8 (a[15:0])
+dst[15:8] := Saturate_Int16_To_Int8 (a[31:16])
+dst[23:16] := Saturate_Int16_To_Int8 (a[47:32])
+dst[31:24] := Saturate_Int16_To_Int8 (a[63:48])
+dst[39:32] := Saturate_Int16_To_Int8 (b[15:0])
+dst[47:40] := Saturate_Int16_To_Int8 (b[31:16])
+dst[55:48] := Saturate_Int16_To_Int8 (b[47:32])
+dst[63:56] := Saturate_Int16_To_Int8 (b[63:48])
+	</operation>
+	<instruction name='packsswb' form='mm, mm'/>
+	<header>mmintrin.h</header>
+</intrinsic>
+<intrinsic tech='MMX' rettype='__m64' name='_mm_packs_pi32'>
+	<type>Integer</type>
+	<CPUID>MMX</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname='a' type='__m64'/>
+	<parameter varname='b' type='__m64'/>
+	<description>Convert packed 32-bit integers from "a" and "b" to packed 16-bit integers using signed saturation, and store the results in "dst".</description>
+	<operation>
+dst[15:0] := Saturate_Int32_To_Int16 (a[31:0])
+dst[31:16] := Saturate_Int32_To_Int16 (a[63:32])
+dst[47:32] := Saturate_Int32_To_Int16 (b[31:0])
+dst[63:48] := Saturate_Int32_To_Int16 (b[63:32])
+	</operation>
+	<instruction name='packssdw' form='mm, mm'/>
+	<header>mmintrin.h</header>
+</intrinsic>
+<intrinsic tech='MMX' rettype='__m64' name='_mm_packs_pu16'>
+	<type>Integer</type>
+	<CPUID>MMX</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname='a' type='__m64'/>
+	<parameter varname='b' type='__m64'/>
+	<description>Convert packed 16-bit integers from "a" and "b" to packed 8-bit integers using unsigned saturation, and store the results in "dst".</description>
+	<operation>
+dst[7:0] := Saturate_Int16_To_UnsignedInt8 (a[15:0])
+dst[15:8] := Saturate_Int16_To_UnsignedInt8 (a[31:16])
+dst[23:16] := Saturate_Int16_To_UnsignedInt8 (a[47:32])
+dst[31:24] := Saturate_Int16_To_UnsignedInt8 (a[63:48])
+dst[39:32] := Saturate_Int16_To_UnsignedInt8 (b[15:0])
+dst[47:40] := Saturate_Int16_To_UnsignedInt8 (b[31:16])
+dst[55:48] := Saturate_Int16_To_UnsignedInt8 (b[47:32])
+dst[63:56] := Saturate_Int16_To_UnsignedInt8 (b[63:48])
+	</operation>
+	<instruction name='packuswb' form='mm, mm'/>
+	<header>mmintrin.h</header>
+</intrinsic>
+<intrinsic tech='MMX' rettype='__m64' name='_mm_unpackhi_pi8'>
+	<type>Integer</type>
+	<CPUID>MMX</CPUID>
+	<category>Swizzle</category>
+	<parameter varname='a' type='__m64'/>
+	<parameter varname='b' type='__m64'/>
+	<description>Unpack and interleave 8-bit integers from the high half of "a" and "b", and store the results in "dst". </description>
+	<operation>
+INTERLEAVE_HIGH_BYTES(src1[63:0], src2[63:0]){
+	dst[7:0] := src1[39:32]
+	dst[15:8] := src2[39:32] 
+	dst[23:16] := src1[47:40]
+	dst[31:24] := src2[47:40]
+	dst[39:32] := src1[55:48]
+	dst[47:40] := src2[55:48]
+	dst[55:48] := src1[63:56]
+	dst[63:56] := src2[63:56]
+	RETURN dst[63:0]
+}	
+
+dst[63:0] := INTERLEAVE_HIGH_BYTES(a[63:0], b[63:0])
+	</operation>
+	<instruction name='punpckhbw' form='mm, mm'/>
+	<header>mmintrin.h</header>
+</intrinsic>
+<intrinsic tech='MMX' rettype='__m64' name='_mm_unpackhi_pi16'>
+	<type>Integer</type>
+	<CPUID>MMX</CPUID>
+	<category>Swizzle</category>
+	<parameter varname='a' type='__m64'/>
+	<parameter varname='b' type='__m64'/>
+	<description>Unpack and interleave 16-bit integers from the high half of "a" and "b", and store the results in "dst". </description>
+	<operation>
+INTERLEAVE_HIGH_WORDS(src1[63:0], src2[63:0]){
+	dst[15:0] := src1[47:32]
+	dst[31:16] := src2[47:32]
+	dst[47:32] := src1[63:48]
+	dst[63:48] := src2[63:48]
+	RETURN dst[63:0]
+}
+
+dst[63:0] := INTERLEAVE_HIGH_WORDS(a[63:0], b[63:0])
+	</operation>
+	<instruction name='punpckhwd' form='mm, mm'/>
+	<header>mmintrin.h</header>
+</intrinsic>
+<intrinsic tech='MMX' rettype='__m64' name='_mm_unpackhi_pi32'>
+	<type>Integer</type>
+	<CPUID>MMX</CPUID>
+	<category>Swizzle</category>
+	<parameter varname='a' type='__m64'/>
+	<parameter varname='b' type='__m64'/>
+	<description>Unpack and interleave 32-bit integers from the high half of "a" and "b", and store the results in "dst".</description>
+	<operation>
+dst[31:0] := a[63:32]
+dst[63:32] := b[63:32]
+	</operation>
+	<instruction name='punpckhdq' form='mm, mm'/>
+	<header>mmintrin.h</header>
+</intrinsic>
+<intrinsic tech='MMX' rettype='__m64' name='_mm_unpacklo_pi8'>
+	<type>Integer</type>
+	<CPUID>MMX</CPUID>
+	<category>Swizzle</category>
+	<parameter varname='a' type='__m64'/>
+	<parameter varname='b' type='__m64'/>
+	<description>Unpack and interleave 8-bit integers from the low half of "a" and "b", and store the results in "dst". </description>
+	<operation>
+INTERLEAVE_BYTES(src1[63:0], src2[63:0]){
+	dst[7:0] := src1[7:0] 
+	dst[15:8] := src2[7:0] 
+	dst[23:16] := src1[15:8] 
+	dst[31:24] := src2[15:8] 
+	dst[39:32] := src1[23:16] 
+	dst[47:40] := src2[23:16] 
+	dst[55:48] := src1[31:24] 
+	dst[63:56] := src2[31:24] 
+	RETURN dst[63:0]
+}	
+
+dst[63:0] := INTERLEAVE_BYTES(a[63:0], b[63:0])
+	</operation>
+	<instruction name='punpcklbw' form='mm, mm'/>
+	<header>mmintrin.h</header>
+</intrinsic>
+<intrinsic tech='MMX' rettype='__m64' name='_mm_unpacklo_pi16'>
+	<type>Integer</type>
+	<CPUID>MMX</CPUID>
+	<category>Swizzle</category>
+	<parameter varname='a' type='__m64'/>
+	<parameter varname='b' type='__m64'/>
+	<description>Unpack and interleave 16-bit integers from the low half of "a" and "b", and store the results in "dst". </description>
+	<operation>
+INTERLEAVE_WORDS(src1[63:0], src2[63:0]){
+	dst[15:0] := src1[15:0] 
+	dst[31:16] := src2[15:0] 
+	dst[47:32] := src1[31:16] 
+	dst[63:48] := src2[31:16] 
+	RETURN dst[63:0]
+}	
+
+dst[63:0] := INTERLEAVE_WORDS(a[63:0], b[63:0])
+	</operation>
+	<instruction name='punpcklwd' form='mm, mm'/>
+	<header>mmintrin.h</header>
+</intrinsic>
+<intrinsic tech='MMX' rettype='__m64' name='_mm_unpacklo_pi32'>
+	<type>Integer</type>
+	<CPUID>MMX</CPUID>
+	<category>Swizzle</category>
+	<parameter varname='a' type='__m64'/>
+	<parameter varname='b' type='__m64'/>
+	<description>Unpack and interleave 32-bit integers from the low half of "a" and "b", and store the results in "dst". </description>
+	<operation>
+dst[31:0] := a[31:0]
+dst[63:32] := b[31:0]
+	</operation>
+	<instruction name='punpckldq' form='mm, mm'/>
+	<header>mmintrin.h</header>
+</intrinsic>
+
+<intrinsic tech='SSE' sequence='true' rettype='' name='_MM_TRANSPOSE4_PS'>
+	<CPUID>SSE</CPUID>
+	<category>Swizzle</category>
+	<parameter varname='row0' type='__m128' />
+	<parameter varname='row1' type='__m128' />
+	<parameter varname='row2' type='__m128' />
+	<parameter varname='row3' type='__m128' />
+	<description>Macro: Transpose the 4x4 matrix formed by the 4 rows of single-precision (32-bit) floating-point elements in "row0", "row1", "row2", and "row3", and store the transposed matrix in these vectors ("row0" now contains column 0, etc.).</description>
+	<operation>
+__m128 tmp3, tmp2, tmp1, tmp0;
+tmp0 = _mm_unpacklo_ps(row0, row1);
+tmp2 = _mm_unpacklo_ps(row2, row3);
+tmp1 = _mm_unpackhi_ps(row0, row1);
+tmp3 = _mm_unpackhi_ps(row2, row3);
+row0 = _mm_movelh_ps(tmp0, tmp2);
+row1 = _mm_movehl_ps(tmp2, tmp0);
+row2 = _mm_movelh_ps(tmp1, tmp3);
+row3 = _mm_movehl_ps(tmp3, tmp1);
+	</operation>
+	<header>xmmintrin.h</header>
+</intrinsic>
+<intrinsic tech='SSE' rettype='unsigned int' name='_mm_getcsr'>
+	<CPUID>SSE</CPUID>
+	<category>General Support</category>
+	<parameter varname='' type='void' />
+	<description>Get the unsigned 32-bit value of the MXCSR control and status register.</description>
+	<operation>
+dst[31:0] := MXCSR
+	</operation>
+	<instruction name='stmxcsr' form='MEMd'/>
+	<header>xmmintrin.h</header>
+</intrinsic>
+<intrinsic tech='SSE' rettype='void' name='_mm_setcsr'>
+	<CPUID>SSE</CPUID>
+	<category>General Support</category>
+	<parameter varname='a' type='unsigned int' />
+	<description>Set the MXCSR control and status register with the value in unsigned 32-bit integer "a".</description>
+	<operation>
+MXCSR := a[31:0]
+	</operation>
+	<instruction name='ldmxcsr' form='MEMd'/>
+	<header>xmmintrin.h</header>
+</intrinsic>
+<intrinsic tech='SSE' rettype='unsigned int' name='_MM_GET_EXCEPTION_STATE'>
+	<CPUID>SSE</CPUID>
+	<category>General Support</category>
+	<description>Macro: Get the exception state bits from the MXCSR control and status register. The exception state may contain any of the following flags: _MM_EXCEPT_INVALID, _MM_EXCEPT_DIV_ZERO, _MM_EXCEPT_DENORM, _MM_EXCEPT_OVERFLOW, _MM_EXCEPT_UNDERFLOW, _MM_EXCEPT_INEXACT</description>
+	<operation>
+dst[31:0] := MXCSR &amp; _MM_EXCEPT_MASK
+	</operation>
+	<header>xmmintrin.h</header>
+</intrinsic>
+<intrinsic tech='SSE' rettype='void' name='_MM_SET_EXCEPTION_STATE'>
+	<CPUID>SSE</CPUID>
+	<category>General Support</category>
+	<parameter varname='a' type='unsigned int' />
+	<description>Macro: Set the exception state bits of the MXCSR control and status register to the value in unsigned 32-bit integer "a". The exception state may contain any of the following flags: _MM_EXCEPT_INVALID, _MM_EXCEPT_DIV_ZERO, _MM_EXCEPT_DENORM, _MM_EXCEPT_OVERFLOW, _MM_EXCEPT_UNDERFLOW, _MM_EXCEPT_INEXACT</description>
+	<operation>
+MXCSR := a[31:0] AND ~_MM_EXCEPT_MASK
+	</operation>
+	<header>xmmintrin.h</header>
+</intrinsic>
+<intrinsic tech='SSE' rettype='unsigned int' name='_MM_GET_EXCEPTION_MASK'>
+	<CPUID>SSE</CPUID>
+	<category>General Support</category>
+	<description>Macro: Get the exception mask bits from the MXCSR control and status register. The exception mask may contain any of the following flags: _MM_MASK_INVALID, _MM_MASK_DIV_ZERO, _MM_MASK_DENORM, _MM_MASK_OVERFLOW, _MM_MASK_UNDERFLOW, _MM_MASK_INEXACT</description>
+	<operation>
+dst[31:0] := MXCSR &amp; _MM_MASK_MASK
+	</operation>
+	<header>xmmintrin.h</header>
+</intrinsic>
+<intrinsic tech='SSE' rettype='void' name='_MM_SET_EXCEPTION_MASK'>
+	<CPUID>SSE</CPUID>
+	<category>General Support</category>
+	<parameter varname='a' type='unsigned int' />
+	<description>Macro: Set the exception mask bits of the MXCSR control and status register to the value in unsigned 32-bit integer "a". The exception mask may contain any of the following flags: _MM_MASK_INVALID, _MM_MASK_DIV_ZERO, _MM_MASK_DENORM, _MM_MASK_OVERFLOW, _MM_MASK_UNDERFLOW, _MM_MASK_INEXACT</description>
+	<operation>
+MXCSR := a[31:0] AND ~_MM_MASK_MASK
+	</operation>
+	<header>xmmintrin.h</header>
+</intrinsic>
+<intrinsic tech='SSE' rettype='unsigned int' name='_MM_GET_ROUNDING_MODE'>
+	<CPUID>SSE</CPUID>
+	<category>General Support</category>
+	<description>Macro: Get the rounding mode bits from the MXCSR control and status register. The rounding mode may contain any of the following flags: _MM_ROUND_NEAREST, _MM_ROUND_DOWN, _MM_ROUND_UP, _MM_ROUND_TOWARD_ZERO</description>
+	<operation>
+dst[31:0] := MXCSR &amp; _MM_ROUND_MASK
+	</operation>
+	<header>xmmintrin.h</header>
+</intrinsic>
+<intrinsic tech='SSE' rettype='void' name='_MM_SET_ROUNDING_MODE'>
+	<CPUID>SSE</CPUID>
+	<category>General Support</category>
+	<parameter varname='a' type='unsigned int' />
+	<description>Macro: Set the rounding mode bits of the MXCSR control and status register to the value in unsigned 32-bit integer "a". The rounding mode may contain any of the following flags: _MM_ROUND_NEAREST, _MM_ROUND_DOWN, _MM_ROUND_UP, _MM_ROUND_TOWARD_ZERO</description>
+	<operation>
+MXCSR := a[31:0] AND ~_MM_ROUND_MASK
+	</operation>
+	<header>xmmintrin.h</header>
+</intrinsic>
+<intrinsic tech='SSE' rettype='unsigned int' name='_MM_GET_FLUSH_ZERO_MODE'>
+	<CPUID>SSE</CPUID>
+	<category>General Support</category>
+	<description>Macro: Get the flush zero bits from the MXCSR control and status register. The flush zero may contain any of the following flags: _MM_FLUSH_ZERO_ON or _MM_FLUSH_ZERO_OFF</description>
+	<operation>
+dst[31:0] := MXCSR &amp; _MM_FLUSH_MASK
+	</operation>
+	<header>xmmintrin.h</header>
+</intrinsic>
+<intrinsic tech='SSE' rettype='void' name='_MM_SET_FLUSH_ZERO_MODE'>
+	<CPUID>SSE</CPUID>
+	<category>General Support</category>
+	<parameter varname='a' type='unsigned int' />
+	<description>Macro: Set the flush zero bits of the MXCSR control and status register to the value in unsigned 32-bit integer "a". The flush zero may contain any of the following flags: _MM_FLUSH_ZERO_ON or _MM_FLUSH_ZERO_OFF</description>
+	<operation>
+MXCSR := a[31:0] AND ~_MM_FLUSH_MASK
+	</operation>
+	<header>xmmintrin.h</header>
+</intrinsic>
+<intrinsic tech='SSE' rettype='void' name='_mm_prefetch'>
+	<CPUID>SSE</CPUID>
+	<category>General Support</category>
+	<parameter varname='p' type='char const*' />
+	<parameter varname='i' type='int' />
+	<description>Fetch the line of data from memory that contains address "p" to a location in the cache hierarchy specified by the locality hint "i".</description>
+	<instruction name='prefetchnta' form='mprefetch'/>
+	<instruction name='prefetcht0' form='mprefetch'/>
+	<instruction name='prefetcht1' form='mprefetch'/>
+	<instruction name='prefetcht2' form='mprefetch'/>
+	<header>xmmintrin.h</header>
+</intrinsic>
+<intrinsic tech="KNC" rettype="void" name="_mm_prefetch">
+	<CPUID>KNCNI</CPUID>
+	<category>General Support</category>
+	<parameter varname='p' type='char const*' />
+	<parameter varname='i' type='int' />
+	<description>Fetch the line of data from memory that contains address "p" to a location in the cache hierarchy specified by the locality hint "i".</description>
+	<instruction name='vprefetch0' form='mprefetch' xed=''/>
+	<instruction name='vprefetch1' form='mprefetch' xed=''/>
+	<instruction name='vprefetch2' form='mprefetch' xed=''/>
+	<instruction name='vprefetchnta' form='mprefetch' xed=''/>
+	<instruction name='vprefetche0' form='mprefetch' xed=''/>
+	<instruction name='vprefetche1' form='mprefetch' xed=''/>
+	<instruction name='vprefetche2' form='mprefetch' xed=''/>
+	<instruction name='vprefetchenta' form='mprefetch' xed=''/>
+	<header>xmmintrin.h</header>
+</intrinsic>
+<intrinsic tech="Other" rettype="void" name="_mm_prefetch">
+	<CPUID>PREFETCHWT1</CPUID>
+	<category>General Support</category>
+	<parameter varname='p' type='char const*' />
+	<parameter varname='i' type='int' />
+	<description>Fetch the line of data from memory that contains address "p" to a location in the cache hierarchy specified by the locality hint "i".</description>
+	<instruction name='prefetchwt1' form='mprefetch' xed=''/>
+	<header>xmmintrin.h</header>
+</intrinsic>
+<intrinsic tech='SSE' rettype='void' name='_mm_sfence'>
+	<CPUID>SSE</CPUID>
+	<category>General Support</category>
+	<parameter varname='' type='void' />
+	<description>Perform a serializing operation on all store-to-memory instructions that were issued prior to this instruction. Guarantees that every store instruction that precedes, in program order, is globally visible before any store instruction which follows the fence in program order.</description>
+	<instruction name='sfence' form=''/>
+	<header>xmmintrin.h</header>
+</intrinsic>
+<intrinsic tech='SSE' rettype='__m64' name='_mm_max_pi16'>
+	<type>Integer</type>
+	<CPUID>SSE</CPUID>
+	<category>Special Math Functions</category>
+	<parameter varname='a' type='__m64' />
+	<parameter varname='b' type='__m64' />
+	<description>Compare packed 16-bit integers in "a" and "b", and store packed maximum values in "dst".</description>
+	<operation>
+FOR j := 0 to 3
+	i := j*16
+	IF a[i+15:i] &gt; b[i+15:i]
+		dst[i+15:i] := a[i+15:i]
+	ELSE
+		dst[i+15:i] := b[i+15:i]
+	FI
+ENDFOR
+	</operation>
+	<instruction name='pmaxsw' form='mm, mm'/>
+	<header>xmmintrin.h</header>
+</intrinsic>
+<intrinsic tech='SSE' rettype='__m64' name='_m_pmaxsw'>
+	<type>Integer</type>
+	<CPUID>SSE</CPUID>
+	<category>Special Math Functions</category>
+	<parameter varname='a' type='__m64' />
+	<parameter varname='b' type='__m64' />
+	<description>Compare packed 16-bit integers in "a" and "b", and store packed maximum values in "dst".</description>
+	<operation>
+FOR j := 0 to 3
+	i := j*16
+	IF a[i+15:i] &gt; b[i+15:i]
+		dst[i+15:i] := a[i+15:i]
+	ELSE
+		dst[i+15:i] := b[i+15:i]
+	FI
+ENDFOR
+	</operation>
+	<instruction name='pmaxsw' form='mm, mm'/>
+	<header>xmmintrin.h</header>
+</intrinsic>
+<intrinsic tech='SSE' rettype='__m64' name='_mm_max_pu8'>
+	<type>Integer</type>
+	<CPUID>SSE</CPUID>
+	<category>Special Math Functions</category>
+	<parameter varname='a' type='__m64' />
+	<parameter varname='b' type='__m64' />
+	<description>Compare packed unsigned 8-bit integers in "a" and "b", and store packed maximum values in "dst".</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*8
+	IF a[i+7:i] &gt; b[i+7:i]
+		dst[i+7:i] := a[i+7:i]
+	ELSE
+		dst[i+7:i] := b[i+7:i]
+	FI
+ENDFOR
+	</operation>
+	<instruction name='pmaxub' form='mm, mm'/>
+	<header>xmmintrin.h</header>
+</intrinsic>
+<intrinsic tech='SSE' rettype='__m64' name='_m_pmaxub'>
+	<type>Integer</type>
+	<CPUID>SSE</CPUID>
+	<category>Special Math Functions</category>
+	<parameter varname='a' type='__m64' />
+	<parameter varname='b' type='__m64' />
+	<description>Compare packed unsigned 8-bit integers in "a" and "b", and store packed maximum values in "dst".</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*8
+	IF a[i+7:i] &gt; b[i+7:i]
+		dst[i+7:i] := a[i+7:i]
+	ELSE
+		dst[i+7:i] := b[i+7:i]
+	FI
+ENDFOR
+	</operation>
+	<instruction name='pmaxub' form='mm, mm'/>
+	<header>xmmintrin.h</header>
+</intrinsic>
+<intrinsic tech='SSE' rettype='__m64' name='_mm_min_pi16'>
+	<type>Integer</type>
+	<CPUID>SSE</CPUID>
+	<category>Special Math Functions</category>
+	<parameter varname='a' type='__m64' />
+	<parameter varname='b' type='__m64' />
+	<description>Compare packed 16-bit integers in "a" and "b", and store packed minimum values in "dst".</description>
+	<operation>
+FOR j := 0 to 3
+	i := j*16
+	IF a[i+15:i] &lt; b[i+15:i]
+		dst[i+15:i] := a[i+15:i]
+	ELSE
+		dst[i+15:i] := b[i+15:i]
+	FI
+ENDFOR
+	</operation>
+	<instruction name='pminsw' form='mm, mm'/>
+	<header>xmmintrin.h</header>
+</intrinsic>
+<intrinsic tech='SSE' rettype='__m64' name='_m_pminsw'>
+	<type>Integer</type>
+	<CPUID>SSE</CPUID>
+	<category>Special Math Functions</category>
+	<parameter varname='a' type='__m64' />
+	<parameter varname='b' type='__m64' />
+	<description>Compare packed 16-bit integers in "a" and "b", and store packed minimum values in "dst".</description>
+	<operation>
+FOR j := 0 to 3
+	i := j*16
+	IF a[i+15:i] &lt; b[i+15:i]
+		dst[i+15:i] := a[i+15:i]
+	ELSE
+		dst[i+15:i] := b[i+15:i]
+	FI
+ENDFOR
+	</operation>
+	<instruction name='pminsw' form='mm, mm'/>
+	<header>xmmintrin.h</header>
+</intrinsic>
+<intrinsic tech='SSE' rettype='__m64' name='_mm_min_pu8'>
+	<type>Integer</type>
+	<CPUID>SSE</CPUID>
+	<category>Special Math Functions</category>
+	<parameter varname='a' type='__m64' />
+	<parameter varname='b' type='__m64' />
+	<description>Compare packed unsigned 8-bit integers in "a" and "b", and store packed minimum values in "dst".</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*8
+	IF a[i+7:i] &lt; b[i+7:i]
+		dst[i+7:i] := a[i+7:i]
+	ELSE
+		dst[i+7:i] := b[i+7:i]
+	FI
+ENDFOR
+	</operation>
+	<instruction name='pminub' form='mm, mm'/>
+	<header>xmmintrin.h</header>
+</intrinsic>
+<intrinsic tech='SSE' rettype='__m64' name='_m_pminub'>
+	<type>Integer</type>
+	<CPUID>SSE</CPUID>
+	<category>Special Math Functions</category>
+	<parameter varname='a' type='__m64' />
+	<parameter varname='b' type='__m64' />
+	<description>Compare packed unsigned 8-bit integers in "a" and "b", and store packed minimum values in "dst".</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*8
+	IF a[i+7:i] &lt; b[i+7:i]
+		dst[i+7:i] := a[i+7:i]
+	ELSE
+		dst[i+7:i] := b[i+7:i]
+	FI
+ENDFOR
+	</operation>
+	<instruction name='pminub' form='mm, mm'/>
+	<header>xmmintrin.h</header>
+</intrinsic>
+<intrinsic tech='SSE' rettype='__m64' name='_mm_mulhi_pu16'>
+	<type>Integer</type>
+	<CPUID>SSE</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname='a' type='__m64' />
+	<parameter varname='b' type='__m64' />
+	<description>Multiply the packed unsigned 16-bit integers in "a" and "b", producing intermediate 32-bit integers, and store the high 16 bits of the intermediate integers in "dst". </description>
+	<operation>
+FOR j := 0 to 3
+	i := j*16
+	tmp[31:0] := a[i+15:i] * b[i+15:i]
+	dst[i+15:i] := tmp[31:16]
+ENDFOR
+	</operation>
+	<instruction name='pmulhuw' form='mm, mm'/>
+	<header>xmmintrin.h</header>
+</intrinsic>
+<intrinsic tech='SSE' rettype='__m64' name='_m_pmulhuw'>
+	<type>Integer</type>
+	<CPUID>SSE</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname='a' type='__m64' />
+	<parameter varname='b' type='__m64' />
+	<description>Multiply the packed unsigned 16-bit integers in "a" and "b", producing intermediate 32-bit integers, and store the high 16 bits of the intermediate integers in "dst". </description>
+	<operation>
+FOR j := 0 to 3
+	i := j*16
+	tmp[31:0] := a[i+15:i] * b[i+15:i]
+	dst[i+15:i] := tmp[31:16]
+ENDFOR
+	</operation>
+	<instruction name='pmulhuw' form='mm, mm'/>
+	<header>xmmintrin.h</header>
+</intrinsic>
+<intrinsic tech='SSE' rettype='__m64' name='_mm_avg_pu8'>
+	<type>Integer</type>
+	<CPUID>SSE</CPUID>
+	<category>Probability/Statistics</category>
+	<parameter varname='a' type='__m64' />
+	<parameter varname='b' type='__m64' />
+	<description>Average packed unsigned 8-bit integers in "a" and "b", and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*8
+	dst[i+7:i] := (a[i+7:i] + b[i+7:i] + 1) &gt;&gt; 1
+ENDFOR
+	</operation>
+	<instruction name='pavgb' form='mm, mm'/>
+	<header>xmmintrin.h</header>
+</intrinsic>
+<intrinsic tech='SSE' rettype='__m64' name='_m_pavgb'>
+	<type>Integer</type>
+	<CPUID>SSE</CPUID>
+	<category>Probability/Statistics</category>
+	<parameter varname='a' type='__m64' />
+	<parameter varname='b' type='__m64' />
+	<description>Average packed unsigned 8-bit integers in "a" and "b", and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*8
+	dst[i+7:i] := (a[i+7:i] + b[i+7:i] + 1) &gt;&gt; 1
+ENDFOR
+	</operation>
+	<instruction name='pavgb' form='mm, mm'/>
+	<header>xmmintrin.h</header>
+</intrinsic>
+<intrinsic tech='SSE' rettype='__m64' name='_mm_avg_pu16'>
+	<type>Integer</type>
+	<CPUID>SSE</CPUID>
+	<category>Probability/Statistics</category>
+	<parameter varname='a' type='__m64' />
+	<parameter varname='b' type='__m64' />
+	<description>Average packed unsigned 16-bit integers in "a" and "b", and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 3
+	i := j*16
+	dst[i+15:i] := (a[i+15:i] + b[i+15:i] + 1) &gt;&gt; 1
+ENDFOR
+	</operation>
+	<instruction name='pavgw' form='mm, mm'/>
+	<header>xmmintrin.h</header>
+</intrinsic>
+<intrinsic tech='SSE' rettype='__m64' name='_m_pavgw'>
+	<type>Integer</type>
+	<CPUID>SSE</CPUID>
+	<category>Probability/Statistics</category>
+	<parameter varname='a' type='__m64' />
+	<parameter varname='b' type='__m64' />
+	<description>Average packed unsigned 16-bit integers in "a" and "b", and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 3
+	i := j*16
+	dst[i+15:i] := (a[i+15:i] + b[i+15:i] + 1) &gt;&gt; 1
+ENDFOR
+	</operation>
+	<instruction name='pavgw' form='mm, mm'/>
+	<header>xmmintrin.h</header>
+</intrinsic>
+<intrinsic tech='SSE' rettype='__m64' name='_mm_sad_pu8'>
+	<type>Integer</type>
+	<CPUID>SSE</CPUID>
+	<category>Arithmetic</category>
+	<category>Miscellaneous</category>
+	<parameter varname='a' type='__m64' />
+	<parameter varname='b' type='__m64' />
+	<description>Compute the absolute differences of packed unsigned 8-bit integers in "a" and "b", then horizontally sum the 8 differences to produce a single unsigned 16-bit integer, and store this unsigned 16-bit integer in the low 16 bits of "dst". The upper 48 bits of "dst" are set to zero.</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*8
+	tmp[i+7:i] := ABS(a[i+7:i] - b[i+7:i])
+ENDFOR
+
+dst[15:0] := tmp[7:0] + tmp[15:8] + tmp[23:16] + tmp[31:24] + tmp[39:32] + tmp[47:40] + tmp[55:48] + tmp[63:56]
+dst[63:16] := 0
+	</operation>
+	<instruction name='psadbw' form='mm, mm'/>
+	<header>xmmintrin.h</header>
+</intrinsic>
+<intrinsic tech='SSE' rettype='__m64' name='_m_psadbw'>
+	<type>Floating Point</type>
+	<type>Integer</type>
+	<CPUID>SSE</CPUID>
+	<category>Arithmetic</category>
+	<category>Miscellaneous</category>
+	<parameter varname='a' type='__m64' />
+	<parameter varname='b' type='__m64' />
+	<description>Compute the absolute differences of packed unsigned 8-bit integers in "a" and "b", then horizontally sum the 8 differences to produce a single unsigned 16-bit integer, and store this unsigned 16-bit integer in the low 16 bits of "dst". The upper 48 bits of "dst" are set to zero.</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*8
+	tmp[i+7:i] := ABS(a[i+7:i] - b[i+7:i])
+ENDFOR
+
+dst[15:0] := tmp[7:0] + tmp[15:8] + tmp[23:16] + tmp[31:24] + tmp[39:32] + tmp[47:40] + tmp[55:48] + tmp[63:56]
+dst[63:16] := 0
+	</operation>
+	<instruction name='psadbw' form='mm, mm'/>
+	<header>xmmintrin.h</header>
+</intrinsic>
+<intrinsic tech='SSE' vexEq='TRUE' rettype='__m128' name='_mm_cvtsi32_ss'>
+	<type>Floating Point</type>
+	<CPUID>SSE</CPUID>
+	<category>Convert</category>
+	<parameter varname='a' type='__m128' />
+	<parameter varname='b' type='int' />
+	<description>Convert the 32-bit integer "b" to a single-precision (32-bit) floating-point element, store the result in the lower element of "dst", and copy the upper 3 packed elements from "a" to the upper elements of "dst".</description>
+	<operation>
+dst[31:0] := Convert_Int32_To_FP32(b[31:0])
+dst[127:32] := a[127:32]
+	</operation>
+	<instruction name='cvtsi2ss' form='xmm, r32'/>
+	<header>xmmintrin.h</header>
+</intrinsic>
+<intrinsic tech='SSE' vexEq='TRUE' rettype='__m128' name='_mm_cvt_si2ss'>
+	<type>Integer</type>
+	<CPUID>SSE</CPUID>
+	<category>Convert</category>
+	<parameter varname='a' type='__m128' />
+	<parameter varname='b' type='int' />
+	<description>Convert the 32-bit integer "b" to a single-precision (32-bit) floating-point element, store the result in the lower element of "dst", and copy the upper 3 packed elements from "a" to the upper elements of "dst".</description>
+	<operation>
+dst[31:0] := Convert_Int32_To_FP32(b[31:0])
+dst[127:32] := a[127:32]
+	</operation>
+	<instruction name='cvtsi2ss' form='xmm, r32'/>
+	<header>xmmintrin.h</header>
+</intrinsic>
+<intrinsic tech='SSE' rettype='__m128' name='_mm_cvtsi64_ss'>
+	<type>Floating Point</type>
+	<type>Integer</type>
+	<CPUID>SSE</CPUID>
+	<category>Convert</category>
+	<parameter varname='a' type='__m128' />
+	<parameter varname='b' type='__int64' />
+	<description>Convert the 64-bit integer "b" to a single-precision (32-bit) floating-point element, store the result in the lower element of "dst", and copy the upper 3 packed elements from "a" to the upper elements of "dst".</description>
+	<operation>
+dst[31:0] := Convert_Int64_To_FP32(b[63:0])
+dst[127:32] := a[127:32]
+dst[MAX:128] := 0
+	</operation>
+	<instruction name='cvtsi2ss' form='xmm, r64'/>
+	<header>xmmintrin.h</header>
+</intrinsic>
+<intrinsic tech='SSE' rettype='__m128' name='_mm_cvtpi32_ps'>
+	<type>Floating Point</type>
+	<CPUID>SSE</CPUID>
+	<category>Convert</category>
+	<parameter varname='a' type='__m128' />
+	<parameter varname='b' type='__m64' />
+	<description>Convert packed 32-bit integers in "b" to packed single-precision (32-bit) floating-point elements, store the results in the lower 2 elements of "dst", and copy the upper 2 packed elements from "a" to the upper elements of "dst". </description>
+	<operation>
+dst[31:0] := Convert_Int32_To_FP32(b[31:0])
+dst[63:32] := Convert_Int32_To_FP32(b[63:32])
+dst[95:64] := a[95:64]
+dst[127:96] := a[127:96]
+	</operation>
+	<instruction name='cvtpi2ps' form='xmm, mm'/>
+	<header>xmmintrin.h</header>
+</intrinsic>
+<intrinsic tech='SSE' rettype='__m128' name='_mm_cvt_pi2ps'>
+	<type>Integer</type>
+	<CPUID>SSE</CPUID>
+	<category>Convert</category>
+	<parameter varname='a' type='__m128' />
+	<parameter varname='b' type='__m64' />
+	<description>Convert packed 32-bit integers in "b" to packed single-precision (32-bit) floating-point elements, store the results in the lower 2 elements of "dst", and copy the upper 2 packed elements from "a" to the upper elements of "dst". </description>
+	<operation>
+dst[31:0] := Convert_Int32_To_FP32(b[31:0])
+dst[63:32] := Convert_Int32_To_FP32(b[63:32])
+dst[95:64] := a[95:64]
+dst[127:96] := a[127:96]
+	</operation>
+	<instruction name='cvtpi2ps' form='xmm, mm'/>
+	<header>xmmintrin.h</header>
+</intrinsic>
+<intrinsic tech='SSE' sequence='true' rettype='__m128' name='_mm_cvtpi16_ps'>
+	<type>Floating Point</type>
+	<CPUID>SSE</CPUID>
+	<category>Convert</category>
+	<parameter varname='a' type='__m64' />
+	<description>Convert packed 16-bit integers in "a" to packed single-precision (32-bit) floating-point elements, and store the results in "dst". </description>
+	<operation>
+FOR j := 0 to 3
+	i := j*16
+	m := j*32
+	dst[m+31:m] := Convert_Int16_To_FP32(a[i+15:i])
+ENDFOR
+	</operation>
+	<header>xmmintrin.h</header>
+</intrinsic>
+<intrinsic tech='SSE' sequence='true' rettype='__m128' name='_mm_cvtpu16_ps'>
+	<type>Floating Point</type>
+	<type>Integer</type>
+	<CPUID>SSE</CPUID>
+	<category>Convert</category>
+	<parameter varname='a' type='__m64' />
+	<description>Convert packed unsigned 16-bit integers in "a" to packed single-precision (32-bit) floating-point elements, and store the results in "dst". </description>
+	<operation>
+FOR j := 0 to 3
+	i := j*16
+	m := j*32
+	dst[m+31:m] := Convert_UnsignedInt16_To_FP32(a[i+15:i])
+ENDFOR
+	</operation>
+	<header>xmmintrin.h</header>
+</intrinsic>
+<intrinsic tech='SSE' sequence='true' rettype='__m128' name='_mm_cvtpi8_ps'>
+	<type>Floating Point</type>
+	<CPUID>SSE</CPUID>
+	<category>Convert</category>
+	<parameter varname='a' type='__m64' />
+	<description>Convert the lower packed 8-bit integers in "a" to packed single-precision (32-bit) floating-point elements, and store the results in "dst". </description>
+	<operation>
+FOR j := 0 to 3
+	i := j*8
+	m := j*32
+	dst[m+31:m] := Convert_Int8_To_FP32(a[i+7:i])
+ENDFOR
+	</operation>
+	<header>xmmintrin.h</header>
+</intrinsic>
+<intrinsic tech='SSE' sequence='true' rettype='__m128' name='_mm_cvtpu8_ps'>
+	<type>Floating Point</type>
+	<type>Integer</type>
+	<CPUID>SSE</CPUID>
+	<category>Convert</category>
+	<parameter varname='a' type='__m64' />
+	<description>Convert the lower packed unsigned 8-bit integers in "a" to packed single-precision (32-bit) floating-point elements, and store the results in "dst". </description>
+	<operation>
+FOR j := 0 to 3
+	i := j*8
+	m := j*32
+	dst[m+31:m] := Convert_UnsignedInt8_To_FP32(a[i+7:i])
+ENDFOR
+	</operation>
+	<header>xmmintrin.h</header>
+</intrinsic>
+<intrinsic tech='SSE' sequence='true' rettype='__m128' name='_mm_cvtpi32x2_ps'>
+	<type>Floating Point</type>
+	<CPUID>SSE</CPUID>
+	<category>Convert</category>
+	<parameter varname='a' type='__m64' />
+	<parameter varname='b' type='__m64' />
+	<description>Convert packed 32-bit integers in "a" to packed single-precision (32-bit) floating-point elements, store the results in the lower 2 elements of "dst", then convert the packed 32-bit integers in "b" to single-precision (32-bit) floating-point elements, and store the results in the upper 2 elements of "dst". </description>
+	<operation>
+dst[31:0] := Convert_Int32_To_FP32(a[31:0])
+dst[63:32] := Convert_Int32_To_FP32(a[63:32])
+dst[95:64] := Convert_Int32_To_FP32(b[31:0])
+dst[127:96] := Convert_Int32_To_FP32(b[63:32])
+	</operation>
+	<header>xmmintrin.h</header>
+</intrinsic>
+<intrinsic tech='SSE' rettype='void' name='_mm_stream_pi'>
+	<type>Integer</type>
+	<CPUID>SSE</CPUID>
+	<category>Store</category>
+	<parameter varname='mem_addr' type='__m64*' />
+	<parameter varname='a' type='__m64' />
+	<description>Store 64-bits of integer data from "a" into memory using a non-temporal memory hint.</description>
+	<operation>
+MEM[mem_addr+63:mem_addr] := a[63:0]
+	</operation>
+	<instruction name='movntq' form='m64, mm'/>
+	<header>xmmintrin.h</header>
+</intrinsic>
+<intrinsic tech='SSE' rettype='void' name='_mm_maskmove_si64'>
+	<type>Integer</type>
+	<CPUID>SSE</CPUID>
+	<category>Store</category>
+	<parameter varname='a' type='__m64' />
+	<parameter varname='mask' type='__m64' />
+	<parameter varname='mem_addr' type='char*' />
+	<description>Conditionally store 8-bit integer elements from "a" into memory using "mask" (elements are not stored when the highest bit is not set in the corresponding element) and a non-temporal memory hint.</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*8
+	IF mask[i+7]
+		MEM[mem_addr+i+7:mem_addr+i] := a[i+7:i]
+	FI
+ENDFOR
+	</operation>
+	<instruction name='maskmovq' form='mm, mm'/>
+	<header>xmmintrin.h</header>
+</intrinsic>
+<intrinsic tech='SSE' rettype='void' name='_m_maskmovq'>
+	<type>Integer</type>
+	<CPUID>SSE</CPUID>
+	<category>Store</category>
+	<parameter varname='a' type='__m64' />
+	<parameter varname='mask' type='__m64' />
+	<parameter varname='mem_addr' type='char*' />
+	<description>Conditionally store 8-bit integer elements from "a" into memory using "mask" (elements are not stored when the highest bit is not set in the corresponding element).</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*8
+	IF mask[i+7]
+		MEM[mem_addr+i+7:mem_addr+i] := a[i+7:i]
+	FI
+ENDFOR
+	</operation>
+	<instruction name='maskmovq' form='mm, mm'/>
+	<header>xmmintrin.h</header>
+</intrinsic>
+<intrinsic tech='SSE' rettype='int' name='_mm_extract_pi16'>
+	<type>Integer</type>
+	<CPUID>SSE</CPUID>
+	<category>Swizzle</category>
+	<parameter varname='a' type='__m64' />
+	<parameter varname="imm8" type='int' />
+	<description>Extract a 16-bit integer from "a", selected with "imm8", and store the result in the lower element of "dst".</description>
+	<operation>
+dst[15:0] := (a[63:0] &gt;&gt; (imm8[1:0] * 16))[15:0]
+dst[31:16] := 0
+	</operation>
+	<instruction name='pextrw' form='r32, mm, imm'/>
+	<header>xmmintrin.h</header>
+</intrinsic>
+<intrinsic tech='SSE' rettype='int' name='_m_pextrw'>
+	<type>Integer</type>
+	<CPUID>SSE</CPUID>
+	<category>Swizzle</category>
+	<parameter varname='a' type='__m64' />
+	<parameter varname="imm8" type='int' />
+	<description>Extract a 16-bit integer from "a", selected with "imm8", and store the result in the lower element of "dst".</description>
+	<operation>
+dst[15:0] := (a[63:0] &gt;&gt; (imm8[1:0] * 16))[15:0]
+dst[31:16] := 0
+	</operation>
+	<instruction name='pextrw' form='r32, mm, imm'/>
+	<header>xmmintrin.h</header>
+</intrinsic>
+<intrinsic tech='SSE' rettype='__m64' name='_mm_insert_pi16'>
+	<type>Integer</type>
+	<CPUID>SSE</CPUID>
+	<category>Swizzle</category>
+	<parameter varname='a' type='__m64' />
+	<parameter varname='i' type='int' />
+	<parameter varname="imm8" type='int' />
+	<description>Copy "a" to "dst", and insert the 16-bit integer "i" into "dst" at the location specified by "imm8". </description>
+	<operation>
+dst[63:0] := a[63:0]
+sel := imm8[1:0]*16
+dst[sel+15:sel] := i[15:0]
+	</operation>
+	<instruction name='pinsrw' form='xmm, r32, imm'/>
+	<header>xmmintrin.h</header>
+</intrinsic>
+<intrinsic tech='SSE' rettype='__m64' name='_m_pinsrw'>
+	<type>Integer</type>
+	<CPUID>SSE</CPUID>
+	<category>Swizzle</category>
+	<parameter varname='a' type='__m64' />
+	<parameter varname='i' type='int' />
+	<parameter varname="imm8" type='int' />
+	<description>Copy "a" to "dst", and insert the 16-bit integer "i" into "dst" at the location specified by "imm8". </description>
+	<operation>
+dst[63:0] := a[63:0]
+sel := imm8[1:0]*16
+dst[sel+15:sel] := i[15:0]
+	</operation>
+	<instruction name='pinsrw' form='mm, r32, imm'/>
+	<header>xmmintrin.h</header>
+</intrinsic>
+<intrinsic tech='SSE' rettype='int' name='_mm_movemask_pi8'>
+	<type>Integer</type>
+	<CPUID>SSE</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname='a' type='__m64' />
+	<description>Create mask from the most significant bit of each 8-bit element in "a", and store the result in "dst".</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*8
+	dst[j] := a[i+7]
+ENDFOR
+dst[MAX:8] := 0
+	</operation>
+	<instruction name='pmovmskb' form='r32, mm'/>
+	<header>xmmintrin.h</header>
+</intrinsic>
+<intrinsic tech='SSE' rettype='int' name='_m_pmovmskb'>
+	<type>Integer</type>
+	<CPUID>SSE</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname='a' type='__m64' />
+	<description>Create mask from the most significant bit of each 8-bit element in "a", and store the result in "dst".</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*8
+	dst[j] := a[i+7]
+ENDFOR
+dst[MAX:8] := 0
+	</operation>
+	<instruction name='pmovmskb' form='r32, mm'/>
+	<header>xmmintrin.h</header>
+</intrinsic>
+<intrinsic tech='SSE' rettype='__m64' name='_mm_shuffle_pi16'>
+	<type>Integer</type>
+	<CPUID>SSE</CPUID>
+	<category>Swizzle</category>
+	<parameter varname='a' type='__m64' />
+	<parameter varname="imm8" type='int' />
+	<description>Shuffle 16-bit integers in "a" using the control in "imm8", and store the results in "dst".</description>
+	<operation>
+SELECT4(src, control){
+	CASE(control[1:0])
+	0:	tmp[15:0] := src[15:0]
+	1:	tmp[15:0] := src[31:16]
+	2:	tmp[15:0] := src[47:32]
+	3:	tmp[15:0] := src[63:48]
+	ESAC
+	RETURN tmp[15:0]
+}
+
+dst[15:0] := SELECT4(a[63:0], imm8[1:0])
+dst[31:16] := SELECT4(a[63:0], imm8[3:2])
+dst[47:32] := SELECT4(a[63:0], imm8[5:4])
+dst[63:48] := SELECT4(a[63:0], imm8[7:6])
+	</operation>
+	<instruction name='pshufw' form='mm, mm, imm'/>
+	<header>xmmintrin.h</header>
+</intrinsic>
+<intrinsic tech='SSE' rettype='__m64' name='_m_pshufw'>
+	<type>Floating Point</type>
+	<type>Integer</type>
+	<CPUID>SSE</CPUID>
+	<category>Swizzle</category>
+	<parameter varname='a' type='__m64' />
+	<parameter varname="imm8" type='int' />
+	<description>Shuffle 16-bit integers in "a" using the control in "imm8", and store the results in "dst".</description>
+	<operation>
+SELECT4(src, control){
+	CASE(control[1:0])
+	0:	tmp[15:0] := src[15:0]
+	1:	tmp[15:0] := src[31:16]
+	2:	tmp[15:0] := src[47:32]
+	3:	tmp[15:0] := src[63:48]
+	ESAC
+	RETURN tmp[15:0]
+}
+
+dst[15:0] := SELECT4(a[63:0], imm8[1:0])
+dst[31:16] := SELECT4(a[63:0], imm8[3:2])
+dst[47:32] := SELECT4(a[63:0], imm8[5:4])
+dst[63:48] := SELECT4(a[63:0], imm8[7:6])
+	</operation>
+	<instruction name='pshufw' form='mm, mm, imm'/>
+	<header>xmmintrin.h</header>
+</intrinsic>
+<intrinsic tech='SSE' vexEq='TRUE' rettype='__m128' name='_mm_add_ss'>
+	<type>Floating Point</type>
+	<CPUID>SSE</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname='a' type='__m128' />
+	<parameter varname='b' type='__m128' />
+	<description>Add the lower single-precision (32-bit) floating-point element in "a" and "b", store the result in the lower element of "dst", and copy the upper 3 packed elements from "a" to the upper elements of "dst". </description>
+	<operation>
+dst[31:0] := a[31:0] + b[31:0]
+dst[127:32] := a[127:32]
+	</operation>
+	<instruction name='addss' form='xmm, xmm'/>
+	<header>xmmintrin.h</header>
+</intrinsic>
+<intrinsic tech='SSE' vexEq='TRUE' rettype='__m128' name='_mm_add_ps'>
+	<type>Floating Point</type>
+	<CPUID>SSE</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname='a' type='__m128' />
+	<parameter varname='b' type='__m128' />
+	<description>Add packed single-precision (32-bit) floating-point elements in "a" and "b", and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 3
+	i := j*32
+	dst[i+31:i] := a[i+31:i] + b[i+31:i]
+ENDFOR
+	</operation>
+	<instruction name='addps' form='xmm, xmm'/>
+	<header>xmmintrin.h</header>
+</intrinsic>
+<intrinsic tech='SSE' vexEq='TRUE' rettype='__m128' name='_mm_sub_ss'>
+	<type>Floating Point</type>
+	<CPUID>SSE</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname='a' type='__m128' />
+	<parameter varname='b' type='__m128' />
+	<description>Subtract the lower single-precision (32-bit) floating-point element in "b" from the lower single-precision (32-bit) floating-point element in "a", store the result in the lower element of "dst", and copy the upper 3 packed elements from "a" to the upper elements of "dst".</description>
+	<operation>
+dst[31:0] := a[31:0] - b[31:0]
+dst[127:32] := a[127:32]
+	</operation>
+	<instruction name='subss' form='xmm, xmm'/>
+	<header>xmmintrin.h</header>
+</intrinsic>
+<intrinsic tech='SSE' vexEq='TRUE' rettype='__m128' name='_mm_sub_ps'>
+	<type>Floating Point</type>
+	<CPUID>SSE</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname='a' type='__m128' />
+	<parameter varname='b' type='__m128' />
+	<description>Subtract packed single-precision (32-bit) floating-point elements in "b" from packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 3
+	i := j*32
+	dst[i+31:i] := a[i+31:i] - b[i+31:i]
+ENDFOR
+	</operation>
+	<instruction name='subps' form='xmm, xmm'/>
+	<header>xmmintrin.h</header>
+</intrinsic>
+<intrinsic tech='SSE' vexEq='TRUE' rettype='__m128' name='_mm_mul_ss'>
+	<type>Floating Point</type>
+	<CPUID>SSE</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname='a' type='__m128' />
+	<parameter varname='b' type='__m128' />
+	<description>Multiply the lower single-precision (32-bit) floating-point element in "a" and "b", store the result in the lower element of "dst", and copy the upper 3 packed elements from "a" to the upper elements of "dst".</description>
+	<operation>
+dst[31:0] := a[31:0] * b[31:0]
+dst[127:32] := a[127:32]
+	</operation>
+	<instruction name='mulss' form='xmm, xmm'/>
+	<header>xmmintrin.h</header>
+</intrinsic>
+<intrinsic tech='SSE' vexEq='TRUE' rettype='__m128' name='_mm_mul_ps'>
+	<type>Floating Point</type>
+	<CPUID>SSE</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname='a' type='__m128' />
+	<parameter varname='b' type='__m128' />
+	<description>Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", and store the results in "dst".
+	</description>
+	<operation>
+FOR j := 0 to 3
+	i := j*32
+	dst[i+31:i] := a[i+31:i] * b[i+31:i]
+ENDFOR
+	</operation>
+	<instruction name='mulps' form='xmm, xmm'/>
+	<header>xmmintrin.h</header>
+</intrinsic>
+<intrinsic tech='SSE' vexEq='TRUE' rettype='__m128' name='_mm_div_ss'>
+	<type>Floating Point</type>
+	<CPUID>SSE</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname='a' type='__m128' />
+	<parameter varname='b' type='__m128' />
+	<description>Divide the lower single-precision (32-bit) floating-point element in "a" by the lower single-precision (32-bit) floating-point element in "b", store the result in the lower element of "dst", and copy the upper 3 packed elements from "a" to the upper elements of "dst". </description>
+	<operation>
+dst[31:0] := a[31:0] / b[31:0]
+dst[127:32] := a[127:32]
+	</operation>
+	<instruction name='divss' form='xmm, xmm'/>
+	<header>xmmintrin.h</header>
+</intrinsic>
+<intrinsic tech='SSE' vexEq='TRUE' rettype='__m128' name='_mm_div_ps'>
+	<type>Floating Point</type>
+	<CPUID>SSE</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname='a' type='__m128' />
+	<parameter varname='b' type='__m128' />
+	<description>Divide packed single-precision (32-bit) floating-point elements in "a" by packed elements in "b", and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 3
+	i := 32*j
+	dst[i+31:i] := a[i+31:i] / b[i+31:i]
+ENDFOR
+	</operation>
+	<instruction name='divps' form='xmm, xmm'/>
+	<header>xmmintrin.h</header>
+</intrinsic>
+<intrinsic tech='SSE' vexEq='TRUE' rettype='__m128' name='_mm_sqrt_ss'>
+	<type>Floating Point</type>
+	<CPUID>SSE</CPUID>
+	<category>Elementary Math Functions</category>
+	<parameter varname='a' type='__m128' />
+	<description>Compute the square root of the lower single-precision (32-bit) floating-point element in "a", store the result in the lower element of "dst", and copy the upper 3 packed elements from "a" to the upper elements of "dst".</description>
+	<operation>
+dst[31:0] := SQRT(a[31:0])
+dst[127:32] := a[127:32]
+	</operation>
+	<instruction name='sqrtss' form='xmm, xmm'/>
+	<header>xmmintrin.h</header>
+</intrinsic>
+<intrinsic tech='SSE' vexEq='TRUE' rettype='__m128' name='_mm_sqrt_ps'>
+	<type>Floating Point</type>
+	<CPUID>SSE</CPUID>
+	<category>Elementary Math Functions</category>
+	<parameter varname='a' type='__m128' />
+	<description>Compute the square root of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 3
+	i := j*32
+	dst[i+31:i] := SQRT(a[i+31:i])
+ENDFOR
+	</operation>
+	<instruction name='sqrtps' form='xmm, xmm'/>
+	<header>xmmintrin.h</header>
+</intrinsic>
+<intrinsic tech='SSE' vexEq='TRUE' rettype='__m128' name='_mm_rcp_ss'>
+	<type>Floating Point</type>
+	<CPUID>SSE</CPUID>
+	<category>Elementary Math Functions</category>
+	<parameter varname='a' type='__m128' />
+	<description>Compute the approximate reciprocal of the lower single-precision (32-bit) floating-point element in "a", store the result in the lower element of "dst", and copy the upper 3 packed elements from "a" to the upper elements of "dst". The maximum relative error for this approximation is less than 1.5*2^-12.</description>
+	<operation>
+dst[31:0] := APPROXIMATE(1.0/a[31:0])
+dst[127:32] := a[127:32]
+	</operation>
+	<instruction name='rcpss' form='xmm, xmm'/>
+	<header>xmmintrin.h</header>
+</intrinsic>
+<intrinsic tech='SSE' vexEq='TRUE' rettype='__m128' name='_mm_rcp_ps'>
+	<type>Floating Point</type>
+	<CPUID>SSE</CPUID>
+	<category>Elementary Math Functions</category>
+	<parameter varname='a' type='__m128' />
+	<description>Compute the approximate reciprocal of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst". The maximum relative error for this approximation is less than 1.5*2^-12.</description>
+	<operation>
+FOR j := 0 to 3
+	i := j*32
+	dst[i+31:i] := APPROXIMATE(1.0/a[i+31:i])
+ENDFOR
+	</operation>
+	<instruction name='rcpps' form='xmm, xmm'/>
+	<header>xmmintrin.h</header>
+</intrinsic>
+<intrinsic tech='SSE' vexEq='TRUE' rettype='__m128' name='_mm_rsqrt_ss'>
+	<type>Floating Point</type>
+	<CPUID>SSE</CPUID>
+	<category>Elementary Math Functions</category>
+	<parameter varname='a' type='__m128' />
+	<description>Compute the approximate reciprocal square root of the lower single-precision (32-bit) floating-point element in "a", store the result in the lower element of "dst", and copy the upper 3 packed elements from "a" to the upper elements of "dst". The maximum relative error for this approximation is less than 1.5*2^-12.</description>
+	<operation>
+dst[31:0] := APPROXIMATE(1.0 / SQRT(a[31:0]))
+dst[127:32] := a[127:32]
+	</operation>
+	<instruction name='rsqrtss' form='xmm, xmm'/>
+	<header>xmmintrin.h</header>
+</intrinsic>
+<intrinsic tech='SSE' vexEq='TRUE' rettype='__m128' name='_mm_rsqrt_ps'>
+	<type>Floating Point</type>
+	<CPUID>SSE</CPUID>
+	<category>Elementary Math Functions</category>
+	<parameter varname='a' type='__m128' />
+	<description>Compute the approximate reciprocal square root of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst". The maximum relative error for this approximation is less than 1.5*2^-12.</description>
+	<operation>
+FOR j := 0 to 3
+	i := j*32
+	dst[i+31:i] := APPROXIMATE(1.0 / SQRT(a[i+31:i]))
+ENDFOR
+	</operation>
+	<instruction name='rsqrtps' form='xmm, xmm'/>
+	<header>xmmintrin.h</header>
+</intrinsic>
+<intrinsic tech='SSE' vexEq='TRUE' rettype='__m128' name='_mm_min_ss'>
+	<type>Floating Point</type>
+	<CPUID>SSE</CPUID>
+	<category>Special Math Functions</category>
+	<parameter varname='a' type='__m128' />
+	<parameter varname='b' type='__m128' />
+	<description>Compare the lower single-precision (32-bit) floating-point elements in "a" and "b", store the minimum value in the lower element of "dst", and copy the upper 3 packed elements from "a" to the upper elements of "dst".</description>
+	<operation>
+dst[31:0] := MIN(a[31:0], b[31:0])
+dst[127:32] := a[127:32]
+	</operation>
+	<instruction name='minss' form='xmm, xmm'/>
+	<header>xmmintrin.h</header>
+</intrinsic>
+<intrinsic tech='SSE' vexEq='TRUE' rettype='__m128' name='_mm_min_ps'>
+	<type>Floating Point</type>
+	<CPUID>SSE</CPUID>
+	<category>Special Math Functions</category>
+	<parameter varname='a' type='__m128' />
+	<parameter varname='b' type='__m128' />
+	<description>Compare packed single-precision (32-bit) floating-point elements in "a" and "b", and store packed minimum values in "dst".
+	</description>
+	<operation>
+FOR j := 0 to 3
+	i := j*32
+	dst[i+31:i] := MIN(a[i+31:i], b[i+31:i])
+ENDFOR
+	</operation>
+	<instruction name='minps' form='xmm, xmm'/>
+	<header>xmmintrin.h</header>
+</intrinsic>
+<intrinsic tech='SSE' vexEq='TRUE' rettype='__m128' name='_mm_max_ss'>
+	<type>Floating Point</type>
+	<CPUID>SSE</CPUID>
+	<category>Special Math Functions</category>
+	<parameter varname='a' type='__m128' />
+	<parameter varname='b' type='__m128' />
+	<description>Compare the lower single-precision (32-bit) floating-point elements in "a" and "b", store the maximum value in the lower element of "dst", and copy the upper 3 packed elements from "a" to the upper elements of "dst".</description>
+	<operation>
+dst[31:0] := MAX(a[31:0], b[31:0])
+dst[127:32] := a[127:32]
+	</operation>
+	<instruction name='maxss' form='xmm, xmm'/>
+	<header>xmmintrin.h</header>
+</intrinsic>
+<intrinsic tech='SSE' vexEq='TRUE' rettype='__m128' name='_mm_max_ps'>
+	<type>Floating Point</type>
+	<CPUID>SSE</CPUID>
+	<category>Special Math Functions</category>
+	<parameter varname='a' type='__m128' />
+	<parameter varname='b' type='__m128' />
+	<description>Compare packed single-precision (32-bit) floating-point elements in "a" and "b", and store packed maximum values in "dst".</description>
+	<operation>
+FOR j := 0 to 3
+	i := j*32
+	dst[i+31:i] := MAX(a[i+31:i], b[i+31:i])
+ENDFOR
+	</operation>
+	<instruction name='maxps' form='xmm, xmm'/>
+	<header>xmmintrin.h</header>
+</intrinsic>
+<intrinsic tech='SSE' vexEq='TRUE' rettype='__m128' name='_mm_and_ps'>
+	<type>Floating Point</type>
+	<CPUID>SSE</CPUID>
+	<category>Logical</category>
+	<parameter varname='a' type='__m128' />
+	<parameter varname='b' type='__m128' />
+	<description>Compute the bitwise AND of packed single-precision (32-bit) floating-point elements in "a" and "b", and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 3
+	i := j*32
+	dst[i+31:i] := (a[i+31:i] AND b[i+31:i])
+ENDFOR
+	</operation>
+	<instruction name='andps' form='xmm, xmm'/>
+	<header>xmmintrin.h</header>
+</intrinsic>
+<intrinsic tech='SSE' vexEq='TRUE' rettype='__m128' name='_mm_andnot_ps'>
+	<type>Floating Point</type>
+	<CPUID>SSE</CPUID>
+	<category>Logical</category>
+	<parameter varname='a' type='__m128' />
+	<parameter varname='b' type='__m128' />
+	<description>Compute the bitwise NOT of packed single-precision (32-bit) floating-point elements in "a" and then AND with "b", and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 3
+	i := j*32
+	dst[i+31:i] := ((NOT a[i+31:i]) AND b[i+31:i])
+ENDFOR
+	</operation>
+	<instruction name='andnps' form='xmm, xmm'/>
+	<header>xmmintrin.h</header>
+</intrinsic>
+<intrinsic tech='SSE' vexEq='TRUE' rettype='__m128' name='_mm_or_ps'>
+	<type>Floating Point</type>
+	<CPUID>SSE</CPUID>
+	<category>Logical</category>
+	<parameter varname='a' type='__m128' />
+	<parameter varname='b' type='__m128' />
+	<description>Compute the bitwise OR of packed single-precision (32-bit) floating-point elements in "a" and "b", and store the results in "dst".
+	</description>
+	<operation>
+FOR j := 0 to 3
+	i := j*32
+	dst[i+31:i] := a[i+31:i] BITWISE OR b[i+31:i]
+ENDFOR
+	</operation>
+	<instruction name='orps' form='xmm, xmm'/>
+	<header>xmmintrin.h</header>
+</intrinsic>
+<intrinsic tech='SSE' vexEq='TRUE' rettype='__m128' name='_mm_xor_ps'>
+	<type>Floating Point</type>
+	<CPUID>SSE</CPUID>
+	<category>Logical</category>
+	<parameter varname='a' type='__m128' />
+	<parameter varname='b' type='__m128' />
+	<description>Compute the bitwise XOR of packed single-precision (32-bit) floating-point elements in "a" and "b", and store the results in "dst".
+	</description>
+	<operation>
+FOR j := 0 to 3
+	i := j*32
+	dst[i+31:i] := a[i+31:i] XOR b[i+31:i]
+ENDFOR
+	</operation>
+	<instruction name='xorps' form='xmm, xmm'/>
+	<header>xmmintrin.h</header>
+</intrinsic>
+<intrinsic tech='SSE' vexEq='TRUE' rettype='__m128' name='_mm_cmpeq_ss'>
+	<type>Floating Point</type>
+	<CPUID>SSE</CPUID>
+	<category>Compare</category>
+	<parameter varname='a' type='__m128' />
+	<parameter varname='b' type='__m128' />
+	<description>Compare the lower single-precision (32-bit) floating-point elements in "a" and "b" for equality, store the result in the lower element of "dst", and copy the upper 3 packed elements from "a" to the upper elements of "dst".</description>
+	<operation>
+dst[31:0] := ( a[31:0] == b[31:0] ) ? 0xffffffff : 0
+dst[127:32] := a[127:32]
+	</operation>
+	<instruction name='cmpss' form='xmm, xmm, imm'/>
+	<header>xmmintrin.h</header>
+</intrinsic>
+<intrinsic tech='SSE' vexEq='TRUE' rettype='__m128' name='_mm_cmpeq_ps'>
+	<type>Floating Point</type>
+	<CPUID>SSE</CPUID>
+	<category>Compare</category>
+	<parameter varname='a' type='__m128' />
+	<parameter varname='b' type='__m128' />
+	<description>Compare packed single-precision (32-bit) floating-point elements in "a" and "b" for equality, and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 3
+	i := j*32
+	dst[i+31:i] := ( a[i+31:i] == b[i+31:i] ) ? 0xffffffff : 0
+ENDFOR
+	</operation>
+	<instruction name='cmpps' form='xmm, xmm, imm'/>
+	<header>xmmintrin.h</header>
+</intrinsic>
+<intrinsic tech='SSE' vexEq='TRUE' rettype='__m128' name='_mm_cmplt_ss'>
+	<type>Floating Point</type>
+	<CPUID>SSE</CPUID>
+	<category>Compare</category>
+	<parameter varname='a' type='__m128' />
+	<parameter varname='b' type='__m128' />
+	<description>Compare the lower single-precision (32-bit) floating-point elements in "a" and "b" for less-than, store the result in the lower element of "dst", and copy the upper 3 packed elements from "a" to the upper elements of "dst".</description>
+	<operation>
+dst[31:0] := ( a[31:0] &lt; b[31:0] ) ? 0xffffffff : 0
+dst[127:32] := a[127:32]
+	</operation>
+	<instruction name='cmpss' form='xmm, xmm, imm'/>
+	<header>xmmintrin.h</header>
+</intrinsic>
+<intrinsic tech='SSE' vexEq='TRUE' rettype='__m128' name='_mm_cmplt_ps'>
+	<type>Floating Point</type>
+	<CPUID>SSE</CPUID>
+	<category>Compare</category>
+	<parameter varname='a' type='__m128' />
+	<parameter varname='b' type='__m128' />
+	<description>Compare packed single-precision (32-bit) floating-point elements in "a" and "b" for less-than, and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 3
+	i := j*32
+	dst[i+31:i] := ( a[i+31:i] &lt; b[i+31:i] ) ? 0xffffffff : 0
+ENDFOR
+	</operation>
+	<instruction name='cmpps' form='xmm, xmm, imm'/>
+	<header>xmmintrin.h</header>
+</intrinsic>
+<intrinsic tech='SSE' vexEq='TRUE' rettype='__m128' name='_mm_cmple_ss'>
+	<type>Floating Point</type>
+	<CPUID>SSE</CPUID>
+	<category>Compare</category>
+	<parameter varname='a' type='__m128' />
+	<parameter varname='b' type='__m128' />
+	<description>Compare the lower single-precision (32-bit) floating-point elements in "a" and "b" for less-than-or-equal, store the result in the lower element of "dst", and copy the upper 3 packed elements from "a" to the upper elements of "dst".</description>
+	<operation>
+dst[31:0] := ( a[31:0] &lt;= b[31:0] ) ? 0xffffffff : 0
+dst[127:32] := a[127:32]
+	</operation>
+	<instruction name='cmpss' form='xmm, xmm, imm'/>
+	<header>xmmintrin.h</header>
+</intrinsic>
+<intrinsic tech='SSE' vexEq='TRUE' rettype='__m128' name='_mm_cmple_ps'>
+	<type>Floating Point</type>
+	<CPUID>SSE</CPUID>
+	<category>Compare</category>
+	<parameter varname='a' type='__m128' />
+	<parameter varname='b' type='__m128' />
+	<description>Compare packed single-precision (32-bit) floating-point elements in "a" and "b" for less-than-or-equal, and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 3
+	i := j*32
+	dst[i+31:i] := ( a[i+31:i] &lt;= b[i+31:i] ) ? 0xffffffff : 0
+ENDFOR
+	</operation>
+	<instruction name='cmpps' form='xmm, xmm, imm'/>
+	<header>xmmintrin.h</header>
+</intrinsic>
+<intrinsic tech='SSE' vexEq='TRUE' rettype='__m128' name='_mm_cmpgt_ss'>
+	<type>Floating Point</type>
+	<CPUID>SSE</CPUID>
+	<category>Compare</category>
+	<parameter varname='a' type='__m128' />
+	<parameter varname='b' type='__m128' />
+	<description>Compare the lower single-precision (32-bit) floating-point elements in "a" and "b" for greater-than, store the result in the lower element of "dst", and copy the upper 3 packed elements from "a" to the upper elements of "dst".</description>
+	<operation>
+dst[31:0] := ( a[31:0] &gt; b[31:0] ) ? 0xffffffff : 0
+dst[127:32] := a[127:32]
+	</operation>
+	<instruction name='cmpss' form='xmm, xmm, imm'/>
+	<header>xmmintrin.h</header>
+</intrinsic>
+<intrinsic tech='SSE' vexEq='TRUE' rettype='__m128' name='_mm_cmpgt_ps'>
+	<type>Floating Point</type>
+	<CPUID>SSE</CPUID>
+	<category>Compare</category>
+	<parameter varname='a' type='__m128' />
+	<parameter varname='b' type='__m128' />
+	<description>Compare packed single-precision (32-bit) floating-point elements in "a" and "b" for greater-than, and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 3
+	i := j*32
+	dst[i+31:i] := ( a[i+31:i] &gt; b[i+31:i] ) ? 0xffffffff : 0
+ENDFOR
+	</operation>
+	<instruction name='cmpps' form='xmm, xmm, imm'/>
+	<header>xmmintrin.h</header>
+</intrinsic>
+<intrinsic tech='SSE' vexEq='TRUE' rettype='__m128' name='_mm_cmpge_ss'>
+	<type>Floating Point</type>
+	<CPUID>SSE</CPUID>
+	<category>Compare</category>
+	<parameter varname='a' type='__m128' />
+	<parameter varname='b' type='__m128' />
+	<description>Compare the lower single-precision (32-bit) floating-point elements in "a" and "b" for greater-than-or-equal, store the result in the lower element of "dst", and copy the upper 3 packed elements from "a" to the upper elements of "dst".</description>
+	<operation>
+dst[31:0] := ( a[31:0] &gt;= b[31:0] ) ? 0xffffffff : 0
+dst[127:32] := a[127:32]
+	</operation>
+	<instruction name='cmpss' form='xmm, xmm, imm'/>
+	<header>xmmintrin.h</header>
+</intrinsic>
+<intrinsic tech='SSE' vexEq='TRUE' rettype='__m128' name='_mm_cmpge_ps'>
+	<type>Floating Point</type>
+	<CPUID>SSE</CPUID>
+	<category>Compare</category>
+	<parameter varname='a' type='__m128' />
+	<parameter varname='b' type='__m128' />
+	<description>Compare packed single-precision (32-bit) floating-point elements in "a" and "b" for greater-than-or-equal, and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 3
+	i := j*32
+	dst[i+31:i] := ( a[i+31:i] &gt;= b[i+31:i] ) ? 0xffffffff : 0
+ENDFOR
+	</operation>
+	<instruction name='cmpps' form='xmm, xmm, imm'/>
+	<header>xmmintrin.h</header>
+</intrinsic>
+<intrinsic tech='SSE' vexEq='TRUE' rettype='__m128' name='_mm_cmpneq_ss'>
+	<type>Floating Point</type>
+	<CPUID>SSE</CPUID>
+	<category>Compare</category>
+	<parameter varname='a' type='__m128' />
+	<parameter varname='b' type='__m128' />
+	<description>Compare the lower single-precision (32-bit) floating-point elements in "a" and "b" for not-equal, store the result in the lower element of "dst", and copy the upper 3 packed elements from "a" to the upper elements of "dst".</description>
+	<operation>
+dst[31:0] := ( a[31:0] != b[31:0] ) ? 0xffffffff : 0
+dst[127:32] := a[127:32]
+	</operation>
+	<instruction name='cmpss' form='xmm, xmm, imm'/>
+	<header>xmmintrin.h</header>
+</intrinsic>
+<intrinsic tech='SSE' vexEq='TRUE' rettype='__m128' name='_mm_cmpneq_ps'>
+	<type>Floating Point</type>
+	<CPUID>SSE</CPUID>
+	<category>Compare</category>
+	<parameter varname='a' type='__m128' />
+	<parameter varname='b' type='__m128' />
+	<description>Compare packed single-precision (32-bit) floating-point elements in "a" and "b" for not-equal, and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 3
+	i := j*32
+	dst[i+31:i] := ( a[i+31:i] != b[i+31:i] ) ? 0xffffffff : 0
+ENDFOR
+	</operation>
+	<instruction name='cmpps' form='xmm, xmm, imm'/>
+	<header>xmmintrin.h</header>
+</intrinsic>
+<intrinsic tech='SSE' vexEq='TRUE' rettype='__m128' name='_mm_cmpnlt_ss'>
+	<type>Floating Point</type>
+	<CPUID>SSE</CPUID>
+	<category>Compare</category>
+	<parameter varname='a' type='__m128' />
+	<parameter varname='b' type='__m128' />
+	<description>Compare the lower single-precision (32-bit) floating-point elements in "a" and "b" for not-less-than, store the result in the lower element of "dst", and copy the upper 3 packed elements from "a" to the upper elements of "dst".</description>
+	<operation>
+dst[31:0] := !( a[31:0] &lt; b[31:0] ) ? 0xffffffff : 0
+dst[127:32] := a[127:32]
+	</operation>
+	<instruction name='cmpss' form='xmm, xmm, imm'/>
+	<header>xmmintrin.h</header>
+</intrinsic>
+<intrinsic tech='SSE' vexEq='TRUE' rettype='__m128' name='_mm_cmpnlt_ps'>
+	<type>Floating Point</type>
+	<CPUID>SSE</CPUID>
+	<category>Compare</category>
+	<parameter varname='a' type='__m128' />
+	<parameter varname='b' type='__m128' />
+	<description>Compare packed single-precision (32-bit) floating-point elements in "a" and "b" for not-less-than, and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 3
+	i := j*32
+	dst[i+31:i] := !( a[i+31:i] &lt; b[i+31:i] ) ? 0xffffffff : 0
+ENDFOR
+	</operation>
+	<instruction name='cmpps' form='xmm, xmm, imm'/>
+	<header>xmmintrin.h</header>
+</intrinsic>
+<intrinsic tech='SSE' vexEq='TRUE' rettype='__m128' name='_mm_cmpnle_ss'>
+	<type>Floating Point</type>
+	<CPUID>SSE</CPUID>
+	<category>Compare</category>
+	<parameter varname='a' type='__m128' />
+	<parameter varname='b' type='__m128' />
+	<description>Compare the lower single-precision (32-bit) floating-point elements in "a" and "b" for not-less-than-or-equal, store the result in the lower element of "dst", and copy the upper 3 packed elements from "a" to the upper elements of "dst".</description>
+	<operation>
+dst[31:0] := !( a[31:0] &lt;= b[31:0] ) ? 0xffffffff : 0
+dst[127:32] := a[127:32]
+	</operation>
+	<instruction name='cmpss' form='xmm, xmm, imm'/>
+	<header>xmmintrin.h</header>
+</intrinsic>
+<intrinsic tech='SSE' vexEq='TRUE' rettype='__m128' name='_mm_cmpnle_ps'>
+	<type>Floating Point</type>
+	<CPUID>SSE</CPUID>
+	<category>Compare</category>
+	<parameter varname='a' type='__m128' />
+	<parameter varname='b' type='__m128' />
+	<description>Compare packed single-precision (32-bit) floating-point elements in "a" and "b" for not-less-than-or-equal, and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 3
+	i := j*32
+	dst[i+31:i] := !( a[i+31:i] &lt;= b[i+31:i] ) ? 0xffffffff : 0
+ENDFOR
+	</operation>
+	<instruction name='cmpps' form='xmm, xmm, imm'/>
+	<header>xmmintrin.h</header>
+</intrinsic>
+<intrinsic tech='SSE' vexEq='TRUE' rettype='__m128' name='_mm_cmpngt_ss'>
+	<type>Floating Point</type>
+	<CPUID>SSE</CPUID>
+	<category>Compare</category>
+	<parameter varname='a' type='__m128' />
+	<parameter varname='b' type='__m128' />
+	<description>Compare the lower single-precision (32-bit) floating-point elements in "a" and "b" for not-greater-than, store the result in the lower element of "dst", and copy the upper 3 packed elements from "a" to the upper elements of "dst".</description>
+	<operation>
+dst[31:0] := !( a[31:0] &gt; b[31:0] ) ? 0xffffffff : 0
+dst[127:32] := a[127:32]
+	</operation>
+	<instruction name='cmpss' form='xmm, xmm, imm'/>
+	<header>xmmintrin.h</header>
+</intrinsic>
+<intrinsic tech='SSE' vexEq='TRUE' rettype='__m128' name='_mm_cmpngt_ps'>
+	<type>Floating Point</type>
+	<CPUID>SSE</CPUID>
+	<category>Compare</category>
+	<parameter varname='a' type='__m128' />
+	<parameter varname='b' type='__m128' />
+	<description>Compare packed single-precision (32-bit) floating-point elements in "a" and "b" for not-greater-than, and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 3
+	i := j*32
+	dst[i+31:i] := !( a[i+31:i] &gt; b[i+31:i] ) ? 0xffffffff : 0
+ENDFOR
+	</operation>
+	<instruction name='cmpps' form='xmm, xmm, imm'/>
+	<header>xmmintrin.h</header>
+</intrinsic>
+<intrinsic tech='SSE' vexEq='TRUE' rettype='__m128' name='_mm_cmpnge_ss'>
+	<type>Floating Point</type>
+	<CPUID>SSE</CPUID>
+	<category>Compare</category>
+	<parameter varname='a' type='__m128' />
+	<parameter varname='b' type='__m128' />
+	<description>Compare the lower single-precision (32-bit) floating-point elements in "a" and "b" for not-greater-than-or-equal, store the result in the lower element of "dst", and copy the upper 3 packed elements from "a" to the upper elements of "dst".</description>
+	<operation>
+dst[31:0] := !( a[31:0] &gt;= b[31:0] ) ? 0xffffffff : 0
+dst[127:32] := a[127:32]
+	</operation>
+	<instruction name='cmpss' form='xmm, xmm, imm'/>
+	<header>xmmintrin.h</header>
+</intrinsic>
+<intrinsic tech='SSE' vexEq='TRUE' rettype='__m128' name='_mm_cmpnge_ps'>
+	<type>Floating Point</type>
+	<CPUID>SSE</CPUID>
+	<category>Compare</category>
+	<parameter varname='a' type='__m128' />
+	<parameter varname='b' type='__m128' />
+	<description>Compare packed single-precision (32-bit) floating-point elements in "a" and "b" for not-greater-than-or-equal, and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 3
+	i := j*32
+	dst[i+31:i] := !( a[i+31:i] &gt;= b[i+31:i] ) ? 0xffffffff : 0
+ENDFOR
+	</operation>
+	<instruction name='cmpps' form='xmm, xmm, imm'/>
+	<header>xmmintrin.h</header>
+</intrinsic>
+<intrinsic tech='SSE' vexEq='TRUE' rettype='__m128' name='_mm_cmpord_ss'>
+	<type>Floating Point</type>
+	<CPUID>SSE</CPUID>
+	<category>Compare</category>
+	<parameter varname='a' type='__m128' />
+	<parameter varname='b' type='__m128' />
+	<description>Compare the lower single-precision (32-bit) floating-point elements in "a" and "b" to see if neither is NaN, store the result in the lower element of "dst", and copy the upper 3 packed elements from "a" to the upper elements of "dst".</description>
+	<operation>
+dst[31:0] := ( a[31:0] != NaN AND b[31:0] != NaN ) ? 0xffffffff : 0
+dst[127:32] := a[127:32]
+	</operation>
+	<instruction name='cmpss' form='xmm, xmm, imm'/>
+	<header>xmmintrin.h</header>
+</intrinsic>
+<intrinsic tech='SSE' vexEq='TRUE' rettype='__m128' name='_mm_cmpord_ps'>
+	<type>Floating Point</type>
+	<CPUID>SSE</CPUID>
+	<category>Compare</category>
+	<parameter varname='a' type='__m128' />
+	<parameter varname='b' type='__m128' />
+	<description>Compare packed single-precision (32-bit) floating-point elements in "a" and "b" to see if neither is NaN, and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 3
+	i := j*32
+	dst[i+31:i] := ( a[i+31:i] != NaN AND b[i+31:i] != NaN ) ? 0xffffffff : 0
+ENDFOR
+	</operation>
+	<instruction name='cmpps' form='xmm, xmm, imm'/>
+	<header>xmmintrin.h</header>
+</intrinsic>
+<intrinsic tech='SSE' vexEq='TRUE' rettype='__m128' name='_mm_cmpunord_ss'>
+	<type>Floating Point</type>
+	<CPUID>SSE</CPUID>
+	<category>Compare</category>
+	<parameter varname='a' type='__m128' />
+	<parameter varname='b' type='__m128' />
+	<description>Compare the lower single-precision (32-bit) floating-point elements in "a" and "b" to see if either is NaN, store the result in the lower element of "dst", and copy the upper 3 packed elements from "a" to the upper elements of "dst".</description>
+	<operation>
+dst[31:0] := ( a[31:0] == NaN OR b[31:0] == NaN ) ? 0xffffffff : 0
+dst[127:32] := a[127:32]
+	</operation>
+	<instruction name='cmpss' form='xmm, xmm, imm'/>
+	<header>xmmintrin.h</header>
+</intrinsic>
+<intrinsic tech='SSE' vexEq='TRUE' rettype='__m128' name='_mm_cmpunord_ps'>
+	<type>Floating Point</type>
+	<CPUID>SSE</CPUID>
+	<category>Compare</category>
+	<parameter varname='a' type='__m128' />
+	<parameter varname='b' type='__m128' />
+	<description>Compare packed single-precision (32-bit) floating-point elements in "a" and "b" to see if either is NaN, and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 3
+	i := j*32
+	dst[i+31:i] := ( a[i+31:i] == NaN OR b[i+31:i] == NaN ) ? 0xffffffff : 0
+ENDFOR
+	</operation>
+	<instruction name='cmpps' form='xmm, xmm, imm'/>
+	<header>xmmintrin.h</header>
+</intrinsic>
+<intrinsic tech='SSE' rettype='int' name='_mm_comieq_ss'>
+	<type>Floating Point</type>
+	<CPUID>SSE</CPUID>
+	<category>Compare</category>
+	<parameter varname='a' type='__m128' />
+	<parameter varname='b' type='__m128' />
+	<description>Compare the lower single-precision (32-bit) floating-point element in "a" and "b" for equality, and return the boolean result (0 or 1).</description>
+	<operation>
+RETURN ( a[31:0] == b[31:0] ) ? 1 : 0
+	</operation>
+	<instruction name='comiss' form='xmm, xmm'/>
+	<header>xmmintrin.h</header>
+</intrinsic>
+<intrinsic tech='SSE' rettype='int' name='_mm_comilt_ss'>
+	<type>Floating Point</type>
+	<CPUID>SSE</CPUID>
+	<category>Compare</category>
+	<parameter varname='a' type='__m128' />
+	<parameter varname='b' type='__m128' />
+	<description>Compare the lower single-precision (32-bit) floating-point element in "a" and "b" for less-than, and return the boolean result (0 or 1).</description>
+	<operation>
+RETURN ( a[31:0] &lt; b[31:0] ) ? 1 : 0
+	</operation>
+	<instruction name='comiss' form='xmm, xmm'/>
+	<header>xmmintrin.h</header>
+</intrinsic>
+<intrinsic tech='SSE' rettype='int' name='_mm_comile_ss'>
+	<type>Floating Point</type>
+	<CPUID>SSE</CPUID>
+	<category>Compare</category>
+	<parameter varname='a' type='__m128' />
+	<parameter varname='b' type='__m128' />
+	<description>Compare the lower single-precision (32-bit) floating-point element in "a" and "b" for less-than-or-equal, and return the boolean result (0 or 1).</description>
+	<operation>
+RETURN ( a[31:0] &lt;= b[31:0] ) ? 1 : 0
+	</operation>
+	<instruction name='comiss' form='xmm, xmm'/>
+	<header>xmmintrin.h</header>
+</intrinsic>
+<intrinsic tech='SSE' rettype='int' name='_mm_comigt_ss'>
+	<type>Floating Point</type>
+	<CPUID>SSE</CPUID>
+	<category>Compare</category>
+	<parameter varname='a' type='__m128' />
+	<parameter varname='b' type='__m128' />
+	<description>Compare the lower single-precision (32-bit) floating-point element in "a" and "b" for greater-than, and return the boolean result (0 or 1).</description>
+	<operation>
+RETURN ( a[31:0] &gt; b[31:0] ) ? 1 : 0
+	</operation>
+	<instruction name='comiss' form='xmm, xmm'/>
+	<header>xmmintrin.h</header>
+</intrinsic>
+<intrinsic tech='SSE' rettype='int' name='_mm_comige_ss'>
+	<type>Floating Point</type>
+	<CPUID>SSE</CPUID>
+	<category>Compare</category>
+	<parameter varname='a' type='__m128' />
+	<parameter varname='b' type='__m128' />
+	<description>Compare the lower single-precision (32-bit) floating-point element in "a" and "b" for greater-than-or-equal, and return the boolean result (0 or 1).</description>
+	<operation>
+RETURN ( a[31:0] &gt;= b[31:0] ) ? 1 : 0
+	</operation>
+	<instruction name='comiss' form='xmm, xmm'/>
+	<header>xmmintrin.h</header>
+</intrinsic>
+<intrinsic tech='SSE' rettype='int' name='_mm_comineq_ss'>
+	<type>Floating Point</type>
+	<CPUID>SSE</CPUID>
+	<category>Compare</category>
+	<parameter varname='a' type='__m128' />
+	<parameter varname='b' type='__m128' />
+	<description>Compare the lower single-precision (32-bit) floating-point element in "a" and "b" for not-equal, and return the boolean result (0 or 1).</description>
+	<operation>
+RETURN ( a[31:0] != b[31:0] ) ? 1 : 0
+	</operation>
+	<instruction name='comiss' form='xmm, xmm'/>
+	<header>xmmintrin.h</header>
+</intrinsic>
+<intrinsic tech='SSE' vexEq='TRUE' dontShowZeroUnmodMsg='TRUE' rettype='int' name='_mm_ucomieq_ss'>
+	<type>Floating Point</type>
+	<CPUID>SSE</CPUID>
+	<category>Compare</category>
+	<parameter varname='a' type='__m128' />
+	<parameter varname='b' type='__m128' />
+	<description>Compare the lower single-precision (32-bit) floating-point element in "a" and "b" for equality, and return the boolean result (0 or 1). This instruction will not signal an exception for QNaNs.</description>
+	<operation>
+RETURN ( a[31:0] == b[31:0] ) ? 1 : 0
+	</operation>
+	<instruction name='ucomiss' form='xmm, xmm'/>
+	<header>xmmintrin.h</header>
+</intrinsic>
+<intrinsic tech='SSE' vexEq='TRUE' dontShowZeroUnmodMsg='TRUE' rettype='int' name='_mm_ucomilt_ss'>
+	<type>Floating Point</type>
+	<CPUID>SSE</CPUID>
+	<category>Compare</category>
+	<parameter varname='a' type='__m128' />
+	<parameter varname='b' type='__m128' />
+	<description>Compare the lower single-precision (32-bit) floating-point element in "a" and "b" for less-than, and return the boolean result (0 or 1). This instruction will not signal an exception for QNaNs.</description>
+	<operation>
+RETURN ( a[31:0] &lt; b[31:0] ) ? 1 : 0
+	</operation>
+	<instruction name='ucomiss' form='xmm, xmm'/>
+	<header>xmmintrin.h</header>
+</intrinsic>
+<intrinsic tech='SSE' vexEq='TRUE' dontShowZeroUnmodMsg='TRUE' rettype='int' name='_mm_ucomile_ss'>
+	<type>Floating Point</type>
+	<CPUID>SSE</CPUID>
+	<category>Compare</category>
+	<parameter varname='a' type='__m128' />
+	<parameter varname='b' type='__m128' />
+	<description>Compare the lower single-precision (32-bit) floating-point element in "a" and "b" for less-than-or-equal, and return the boolean result (0 or 1). This instruction will not signal an exception for QNaNs.</description>
+	<operation>
+RETURN ( a[31:0] &lt;= b[31:0] ) ? 1 : 0
+	</operation>
+	<instruction name='ucomiss' form='xmm, xmm'/>
+	<header>xmmintrin.h</header>
+</intrinsic>
+<intrinsic tech='SSE' vexEq='TRUE' dontShowZeroUnmodMsg='TRUE' rettype='int' name='_mm_ucomigt_ss'>
+	<type>Floating Point</type>
+	<CPUID>SSE</CPUID>
+	<category>Compare</category>
+	<parameter varname='a' type='__m128' />
+	<parameter varname='b' type='__m128' />
+	<description>Compare the lower single-precision (32-bit) floating-point element in "a" and "b" for greater-than, and return the boolean result (0 or 1). This instruction will not signal an exception for QNaNs.</description>
+	<operation>
+RETURN ( a[31:0] &gt; b[31:0] ) ? 1 : 0
+	</operation>
+	<instruction name='ucomiss' form='xmm, xmm'/>
+	<header>xmmintrin.h</header>
+</intrinsic>
+<intrinsic tech='SSE' vexEq='TRUE' dontShowZeroUnmodMsg='TRUE' rettype='int' name='_mm_ucomige_ss'>
+	<type>Floating Point</type>
+	<CPUID>SSE</CPUID>
+	<category>Compare</category>
+	<parameter varname='a' type='__m128' />
+	<parameter varname='b' type='__m128' />
+	<description>Compare the lower single-precision (32-bit) floating-point element in "a" and "b" for greater-than-or-equal, and return the boolean result (0 or 1). This instruction will not signal an exception for QNaNs.</description>
+	<operation>
+RETURN ( a[31:0] &gt;= b[31:0] ) ? 1 : 0
+	</operation>
+	<instruction name='ucomiss' form='xmm, xmm'/>
+	<header>xmmintrin.h</header>
+</intrinsic>
+<intrinsic tech='SSE' vexEq='TRUE' dontShowZeroUnmodMsg='TRUE' rettype='int' name='_mm_ucomineq_ss'>
+	<type>Floating Point</type>
+	<CPUID>SSE</CPUID>
+	<category>Compare</category>
+	<parameter varname='a' type='__m128' />
+	<parameter varname='b' type='__m128' />
+	<description>Compare the lower single-precision (32-bit) floating-point element in "a" and "b" for not-equal, and return the boolean result (0 or 1). This instruction will not signal an exception for QNaNs.</description>
+	<operation>
+RETURN ( a[31:0] != b[31:0] ) ? 1 : 0
+	</operation>
+	<instruction name='ucomiss' form='xmm, xmm'/>
+	<header>xmmintrin.h</header>
+</intrinsic>
+<intrinsic tech='SSE' vexEq='TRUE' dontShowZeroUnmodMsg='TRUE' rettype='int' name='_mm_cvtss_si32'>
+	<type>Floating Point</type>
+	<type>Integer</type>
+	<CPUID>SSE</CPUID>
+	<category>Convert</category>
+	<parameter varname='a' type='__m128' />
+	<description>Convert the lower single-precision (32-bit) floating-point element in "a" to a 32-bit integer, and store the result in "dst".</description>
+	<operation>
+dst[31:0] := Convert_FP32_To_Int32(a[31:0])
+	</operation>
+	<instruction name='cvtss2si' form='r32, xmm'/>
+	<header>xmmintrin.h</header>
+</intrinsic>
+<intrinsic tech='SSE' vexEq='TRUE' dontShowZeroUnmodMsg='TRUE' rettype='int' name='_mm_cvt_ss2si'>
+	<type>Floating Point</type>
+	<CPUID>SSE</CPUID>
+	<category>Convert</category>
+	<parameter varname='a' type='__m128' />
+	<description>Convert the lower single-precision (32-bit) floating-point element in "a" to a 32-bit integer, and store the result in "dst".</description>
+	<operation>
+dst[31:0] := Convert_FP32_To_Int32(a[31:0])
+	</operation>
+	<instruction name='cvtss2si' form='r32, xmm'/>
+	<header>xmmintrin.h</header>
+</intrinsic>
+<intrinsic tech='SSE' rettype='__int64' name='_mm_cvtss_si64'>
+	<type>Floating Point</type>
+	<type>Integer</type>
+	<CPUID>SSE</CPUID>
+	<category>Convert</category>
+	<parameter varname='a' type='__m128' />
+	<description>Convert the lower single-precision (32-bit) floating-point element in "a" to a 64-bit integer, and store the result in "dst".</description>
+	<operation>
+dst[63:0] := Convert_FP32_To_Int64(a[31:0])
+	</operation>
+	<instruction name='cvtss2si' form='r64, xmm'/>
+	<header>xmmintrin.h</header>
+</intrinsic>
+<intrinsic tech='SSE' rettype='float' name='_mm_cvtss_f32'>
+	<type>Floating Point</type>
+	<CPUID>SSE</CPUID>
+	<category>Convert</category>
+	<parameter varname='a' type='__m128' />
+	<description>Copy the lower single-precision (32-bit) floating-point element of "a" to "dst".</description>
+	<operation>dst[31:0] := a[31:0]</operation>
+	<instruction name='movss' form='m32, xmm'/>
+	<header>xmmintrin.h</header>
+</intrinsic>
+<intrinsic tech='SSE' rettype='__m64' name='_mm_cvtps_pi32'>
+	<type>Floating Point</type>
+	<type>Integer</type>
+	<CPUID>SSE</CPUID>
+	<category>Convert</category>
+	<parameter varname='a' type='__m128' />
+	<description>Convert packed single-precision (32-bit) floating-point elements in "a" to packed 32-bit integers, and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 1
+	i := 32*j
+	dst[i+31:i] := Convert_FP32_To_Int32(a[i+31:i])
+ENDFOR
+	</operation>
+	<instruction name='cvtps2pi' form='mm, xmm'/>
+	<header>xmmintrin.h</header>
+</intrinsic>
+<intrinsic tech='SSE' rettype='__m64' name='_mm_cvt_ps2pi'>
+	<type>Floating Point</type>
+	<CPUID>SSE</CPUID>
+	<category>Convert</category>
+	<parameter varname='a' type='__m128' />
+	<description>Convert packed single-precision (32-bit) floating-point elements in "a" to packed 32-bit integers, and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 1
+	i := 32*j
+	dst[i+31:i] := Convert_FP32_To_Int32(a[i+31:i])
+ENDFOR
+	</operation>
+	<instruction name='cvtps2pi' form='mm, xmm'/>
+	<header>xmmintrin.h</header>
+</intrinsic>
+<intrinsic tech='SSE' vexEq='TRUE' dontShowZeroUnmodMsg='TRUE' rettype='int' name='_mm_cvttss_si32'>
+	<type>Floating Point</type>
+	<type>Integer</type>
+	<CPUID>SSE</CPUID>
+	<category>Convert</category>
+	<parameter varname='a' type='__m128' />
+	<description>Convert the lower single-precision (32-bit) floating-point element in "a" to a 32-bit integer with truncation, and store the result in "dst".</description>
+	<operation>
+dst[31:0] := Convert_FP32_To_Int32_Truncate(a[31:0])
+	</operation>
+	<instruction name='cvttss2si' form='r32, xmm'/>
+	<header>xmmintrin.h</header>
+</intrinsic>
+<intrinsic tech='SSE' vexEq='TRUE' dontShowZeroUnmodMsg='TRUE' rettype='int' name='_mm_cvtt_ss2si'>
+	<type>Floating Point</type>
+	<CPUID>SSE</CPUID>
+	<category>Convert</category>
+	<parameter varname='a' type='__m128' />
+	<description>Convert the lower single-precision (32-bit) floating-point element in "a" to a 32-bit integer with truncation, and store the result in "dst".</description>
+	<operation>
+dst[31:0] := Convert_FP32_To_Int32_Truncate(a[31:0])
+	</operation>
+	<instruction name='cvttss2si' form='r32, xmm'/>
+	<header>xmmintrin.h</header>
+</intrinsic>
+<intrinsic tech='SSE' rettype='__int64' name='_mm_cvttss_si64'>
+	<type>Floating Point</type>
+	<type>Integer</type>
+	<CPUID>SSE</CPUID>
+	<category>Convert</category>
+	<parameter varname='a' type='__m128' />
+	<description>Convert the lower single-precision (32-bit) floating-point element in "a" to a 64-bit integer with truncation, and store the result in "dst".</description>
+	<operation>
+dst[63:0] := Convert_FP32_To_Int64_Truncate(a[31:0])
+	</operation>
+	<instruction name='cvttss2si' form='r64, xmm'/>
+	<header>xmmintrin.h</header>
+</intrinsic>
+<intrinsic tech='SSE' rettype='__m64' name='_mm_cvttps_pi32'>
+	<type>Floating Point</type>
+	<type>Integer</type>
+	<CPUID>SSE</CPUID>
+	<category>Convert</category>
+	<parameter varname='a' type='__m128' />
+	<description>Convert packed single-precision (32-bit) floating-point elements in "a" to packed 32-bit integers with truncation, and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 1
+	i := 32*j
+	dst[i+31:i] := Convert_FP32_To_Int32_Truncate(a[i+31:i])
+ENDFOR
+	</operation>
+	<instruction name='cvttps2pi' form='mm, xmm'/>
+	<header>xmmintrin.h</header>
+</intrinsic>
+<intrinsic tech='SSE' rettype='__m64' name='_mm_cvtt_ps2pi'>
+	<type>Floating Point</type>
+	<CPUID>SSE</CPUID>
+	<category>Convert</category>
+	<parameter varname='a' type='__m128' />
+	<description>Convert packed single-precision (32-bit) floating-point elements in "a" to packed 32-bit integers with truncation, and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 1
+	i := 32*j
+	dst[i+31:i] := Convert_FP32_To_Int32_Truncate(a[i+31:i])
+ENDFOR
+	</operation>
+	<instruction name='cvttps2pi' form='mm, xmm'/>
+	<header>xmmintrin.h</header>
+</intrinsic>
+<intrinsic tech='SSE' sequence='true' rettype='__m64' name='_mm_cvtps_pi16'>
+	<type>Floating Point</type>
+	<type>Integer</type>
+	<CPUID>SSE</CPUID>
+	<category>Convert</category>
+	<parameter varname='a' type='__m128' />
+	<description>Convert packed single-precision (32-bit) floating-point elements in "a" to packed 16-bit integers, and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 3
+	i := 16*j
+	k := 32*j
+	dst[i+15:i] := Convert_FP32_To_Int16(a[k+31:k])
+ENDFOR
+	</operation>
+	<header>xmmintrin.h</header>
+</intrinsic>
+<intrinsic tech='SSE' sequence='true' rettype='__m64' name='_mm_cvtps_pi8'>
+	<type>Floating Point</type>
+	<type>Integer</type>
+	<CPUID>SSE</CPUID>
+	<category>Convert</category>
+	<parameter varname='a' type='__m128' />
+	<description>Convert packed single-precision (32-bit) floating-point elements in "a" to packed 8-bit integers, and store the results in the lower 4 elements of "dst".</description>
+	<operation>
+FOR j := 0 to 3
+	i := 8*j
+	k := 32*j
+	dst[i+7:i] := Convert_FP32_To_Int8(a[k+31:k])
+ENDFOR
+	</operation>
+	<header>xmmintrin.h</header>
+</intrinsic>
+<intrinsic tech='SSE' sequence='true' rettype='__m128' name='_mm_set_ss'>
+	<type>Floating Point</type>
+	<CPUID>SSE</CPUID>
+	<category>Set</category>
+	<parameter varname='a' type='float' />
+	<description>Copy single-precision (32-bit) floating-point element "a" to the lower element of "dst", and zero the upper 3 elements.</description>
+	<operation>
+dst[31:0] := a[31:0]
+dst[127:32] := 0
+	</operation>
+	<header>xmmintrin.h</header>
+</intrinsic>
+<intrinsic tech='SSE' sequence='true' rettype='__m128' name='_mm_set1_ps'>
+	<type>Floating Point</type>
+	<CPUID>SSE</CPUID>
+	<category>Set</category>
+	<parameter varname='a' type='float' />
+	<description>Broadcast single-precision (32-bit) floating-point value "a" to all elements of "dst".</description>
+	<operation>
+FOR j := 0 to 3
+	i := j*32
+	dst[i+31:i] := a[31:0]
+ENDFOR
+	</operation>
+	<header>xmmintrin.h</header>
+</intrinsic>
+<intrinsic tech='SSE' sequence='true' rettype='__m128' name='_mm_set_ps1'>
+	<type>Floating Point</type>
+	<CPUID>SSE</CPUID>
+	<category>Set</category>
+	<parameter varname='a' type='float' />
+	<description>Broadcast single-precision (32-bit) floating-point value "a" to all elements of "dst".</description>
+	<operation>
+FOR j := 0 to 3
+	i := j*32
+	dst[i+31:i] := a[31:0]
+ENDFOR
+	</operation>
+	<header>xmmintrin.h</header>
+</intrinsic>
+<intrinsic tech='SSE' sequence='true' rettype='__m128' name='_mm_set_ps'>
+	<type>Floating Point</type>
+	<CPUID>SSE</CPUID>
+	<category>Set</category>
+	<parameter varname='e3' type='float'/>
+	<parameter varname='e2' type='float'/>
+	<parameter varname='e1' type='float'/>
+	<parameter varname='e0' type='float'/>
+	<description>Set packed single-precision (32-bit) floating-point elements in "dst" with the supplied values.</description>
+	<operation>
+dst[31:0] := e0
+dst[63:32] := e1
+dst[95:64] := e2
+dst[127:96] := e3
+	</operation>
+	<header>xmmintrin.h</header>
+</intrinsic>
+<intrinsic tech='SSE' sequence='true' rettype='__m128' name='_mm_setr_ps'>
+	<type>Floating Point</type>
+	<CPUID>SSE</CPUID>
+	<category>Set</category>
+	<parameter varname='e3' type='float'/>
+	<parameter varname='e2' type='float'/>
+	<parameter varname='e1' type='float'/>
+	<parameter varname='e0' type='float'/>
+	<description>Set packed single-precision (32-bit) floating-point elements in "dst" with the supplied values in reverse order.</description>
+	<operation>
+dst[31:0] := e3
+dst[63:32] := e2
+dst[95:64] := e1
+dst[127:96] := e0
+	</operation>
+	<header>xmmintrin.h</header>
+</intrinsic>
+<intrinsic tech='SSE' rettype='__m128' name='_mm_setzero_ps'>
+	<type>Floating Point</type>
+	<CPUID>SSE</CPUID>
+	<category>Set</category>
+	<parameter varname='' type='void' />
+	<description>Return vector of type __m128 with all elements set to zero.</description>
+	<operation>
+dst[MAX:0] := 0
+	</operation>
+	<instruction name='xorps' form='xmm, xmm'/>
+	<header>xmmintrin.h</header>
+</intrinsic>
+<intrinsic tech='SSE' vexEq='TRUE' rettype='__m128' name='_mm_loadh_pi'>
+	<type>Integer</type>
+	<CPUID>SSE</CPUID>
+	<category>Load</category>
+	<parameter varname='a' type='__m128' />
+	<parameter varname='mem_addr' type='__m64 const*' />
+	<description>Load 2 single-precision (32-bit) floating-point elements from memory into the upper 2 elements of "dst", and copy the lower 2 elements from "a" to "dst". "mem_addr" does not need to be aligned on any particular boundary.</description>
+	<operation>
+dst[31:0] := a[31:0]
+dst[63:32] := a[63:32]
+dst[95:64] := MEM[mem_addr+31:mem_addr]
+dst[127:96] := MEM[mem_addr+63:mem_addr+32]
+	</operation>
+	<instruction name='movhps' form='xmm, m64'/>
+	<header>xmmintrin.h</header>
+</intrinsic>
+<intrinsic tech='SSE' vexEq='TRUE' rettype='__m128' name='_mm_loadl_pi'>
+	<type>Integer</type>
+	<CPUID>SSE</CPUID>
+	<category>Load</category>
+	<parameter varname='a' type='__m128' />
+	<parameter varname='mem_addr' type='__m64 const*' />
+	<description>Load 2 single-precision (32-bit) floating-point elements from memory into the lower 2 elements of "dst", and copy the upper 2 elements from "a" to "dst". "mem_addr" does not need to be aligned on any particular boundary.</description>
+	<operation>
+dst[31:0] := MEM[mem_addr+31:mem_addr]
+dst[63:32] := MEM[mem_addr+63:mem_addr+32]
+dst[95:64] := a[95:64]
+dst[127:96] := a[127:96]
+	</operation>
+	<instruction name='movlps' form='xmm, m64'/>
+	<header>xmmintrin.h</header>
+</intrinsic>
+<intrinsic tech='SSE' vexEq='TRUE' rettype='__m128' name='_mm_load_ss'>
+	<type>Floating Point</type>
+	<CPUID>SSE</CPUID>
+	<category>Load</category>
+	<parameter varname='mem_addr' type='float const*' />
+	<description>Load a single-precision (32-bit) floating-point element from memory into the lower element of "dst", and zero the upper 3 elements. "mem_addr" does not need to be aligned on any particular boundary.</description>
+	<operation>
+dst[31:0] := MEM[mem_addr+31:mem_addr]
+dst[127:32] := 0
+	</operation>
+	<instruction name='movss' form='xmm, m32'/>
+	<header>xmmintrin.h</header>
+</intrinsic>
+<intrinsic tech='SSE' sequence='true' rettype='__m128' name='_mm_load1_ps'>
+	<type>Floating Point</type>
+	<CPUID>SSE</CPUID>
+	<category>Load</category>
+	<parameter varname='mem_addr' type='float const*' />
+	<description>Load a single-precision (32-bit) floating-point element from memory into all elements of "dst".</description>
+	<operation>
+dst[31:0] := MEM[mem_addr+31:mem_addr]
+dst[63:32] := MEM[mem_addr+31:mem_addr]
+dst[95:64] := MEM[mem_addr+31:mem_addr]
+dst[127:96] := MEM[mem_addr+31:mem_addr]
+	</operation>
+	<header>xmmintrin.h</header>
+</intrinsic>
+<intrinsic tech='SSE' sequence='true' rettype='__m128' name='_mm_load_ps1'>
+	<type>Floating Point</type>
+	<CPUID>SSE</CPUID>
+	<category>Load</category>
+	<parameter varname='mem_addr' type='float const*' />
+	<description>Load a single-precision (32-bit) floating-point element from memory into all elements of "dst".</description>
+	<operation>
+dst[31:0] := MEM[mem_addr+31:mem_addr]
+dst[63:32] := MEM[mem_addr+31:mem_addr]
+dst[95:64] := MEM[mem_addr+31:mem_addr]
+dst[127:96] := MEM[mem_addr+31:mem_addr]
+	</operation>
+	<header>xmmintrin.h</header>
+</intrinsic>
+<intrinsic tech='SSE' vexEq='TRUE' rettype='__m128' name='_mm_load_ps'>
+	<type>Floating Point</type>
+	<CPUID>SSE</CPUID>
+	<category>Load</category>
+	<parameter varname='mem_addr' type='float const*' />
+	<description>Load 128-bits (composed of 4 packed single-precision (32-bit) floating-point elements) from memory into "dst".
+	"mem_addr" must be aligned on a 16-byte boundary or a general-protection exception may be generated.</description>
+	<operation>
+dst[127:0] := MEM[mem_addr+127:mem_addr]
+	</operation>
+	<instruction name='movaps' form='xmm, m128'/>
+	<header>xmmintrin.h</header>
+</intrinsic>
+<intrinsic tech='SSE' vexEq='TRUE' rettype='__m128' name='_mm_loadu_ps'>
+	<type>Floating Point</type>
+	<CPUID>SSE</CPUID>
+	<category>Load</category>
+	<parameter varname='mem_addr' type='float const*' />
+	<description>Load 128-bits (composed of 4 packed single-precision (32-bit) floating-point elements) from memory into "dst".
+	"mem_addr" does not need to be aligned on any particular boundary.</description>
+	<operation>
+dst[127:0] := MEM[mem_addr+127:mem_addr]
+	</operation>
+	<instruction name='movups' form='xmm, m128'/>
+	<header>xmmintrin.h</header>
+</intrinsic>
+<intrinsic tech='SSE' sequence='true' rettype='__m128' name='_mm_loadr_ps'>
+	<type>Floating Point</type>
+	<CPUID>SSE</CPUID>
+	<category>Load</category>
+	<parameter varname='mem_addr' type='float const*' />
+	<description>Load 4 single-precision (32-bit) floating-point elements from memory into "dst" in reverse order. "mem_addr" must be aligned on a 16-byte boundary or a general-protection exception may be generated.</description>
+	<operation>
+dst[31:0] := MEM[mem_addr+127:mem_addr+96]
+dst[63:32] := MEM[mem_addr+95:mem_addr+64]
+dst[95:64] := MEM[mem_addr+63:mem_addr+32]
+dst[127:96] := MEM[mem_addr+31:mem_addr]
+	</operation>
+	<header>xmmintrin.h</header>
+</intrinsic>
+<intrinsic tech='SSE' vexEq='TRUE' dontShowZeroUnmodMsg='TRUE' rettype='void' name='_mm_stream_ps'>
+	<type>Floating Point</type>
+	<CPUID>SSE</CPUID>
+	<category>Store</category>
+	<parameter varname='mem_addr' type='float*' />
+	<parameter varname='a' type='__m128' />
+		<description>Store 128-bits (composed of 4 packed single-precision (32-bit) floating-point elements) from "a" into memory using a non-temporal memory hint.
+	"mem_addr" must be aligned on a 16-byte boundary or a general-protection exception may be generated.</description>
+	<operation>
+MEM[mem_addr+127:mem_addr] := a[127:0]
+	</operation>
+	<instruction name='movntps' form='m128, xmm'/>
+	<header>xmmintrin.h</header>
+</intrinsic>
+<intrinsic tech='SSE' vexEq='TRUE' dontShowZeroUnmodMsg='TRUE' rettype='void' name='_mm_storeh_pi'>
+	<type>Integer</type>
+	<CPUID>SSE</CPUID>
+	<category>Store</category>
+	<parameter varname='mem_addr' type='__m64*' />
+	<parameter varname='a' type='__m128' />
+	<description>Store the upper 2 single-precision (32-bit) floating-point elements from "a" into memory.</description>
+	<operation>
+MEM[mem_addr+31:mem_addr] := a[95:64]
+MEM[mem_addr+63:mem_addr+32] := a[127:96]
+	</operation>
+	<instruction name='movhps' form='m64, xmm'/>
+	<header>xmmintrin.h</header>
+</intrinsic>
+<intrinsic tech='SSE' vexEq='TRUE' dontShowZeroUnmodMsg='TRUE' rettype='void' name='_mm_storel_pi'>
+	<type>Integer</type>
+	<CPUID>SSE</CPUID>
+	<category>Store</category>
+	<parameter varname='mem_addr' type='__m64*' />
+	<parameter varname='a' type='__m128' />
+	<description>Store the lower 2 single-precision (32-bit) floating-point elements from "a" into memory.</description>
+	<operation>
+MEM[mem_addr+31:mem_addr] := a[31:0]
+MEM[mem_addr+63:mem_addr+32] := a[63:32]
+	</operation>
+	<instruction name='movlps' form='m64, xmm'/>
+	<header>xmmintrin.h</header>
+</intrinsic>
+<intrinsic tech='SSE' vexEq='TRUE' dontShowZeroUnmodMsg='TRUE' rettype='void' name='_mm_store_ss'>
+	<type>Floating Point</type>
+	<CPUID>SSE</CPUID>
+	<category>Store</category>
+	<parameter varname='mem_addr' type='float*' />
+	<parameter varname='a' type='__m128' />
+	<description>Store the lower single-precision (32-bit) floating-point element from "a" into memory. "mem_addr" does not need to be aligned on any particular boundary.</description>
+	<operation>
+MEM[mem_addr+31:mem_addr] := a[31:0]
+	</operation>
+	<instruction name='movss' form='m32, xmm'/>
+	<header>xmmintrin.h</header>
+</intrinsic>
+<intrinsic tech='SSE' sequence='true' rettype='void' name='_mm_store1_ps'>
+	<type>Floating Point</type>
+	<CPUID>SSE</CPUID>
+	<category>Store</category>
+	<parameter varname='mem_addr' type='float*' />
+	<parameter varname='a' type='__m128' />
+	<description>Store the lower single-precision (32-bit) floating-point element from "a" into 4 contiguous elements in memory. "mem_addr" must be aligned on a 16-byte boundary or a general-protection exception may be generated.</description>
+	<operation>
+MEM[mem_addr+31:mem_addr] := a[31:0]
+MEM[mem_addr+63:mem_addr+32] := a[31:0]
+MEM[mem_addr+95:mem_addr+64] := a[31:0]
+MEM[mem_addr+127:mem_addr+96] := a[31:0]
+	</operation>
+	<header>xmmintrin.h</header>
+</intrinsic>
+<intrinsic tech='SSE' sequence='true' rettype='void' name='_mm_store_ps1'>
+	<type>Floating Point</type>
+	<CPUID>SSE</CPUID>
+	<category>Store</category>
+	<parameter varname='mem_addr' type='float*' />
+	<parameter varname='a' type='__m128' />
+	<description>Store the lower single-precision (32-bit) floating-point element from "a" into 4 contiguous elements in memory. "mem_addr" must be aligned on a 16-byte boundary or a general-protection exception may be generated.</description>
+	<operation>
+MEM[mem_addr+31:mem_addr] := a[31:0]
+MEM[mem_addr+63:mem_addr+32] := a[31:0]
+MEM[mem_addr+95:mem_addr+64] := a[31:0]
+MEM[mem_addr+127:mem_addr+96] := a[31:0]
+	</operation>
+	<header>xmmintrin.h</header>
+</intrinsic>
+<intrinsic tech='SSE' rettype='void' name='_mm_store_ps'>
+	<type>Floating Point</type>
+	<CPUID>SSE</CPUID>
+	<category>Store</category>
+	<parameter varname='mem_addr' type='float*' />
+	<parameter varname='a' type='__m128' />
+	<description>Store 128-bits (composed of 4 packed single-precision (32-bit) floating-point elements) from "a" into memory.
+	"mem_addr" must be aligned on a 16-byte boundary or a general-protection exception may be generated.</description>
+	<operation>
+MEM[mem_addr+127:mem_addr] := a[127:0]
+	</operation>
+	<instruction name='movaps' form='m128, xmm'/>
+	<header>xmmintrin.h</header>
+</intrinsic>
+<intrinsic tech='SSE' vexEq='TRUE' dontShowZeroUnmodMsg='TRUE' rettype='void' name='_mm_storeu_ps'>
+	<type>Floating Point</type>
+	<CPUID>SSE</CPUID>
+	<category>Store</category>
+	<parameter varname='mem_addr' type='float*' />
+	<parameter varname='a' type='__m128' />
+	<description>Store 128-bits (composed of 4 packed single-precision (32-bit) floating-point elements) from "a" into memory.
+	"mem_addr" does not need to be aligned on any particular boundary.</description>
+	<operation>
+MEM[mem_addr+127:mem_addr] := a[127:0]
+	</operation>
+	<instruction name='movups' form='m128, xmm'/>
+	<header>xmmintrin.h</header>
+</intrinsic>
+<intrinsic tech='SSE' sequence='true' rettype='void' name='_mm_storer_ps'>
+	<type>Floating Point</type>
+	<CPUID>SSE</CPUID>
+	<category>Store</category>
+	<parameter varname='mem_addr' type='float*' />
+	<parameter varname='a' type='__m128' />
+	<description>Store 4 single-precision (32-bit) floating-point elements from "a" into memory in reverse order.
+	"mem_addr" must be aligned on a 16-byte boundary or a general-protection exception may be generated.</description>
+	<operation>
+MEM[mem_addr+31:mem_addr] := a[127:96]
+MEM[mem_addr+63:mem_addr+32] := a[95:64]
+MEM[mem_addr+95:mem_addr+64] := a[63:32]
+MEM[mem_addr+127:mem_addr+96] := a[31:0]
+	</operation>
+	<instruction name='movups' form='m128, xmm'/>
+	<header>xmmintrin.h</header>
+</intrinsic>
+<intrinsic tech='SSE' vexEq='TRUE' rettype='__m128' name='_mm_move_ss'>
+	<type>Floating Point</type>
+	<CPUID>SSE</CPUID>
+	<category>Move</category>
+	<parameter varname='a' type='__m128' />
+	<parameter varname='b' type='__m128' />
+	<description>Move the lower single-precision (32-bit) floating-point element from "b" to the lower element of "dst", and copy the upper 3 elements from "a" to the upper elements of "dst".</description>
+	<operation>
+dst[31:0] := b[31:0]
+dst[63:32] := a[63:32]
+dst[95:64] := a[95:64]
+dst[127:96] := a[127:96]
+	</operation>
+	<instruction name='movss' form='xmm, xmm'/>
+	<header>xmmintrin.h</header>
+</intrinsic>
+<intrinsic tech='SSE' vexEq='TRUE' rettype='__m128' name='_mm_shuffle_ps'>
+	<type>Floating Point</type>
+	<CPUID>SSE</CPUID>
+	<category>Swizzle</category>
+	<parameter varname='a' type='__m128' />
+	<parameter varname='b' type='__m128' />
+	<parameter varname="imm8" type='unsigned int' />
+	<description>Shuffle single-precision (32-bit) floating-point elements using the control in "imm8": the lower 2 elements of "dst" are selected from "a" and the upper 2 elements are selected from "b". Store the results in "dst".</description>
+	<operation>
+SELECT4(src, control){
+	CASE(control[1:0])
+	0:	tmp[31:0] := src[31:0]
+	1:	tmp[31:0] := src[63:32]
+	2:	tmp[31:0] := src[95:64]
+	3:	tmp[31:0] := src[127:96]
+	ESAC
+	RETURN tmp[31:0]
+}
+
+dst[31:0] := SELECT4(a[127:0], imm8[1:0])
+dst[63:32] := SELECT4(a[127:0], imm8[3:2])
+dst[95:64] := SELECT4(b[127:0], imm8[5:4])
+dst[127:96] := SELECT4(b[127:0], imm8[7:6])
+	</operation>
+	<instruction name='shufps' form='xmm, xmm, imm'/>
+	<header>xmmintrin.h</header>
+</intrinsic>
+<intrinsic tech='SSE' vexEq='TRUE' rettype='__m128' name='_mm_unpackhi_ps'>
+	<type>Floating Point</type>
+	<CPUID>SSE</CPUID>
+	<category>Swizzle</category>
+	<parameter varname='a' type='__m128' />
+	<parameter varname='b' type='__m128' />
+	<description>Unpack and interleave single-precision (32-bit) floating-point elements from the high half of "a" and "b", and store the results in "dst". </description>
+	<operation>
+INTERLEAVE_HIGH_DWORDS(src1[127:0], src2[127:0]){
+	dst[31:0] := src1[95:64] 
+	dst[63:32] := src2[95:64] 
+	dst[95:64] := src1[127:96] 
+	dst[127:96] := src2[127:96] 
+	RETURN dst[127:0]
+}	
+
+dst[127:0] := INTERLEAVE_HIGH_DWORDS(a[127:0], b[127:0])
+	</operation>
+	<instruction name='unpckhps' form='xmm, xmm'/>
+	<header>xmmintrin.h</header>
+</intrinsic>
+<intrinsic tech='SSE' vexEq='TRUE' rettype='__m128' name='_mm_unpacklo_ps'>
+	<type>Floating Point</type>
+	<CPUID>SSE</CPUID>
+	<category>Swizzle</category>
+	<parameter varname='a' type='__m128' />
+	<parameter varname='b' type='__m128' />
+	<description>Unpack and interleave single-precision (32-bit) floating-point elements from the low half of "a" and "b", and store the results in "dst". </description>
+	<operation>
+INTERLEAVE_DWORDS(src1[127:0], src2[127:0]){
+	dst[31:0] := src1[31:0] 
+	dst[63:32] := src2[31:0] 
+	dst[95:64] := src1[63:32] 
+	dst[127:96] := src2[63:32] 
+	RETURN dst[127:0]
+}	
+
+dst[127:0] := INTERLEAVE_DWORDS(a[127:0], b[127:0])
+	</operation>
+	<instruction name='unpcklps' form='xmm, xmm'/>
+	<header>xmmintrin.h</header>
+</intrinsic>
+<intrinsic tech='SSE' vexEq='TRUE' rettype='__m128' name='_mm_movehl_ps'>
+	<type>Floating Point</type>
+	<CPUID>SSE</CPUID>
+	<category>Move</category>
+	<parameter varname='a' type='__m128' />
+	<parameter varname='b' type='__m128' />
+	<description>Move the upper 2 single-precision (32-bit) floating-point elements from "b" to the lower 2 elements of "dst", and copy the upper 2 elements from "a" to the upper 2 elements of "dst".</description>
+	<operation>
+dst[31:0] := b[95:64]
+dst[63:32] := b[127:96]
+dst[95:64] := a[95:64]
+dst[127:96] := a[127:96]
+	</operation>
+	<instruction name='movhlps' form='xmm, xmm'/>
+	<header>xmmintrin.h</header>
+</intrinsic>
+<intrinsic tech='SSE' vexEq='TRUE' rettype='__m128' name='_mm_movelh_ps'>
+	<type>Floating Point</type>
+	<CPUID>SSE</CPUID>
+	<category>Move</category>
+	<parameter varname='a' type='__m128' />
+	<parameter varname='b' type='__m128' />
+	<description>Move the lower 2 single-precision (32-bit) floating-point elements from "b" to the upper 2 elements of "dst", and copy the lower 2 elements from "a" to the lower 2 elements of "dst".</description>
+	<operation>
+dst[31:0] := a[31:0]
+dst[63:32] := a[63:32]
+dst[95:64] := b[31:0]
+dst[127:96] := b[63:32]
+	</operation>
+	<instruction name='movlhps' form='xmm, xmm'/>
+	<header>xmmintrin.h</header>
+</intrinsic>
+<intrinsic tech='SSE' vexEq='TRUE' dontShowZeroUnmodMsg='TRUE' rettype='int' name='_mm_movemask_ps'>
+	<type>Floating Point</type>
+	<CPUID>SSE</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname='a' type='__m128' />
+	<description>Set each bit of mask "dst" based on the most significant bit of the corresponding packed single-precision (32-bit) floating-point element in "a".</description>
+	<operation>
+FOR j := 0 to 3
+	i := j*32
+	IF a[i+31]
+		dst[j] := 1
+	ELSE
+		dst[j] := 0
+	FI
+ENDFOR
+dst[MAX:4] := 0
+	</operation>
+	<instruction name='movmskps' form='r32, xmm'/>
+	<header>xmmintrin.h</header>
+</intrinsic>
+<intrinsic tech='Other' rettype='void*' name='_mm_malloc'>
+	<CPUID>SSE</CPUID>
+	<category>General Support</category>
+	<parameter varname='size' type='size_t'/>
+	<parameter varname='align' type='size_t'/>
+	<description>Allocate "size" bytes of memory, aligned to the alignment specified in "align", and return a pointer to the allocated memory. "_mm_free" should be used to free memory that is allocated with "_mm_malloc".</description>
+	<header>xmmintrin.h</header>
+</intrinsic>
+<intrinsic tech='Other' rettype='void' name='_mm_free'>
+	<CPUID>SSE</CPUID>
+	<category>General Support</category>
+	<parameter varname='mem_addr' type='void *'/>
+	<description>Free aligned memory that was allocated with "_mm_malloc".</description>
+	<header>xmmintrin.h</header>
+</intrinsic>
+
+<intrinsic tech='SSE2' rettype='void' name='_mm_pause'>
+	<CPUID>SSE2</CPUID>
+	<category>General Support</category>
+	<parameter varname='' type='void'/>
+	<description>Provide a hint to the processor that the code sequence is a spin-wait loop. This can help improve the performance and power consumption of spin-wait loops.</description>
+	<instruction name='pause' form=''/>
+	<header>emmintrin.h</header>
+</intrinsic>
+<intrinsic tech='SSE2' rettype='void' name='_mm_clflush'>
+	<CPUID>SSE2</CPUID>
+	<category>General Support</category>
+	<parameter varname='p' type='void const*'/>
+	<description>Invalidate and flush the cache line that contains "p" from all levels of the cache hierarchy.</description>
+	<instruction name='clflush' form='mprefetch'/>
+	<header>emmintrin.h</header>
+</intrinsic>
+<intrinsic tech='SSE2' rettype='void' name='_mm_lfence'>
+	<CPUID>SSE2</CPUID>
+	<category>General Support</category>
+	<parameter varname='' type='void'/>
+	<description>Perform a serializing operation on all load-from-memory instructions that were issued prior to this instruction. Guarantees that every load instruction that precedes, in program order, the load fence instruction is globally visible before any load instruction which follows the fence in program order.</description>
+	<instruction name='lfence' form=''/>
+	<header>emmintrin.h</header>
+</intrinsic>
+<intrinsic tech='SSE2' rettype='void' name='_mm_mfence'>
+	<CPUID>SSE2</CPUID>
+	<category>General Support</category>
+	<parameter varname='' type='void'/>
+	<description>Perform a serializing operation on all load-from-memory and store-to-memory instructions that were issued prior to this instruction. Guarantees that every memory access that precedes, in program order, the memory fence instruction is globally visible before any memory instruction which follows the fence in program order.</description>
+	<instruction name='mfence' form=''/>
+	<header>emmintrin.h</header>
+</intrinsic>
+<intrinsic tech='SSE2' vexEq='TRUE' rettype='__m128i' name='_mm_add_epi8'>
+	<type>Integer</type>
+	<CPUID>SSE2</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname='a' type='__m128i'/>
+	<parameter varname='b' type='__m128i'/>
+	<description>Add packed 8-bit integers in "a" and "b", and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 15
+	i := j*8
+	dst[i+7:i] := a[i+7:i] + b[i+7:i]
+ENDFOR
+	</operation>
+	<instruction name='paddb' form='xmm, xmm'/>
+	<header>emmintrin.h</header>
+</intrinsic>
+<intrinsic tech='SSE2' vexEq='TRUE' rettype='__m128i' name='_mm_add_epi16'>
+	<type>Integer</type>
+	<CPUID>SSE2</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname='a' type='__m128i'/>
+	<parameter varname='b' type='__m128i'/>
+	<description>Add packed 16-bit integers in "a" and "b", and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*16
+	dst[i+15:i] := a[i+15:i] + b[i+15:i]
+ENDFOR
+	</operation>
+	<instruction name='paddw' form='xmm, xmm'/>
+	<header>emmintrin.h</header>
+</intrinsic>
+<intrinsic tech='SSE2' vexEq='TRUE' rettype='__m128i' name='_mm_add_epi32'>
+	<type>Integer</type>
+	<CPUID>SSE2</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname='a' type='__m128i'/>
+	<parameter varname='b' type='__m128i'/>
+	<description>Add packed 32-bit integers in "a" and "b", and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 3
+	i := j*32
+	dst[i+31:i] := a[i+31:i] + b[i+31:i]
+ENDFOR
+	</operation>
+	<instruction name='paddd' form='xmm, xmm'/>
+	<header>emmintrin.h</header>
+</intrinsic>
+<intrinsic tech='SSE2' rettype='__m64' name='_mm_add_si64'>
+	<type>Integer</type>
+	<CPUID>SSE2</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname='a' type='__m64'/>
+	<parameter varname='b' type='__m64'/>
+	<description>Add 64-bit integers "a" and "b", and store the result in "dst".</description>
+	<operation>
+dst[63:0] := a[63:0] + b[63:0]
+	</operation>
+	<instruction name='paddq' form='mm, mm'/>
+	<header>emmintrin.h</header>
+</intrinsic>
+<intrinsic tech='SSE2' vexEq='TRUE' rettype='__m128i' name='_mm_add_epi64'>
+	<type>Integer</type>
+	<CPUID>SSE2</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname='a' type='__m128i'/>
+	<parameter varname='b' type='__m128i'/>
+	<description>Add packed 64-bit integers in "a" and "b", and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 1
+	i := j*64
+	dst[i+63:i] := a[i+63:i] + b[i+63:i]
+ENDFOR
+	</operation>
+	<instruction name='paddq' form='xmm, xmm'/>
+	<header>emmintrin.h</header>
+</intrinsic>
+<intrinsic tech='SSE2' vexEq='TRUE' rettype='__m128i' name='_mm_adds_epi8'>
+	<type>Integer</type>
+	<CPUID>SSE2</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname='a' type='__m128i'/>
+	<parameter varname='b' type='__m128i'/>
+	<description>Add packed 8-bit integers in "a" and "b" using saturation, and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 15
+	i := j*8
+	dst[i+7:i] := Saturate_To_Int8( a[i+7:i] + b[i+7:i] )
+ENDFOR
+	</operation>
+	<instruction name='paddsb' form='xmm, xmm'/>
+	<header>emmintrin.h</header>
+</intrinsic>
+<intrinsic tech='SSE2' vexEq='TRUE' rettype='__m128i' name='_mm_adds_epi16'>
+	<type>Integer</type>
+	<CPUID>SSE2</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname='a' type='__m128i'/>
+	<parameter varname='b' type='__m128i'/>
+	<description>Add packed 16-bit integers in "a" and "b" using saturation, and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*16
+	dst[i+15:i] := Saturate_To_Int16( a[i+15:i] + b[i+15:i] )
+ENDFOR
+	</operation>
+	<instruction name='paddsw' form='xmm, xmm'/>
+	<header>emmintrin.h</header>
+</intrinsic>
+<intrinsic tech='SSE2' vexEq='TRUE' rettype='__m128i' name='_mm_adds_epu8'>
+	<type>Integer</type>
+	<CPUID>SSE2</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname='a' type='__m128i'/>
+	<parameter varname='b' type='__m128i'/>
+	<description>Add packed unsigned 8-bit integers in "a" and "b" using saturation, and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 15
+	i := j*8
+	dst[i+7:i] := Saturate_To_UnsignedInt8( a[i+7:i] + b[i+7:i] )
+ENDFOR
+	</operation>
+	<instruction name='paddusb' form='xmm, xmm'/>
+	<header>emmintrin.h</header>
+</intrinsic>
+<intrinsic tech='SSE2' vexEq='TRUE' rettype='__m128i' name='_mm_adds_epu16'>
+	<type>Integer</type>
+	<CPUID>SSE2</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname='a' type='__m128i'/>
+	<parameter varname='b' type='__m128i'/>
+	<description>Add packed unsigned 16-bit integers in "a" and "b" using saturation, and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*16
+	dst[i+15:i] := Saturate_To_UnsignedInt16( a[i+15:i] + b[i+15:i] )
+ENDFOR
+	</operation>
+	<instruction name='paddusw' form='xmm, xmm'/>
+	<header>emmintrin.h</header>
+</intrinsic>
+<intrinsic tech='SSE2' vexEq='TRUE' rettype='__m128i' name='_mm_avg_epu8'>
+	<type>Integer</type>
+	<CPUID>SSE2</CPUID>
+	<category>Probability/Statistics</category>
+	<parameter varname='a' type='__m128i'/>
+	<parameter varname='b' type='__m128i'/>
+	<description>Average packed unsigned 8-bit integers in "a" and "b", and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 15
+	i := j*8
+	dst[i+7:i] := (a[i+7:i] + b[i+7:i] + 1) &gt;&gt; 1
+ENDFOR
+	</operation>
+	<instruction name='pavgb' form='xmm, xmm'/>
+	<header>emmintrin.h</header>
+</intrinsic>
+<intrinsic tech='SSE2' vexEq='TRUE' rettype='__m128i' name='_mm_avg_epu16'>
+	<type>Integer</type>
+	<CPUID>SSE2</CPUID>
+	<category>Probability/Statistics</category>
+	<parameter varname='a' type='__m128i'/>
+	<parameter varname='b' type='__m128i'/>
+	<description>Average packed unsigned 16-bit integers in "a" and "b", and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*16
+	dst[i+15:i] := (a[i+15:i] + b[i+15:i] + 1) &gt;&gt; 1
+ENDFOR
+	</operation>
+	<instruction name='pavgw' form='xmm, xmm'/>
+	<header>emmintrin.h</header>
+</intrinsic>
+<intrinsic tech='SSE2' vexEq='TRUE' rettype='__m128i' name='_mm_madd_epi16'>
+	<type>Integer</type>
+	<CPUID>SSE2</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname='a' type='__m128i'/>
+	<parameter varname='b' type='__m128i'/>
+	<description>Multiply packed signed 16-bit integers in "a" and "b", producing intermediate signed 32-bit integers. Horizontally add adjacent pairs of intermediate 32-bit integers, and pack the results in "dst".
+	</description>
+	<operation>
+FOR j := 0 to 3
+	i := j*32
+	dst[i+31:i] := a[i+31:i+16]*b[i+31:i+16] + a[i+15:i]*b[i+15:i]
+ENDFOR
+	</operation>
+	<instruction name='pmaddwd' form='xmm, xmm'/>
+	<header>emmintrin.h</header>
+</intrinsic>
+<intrinsic tech='SSE2' vexEq='TRUE' rettype='__m128i' name='_mm_max_epi16'>
+	<type>Integer</type>
+	<CPUID>SSE2</CPUID>
+	<category>Special Math Functions</category>
+	<parameter varname='a' type='__m128i'/>
+	<parameter varname='b' type='__m128i'/>
+	<description>Compare packed 16-bit integers in "a" and "b", and store packed maximum values in "dst".</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*16
+	IF a[i+15:i] &gt; b[i+15:i]
+		dst[i+15:i] := a[i+15:i]
+	ELSE
+		dst[i+15:i] := b[i+15:i]
+	FI
+ENDFOR
+	</operation>
+	<instruction name='pmaxsw' form='xmm, xmm'/>
+	<header>emmintrin.h</header>
+</intrinsic>
+<intrinsic tech='SSE2' vexEq='TRUE' rettype='__m128i' name='_mm_max_epu8'>
+	<type>Integer</type>
+	<CPUID>SSE2</CPUID>
+	<category>Special Math Functions</category>
+	<parameter varname='a' type='__m128i'/>
+	<parameter varname='b' type='__m128i'/>
+	<description>Compare packed unsigned 8-bit integers in "a" and "b", and store packed maximum values in "dst".</description>
+	<operation>
+FOR j := 0 to 15
+	i := j*8
+	IF a[i+7:i] &gt; b[i+7:i]
+		dst[i+7:i] := a[i+7:i]
+	ELSE
+		dst[i+7:i] := b[i+7:i]
+	FI
+ENDFOR
+	</operation>
+	<instruction name='pmaxub' form='xmm, xmm'/>
+	<header>emmintrin.h</header>
+</intrinsic>
+<intrinsic tech='SSE2' vexEq='TRUE' rettype='__m128i' name='_mm_min_epi16'>
+	<type>Integer</type>
+	<CPUID>SSE2</CPUID>
+	<category>Special Math Functions</category>
+	<parameter varname='a' type='__m128i'/>
+	<parameter varname='b' type='__m128i'/>
+	<description>Compare packed 16-bit integers in "a" and "b", and store packed minimum values in "dst".</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*16
+	IF a[i+15:i] &lt; b[i+15:i]
+		dst[i+15:i] := a[i+15:i]
+	ELSE
+		dst[i+15:i] := b[i+15:i]
+	FI
+ENDFOR
+	</operation>
+	<instruction name='pminsw' form='xmm, xmm'/>
+	<header>emmintrin.h</header>
+</intrinsic>
+<intrinsic tech='SSE2' vexEq='TRUE' rettype='__m128i' name='_mm_min_epu8'>
+	<type>Integer</type>
+	<CPUID>SSE2</CPUID>
+	<category>Special Math Functions</category>
+	<parameter varname='a' type='__m128i'/>
+	<parameter varname='b' type='__m128i'/>
+	<description>Compare packed unsigned 8-bit integers in "a" and "b", and store packed minimum values in "dst".</description>
+	<operation>
+FOR j := 0 to 15
+	i := j*8
+	IF a[i+7:i] &lt; b[i+7:i]
+		dst[i+7:i] := a[i+7:i]
+	ELSE
+		dst[i+7:i] := b[i+7:i]
+	FI
+ENDFOR
+	</operation>
+	<instruction name='pminub' form='xmm, xmm'/>
+	<header>emmintrin.h</header>
+</intrinsic>
+<intrinsic tech='SSE2' vexEq='TRUE' rettype='__m128i' name='_mm_mulhi_epi16'>
+	<type>Integer</type>
+	<CPUID>SSE2</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname='a' type='__m128i'/>
+	<parameter varname='b' type='__m128i'/>
+	<description>Multiply the packed 16-bit integers in "a" and "b", producing intermediate 32-bit integers, and store the high 16 bits of the intermediate integers in "dst". </description>
+	<operation>
+FOR j := 0 to 7
+	i := j*16
+	tmp[31:0] := a[i+15:i] * b[i+15:i]
+	dst[i+15:i] := tmp[31:16]
+ENDFOR
+	</operation>
+	<instruction name='pmulhw' form='xmm, xmm'/>
+	<header>emmintrin.h</header>
+</intrinsic>
+<intrinsic tech='SSE2' vexEq='TRUE' rettype='__m128i' name='_mm_mulhi_epu16'>
+	<type>Integer</type>
+	<CPUID>SSE2</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname='a' type='__m128i'/>
+	<parameter varname='b' type='__m128i'/>
+	<description>Multiply the packed unsigned 16-bit integers in "a" and "b", producing intermediate 32-bit integers, and store the high 16 bits of the intermediate integers in "dst". </description>
+	<operation>
+FOR j := 0 to 7
+	i := j*16
+	tmp[31:0] := a[i+15:i] * b[i+15:i]
+	dst[i+15:i] := tmp[31:16]
+ENDFOR
+	</operation>
+	<instruction name='pmulhuw' form='xmm, xmm'/>
+	<header>emmintrin.h</header>
+</intrinsic>
+<intrinsic tech='SSE2' vexEq='TRUE' rettype='__m128i' name='_mm_mullo_epi16'>
+	<type>Integer</type>
+	<CPUID>SSE2</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname='a' type='__m128i'/>
+	<parameter varname='b' type='__m128i'/>
+	<description>Multiply the packed 16-bit integers in "a" and "b", producing intermediate 32-bit integers, and store the low 16 bits of the intermediate integers in "dst". </description>
+	<operation>
+FOR j := 0 to 7
+	i := j*16
+	tmp[31:0] := a[i+15:i] * b[i+15:i]
+	dst[i+15:i] := tmp[15:0]
+ENDFOR
+	</operation>
+	<instruction name='pmullw' form='xmm, xmm'/>
+	<header>emmintrin.h</header>
+</intrinsic>
+<intrinsic tech='SSE2' rettype='__m64' name='_mm_mul_su32'>
+	<type>Integer</type>
+	<CPUID>SSE2</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname='a' type='__m64'/>
+	<parameter varname='b' type='__m64'/>
+	<description>Multiply the low unsigned 32-bit integers from "a" and "b", and store the unsigned 64-bit result in "dst". </description>
+	<operation>
+dst[63:0] := a[31:0] * b[31:0]
+	</operation>
+	<instruction name='pmuludq' form='mm, mm'/>
+	<header>emmintrin.h</header>
+</intrinsic>
+<intrinsic tech='SSE2' vexEq='TRUE' rettype='__m128i' name='_mm_mul_epu32'>
+	<type>Integer</type>
+	<CPUID>SSE2</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname='a' type='__m128i'/>
+	<parameter varname='b' type='__m128i'/>
+	<description>Multiply the low unsigned 32-bit integers from each packed 64-bit element in "a" and "b", and store the unsigned 64-bit results in "dst". </description>
+	<operation>
+FOR j := 0 to 1
+	i := j*64
+	dst[i+63:i] := a[i+31:i] * b[i+31:i]
+ENDFOR
+	</operation>
+	<instruction name='pmuludq' form='xmm, xmm'/>
+	<header>emmintrin.h</header>
+</intrinsic>
+<intrinsic tech='SSE2' vexEq='TRUE' rettype='__m128i' name='_mm_sad_epu8'>
+	<type>Integer</type>
+	<CPUID>SSE2</CPUID>
+	<category>Arithmetic</category>
+	<category>Miscellaneous</category>
+	<parameter varname='a' type='__m128i'/>
+	<parameter varname='b' type='__m128i'/>
+	<description>Compute the absolute differences of packed unsigned 8-bit integers in "a" and "b", then horizontally sum each consecutive 8 differences to produce two unsigned 16-bit integers, and pack these unsigned 16-bit integers in the low 16 bits of 64-bit elements in "dst".</description>
+	<operation>
+FOR j := 0 to 15
+	i := j*8
+	tmp[i+7:i] := ABS(a[i+7:i] - b[i+7:i])
+ENDFOR
+FOR j := 0 to 1
+	i := j*64
+	dst[i+15:i] := tmp[i+7:i] + tmp[i+15:i+8] + tmp[i+23:i+16] + tmp[i+31:i+24] +
+	               tmp[i+39:i+32] + tmp[i+47:i+40] + tmp[i+55:i+48] + tmp[i+63:i+56]
+	dst[i+63:i+16] := 0
+ENDFOR
+	</operation>
+	<instruction name='psadbw' form='xmm, xmm'/>
+	<header>emmintrin.h</header>
+</intrinsic>
+<intrinsic tech='SSE2' vexEq='TRUE' rettype='__m128i' name='_mm_sub_epi8'>
+	<type>Integer</type>
+	<CPUID>SSE2</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname='a' type='__m128i'/>
+	<parameter varname='b' type='__m128i'/>
+	<description>Subtract packed 8-bit integers in "b" from packed 8-bit integers in "a", and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 15
+	i := j*8
+	dst[i+7:i] := a[i+7:i] - b[i+7:i]
+ENDFOR
+	</operation>
+	<instruction name='psubb' form='xmm, xmm'/>
+	<header>emmintrin.h</header>
+</intrinsic>
+<intrinsic tech='SSE2' vexEq='TRUE' rettype='__m128i' name='_mm_sub_epi16'>
+	<type>Integer</type>
+	<CPUID>SSE2</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname='a' type='__m128i'/>
+	<parameter varname='b' type='__m128i'/>
+	<description>Subtract packed 16-bit integers in "b" from packed 16-bit integers in "a", and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*16
+	dst[i+15:i] := a[i+15:i] - b[i+15:i]
+ENDFOR
+	</operation>
+	<instruction name='psubw' form='xmm, xmm'/>
+	<header>emmintrin.h</header>
+</intrinsic>
+<intrinsic tech='SSE2' vexEq='TRUE' rettype='__m128i' name='_mm_sub_epi32'>
+	<type>Integer</type>
+	<CPUID>SSE2</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname='a' type='__m128i'/>
+	<parameter varname='b' type='__m128i'/>
+	<description>Subtract packed 32-bit integers in "b" from packed 32-bit integers in "a", and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 3
+	i := j*32
+	dst[i+31:i] := a[i+31:i] - b[i+31:i]
+ENDFOR
+	</operation>
+	<instruction name='psubd' form='xmm, xmm'/>
+	<header>emmintrin.h</header>
+</intrinsic>
+<intrinsic tech='SSE2' vexEq='TRUE' rettype='__m64' name='_mm_sub_si64'>
+	<type>Integer</type>
+	<CPUID>SSE2</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname='a' type='__m64'/>
+	<parameter varname='b' type='__m64'/>
+	<description>Subtract 64-bit integer "b" from 64-bit integer "a", and store the result in "dst".</description>
+	<operation>
+dst[63:0] := a[63:0] - b[63:0]
+	</operation>
+	<instruction name='psubq' form='mm, mm'/>
+	<header>emmintrin.h</header>
+</intrinsic>
+<intrinsic tech='SSE2' vexEq='TRUE' rettype='__m128i' name='_mm_sub_epi64'>
+	<type>Integer</type>
+	<CPUID>SSE2</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname='a' type='__m128i'/>
+	<parameter varname='b' type='__m128i'/>
+	<description>Subtract packed 64-bit integers in "b" from packed 64-bit integers in "a", and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 1
+	i := j*64
+	dst[i+63:i] := a[i+63:i] - b[i+63:i]
+ENDFOR
+	</operation>
+	<instruction name='psubq' form='xmm, xmm'/>
+	<header>emmintrin.h</header>
+</intrinsic>
+<intrinsic tech='SSE2' vexEq='TRUE' rettype='__m128i' name='_mm_subs_epi8'>
+	<type>Integer</type>
+	<CPUID>SSE2</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname='a' type='__m128i'/>
+	<parameter varname='b' type='__m128i'/>
+	<description>Subtract packed 8-bit integers in "b" from packed 8-bit integers in "a" using saturation, and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 15
+	i := j*8
+	dst[i+7:i] := Saturate_To_Int8(a[i+7:i] - b[i+7:i])	
+ENDFOR
+	</operation>
+	<instruction name='psubsb' form='xmm, xmm'/>
+	<header>emmintrin.h</header>
+</intrinsic>
+<intrinsic tech='SSE2' vexEq='TRUE' rettype='__m128i' name='_mm_subs_epi16'>
+	<type>Integer</type>
+	<CPUID>SSE2</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname='a' type='__m128i'/>
+	<parameter varname='b' type='__m128i'/>
+	<description>Subtract packed 16-bit integers in "b" from packed 16-bit integers in "a" using saturation, and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*16
+	dst[i+15:i] := Saturate_To_Int16(a[i+15:i] - b[i+15:i])
+ENDFOR
+	</operation>
+	<instruction name='psubsw' form='xmm, xmm'/>
+	<header>emmintrin.h</header>
+</intrinsic>
+<intrinsic tech='SSE2' vexEq='TRUE' rettype='__m128i' name='_mm_subs_epu8'>
+	<type>Integer</type>
+	<CPUID>SSE2</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname='a' type='__m128i'/>
+	<parameter varname='b' type='__m128i'/>
+	<description>Subtract packed unsigned 8-bit integers in "b" from packed unsigned 8-bit integers in "a" using saturation, and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 15
+	i := j*8
+	dst[i+7:i] := Saturate_To_UnsignedInt8(a[i+7:i] - b[i+7:i])	
+ENDFOR
+	</operation>
+	<instruction name='psubusb' form='xmm, xmm'/>
+	<header>emmintrin.h</header>
+</intrinsic>
+<intrinsic tech='SSE2' vexEq='TRUE' rettype='__m128i' name='_mm_subs_epu16'>
+	<type>Integer</type>
+	<CPUID>SSE2</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname='a' type='__m128i'/>
+	<parameter varname='b' type='__m128i'/>
+	<description>Subtract packed unsigned 16-bit integers in "b" from packed unsigned 16-bit integers in "a" using saturation, and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*16
+	dst[i+15:i] := Saturate_To_UnsignedInt16(a[i+15:i] - b[i+15:i])	
+ENDFOR
+	</operation>
+	<instruction name='psubusw' form='xmm, xmm'/>
+	<header>emmintrin.h</header>
+</intrinsic>
+<intrinsic tech='SSE2' vexEq='TRUE' rettype='__m128i' name='_mm_slli_si128'>
+	<type>Integer</type>
+	<CPUID>SSE2</CPUID>
+	<category>Shift</category>
+	<parameter varname='a' type='__m128i'/>
+	<parameter varname="imm8" type='int'/>
+	<description>Shift "a" left by "imm8" bytes while shifting in zeros, and store the results in "dst".</description>
+	<operation>
+tmp := imm8[7:0]
+IF tmp &gt; 15
+	tmp := 16
+FI
+dst[127:0] := a[127:0] &lt;&lt; (tmp*8)
+	</operation>
+	<instruction name='pslldq' form='xmm, imm'/>
+	<header>emmintrin.h</header>
+</intrinsic>
+<intrinsic tech='SSE2' vexEq='TRUE' rettype='__m128i' name='_mm_bslli_si128'>
+	<type>Integer</type>
+	<CPUID>SSE2</CPUID>
+	<category>Shift</category>
+	<parameter varname='a' type='__m128i'/>
+	<parameter varname="imm8" type='int'/>
+	<description>Shift "a" left by "imm8" bytes while shifting in zeros, and store the results in "dst".</description>
+	<operation>
+tmp := imm8[7:0]
+IF tmp &gt; 15
+	tmp := 16
+FI
+dst[127:0] := a[127:0] &lt;&lt; (tmp*8)
+	</operation>
+	<instruction name='pslldq' form='xmm, imm'/>
+	<header>emmintrin.h</header>
+</intrinsic>
+<intrinsic tech='SSE2' vexEq='TRUE' rettype='__m128i' name='_mm_bsrli_si128'>
+	<type>Integer</type>
+	<CPUID>SSE2</CPUID>
+	<category>Shift</category>
+	<parameter varname='a' type='__m128i'/>
+	<parameter varname="imm8" type='int'/>
+	<description>Shift "a" right by "imm8" bytes while shifting in zeros, and store the results in "dst".</description>
+	<operation>
+tmp := imm8[7:0]
+IF tmp &gt; 15
+	tmp := 16
+FI
+dst[127:0] := a[127:0] &gt;&gt; (tmp*8)
+	</operation>
+	<instruction name='psrldq' form='xmm, imm'/>
+	<header>emmintrin.h</header>
+</intrinsic>
+<intrinsic tech='SSE2' vexEq='TRUE' rettype='__m128i' name='_mm_slli_epi16'>
+	<type>Integer</type>
+	<CPUID>SSE2</CPUID>
+	<category>Shift</category>
+	<parameter varname='a' type='__m128i'/>
+	<parameter varname="imm8" type='int'/>
+	<description>Shift packed 16-bit integers in "a" left by "imm8" while shifting in zeros, and store the results in "dst". </description>
+	<operation>
+FOR j := 0 to 7
+	i := j*16
+	IF imm8[7:0] &gt; 15
+		dst[i+15:i] := 0
+	ELSE
+		dst[i+15:i] := ZeroExtend(a[i+15:i] &lt;&lt; imm8[7:0])
+	FI
+ENDFOR
+	</operation>
+	<instruction name='psllw' form='xmm, imm'/>
+	<header>emmintrin.h</header>
+</intrinsic>
+<intrinsic tech='SSE2' vexEq='TRUE' rettype='__m128i' name='_mm_sll_epi16'>
+	<type>Integer</type>
+	<CPUID>SSE2</CPUID>
+	<category>Shift</category>
+	<parameter varname='a' type='__m128i'/>
+	<parameter varname='count' type='__m128i'/>
+	<description>Shift packed 16-bit integers in "a" left by "count" while shifting in zeros, and store the results in "dst". </description>
+	<operation>
+FOR j := 0 to 7
+	i := j*16
+	IF count[63:0] &gt; 15
+		dst[i+15:i] := 0
+	ELSE
+		dst[i+15:i] := ZeroExtend(a[i+15:i] &lt;&lt; count[63:0])
+	FI
+ENDFOR
+	</operation>
+	<instruction name='psllw' form='xmm, xmm'/>
+	<header>emmintrin.h</header>
+</intrinsic>
+<intrinsic tech='SSE2' vexEq='TRUE' rettype='__m128i' name='_mm_slli_epi32'>
+	<type>Integer</type>
+	<CPUID>SSE2</CPUID>
+	<category>Shift</category>
+	<parameter varname='a' type='__m128i'/>
+	<parameter varname="imm8" type='int'/>
+	<description>Shift packed 32-bit integers in "a" left by "imm8" while shifting in zeros, and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 3
+	i := j*32
+	IF imm8[7:0] &gt; 31
+		dst[i+31:i] := 0
+	ELSE
+		dst[i+31:i] := ZeroExtend(a[i+31:i] &lt;&lt; imm8[7:0])
+	FI
+ENDFOR
+	</operation>
+	<instruction name='pslld' form='xmm, imm'/>
+	<header>emmintrin.h</header>
+</intrinsic>
+<intrinsic tech='SSE2' vexEq='TRUE' rettype='__m128i' name='_mm_sll_epi32'>
+	<type>Integer</type>
+	<CPUID>SSE2</CPUID>
+	<category>Shift</category>
+	<parameter varname='a' type='__m128i'/>
+	<parameter varname='count' type='__m128i'/>
+	<description>Shift packed 32-bit integers in "a" left by "count" while shifting in zeros, and store the results in "dst". </description>
+	<operation>
+FOR j := 0 to 3
+	i := j*32
+	IF count[63:0] &gt; 31
+		dst[i+31:i] := 0
+	ELSE
+		dst[i+31:i] := ZeroExtend(a[i+31:i] &lt;&lt; count[63:0])
+	FI
+ENDFOR
+	</operation>
+	<instruction name='pslld' form='xmm, xmm'/>
+	<header>emmintrin.h</header>
+</intrinsic>
+<intrinsic tech='SSE2' vexEq='TRUE' rettype='__m128i' name='_mm_slli_epi64'>
+	<type>Integer</type>
+	<CPUID>SSE2</CPUID>
+	<category>Shift</category>
+	<parameter varname='a' type='__m128i'/>
+	<parameter varname="imm8" type='int'/>
+	<description>Shift packed 64-bit integers in "a" left by "imm8" while shifting in zeros, and store the results in "dst". </description>
+	<operation>
+FOR j := 0 to 1
+	i := j*64
+	IF imm8[7:0] &gt; 63
+		dst[i+63:i] := 0
+	ELSE
+		dst[i+63:i] := ZeroExtend(a[i+63:i] &lt;&lt; imm8[7:0])
+	FI
+ENDFOR
+	</operation>
+	<instruction name='psllq' form='xmm, imm'/>
+	<header>emmintrin.h</header>
+</intrinsic>
+<intrinsic tech='SSE2' vexEq='TRUE' rettype='__m128i' name='_mm_sll_epi64'>
+	<type>Integer</type>
+	<CPUID>SSE2</CPUID>
+	<category>Shift</category>
+	<parameter varname='a' type='__m128i'/>
+	<parameter varname='count' type='__m128i'/>
+	<description>Shift packed 64-bit integers in "a" left by "count" while shifting in zeros, and store the results in "dst". </description>
+	<operation>
+FOR j := 0 to 1
+	i := j*64
+	IF count[63:0] &gt; 63
+		dst[i+63:i] := 0
+	ELSE
+		dst[i+63:i] := ZeroExtend(a[i+63:i] &lt;&lt; count[63:0])
+	FI
+ENDFOR
+	</operation>
+	<instruction name='psllq' form='xmm, xmm'/>
+	<header>emmintrin.h</header>
+</intrinsic>
+<intrinsic tech='SSE2' vexEq='TRUE' rettype='__m128i' name='_mm_srai_epi16'>
+	<type>Integer</type>
+	<CPUID>SSE2</CPUID>
+	<category>Shift</category>
+	<parameter varname='a' type='__m128i'/>
+	<parameter varname="imm8" type='int'/>
+	<description>Shift packed 16-bit integers in "a" right by "imm8" while shifting in sign bits, and store the results in "dst". </description>
+	<operation>
+FOR j := 0 to 7
+	i := j*16
+	IF imm8[7:0] &gt; 15
+		dst[i+15:i] := SignBit
+	ELSE
+		dst[i+15:i] := SignExtend(a[i+15:i] &gt;&gt; imm8[7:0])
+	FI
+ENDFOR
+	</operation>
+	<instruction name='psraw' form='xmm, imm'/>
+	<header>emmintrin.h</header>
+</intrinsic>
+<intrinsic tech='SSE2' vexEq='TRUE' rettype='__m128i' name='_mm_sra_epi16'>
+	<type>Integer</type>
+	<CPUID>SSE2</CPUID>
+	<category>Shift</category>
+	<parameter varname='a' type='__m128i'/>
+	<parameter varname='count' type='__m128i'/>
+	<description>Shift packed 16-bit integers in "a" right by "count" while shifting in sign bits, and store the results in "dst". </description>
+	<operation>
+FOR j := 0 to 7
+	i := j*16
+	IF count[63:0] &gt; 15
+		dst[i+15:i] := SignBit
+	ELSE
+		dst[i+15:i] := SignExtend(a[i+15:i] &gt;&gt; count[63:0])
+	FI
+ENDFOR
+	</operation>
+	<instruction name='psraw' form='xmm, xmm'/>
+	<header>emmintrin.h</header>
+</intrinsic>
+<intrinsic tech='SSE2' vexEq='TRUE' rettype='__m128i' name='_mm_srai_epi32'>
+	<type>Integer</type>
+	<CPUID>SSE2</CPUID>
+	<category>Shift</category>
+	<parameter varname='a' type='__m128i'/>
+	<parameter varname="imm8" type='int'/>
+	<description>Shift packed 32-bit integers in "a" right by "imm8" while shifting in sign bits, and store the results in "dst". </description>
+	<operation>
+FOR j := 0 to 3
+	i := j*32
+	IF imm8[7:0] &gt; 31
+		dst[i+31:i] := SignBit
+	ELSE
+		dst[i+31:i] := SignExtend(a[i+31:i] &gt;&gt; imm8[7:0])
+	FI
+ENDFOR
+	</operation>
+	<instruction name='psrad' form='xmm, imm'/>
+	<header>emmintrin.h</header>
+</intrinsic>
+<intrinsic tech='SSE2' vexEq='TRUE' rettype='__m128i' name='_mm_sra_epi32'>
+	<type>Integer</type>
+	<CPUID>SSE2</CPUID>
+	<category>Shift</category>
+	<parameter varname='a' type='__m128i'/>
+	<parameter varname='count' type='__m128i'/>
+	<description>Shift packed 32-bit integers in "a" right by "count" while shifting in sign bits, and store the results in "dst". </description>
+	<operation>
+FOR j := 0 to 3
+	i := j*32
+	IF count[63:0] &gt; 31
+		dst[i+31:i] := SignBit
+	ELSE
+		dst[i+31:i] := SignExtend(a[i+31:i] &gt;&gt; count[63:0])
+	FI
+ENDFOR
+	</operation>
+	<instruction name='psrad' form='xmm, xmm'/>
+	<header>emmintrin.h</header>
+</intrinsic>
+<intrinsic tech='SSE2' vexEq='TRUE' rettype='__m128i' name='_mm_srli_si128'>
+	<type>Integer</type>
+	<CPUID>SSE2</CPUID>
+	<category>Shift</category>
+	<parameter varname='a' type='__m128i'/>
+	<parameter varname="imm8" type='int'/>
+	<description>Shift "a" right by "imm8" bytes while shifting in zeros, and store the results in "dst".</description>
+	<operation>
+tmp := imm8[7:0]
+IF tmp &gt; 15
+	tmp := 16
+FI
+dst[127:0] := a[127:0] &gt;&gt; (tmp*8)
+	</operation>
+	<instruction name='psrldq' form='xmm, imm'/>
+	<header>emmintrin.h</header>
+</intrinsic>
+<intrinsic tech='SSE2' vexEq='TRUE' rettype='__m128i' name='_mm_srli_epi16'>
+	<type>Integer</type>
+	<CPUID>SSE2</CPUID>
+	<category>Shift</category>
+	<parameter varname='a' type='__m128i'/>
+	<parameter varname="imm8" type='int'/>
+	<description>Shift packed 16-bit integers in "a" right by "imm8" while shifting in zeros, and store the results in "dst". </description>
+	<operation>
+FOR j := 0 to 7
+	i := j*16
+	IF imm8[7:0] &gt; 15
+		dst[i+15:i] := 0
+	ELSE
+		dst[i+15:i] := ZeroExtend(a[i+15:i] &gt;&gt; imm8[7:0])
+	FI
+ENDFOR
+	</operation>
+	<instruction name='psrlw' form='xmm, imm'/>
+	<header>emmintrin.h</header>
+</intrinsic>
+<intrinsic tech='SSE2' vexEq='TRUE' rettype='__m128i' name='_mm_srl_epi16'>
+	<type>Integer</type>
+	<CPUID>SSE2</CPUID>
+	<category>Shift</category>
+	<parameter varname='a' type='__m128i'/>
+	<parameter varname='count' type='__m128i'/>
+	<description>Shift packed 16-bit integers in "a" right by "count" while shifting in zeros, and store the results in "dst". </description>
+	<operation>
+FOR j := 0 to 7
+	i := j*16
+	IF count[63:0] &gt; 15
+		dst[i+15:i] := 0
+	ELSE
+		dst[i+15:i] := ZeroExtend(a[i+15:i] &gt;&gt; count[63:0])
+	FI
+ENDFOR
+	</operation>
+	<instruction name='psrlw' form='xmm, xmm'/>
+	<header>emmintrin.h</header>
+</intrinsic>
+<intrinsic tech='SSE2' vexEq='TRUE' rettype='__m128i' name='_mm_srli_epi32'>
+	<type>Integer</type>
+	<CPUID>SSE2</CPUID>
+	<category>Shift</category>
+	<parameter varname='a' type='__m128i'/>
+	<parameter varname="imm8" type='int'/>
+	<description>Shift packed 32-bit integers in "a" right by "imm8" while shifting in zeros, and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 3
+	i := j*32
+	IF imm8[7:0] &gt; 31
+		dst[i+31:i] := 0
+	ELSE
+		dst[i+31:i] := ZeroExtend(a[i+31:i] &gt;&gt; imm8[7:0])
+	FI
+ENDFOR
+	</operation>
+	<instruction name='psrld' form='xmm, imm'/>
+	<header>emmintrin.h</header>
+</intrinsic>
+<intrinsic tech='SSE2' vexEq='TRUE' rettype='__m128i' name='_mm_srl_epi32'>
+	<type>Integer</type>
+	<CPUID>SSE2</CPUID>
+	<category>Shift</category>
+	<parameter varname='a' type='__m128i'/>
+	<parameter varname='count' type='__m128i'/>
+	<description>Shift packed 32-bit integers in "a" right by "count" while shifting in zeros, and store the results in "dst". </description>
+	<operation>
+FOR j := 0 to 3
+	i := j*32
+	IF count[63:0] &gt; 31
+		dst[i+31:i] := 0
+	ELSE
+		dst[i+31:i] := ZeroExtend(a[i+31:i] &gt;&gt; count[63:0])
+	FI
+ENDFOR
+	</operation>
+	<instruction name='psrld' form='xmm, xmm'/>
+	<header>emmintrin.h</header>
+</intrinsic>
+<intrinsic tech='SSE2' vexEq='TRUE' rettype='__m128i' name='_mm_srli_epi64'>
+	<type>Integer</type>
+	<CPUID>SSE2</CPUID>
+	<category>Shift</category>
+	<parameter varname='a' type='__m128i'/>
+	<parameter varname="imm8" type='int'/>
+	<description>Shift packed 64-bit integers in "a" right by "imm8" while shifting in zeros, and store the results in "dst". </description>
+	<operation>
+FOR j := 0 to 1
+	i := j*64
+	IF imm8[7:0] &gt; 63
+		dst[i+63:i] := 0
+	ELSE
+		dst[i+63:i] := ZeroExtend(a[i+63:i] &gt;&gt; imm8[7:0])
+	FI
+ENDFOR
+	</operation>
+	<instruction name='psrlq' form='xmm, imm'/>
+	<header>emmintrin.h</header>
+</intrinsic>
+<intrinsic tech='SSE2' vexEq='TRUE' rettype='__m128i' name='_mm_srl_epi64'>
+	<type>Integer</type>
+	<CPUID>SSE2</CPUID>
+	<category>Shift</category>
+	<parameter varname='a' type='__m128i'/>
+	<parameter varname='count' type='__m128i'/>
+	<description>Shift packed 64-bit integers in "a" right by "count" while shifting in zeros, and store the results in "dst". </description>
+	<operation>
+FOR j := 0 to 1
+	i := j*64
+	IF count[63:0] &gt; 63
+		dst[i+63:i] := 0
+	ELSE
+		dst[i+63:i] := ZeroExtend(a[i+63:i] &gt;&gt; count[63:0])
+	FI
+ENDFOR
+	</operation>
+	<instruction name='psrlq' form='xmm, xmm'/>
+	<header>emmintrin.h</header>
+</intrinsic>
+<intrinsic tech='SSE2' vexEq='TRUE' rettype='__m128i' name='_mm_and_si128'>
+	<type>Integer</type>
+	<CPUID>SSE2</CPUID>
+	<category>Logical</category>
+	<parameter varname='a' type='__m128i'/>
+	<parameter varname='b' type='__m128i'/>
+	<description>Compute the bitwise AND of 128 bits (representing integer data) in "a" and "b", and store the result in "dst".</description>
+	<operation>
+dst[127:0] := (a[127:0] AND b[127:0])
+	</operation>
+	<instruction name='pand' form='xmm, xmm'/>
+	<header>emmintrin.h</header>
+</intrinsic>
+<intrinsic tech='SSE2' vexEq='TRUE' rettype='__m128i' name='_mm_andnot_si128'>
+	<type>Integer</type>
+	<CPUID>SSE2</CPUID>
+	<category>Logical</category>
+	<parameter varname='a' type='__m128i'/>
+	<parameter varname='b' type='__m128i'/>
+	<description>Compute the bitwise NOT of 128 bits (representing integer data) in "a" and then AND with "b", and store the result in "dst".</description>
+	<operation>
+dst[127:0] := ((NOT a[127:0]) AND b[127:0])
+	</operation>
+	<instruction name='pandn' form='xmm, xmm'/>
+	<header>emmintrin.h</header>
+</intrinsic>
+<intrinsic tech='SSE2' vexEq='TRUE' rettype='__m128i' name='_mm_or_si128'>
+	<type>Integer</type>
+	<CPUID>SSE2</CPUID>
+	<category>Logical</category>
+	<parameter varname='a' type='__m128i'/>
+	<parameter varname='b' type='__m128i'/>
+	<description>Compute the bitwise OR of 128 bits (representing integer data) in "a" and "b", and store the result in "dst".</description>
+	<operation>
+dst[127:0] := (a[127:0] OR b[127:0])
+	</operation>
+	<instruction name='por' form='xmm, xmm'/>
+	<header>emmintrin.h</header>
+</intrinsic>
+<intrinsic tech='SSE2' vexEq='TRUE' rettype='__m128i' name='_mm_xor_si128'>
+	<type>Integer</type>
+	<CPUID>SSE2</CPUID>
+	<category>Logical</category>
+	<parameter varname='a' type='__m128i'/>
+	<parameter varname='b' type='__m128i'/>
+	<description>Compute the bitwise XOR of 128 bits (representing integer data) in "a" and "b", and store the result in "dst".</description>
+	<operation>
+dst[127:0] := (a[127:0] XOR b[127:0])
+	</operation>
+	<instruction name='pxor' form='xmm, xmm'/>
+	<header>emmintrin.h</header>
+</intrinsic>
+<intrinsic tech='SSE2' vexEq='TRUE' rettype='__m128i' name='_mm_cmpeq_epi8'>
+	<type>Integer</type>
+	<CPUID>SSE2</CPUID>
+	<category>Compare</category>
+	<parameter varname='a' type='__m128i'/>
+	<parameter varname='b' type='__m128i'/>
+	<description>Compare packed 8-bit integers in "a" and "b" for equality, and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 15
+	i := j*8
+	dst[i+7:i] := ( a[i+7:i] == b[i+7:i] ) ? 0xFF : 0
+ENDFOR
+	</operation>
+	<instruction name='pcmpeqb' form='xmm, xmm'/>
+	<header>emmintrin.h</header>
+</intrinsic>
+<intrinsic tech='SSE2' vexEq='TRUE' rettype='__m128i' name='_mm_cmpeq_epi16'>
+	<type>Integer</type>
+	<CPUID>SSE2</CPUID>
+	<category>Compare</category>
+	<parameter varname='a' type='__m128i'/>
+	<parameter varname='b' type='__m128i'/>
+	<description>Compare packed 16-bit integers in "a" and "b" for equality, and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*16
+	dst[i+15:i] := ( a[i+15:i] == b[i+15:i] ) ? 0xFFFF : 0
+ENDFOR
+	</operation>
+	<instruction name='pcmpeqw' form='xmm, xmm'/>
+	<header>emmintrin.h</header>
+</intrinsic>
+<intrinsic tech='SSE2' vexEq='TRUE' rettype='__m128i' name='_mm_cmpeq_epi32'>
+	<type>Integer</type>
+	<CPUID>SSE2</CPUID>
+	<category>Compare</category>
+	<parameter varname='a' type='__m128i'/>
+	<parameter varname='b' type='__m128i'/>
+	<description>Compare packed 32-bit integers in "a" and "b" for equality, and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 3
+	i := j*32
+	dst[i+31:i] := ( a[i+31:i] == b[i+31:i] ) ? 0xFFFFFFFF : 0
+ENDFOR
+	</operation>
+	<instruction name='pcmpeqd' form='xmm, xmm'/>
+	<header>emmintrin.h</header>
+</intrinsic>
+<intrinsic tech='SSE2' vexEq='TRUE' rettype='__m128i' name='_mm_cmpgt_epi8'>
+	<type>Integer</type>
+	<CPUID>SSE2</CPUID>
+	<category>Compare</category>
+	<parameter varname='a' type='__m128i'/>
+	<parameter varname='b' type='__m128i'/>
+	<description>Compare packed 8-bit integers in "a" and "b" for greater-than, and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 15
+	i := j*8
+	dst[i+7:i] := ( a[i+7:i] &gt; b[i+7:i] ) ? 0xFF : 0
+ENDFOR
+	</operation>
+	<instruction name='pcmpgtb' form='xmm, xmm'/>
+	<header>emmintrin.h</header>
+</intrinsic>
+<intrinsic tech='SSE2' vexEq='TRUE' rettype='__m128i' name='_mm_cmpgt_epi16'>
+	<type>Integer</type>
+	<CPUID>SSE2</CPUID>
+	<category>Compare</category>
+	<parameter varname='a' type='__m128i'/>
+	<parameter varname='b' type='__m128i'/>
+	<description>Compare packed 16-bit integers in "a" and "b" for greater-than, and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*16
+	dst[i+15:i] := ( a[i+15:i] &gt; b[i+15:i] ) ? 0xFFFF : 0
+ENDFOR
+	</operation>
+	<instruction name='pcmpgtw' form='xmm, xmm'/>
+	<header>emmintrin.h</header>
+</intrinsic>
+<intrinsic tech='SSE2' vexEq='TRUE' rettype='__m128i' name='_mm_cmpgt_epi32'>
+	<type>Integer</type>
+	<CPUID>SSE2</CPUID>
+	<category>Compare</category>
+	<parameter varname='a' type='__m128i'/>
+	<parameter varname='b' type='__m128i'/>
+	<description>Compare packed 32-bit integers in "a" and "b" for greater-than, and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 3
+	i := j*32
+	dst[i+31:i] := ( a[i+31:i] &gt; b[i+31:i] ) ? 0xFFFFFFFF : 0
+ENDFOR
+	</operation>
+	<instruction name='pcmpgtd' form='xmm, xmm'/>
+	<header>emmintrin.h</header>
+</intrinsic>
+<intrinsic tech='SSE2' vexEq='TRUE' rettype='__m128i' name='_mm_cmplt_epi8'>
+	<type>Integer</type>
+	<CPUID>SSE2</CPUID>
+	<category>Compare</category>
+	<parameter varname='a' type='__m128i'/>
+	<parameter varname='b' type='__m128i'/>
+	<description>Compare packed 8-bit integers in "a" and "b" for less-than, and store the results in "dst". Note: This intrinsic emits the pcmpgtb instruction with the order of the operands switched.</description>
+	<operation>
+FOR j := 0 to 15
+	i := j*8
+	dst[i+7:i] := ( a[i+7:i] &lt; b[i+7:i] ) ? 0xFF : 0
+ENDFOR
+	</operation>
+	<instruction name='pcmpgtb' form='xmm, xmm'/>
+	<header>emmintrin.h</header>
+</intrinsic>
+<intrinsic tech='SSE2' vexEq='TRUE' rettype='__m128i' name='_mm_cmplt_epi16'>
+	<type>Integer</type>
+	<CPUID>SSE2</CPUID>
+	<category>Compare</category>
+	<parameter varname='a' type='__m128i'/>
+	<parameter varname='b' type='__m128i'/>
+	<description>Compare packed 16-bit integers in "a" and "b" for less-than, and store the results in "dst". Note: This intrinsic emits the pcmpgtw instruction with the order of the operands switched.</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*16
+	dst[i+15:i] := ( a[i+15:i] &lt; b[i+15:i] ) ? 0xFFFF : 0
+ENDFOR
+	</operation>
+	<instruction name='pcmpgtw' form='xmm, xmm'/>
+	<header>emmintrin.h</header>
+</intrinsic>
+<intrinsic tech='SSE2' vexEq='TRUE' rettype='__m128i' name='_mm_cmplt_epi32'>
+	<type>Integer</type>
+	<CPUID>SSE2</CPUID>
+	<category>Compare</category>
+	<parameter varname='a' type='__m128i'/>
+	<parameter varname='b' type='__m128i'/>
+	<description>Compare packed 32-bit integers in "a" and "b" for less-than, and store the results in "dst". Note: This intrinsic emits the pcmpgtd instruction with the order of the operands switched.</description>
+	<operation>
+FOR j := 0 to 3
+	i := j*32
+	dst[i+31:i] := ( a[i+31:i] &lt; b[i+31:i] ) ? 0xFFFFFFFF : 0
+ENDFOR
+	</operation>
+	<instruction name='pcmpgtd' form='xmm, xmm'/>
+	<header>emmintrin.h</header>
+</intrinsic>
+<intrinsic tech='SSE2' vexEq='TRUE' rettype='__m128d' name='_mm_cvtepi32_pd'>
+	<type>Floating Point</type>
+	<type>Integer</type>
+	<CPUID>SSE2</CPUID>
+	<category>Convert</category>
+	<parameter varname='a' type='__m128i'/>
+	<description>Convert packed 32-bit integers in "a" to packed double-precision (64-bit) floating-point elements, and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 1
+	i := j*32
+	m := j*64
+	dst[m+63:m] := Convert_Int32_To_FP64(a[i+31:i])
+ENDFOR
+	</operation>
+	<instruction name='cvtdq2pd' form='xmm, xmm'/>
+	<header>emmintrin.h</header>
+</intrinsic>
+<intrinsic tech='SSE2' vexEq='TRUE' rettype='__m128d' name='_mm_cvtsi32_sd'>
+	<type>Floating Point</type>
+	<CPUID>SSE2</CPUID>
+	<category>Convert</category>
+	<parameter varname='a' type='__m128d'/>
+	<parameter varname='b' type='int'/>
+	<description>Convert the 32-bit integer "b" to a double-precision (64-bit) floating-point element, store the result in the lower element of "dst", and copy the upper element from "a" to the upper element of "dst". </description>
+	<operation>
+dst[63:0] := Convert_Int32_To_FP64(b[31:0])
+dst[127:64] := a[127:64]
+dst[MAX:128] := 0
+	</operation>
+	<instruction name='cvtsi2sd' form='xmm, r32'/>
+	<header>emmintrin.h</header>
+</intrinsic>
+<intrinsic tech='SSE2' vexEq='TRUE' rettype='__m128d' name='_mm_cvtsi64_sd'>
+	<type>Floating Point</type>
+	<type>Integer</type>
+	<CPUID>SSE2</CPUID>
+	<category>Convert</category>
+	<parameter varname='a' type='__m128d'/>
+	<parameter varname='b' type='__int64'/>
+	<description>Convert the 64-bit integer "b" to a double-precision (64-bit) floating-point element, store the result in the lower element of "dst", and copy the upper element from "a" to the upper element of "dst". </description>
+	<operation>
+dst[63:0] := Convert_Int64_To_FP64(b[63:0])
+dst[127:64] := a[127:64]
+dst[MAX:128] := 0
+	</operation>
+	<instruction name='cvtsi2sd' form='xmm, r64'/>
+	<header>emmintrin.h</header>
+</intrinsic>
+<intrinsic tech='SSE2' vexEq='TRUE' rettype='__m128d' name='_mm_cvtsi64x_sd'>
+	<type>Floating Point</type>
+	<type>Integer</type>
+	<CPUID>SSE2</CPUID>
+	<category>Convert</category>
+	<parameter varname='a' type='__m128d'/>
+	<parameter varname='b' type='__int64'/>
+	<description>Convert the 64-bit integer "b" to a double-precision (64-bit) floating-point element, store the result in the lower element of "dst", and copy the upper element from "a" to the upper element of "dst". </description>
+	<operation>
+dst[63:0] := Convert_Int64_To_FP64(b[63:0])
+dst[127:64] := a[127:64]
+dst[MAX:128] := 0
+	</operation>
+	<instruction name='cvtsi2sd' form='xmm, r64'/>
+	<header>emmintrin.h</header>
+</intrinsic>
+<intrinsic tech='SSE2' vexEq='TRUE' rettype='__m128' name='_mm_cvtepi32_ps'>
+	<type>Floating Point</type>
+	<type>Integer</type>
+	<CPUID>SSE2</CPUID>
+	<category>Convert</category>
+	<parameter varname='a' type='__m128i'/>
+	<description>Convert packed 32-bit integers in "a" to packed single-precision (32-bit) floating-point elements, and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 3
+	i := 32*j
+	dst[i+31:i] := Convert_Int32_To_FP32(a[i+31:i])
+ENDFOR
+	</operation>
+	<instruction name='cvtdq2ps' form='xmm, xmm'/>
+	<header>emmintrin.h</header>
+</intrinsic>
+<intrinsic tech='SSE2' rettype='__m128d' name='_mm_cvtpi32_pd'>
+	<type>Floating Point</type>
+	<CPUID>SSE2</CPUID>
+	<category>Convert</category>
+	<parameter varname='a' type='__m64'/>
+	<description>Convert packed 32-bit integers in "a" to packed double-precision (64-bit) floating-point elements, and store the results in "dst". </description>
+	<operation>
+FOR j := 0 to 1
+	i := j*32
+	m := j*64
+	dst[m+63:m] := Convert_Int32_To_FP64(a[i+31:i])
+ENDFOR
+	</operation>
+	<instruction name='cvtpi2pd' form='xmm, mm'/>
+	<header>emmintrin.h</header>
+</intrinsic>
+<intrinsic tech='SSE2' vexEq='TRUE' rettype='__m128i' name='_mm_cvtsi32_si128'>
+	<type>Integer</type>
+	<CPUID>SSE2</CPUID>
+	<category>Convert</category>
+	<parameter varname='a' type='int'/>
+	<description>Copy 32-bit integer "a" to the lower elements of "dst", and zero the upper elements of "dst".</description>
+	<operation>
+dst[31:0] := a[31:0]
+dst[127:32] := 0
+	</operation>
+	<instruction name='movd' form='xmm, r32'/>
+	<header>emmintrin.h</header>
+</intrinsic>
+<intrinsic tech='SSE2' vexEq='TRUE' rettype='__m128i' name='_mm_cvtsi64_si128'>
+	<type>Integer</type>
+	<CPUID>SSE2</CPUID>
+	<category>Convert</category>
+	<parameter varname='a' type='__int64'/>
+	<description>Copy 64-bit integer "a" to the lower element of "dst", and zero the upper element.</description>
+	<operation>
+dst[63:0] := a[63:0]
+dst[127:64] := 0
+	</operation>
+	<instruction name='movq' form='xmm, r64'/>
+	<header>emmintrin.h</header>
+</intrinsic>
+<intrinsic tech='SSE2' vexEq='TRUE' rettype='__m128i' name='_mm_cvtsi64x_si128'>
+	<type>Integer</type>
+	<CPUID>SSE2</CPUID>
+	<category>Convert</category>
+	<parameter varname='a' type='__int64'/>
+	<description>Copy 64-bit integer "a" to the lower element of "dst", and zero the upper element.</description>
+	<operation>
+dst[63:0] := a[63:0]
+dst[127:64] := 0
+	</operation>
+	<instruction name='movq' form='xmm, r64'/>
+	<header>emmintrin.h</header>
+</intrinsic>
+<intrinsic tech='SSE2' vexEq='TRUE' dontShowZeroUnmodMsg='TRUE' rettype='int' name='_mm_cvtsi128_si32'>
+	<type>Integer</type>
+	<CPUID>SSE2</CPUID>
+	<category>Convert</category>
+	<parameter varname='a' type='__m128i'/>
+	<description>Copy the lower 32-bit integer in "a" to "dst".</description>
+	<operation>
+dst[31:0] := a[31:0]
+	</operation>
+	<instruction name='movd' form='r32, xmm'/>
+	<header>emmintrin.h</header>
+</intrinsic>
+<intrinsic tech='SSE2' vexEq='TRUE' dontShowZeroUnmodMsg='TRUE' rettype='__int64' name='_mm_cvtsi128_si64'>
+	<type>Integer</type>
+	<CPUID>SSE2</CPUID>
+	<category>Convert</category>
+	<parameter varname='a' type='__m128i'/>
+	<description>Copy the lower 64-bit integer in "a" to "dst".</description>
+	<operation>
+dst[63:0] := a[63:0]
+	</operation>
+	<instruction name='movq' form='r64, xmm'/>
+	<header>emmintrin.h</header>
+</intrinsic>
+<intrinsic tech='SSE2' vexEq='TRUE' dontShowZeroUnmodMsg='TRUE' rettype='__int64' name='_mm_cvtsi128_si64x'>
+	<type>Integer</type>
+	<CPUID>SSE2</CPUID>
+	<category>Convert</category>
+	<parameter varname='a' type='__m128i'/>
+	<description>Copy the lower 64-bit integer in "a" to "dst".</description>
+	<operation>
+dst[63:0] := a[63:0]
+	</operation>
+	<instruction name='movq' form='r64, xmm'/>
+	<header>emmintrin.h</header>
+</intrinsic>
+<intrinsic tech='SSE2' sequence='true' rettype='__m128i' name='_mm_set_epi64'>
+	<type>Integer</type>
+	<CPUID>SSE2</CPUID>
+	<category>Set</category>
+	<parameter varname='e1' type='__m64'/>
+	<parameter varname='e0' type='__m64'/>
+	<description>Set packed 64-bit integers in "dst" with the supplied values.</description>
+	<operation>
+dst[63:0] := e0
+dst[127:64] := e1
+	</operation>
+	<header>emmintrin.h</header>
+</intrinsic>
+<intrinsic tech='SSE2' sequence='true' rettype='__m128i' name='_mm_set_epi64x'>
+	<type>Integer</type>
+	<CPUID>SSE2</CPUID>
+	<category>Set</category>
+	<parameter varname='e1' type='__int64'/>
+	<parameter varname='e0' type='__int64'/>
+	<description>Set packed 64-bit integers in "dst" with the supplied values.</description>
+	<operation>
+dst[63:0] := e0
+dst[127:64] := e1
+	</operation>
+	<header>emmintrin.h</header>
+</intrinsic>
+<intrinsic tech='SSE2' sequence='true' rettype='__m128i' name='_mm_set_epi32'>
+	<type>Integer</type>
+	<CPUID>SSE2</CPUID>
+	<category>Set</category>
+	<parameter varname='e3' type='int'/>
+	<parameter varname='e2' type='int'/>
+	<parameter varname='e1' type='int'/>
+	<parameter varname='e0' type='int'/>
+	<description>Set packed 32-bit integers in "dst" with the supplied values.</description>
+	<operation>
+dst[31:0] := e0
+dst[63:32] := e1
+dst[95:64] := e2
+dst[127:96] := e3
+	</operation>
+	<header>emmintrin.h</header>
+</intrinsic>
+<intrinsic tech='SSE2' sequence='true' rettype='__m128i' name='_mm_set_epi16'>
+	<type>Integer</type>
+	<CPUID>SSE2</CPUID>
+	<category>Set</category>
+	<parameter varname='e7' type='short'/>
+	<parameter varname='e6' type='short'/>
+	<parameter varname='e5' type='short'/>
+	<parameter varname='e4' type='short'/>
+	<parameter varname='e3' type='short'/>
+	<parameter varname='e2' type='short'/>
+	<parameter varname='e1' type='short'/>
+	<parameter varname='e0' type='short'/>
+	<description>Set packed 16-bit integers in "dst" with the supplied values.</description>
+	<operation>
+dst[15:0] := e0
+dst[31:16] := e1
+dst[47:32] := e2
+dst[63:48] := e3
+dst[79:64] := e4
+dst[95:80] := e5
+dst[111:96] := e6
+dst[127:112] := e7
+	</operation>
+	<header>emmintrin.h</header>
+</intrinsic>
+<intrinsic tech='SSE2' sequence='true' rettype='__m128i' name='_mm_set_epi8'>
+	<type>Integer</type>
+	<CPUID>SSE2</CPUID>
+	<category>Set</category>
+	<parameter varname='e15' type='char'/>
+	<parameter varname='e14' type='char'/>
+	<parameter varname='e13' type='char'/>
+	<parameter varname='e12' type='char'/>
+	<parameter varname='e11' type='char'/>
+	<parameter varname='e10' type='char'/>
+	<parameter varname='e9' type='char'/>
+	<parameter varname='e8' type='char'/>
+	<parameter varname='e7' type='char'/>
+	<parameter varname='e6' type='char'/>
+	<parameter varname='e5' type='char'/>
+	<parameter varname='e4' type='char'/>
+	<parameter varname='e3' type='char'/>
+	<parameter varname='e2' type='char'/>
+	<parameter varname='e1' type='char'/>
+	<parameter varname='e0' type='char'/>
+	<description>Set packed 8-bit integers in "dst" with the supplied values.</description>
+	<operation>
+dst[7:0] := e0
+dst[15:8] := e1
+dst[23:16] := e2
+dst[31:24] := e3
+dst[39:32] := e4
+dst[47:40] := e5
+dst[55:48] := e6
+dst[63:56] := e7
+dst[71:64] := e8
+dst[79:72] := e9
+dst[87:80] := e10
+dst[95:88] := e11
+dst[103:96] := e12
+dst[111:104] := e13
+dst[119:112] := e14
+dst[127:120] := e15
+	</operation>
+	<header>emmintrin.h</header>
+</intrinsic>
+<intrinsic tech='SSE2' sequence='true' rettype='__m128i' name='_mm_set1_epi64'>
+	<type>Integer</type>
+	<CPUID>SSE2</CPUID>
+	<category>Set</category>
+	<parameter varname='a' type='__m64'/>
+	<description>Broadcast 64-bit integer "a" to all elements of "dst".</description>
+	<operation>
+FOR j := 0 to 1
+	i := j*64
+	dst[i+63:i] := a[63:0]
+ENDFOR
+	</operation>
+	<header>emmintrin.h</header>
+</intrinsic>
+<intrinsic tech='SSE2' sequence='true' rettype='__m128i' name='_mm_set1_epi64x'>
+	<type>Integer</type>
+	<CPUID>SSE2</CPUID>
+	<category>Set</category>
+	<parameter varname='a' type='__int64'/>
+	<description>Broadcast 64-bit integer "a" to all elements of "dst". This intrinsic may generate "vpbroadcastq".</description>
+	<operation>
+FOR j := 0 to 1
+	i := j*64
+	dst[i+63:i] := a[63:0]
+ENDFOR
+	</operation>
+	<header>emmintrin.h</header>
+</intrinsic>
+<intrinsic tech='SSE2' sequence='true' rettype='__m128i' name='_mm_set1_epi32'>
+	<type>Integer</type>
+	<CPUID>SSE2</CPUID>
+	<category>Set</category>
+	<parameter varname='a' type='int'/>
+	<description>Broadcast 32-bit integer "a" to all elements of "dst". This intrinsic may generate "vpbroadcastd".</description>
+	<operation>
+FOR j := 0 to 3
+	i := j*32
+	dst[i+31:i] := a[31:0]
+ENDFOR
+	</operation>
+	<header>emmintrin.h</header>
+</intrinsic>
+<intrinsic tech='SSE2' sequence='true' rettype='__m128i' name='_mm_set1_epi16'>
+	<type>Integer</type>
+	<CPUID>SSE2</CPUID>
+	<category>Set</category>
+	<parameter varname='a' type='short'/>
+	<description>Broadcast 16-bit integer "a" to all elements of "dst". This intrinsic may generate "vpbroadcastw".</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*16
+	dst[i+15:i] := a[15:0]
+ENDFOR
+	</operation>
+	<header>emmintrin.h</header>
+</intrinsic>
+<intrinsic tech='SSE2' sequence='true' rettype='__m128i' name='_mm_set1_epi8'>
+	<type>Integer</type>
+	<CPUID>SSE2</CPUID>
+	<category>Set</category>
+	<parameter varname='a' type='char'/>
+	<description>Broadcast 8-bit integer "a" to all elements of "dst". This intrinsic may generate "vpbroadcastb".</description>
+	<operation>
+FOR j := 0 to 15
+	i := j*8
+	dst[i+7:i] := a[7:0]
+ENDFOR
+	</operation>
+	<header>emmintrin.h</header>
+</intrinsic>
+<intrinsic tech='SSE2' sequence='true' rettype='__m128i' name='_mm_setr_epi64'>
+	<type>Integer</type>
+	<CPUID>SSE2</CPUID>
+	<category>Set</category>
+	<parameter varname='e1' type='__m64'/>
+	<parameter varname='e0' type='__m64'/>
+	<description>Set packed 64-bit integers in "dst" with the supplied values in reverse order.</description>
+	<operation>
+dst[63:0] := e1
+dst[127:64] := e0
+	</operation>
+	<header>emmintrin.h</header>
+</intrinsic>
+<intrinsic tech='SSE2' sequence='true' rettype='__m128i' name='_mm_setr_epi32'>
+	<type>Integer</type>
+	<CPUID>SSE2</CPUID>
+	<category>Set</category>
+	<parameter varname='e3' type='int'/>
+	<parameter varname='e2' type='int'/>
+	<parameter varname='e1' type='int'/>
+	<parameter varname='e0' type='int'/>
+	<description>Set packed 32-bit integers in "dst" with the supplied values in reverse order.</description>
+	<operation>
+dst[31:0] := e3
+dst[63:32] := e2
+dst[95:64] := e1
+dst[127:96] := e0
+	</operation>
+	<header>emmintrin.h</header>
+</intrinsic>
+<intrinsic tech='SSE2' sequence='true' rettype='__m128i' name='_mm_setr_epi16'>
+	<type>Integer</type>
+	<CPUID>SSE2</CPUID>
+	<category>Set</category>
+	<parameter varname='e7' type='short'/>
+	<parameter varname='e6' type='short'/>
+	<parameter varname='e5' type='short'/>
+	<parameter varname='e4' type='short'/>
+	<parameter varname='e3' type='short'/>
+	<parameter varname='e2' type='short'/>
+	<parameter varname='e1' type='short'/>
+	<parameter varname='e0' type='short'/>
+	<description>Set packed 16-bit integers in "dst" with the supplied values in reverse order.</description>
+	<operation>
+dst[15:0] := e7
+dst[31:16] := e6
+dst[47:32] := e5
+dst[63:48] := e4
+dst[79:64] := e3
+dst[95:80] := e2
+dst[111:96] := e1
+dst[127:112] := e0
+	</operation>
+	<header>emmintrin.h</header>
+</intrinsic>
+<intrinsic tech='SSE2' sequence='true' rettype='__m128i' name='_mm_setr_epi8'>
+	<type>Integer</type>
+	<CPUID>SSE2</CPUID>
+	<category>Set</category>
+	<parameter varname='e15' type='char'/>
+	<parameter varname='e14' type='char'/>
+	<parameter varname='e13' type='char'/>
+	<parameter varname='e12' type='char'/>
+	<parameter varname='e11' type='char'/>
+	<parameter varname='e10' type='char'/>
+	<parameter varname='e9' type='char'/>
+	<parameter varname='e8' type='char'/>
+	<parameter varname='e7' type='char'/>
+	<parameter varname='e6' type='char'/>
+	<parameter varname='e5' type='char'/>
+	<parameter varname='e4' type='char'/>
+	<parameter varname='e3' type='char'/>
+	<parameter varname='e2' type='char'/>
+	<parameter varname='e1' type='char'/>
+	<parameter varname='e0' type='char'/>
+	<description>Set packed 8-bit integers in "dst" with the supplied values in reverse order.</description>
+	<operation>
+dst[7:0] := e15
+dst[15:8] := e14
+dst[23:16] := e13
+dst[31:24] := e12
+dst[39:32] := e11
+dst[47:40] := e10
+dst[55:48] := e9
+dst[63:56] := e8
+dst[71:64] := e7
+dst[79:72] := e6
+dst[87:80] := e5
+dst[95:88] := e4
+dst[103:96] := e3
+dst[111:104] := e2
+dst[119:112] := e1
+dst[127:120] := e0
+	</operation>
+	<header>emmintrin.h</header>
+</intrinsic>
+<intrinsic tech='SSE2' rettype='__m128i' name='_mm_setzero_si128'>
+	<type>Integer</type>
+	<CPUID>SSE2</CPUID>
+	<category>Set</category>
+	<description>Return vector of type __m128i with all elements set to zero.</description>
+	<operation>
+dst[MAX:0] := 0
+	</operation>
+	<instruction name='pxor' form='xmm, xmm'/>
+	<header>emmintrin.h</header>
+</intrinsic>
+<intrinsic tech='SSE2' rettype='__m128i' name='_mm_loadl_epi64'>
+	<type>Integer</type>
+	<CPUID>SSE2</CPUID>
+	<category>Load</category>
+	<parameter varname='mem_addr' type='__m128i const*'/>
+	<description>Load 64-bit integer from memory into the first element of "dst".</description>
+	<operation>
+dst[63:0] := MEM[mem_addr+63:mem_addr]
+dst[MAX:64] := 0
+	</operation>
+	<instruction name='movq' form='xmm, m64'/>
+	<header>emmintrin.h</header>
+</intrinsic>
+<intrinsic tech='SSE2' vexEq='TRUE' rettype='__m128i' name='_mm_load_si128'>
+	<type>Integer</type>
+	<CPUID>SSE2</CPUID>
+	<category>Load</category>
+	<parameter varname='mem_addr' type='__m128i const*'/>
+	<description>Load 128-bits of integer data from memory into "dst". 
+	"mem_addr" must be aligned on a 16-byte boundary or a general-protection exception may be generated.</description>
+	<operation>
+dst[127:0] := MEM[mem_addr+127:mem_addr]
+	</operation>
+	<instruction name='movdqa' form='xmm, m128'/>
+	<header>emmintrin.h</header>
+</intrinsic>
+<intrinsic tech='SSE2' vexEq='TRUE' rettype='__m128i' name='_mm_loadu_si128'>
+	<type>Integer</type>
+	<CPUID>SSE2</CPUID>
+	<category>Load</category>
+	<parameter varname='mem_addr' type='__m128i const*'/>
+	<description>Load 128-bits of integer data from memory into "dst".
+	"mem_addr" does not need to be aligned on any particular boundary.
+	</description>
+	<operation>
+dst[127:0] := MEM[mem_addr+127:mem_addr]
+	</operation>
+	<instruction name='movdqu' form='xmm, m128'/>
+	<header>emmintrin.h</header>
+</intrinsic>
+<intrinsic tech='SSE2' rettype='void' name='_mm_maskmoveu_si128'>
+	<type>Integer</type>
+	<CPUID>SSE2</CPUID>
+	<category>Store</category>
+	<parameter varname='a' type='__m128i'/>
+	<parameter varname='mask' type='__m128i'/>
+	<parameter varname='mem_addr' type='char*'/>
+	<description>Conditionally store 8-bit integer elements from "a" into memory using "mask" (elements are not stored when the highest bit is not set in the corresponding element) and a non-temporal memory hint. "mem_addr" does not need to be aligned on any particular boundary.</description>
+	<operation>
+FOR j := 0 to 15
+	i := j*8
+	IF mask[i+7]
+		MEM[mem_addr+i+7:mem_addr+i] := a[i+7:i]
+	FI
+ENDFOR
+	</operation>
+	<instruction name='maskmovdqu' form='xmm, xmm'/>
+	<header>emmintrin.h</header>
+</intrinsic>
+<intrinsic tech='SSE2' vexEq='TRUE' dontShowZeroUnmodMsg='TRUE' rettype='void' name='_mm_store_si128'>
+	<type>Integer</type>
+	<CPUID>SSE2</CPUID>
+	<category>Store</category>
+	<parameter varname='mem_addr' type='__m128i*'/>
+	<parameter varname='a' type='__m128i'/>
+	<description>Store 128-bits of integer data from "a" into memory. 
+	"mem_addr" must be aligned on a 16-byte boundary or a general-protection exception may be generated.</description>
+	<operation>
+MEM[mem_addr+127:mem_addr] := a[127:0]
+	</operation>
+	<instruction name='movdqa' form='m128, xmm'/>
+	<header>emmintrin.h</header>
+</intrinsic>
+<intrinsic tech='SSE2' vexEq='TRUE' dontShowZeroUnmodMsg='TRUE' rettype='void' name='_mm_storeu_si128'>
+	<type>Integer</type>
+	<CPUID>SSE2</CPUID>
+	<category>Store</category>
+	<parameter varname='mem_addr' type='__m128i*'/>
+	<parameter varname='a' type='__m128i'/>
+	<description>Store 128-bits of integer data from "a" into memory.
+	"mem_addr" does not need to be aligned on any particular boundary.
+	</description>
+	<operation>
+MEM[mem_addr+127:mem_addr] := a[127:0]
+	</operation>
+	<instruction name='movdqu' form='m128, xmm'/>
+	<header>emmintrin.h</header>
+</intrinsic>
+<intrinsic tech='SSE2' rettype='void' name='_mm_storel_epi64'>
+	<type>Integer</type>
+	<CPUID>SSE2</CPUID>
+	<category>Store</category>
+	<parameter varname='mem_addr' type='__m128i*'/>
+	<parameter varname='a' type='__m128i'/>
+	<description>Store 64-bit integer from the first element of "a" into memory.</description>
+	<operation>
+MEM[mem_addr+63:mem_addr] := a[63:0]
+	</operation>
+	<instruction name='movq' form='m64, xmm'/>
+	<header>emmintrin.h</header>
+</intrinsic>
+<intrinsic tech='SSE2' vexEq='TRUE' dontShowZeroUnmodMsg='TRUE' rettype='void' name='_mm_stream_si128'>
+	<type>Integer</type>
+	<CPUID>SSE2</CPUID>
+	<category>Store</category>
+	<parameter varname='mem_addr' type='__m128i*'/>
+	<parameter varname='a' type='__m128i'/>
+	<description>Store 128-bits of integer data from "a" into memory using a non-temporal memory hint. 
+	"mem_addr" must be aligned on a 16-byte boundary or a general-protection exception may be generated.</description>
+	<operation>
+MEM[mem_addr+127:mem_addr] := a[127:0]
+	</operation>
+	<instruction name='movntdq' form='m128, xmm'/>
+	<header>emmintrin.h</header>
+</intrinsic>
+<intrinsic tech='SSE2' rettype='void' name='_mm_stream_si32'>
+	<type>Integer</type>
+	<CPUID>SSE2</CPUID>
+	<category>Store</category>
+	<parameter varname='mem_addr' type='int*'/>
+	<parameter varname='a' type='int'/>
+	<description>Store 32-bit integer "a" into memory using a non-temporal hint to minimize cache pollution. If the cache line containing address "mem_addr" is already in the cache, the cache will be updated.</description>
+	<operation>
+MEM[mem_addr+31:mem_addr] := a[31:0]
+	</operation>
+	<instruction name='movnti' form='m32, r32'/>
+	<header>emmintrin.h</header>
+</intrinsic>
+<intrinsic tech='SSE2' rettype='void' name='_mm_stream_si64'>
+	<type>Integer</type>
+	<CPUID>SSE2</CPUID>
+	<category>Store</category>
+	<parameter varname='mem_addr' type='__int64*'/>
+	<parameter varname='a' type='__int64'/>
+	<description>Store 64-bit integer "a" into memory using a non-temporal hint to minimize cache pollution. If the cache line containing address "mem_addr" is already in the cache, the cache will be updated.</description>
+	<operation>
+MEM[mem_addr+63:mem_addr] := a[63:0]
+	</operation>
+	<instruction name='movnti' form='m64, r64'/>
+	<header>emmintrin.h</header>
+</intrinsic>
+<intrinsic tech='SSE2' rettype='__m64' name='_mm_movepi64_pi64'>
+	<type>Integer</type>
+	<CPUID>SSE2</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname='a' type='__m128i'/>
+	<description>Copy the lower 64-bit integer in "a" to "dst".</description>
+	<operation>
+dst[63:0] := a[63:0]
+	</operation>
+	<instruction name='movdq2q' form='mm, xmm'/>
+	<header>emmintrin.h</header>
+</intrinsic>
+<intrinsic tech='SSE2' rettype='__m128i' name='_mm_movpi64_epi64'>
+	<type>Integer</type>
+	<CPUID>SSE2</CPUID>
+	<category>Move</category>
+	<parameter varname='a' type='__m64'/>
+	<description>Copy the 64-bit integer "a" to the lower element of "dst", and zero the upper element.</description>
+	<operation>
+dst[63:0] := a[63:0]
+dst[127:64] := 0
+	</operation>
+	<instruction name='movq2dq' form='xmm, mm'/>
+	<header>emmintrin.h</header>
+</intrinsic>
+<intrinsic tech='SSE2' vexEq='TRUE' dontShowZeroUnmodMsg='TRUE' rettype='__m128i' name='_mm_move_epi64'>
+	<type>Integer</type>
+	<CPUID>SSE2</CPUID>
+	<category>Move</category>
+	<parameter varname='a' type='__m128i'/>
+	<description>Copy the lower 64-bit integer in "a" to the lower element of "dst", and zero the upper element.</description>
+	<operation>
+dst[63:0] := a[63:0]
+dst[127:64] := 0
+	</operation>
+	<instruction name='movq' form='xmm, xmm'/>
+	<header>emmintrin.h</header>
+</intrinsic>
+<intrinsic tech='SSE2' vexEq='TRUE' rettype='__m128i' name='_mm_packs_epi16'>
+	<type>Integer</type>
+	<CPUID>SSE2</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname='a' type='__m128i'/>
+	<parameter varname='b' type='__m128i'/>
+	<description>Convert packed 16-bit integers from "a" and "b" to packed 8-bit integers using signed saturation, and store the results in "dst".
+	</description>
+	<operation>
+dst[7:0] := Saturate_Int16_To_Int8 (a[15:0])
+dst[15:8] := Saturate_Int16_To_Int8 (a[31:16])
+dst[23:16] := Saturate_Int16_To_Int8 (a[47:32])
+dst[31:24] := Saturate_Int16_To_Int8 (a[63:48])
+dst[39:32] := Saturate_Int16_To_Int8 (a[79:64])
+dst[47:40] := Saturate_Int16_To_Int8 (a[95:80])
+dst[55:48] := Saturate_Int16_To_Int8 (a[111:96])
+dst[63:56] := Saturate_Int16_To_Int8 (a[127:112])
+dst[71:64] := Saturate_Int16_To_Int8 (b[15:0])
+dst[79:72] := Saturate_Int16_To_Int8 (b[31:16])
+dst[87:80] := Saturate_Int16_To_Int8 (b[47:32])
+dst[95:88] := Saturate_Int16_To_Int8 (b[63:48])
+dst[103:96] := Saturate_Int16_To_Int8 (b[79:64])
+dst[111:104] := Saturate_Int16_To_Int8 (b[95:80])
+dst[119:112] := Saturate_Int16_To_Int8 (b[111:96])
+dst[127:120] := Saturate_Int16_To_Int8 (b[127:112])
+	</operation>
+	<instruction name='packsswb' form='xmm, xmm'/>
+	<header>emmintrin.h</header>
+</intrinsic>
+<intrinsic tech='SSE2' vexEq='TRUE' rettype='__m128i' name='_mm_packs_epi32'>
+	<type>Integer</type>
+	<CPUID>SSE2</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname='a' type='__m128i'/>
+	<parameter varname='b' type='__m128i'/>
+	<description>Convert packed 32-bit integers from "a" and "b" to packed 16-bit integers using signed saturation, and store the results in "dst".</description>
+	<operation>
+dst[15:0] := Saturate_Int32_To_Int16 (a[31:0])
+dst[31:16] := Saturate_Int32_To_Int16 (a[63:32])
+dst[47:32] := Saturate_Int32_To_Int16 (a[95:64])
+dst[63:48] := Saturate_Int32_To_Int16 (a[127:96])
+dst[79:64] := Saturate_Int32_To_Int16 (b[31:0])
+dst[95:80] := Saturate_Int32_To_Int16 (b[63:32])
+dst[111:96] := Saturate_Int32_To_Int16 (b[95:64])
+dst[127:112] := Saturate_Int32_To_Int16 (b[127:96])
+	</operation>
+	<instruction name='packssdw' form='xmm, xmm'/>
+	<header>emmintrin.h</header>
+</intrinsic>
+<intrinsic tech='SSE2' vexEq='TRUE' rettype='__m128i' name='_mm_packus_epi16'>
+	<type>Integer</type>
+	<CPUID>SSE2</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname='a' type='__m128i'/>
+	<parameter varname='b' type='__m128i'/>
+	<description>Convert packed 16-bit integers from "a" and "b" to packed 8-bit integers using unsigned saturation, and store the results in "dst".</description>
+	<operation>
+dst[7:0] := Saturate_Int16_To_UnsignedInt8 (a[15:0])
+dst[15:8] := Saturate_Int16_To_UnsignedInt8 (a[31:16])
+dst[23:16] := Saturate_Int16_To_UnsignedInt8 (a[47:32])
+dst[31:24] := Saturate_Int16_To_UnsignedInt8 (a[63:48])
+dst[39:32] := Saturate_Int16_To_UnsignedInt8 (a[79:64])
+dst[47:40] := Saturate_Int16_To_UnsignedInt8 (a[95:80])
+dst[55:48] := Saturate_Int16_To_UnsignedInt8 (a[111:96])
+dst[63:56] := Saturate_Int16_To_UnsignedInt8 (a[127:112])
+dst[71:64] := Saturate_Int16_To_UnsignedInt8 (b[15:0])
+dst[79:72] := Saturate_Int16_To_UnsignedInt8 (b[31:16])
+dst[87:80] := Saturate_Int16_To_UnsignedInt8 (b[47:32])
+dst[95:88] := Saturate_Int16_To_UnsignedInt8 (b[63:48])
+dst[103:96] := Saturate_Int16_To_UnsignedInt8 (b[79:64])
+dst[111:104] := Saturate_Int16_To_UnsignedInt8 (b[95:80])
+dst[119:112] := Saturate_Int16_To_UnsignedInt8 (b[111:96])
+dst[127:120] := Saturate_Int16_To_UnsignedInt8 (b[127:112])
+	</operation>
+	<instruction name='packuswb' form='xmm, xmm'/>
+	<header>emmintrin.h</header>
+</intrinsic>
+<intrinsic tech='SSE2' vexEq='TRUE' dontShowZeroUnmodMsg='TRUE' rettype='int' name='_mm_extract_epi16'>
+	<type>Integer</type>
+	<CPUID>SSE2</CPUID>
+	<category>Swizzle</category>
+	<parameter varname='a' type='__m128i'/>
+	<parameter varname="imm8" type='int'/>
+	<description>Extract a 16-bit integer from "a", selected with "imm8", and store the result in the lower element of "dst".</description>
+	<operation>
+dst[15:0] := (a[127:0] &gt;&gt; (imm8[2:0] * 16))[15:0]
+dst[31:16] := 0
+	</operation>
+	<instruction name='pextrw' form='r32, xmm, imm'/>
+	<header>emmintrin.h</header>
+</intrinsic>
+<intrinsic tech='SSE2' vexEq='TRUE' rettype='__m128i' name='_mm_insert_epi16'>
+	<type>Integer</type>
+	<CPUID>SSE2</CPUID>
+	<category>Swizzle</category>
+	<parameter varname='a' type='__m128i'/>
+	<parameter varname='i' type='int'/>
+	<parameter varname="imm8" type='int'/>
+	<description>Copy "a" to "dst", and insert the 16-bit integer "i" into "dst" at the location specified by "imm8". </description>
+	<operation>
+dst[127:0] := a[127:0]
+sel := imm8[2:0]*16
+dst[sel+15:sel] := i[15:0]
+	</operation>
+	<instruction name='pinsrw' form='xmm, r32, imm'/>
+	<header>emmintrin.h</header>
+</intrinsic>
+<intrinsic tech='SSE2' vexEq='TRUE' dontShowZeroUnmodMsg='TRUE' rettype='int' name='_mm_movemask_epi8'>
+	<type>Integer</type>
+	<CPUID>SSE2</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname='a' type='__m128i'/>
+	<description>Create mask from the most significant bit of each 8-bit element in "a", and store the result in "dst".</description>
+	<operation>
+FOR j := 0 to 15
+	i := j*8
+	dst[j] := a[i+7]
+ENDFOR
+dst[MAX:16] := 0
+	</operation>
+	<instruction name='pmovmskb' form='r32, xmm'/>
+	<header>emmintrin.h</header>
+</intrinsic>
+<intrinsic tech='SSE2' vexEq='TRUE' rettype='__m128i' name='_mm_shuffle_epi32'>
+	<type>Integer</type>
+	<CPUID>SSE2</CPUID>
+	<category>Swizzle</category>
+	<parameter varname='a' type='__m128i'/>
+	<parameter varname="imm8" type='int'/>
+	<description>Shuffle 32-bit integers in "a" using the control in "imm8", and store the results in "dst".</description>
+	<operation>
+SELECT4(src, control){
+	CASE(control[1:0])
+	0:	tmp[31:0] := src[31:0]
+	1:	tmp[31:0] := src[63:32]
+	2:	tmp[31:0] := src[95:64]
+	3:	tmp[31:0] := src[127:96]
+	ESAC
+	RETURN tmp[31:0]
+}
+
+dst[31:0] := SELECT4(a[127:0], imm8[1:0])
+dst[63:32] := SELECT4(a[127:0], imm8[3:2])
+dst[95:64] := SELECT4(a[127:0], imm8[5:4])
+dst[127:96] := SELECT4(a[127:0], imm8[7:6])
+	</operation>
+	<instruction name='pshufd' form='xmm, xmm, imm'/>
+	<header>emmintrin.h</header>
+</intrinsic>
+<intrinsic tech='SSE2' vexEq='TRUE' rettype='__m128i' name='_mm_shufflehi_epi16'>
+	<type>Integer</type>
+	<CPUID>SSE2</CPUID>
+	<category>Swizzle</category>
+	<parameter varname='a' type='__m128i'/>
+	<parameter varname="imm8" type='int'/>
+	<description>Shuffle 16-bit integers in the high 64 bits of "a" using the control in "imm8". Store the results in the high 64 bits of "dst", with the low 64 bits being copied from "a" to "dst".</description>
+	<operation>
+dst[63:0] := a[63:0]
+dst[79:64] := (a >> (imm8[1:0] * 16))[79:64]
+dst[95:80] := (a >> (imm8[3:2] * 16))[79:64]
+dst[111:96] := (a >> (imm8[5:4] * 16))[79:64]
+dst[127:112] := (a >> (imm8[7:6] * 16))[79:64]
+	</operation>
+	<instruction name='pshufhw' form='xmm, xmm, imm'/>
+	<header>emmintrin.h</header>
+</intrinsic>
+<intrinsic tech='SSE2' vexEq='TRUE' rettype='__m128i' name='_mm_shufflelo_epi16'>
+	<type>Integer</type>
+	<CPUID>SSE2</CPUID>
+	<category>Swizzle</category>
+	<parameter varname='a' type='__m128i'/>
+	<parameter varname="imm8" type='int'/>
+	<description>Shuffle 16-bit integers in the low 64 bits of "a" using the control in "imm8". Store the results in the low 64 bits of "dst", with the high 64 bits being copied from "a" to "dst".</description>
+	<operation>
+dst[15:0] := (a >> (imm8[1:0] * 16))[15:0]
+dst[31:16] := (a >> (imm8[3:2] * 16))[15:0]
+dst[47:32] := (a >> (imm8[5:4] * 16))[15:0]
+dst[63:48] := (a >> (imm8[7:6] * 16))[15:0]
+dst[127:64] := a[127:64]
+	</operation>
+	<instruction name='pshuflw' form='xmm, xmm, imm'/>
+	<header>emmintrin.h</header>
+</intrinsic>
+<intrinsic tech='SSE2' vexEq='TRUE' rettype='__m128i' name='_mm_unpackhi_epi8'>
+	<type>Integer</type>
+	<CPUID>SSE2</CPUID>
+	<category>Swizzle</category>
+	<parameter varname='a' type='__m128i'/>
+	<parameter varname='b' type='__m128i'/>
+	<description>Unpack and interleave 8-bit integers from the high half of "a" and "b", and store the results in "dst". </description>
+	<operation>
+INTERLEAVE_HIGH_BYTES(src1[127:0], src2[127:0]){
+	dst[7:0] := src1[71:64] 
+	dst[15:8] := src2[71:64] 
+	dst[23:16] := src1[79:72] 
+	dst[31:24] := src2[79:72] 
+	dst[39:32] := src1[87:80] 
+	dst[47:40] := src2[87:80] 
+	dst[55:48] := src1[95:88] 
+	dst[63:56] := src2[95:88] 
+	dst[71:64] := src1[103:96] 
+	dst[79:72] := src2[103:96] 
+	dst[87:80] := src1[111:104] 
+	dst[95:88] := src2[111:104] 
+	dst[103:96] := src1[119:112] 
+	dst[111:104] := src2[119:112] 
+	dst[119:112] := src1[127:120] 
+	dst[127:120] := src2[127:120] 
+	RETURN dst[127:0]
+}	
+
+dst[127:0] := INTERLEAVE_HIGH_BYTES(a[127:0], b[127:0])
+	</operation>
+	<instruction name='punpckhbw' form='xmm, xmm'/>
+	<header>emmintrin.h</header>
+</intrinsic>
+<intrinsic tech='SSE2' vexEq='TRUE' rettype='__m128i' name='_mm_unpackhi_epi16'>
+	<type>Integer</type>
+	<CPUID>SSE2</CPUID>
+	<category>Swizzle</category>
+	<parameter varname='a' type='__m128i'/>
+	<parameter varname='b' type='__m128i'/>
+	<description>Unpack and interleave 16-bit integers from the high half of "a" and "b", and store the results in "dst". </description>
+	<operation>
+INTERLEAVE_HIGH_WORDS(src1[127:0], src2[127:0]){
+	dst[15:0] := src1[79:64]
+	dst[31:16] := src2[79:64] 
+	dst[47:32] := src1[95:80] 
+	dst[63:48] := src2[95:80] 
+	dst[79:64] := src1[111:96] 
+	dst[95:80] := src2[111:96] 
+	dst[111:96] := src1[127:112] 
+	dst[127:112] := src2[127:112] 
+	RETURN dst[127:0]
+}
+
+dst[127:0] := INTERLEAVE_HIGH_WORDS(a[127:0], b[127:0])
+	</operation>
+	<instruction name='punpckhwd' form='xmm, xmm'/>
+	<header>emmintrin.h</header>
+</intrinsic>
+<intrinsic tech='SSE2' vexEq='TRUE' rettype='__m128i' name='_mm_unpackhi_epi32'>
+	<type>Integer</type>
+	<CPUID>SSE2</CPUID>
+	<category>Swizzle</category>
+	<parameter varname='a' type='__m128i'/>
+	<parameter varname='b' type='__m128i'/>
+	<description>Unpack and interleave 32-bit integers from the high half of "a" and "b", and store the results in "dst". </description>
+	<operation>
+INTERLEAVE_HIGH_DWORDS(src1[127:0], src2[127:0]){
+	dst[31:0] := src1[95:64] 
+	dst[63:32] := src2[95:64] 
+	dst[95:64] := src1[127:96] 
+	dst[127:96] := src2[127:96] 
+	RETURN dst[127:0]
+}	
+
+dst[127:0] := INTERLEAVE_HIGH_DWORDS(a[127:0], b[127:0])
+	</operation>
+	<instruction name='punpckhdq' form='xmm, xmm'/>
+	<header>emmintrin.h</header>
+</intrinsic>
+<intrinsic tech='SSE2' vexEq='TRUE' rettype='__m128i' name='_mm_unpackhi_epi64'>
+	<type>Integer</type>
+	<CPUID>SSE2</CPUID>
+	<category>Swizzle</category>
+	<parameter varname='a' type='__m128i'/>
+	<parameter varname='b' type='__m128i'/>
+	<description>Unpack and interleave 64-bit integers from the high half of "a" and "b", and store the results in "dst". </description>
+	<operation>
+INTERLEAVE_HIGH_QWORDS(src1[127:0], src2[127:0]){
+	dst[63:0] := src1[127:64] 
+	dst[127:64] := src2[127:64] 
+	RETURN dst[127:0]
+}	
+
+dst[127:0] := INTERLEAVE_HIGH_QWORDS(a[127:0], b[127:0])
+	</operation>
+	<instruction name='punpckhqdq' form='xmm, xmm'/>
+	<header>emmintrin.h</header>
+</intrinsic>
+<intrinsic tech='SSE2' vexEq='TRUE' rettype='__m128i' name='_mm_unpacklo_epi8'>
+	<type>Integer</type>
+	<CPUID>SSE2</CPUID>
+	<category>Swizzle</category>
+	<parameter varname='a' type='__m128i'/>
+	<parameter varname='b' type='__m128i'/>
+	<description>Unpack and interleave 8-bit integers from the low half of "a" and "b", and store the results in "dst". </description>
+	<operation>
+INTERLEAVE_BYTES(src1[127:0], src2[127:0]){
+	dst[7:0] := src1[7:0] 
+	dst[15:8] := src2[7:0] 
+	dst[23:16] := src1[15:8] 
+	dst[31:24] := src2[15:8] 
+	dst[39:32] := src1[23:16] 
+	dst[47:40] := src2[23:16] 
+	dst[55:48] := src1[31:24] 
+	dst[63:56] := src2[31:24] 
+	dst[71:64] := src1[39:32]
+	dst[79:72] := src2[39:32] 
+	dst[87:80] := src1[47:40] 
+	dst[95:88] := src2[47:40] 
+	dst[103:96] := src1[55:48] 
+	dst[111:104] := src2[55:48] 
+	dst[119:112] := src1[63:56] 
+	dst[127:120] := src2[63:56] 
+	RETURN dst[127:0]
+}	
+
+dst[127:0] := INTERLEAVE_BYTES(a[127:0], b[127:0])
+	</operation>
+	<instruction name='punpcklbw' form='xmm, xmm'/>
+	<header>emmintrin.h</header>
+</intrinsic>
+<intrinsic tech='SSE2' vexEq='TRUE' rettype='__m128i' name='_mm_unpacklo_epi16'>
+	<type>Integer</type>
+	<CPUID>SSE2</CPUID>
+	<category>Swizzle</category>
+	<parameter varname='a' type='__m128i'/>
+	<parameter varname='b' type='__m128i'/>
+	<description>Unpack and interleave 16-bit integers from the low half of "a" and "b", and store the results in "dst". </description>
+	<operation>
+INTERLEAVE_WORDS(src1[127:0], src2[127:0]){
+	dst[15:0] := src1[15:0] 
+	dst[31:16] := src2[15:0] 
+	dst[47:32] := src1[31:16] 
+	dst[63:48] := src2[31:16] 
+	dst[79:64] := src1[47:32] 
+	dst[95:80] := src2[47:32] 
+	dst[111:96] := src1[63:48] 
+	dst[127:112] := src2[63:48] 
+	RETURN dst[127:0]
+}	
+
+dst[127:0] := INTERLEAVE_WORDS(a[127:0], b[127:0])
+	</operation>
+	<instruction name='punpcklwd' form='xmm, xmm'/>
+	<header>emmintrin.h</header>
+</intrinsic>
+<intrinsic tech='SSE2' vexEq='TRUE' rettype='__m128i' name='_mm_unpacklo_epi32'>
+	<type>Integer</type>
+	<CPUID>SSE2</CPUID>
+	<category>Swizzle</category>
+	<parameter varname='a' type='__m128i'/>
+	<parameter varname='b' type='__m128i'/>
+	<description>Unpack and interleave 32-bit integers from the low half of "a" and "b", and store the results in "dst". </description>
+	<operation>
+INTERLEAVE_DWORDS(src1[127:0], src2[127:0]){
+	dst[31:0] := src1[31:0] 
+	dst[63:32] := src2[31:0] 
+	dst[95:64] := src1[63:32] 
+	dst[127:96] := src2[63:32] 
+	RETURN dst[127:0]
+}	
+
+dst[127:0] := INTERLEAVE_DWORDS(a[127:0], b[127:0])
+	</operation>
+	<instruction name='punpckldq' form='xmm, xmm'/>
+	<header>emmintrin.h</header>
+</intrinsic>
+<intrinsic tech='SSE2' vexEq='TRUE' rettype='__m128i' name='_mm_unpacklo_epi64'>
+	<type>Integer</type>
+	<CPUID>SSE2</CPUID>
+	<category>Swizzle</category>
+	<parameter varname='a' type='__m128i'/>
+	<parameter varname='b' type='__m128i'/>
+	<description>Unpack and interleave 64-bit integers from the low half of "a" and "b", and store the results in "dst". </description>
+	<operation>
+INTERLEAVE_QWORDS(src1[127:0], src2[127:0]){
+	dst[63:0] := src1[63:0] 
+	dst[127:64] := src2[63:0] 
+	RETURN dst[127:0]
+}
+
+dst[127:0] := INTERLEAVE_QWORDS(a[127:0], b[127:0])
+	</operation>
+	<instruction name='punpcklqdq' form='xmm, xmm'/>
+	<header>emmintrin.h</header>
+</intrinsic>
+<intrinsic tech='SSE2' vexEq='TRUE' rettype='__m128d' name='_mm_add_sd'>
+	<type>Floating Point</type>
+	<CPUID>SSE2</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname='a' type='__m128d'/>
+	<parameter varname='b' type='__m128d'/>
+	<description>Add the lower double-precision (64-bit) floating-point element in "a" and "b", store the result in the lower element of "dst", and copy the upper element from "a" to the upper element of "dst". </description>
+	<operation>
+dst[63:0] := a[63:0] + b[63:0]
+dst[127:64] := a[127:64]
+	</operation>
+	<instruction name='addsd' form='xmm, xmm'/>
+	<header>emmintrin.h</header>
+</intrinsic>
+<intrinsic tech='SSE2' vexEq='TRUE' rettype='__m128d' name='_mm_add_pd'>
+	<type>Floating Point</type>
+	<CPUID>SSE2</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname='a' type='__m128d'/>
+	<parameter varname='b' type='__m128d'/>
+	<description>Add packed double-precision (64-bit) floating-point elements in "a" and "b", and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 1
+	i := j*64
+	dst[i+63:i] := a[i+63:i] + b[i+63:i]
+ENDFOR
+	</operation>
+	<instruction name='addpd' form='xmm, xmm'/>
+	<header>emmintrin.h</header>
+</intrinsic>
+<intrinsic tech='SSE2' vexEq='TRUE' rettype='__m128d' name='_mm_div_sd'>
+	<type>Floating Point</type>
+	<CPUID>SSE2</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname='a' type='__m128d'/>
+	<parameter varname='b' type='__m128d'/>
+	<description>Divide the lower double-precision (64-bit) floating-point element in "a" by the lower double-precision (64-bit) floating-point element in "b", store the result in the lower element of "dst", and copy the upper element from "a" to the upper element of "dst". </description>
+	<operation>
+dst[63:0] := a[63:0] / b[63:0]
+dst[127:64] := a[127:64]
+	</operation>
+	<instruction name='divsd' form='xmm, xmm'/>
+	<header>emmintrin.h</header>
+</intrinsic>
+<intrinsic tech='SSE2' vexEq='TRUE' rettype='__m128d' name='_mm_div_pd'>
+	<type>Floating Point</type>
+	<CPUID>SSE2</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname='a' type='__m128d'/>
+	<parameter varname='b' type='__m128d'/>
+	<description>Divide packed double-precision (64-bit) floating-point elements in "a" by packed elements in "b", and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 1
+	i := 64*j
+	dst[i+63:i] := a[i+63:i] / b[i+63:i]
+ENDFOR
+	</operation>
+	<instruction name='divpd' form='xmm, xmm'/>
+	<header>emmintrin.h</header>
+</intrinsic>
+<intrinsic tech='SSE2' vexEq='TRUE' rettype='__m128d' name='_mm_max_sd'>
+	<type>Floating Point</type>
+	<CPUID>SSE2</CPUID>
+	<category>Special Math Functions</category>
+	<parameter varname='a' type='__m128d'/>
+	<parameter varname='b' type='__m128d'/>
+	<description>Compare the lower double-precision (64-bit) floating-point elements in "a" and "b", store the maximum value in the lower element of "dst", and copy the upper element from "a" to the upper element of "dst".</description>
+	<operation>
+dst[63:0] := MAX(a[63:0], b[63:0])
+dst[127:64] := a[127:64]
+	</operation>
+	<instruction name='maxsd' form='xmm, xmm'/>
+	<header>emmintrin.h</header>
+</intrinsic>
+<intrinsic tech='SSE2' vexEq='TRUE' rettype='__m128d' name='_mm_max_pd'>
+	<type>Floating Point</type>
+	<CPUID>SSE2</CPUID>
+	<category>Special Math Functions</category>
+	<parameter varname='a' type='__m128d'/>
+	<parameter varname='b' type='__m128d'/>
+	<description>Compare packed double-precision (64-bit) floating-point elements in "a" and "b", and store packed maximum values in "dst".</description>
+	<operation>
+FOR j := 0 to 1
+	i := j*64
+	dst[i+63:i] := MAX(a[i+63:i], b[i+63:i])
+ENDFOR
+	</operation>
+	<instruction name='maxpd' form='xmm, xmm'/>
+	<header>emmintrin.h</header>
+</intrinsic>
+<intrinsic tech='SSE2' vexEq='TRUE' rettype='__m128d' name='_mm_min_sd'>
+	<type>Floating Point</type>
+	<CPUID>SSE2</CPUID>
+	<category>Special Math Functions</category>
+	<parameter varname='a' type='__m128d'/>
+	<parameter varname='b' type='__m128d'/>
+	<description>Compare the lower double-precision (64-bit) floating-point elements in "a" and "b", store the minimum value in the lower element of "dst", and copy the upper element from "a" to the upper element of "dst".</description>
+	<operation>
+dst[63:0] := MIN(a[63:0], b[63:0])
+dst[127:64] := a[127:64]
+	</operation>
+	<instruction name='minsd' form='xmm, xmm'/>
+	<header>emmintrin.h</header>
+</intrinsic>
+<intrinsic tech='SSE2' vexEq='TRUE' rettype='__m128d' name='_mm_min_pd'>
+	<type>Floating Point</type>
+	<CPUID>SSE2</CPUID>
+	<category>Special Math Functions</category>
+	<parameter varname='a' type='__m128d'/>
+	<parameter varname='b' type='__m128d'/>
+	<description>Compare packed double-precision (64-bit) floating-point elements in "a" and "b", and store packed minimum values in "dst".
+	</description>
+	<operation>
+FOR j := 0 to 1
+	i := j*64
+	dst[i+63:i] := MIN(a[i+63:i], b[i+63:i])
+ENDFOR
+	</operation>
+	<instruction name='minpd' form='xmm, xmm'/>
+	<header>emmintrin.h</header>
+</intrinsic>
+<intrinsic tech='SSE2' vexEq='TRUE' rettype='__m128d' name='_mm_mul_sd'>
+	<type>Floating Point</type>
+	<CPUID>SSE2</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname='a' type='__m128d'/>
+	<parameter varname='b' type='__m128d'/>
+	<description>Multiply the lower double-precision (64-bit) floating-point element in "a" and "b", store the result in the lower element of "dst", and copy the upper element from "a" to the upper element of "dst".</description>
+	<operation>
+dst[63:0] := a[63:0] * b[63:0]
+dst[127:64] := a[127:64]
+	</operation>
+	<instruction name='mulsd' form='xmm, xmm'/>
+	<header>emmintrin.h</header>
+</intrinsic>
+<intrinsic tech='SSE2' vexEq='TRUE' rettype='__m128d' name='_mm_mul_pd'>
+	<type>Floating Point</type>
+	<CPUID>SSE2</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname='a' type='__m128d'/>
+	<parameter varname='b' type='__m128d'/>
+	<description>Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", and store the results in "dst".
+	</description>
+	<operation>
+FOR j := 0 to 1
+	i := j*64
+	dst[i+63:i] := a[i+63:i] * b[i+63:i]
+ENDFOR
+	</operation>
+	<instruction name='mulpd' form='xmm, xmm'/>
+	<header>emmintrin.h</header>
+</intrinsic>
+<intrinsic tech='SSE2' vexEq='TRUE' rettype='__m128d' name='_mm_sqrt_sd'>
+	<type>Floating Point</type>
+	<CPUID>SSE2</CPUID>
+	<category>Elementary Math Functions</category>
+	<parameter varname='a' type='__m128d'/>
+	<parameter varname='b' type='__m128d'/>
+	<description>Compute the square root of the lower double-precision (64-bit) floating-point element in "b", store the result in the lower element of "dst", and copy the upper element from "a" to the upper element of "dst".</description>
+	<operation>
+dst[63:0] := SQRT(b[63:0])
+dst[127:64] := a[127:64]
+	</operation>
+	<instruction name='sqrtsd' form='xmm, xmm'/>
+	<header>emmintrin.h</header>
+</intrinsic>
+<intrinsic tech='SSE2' vexEq='TRUE' rettype='__m128d' name='_mm_sqrt_pd'>
+	<type>Floating Point</type>
+	<CPUID>SSE2</CPUID>
+	<category>Elementary Math Functions</category>
+	<parameter varname='a' type='__m128d'/>
+	<description>Compute the square root of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 1
+	i := j*64
+	dst[i+63:i] := SQRT(a[i+63:i])
+ENDFOR
+	</operation>
+	<instruction name='sqrtpd' form='xmm, xmm'/>
+	<header>emmintrin.h</header>
+</intrinsic>
+<intrinsic tech='SSE2' vexEq='TRUE' rettype='__m128d' name='_mm_sub_sd'>
+	<type>Floating Point</type>
+	<CPUID>SSE2</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname='a' type='__m128d'/>
+	<parameter varname='b' type='__m128d'/>
+	<description>Subtract the lower double-precision (64-bit) floating-point element in "b" from the lower double-precision (64-bit) floating-point element in "a", store the result in the lower element of "dst", and copy the upper element from "a" to the upper element of "dst".</description>
+	<operation>
+dst[63:0] := a[63:0] - b[63:0]
+dst[127:64] := a[127:64]
+	</operation>
+	<instruction name='subsd' form='xmm, xmm'/>
+	<header>emmintrin.h</header>
+</intrinsic>
+<intrinsic tech='SSE2' vexEq='TRUE' rettype='__m128d' name='_mm_sub_pd'>
+	<type>Floating Point</type>
+	<CPUID>SSE2</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname='a' type='__m128d'/>
+	<parameter varname='b' type='__m128d'/>
+	<description>Subtract packed double-precision (64-bit) floating-point elements in "b" from packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 1
+	i := j*64
+	dst[i+63:i] := a[i+63:i] - b[i+63:i]
+ENDFOR
+	</operation>
+	<instruction name='subpd' form='xmm, xmm'/>
+	<header>emmintrin.h</header>
+</intrinsic>
+<intrinsic tech='SSE2' vexEq='TRUE' rettype='__m128d' name='_mm_and_pd'>
+	<type>Floating Point</type>
+	<CPUID>SSE2</CPUID>
+	<category>Logical</category>
+	<parameter varname='a' type='__m128d'/>
+	<parameter varname='b' type='__m128d'/>
+	<description>Compute the bitwise AND of packed double-precision (64-bit) floating-point elements in "a" and "b", and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 1
+	i := j*64
+	dst[i+63:i] := (a[i+63:i] AND b[i+63:i])
+ENDFOR
+	</operation>
+	<instruction name='andpd' form='xmm, xmm'/>
+	<header>emmintrin.h</header>
+</intrinsic>
+<intrinsic tech='SSE2' vexEq='TRUE' rettype='__m128d' name='_mm_andnot_pd'>
+	<type>Floating Point</type>
+	<CPUID>SSE2</CPUID>
+	<category>Logical</category>
+	<parameter varname='a' type='__m128d'/>
+	<parameter varname='b' type='__m128d'/>
+	<description>Compute the bitwise NOT of packed double-precision (64-bit) floating-point elements in "a" and then AND with "b", and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 1
+	i := j*64
+	dst[i+63:i] := ((NOT a[i+63:i]) AND b[i+63:i])
+ENDFOR
+	</operation>
+	<instruction name='andnpd' form='xmm, xmm'/>
+	<header>emmintrin.h</header>
+</intrinsic>
+<intrinsic tech='SSE2' vexEq='TRUE' rettype='__m128d' name='_mm_or_pd'>
+	<type>Floating Point</type>
+	<CPUID>SSE2</CPUID>
+	<category>Logical</category>
+	<parameter varname='a' type='__m128d'/>
+	<parameter varname='b' type='__m128d'/>
+	<description>Compute the bitwise OR of packed double-precision (64-bit) floating-point elements in "a" and "b", and store the results in "dst".
+	</description>
+	<operation>
+FOR j := 0 to 1
+	i := j*64
+	dst[i+63:i] := a[i+63:i] OR b[i+63:i]
+ENDFOR
+	</operation>
+	<instruction name='orpd' form='xmm, xmm'/>
+	<header>emmintrin.h</header>
+</intrinsic>
+<intrinsic tech='SSE2' vexEq='TRUE' rettype='__m128d' name='_mm_xor_pd'>
+	<type>Floating Point</type>
+	<CPUID>SSE2</CPUID>
+	<category>Logical</category>
+	<parameter varname='a' type='__m128d'/>
+	<parameter varname='b' type='__m128d'/>
+	<description>Compute the bitwise XOR of packed double-precision (64-bit) floating-point elements in "a" and "b", and store the results in "dst".
+	</description>
+	<operation>
+FOR j := 0 to 1
+	i := j*64
+	dst[i+63:i] := a[i+63:i] XOR b[i+63:i]
+ENDFOR
+	</operation>
+	<instruction name='xorpd' form='xmm, xmm'/>
+	<header>emmintrin.h</header>
+</intrinsic>
+<intrinsic tech='SSE2' vexEq='TRUE' rettype='__m128d' name='_mm_cmpeq_sd'>
+	<type>Floating Point</type>
+	<CPUID>SSE2</CPUID>
+	<category>Compare</category>
+	<parameter varname='a' type='__m128d'/>
+	<parameter varname='b' type='__m128d'/>
+	<description>Compare the lower double-precision (64-bit) floating-point elements in "a" and "b" for equality, store the result in the lower element of "dst", and copy the upper element from "a" to the upper element of "dst".</description>
+	<operation>
+dst[63:0] := (a[63:0] == b[63:0]) ? 0xFFFFFFFFFFFFFFFF : 0
+dst[127:64] := a[127:64]
+	</operation>
+	<instruction name='cmpsd' form='xmm, xmm, xmm, imm'/>
+	<header>emmintrin.h</header>
+</intrinsic>
+<intrinsic tech='SSE2' vexEq='TRUE' rettype='__m128d' name='_mm_cmplt_sd'>
+	<type>Floating Point</type>
+	<CPUID>SSE2</CPUID>
+	<category>Compare</category>
+	<parameter varname='a' type='__m128d'/>
+	<parameter varname='b' type='__m128d'/>
+	<description>Compare the lower double-precision (64-bit) floating-point elements in "a" and "b" for less-than, store the result in the lower element of "dst", and copy the upper element from "a" to the upper element of "dst".</description>
+	<operation>
+dst[63:0] := (a[63:0] &lt; b[63:0]) ? 0xFFFFFFFFFFFFFFFF : 0
+dst[127:64] := a[127:64]
+	</operation>
+	<instruction name='cmpsd' form='xmm, xmm, xmm, imm'/>
+	<header>emmintrin.h</header>
+</intrinsic>
+<intrinsic tech='SSE2' vexEq='TRUE' rettype='__m128d' name='_mm_cmple_sd'>
+	<type>Floating Point</type>
+	<CPUID>SSE2</CPUID>
+	<category>Compare</category>
+	<parameter varname='a' type='__m128d'/>
+	<parameter varname='b' type='__m128d'/>
+	<description>Compare the lower double-precision (64-bit) floating-point elements in "a" and "b" for less-than-or-equal, store the result in the lower element of "dst", and copy the upper element from "a" to the upper element of "dst".</description>
+	<operation>
+dst[63:0] := (a[63:0] &lt;= b[63:0]) ? 0xFFFFFFFFFFFFFFFF : 0
+dst[127:64] := a[127:64]
+	</operation>
+	<instruction name='cmpsd' form='xmm, xmm, xmm, imm'/>
+	<header>emmintrin.h</header>
+</intrinsic>
+<intrinsic tech='SSE2' vexEq='TRUE' rettype='__m128d' name='_mm_cmpgt_sd'>
+	<type>Floating Point</type>
+	<CPUID>SSE2</CPUID>
+	<category>Compare</category>
+	<parameter varname='a' type='__m128d'/>
+	<parameter varname='b' type='__m128d'/>
+	<description>Compare the lower double-precision (64-bit) floating-point elements in "a" and "b" for greater-than, store the result in the lower element of "dst", and copy the upper element from "a" to the upper element of "dst".</description>
+	<operation>
+dst[63:0] := (a[63:0] &gt; b[63:0]) ? 0xFFFFFFFFFFFFFFFF : 0
+dst[127:64] := a[127:64]
+	</operation>
+	<instruction name='cmpsd' form='xmm, xmm, xmm, imm'/>
+	<header>emmintrin.h</header>
+</intrinsic>
+<intrinsic tech='SSE2' vexEq='TRUE' rettype='__m128d' name='_mm_cmpge_sd'>
+	<type>Floating Point</type>
+	<CPUID>SSE2</CPUID>
+	<category>Compare</category>
+	<parameter varname='a' type='__m128d'/>
+	<parameter varname='b' type='__m128d'/>
+	<description>Compare the lower double-precision (64-bit) floating-point elements in "a" and "b" for greater-than-or-equal, store the result in the lower element of "dst", and copy the upper element from "a" to the upper element of "dst".</description>
+	<operation>
+dst[63:0] := (a[63:0] &gt;= b[63:0]) ? 0xFFFFFFFFFFFFFFFF : 0
+dst[127:64] := a[127:64]
+	</operation>
+	<instruction name='cmpsd' form='xmm, xmm, xmm, imm'/>
+	<header>emmintrin.h</header>
+</intrinsic>
+<intrinsic tech='SSE2' vexEq='TRUE' rettype='__m128d' name='_mm_cmpord_sd'>
+	<type>Floating Point</type>
+	<CPUID>SSE2</CPUID>
+	<category>Compare</category>
+	<parameter varname='a' type='__m128d'/>
+	<parameter varname='b' type='__m128d'/>
+	<description>Compare the lower double-precision (64-bit) floating-point elements in "a" and "b" to see if neither is NaN, store the result in the lower element of "dst", and copy the upper element from "a" to the upper element of "dst".</description>
+	<operation>
+dst[63:0] := (a[63:0] != NaN AND b[63:0] != NaN) ? 0xFFFFFFFFFFFFFFFF : 0
+dst[127:64] := a[127:64]
+	</operation>
+	<instruction name='cmpsd' form='xmm, xmm, xmm, imm'/>
+	<header>emmintrin.h</header>
+</intrinsic>
+<intrinsic tech='SSE2' vexEq='TRUE' rettype='__m128d' name='_mm_cmpunord_sd'>
+	<type>Floating Point</type>
+	<CPUID>SSE2</CPUID>
+	<category>Compare</category>
+	<parameter varname='a' type='__m128d'/>
+	<parameter varname='b' type='__m128d'/>
+	<description>Compare the lower double-precision (64-bit) floating-point elements in "a" and "b" to see if either is NaN, store the result in the lower element of "dst", and copy the upper element from "a" to the upper element of "dst".</description>
+	<operation>
+dst[63:0] := (a[63:0] == NaN OR b[63:0] == NaN) ? 0xFFFFFFFFFFFFFFFF : 0
+dst[127:64] := a[127:64]
+	</operation>
+	<instruction name='cmpsd' form='xmm, xmm, xmm, imm'/>
+	<header>emmintrin.h</header>
+</intrinsic>
+<intrinsic tech='SSE2' vexEq='TRUE' rettype='__m128d' name='_mm_cmpneq_sd'>
+	<type>Floating Point</type>
+	<CPUID>SSE2</CPUID>
+	<category>Compare</category>
+	<parameter varname='a' type='__m128d'/>
+	<parameter varname='b' type='__m128d'/>
+	<description>Compare the lower double-precision (64-bit) floating-point elements in "a" and "b" for not-equal, store the result in the lower element of "dst", and copy the upper element from "a" to the upper element of "dst".</description>
+	<operation>
+dst[63:0] := (a[63:0] != b[63:0]) ? 0xFFFFFFFFFFFFFFFF : 0
+dst[127:64] := a[127:64]
+	</operation>
+	<instruction name='cmpsd' form='xmm, xmm, xmm, imm'/>
+	<header>emmintrin.h</header>
+</intrinsic>
+<intrinsic tech='SSE2' vexEq='TRUE' rettype='__m128d' name='_mm_cmpnlt_sd'>
+	<type>Floating Point</type>
+	<CPUID>SSE2</CPUID>
+	<category>Compare</category>
+	<parameter varname='a' type='__m128d'/>
+	<parameter varname='b' type='__m128d'/>
+	<description>Compare the lower double-precision (64-bit) floating-point elements in "a" and "b" for not-less-than, store the result in the lower element of "dst", and copy the upper element from "a" to the upper element of "dst".</description>
+	<operation>
+dst[63:0] := !(a[63:0] &lt; b[63:0]) ? 0xFFFFFFFFFFFFFFFF : 0
+dst[127:64] := a[127:64]
+	</operation>
+	<instruction name='cmpsd' form='xmm, xmm, xmm, imm'/>
+	<header>emmintrin.h</header>
+</intrinsic>
+<intrinsic tech='SSE2' vexEq='TRUE' rettype='__m128d' name='_mm_cmpnle_sd'>
+	<type>Floating Point</type>
+	<CPUID>SSE2</CPUID>
+	<category>Compare</category>
+	<parameter varname='a' type='__m128d'/>
+	<parameter varname='b' type='__m128d'/>
+	<description>Compare the lower double-precision (64-bit) floating-point elements in "a" and "b" for not-less-than-or-equal, store the result in the lower element of "dst", and copy the upper element from "a" to the upper element of "dst".</description>
+	<operation>
+dst[63:0] := !(a[63:0] &lt;= b[63:0]) ? 0xFFFFFFFFFFFFFFFF : 0
+dst[127:64] := a[127:64]
+	</operation>
+	<instruction name='cmpsd' form='xmm, xmm, xmm, imm'/>
+	<header>emmintrin.h</header>
+</intrinsic>
+<intrinsic tech='SSE2' vexEq='TRUE' rettype='__m128d' name='_mm_cmpngt_sd'>
+	<type>Floating Point</type>
+	<CPUID>SSE2</CPUID>
+	<category>Compare</category>
+	<parameter varname='a' type='__m128d'/>
+	<parameter varname='b' type='__m128d'/>
+	<description>Compare the lower double-precision (64-bit) floating-point elements in "a" and "b" for not-greater-than, store the result in the lower element of "dst", and copy the upper element from "a" to the upper element of "dst".</description>
+	<operation>
+dst[63:0] := !(a[63:0] &gt; b[63:0]) ? 0xFFFFFFFFFFFFFFFF : 0
+dst[127:64] := a[127:64]
+	</operation>
+	<instruction name='cmpsd' form='xmm, xmm, xmm, imm'/>
+	<header>emmintrin.h</header>
+</intrinsic>
+<intrinsic tech='SSE2' vexEq='TRUE' rettype='__m128d' name='_mm_cmpnge_sd'>
+	<type>Floating Point</type>
+	<CPUID>SSE2</CPUID>
+	<category>Compare</category>
+	<parameter varname='a' type='__m128d'/>
+	<parameter varname='b' type='__m128d'/>
+	<description>Compare the lower double-precision (64-bit) floating-point elements in "a" and "b" for not-greater-than-or-equal, store the result in the lower element of "dst", and copy the upper element from "a" to the upper element of "dst".</description>
+	<operation>
+dst[63:0] := !(a[63:0] &gt;= b[63:0]) ? 0xFFFFFFFFFFFFFFFF : 0
+dst[127:64] := a[127:64]
+	</operation>
+	<instruction name='cmpsd' form='xmm, xmm, xmm, imm'/>
+	<header>emmintrin.h</header>
+</intrinsic>
+<intrinsic tech='SSE2' vexEq='TRUE' rettype='__m128d' name='_mm_cmpeq_pd'>
+	<type>Floating Point</type>
+	<CPUID>SSE2</CPUID>
+	<category>Compare</category>
+	<parameter varname='a' type='__m128d'/>
+	<parameter varname='b' type='__m128d'/>
+	<description>Compare packed double-precision (64-bit) floating-point elements in "a" and "b" for equality, and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 1
+	i := j*64
+	dst[i+63:i] := (a[i+63:i] == b[i+63:i]) ? 0xFFFFFFFFFFFFFFFF : 0
+ENDFOR	
+	</operation>
+	<instruction name='cmppd' form='xmm, xmm, imm'/>
+	<header>emmintrin.h</header>
+</intrinsic>
+<intrinsic tech='SSE2' vexEq='TRUE' rettype='__m128d' name='_mm_cmplt_pd'>
+	<type>Floating Point</type>
+	<CPUID>SSE2</CPUID>
+	<category>Compare</category>
+	<parameter varname='a' type='__m128d'/>
+	<parameter varname='b' type='__m128d'/>
+	<description>Compare packed double-precision (64-bit) floating-point elements in "a" and "b" for less-than, and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 1
+	i := j*64
+	dst[i+63:i] := (a[i+63:i] &lt; b[i+63:i]) ? 0xFFFFFFFFFFFFFFFF : 0
+ENDFOR	
+	</operation>
+	<instruction name='cmppd' form='xmm, xmm, imm'/>
+	<header>emmintrin.h</header>
+</intrinsic>
+<intrinsic tech='SSE2' vexEq='TRUE' rettype='__m128d' name='_mm_cmple_pd'>
+	<type>Floating Point</type>
+	<CPUID>SSE2</CPUID>
+	<category>Compare</category>
+	<parameter varname='a' type='__m128d'/>
+	<parameter varname='b' type='__m128d'/>
+	<description>Compare packed double-precision (64-bit) floating-point elements in "a" and "b" for less-than-or-equal, and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 1
+	i := j*64
+	dst[i+63:i] := (a[i+63:i] &lt;= b[i+63:i]) ? 0xFFFFFFFFFFFFFFFF : 0
+ENDFOR	
+	</operation>
+	<instruction name='cmppd' form='xmm, xmm, imm'/>
+	<header>emmintrin.h</header>
+</intrinsic>
+<intrinsic tech='SSE2' vexEq='TRUE' rettype='__m128d' name='_mm_cmpgt_pd'>
+	<type>Floating Point</type>
+	<CPUID>SSE2</CPUID>
+	<category>Compare</category>
+	<parameter varname='a' type='__m128d'/>
+	<parameter varname='b' type='__m128d'/>
+	<description>Compare packed double-precision (64-bit) floating-point elements in "a" and "b" for greater-than, and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 1
+	i := j*64
+	dst[i+63:i] := (a[i+63:i] &gt; b[i+63:i]) ? 0xFFFFFFFFFFFFFFFF : 0
+ENDFOR	
+	</operation>
+	<instruction name='cmppd' form='xmm, xmm, imm'/>
+	<header>emmintrin.h</header>
+</intrinsic>
+<intrinsic tech='SSE2' vexEq='TRUE' rettype='__m128d' name='_mm_cmpge_pd'>
+	<type>Floating Point</type>
+	<CPUID>SSE2</CPUID>
+	<category>Compare</category>
+	<parameter varname='a' type='__m128d'/>
+	<parameter varname='b' type='__m128d'/>
+	<description>Compare packed double-precision (64-bit) floating-point elements in "a" and "b" for greater-than-or-equal, and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 1
+	i := j*64
+	dst[i+63:i] := (a[i+63:i] &gt;= b[i+63:i]) ? 0xFFFFFFFFFFFFFFFF : 0
+ENDFOR	
+	</operation>
+	<instruction name='cmppd' form='xmm, xmm, imm'/>
+	<header>emmintrin.h</header>
+</intrinsic>
+<intrinsic tech='SSE2' vexEq='TRUE' rettype='__m128d' name='_mm_cmpord_pd'>
+	<type>Floating Point</type>
+	<CPUID>SSE2</CPUID>
+	<category>Compare</category>
+	<parameter varname='a' type='__m128d'/>
+	<parameter varname='b' type='__m128d'/>
+	<description>Compare packed double-precision (64-bit) floating-point elements in "a" and "b" to see if neither is NaN, and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 1
+	i := j*64
+	dst[i+63:i] := (a[i+63:i] != NaN AND b[i+63:i] != NaN) ? 0xFFFFFFFFFFFFFFFF : 0
+ENDFOR	
+	</operation>
+	<instruction name='cmppd' form='xmm, xmm, imm'/>
+	<header>emmintrin.h</header>
+</intrinsic>
+<intrinsic tech='SSE2' vexEq='TRUE' rettype='__m128d' name='_mm_cmpunord_pd'>
+	<type>Floating Point</type>
+	<CPUID>SSE2</CPUID>
+	<category>Compare</category>
+	<parameter varname='a' type='__m128d'/>
+	<parameter varname='b' type='__m128d'/>
+	<description>Compare packed double-precision (64-bit) floating-point elements in "a" and "b" to see if either is NaN, and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 1
+	i := j*64
+	dst[i+63:i] := (a[i+63:i] == NaN OR b[i+63:i] == NaN) ? 0xFFFFFFFFFFFFFFFF : 0
+ENDFOR	
+	</operation>
+	<instruction name='cmppd' form='xmm, xmm, imm'/>
+	<header>emmintrin.h</header>
+</intrinsic>
+<intrinsic tech='SSE2' vexEq='TRUE' rettype='__m128d' name='_mm_cmpneq_pd'>
+	<type>Floating Point</type>
+	<CPUID>SSE2</CPUID>
+	<category>Compare</category>
+	<parameter varname='a' type='__m128d'/>
+	<parameter varname='b' type='__m128d'/>
+	<description>Compare packed double-precision (64-bit) floating-point elements in "a" and "b" for not-equal, and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 1
+	i := j*64
+	dst[i+63:i] := (a[i+63:i] != b[i+63:i]) ? 0xFFFFFFFFFFFFFFFF : 0
+ENDFOR	
+	</operation>
+	<instruction name='cmppd' form='xmm, xmm, imm'/>
+	<header>emmintrin.h</header>
+</intrinsic>
+<intrinsic tech='SSE2' vexEq='TRUE' rettype='__m128d' name='_mm_cmpnlt_pd'>
+	<type>Floating Point</type>
+	<CPUID>SSE2</CPUID>
+	<category>Compare</category>
+	<parameter varname='a' type='__m128d'/>
+	<parameter varname='b' type='__m128d'/>
+	<description>Compare packed double-precision (64-bit) floating-point elements in "a" and "b" for not-less-than, and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 1
+	i := j*64
+	dst[i+63:i] := !(a[i+63:i] &lt; b[i+63:i]) ? 0xFFFFFFFFFFFFFFFF : 0
+ENDFOR	
+	</operation>
+	<instruction name='cmppd' form='xmm, xmm, imm'/>
+	<header>emmintrin.h</header>
+</intrinsic>
+<intrinsic tech='SSE2' vexEq='TRUE' rettype='__m128d' name='_mm_cmpnle_pd'>
+	<type>Floating Point</type>
+	<CPUID>SSE2</CPUID>
+	<category>Compare</category>
+	<parameter varname='a' type='__m128d'/>
+	<parameter varname='b' type='__m128d'/>
+	<description>Compare packed double-precision (64-bit) floating-point elements in "a" and "b" for not-less-than-or-equal, and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 1
+	i := j*64
+	dst[i+63:i] := !(a[i+63:i] &lt;= b[i+63:i]) ? 0xFFFFFFFFFFFFFFFF : 0
+ENDFOR	
+	</operation>
+	<instruction name='cmppd' form='xmm, xmm, imm'/>
+	<header>emmintrin.h</header>
+</intrinsic>
+<intrinsic tech='SSE2' vexEq='TRUE' rettype='__m128d' name='_mm_cmpngt_pd'>
+	<type>Floating Point</type>
+	<CPUID>SSE2</CPUID>
+	<category>Compare</category>
+	<parameter varname='a' type='__m128d'/>
+	<parameter varname='b' type='__m128d'/>
+	<description>Compare packed double-precision (64-bit) floating-point elements in "a" and "b" for not-greater-than, and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 1
+	i := j*64
+	dst[i+63:i] := !(a[i+63:i] &gt; b[i+63:i]) ? 0xFFFFFFFFFFFFFFFF : 0
+ENDFOR	
+	</operation>
+	<instruction name='cmppd' form='xmm, xmm, imm'/>
+	<header>emmintrin.h</header>
+</intrinsic>
+<intrinsic tech='SSE2' vexEq='TRUE' rettype='__m128d' name='_mm_cmpnge_pd'>
+	<type>Floating Point</type>
+	<CPUID>SSE2</CPUID>
+	<category>Compare</category>
+	<parameter varname='a' type='__m128d'/>
+	<parameter varname='b' type='__m128d'/>
+	<description>Compare packed double-precision (64-bit) floating-point elements in "a" and "b" for not-greater-than-or-equal, and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 1
+	i := j*64
+	dst[i+63:i] := !(a[i+63:i] &gt;= b[i+63:i]) ? 0xFFFFFFFFFFFFFFFF : 0
+ENDFOR	
+	</operation>
+	<instruction name='cmppd' form='xmm, xmm, imm'/>
+	<header>emmintrin.h</header>
+</intrinsic>
+<intrinsic tech='SSE2' vexEq='TRUE' dontShowZeroUnmodMsg='TRUE' rettype='int' name='_mm_comieq_sd'>
+	<type>Floating Point</type>
+	<CPUID>SSE2</CPUID>
+	<category>Compare</category>
+	<parameter varname='a' type='__m128d'/>
+	<parameter varname='b' type='__m128d'/>
+	<description>Compare the lower double-precision (64-bit) floating-point element in "a" and "b" for equality, and return the boolean result (0 or 1).</description>
+	<operation>
+RETURN ( a[63:0] == b[63:0] ) ? 1 : 0
+	</operation>
+	<instruction name='comisd' form='xmm, xmm'/>
+	<header>emmintrin.h</header>
+</intrinsic>
+<intrinsic tech='SSE2' vexEq='TRUE' dontShowZeroUnmodMsg='TRUE' rettype='int' name='_mm_comilt_sd'>
+	<type>Floating Point</type>
+	<CPUID>SSE2</CPUID>
+	<category>Compare</category>
+	<parameter varname='a' type='__m128d'/>
+	<parameter varname='b' type='__m128d'/>
+	<description>Compare the lower double-precision (64-bit) floating-point element in "a" and "b" for less-than, and return the boolean result (0 or 1).</description>
+	<operation>
+RETURN ( a[63:0] &lt; b[63:0] ) ? 1 : 0
+	</operation>
+	<instruction name='comisd' form='xmm, xmm'/>
+	<header>emmintrin.h</header>
+</intrinsic>
+<intrinsic tech='SSE2' vexEq='TRUE' dontShowZeroUnmodMsg='TRUE' rettype='int' name='_mm_comile_sd'>
+	<type>Floating Point</type>
+	<CPUID>SSE2</CPUID>
+	<category>Compare</category>
+	<parameter varname='a' type='__m128d'/>
+	<parameter varname='b' type='__m128d'/>
+	<description>Compare the lower double-precision (64-bit) floating-point element in "a" and "b" for less-than-or-equal, and return the boolean result (0 or 1).</description>
+	<operation>
+RETURN ( a[63:0] &lt;= b[63:0] ) ? 1 : 0
+	</operation>
+	<instruction name='comisd' form='xmm, xmm'/>
+	<header>emmintrin.h</header>
+</intrinsic>
+<intrinsic tech='SSE2' vexEq='TRUE' dontShowZeroUnmodMsg='TRUE' rettype='int' name='_mm_comigt_sd'>
+	<type>Floating Point</type>
+	<CPUID>SSE2</CPUID>
+	<category>Compare</category>
+	<parameter varname='a' type='__m128d'/>
+	<parameter varname='b' type='__m128d'/>
+	<description>Compare the lower double-precision (64-bit) floating-point element in "a" and "b" for greater-than, and return the boolean result (0 or 1).</description>
+	<operation>
+RETURN ( a[63:0] &gt; b[63:0] ) ? 1 : 0
+	</operation>
+	<instruction name='comisd' form='xmm, xmm'/>
+	<header>emmintrin.h</header>
+</intrinsic>
+<intrinsic tech='SSE2' vexEq='TRUE' dontShowZeroUnmodMsg='TRUE' rettype='int' name='_mm_comige_sd'>
+	<type>Floating Point</type>
+	<CPUID>SSE2</CPUID>
+	<category>Compare</category>
+	<parameter varname='a' type='__m128d'/>
+	<parameter varname='b' type='__m128d'/>
+	<description>Compare the lower double-precision (64-bit) floating-point element in "a" and "b" for greater-than-or-equal, and return the boolean result (0 or 1).</description>
+	<operation>
+RETURN ( a[63:0] &gt;= b[63:0] ) ? 1 : 0
+	</operation>
+	<instruction name='comisd' form='xmm, xmm'/>
+	<header>emmintrin.h</header>
+</intrinsic>
+<intrinsic tech='SSE2' vexEq='TRUE' dontShowZeroUnmodMsg='TRUE' rettype='int' name='_mm_comineq_sd'>
+	<type>Floating Point</type>
+	<CPUID>SSE2</CPUID>
+	<category>Compare</category>
+	<parameter varname='a' type='__m128d'/>
+	<parameter varname='b' type='__m128d'/>
+	<description>Compare the lower double-precision (64-bit) floating-point element in "a" and "b" for not-equal, and return the boolean result (0 or 1).</description>
+	<operation>
+RETURN ( a[63:0] != b[63:0] ) ? 1 : 0
+	</operation>
+	<instruction name='comisd' form='xmm, xmm'/>
+	<header>emmintrin.h</header>
+</intrinsic>
+<intrinsic tech='SSE2' vexEq='TRUE' dontShowZeroUnmodMsg='TRUE' rettype='int' name='_mm_ucomieq_sd'>
+	<type>Floating Point</type>
+	<CPUID>SSE2</CPUID>
+	<category>Compare</category>
+	<parameter varname='a' type='__m128d'/>
+	<parameter varname='b' type='__m128d'/>
+	<description>Compare the lower double-precision (64-bit) floating-point element in "a" and "b" for equality, and return the boolean result (0 or 1). This instruction will not signal an exception for QNaNs.</description>
+	<operation>
+RETURN ( a[63:0] == b[63:0] ) ? 1 : 0
+	</operation>
+	<instruction name='ucomisd' form='xmm, xmm'/>
+	<header>emmintrin.h</header>
+</intrinsic>
+<intrinsic tech='SSE2' vexEq='TRUE' dontShowZeroUnmodMsg='TRUE' rettype='int' name='_mm_ucomilt_sd'>
+	<type>Floating Point</type>
+	<CPUID>SSE2</CPUID>
+	<category>Compare</category>
+	<parameter varname='a' type='__m128d'/>
+	<parameter varname='b' type='__m128d'/>
+	<description>Compare the lower double-precision (64-bit) floating-point element in "a" and "b" for less-than, and return the boolean result (0 or 1). This instruction will not signal an exception for QNaNs.</description>
+	<operation>
+RETURN ( a[63:0] &lt; b[63:0] ) ? 1 : 0
+	</operation>
+	<instruction name='ucomisd' form='xmm, xmm'/>
+	<header>emmintrin.h</header>
+</intrinsic>
+<intrinsic tech='SSE2' vexEq='TRUE' dontShowZeroUnmodMsg='TRUE' rettype='int' name='_mm_ucomile_sd'>
+	<type>Floating Point</type>
+	<CPUID>SSE2</CPUID>
+	<category>Compare</category>
+	<parameter varname='a' type='__m128d'/>
+	<parameter varname='b' type='__m128d'/>
+	<description>Compare the lower double-precision (64-bit) floating-point element in "a" and "b" for less-than-or-equal, and return the boolean result (0 or 1). This instruction will not signal an exception for QNaNs.</description>
+	<operation>
+RETURN ( a[63:0] &lt;= b[63:0] ) ? 1 : 0
+	</operation>
+	<instruction name='ucomisd' form='xmm, xmm'/>
+	<header>emmintrin.h</header>
+</intrinsic>
+<intrinsic tech='SSE2' vexEq='TRUE' dontShowZeroUnmodMsg='TRUE' rettype='int' name='_mm_ucomigt_sd'>
+	<type>Floating Point</type>
+	<CPUID>SSE2</CPUID>
+	<category>Compare</category>
+	<parameter varname='a' type='__m128d'/>
+	<parameter varname='b' type='__m128d'/>
+	<description>Compare the lower double-precision (64-bit) floating-point element in "a" and "b" for greater-than, and return the boolean result (0 or 1). This instruction will not signal an exception for QNaNs.</description>
+	<operation>
+RETURN ( a[63:0] &gt; b[63:0] ) ? 1 : 0
+	</operation>
+	<instruction name='ucomisd' form='xmm, xmm'/>
+	<header>emmintrin.h</header>
+</intrinsic>
+<intrinsic tech='SSE2' vexEq='TRUE' dontShowZeroUnmodMsg='TRUE' rettype='int' name='_mm_ucomige_sd'>
+	<type>Floating Point</type>
+	<CPUID>SSE2</CPUID>
+	<category>Compare</category>
+	<parameter varname='a' type='__m128d'/>
+	<parameter varname='b' type='__m128d'/>
+	<description>Compare the lower double-precision (64-bit) floating-point element in "a" and "b" for greater-than-or-equal, and return the boolean result (0 or 1). This instruction will not signal an exception for QNaNs.</description>
+	<operation>
+RETURN ( a[63:0] &gt;= b[63:0] ) ? 1 : 0
+	</operation>
+	<instruction name='ucomisd' form='xmm, xmm'/>
+	<header>emmintrin.h</header>
+</intrinsic>
+<intrinsic tech='SSE2' vexEq='TRUE' dontShowZeroUnmodMsg='TRUE' rettype='int' name='_mm_ucomineq_sd'>
+	<type>Floating Point</type>
+	<CPUID>SSE2</CPUID>
+	<category>Compare</category>
+	<parameter varname='a' type='__m128d'/>
+	<parameter varname='b' type='__m128d'/>
+	<description>Compare the lower double-precision (64-bit) floating-point element in "a" and "b" for not-equal, and return the boolean result (0 or 1). This instruction will not signal an exception for QNaNs.</description>
+	<operation>
+RETURN ( a[63:0] != b[63:0] ) ? 1 : 0
+	</operation>
+	<instruction name='ucomisd' form='xmm, xmm'/>
+	<header>emmintrin.h</header>
+</intrinsic>
+<intrinsic tech='SSE2' vexEq='TRUE' rettype='__m128' name='_mm_cvtpd_ps'>
+	<type>Floating Point</type>
+	<CPUID>SSE2</CPUID>
+	<category>Convert</category>
+	<parameter varname='a' type='__m128d'/>
+	<description>Convert packed double-precision (64-bit) floating-point elements in "a" to packed single-precision (32-bit) floating-point elements, and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 1
+	i := 32*j
+	k := 64*j
+	dst[i+31:i] := Convert_FP64_To_FP32(a[k+63:k])
+ENDFOR
+	</operation>
+	<instruction name='cvtpd2ps' form='xmm, xmm'/>
+	<header>emmintrin.h</header>
+</intrinsic>
+<intrinsic tech='SSE2' vexEq='TRUE' rettype='__m128d' name='_mm_cvtps_pd'>
+	<type>Floating Point</type>
+	<CPUID>SSE2</CPUID>
+	<category>Convert</category>
+	<parameter varname='a' type='__m128'/>
+	<description>Convert packed single-precision (32-bit) floating-point elements in "a" to packed double-precision (64-bit) floating-point elements, and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 1
+	i := 64*j
+	k := 32*j
+	dst[i+63:i] := Convert_FP32_To_FP64(a[k+31:k])
+ENDFOR
+	</operation>
+	<instruction name='cvtps2pd' form='xmm, xmm'/>
+	<header>emmintrin.h</header>
+</intrinsic>
+<intrinsic tech='SSE2' vexEq='TRUE' rettype='__m128i' name='_mm_cvtpd_epi32'>
+	<type>Floating Point</type>
+	<type>Integer</type>
+	<CPUID>SSE2</CPUID>
+	<category>Convert</category>
+	<parameter varname='a' type='__m128d'/>
+	<description>Convert packed double-precision (64-bit) floating-point elements in "a" to packed 32-bit integers, and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 1
+	i := 32*j
+	k := 64*j
+	dst[i+31:i] := Convert_FP64_To_Int32(a[k+63:k])
+ENDFOR
+	</operation>
+	<instruction name='cvtpd2dq' form='xmm, xmm'/>
+	<header>emmintrin.h</header>
+</intrinsic>
+<intrinsic tech='SSE2' vexEq='TRUE' dontShowZeroUnmodMsg='TRUE' rettype='int' name='_mm_cvtsd_si32'>
+	<type>Floating Point</type>
+	<type>Integer</type>
+	<CPUID>SSE2</CPUID>
+	<category>Convert</category>
+	<parameter varname='a' type='__m128d'/>
+	<description>Convert the lower double-precision (64-bit) floating-point element in "a" to a 32-bit integer, and store the result in "dst".</description>
+	<operation>
+dst[31:0] := Convert_FP64_To_Int32(a[63:0])
+	</operation>
+	<instruction name='cvtsd2si' form='r32, xmm'/>
+	<header>emmintrin.h</header>
+</intrinsic>
+<intrinsic tech='SSE2' vexEq='TRUE' dontShowZeroUnmodMsg='TRUE' rettype='__int64' name='_mm_cvtsd_si64'>
+	<type>Floating Point</type>
+	<type>Integer</type>
+	<CPUID>SSE2</CPUID>
+	<category>Convert</category>
+	<parameter varname='a' type='__m128d'/>
+	<description>Convert the lower double-precision (64-bit) floating-point element in "a" to a 64-bit integer, and store the result in "dst".</description>
+	<operation>
+dst[63:0] := Convert_FP64_To_Int64(a[63:0])
+	</operation>
+	<instruction name='cvtsd2si' form='r64, xmm'/>
+	<header>emmintrin.h</header>
+</intrinsic>
+<intrinsic tech='SSE2' vexEq='TRUE' dontShowZeroUnmodMsg='TRUE' rettype='__int64' name='_mm_cvtsd_si64x'>
+	<type>Floating Point</type>
+	<type>Integer</type>
+	<CPUID>SSE2</CPUID>
+	<category>Convert</category>
+	<parameter varname='a' type='__m128d'/>
+	<description>Convert the lower double-precision (64-bit) floating-point element in "a" to a 64-bit integer, and store the result in "dst".</description>
+	<operation>
+dst[63:0] := Convert_FP64_To_Int64(a[63:0])
+	</operation>
+	<instruction name='cvtsd2si' form='r64, xmm'/>
+	<header>emmintrin.h</header>
+</intrinsic>
+<intrinsic tech='SSE2' vexEq='TRUE' rettype='__m128' name='_mm_cvtsd_ss'>
+	<type>Floating Point</type>
+	<CPUID>SSE2</CPUID>
+	<category>Convert</category>
+	<parameter varname='a' type='__m128'/>
+	<parameter varname='b' type='__m128d'/>
+	<description>Convert the lower double-precision (64-bit) floating-point element in "b" to a single-precision (32-bit) floating-point element, store the result in the lower element of "dst", and copy the upper 3 packed elements from "a" to the upper elements of "dst".
+	</description>
+	<operation>
+dst[31:0] := Convert_FP64_To_FP32(b[63:0])
+dst[127:32] := a[127:32]
+dst[MAX:128] := 0
+	</operation>
+	<instruction name='cvtsd2ss' form='xmm, xmm'/>
+	<header>emmintrin.h</header>
+</intrinsic>
+<intrinsic tech='SSE2' vexEq='TRUE' rettype='double' name='_mm_cvtsd_f64'>
+	<type>Floating Point</type>
+	<CPUID>SSE2</CPUID>
+	<category>Convert</category>
+	<parameter varname='a' type='__m128d'/>
+	<description>Copy the lower double-precision (64-bit) floating-point element of "a" to "dst".</description>
+	<operation>dst[63:0] := a[63:0]</operation>
+	<instruction name='movsd' form='m64, xmm'/>
+	<header>emmintrin.h</header>
+</intrinsic>
+<intrinsic tech='SSE2' vexEq='TRUE' rettype='__m128d' name='_mm_cvtss_sd'>
+	<type>Floating Point</type>
+	<CPUID>SSE2</CPUID>
+	<category>Convert</category>
+	<parameter varname='a' type='__m128d'/>
+	<parameter varname='b' type='__m128'/>
+	<description>Convert the lower single-precision (32-bit) floating-point element in "b" to a double-precision (64-bit) floating-point element, store the result in the lower element of "dst", and copy the upper element from "a" to the upper element of "dst".
+	</description>
+	<operation>
+dst[63:0] := Convert_FP32_To_FP64(b[31:0])
+dst[127:64] := a[127:64]
+dst[MAX:128] := 0
+	</operation>
+	<instruction name='cvtss2sd' form='xmm, xmm'/>
+	<header>emmintrin.h</header>
+</intrinsic>
+<intrinsic tech='SSE2' vexEq='TRUE' rettype='__m128i' name='_mm_cvttpd_epi32'>
+	<type>Floating Point</type>
+	<type>Integer</type>
+	<CPUID>SSE2</CPUID>
+	<category>Convert</category>
+	<parameter varname='a' type='__m128d'/>
+	<description>Convert packed double-precision (64-bit) floating-point elements in "a" to packed 32-bit integers with truncation, and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 1
+	i := 32*j
+	k := 64*j
+	dst[i+31:i] := Convert_FP64_To_Int32_Truncate(a[k+63:k])
+ENDFOR
+	</operation>
+	<instruction name='cvttpd2dq' form='xmm, xmm'/>
+	<header>emmintrin.h</header>
+</intrinsic>
+<intrinsic tech='SSE2' vexEq='TRUE' dontShowZeroUnmodMsg='TRUE' rettype='int' name='_mm_cvttsd_si32'>
+	<type>Floating Point</type>
+	<type>Integer</type>
+	<CPUID>SSE2</CPUID>
+	<category>Convert</category>
+	<parameter varname='a' type='__m128d'/>
+	<description>Convert the lower double-precision (64-bit) floating-point element in "a" to a 32-bit integer with truncation, and store the result in "dst".</description>
+	<operation>
+dst[31:0] := Convert_FP64_To_Int32_Truncate(a[63:0])
+	</operation>
+	<instruction name='cvttsd2si' form='r32, xmm'/>
+	<header>emmintrin.h</header>
+</intrinsic>
+<intrinsic tech='SSE2' vexEq='TRUE' dontShowZeroUnmodMsg='TRUE' rettype='__int64' name='_mm_cvttsd_si64'>
+	<type>Floating Point</type>
+	<type>Integer</type>
+	<CPUID>SSE2</CPUID>
+	<category>Convert</category>
+	<parameter varname='a' type='__m128d'/>
+	<description>Convert the lower double-precision (64-bit) floating-point element in "a" to a 64-bit integer with truncation, and store the result in "dst".</description>
+	<operation>
+dst[63:0] := Convert_FP64_To_Int64_Truncate(a[63:0])
+	</operation>
+	<instruction name='cvttsd2si' form='r64, xmm'/>
+	<header>emmintrin.h</header>
+</intrinsic>
+<intrinsic tech='SSE2' vexEq='TRUE' dontShowZeroUnmodMsg='TRUE' rettype='__int64' name='_mm_cvttsd_si64x'>
+	<type>Floating Point</type>
+	<type>Integer</type>
+	<CPUID>SSE2</CPUID>
+	<category>Convert</category>
+	<parameter varname='a' type='__m128d'/>
+	<description>Convert the lower double-precision (64-bit) floating-point element in "a" to a 64-bit integer with truncation, and store the result in "dst".</description>
+	<operation>
+dst[63:0] := Convert_FP64_To_Int64_Truncate(a[63:0])
+	</operation>
+	<instruction name='cvttsd2si' form='r64, xmm'/>
+	<header>emmintrin.h</header>
+</intrinsic>
+<intrinsic tech='SSE2' vexEq='TRUE' rettype='__m128i' name='_mm_cvtps_epi32'>
+	<type>Floating Point</type>
+	<type>Integer</type>
+	<CPUID>SSE2</CPUID>
+	<category>Convert</category>
+	<parameter varname='a' type='__m128'/>
+	<description>Convert packed single-precision (32-bit) floating-point elements in "a" to packed 32-bit integers, and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 3
+	i := 32*j
+	dst[i+31:i] := Convert_FP32_To_Int32(a[i+31:i])
+ENDFOR
+	</operation>
+	<instruction name='cvtps2dq' form='xmm, xmm'/>
+	<header>emmintrin.h</header>
+</intrinsic>
+<intrinsic tech='SSE2' vexEq='TRUE' rettype='__m128i' name='_mm_cvttps_epi32'>
+	<type>Floating Point</type>
+	<type>Integer</type>
+	<CPUID>SSE2</CPUID>
+	<category>Convert</category>
+	<parameter varname='a' type='__m128'/>
+	<description>Convert packed single-precision (32-bit) floating-point elements in "a" to packed 32-bit integers with truncation, and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 3
+	i := 32*j
+	dst[i+31:i] := Convert_FP32_To_Int32_Truncate(a[i+31:i])
+ENDFOR
+	</operation>
+	<instruction name='cvttps2dq' form='xmm, xmm'/>
+	<header>emmintrin.h</header>
+</intrinsic>
+<intrinsic tech='SSE2' rettype='__m64' name='_mm_cvtpd_pi32'>
+	<type>Floating Point</type>
+	<type>Integer</type>
+	<CPUID>SSE2</CPUID>
+	<category>Convert</category>
+	<parameter varname='a' type='__m128d'/>
+	<description>Convert packed double-precision (64-bit) floating-point elements in "a" to packed 32-bit integers, and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 1
+	i := 32*j
+	k := 64*j
+	dst[i+31:i] := Convert_FP64_To_Int32(a[k+63:k])
+ENDFOR
+	</operation>
+	<instruction name='cvtpd2pi' form='mm, xmm'/>
+	<header>emmintrin.h</header>
+</intrinsic>
+<intrinsic tech='SSE2' rettype='__m64' name='_mm_cvttpd_pi32'>
+	<type>Floating Point</type>
+	<type>Integer</type>
+	<CPUID>SSE2</CPUID>
+	<category>Convert</category>
+	<parameter varname='a' type='__m128d'/>
+	<description>Convert packed double-precision (64-bit) floating-point elements in "a" to packed 32-bit integers with truncation, and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 1
+	i := 32*j
+	k := 64*j
+	dst[i+31:i] := Convert_FP64_To_Int32_Truncate(a[k+63:k])
+ENDFOR
+	</operation>
+	<instruction name='cvttpd2pi' form='mm, xmm'/>
+	<header>emmintrin.h</header>
+</intrinsic>
+<intrinsic tech='SSE2' sequence='true' rettype='__m128d' name='_mm_set_sd'>
+	<type>Floating Point</type>
+	<CPUID>SSE2</CPUID>
+	<category>Set</category>
+	<parameter varname='a' type='double'/>
+	<description>Copy double-precision (64-bit) floating-point element "a" to the lower element of "dst", and zero the upper element.</description>
+	<operation>
+dst[63:0] := a[63:0]
+dst[127:64] := 0
+	</operation>
+	<header>emmintrin.h</header>
+</intrinsic>
+<intrinsic tech='SSE2' sequence='true' rettype='__m128d' name='_mm_set1_pd'>
+	<type>Floating Point</type>
+	<CPUID>SSE2</CPUID>
+	<category>Set</category>
+	<parameter varname='a' type='double'/>
+	<description>Broadcast double-precision (64-bit) floating-point value "a" to all elements of "dst".</description>
+	<operation>
+FOR j := 0 to 1
+	i := j*64
+	dst[i+63:i] := a[63:0]
+ENDFOR
+	</operation>
+	<header>emmintrin.h</header>
+</intrinsic>
+<intrinsic tech='SSE2' sequence='true' rettype='__m128d' name='_mm_set_pd1'>
+	<type>Floating Point</type>
+	<CPUID>SSE2</CPUID>
+	<category>Set</category>
+	<parameter varname='a' type='double'/>
+	<description>Broadcast double-precision (64-bit) floating-point value "a" to all elements of "dst".</description>
+	<operation>
+FOR j := 0 to 1
+	i := j*64
+	dst[i+63:i] := a[63:0]
+ENDFOR
+	</operation>
+	<header>emmintrin.h</header>
+</intrinsic>
+<intrinsic tech='SSE2' sequence='true' rettype='__m128d' name='_mm_set_pd'>
+	<type>Floating Point</type>
+	<CPUID>SSE2</CPUID>
+	<category>Set</category>
+	<parameter varname='e1' type='double'/>
+	<parameter varname='e0' type='double'/>
+	<description>Set packed double-precision (64-bit) floating-point elements in "dst" with the supplied values.</description>
+	<operation>
+dst[63:0] := e0
+dst[127:64] := e1
+	</operation>
+	<header>emmintrin.h</header>
+</intrinsic>
+<intrinsic tech='SSE2' sequence='true' rettype='__m128d' name='_mm_setr_pd'>
+	<type>Floating Point</type>
+	<CPUID>SSE2</CPUID>
+	<category>Set</category>
+	<parameter varname='e1' type='double'/>
+	<parameter varname='e0' type='double'/>
+	<description>Set packed double-precision (64-bit) floating-point elements in "dst" with the supplied values in reverse order.</description>
+	<operation>
+dst[63:0] := e1
+dst[127:64] := e0
+	</operation>
+	<header>emmintrin.h</header>
+</intrinsic>
+<intrinsic tech='SSE2' rettype='__m128d' name='_mm_setzero_pd'>
+	<type>Floating Point</type>
+	<CPUID>SSE2</CPUID>
+	<category>Set</category>
+	<parameter varname='' type='void'/>
+	<description>Return vector of type __m128d with all elements set to zero.</description>
+	<operation>
+dst[MAX:0] := 0
+	</operation>
+	<instruction name='xorpd' form='xmm, xmm'/>
+	<header>emmintrin.h</header>
+</intrinsic>
+<intrinsic tech='SSE2' vexEq='TRUE' rettype='__m128d' name='_mm_load_pd'>
+	<type>Floating Point</type>
+	<CPUID>SSE2</CPUID>
+	<category>Load</category>
+	<parameter varname='mem_addr' type='double const*'/>
+	<description>Load 128-bits (composed of 2 packed double-precision (64-bit) floating-point elements) from memory into "dst".
+	"mem_addr" must be aligned on a 16-byte boundary or a general-protection exception may be generated.</description>
+	<operation>
+dst[127:0] := MEM[mem_addr+127:mem_addr]
+	</operation>
+	<instruction name='movapd' form='xmm, m128'/>
+	<header>emmintrin.h</header>
+</intrinsic>
+<intrinsic tech='SSE2' sequence='true' rettype='__m128d' name='_mm_load1_pd'>
+	<type>Floating Point</type>
+	<CPUID>SSE2</CPUID>
+	<category>Load</category>
+	<parameter varname='mem_addr' type='double const*'/>
+	<description>Load a double-precision (64-bit) floating-point element from memory into both elements of "dst".</description>
+	<operation>
+dst[63:0] := MEM[mem_addr+63:mem_addr]
+dst[127:64] := MEM[mem_addr+63:mem_addr]
+	</operation>
+	<instruction name='movapd' form='xmm, m128'/>
+	<header>emmintrin.h</header>
+</intrinsic>
+	
+	
+
+<intrinsic tech='SSE2' sequence='true' rettype='__m128d' name='_mm_load_pd1'>
+	<type>Floating Point</type>
+	<CPUID>SSE2</CPUID>
+	<category>Load</category>
+	<parameter varname='mem_addr' type='double const*'/>
+	<description>Load a double-precision (64-bit) floating-point element from memory into both elements of "dst".</description>
+	<operation>
+dst[63:0] := MEM[mem_addr+63:mem_addr]
+dst[127:64] := MEM[mem_addr+63:mem_addr]
+	</operation>
+	<instruction name='movapd' form='xmm, m128'/>
+	<header>emmintrin.h</header>
+</intrinsic>
+<intrinsic tech='SSE2' sequence='true' rettype='__m128d' name='_mm_loadr_pd'>
+	<type>Floating Point</type>
+	<CPUID>SSE2</CPUID>
+	<category>Load</category>
+	<parameter varname='mem_addr' type='double const*'/>
+	<description>Load 2 double-precision (64-bit) floating-point elements from memory into "dst" in reverse order. mem_addr must be aligned on a 16-byte boundary or a general-protection exception may be generated.</description>
+	<operation>
+dst[63:0] := MEM[mem_addr+127:mem_addr+64]
+dst[127:64] := MEM[mem_addr+63:mem_addr]
+	</operation>
+	<instruction name='movapd' form='xmm, m128'/>
+	<header>emmintrin.h</header>
+</intrinsic>
+<intrinsic tech='SSE2' vexEq='TRUE' rettype='__m128d' name='_mm_loadu_pd'>
+	<type>Floating Point</type>
+	<CPUID>SSE2</CPUID>
+	<category>Load</category>
+	<parameter varname='mem_addr' type='double const*'/>
+	<description>Load 128-bits (composed of 2 packed double-precision (64-bit) floating-point elements) from memory into "dst".
+	"mem_addr" does not need to be aligned on any particular boundary.</description>
+	<operation>
+dst[127:0] := MEM[mem_addr+127:mem_addr]
+	</operation>
+	<instruction name='movupd' form='xmm, m128'/>
+	<header>emmintrin.h</header>
+</intrinsic>
+<intrinsic tech='SSE2' vexEq='TRUE' rettype='__m128d' name='_mm_load_sd'>
+	<type>Floating Point</type>
+	<CPUID>SSE2</CPUID>
+	<category>Load</category>
+	<parameter varname='mem_addr' type='double const*'/>
+	<description>Load a double-precision (64-bit) floating-point element from memory into the lower element of "dst", and zero the upper element. "mem_addr" does not need to be aligned on any particular boundary.</description>
+	<operation>
+dst[63:0] := MEM[mem_addr+63:mem_addr]
+dst[127:64] := 0
+	</operation>
+	<instruction name='movsd' form='xmm, m64'/>
+	<header>emmintrin.h</header>
+</intrinsic>
+<intrinsic tech='SSE2' vexEq='TRUE' rettype='__m128d' name='_mm_loadh_pd'>
+	<type>Floating Point</type>
+	<CPUID>SSE2</CPUID>
+	<category>Load</category>
+	<parameter varname='a' type='__m128d'/>
+	<parameter varname='mem_addr' type='double const*'/>
+	<description>Load a double-precision (64-bit) floating-point element from memory into the upper element of "dst", and copy the lower element from "a" to "dst". "mem_addr" does not need to be aligned on any particular boundary.</description>
+	<operation>
+dst[63:0] := a[63:0]
+dst[127:64] := MEM[mem_addr+63:mem_addr]
+	</operation>
+	<instruction name='movhpd' form='xmm, m64'/>
+	<header>emmintrin.h</header>
+</intrinsic>
+<intrinsic tech='SSE2' vexEq='TRUE' rettype='__m128d' name='_mm_loadl_pd'>
+	<type>Floating Point</type>
+	<CPUID>SSE2</CPUID>
+	<category>Load</category>
+	<parameter varname='a' type='__m128d'/>
+	<parameter varname='mem_addr' type='double const*'/>
+	<description>Load a double-precision (64-bit) floating-point element from memory into the lower element of "dst", and copy the upper element from "a" to "dst". "mem_addr" does not need to be aligned on any particular boundary.</description>
+	<operation>
+dst[63:0] := MEM[mem_addr+63:mem_addr]
+dst[127:64] := a[127:64]
+	</operation>
+	<instruction name='movlpd' form='xmm, m64'/>
+	<header>emmintrin.h</header>
+</intrinsic>
+<intrinsic tech='SSE2' vexEq='TRUE' dontShowZeroUnmodMsg='TRUE' rettype='void' name='_mm_stream_pd'>
+	<type>Floating Point</type>
+	<CPUID>SSE2</CPUID>
+	<category>Store</category>
+	<parameter varname='mem_addr' type='double*'/>
+	<parameter varname='a' type='__m128d'/>
+	<description>Store 128-bits (composed of 2 packed double-precision (64-bit) floating-point elements) from "a" into memory using a non-temporal memory hint.
+	"mem_addr" must be aligned on a 16-byte boundary or a general-protection exception may be generated.</description>
+	<operation>
+MEM[mem_addr+127:mem_addr] := a[127:0]
+	</operation>
+	<instruction name='movntpd' form='m128, xmm'/>
+	<header>emmintrin.h</header>
+</intrinsic>
+<intrinsic tech='SSE2' vexEq='TRUE' dontShowZeroUnmodMsg='TRUE' rettype='void' name='_mm_store_sd'>
+	<type>Floating Point</type>
+	<CPUID>SSE2</CPUID>
+	<category>Store</category>
+	<parameter varname='mem_addr' type='double*'/>
+	<parameter varname='a' type='__m128d'/>
+	<description>Store the lower double-precision (64-bit) floating-point element from "a" into memory. "mem_addr" does not need to be aligned on any particular boundary.</description>
+	<operation>
+MEM[mem_addr+63:mem_addr] := a[63:0]
+	</operation>
+	<instruction name='movsd' form='m64, xmm'/>
+	<header>emmintrin.h</header>
+</intrinsic>
+<intrinsic tech='SSE2' sequence='true' rettype='void' name='_mm_store1_pd'>
+	<type>Floating Point</type>
+	<CPUID>SSE2</CPUID>
+	<category>Store</category>
+	<parameter varname='mem_addr' type='double*'/>
+	<parameter varname='a' type='__m128d'/>
+	<description>Store the lower double-precision (64-bit) floating-point element from "a" into 2 contiguous elements in memory. "mem_addr" must be aligned on a 16-byte boundary or a general-protection exception may be generated.</description>
+	<operation>
+MEM[mem_addr+63:mem_addr] := a[63:0]
+MEM[mem_addr+127:mem_addr+64] := a[63:0]
+	</operation>
+	<header>emmintrin.h</header>
+</intrinsic>
+<intrinsic tech='SSE2' sequence='true' rettype='void' name='_mm_store_pd1'>
+	<type>Floating Point</type>
+	<CPUID>SSE2</CPUID>
+	<category>Store</category>
+	<parameter varname='mem_addr' type='double*'/>
+	<parameter varname='a' type='__m128d'/>
+	<description>Store the lower double-precision (64-bit) floating-point element from "a" into 2 contiguous elements in memory. "mem_addr" must be aligned on a 16-byte boundary or a general-protection exception may be generated.</description>
+	<operation>
+MEM[mem_addr+63:mem_addr] := a[63:0]
+MEM[mem_addr+127:mem_addr+64] := a[63:0]
+	</operation>
+	<header>emmintrin.h</header>
+</intrinsic>
+<intrinsic tech='SSE2' vexEq='TRUE' dontShowZeroUnmodMsg='TRUE' rettype='void' name='_mm_store_pd'>
+	<type>Floating Point</type>
+	<CPUID>SSE2</CPUID>
+	<category>Store</category>
+	<parameter varname='mem_addr' type='double*'/>
+	<parameter varname='a' type='__m128d'/>
+	<description>Store 128-bits (composed of 2 packed double-precision (64-bit) floating-point elements) from "a" into memory.
+	"mem_addr" must be aligned on a 16-byte boundary or a general-protection exception may be generated.</description>
+	<operation>
+MEM[mem_addr+127:mem_addr] := a[127:0]
+	</operation>
+	<instruction name='movapd' form='m128, xmm'/>
+	<header>emmintrin.h</header>
+</intrinsic>
+<intrinsic tech='SSE2' vexEq='TRUE' dontShowZeroUnmodMsg='TRUE' rettype='void' name='_mm_storeu_pd'>
+	<type>Floating Point</type>
+	<CPUID>SSE2</CPUID>
+	<category>Store</category>
+	<parameter varname='mem_addr' type='double*'/>
+	<parameter varname='a' type='__m128d'/>
+	<description>Store 128-bits (composed of 2 packed double-precision (64-bit) floating-point elements) from "a" into memory.
+	"mem_addr" does not need to be aligned on any particular boundary.</description>
+	<operation>
+MEM[mem_addr+127:mem_addr] := a[127:0]
+	</operation>
+	<instruction name='movupd' form='m128, xmm'/>
+	<header>emmintrin.h</header>
+</intrinsic>
+<intrinsic tech='SSE2' sequence='true' rettype='void' name='_mm_storer_pd'>
+	<type>Floating Point</type>
+	<CPUID>SSE2</CPUID>
+	<category>Store</category>
+	<parameter varname='mem_addr' type='double*'/>
+	<parameter varname='a' type='__m128d'/>
+	<description>Store 2 double-precision (64-bit) floating-point elements from "a" into memory in reverse order.
+	"mem_addr" must be aligned on a 16-byte boundary or a general-protection exception may be generated.</description>
+	<operation>
+MEM[mem_addr+63:mem_addr] := a[127:64]
+MEM[mem_addr+127:mem_addr+64] := a[63:0]
+	</operation>
+	<header>emmintrin.h</header>
+</intrinsic>
+<intrinsic tech='SSE2' vexEq='TRUE' dontShowZeroUnmodMsg='TRUE' rettype='void' name='_mm_storeh_pd'>
+	<type>Floating Point</type>
+	<CPUID>SSE2</CPUID>
+	<category>Store</category>
+	<parameter varname='mem_addr' type='double*'/>
+	<parameter varname='a' type='__m128d'/>
+	<description>Store the upper double-precision (64-bit) floating-point element from "a" into memory.</description>
+	<operation>
+MEM[mem_addr+63:mem_addr] := a[127:64]
+	</operation>
+	<instruction name='movhpd' form='m64, xmm'/>
+	<header>emmintrin.h</header>
+</intrinsic>
+<intrinsic tech='SSE2' vexEq='TRUE' dontShowZeroUnmodMsg='TRUE' rettype='void' name='_mm_storel_pd'>
+	<type>Floating Point</type>
+	<CPUID>SSE2</CPUID>
+	<category>Store</category>
+	<parameter varname='mem_addr' type='double*'/>
+	<parameter varname='a' type='__m128d'/>
+	<description>Store the lower double-precision (64-bit) floating-point element from "a" into memory.</description>
+	<operation>
+MEM[mem_addr+63:mem_addr] := a[63:0]
+	</operation>
+	<instruction name='movlpd' form='m64, xmm'/>
+	<header>emmintrin.h</header>
+</intrinsic>
+<intrinsic tech='SSE2' vexEq='TRUE' rettype='__m128d' name='_mm_unpackhi_pd'>
+	<type>Floating Point</type>
+	<CPUID>SSE2</CPUID>
+	<category>Swizzle</category>
+	<parameter varname='a' type='__m128d'/>
+	<parameter varname='b' type='__m128d'/>
+	<description>Unpack and interleave double-precision (64-bit) floating-point elements from the high half of "a" and "b", and store the results in "dst". </description>
+	<operation>
+INTERLEAVE_HIGH_QWORDS(src1[127:0], src2[127:0]){
+	dst[63:0] := src1[127:64] 
+	dst[127:64] := src2[127:64] 
+	RETURN dst[127:0]
+}	
+
+dst[127:0] := INTERLEAVE_HIGH_QWORDS(a[127:0], b[127:0])
+	</operation>
+	<instruction name='unpckhpd' form='xmm, xmm'/>
+	<header>emmintrin.h</header>
+</intrinsic>
+<intrinsic tech='SSE2' vexEq='TRUE' rettype='__m128d' name='_mm_unpacklo_pd'>
+	<type>Floating Point</type>
+	<CPUID>SSE2</CPUID>
+	<category>Swizzle</category>
+	<parameter varname='a' type='__m128d'/>
+	<parameter varname='b' type='__m128d'/>
+	<description>Unpack and interleave double-precision (64-bit) floating-point elements from the low half of "a" and "b", and store the results in "dst". </description>
+	<operation>
+INTERLEAVE_QWORDS(src1[127:0], src2[127:0]){
+	dst[63:0] := src1[63:0] 
+	dst[127:64] := src2[63:0] 
+	RETURN dst[127:0]
+}
+
+dst[127:0] := INTERLEAVE_QWORDS(a[127:0], b[127:0])
+	</operation>
+	<instruction name='unpcklpd' form='xmm, xmm'/>
+	<header>emmintrin.h</header>
+</intrinsic>
+<intrinsic tech='SSE2' vexEq='TRUE' dontShowZeroUnmodMsg='TRUE' rettype='int' name='_mm_movemask_pd'>
+	<type>Floating Point</type>
+	<CPUID>SSE2</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname='a' type='__m128d'/>
+	<description>Set each bit of mask "dst" based on the most significant bit of the corresponding packed double-precision (64-bit) floating-point element in "a".</description>
+	<operation>
+FOR j := 0 to 1
+	i := j*64
+	IF a[i+63]
+		dst[j] := 1
+	ELSE
+		dst[j] := 0
+	FI
+ENDFOR
+dst[MAX:2] := 0
+	</operation>
+	<instruction name='movmskpd' form='r32, xmm'/>
+	<header>emmintrin.h</header>
+</intrinsic>
+<intrinsic tech='SSE2' vexEq='TRUE' rettype='__m128d' name='_mm_shuffle_pd'>
+	<type>Floating Point</type>
+	<CPUID>SSE2</CPUID>
+	<category>Swizzle</category>
+	<parameter varname='a' type='__m128d'/>
+	<parameter varname='b' type='__m128d'/>
+	<parameter varname="imm8" type='int'/>
+	<description>Shuffle double-precision (64-bit) floating-point elements using the control in "imm8", and store the results in "dst". </description>
+	<operation>
+dst[63:0] := (imm8[0] == 0) ? a[63:0] : a[127:64]
+dst[127:64] := (imm8[1] == 0) ? b[63:0] : b[127:64]
+	</operation>
+	<instruction name='shufpd' form='xmm, xmm, imm'/>
+	<header>emmintrin.h</header>
+</intrinsic>
+<intrinsic tech='SSE2' vexEq='TRUE' rettype='__m128d' name='_mm_move_sd'>
+	<type>Floating Point</type>
+	<CPUID>SSE2</CPUID>
+	<category>Move</category>
+	<parameter varname='a' type='__m128d'/>
+	<parameter varname='b' type='__m128d'/>
+	<description>Move the lower double-precision (64-bit) floating-point element from "b" to the lower element of "dst", and copy the upper element from "a" to the upper element of "dst".</description>
+	<operation>
+dst[63:0] := b[63:0]
+dst[127:64] := a[127:64]
+	</operation>
+	<instruction name='movsd' form='xmm, xmm, xmm'/>
+	<header>emmintrin.h</header>
+</intrinsic>
+<intrinsic tech='SSE2' rettype='__m128' name='_mm_castpd_ps'>
+	<type>Floating Point</type>
+		<CPUID>SSE2</CPUID>
+		<category>Cast</category>
+		<parameter varname='a' type='__m128d'/>
+		<description>Cast vector of type __m128d to type __m128. This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency.</description>
+	<header>emmintrin.h</header>
+</intrinsic>
+<intrinsic tech='SSE2' rettype='__m128i' name='_mm_castpd_si128'>
+	<type>Floating Point</type>
+	<type>Integer</type>
+		<CPUID>SSE2</CPUID>
+		<category>Cast</category>
+		<parameter varname='a' type='__m128d'/>
+		<description>Cast vector of type __m128d to type __m128i. This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency.</description>
+	<header>emmintrin.h</header>
+</intrinsic>
+<intrinsic tech='SSE2' rettype='__m128d' name='_mm_castps_pd'>
+	<type>Floating Point</type>
+		<CPUID>SSE2</CPUID>
+		<category>Cast</category>
+		<parameter varname='a' type='__m128'/>
+		<description>Cast vector of type __m128 to type __m128d. This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency.</description>
+	<header>emmintrin.h</header>
+</intrinsic>
+<intrinsic tech='SSE2' rettype='__m128i' name='_mm_castps_si128'>
+	<type>Floating Point</type>
+	<type>Integer</type>
+		<CPUID>SSE2</CPUID>
+		<category>Cast</category>
+		<parameter varname='a' type='__m128'/>
+		<description>Cast vector of type __m128 to type __m128i. This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency.</description>
+	<header>emmintrin.h</header>
+</intrinsic>
+<intrinsic tech='SSE2' rettype='__m128d' name='_mm_castsi128_pd'>
+	<type>Floating Point</type>
+		<CPUID>SSE2</CPUID>
+		<category>Cast</category>
+		<parameter varname='a' type='__m128i'/>
+		<description>Cast vector of type __m128i to type __m128d. This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency.</description>
+	<header>emmintrin.h</header>
+</intrinsic>
+<intrinsic tech='SSE2' rettype='__m128' name='_mm_castsi128_ps'>
+	<type>Floating Point</type>
+		<CPUID>SSE2</CPUID>
+		<category>Cast</category>
+		<parameter varname='a' type='__m128i'/>
+		<description>Cast vector of type __m128i to type __m128. This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency.</description>
+	<header>emmintrin.h</header>
+</intrinsic>
+<intrinsic tech='Other' sequence='true' rettype='float' name='_cvtsh_ss'>
+	<type>Floating Point</type>
+	<category>Convert</category>
+	<parameter varname='a' type='unsigned short'/>
+	<description>Convert the half-precision (16-bit) floating-point value "a" to a single-precision (32-bit) floating-point value, and store the result in "dst".</description>
+	<operation>
+dst[31:0] := Convert_FP16_To_FP32(a[15:0])
+	</operation>
+	<header>emmintrin.h</header>
+</intrinsic>
+<intrinsic tech='Other' sequence='true' rettype='unsigned short' name='_cvtss_sh'>
+	<type>Floating Point</type>
+	<category>Convert</category>
+	<parameter varname='a' type='float'/>
+	<parameter varname="imm8" type='int'/>
+	<description>Convert the single-precision (32-bit) floating-point value "a" to a half-precision (16-bit) floating-point value, and store the result in "dst".</description>
+	<operation>
+dst[15:0] := Convert_FP32_To_FP16(a[31:0])
+	</operation>
+	<header>emmintrin.h</header>
+</intrinsic>
+<intrinsic tech='Other' rettype='__m128' name='_mm_cvtph_ps'>
+	<type>Floating Point</type>
+	<CPUID>FP16C</CPUID>
+	<category>Convert</category>
+	<parameter varname='a' type='__m128i'/>
+	<description>Convert packed half-precision (16-bit) floating-point elements in "a" to packed single-precision (32-bit) floating-point elements, and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 3
+	i := j*32
+	m := j*16
+	dst[i+31:i] := Convert_FP16_To_FP32(a[m+15:m])
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name='vcvtph2ps' form='xmm, xmm'/>
+	<header>emmintrin.h</header>
+</intrinsic>
+<intrinsic tech='Other' rettype='__m128i' name='_mm_cvtps_ph'>
+	<type>Floating Point</type>
+	<CPUID>FP16C</CPUID>
+	<category>Convert</category>
+	<parameter varname='a' type='__m128'/>
+	<parameter varname='rounding' type='int'/>
+	<description>Convert packed single-precision (32-bit) floating-point elements in "a" to packed half-precision (16-bit) floating-point elements, and store the results in "dst".
+	[round_note]
+	</description>
+	<operation>
+FOR j := 0 to 3
+	i := 16*j
+	l := 32*j
+	dst[i+15:i] := Convert_FP32_To_FP16(a[l+31:l])
+ENDFOR
+dst[MAX:64] := 0
+	</operation>
+	<instruction name='vcvtps2ph' form='xmm, xmm, imm'/>
+	<header>emmintrin.h</header>
+</intrinsic>
+
+<intrinsic tech='SSE3' vexEq='TRUE' rettype='__m128' name='_mm_addsub_ps'>
+	<type>Floating Point</type>
+	<CPUID>SSE3</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname='a' type='__m128'/>
+	<parameter varname='b' type='__m128'/>
+	<description>Alternately add and subtract packed single-precision (32-bit) floating-point elements in "a" to/from packed elements in "b", and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 3
+	i := j*32
+	IF (j is even) 
+		dst[i+31:i] := a[i+31:i] - b[i+31:i]
+	ELSE
+		dst[i+31:i] := a[i+31:i] + b[i+31:i]
+	FI
+ENDFOR
+	</operation>
+	<instruction name='addsubps' form='xmm, xmm'/>
+	<header>pmmintrin.h</header>
+</intrinsic>
+<intrinsic tech='SSE3' vexEq='TRUE' rettype='__m128d' name='_mm_addsub_pd'>
+	<type>Floating Point</type>
+	<CPUID>SSE3</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname='a' type='__m128d'/>
+	<parameter varname='b' type='__m128d'/>
+	<description>Alternately add and subtract packed double-precision (64-bit) floating-point elements in "a" to/from packed elements in "b", and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 1
+	i := j*64
+	IF (j is even) 
+		dst[i+63:i] := a[i+63:i] - b[i+63:i]
+	ELSE
+		dst[i+63:i] := a[i+63:i] + b[i+63:i]
+	FI
+ENDFOR
+	</operation>
+	<instruction name='addsubpd' form='xmm, xmm'/>
+	<header>pmmintrin.h</header>
+</intrinsic>
+<intrinsic tech='SSE3' vexEq='TRUE' rettype='__m128d' name='_mm_hadd_pd'>
+	<type>Floating Point</type>
+	<CPUID>SSE3</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname='a' type='__m128d'/>
+	<parameter varname='b' type='__m128d'/>
+	<description>Horizontally add adjacent pairs of double-precision (64-bit) floating-point elements in "a" and "b", and pack the results in "dst".</description>
+	<operation>
+dst[63:0] := a[127:64] + a[63:0]
+dst[127:64] := b[127:64] + b[63:0]
+	</operation>
+	<instruction name='haddpd' form='xmm, xmm'/>
+	<header>pmmintrin.h</header>
+</intrinsic>
+<intrinsic tech='SSE3' vexEq='TRUE' rettype='__m128' name='_mm_hadd_ps'>
+	<type>Floating Point</type>
+	<CPUID>SSE3</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname='a' type='__m128'/>
+	<parameter varname='b' type='__m128'/>
+	<description>Horizontally add adjacent pairs of single-precision (32-bit) floating-point elements in "a" and "b", and pack the results in "dst".</description>
+	<operation>
+dst[31:0] := a[63:32] + a[31:0]
+dst[63:32] := a[127:96] + a[95:64]
+dst[95:64] := b[63:32] + b[31:0]
+dst[127:96] := b[127:96] + b[95:64]
+	</operation>
+	<instruction name='haddps' form='xmm, xmm'/>
+	<header>pmmintrin.h</header>
+</intrinsic>
+<intrinsic tech='SSE3' vexEq='TRUE' rettype='__m128d' name='_mm_hsub_pd'>
+	<type>Floating Point</type>
+	<CPUID>SSE3</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname='a' type='__m128d'/>
+	<parameter varname='b' type='__m128d'/>
+	<description>Horizontally subtract adjacent pairs of double-precision (64-bit) floating-point elements in "a" and "b", and pack the results in "dst".</description>
+	<operation>
+dst[63:0] := a[63:0] - a[127:64]
+dst[127:64] := b[63:0] - b[127:64]
+	</operation>
+	<instruction name='hsubpd' form='xmm, xmm'/>
+	<header>pmmintrin.h</header>
+</intrinsic>
+<intrinsic tech='SSE3' vexEq='TRUE' rettype='__m128' name='_mm_hsub_ps'>
+	<type>Floating Point</type>
+	<CPUID>SSE3</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname='a' type='__m128'/>
+	<parameter varname='b' type='__m128'/>
+	<description>Horizontally subtract adjacent pairs of single-precision (32-bit) floating-point elements in "a" and "b", and pack the results in "dst".</description>
+	<operation>
+dst[31:0] := a[31:0] - a[63:32]
+dst[63:32] := a[95:64] - a[127:96]
+dst[95:64] := b[31:0] - b[63:32]
+dst[127:96] := b[95:64] - b[127:96]
+	</operation>
+	<instruction name='hsubps' form='xmm, xmm'/>
+	<header>pmmintrin.h</header>
+</intrinsic>
+<intrinsic tech='SSE3' vexEq='TRUE' rettype='__m128i' name='_mm_lddqu_si128'>
+	<type>Integer</type>
+	<CPUID>SSE3</CPUID>
+	<category>Load</category>
+	<parameter varname='mem_addr' type='__m128i const*'/>
+	<description>Load 128-bits of integer data from unaligned memory into "dst". This intrinsic may perform better than "_mm_loadu_si128" when the data crosses a cache line boundary.</description>
+	<operation>
+dst[127:0] := MEM[mem_addr+127:mem_addr]
+	</operation>
+	<instruction name='lddqu' form='xmm, m128'/>
+	<header>pmmintrin.h</header>
+</intrinsic>
+<intrinsic tech='Other' rettype='void' name='_mm_monitor'>
+	<CPUID>MONITOR</CPUID>
+	<category>General Support</category>
+	<parameter varname='p' type='void const*'/>
+	<parameter varname='extensions' type='unsigned'/>
+	<parameter varname='hints' type='unsigned'/>
+	<description>Arm address monitoring hardware using the address specified in "p". A store to an address within the specified address range triggers the monitoring hardware. Specify optional extensions in "extensions", and optional hints in "hints".</description>
+	<instruction name='monitor' form=''/>
+	<header>pmmintrin.h</header>
+</intrinsic>
+<intrinsic tech='SSE3' vexEq='TRUE' rettype='__m128d' name='_mm_movedup_pd'>
+	<type>Floating Point</type>
+	<CPUID>SSE3</CPUID>
+	<category>Move</category>
+	<parameter varname='a' type='__m128d'/>
+	<description>Duplicate the low double-precision (64-bit) floating-point element from "a", and store the results in "dst".
+	</description>
+	<operation>
+dst[63:0] := a[63:0]
+dst[127:64] := a[63:0]
+	</operation>
+	<instruction name='movddup' form='xmm, xmm'/>
+	<header>pmmintrin.h</header>
+</intrinsic>
+<intrinsic tech='SSE3' vexEq='TRUE' rettype='__m128d' name='_mm_loaddup_pd'>
+	<type>Floating Point</type>
+	<CPUID>SSE3</CPUID>
+	<category>Load</category>
+	<parameter varname='mem_addr' type='double const*'/>
+	<description>Load a double-precision (64-bit) floating-point element from memory into both elements of "dst".
+	</description>
+	<operation>
+dst[63:0] := MEM[mem_addr+63:mem_addr]
+dst[127:64] := MEM[mem_addr+63:mem_addr]
+	</operation>
+	<instruction name='movddup' form='xmm, m64'/>
+	<header>pmmintrin.h</header>
+</intrinsic>
+<intrinsic tech='SSE3' vexEq='TRUE' rettype='__m128' name='_mm_movehdup_ps'>
+	<type>Floating Point</type>
+	<CPUID>SSE3</CPUID>
+	<category>Move</category>
+	<parameter varname='a' type='__m128'/>
+	<description>Duplicate odd-indexed single-precision (32-bit) floating-point elements from "a", and store the results in "dst".
+	</description>
+	<operation>
+dst[31:0] := a[63:32] 
+dst[63:32] := a[63:32]
+dst[95:64] := a[127:96] 
+dst[127:96] := a[127:96]
+	</operation>
+	<instruction name='movshdup' form='xmm, xmm'/>
+	<header>pmmintrin.h</header>
+</intrinsic>
+<intrinsic tech='SSE3' vexEq='TRUE' rettype='__m128' name='_mm_moveldup_ps'>
+	<type>Floating Point</type>
+	<CPUID>SSE3</CPUID>
+	<category>Move</category>
+	<parameter varname='a' type='__m128'/>
+	<description>Duplicate even-indexed single-precision (32-bit) floating-point elements from "a", and store the results in "dst".
+	</description>
+	<operation>
+dst[31:0] := a[31:0] 
+dst[63:32] := a[31:0]
+dst[95:64] := a[95:64] 
+dst[127:96] := a[95:64] 
+	</operation>
+	<instruction name='movsldup' form='xmm, xmm'/>
+	<header>pmmintrin.h</header>
+</intrinsic>
+<intrinsic tech='Other' rettype='void' name='_mm_mwait'>
+	<CPUID>MONITOR</CPUID>
+	<category>General Support</category>
+	<parameter varname='extensions' type='unsigned'/>
+	<parameter varname='hints' type='unsigned'/>
+	<description>Hint to the processor that it can enter an implementation-dependent-optimized state while waiting for an event or store operation to the address range specified by MONITOR.</description>
+	<instruction name='mwait' form=''/>
+	<header>pmmintrin.h</header>
+</intrinsic>
+
+<intrinsic tech='SSSE3' rettype='__m64' name='_mm_abs_pi8'>
+	<type>Integer</type>
+	<CPUID>SSSE3</CPUID>
+	<category>Special Math Functions</category>
+	<parameter varname='a' type='__m64'/>
+	<description>Compute the absolute value of packed 8-bit integers in "a", and store the unsigned results in "dst". </description>
+	<operation>
+FOR j := 0 to 7
+	i := j*8
+	dst[i+7:i] := ABS(a[i+7:i])
+ENDFOR
+	</operation>
+	<instruction name='pabsb' form='mm, mm'/>
+	<header>tmmintrin.h</header>
+</intrinsic>
+<intrinsic tech='SSSE3' vexEq='TRUE' rettype='__m128i' name='_mm_abs_epi8'>
+	<type>Integer</type>
+	<CPUID>SSSE3</CPUID>
+	<category>Special Math Functions</category>
+	<parameter varname='a' type='__m128i'/>
+	<description>Compute the absolute value of packed 8-bit integers in "a", and store the unsigned results in "dst". </description>
+	<operation>
+FOR j := 0 to 15
+	i := j*8
+	dst[i+7:i] := ABS(a[i+7:i])
+ENDFOR
+	</operation>
+	<instruction name='pabsb' form='xmm, xmm'/>
+	<header>tmmintrin.h</header>
+</intrinsic>
+<intrinsic tech='SSSE3' rettype='__m64' name='_mm_abs_pi16'>
+	<type>Integer</type>
+	<CPUID>SSSE3</CPUID>
+	<category>Special Math Functions</category>
+	<parameter varname='a' type='__m64'/>
+	<description>Compute the absolute value of packed 16-bit integers in "a", and store the unsigned results in "dst". </description>
+	<operation>
+FOR j := 0 to 3
+	i := j*16
+	dst[i+15:i] := ABS(a[i+15:i])
+ENDFOR
+	</operation>
+	<instruction name='pabsw' form='mm, mm'/>
+	<header>tmmintrin.h</header>
+</intrinsic>
+<intrinsic tech='SSSE3' vexEq='TRUE' rettype='__m128i' name='_mm_abs_epi16'>
+	<type>Integer</type>
+	<CPUID>SSSE3</CPUID>
+	<category>Special Math Functions</category>
+	<parameter varname='a' type='__m128i'/>
+	<description>Compute the absolute value of packed 16-bit integers in "a", and store the unsigned results in "dst". </description>
+	<operation>
+FOR j := 0 to 7
+	i := j*16
+	dst[i+15:i] := ABS(a[i+15:i])
+ENDFOR
+	</operation>
+	<instruction name='pabsw' form='xmm, xmm'/>
+	<header>tmmintrin.h</header>
+</intrinsic>
+<intrinsic tech='SSSE3' rettype='__m64' name='_mm_abs_pi32'>
+	<type>Integer</type>
+	<CPUID>SSSE3</CPUID>
+	<category>Special Math Functions</category>
+	<parameter varname='a' type='__m64'/>
+	<description>Compute the absolute value of packed 32-bit integers in "a", and store the unsigned results in "dst". </description>
+	<operation>
+FOR j := 0 to 1
+	i := j*32
+	dst[i+31:i] := ABS(a[i+31:i])
+ENDFOR
+	</operation>
+	<instruction name='pabsd' form='mm, mm'/>
+	<header>tmmintrin.h</header>
+</intrinsic>
+<intrinsic tech='SSSE3' vexEq='TRUE' rettype='__m128i' name='_mm_abs_epi32'>
+	<type>Integer</type>
+	<CPUID>SSSE3</CPUID>
+	<category>Special Math Functions</category>
+	<parameter varname='a' type='__m128i'/>
+	<description>Compute the absolute value of packed 32-bit integers in "a", and store the unsigned results in "dst". </description>
+	<operation>
+FOR j := 0 to 3
+	i := j*32
+	dst[i+31:i] := ABS(a[i+31:i])
+ENDFOR
+	</operation>
+	<instruction name='pabsd' form='xmm, xmm'/>
+	<header>tmmintrin.h</header>
+</intrinsic>
+<intrinsic tech='SSSE3' vexEq='TRUE' rettype='__m128i' name='_mm_shuffle_epi8'>
+	<type>Integer</type>
+	<CPUID>SSSE3</CPUID>
+	<category>Swizzle</category>
+	<parameter varname='a' type='__m128i'/>
+	<parameter varname='b' type='__m128i'/>
+	<description>Shuffle packed 8-bit integers in "a" according to shuffle control mask in the corresponding 8-bit element of "b", and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 15
+	i := j*8
+	IF b[i+7] == 1
+		dst[i+7:i] := 0
+	ELSE
+		index[3:0] := b[i+3:i]
+		dst[i+7:i] := a[index*8+7:index*8]
+	FI
+ENDFOR
+	</operation>
+	<instruction name='pshufb' form='xmm, xmm'/>
+	<header>tmmintrin.h</header>
+</intrinsic>
+<intrinsic tech='SSSE3' rettype='__m64' name='_mm_shuffle_pi8'>
+	<type>Integer</type>
+	<CPUID>SSSE3</CPUID>
+	<category>Swizzle</category>
+	<parameter varname='a' type='__m64'/>
+	<parameter varname='b' type='__m64'/>
+	<description>Shuffle packed 8-bit integers in "a" according to shuffle control mask in the corresponding 8-bit element of "b", and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*8
+	IF b[i+7] == 1
+		dst[i+7:i] := 0
+	ELSE
+		index[2:0] := b[i+2:i]
+		dst[i+7:i] := a[index*8+7:index*8]
+	FI
+ENDFOR
+	</operation>
+	<instruction name='pshufb' form='mm, mm'/>
+	<header>tmmintrin.h</header>
+</intrinsic>
+<intrinsic tech='SSSE3' vexEq='TRUE' rettype='__m128i' name='_mm_alignr_epi8'>
+	<type>Integer</type>
+	<CPUID>SSSE3</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname='a' type='__m128i'/>
+	<parameter varname='b' type='__m128i'/>
+	<parameter varname='count' type='int'/>
+	<description>Concatenate 16-byte blocks in "a" and "b" into a 32-byte temporary result, shift the result right by "count" bytes, and store the low 16 bytes in "dst". </description>
+	<operation>
+tmp[255:0] := ((a[127:0] &lt;&lt; 128) OR b[127:0]) &gt;&gt; (count[7:0]*8)
+dst[127:0] := tmp[127:0]
+	</operation>
+	<instruction name='palignr' form='xmm, xmm, imm'/>
+	<header>tmmintrin.h</header>
+</intrinsic>
+<intrinsic tech='SSSE3' rettype='__m64' name='_mm_alignr_pi8'>
+	<type>Integer</type>
+	<CPUID>SSSE3</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname='a' type='__m64'/>
+	<parameter varname='b' type='__m64'/>
+	<parameter varname='count' type='int'/>
+	<description>Concatenate 8-byte blocks in "a" and "b" into a 16-byte temporary result, shift the result right by "count" bytes, and store the low 8 bytes in "dst". </description>
+	<operation>
+tmp[127:0] := ((a[63:0] &lt;&lt; 64) OR b[63:0]) &gt;&gt; (count[7:0]*8)
+dst[63:0] := tmp[63:0]
+	</operation>
+	<instruction name='palignr' form='mm, mm, imm'/>
+	<header>tmmintrin.h</header>
+</intrinsic>
+<intrinsic tech='SSSE3' vexEq='TRUE' rettype='__m128i' name='_mm_hadd_epi16'>
+	<type>Integer</type>
+	<CPUID>SSSE3</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname='a' type='__m128i'/>
+	<parameter varname='b' type='__m128i'/>
+	<description>Horizontally add adjacent pairs of 16-bit integers in "a" and "b", and pack the signed 16-bit results in "dst".</description>
+	<operation>
+dst[15:0] := a[31:16] + a[15:0]
+dst[31:16] := a[63:48] + a[47:32]
+dst[47:32] := a[95:80] + a[79:64]
+dst[63:48] := a[127:112] + a[111:96]
+dst[79:64] := b[31:16] + b[15:0]
+dst[95:80] := b[63:48] + b[47:32]
+dst[111:96] := b[95:80] + b[79:64]
+dst[127:112] := b[127:112] + b[111:96]
+	</operation>
+	<instruction name='phaddw' form='xmm, xmm'/>
+	<header>tmmintrin.h</header>
+</intrinsic>
+<intrinsic tech='SSSE3' vexEq='TRUE' rettype='__m128i' name='_mm_hadds_epi16'>
+	<type>Integer</type>
+	<CPUID>SSSE3</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname='a' type='__m128i'/>
+	<parameter varname='b' type='__m128i'/>
+	<description>Horizontally add adjacent pairs of 16-bit integers in "a" and "b" using saturation, and pack the signed 16-bit results in "dst".</description>
+	<operation>
+dst[15:0] := Saturate_To_Int16(a[31:16] + a[15:0])
+dst[31:16] := Saturate_To_Int16(a[63:48] + a[47:32])
+dst[47:32] := Saturate_To_Int16(a[95:80] + a[79:64])
+dst[63:48] := Saturate_To_Int16(a[127:112] + a[111:96])
+dst[79:64] := Saturate_To_Int16(b[31:16] + b[15:0])
+dst[95:80] := Saturate_To_Int16(b[63:48] + b[47:32])
+dst[111:96] := Saturate_To_Int16(b[95:80] + b[79:64])
+dst[127:112] := Saturate_To_Int16(b[127:112] + b[111:96])
+	</operation>
+	<instruction name='phaddsw' form='xmm, xmm'/>
+	<header>tmmintrin.h</header>
+</intrinsic>
+<intrinsic tech='SSSE3' vexEq='TRUE' rettype='__m128i' name='_mm_hadd_epi32'>
+	<type>Integer</type>
+	<CPUID>SSSE3</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname='a' type='__m128i'/>
+	<parameter varname='b' type='__m128i'/>
+	<description>Horizontally add adjacent pairs of 32-bit integers in "a" and "b", and pack the signed 32-bit results in "dst".</description>
+	<operation>
+dst[31:0] := a[63:32] + a[31:0]
+dst[63:32] := a[127:96] + a[95:64]
+dst[95:64] := b[63:32] + b[31:0]
+dst[127:96] := b[127:96] + b[95:64]
+	</operation>
+	<instruction name='phaddd' form='xmm, xmm'/>
+	<header>tmmintrin.h</header>
+</intrinsic>
+<intrinsic tech='SSSE3' rettype='__m64' name='_mm_hadd_pi16'>
+	<type>Integer</type>
+	<CPUID>SSSE3</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname='a' type='__m64'/>
+	<parameter varname='b' type='__m64'/>
+	<description>Horizontally add adjacent pairs of 16-bit integers in "a" and "b", and pack the signed 16-bit results in "dst".</description>
+	<operation>
+dst[15:0] := a[31:16] + a[15:0]
+dst[31:16] := a[63:48] + a[47:32]
+dst[47:32] := b[31:16] + b[15:0]
+dst[63:48] := b[63:48] + b[47:32]
+	</operation>
+	<instruction name='phaddw' form='mm, mm'/>
+	<header>tmmintrin.h</header>
+</intrinsic>
+<intrinsic tech='SSSE3' rettype='__m64' name='_mm_hadd_pi32'>
+	<type>Integer</type>
+	<CPUID>SSSE3</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname='a' type='__m64'/>
+	<parameter varname='b' type='__m64'/>
+	<description>Horizontally add adjacent pairs of 32-bit integers in "a" and "b", and pack the signed 32-bit results in "dst".</description>
+	<operation>
+dst[31:0] := a[63:32] + a[31:0]
+dst[63:32] := b[63:32] + b[31:0]
+	</operation>
+	<instruction name='phaddd' form='mm, mm'/>
+	<header>tmmintrin.h</header>
+</intrinsic>
+<intrinsic tech='SSSE3' rettype='__m64' name='_mm_hadds_pi16'>
+	<type>Integer</type>
+	<CPUID>SSSE3</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname='a' type='__m64'/>
+	<parameter varname='b' type='__m64'/>
+	<description>Horizontally add adjacent pairs of 16-bit integers in "a" and "b" using saturation, and pack the signed 16-bit results in "dst".</description>
+	<operation>
+dst[15:0] := Saturate_To_Int16(a[31:16] + a[15:0])
+dst[31:16] := Saturate_To_Int16(a[63:48] + a[47:32])
+dst[47:32] := Saturate_To_Int16(b[31:16] + b[15:0])
+dst[63:48] := Saturate_To_Int16(b[63:48] + b[47:32])
+	</operation>
+	<instruction name='phaddsw' form='mm, mm'/>
+	<header>tmmintrin.h</header>
+</intrinsic>
+<intrinsic tech='SSSE3' vexEq='TRUE' rettype='__m128i' name='_mm_hsub_epi16'>
+	<type>Integer</type>
+	<CPUID>SSSE3</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname='a' type='__m128i'/>
+	<parameter varname='b' type='__m128i'/>
+	<description>Horizontally subtract adjacent pairs of 16-bit integers in "a" and "b", and pack the signed 16-bit results in "dst".</description>
+	<operation>
+dst[15:0] := a[15:0] - a[31:16]
+dst[31:16] := a[47:32] - a[63:48]
+dst[47:32] := a[79:64] - a[95:80]
+dst[63:48] := a[111:96] - a[127:112]
+dst[79:64] := b[15:0] - b[31:16]
+dst[95:80] := b[47:32] - b[63:48]
+dst[111:96] := b[79:64] - b[95:80]
+dst[127:112] := b[111:96] - b[127:112]
+	</operation>
+	<instruction name='phsubw' form='xmm, xmm'/>
+	<header>tmmintrin.h</header>
+</intrinsic>
+<intrinsic tech='SSSE3' vexEq='TRUE' rettype='__m128i' name='_mm_hsubs_epi16'>
+	<type>Integer</type>
+	<CPUID>SSSE3</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname='a' type='__m128i'/>
+	<parameter varname='b' type='__m128i'/>
+	<description>Horizontally subtract adjacent pairs of 16-bit integers in "a" and "b" using saturation, and pack the signed 16-bit results in "dst".</description>
+	<operation>
+dst[15:0] := Saturate_To_Int16(a[15:0] - a[31:16])
+dst[31:16] := Saturate_To_Int16(a[47:32] - a[63:48])
+dst[47:32] := Saturate_To_Int16(a[79:64] - a[95:80])
+dst[63:48] := Saturate_To_Int16(a[111:96] - a[127:112])
+dst[79:64] := Saturate_To_Int16(b[15:0] - b[31:16])
+dst[95:80] := Saturate_To_Int16(b[47:32] - b[63:48])
+dst[111:96] := Saturate_To_Int16(b[79:64] - b[95:80])
+dst[127:112] := Saturate_To_Int16(b[111:96] - b[127:112])
+	</operation>
+	<instruction name='phsubsw' form='xmm, xmm'/>
+	<header>tmmintrin.h</header>
+</intrinsic>
+<intrinsic tech='SSSE3' vexEq='TRUE' rettype='__m128i' name='_mm_hsub_epi32'>
+	<type>Integer</type>
+	<CPUID>SSSE3</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname='a' type='__m128i'/>
+	<parameter varname='b' type='__m128i'/>
+	<description>Horizontally subtract adjacent pairs of 32-bit integers in "a" and "b", and pack the signed 32-bit results in "dst".</description>
+	<operation>
+dst[31:0] := a[31:0] - a[63:32]
+dst[63:32] := a[95:64] - a[127:96]
+dst[95:64] := b[31:0] - b[63:32]
+dst[127:96] := b[95:64] - b[127:96]
+	</operation>
+	<instruction name='phsubd' form='xmm, xmm'/>
+	<header>tmmintrin.h</header>
+</intrinsic>
+<intrinsic tech='SSSE3' rettype='__m64' name='_mm_hsub_pi16'>
+	<type>Integer</type>
+	<CPUID>SSSE3</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname='a' type='__m64'/>
+	<parameter varname='b' type='__m64'/>
+	<description>Horizontally subtract adjacent pairs of 16-bit integers in "a" and "b", and pack the signed 16-bit results in "dst".</description>
+	<operation>
+dst[15:0] := a[15:0] - a[31:16]
+dst[31:16] := a[47:32] - a[63:48]
+dst[47:32] := b[15:0] - b[31:16]
+dst[63:48] := b[47:32] - b[63:48]
+	</operation>
+	<instruction name='phsubw' form='mm, mm'/>
+	<header>tmmintrin.h</header>
+</intrinsic>
+<intrinsic tech='SSSE3' rettype='__m64' name='_mm_hsub_pi32'>
+	<type>Integer</type>
+	<CPUID>SSSE3</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname='a' type='__m64'/>
+	<parameter varname='b' type='__m64'/>
+	<description>Horizontally subtract adjacent pairs of 32-bit integers in "a" and "b", and pack the signed 32-bit results in "dst".</description>
+	<operation>
+dst[31:0] := a[31:0] - a[63:32]
+dst[63:32] := b[31:0] - b[63:32]
+	</operation>
+	<instruction name='phsubd' form='mm, mm'/>
+	<header>tmmintrin.h</header>
+</intrinsic>
+<intrinsic tech='SSSE3' rettype='__m64' name='_mm_hsubs_pi16'>
+	<type>Integer</type>
+	<CPUID>SSSE3</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname='a' type='__m64'/>
+	<parameter varname='b' type='__m64'/>
+	<description>Horizontally subtract adjacent pairs of 16-bit integers in "a" and "b" using saturation, and pack the signed 16-bit results in "dst".</description>
+	<operation>
+dst[15:0] := Saturate_To_Int16(a[15:0] - a[31:16])
+dst[31:16] := Saturate_To_Int16(a[47:32] - a[63:48])
+dst[47:32] := Saturate_To_Int16(b[15:0] - b[31:16])
+dst[63:48] := Saturate_To_Int16(b[47:32] - b[63:48])
+	</operation>
+	<instruction name='phsubsw' form='mm, mm'/>
+	<header>tmmintrin.h</header>
+</intrinsic>
+<intrinsic tech='SSSE3' vexEq='TRUE' rettype='__m128i' name='_mm_maddubs_epi16'>
+	<type>Integer</type>
+	<CPUID>SSSE3</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname='a' type='__m128i'/>
+	<parameter varname='b' type='__m128i'/>
+	<description>Vertically multiply each unsigned 8-bit integer from "a" with the corresponding signed 8-bit integer from "b", producing intermediate signed 16-bit integers. Horizontally add adjacent pairs of intermediate signed 16-bit integers, and pack the saturated results in "dst".
+	</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*16
+	dst[i+15:i] := Saturate_To_Int16( a[i+15:i+8]*b[i+15:i+8] + a[i+7:i]*b[i+7:i] )
+ENDFOR
+	</operation>
+	<instruction name='pmaddubsw' form='xmm, xmm'/>
+	<header>tmmintrin.h</header>
+</intrinsic>
+<intrinsic tech='SSSE3' rettype='__m64' name='_mm_maddubs_pi16'>
+	<type>Integer</type>
+	<CPUID>SSSE3</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname='a' type='__m64'/>
+	<parameter varname='b' type='__m64'/>
+	<description>Vertically multiply each unsigned 8-bit integer from "a" with the corresponding signed 8-bit integer from "b", producing intermediate signed 16-bit integers. Horizontally add adjacent pairs of intermediate signed 16-bit integers, and pack the saturated results in "dst".
+	</description>
+	<operation>
+FOR j := 0 to 3
+	i := j*16
+	dst[i+15:i] := Saturate_To_Int16( a[i+15:i+8]*b[i+15:i+8] + a[i+7:i]*b[i+7:i] )
+ENDFOR
+	</operation>
+	<instruction name='pmaddubsw' form='mm, mm'/>
+	<header>tmmintrin.h</header>
+</intrinsic>
+<intrinsic tech='SSSE3' vexEq='TRUE' rettype='__m128i' name='_mm_mulhrs_epi16'>
+	<type>Integer</type>
+	<CPUID>SSSE3</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname='a' type='__m128i'/>
+	<parameter varname='b' type='__m128i'/>
+	<description>Multiply packed 16-bit integers in "a" and "b", producing intermediate signed 32-bit integers. Truncate each intermediate integer to the 18 most significant bits, round by adding 1, and store bits [16:1] to "dst". </description>
+	<operation>
+FOR j := 0 to 7
+	i := j*16
+	tmp[31:0] := ((a[i+15:i] * b[i+15:i]) &gt;&gt; 14) + 1
+	dst[i+15:i] := tmp[16:1]
+ENDFOR
+	</operation>
+	<instruction name='pmulhrsw' form='xmm, xmm'/>
+	<header>tmmintrin.h</header>
+</intrinsic>
+<intrinsic tech='SSSE3' rettype='__m64' name='_mm_mulhrs_pi16'>
+	<type>Integer</type>
+	<CPUID>SSSE3</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname='a' type='__m64'/>
+	<parameter varname='b' type='__m64'/>
+	<description>Multiply packed 16-bit integers in "a" and "b", producing intermediate signed 32-bit integers. Truncate each intermediate integer to the 18 most significant bits, round by adding 1, and store bits [16:1] to "dst". </description>
+	<operation>
+FOR j := 0 to 3
+	i := j*16
+	tmp[31:0] := ((a[i+15:i] * b[i+15:i]) &gt;&gt; 14) + 1
+	dst[i+15:i] := tmp[16:1]
+ENDFOR
+	</operation>
+	<instruction name='pmulhrsw' form='mm, mm'/>
+	<header>tmmintrin.h</header>
+</intrinsic>
+<intrinsic tech='SSSE3' vexEq='TRUE' rettype='__m128i' name='_mm_sign_epi8'>
+	<type>Integer</type>
+	<CPUID>SSSE3</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname='a' type='__m128i'/>
+	<parameter varname='b' type='__m128i'/>
+	<description>Negate packed 8-bit integers in "a" when the corresponding signed 8-bit integer in "b" is negative, and store the results in "dst". Elements in "dst" are zeroed out when the corresponding element in "b" is zero.</description>
+	<operation>
+FOR j := 0 to 15
+	i := j*8
+	IF b[i+7:i] &lt; 0
+		dst[i+7:i] := NEG(a[i+7:i])
+	ELSE IF b[i+7:i] = 0
+		dst[i+7:i] := 0
+	ELSE
+		dst[i+7:i] := a[i+7:i]
+	FI
+ENDFOR
+	</operation>
+	<instruction name='psignb' form='xmm, xmm'/>
+	<header>tmmintrin.h</header>
+</intrinsic>
+<intrinsic tech='SSSE3' vexEq='TRUE' rettype='__m128i' name='_mm_sign_epi16'>
+	<type>Integer</type>
+	<CPUID>SSSE3</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname='a' type='__m128i'/>
+	<parameter varname='b' type='__m128i'/>
+	<description>Negate packed 16-bit integers in "a" when the corresponding signed 16-bit integer in "b" is negative, and store the results in "dst". Elements in "dst" are zeroed out when the corresponding element in "b" is zero.</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*16
+	IF b[i+15:i] &lt; 0
+		dst[i+15:i] := NEG(a[i+15:i])
+	ELSE IF b[i+15:i] = 0
+		dst[i+15:i] := 0
+	ELSE
+		dst[i+15:i] := a[i+15:i]
+	FI
+ENDFOR
+	</operation>
+	<instruction name='psignw' form='xmm, xmm'/>
+	<header>tmmintrin.h</header>
+</intrinsic>
+<intrinsic tech='SSSE3' vexEq='TRUE' rettype='__m128i' name='_mm_sign_epi32'>
+	<type>Integer</type>
+	<CPUID>SSSE3</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname='a' type='__m128i'/>
+	<parameter varname='b' type='__m128i'/>
+	<description>Negate packed 32-bit integers in "a" when the corresponding signed 32-bit integer in "b" is negative, and store the results in "dst". Elements in "dst" are zeroed out when the corresponding element in "b" is zero.</description>
+	<operation>
+FOR j := 0 to 3
+	i := j*32
+	IF b[i+31:i] &lt; 0
+		dst[i+31:i] := NEG(a[i+31:i])
+	ELSE IF b[i+31:i] = 0
+		dst[i+31:i] := 0
+	ELSE
+		dst[i+31:i] := a[i+31:i]
+	FI
+ENDFOR
+	</operation>
+	<instruction name='psignd' form='xmm, xmm'/>
+	<header>tmmintrin.h</header>
+</intrinsic>
+<intrinsic tech='SSSE3' rettype='__m64' name='_mm_sign_pi8'>
+	<type>Integer</type>
+	<CPUID>SSSE3</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname='a' type='__m64'/>
+	<parameter varname='b' type='__m64'/>
+	<description>Negate packed 8-bit integers in "a" when the corresponding signed 8-bit integer in "b" is negative, and store the results in "dst". Elements in "dst" are zeroed out when the corresponding element in "b" is zero.</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*8
+	IF b[i+7:i] &lt; 0
+		dst[i+7:i] := NEG(a[i+7:i])
+	ELSE IF b[i+7:i] = 0
+		dst[i+7:i] := 0
+	ELSE
+		dst[i+7:i] := a[i+7:i]
+	FI
+ENDFOR
+	</operation>
+	<instruction name='psignb' form='mm, mm'/>
+	<header>tmmintrin.h</header>
+</intrinsic>
+<intrinsic tech='SSSE3' rettype='__m64' name='_mm_sign_pi16'>
+	<type>Integer</type>
+	<CPUID>SSSE3</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname='a' type='__m64'/>
+	<parameter varname='b' type='__m64'/>
+	<description>Negate packed 16-bit integers in "a" when the corresponding signed 16-bit integer in "b" is negative, and store the results in "dst". Elements in "dst" are zeroed out when the corresponding element in "b" is zero.</description>
+	<operation>
+FOR j := 0 to 3
+	i := j*16
+	IF b[i+15:i] &lt; 0
+		dst[i+15:i] := NEG(a[i+15:i])
+	ELSE IF b[i+15:i] = 0
+		dst[i+15:i] := 0
+	ELSE
+		dst[i+15:i] := a[i+15:i]
+	FI
+ENDFOR
+	</operation>
+	<instruction name='psignw' form='mm, mm'/>
+	<header>tmmintrin.h</header>
+</intrinsic>
+<intrinsic tech='SSSE3' rettype='__m64' name='_mm_sign_pi32'>
+	<type>Integer</type>
+	<CPUID>SSSE3</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname='a' type='__m64'/>
+	<parameter varname='b' type='__m64'/>
+	<description>Negate packed 32-bit integers in "a" when the corresponding signed 32-bit integer in "b" is negative, and store the results in "dst". Elements in "dst" are zeroed out when the corresponding element in "b" is zero.</description>
+	<operation>
+FOR j := 0 to 1
+	i := j*32
+	IF b[i+31:i] &lt; 0
+		dst[i+31:i] := NEG(a[i+31:i])
+	ELSE IF b[i+31:i] = 0
+		dst[i+31:i] := 0
+	ELSE
+		dst[i+31:i] := a[i+31:i]
+	FI
+ENDFOR
+	</operation>
+	<instruction name='psignd' form='mm, mm'/>
+	<header>tmmintrin.h</header>
+</intrinsic>
+
+<intrinsic tech='SSE4.1' vexEq='TRUE' rettype='__m128d' name='_mm_blend_pd'>
+	<type>Floating Point</type>
+	<CPUID>SSE4.1</CPUID>
+	<category>Swizzle</category>
+	<parameter varname='a' type='__m128d'/>
+	<parameter varname='b' type='__m128d'/>
+	<parameter varname="imm8" type='const int'/>
+	<description>Blend packed double-precision (64-bit) floating-point elements from "a" and "b" using control mask "imm8", and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 1
+	i := j*64
+	IF imm8[j%8]
+		dst[i+63:i] := b[i+63:i]
+	ELSE
+		dst[i+63:i] := a[i+63:i]
+	FI
+ENDFOR
+	</operation>
+	<instruction name='blendpd' form='xmm, xmm, imm'/>
+	<header>smmintrin.h</header>
+</intrinsic>
+<intrinsic tech='SSE4.1' vexEq='TRUE' rettype='__m128' name='_mm_blend_ps'>
+	<type>Floating Point</type>
+	<CPUID>SSE4.1</CPUID>
+	<category>Swizzle</category>
+	<parameter varname='a' type='__m128'/>
+	<parameter varname='b' type='__m128'/>
+	<parameter varname="imm8" type='const int'/>
+	<description>Blend packed single-precision (32-bit) floating-point elements from "a" and "b" using control mask "imm8", and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 3
+	i := j*32
+	IF imm8[j%8]
+		dst[i+31:i] := b[i+31:i]
+	ELSE
+		dst[i+31:i] := a[i+31:i]
+	FI
+ENDFOR
+	</operation>
+	<instruction name='blendps' form='xmm, xmm, imm'/>
+	<header>smmintrin.h</header>
+</intrinsic>
+<intrinsic tech='SSE4.1' vexEq='TRUE' rettype='__m128d' name='_mm_blendv_pd'>
+	<type>Floating Point</type>
+	<CPUID>SSE4.1</CPUID>
+	<category>Swizzle</category>
+	<parameter varname='a' type='__m128d'/>
+	<parameter varname='b' type='__m128d'/>
+	<parameter varname='mask' type='__m128d'/>
+	<description>Blend packed double-precision (64-bit) floating-point elements from "a" and "b" using "mask", and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 1
+	i := j*64
+	IF mask[i+63]
+		dst[i+63:i] := b[i+63:i]
+	ELSE
+		dst[i+63:i] := a[i+63:i]
+	FI
+ENDFOR
+	</operation>
+	<instruction name='blendvpd' form='xmm, xmm'/>
+	<header>smmintrin.h</header>
+</intrinsic>
+<intrinsic tech='SSE4.1' vexEq='TRUE' rettype='__m128' name='_mm_blendv_ps'>
+	<type>Floating Point</type>
+	<CPUID>SSE4.1</CPUID>
+	<category>Swizzle</category>
+	<parameter varname='a' type='__m128'/>
+	<parameter varname='b' type='__m128'/>
+	<parameter varname='mask' type='__m128'/>
+	<description>Blend packed single-precision (32-bit) floating-point elements from "a" and "b" using "mask", and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 3
+	i := j*32
+	IF mask[i+31]
+		dst[i+31:i] := b[i+31:i]
+	ELSE
+		dst[i+31:i] := a[i+31:i]
+	FI
+ENDFOR
+	</operation>
+	<instruction name='blendvps' form='xmm, xmm'/>
+	<header>smmintrin.h</header>
+</intrinsic>
+<intrinsic tech='SSE4.1' vexEq='TRUE' rettype='__m128i' name='_mm_blendv_epi8'>
+	<type>Integer</type>
+	<CPUID>SSE4.1</CPUID>
+	<category>Swizzle</category>
+	<parameter varname='a' type='__m128i'/>
+	<parameter varname='b' type='__m128i'/>
+	<parameter varname='mask' type='__m128i'/>
+	<description>Blend packed 8-bit integers from "a" and "b" using "mask", and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 15
+	i := j*8
+	IF mask[i+7]
+		dst[i+7:i] := b[i+7:i]
+	ELSE
+		dst[i+7:i] := a[i+7:i]
+	FI
+ENDFOR
+	</operation>
+	<instruction name='pblendvb' form='xmm, xmm'/>
+	<header>smmintrin.h</header>
+</intrinsic>
+<intrinsic tech='SSE4.1' vexEq='TRUE' rettype='__m128i' name='_mm_blend_epi16'>
+	<type>Integer</type>
+	<CPUID>SSE4.1</CPUID>
+	<category>Swizzle</category>
+	<parameter varname='a' type='__m128i'/>
+	<parameter varname='b' type='__m128i'/>
+	<parameter varname="imm8" type='const int'/>
+	<description>Blend packed 16-bit integers from "a" and "b" using control mask "imm8", and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*16
+	IF imm8[j%8]
+		dst[i+15:i] := b[i+15:i]
+	ELSE
+		dst[i+15:i] := a[i+15:i]
+	FI
+ENDFOR
+	</operation>
+	<instruction name='pblendw' form='xmm, xmm, imm'/>
+	<header>smmintrin.h</header>
+</intrinsic>
+<intrinsic tech='SSE4.1' vexEq='TRUE' rettype='__m128d' name='_mm_dp_pd'>
+	<type>Floating Point</type>
+	<CPUID>SSE4.1</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname='a' type='__m128d'/>
+	<parameter varname='b' type='__m128d'/>
+	<parameter varname="imm8" type='const int'/>
+	<description>Conditionally multiply the packed double-precision (64-bit) floating-point elements in "a" and "b" using the high 4 bits in "imm8", sum the two products, and conditionally store the sum in "dst" using the low 4 bits of "imm8".</description>
+	<operation>
+DP(a[127:0], b[127:0], imm8[7:0]) {
+	FOR j := 0 to 1
+		i := j*64
+		IF imm8[(4+j)%8]
+			temp[i+63:i] := a[i+63:i] * b[i+63:i]
+		ELSE
+			temp[i+63:i] := 0
+		FI
+	ENDFOR
+	
+	sum[63:0] := temp[127:64] + temp[63:0]
+	
+	FOR j := 0 to 1
+		i := j*64
+		IF imm8[j%8]
+			tmpdst[i+63:i] := sum[63:0]
+		ELSE
+			tmpdst[i+63:i] := 0
+		FI
+	ENDFOR
+	RETURN tmpdst[127:0]
+}
+
+dst[127:0] := DP(a[127:0], b[127:0], imm8[7:0])
+	</operation>
+	<instruction name='dppd' form='xmm, xmm, imm'/>
+	<header>smmintrin.h</header>
+</intrinsic>
+<intrinsic tech='SSE4.1' vexEq='TRUE' rettype='__m128' name='_mm_dp_ps'>
+	<type>Floating Point</type>
+	<CPUID>SSE4.1</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname='a' type='__m128'/>
+	<parameter varname='b' type='__m128'/>
+	<parameter varname="imm8" type='const int'/>
+	<description>Conditionally multiply the packed single-precision (32-bit) floating-point elements in "a" and "b" using the high 4 bits in "imm8", sum the four products, and conditionally store the sum in "dst" using the low 4 bits of "imm8".</description>
+	<operation>
+DP(a[127:0], b[127:0], imm8[7:0]) {
+	FOR j := 0 to 3
+		i := j*32
+		IF imm8[(4+j)%8]
+			temp[i+31:i] := a[i+31:i] * b[i+31:i]
+		ELSE
+			temp[i+31:i] := 0
+		FI
+	ENDFOR
+	
+	sum[31:0] := (temp[127:96] + temp[95:64]) + (temp[63:32] + temp[31:0])
+	
+	FOR j := 0 to 3
+		i := j*32
+		IF imm8[j%8]
+			tmpdst[i+31:i] := sum[31:0]
+		ELSE
+			tmpdst[i+31:i] := 0
+		FI
+	ENDFOR
+	RETURN tmpdst[127:0]
+}
+
+dst[127:0] := DP(a[127:0], b[127:0], imm8[7:0])
+	</operation>
+	<instruction name='dpps' form='xmm, xmm, imm'/>
+	<header>smmintrin.h</header>
+</intrinsic>
+<intrinsic tech='SSE4.1' vexEq='TRUE' dontShowZeroUnmodMsg='TRUE' rettype='int' name='_mm_extract_ps'>
+	<type>Floating Point</type>
+	<CPUID>SSE4.1</CPUID>
+	<category>Swizzle</category>
+	<parameter varname='a' type='__m128'/>
+	<parameter varname="imm8" type='const int'/>
+	<description>Extract a single-precision (32-bit) floating-point element from "a", selected with "imm8", and store the result in "dst".</description>
+	<operation>
+dst[31:0] := (a[127:0] &gt;&gt; (imm8[1:0] * 32))[31:0]
+	</operation>
+	<instruction name='extractps' form='r32, xmm, imm'/>
+	<header>smmintrin.h</header>
+</intrinsic>
+<intrinsic tech='SSE4.1' vexEq='TRUE' dontShowZeroUnmodMsg='TRUE' rettype='int' name='_mm_extract_epi8'>
+	<type>Integer</type>
+	<CPUID>SSE4.1</CPUID>
+	<category>Swizzle</category>
+	<parameter varname='a' type='__m128i'/>
+	<parameter varname="imm8" type='const int'/>
+	<description>Extract an 8-bit integer from "a", selected with "imm8", and store the result in the lower element of "dst".</description>
+	<operation>
+dst[7:0] := (a[127:0] &gt;&gt; (imm8[3:0] * 8))[7:0]
+dst[31:8] := 0
+	</operation>
+	<instruction name='pextrb' form='r32, xmm, imm'/>
+	<header>smmintrin.h</header>
+</intrinsic>
+<intrinsic tech='SSE4.1' vexEq='TRUE' dontShowZeroUnmodMsg='TRUE' rettype='int' name='_mm_extract_epi32'>
+	<type>Integer</type>
+	<CPUID>SSE4.1</CPUID>
+	<category>Swizzle</category>
+	<parameter varname='a' type='__m128i'/>
+	<parameter varname="imm8" type='const int'/>
+	<description>Extract a 32-bit integer from "a", selected with "imm8", and store the result in "dst".</description>
+	<operation>
+dst[31:0] := (a[127:0] &gt;&gt; (imm8[1:0] * 32))[31:0]
+	</operation>
+	<instruction name='pextrd' form='r32, xmm, imm'/>
+	<header>smmintrin.h</header>
+</intrinsic>
+<intrinsic tech='SSE4.1' vexEq='TRUE' dontShowZeroUnmodMsg='TRUE' rettype='__int64' name='_mm_extract_epi64'>
+	<type>Integer</type>
+	<CPUID>SSE4.1</CPUID>
+	<category>Swizzle</category>
+	<parameter varname='a' type='__m128i'/>
+	<parameter varname="imm8" type='const int'/>
+	<description>Extract a 64-bit integer from "a", selected with "imm8", and store the result in "dst".</description>
+	<operation>
+dst[63:0] := (a[127:0] &gt;&gt; (imm8[0] * 64))[63:0]
+	</operation>
+	<instruction name='pextrq' form='r64, xmm, imm'/>
+	<header>smmintrin.h</header>
+</intrinsic>
+<intrinsic tech='SSE4.1' vexEq='TRUE' rettype='__m128' name='_mm_insert_ps'>
+	<type>Floating Point</type>
+	<CPUID>SSE4.1</CPUID>
+	<category>Swizzle</category>
+	<parameter varname='a' type='__m128'/>
+	<parameter varname='b' type='__m128'/>
+	<parameter varname="imm8" type='const int'/>
+	<description>Copy "a" to "tmp", then insert a single-precision (32-bit) floating-point element from "b" into "tmp" using the control in "imm8". Store "tmp" to "dst" using the mask in "imm8" (elements are zeroed out when the corresponding bit is set). </description>
+	<operation>
+tmp2[127:0] := a[127:0]
+CASE (imm8[7:6]) of
+0: tmp1[31:0] := b[31:0]
+1: tmp1[31:0] := b[63:32]
+2: tmp1[31:0] := b[95:64]
+3: tmp1[31:0] := b[127:96]
+ESAC
+CASE (imm8[5:4]) of
+0: tmp2[31:0] := tmp1[31:0]
+1: tmp2[63:32] := tmp1[31:0]
+2: tmp2[95:64] := tmp1[31:0]
+3: tmp2[127:96] := tmp1[31:0]
+ESAC
+FOR j := 0 to 3
+	i := j*32
+	IF imm8[j%8]
+		dst[i+31:i] := 0
+	ELSE
+		dst[i+31:i] := tmp2[i+31:i]
+	FI
+ENDFOR
+	</operation>
+	<instruction name='insertps' form='xmm, xmm, imm'/>
+	<header>smmintrin.h</header>
+</intrinsic>
+<intrinsic tech='SSE4.1' vexEq='TRUE' rettype='__m128i' name='_mm_insert_epi8'>
+	<type>Integer</type>
+	<CPUID>SSE4.1</CPUID>
+	<category>Swizzle</category>
+	<parameter varname='a' type='__m128i'/>
+	<parameter varname='i' type='int'/>
+	<parameter varname="imm8" type='const int'/>
+	<description>Copy "a" to "dst", and insert the lower 8-bit integer from "i" into "dst" at the location specified by "imm8". </description>
+	<operation>
+dst[127:0] := a[127:0]
+sel := imm8[3:0]*8
+dst[sel+7:sel] := i[7:0]
+	</operation>
+	<instruction name='pinsrb' form='xmm, r32, imm'/>
+	<header>smmintrin.h</header>
+</intrinsic>
+<intrinsic tech='SSE4.1' vexEq='TRUE' rettype='__m128i' name='_mm_insert_epi32'>
+	<type>Integer</type>
+	<CPUID>SSE4.1</CPUID>
+	<category>Swizzle</category>
+	<parameter varname='a' type='__m128i'/>
+	<parameter varname='i' type='int'/>
+	<parameter varname="imm8" type='const int'/>
+	<description>Copy "a" to "dst", and insert the 32-bit integer "i" into "dst" at the location specified by "imm8". </description>
+	<operation>
+dst[127:0] := a[127:0]
+sel := imm8[1:0]*32
+dst[sel+31:sel] := i[31:0]
+	</operation>
+	<instruction name='pinsrd' form='xmm, r32, imm'/>
+	<header>smmintrin.h</header>
+</intrinsic>
+<intrinsic tech='SSE4.1' vexEq='TRUE' rettype='__m128i' name='_mm_insert_epi64'>
+	<type>Integer</type>
+	<CPUID>SSE4.1</CPUID>
+	<category>Swizzle</category>
+	<parameter varname='a' type='__m128i'/>
+	<parameter varname='i' type='__int64'/>
+	<parameter varname="imm8" type='const int'/>
+	<description>Copy "a" to "dst", and insert the 64-bit integer "i" into "dst" at the location specified by "imm8". </description>
+	<operation>
+dst[127:0] := a[127:0]
+sel := imm8[0]*64
+dst[sel+63:sel] := i[63:0]
+	</operation>
+	<instruction name='pinsrq' form='xmm, r64, imm'/>
+	<header>smmintrin.h</header>
+</intrinsic>
+<intrinsic tech='SSE4.1' vexEq='TRUE' rettype='__m128i' name='_mm_max_epi8'>
+	<type>Integer</type>
+	<CPUID>SSE4.1</CPUID>
+	<category>Special Math Functions</category>
+	<parameter varname='a' type='__m128i'/>
+	<parameter varname='b' type='__m128i'/>
+	<description>Compare packed 8-bit integers in "a" and "b", and store packed maximum values in "dst". </description>
+	<operation>
+FOR j := 0 to 15
+	i := j*8
+	IF a[i+7:i] &gt; b[i+7:i]
+		dst[i+7:i] := a[i+7:i]
+	ELSE
+		dst[i+7:i] := b[i+7:i]
+	FI
+ENDFOR
+	</operation>
+	<instruction name='pmaxsb' form='xmm, xmm'/>
+	<header>smmintrin.h</header>
+</intrinsic>
+<intrinsic tech='SSE4.1' vexEq='TRUE' rettype='__m128i' name='_mm_max_epi32'>
+	<type>Integer</type>
+	<CPUID>SSE4.1</CPUID>
+	<category>Special Math Functions</category>
+	<parameter varname='a' type='__m128i'/>
+	<parameter varname='b' type='__m128i'/>
+	<description>Compare packed 32-bit integers in "a" and "b", and store packed maximum values in "dst".</description>
+	<operation>
+FOR j := 0 to 3
+	i := j*32
+	IF a[i+31:i] &gt; b[i+31:i]
+		dst[i+31:i] := a[i+31:i]
+	ELSE
+		dst[i+31:i] := b[i+31:i]
+	FI
+ENDFOR
+	</operation>
+	<instruction name='pmaxsd' form='xmm, xmm'/>
+	<header>smmintrin.h</header>
+</intrinsic>
+<intrinsic tech='SSE4.1' vexEq='TRUE' rettype='__m128i' name='_mm_max_epu32'>
+	<type>Integer</type>
+	<CPUID>SSE4.1</CPUID>
+	<category>Special Math Functions</category>
+	<parameter varname='a' type='__m128i'/>
+	<parameter varname='b' type='__m128i'/>
+	<description>Compare packed unsigned 32-bit integers in "a" and "b", and store packed maximum values in "dst".</description>
+	<operation>
+FOR j := 0 to 3
+	i := j*32
+	IF a[i+31:i] &gt; b[i+31:i]
+		dst[i+31:i] := a[i+31:i]
+	ELSE
+		dst[i+31:i] := b[i+31:i]
+	FI
+ENDFOR
+	</operation>
+	<instruction name='pmaxud' form='xmm, xmm'/>
+	<header>smmintrin.h</header>
+</intrinsic>
+<intrinsic tech='SSE4.1' vexEq='TRUE' rettype='__m128i' name='_mm_max_epu16'>
+	<type>Integer</type>
+	<CPUID>SSE4.1</CPUID>
+	<category>Special Math Functions</category>
+	<parameter varname='a' type='__m128i'/>
+	<parameter varname='b' type='__m128i'/>
+	<description>Compare packed unsigned 16-bit integers in "a" and "b", and store packed maximum values in "dst".</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*16
+	IF a[i+15:i] &gt; b[i+15:i]
+		dst[i+15:i] := a[i+15:i]
+	ELSE
+		dst[i+15:i] := b[i+15:i]
+	FI
+ENDFOR
+	</operation>
+	<instruction name='pmaxuw' form='xmm, xmm'/>
+	<header>smmintrin.h</header>
+</intrinsic>
+<intrinsic tech='SSE4.1' vexEq='TRUE' rettype='__m128i' name='_mm_min_epi8'>
+	<type>Integer</type>
+	<CPUID>SSE4.1</CPUID>
+	<category>Special Math Functions</category>
+	<parameter varname='a' type='__m128i'/>
+	<parameter varname='b' type='__m128i'/>
+	<description>Compare packed 8-bit integers in "a" and "b", and store packed minimum values in "dst".</description>
+	<operation>
+FOR j := 0 to 15
+	i := j*8
+	IF a[i+7:i] &lt; b[i+7:i]
+		dst[i+7:i] := a[i+7:i]
+	ELSE
+		dst[i+7:i] := b[i+7:i]
+	FI
+ENDFOR
+	</operation>
+	<instruction name='pminsb' form='xmm, xmm'/>
+	<header>smmintrin.h</header>
+</intrinsic>
+<intrinsic tech='SSE4.1' vexEq='TRUE' rettype='__m128i' name='_mm_min_epi32'>
+	<type>Integer</type>
+	<CPUID>SSE4.1</CPUID>
+	<category>Special Math Functions</category>
+	<parameter varname='a' type='__m128i'/>
+	<parameter varname='b' type='__m128i'/>
+	<description>Compare packed 32-bit integers in "a" and "b", and store packed minimum values in "dst".</description>
+	<operation>
+FOR j := 0 to 3
+	i := j*32
+	IF a[i+31:i] &lt; b[i+31:i]
+		dst[i+31:i] := a[i+31:i]
+	ELSE
+		dst[i+31:i] := b[i+31:i]
+	FI
+ENDFOR
+	</operation>
+	<instruction name='pminsd' form='xmm, xmm'/>
+	<header>smmintrin.h</header>
+</intrinsic>
+<intrinsic tech='SSE4.1' vexEq='TRUE' rettype='__m128i' name='_mm_min_epu32'>
+	<type>Integer</type>
+	<CPUID>SSE4.1</CPUID>
+	<category>Special Math Functions</category>
+	<parameter varname='a' type='__m128i'/>
+	<parameter varname='b' type='__m128i'/>
+	<description>Compare packed unsigned 32-bit integers in "a" and "b", and store packed minimum values in "dst".</description>
+	<operation>
+FOR j := 0 to 3
+	i := j*32
+	IF a[i+31:i] &lt; b[i+31:i]
+		dst[i+31:i] := a[i+31:i]
+	ELSE
+		dst[i+31:i] := b[i+31:i]
+	FI
+ENDFOR
+	</operation>
+	<instruction name='pminud' form='xmm, xmm'/>
+	<header>smmintrin.h</header>
+</intrinsic>
+<intrinsic tech='SSE4.1' vexEq='TRUE' rettype='__m128i' name='_mm_min_epu16'>
+	<type>Integer</type>
+	<CPUID>SSE4.1</CPUID>
+	<category>Special Math Functions</category>
+	<parameter varname='a' type='__m128i'/>
+	<parameter varname='b' type='__m128i'/>
+	<description>Compare packed unsigned 16-bit integers in "a" and "b", and store packed minimum values in "dst".</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*16
+	IF a[i+15:i] &lt; b[i+15:i]
+		dst[i+15:i] := a[i+15:i]
+	ELSE
+		dst[i+15:i] := b[i+15:i]
+	FI
+ENDFOR
+	</operation>
+	<instruction name='pminuw' form='xmm, xmm'/>
+	<header>smmintrin.h</header>
+</intrinsic>
+<intrinsic tech='SSE4.1' vexEq='TRUE' rettype='__m128i' name='_mm_packus_epi32'>
+	<type>Integer</type>
+	<CPUID>SSE4.1</CPUID>
+	<category>Convert</category>
+	<category>Miscellaneous</category>
+	<parameter varname='a' type='__m128i'/>
+	<parameter varname='b' type='__m128i'/>
+	<description>Convert packed 32-bit integers from "a" and "b" to packed 16-bit integers using unsigned saturation, and store the results in "dst".</description>
+	<operation>
+dst[15:0] := Saturate_Int32_To_UnsignedInt16 (a[31:0])
+dst[31:16] := Saturate_Int32_To_UnsignedInt16 (a[63:32])
+dst[47:32] := Saturate_Int32_To_UnsignedInt16 (a[95:64])
+dst[63:48] := Saturate_Int32_To_UnsignedInt16 (a[127:96])
+dst[79:64] := Saturate_Int32_To_UnsignedInt16 (b[31:0])
+dst[95:80] := Saturate_Int32_To_UnsignedInt16 (b[63:32])
+dst[111:96] := Saturate_Int32_To_UnsignedInt16 (b[95:64])
+dst[127:112] := Saturate_Int32_To_UnsignedInt16 (b[127:96])
+	</operation>
+	<instruction name='packusdw' form='xmm, xmm'/>
+	<header>smmintrin.h</header>
+</intrinsic>
+<intrinsic tech='SSE4.1' vexEq='TRUE' rettype='__m128i' name='_mm_cmpeq_epi64'>
+	<type>Integer</type>
+	<CPUID>SSE4.1</CPUID>
+	<category>Compare</category>
+	<parameter varname='a' type='__m128i'/>
+	<parameter varname='b' type='__m128i'/>
+	<description>Compare packed 64-bit integers in "a" and "b" for equality, and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 1
+	i := j*64
+	dst[i+63:i] := ( a[i+63:i] == b[i+63:i] ) ? 0xFFFFFFFFFFFFFFFF : 0
+ENDFOR
+	</operation>
+	<instruction name='pcmpeqq' form='xmm, xmm'/>
+	<header>smmintrin.h</header>
+</intrinsic>
+<intrinsic tech='SSE4.1' vexEq='TRUE' rettype='__m128i' name='_mm_cvtepi8_epi16'>
+	<type>Integer</type>
+	<CPUID>SSE4.1</CPUID>
+	<category>Convert</category>
+	<parameter varname='a' type='__m128i'/>
+	<description>Sign extend packed 8-bit integers in "a" to packed 16-bit integers, and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*8
+	l := j*16
+	dst[l+15:l] := SignExtend(a[i+7:i])
+ENDFOR
+	</operation>
+	<instruction name='pmovsxbw' form='xmm, xmm'/>
+	<header>smmintrin.h</header>
+</intrinsic>
+<intrinsic tech='SSE4.1' vexEq='TRUE' rettype='__m128i' name='_mm_cvtepi8_epi32'>
+	<type>Integer</type>
+	<CPUID>SSE4.1</CPUID>
+	<category>Convert</category>
+	<parameter varname='a' type='__m128i'/>
+	<description>Sign extend packed 8-bit integers in "a" to packed 32-bit integers, and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 3
+	i := 32*j
+	k := 8*j
+	dst[i+31:i] := SignExtend(a[k+7:k])
+ENDFOR
+	</operation>
+	<instruction name='pmovsxbd' form='xmm, xmm'/>
+	<header>smmintrin.h</header>
+</intrinsic>
+<intrinsic tech='SSE4.1' vexEq='TRUE' rettype='__m128i' name='_mm_cvtepi8_epi64'>
+	<type>Integer</type>
+	<CPUID>SSE4.1</CPUID>
+	<category>Convert</category>
+	<parameter varname='a' type='__m128i'/>
+	<description>Sign extend packed 8-bit integers in the low 8 bytes of "a" to packed 64-bit integers, and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 1
+	i := 64*j
+	k := 8*j
+	dst[i+63:i] := SignExtend(a[k+7:k])
+ENDFOR
+	</operation>
+	<instruction name='pmovsxbq' form='xmm, xmm'/>
+	<header>smmintrin.h</header>
+</intrinsic>
+<intrinsic tech='SSE4.1' vexEq='TRUE' rettype='__m128i' name='_mm_cvtepi16_epi32'>
+	<type>Integer</type>
+	<CPUID>SSE4.1</CPUID>
+	<category>Convert</category>
+	<parameter varname='a' type='__m128i'/>
+	<description>Sign extend packed 16-bit integers in "a" to packed 32-bit integers, and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 3
+	i := 32*j
+	k := 16*j
+	dst[i+31:i] := SignExtend(a[k+15:k])
+ENDFOR
+	</operation>
+	<instruction name='pmovsxwd' form='xmm, xmm'/>
+	<header>smmintrin.h</header>
+</intrinsic>
+<intrinsic tech='SSE4.1' vexEq='TRUE' rettype='__m128i' name='_mm_cvtepi16_epi64'>
+	<type>Integer</type>
+	<CPUID>SSE4.1</CPUID>
+	<category>Convert</category>
+	<parameter varname='a' type='__m128i'/>
+	<description>Sign extend packed 16-bit integers in "a" to packed 64-bit integers, and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 1
+	i := 64*j
+	k := 16*j
+	dst[i+63:i] := SignExtend(a[k+15:k])
+ENDFOR
+	</operation>
+	<instruction name='pmovsxwq' form='xmm, xmm'/>
+	<header>smmintrin.h</header>
+</intrinsic>
+<intrinsic tech='SSE4.1' vexEq='TRUE' rettype='__m128i' name='_mm_cvtepi32_epi64'>
+	<type>Integer</type>
+	<CPUID>SSE4.1</CPUID>
+	<category>Convert</category>
+	<parameter varname='a' type='__m128i'/>
+	<description>Sign extend packed 32-bit integers in "a" to packed 64-bit integers, and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 1
+	i := 64*j
+	k := 32*j
+	dst[i+63:i] := SignExtend(a[k+31:k])
+ENDFOR
+	</operation>
+	<instruction name='pmovsxdq' form='xmm, xmm'/>
+	<header>smmintrin.h</header>
+</intrinsic>
+<intrinsic tech='SSE4.1' vexEq='TRUE' rettype='__m128i' name='_mm_cvtepu8_epi16'>
+	<type>Integer</type>
+	<CPUID>SSE4.1</CPUID>
+	<category>Convert</category>
+	<parameter varname='a' type='__m128i'/>
+	<description>Zero extend packed unsigned 8-bit integers in "a" to packed 16-bit integers, and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*8
+	l := j*16
+	dst[l+15:l] := ZeroExtend(a[i+7:i])
+ENDFOR
+	</operation>
+	<instruction name='pmovzxbw' form='xmm, xmm'/>
+	<header>smmintrin.h</header>
+</intrinsic>
+<intrinsic tech='SSE4.1' vexEq='TRUE' rettype='__m128i' name='_mm_cvtepu8_epi32'>
+	<type>Integer</type>
+	<CPUID>SSE4.1</CPUID>
+	<category>Convert</category>
+	<parameter varname='a' type='__m128i'/>
+	<description>Zero extend packed unsigned 8-bit integers in "a" to packed 32-bit integers, and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 3
+	i := 32*j
+	k := 8*j
+	dst[i+31:i] := ZeroExtend(a[k+7:k])
+ENDFOR
+	</operation>
+	<instruction name='pmovzxbd' form='xmm, xmm'/>
+	<header>smmintrin.h</header>
+</intrinsic>
+<intrinsic tech='SSE4.1' vexEq='TRUE' rettype='__m128i' name='_mm_cvtepu8_epi64'>
+	<type>Integer</type>
+	<CPUID>SSE4.1</CPUID>
+	<category>Convert</category>
+	<parameter varname='a' type='__m128i'/>
+	<description>Zero extend packed unsigned 8-bit integers in the low 8 bytes of "a" to packed 64-bit integers, and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 1
+	i := 64*j
+	k := 8*j
+	dst[i+63:i] := ZeroExtend(a[k+7:k])
+ENDFOR
+	</operation>
+	<instruction name='pmovzxbq' form='xmm, xmm'/>
+	<header>smmintrin.h</header>
+</intrinsic>
+<intrinsic tech='SSE4.1' vexEq='TRUE' rettype='__m128i' name='_mm_cvtepu16_epi32'>
+	<type>Integer</type>
+	<CPUID>SSE4.1</CPUID>
+	<category>Convert</category>
+	<parameter varname='a' type='__m128i'/>
+	<description>Zero extend packed unsigned 16-bit integers in "a" to packed 32-bit integers, and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 3
+	i := 32*j
+	k := 16*j
+	dst[i+31:i] := ZeroExtend(a[k+15:k])
+ENDFOR
+	</operation>
+	<instruction name='pmovzxwd' form='xmm, xmm'/>
+	<header>smmintrin.h</header>
+</intrinsic>
+<intrinsic tech='SSE4.1' vexEq='TRUE' rettype='__m128i' name='_mm_cvtepu16_epi64'>
+	<type>Integer</type>
+	<CPUID>SSE4.1</CPUID>
+	<category>Convert</category>
+	<parameter varname='a' type='__m128i'/>
+	<description>Zero extend packed unsigned 16-bit integers in "a" to packed 64-bit integers, and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 1
+	i := 64*j
+	k := 16*j
+	dst[i+63:i] := ZeroExtend(a[k+15:k])
+ENDFOR
+	</operation>
+	<instruction name='pmovzxwq' form='xmm, xmm'/>
+	<header>smmintrin.h</header>
+</intrinsic>
+<intrinsic tech='SSE4.1' vexEq='TRUE' rettype='__m128i' name='_mm_cvtepu32_epi64'>
+	<type>Integer</type>
+	<CPUID>SSE4.1</CPUID>
+	<category>Convert</category>
+	<parameter varname='a' type='__m128i'/>
+	<description>Zero extend packed unsigned 32-bit integers in "a" to packed 64-bit integers, and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 1
+	i := 64*j
+	k := 32*j
+	dst[i+63:i] := ZeroExtend(a[k+31:k])
+ENDFOR
+	</operation>
+	<instruction name='pmovzxdq' form='xmm, xmm'/>
+	<header>smmintrin.h</header>
+</intrinsic>
+<intrinsic tech='SSE4.1' vexEq='TRUE' rettype='__m128i' name='_mm_mul_epi32'>
+	<type>Integer</type>
+	<CPUID>SSE4.1</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname='a' type='__m128i'/>
+	<parameter varname='b' type='__m128i'/>
+	<description>Multiply the low 32-bit integers from each packed 64-bit element in "a" and "b", and store the signed 64-bit results in "dst". </description>
+	<operation>
+FOR j := 0 to 1
+	i := j*64
+	dst[i+63:i] := a[i+31:i] * b[i+31:i]
+ENDFOR
+	</operation>
+	<instruction name='pmuldq' form='xmm, xmm'/>
+	<header>smmintrin.h</header>
+</intrinsic>
+<intrinsic tech='SSE4.1' vexEq='TRUE' rettype='__m128i' name='_mm_mullo_epi32'>
+	<type>Integer</type>
+	<CPUID>SSE4.1</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname='a' type='__m128i'/>
+	<parameter varname='b' type='__m128i'/>
+	<description>Multiply the packed 32-bit integers in "a" and "b", producing intermediate 64-bit integers, and store the low 32 bits of the intermediate integers in "dst". </description>
+	<operation>
+FOR j := 0 to 3
+	i := j*32
+	tmp[63:0] := a[i+31:i] * b[i+31:i]
+	dst[i+31:i] := tmp[31:0]
+ENDFOR
+	</operation>
+	<instruction name='pmulld' form='xmm, xmm'/>
+	<header>smmintrin.h</header>
+</intrinsic>
+<intrinsic tech='SSE4.1' vexEq='TRUE' dontShowZeroUnmodMsg='TRUE' rettype='int' name='_mm_testz_si128'>
+	<type>Integer</type>
+	<CPUID>SSE4.1</CPUID>
+	<category>Logical</category>
+	<parameter varname='a' type='__m128i'/>
+	<parameter varname='b' type='__m128i'/>
+	<description>Compute the bitwise AND of 128 bits (representing integer data) in "a" and "b", and set "ZF" to 1 if the result is zero, otherwise set "ZF" to 0. Compute the bitwise NOT of "a" and then AND with "b", and set "CF" to 1 if the result is zero, otherwise set "CF" to 0. Return the "ZF" value.</description>
+	<operation>
+IF (a[127:0] AND b[127:0] == 0)
+	ZF := 1
+ELSE
+	ZF := 0
+FI
+IF ((NOT a[127:0]) AND b[127:0] == 0)
+	CF := 1
+ELSE
+	CF := 0
+FI
+RETURN ZF
+	</operation>
+	<instruction name='ptest' form='xmm, xmm'/>
+	<header>smmintrin.h</header>
+</intrinsic>
+<intrinsic tech='SSE4.1' vexEq='TRUE' dontShowZeroUnmodMsg='TRUE' rettype='int' name='_mm_testc_si128'>
+	<type>Integer</type>
+	<CPUID>SSE4.1</CPUID>
+	<category>Logical</category>
+	<parameter varname='a' type='__m128i'/>
+	<parameter varname='b' type='__m128i'/>
+	<description>Compute the bitwise AND of 128 bits (representing integer data) in "a" and "b", and set "ZF" to 1 if the result is zero, otherwise set "ZF" to 0. Compute the bitwise NOT of "a" and then AND with "b", and set "CF" to 1 if the result is zero, otherwise set "CF" to 0. Return the "CF" value.</description>
+	<operation>
+IF (a[127:0] AND b[127:0] == 0)
+	ZF := 1
+ELSE
+	ZF := 0
+FI
+IF ((NOT a[127:0]) AND b[127:0] == 0)
+	CF := 1
+ELSE
+	CF := 0
+FI
+RETURN CF
+	</operation>
+	<instruction name='ptest' form='xmm, xmm'/>
+	<header>smmintrin.h</header>
+</intrinsic>
+<intrinsic tech='SSE4.1' vexEq='TRUE' dontShowZeroUnmodMsg='TRUE' rettype='int' name='_mm_testnzc_si128'>
+	<type>Integer</type>
+	<CPUID>SSE4.1</CPUID>
+	<category>Logical</category>
+	<parameter varname='a' type='__m128i'/>
+	<parameter varname='b' type='__m128i'/>
+	<description>Compute the bitwise AND of 128 bits (representing integer data) in "a" and "b", and set "ZF" to 1 if the result is zero, otherwise set "ZF" to 0. Compute the bitwise NOT of "a" and then AND with "b", and set "CF" to 1 if the result is zero, otherwise set "CF" to 0. Return 1 if both the "ZF" and "CF" values are zero, otherwise return 0.</description>
+	<operation>
+IF (a[127:0] AND b[127:0] == 0)
+	ZF := 1
+ELSE
+	ZF := 0
+FI
+IF ((NOT a[127:0]) AND b[127:0] == 0)
+	CF := 1
+ELSE
+	CF := 0
+FI
+IF (ZF == 0 &amp;&amp; CF == 0)
+	RETURN 1
+ELSE
+	RETURN 0
+FI
+	</operation>
+	<instruction name='ptest' form='xmm, xmm'/>
+	<header>smmintrin.h</header>
+</intrinsic>
+<intrinsic tech='SSE4.1' vexEq='TRUE' dontShowZeroUnmodMsg='TRUE' rettype='int' name='_mm_test_all_zeros'>
+	<type>Integer</type>
+	<CPUID>SSE4.1</CPUID>
+	<category>Logical</category>
+	<parameter varname='a' type='__m128i'/>
+	<parameter varname='mask' type='__m128i'/>
+	<description>Compute the bitwise AND of 128 bits (representing integer data) in "a" and "mask", and return 1 if the result is zero, otherwise return 0.</description>
+	<operation>
+IF (a[127:0] AND mask[127:0] == 0)
+	ZF := 1
+ELSE
+	ZF := 0
+FI
+RETURN ZF
+	</operation>
+	<instruction name='ptest' form='xmm, xmm'/>
+	<header>smmintrin.h</header>
+</intrinsic>
+<intrinsic tech='SSE4.1' vexEq='TRUE' dontShowZeroUnmodMsg='TRUE' rettype='int' name='_mm_test_mix_ones_zeros'>
+	<type>Integer</type>
+	<CPUID>SSE4.1</CPUID>
+	<category>Logical</category>
+	<parameter varname='a' type='__m128i'/>
+	<parameter varname='mask' type='__m128i'/>
+	 <description>Compute the bitwise AND of 128 bits (representing integer data) in "a" and "mask", and set "ZF" to 1 if the result is zero, otherwise set "ZF" to 0. Compute the bitwise NOT of "a" and then AND with "mask", and set "CF" to 1 if the result is zero, otherwise set "CF" to 0. Return 1 if both the "ZF" and "CF" values are zero, otherwise return 0.</description>
+	<operation>
+IF (a[127:0] AND mask[127:0] == 0)
+	ZF := 1
+ELSE
+	ZF := 0
+FI
+IF ((NOT a[127:0]) AND mask[127:0] == 0)
+	CF := 1
+ELSE
+	CF := 0
+FI
+IF (ZF == 0 &amp;&amp; CF == 0)
+	RETURN 1
+ELSE
+	RETURN 0
+FI
+	</operation>
+	<instruction name='ptest' form='xmm, xmm'/>
+	<header>smmintrin.h</header>
+</intrinsic>
+<intrinsic tech='SSE4.1' vexEq='TRUE' dontShowZeroUnmodMsg='TRUE' sequence='true' rettype='int' name='_mm_test_all_ones'>
+	<type>Integer</type>
+	<CPUID>SSE4.1</CPUID>
+	<category>Logical</category>
+	<parameter varname='a' type='__m128i'/>
+	<description>Compute the bitwise NOT of "a" and then AND with a 128-bit vector containing all 1's, and return 1 if the result is zero, otherwise return 0.</description>
+	<operation>
+FOR j := 0 to 127
+	tmp[j] := 1
+ENDFOR
+
+IF ((NOT a[127:0]) AND tmp[127:0] == 0)
+	CF := 1
+ELSE
+	CF := 0
+FI
+RETURN CF
+	</operation>
+	<instruction name='pcmpeqd' form='xmm, xmm'/>
+	<instruction name='ptest' form='xmm, xmm'/>
+	<header>smmintrin.h</header>
+</intrinsic>
+<intrinsic tech='SSE4.1' vexEq='TRUE' rettype='__m128d' name='_mm_round_pd'>
+	<type>Floating Point</type>
+	<CPUID>SSE4.1</CPUID>
+	<category>Special Math Functions</category>
+	<parameter varname='a' type='__m128d'/>
+	<parameter varname='rounding' type='int'/>
+	<description>Round the packed double-precision (64-bit) floating-point elements in "a" using the "rounding" parameter, and store the results as packed double-precision floating-point elements in "dst".
+	[round_note]
+	</description>
+	<operation>
+FOR j := 0 to 1
+	i := j*64
+	dst[i+63:i] := ROUND(a[i+63:i])
+ENDFOR
+	</operation>
+	<instruction name='roundpd' form='xmm, xmm, imm'/>
+	<header>smmintrin.h</header>
+</intrinsic>
+<intrinsic tech='SSE4.1' vexEq='TRUE' rettype='__m128d' name='_mm_floor_pd'>
+	<type>Floating Point</type>
+	<CPUID>SSE4.1</CPUID>
+	<category>Special Math Functions</category>
+	<parameter varname='a' type='__m128d'/>
+	<description>Round the packed double-precision (64-bit) floating-point elements in "a" down to an integer value, and store the results as packed double-precision floating-point elements in "dst".</description>
+	<operation>
+FOR j := 0 to 1
+	i := j*64
+	dst[i+63:i] := FLOOR(a[i+63:i])
+ENDFOR
+	</operation>
+	<instruction name='roundpd' form='xmm, xmm, imm'/>
+	<header>smmintrin.h</header>
+</intrinsic>
+<intrinsic tech='SSE4.1' vexEq='TRUE' rettype='__m128d' name='_mm_ceil_pd'>
+	<type>Floating Point</type>
+	<CPUID>SSE4.1</CPUID>
+	<category>Special Math Functions</category>
+	<parameter varname='a' type='__m128d'/>
+	<description>Round the packed double-precision (64-bit) floating-point elements in "a" up to an integer value, and store the results as packed double-precision floating-point elements in "dst".</description>
+	<operation>
+FOR j := 0 to 1
+	i := j*64
+	dst[i+63:i] := CEIL(a[i+63:i])
+ENDFOR
+	</operation>
+	<instruction name='roundpd' form='xmm, xmm, imm'/>
+	<header>smmintrin.h</header>
+</intrinsic>
+<intrinsic tech='SSE4.1' vexEq='TRUE' rettype='__m128' name='_mm_round_ps'>
+	<type>Floating Point</type>
+	<CPUID>SSE4.1</CPUID>
+	<category>Special Math Functions</category>
+	<parameter varname='a' type='__m128'/>
+	<parameter varname='rounding' type='int'/>
+	<description>Round the packed single-precision (32-bit) floating-point elements in "a" using the "rounding" parameter, and store the results as packed single-precision floating-point elements in "dst".
+	[round_note]
+	</description>
+	<operation>
+FOR j := 0 to 3
+	i := j*32
+	dst[i+31:i] := ROUND(a[i+31:i])
+ENDFOR
+	</operation>
+	<instruction name='roundps' form='xmm, xmm, imm'/>
+	<header>smmintrin.h</header>
+</intrinsic>
+<intrinsic tech='SSE4.1' vexEq='TRUE' rettype='__m128' name='_mm_floor_ps'>
+	<type>Floating Point</type>
+	<CPUID>SSE4.1</CPUID>
+	<category>Special Math Functions</category>
+	<parameter varname='a' type='__m128'/>
+	<description>Round the packed single-precision (32-bit) floating-point elements in "a" down to an integer value, and store the results as packed single-precision floating-point elements in "dst".</description>
+	<operation>
+FOR j := 0 to 3
+	i := j*32
+	dst[i+31:i] := FLOOR(a[i+31:i])
+ENDFOR
+	</operation>
+	<instruction name='roundps' form='xmm, xmm, imm'/>
+	<header>smmintrin.h</header>
+</intrinsic>
+<intrinsic tech='SSE4.1' vexEq='TRUE' rettype='__m128' name='_mm_ceil_ps'>
+	<type>Floating Point</type>
+	<CPUID>SSE4.1</CPUID>
+	<category>Special Math Functions</category>
+	<parameter varname='a' type='__m128'/>
+	<description>Round the packed single-precision (32-bit) floating-point elements in "a" up to an integer value, and store the results as packed single-precision floating-point elements in "dst".</description>
+	<operation>
+FOR j := 0 to 3
+	i := j*32
+	dst[i+31:i] := CEIL(a[i+31:i])
+ENDFOR
+	</operation>
+	<instruction name='roundps' form='xmm, xmm, imm'/>
+	<header>smmintrin.h</header>
+</intrinsic>
+<intrinsic tech='SSE4.1' vexEq='TRUE' rettype='__m128d' name='_mm_round_sd'>
+	<type>Floating Point</type>
+	<CPUID>SSE4.1</CPUID>
+	<category>Special Math Functions</category>
+	<parameter varname='a' type='__m128d'/>
+	<parameter varname='b' type='__m128d'/>
+	<parameter varname='rounding' type='int'/>
+	<description>Round the lower double-precision (64-bit) floating-point element in "b" using the "rounding" parameter, store the result as a double-precision floating-point element in the lower element of "dst", and copy the upper element from "a" to the upper element of "dst".
+	[round_note]
+	</description>
+	<operation>
+dst[63:0] := ROUND(b[63:0])
+dst[127:64] := a[127:64]
+	</operation>
+	<instruction name='roundsd' form='xmm, xmm, imm'/>
+	<header>smmintrin.h</header>
+</intrinsic>
+<intrinsic tech='SSE4.1' vexEq='TRUE' rettype='__m128d' name='_mm_floor_sd'>
+	<type>Floating Point</type>
+	<CPUID>SSE4.1</CPUID>
+	<category>Special Math Functions</category>
+	<parameter varname='a' type='__m128d'/>
+	<parameter varname='b' type='__m128d'/>
+	<description>Round the lower double-precision (64-bit) floating-point element in "b" down to an integer value, store the result as a double-precision floating-point element in the lower element of "dst", and copy the upper element from "a" to the upper element of "dst".</description>
+	<operation>
+dst[63:0] := FLOOR(b[63:0])
+dst[127:64] := a[127:64]
+	</operation>
+	<instruction name='roundsd' form='xmm, xmm, imm'/>
+	<header>smmintrin.h</header>
+</intrinsic>
+<intrinsic tech='SSE4.1' vexEq='TRUE' rettype='__m128d' name='_mm_ceil_sd'>
+	<type>Floating Point</type>
+	<CPUID>SSE4.1</CPUID>
+	<category>Special Math Functions</category>
+	<parameter varname='a' type='__m128d'/>
+	<parameter varname='b' type='__m128d'/>
+	<description>Round the lower double-precision (64-bit) floating-point element in "b" up to an integer value, store the result as a double-precision floating-point element in the lower element of "dst", and copy the upper element from "a" to the upper element of "dst".</description>
+	<operation>
+dst[63:0] := CEIL(b[63:0])
+dst[127:64] := a[127:64]
+	</operation>
+	<instruction name='roundsd' form='xmm, xmm, imm'/>
+	<header>smmintrin.h</header>
+</intrinsic>
+<intrinsic tech='SSE4.1' vexEq='TRUE' rettype='__m128' name='_mm_round_ss'>
+	<type>Floating Point</type>
+	<CPUID>SSE4.1</CPUID>
+	<category>Special Math Functions</category>
+	<parameter varname='a' type='__m128'/>
+	<parameter varname='b' type='__m128'/>
+	<parameter varname='rounding' type='int'/>
+	<description>Round the lower single-precision (32-bit) floating-point element in "b" using the "rounding" parameter, store the result as a single-precision floating-point element in the lower element of "dst", and copy the upper 3 packed elements from "a" to the upper elements of "dst".
+	[round_note]
+	</description>
+	<operation>
+dst[31:0] := ROUND(b[31:0])
+dst[127:32] := a[127:32]
+	</operation>
+	<instruction name='roundss' form='xmm, xmm, imm'/>
+	<header>smmintrin.h</header>
+</intrinsic>
+<intrinsic tech='SSE4.1' vexEq='TRUE' rettype='__m128' name='_mm_floor_ss'>
+	<type>Floating Point</type>
+	<CPUID>SSE4.1</CPUID>
+	<category>Special Math Functions</category>
+	<parameter varname='a' type='__m128'/>
+	<parameter varname='b' type='__m128'/>
+	<description>Round the lower single-precision (32-bit) floating-point element in "b" down to an integer value, store the result as a single-precision floating-point element in the lower element of "dst", and copy the upper 3 packed elements from "a" to the upper elements of "dst".</description>
+	<operation>
+dst[31:0] := FLOOR(b[31:0])
+dst[127:32] := a[127:32]
+	</operation>
+	<instruction name='roundss' form='xmm, xmm, imm'/>
+	<header>smmintrin.h</header>
+</intrinsic>
+<intrinsic tech='SSE4.1' vexEq='TRUE' rettype='__m128' name='_mm_ceil_ss'>
+	<type>Floating Point</type>
+	<CPUID>SSE4.1</CPUID>
+	<category>Special Math Functions</category>
+	<parameter varname='a' type='__m128'/>
+	<parameter varname='b' type='__m128'/>
+	<description>Round the lower single-precision (32-bit) floating-point element in "b" up to an integer value, store the result as a single-precision floating-point element in the lower element of "dst", and copy the upper 3 packed elements from "a" to the upper elements of "dst".</description>
+	<operation>
+dst[31:0] := CEIL(b[31:0])
+dst[127:32] := a[127:32]
+	</operation>
+	<instruction name='roundss' form='xmm, xmm, imm'/>
+	<header>smmintrin.h</header>
+</intrinsic>
+<intrinsic tech='SSE4.1' vexEq='TRUE' rettype='__m128i' name='_mm_minpos_epu16'>
+	<type>Integer</type>
+	<CPUID>SSE4.1</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname='a' type='__m128i'/>
+	<description>Horizontally compute the minimum amongst the packed unsigned 16-bit integers in "a", store the minimum and index in "dst", and zero the remaining bits in "dst".</description>
+	<operation>
+index[2:0] := 0
+min[15:0] := a[15:0]
+FOR j := 0 to 7
+	i := j*16
+	IF a[i+15:i] &lt; min[15:0]
+		index[2:0] := j
+		min[15:0] := a[i+15:i]
+	FI
+ENDFOR
+dst[15:0] := min[15:0]
+dst[18:16] := index[2:0]
+dst[127:19] := 0
+	</operation>
+	<instruction name='phminposuw' form='xmm, xmm'/>
+	<header>smmintrin.h</header>
+</intrinsic>
+<intrinsic tech='SSE4.1' vexEq='TRUE' rettype='__m128i' name='_mm_mpsadbw_epu8'>
+	<type>Integer</type>
+	<CPUID>SSE4.1</CPUID>
+	<category>Arithmetic</category>
+	<category>Miscellaneous</category>
+	<parameter varname='a' type='__m128i'/>
+	<parameter varname='b' type='__m128i'/>
+	<parameter varname="imm8" type='const int'/>
+	<description>Compute the sum of absolute differences (SADs) of quadruplets of unsigned 8-bit integers in "a" compared to those in "b", and store the 16-bit results in "dst".
+	Eight SADs are performed using one quadruplet from "b" and eight quadruplets from "a". One quadruplet is selected from "b" starting at the offset specified in "imm8". Eight quadruplets are formed from sequential 8-bit integers selected from "a" starting at the offset specified in "imm8".</description>
+	<operation>
+MPSADBW(a[127:0], b[127:0], imm8[2:0]) {
+	a_offset := imm8[2]*32
+	b_offset := imm8[1:0]*32
+	FOR j := 0 to 7
+		i := j*8
+		k := a_offset+i
+		l := b_offset
+		tmp[i*2+15:i*2] := ABS(a[k+7:k] - b[l+7:l]) + ABS(a[k+15:k+8] - b[l+15:l+8]) + ABS(a[k+23:k+16] - b[l+23:l+16]) + ABS(a[k+31:k+24] - b[l+31:l+24])
+	ENDFOR
+	RETURN tmp[127:0]
+}
+
+dst[127:0] := MPSADBW(a[127:0], b[127:0], imm8[2:0])
+	</operation>
+	<instruction name='mpsadbw' form='xmm, xmm, imm'/>
+	<header>smmintrin.h</header>
+</intrinsic>
+<intrinsic tech='SSE4.1' vexEq='TRUE' rettype='__m128i' name='_mm_stream_load_si128'>
+	<type>Integer</type>
+	<CPUID>SSE4.1</CPUID>
+	<category>Load</category>
+	<parameter varname='mem_addr' type='const __m128i*'/>
+	<description>Load 128-bits of integer data from memory into "dst" using a non-temporal memory hint.
+	"mem_addr" must be aligned on a 16-byte boundary or a general-protection exception may be generated.</description>
+	<operation>
+dst[127:0] := MEM[mem_addr+127:mem_addr]
+	</operation>
+	<instruction name='movntdqa' form='xmm, m128'/>
+	<header>smmintrin.h</header>
+</intrinsic>
+
+<intrinsic tech='SSE4.2' vexEq='TRUE' dontShowZeroUnmodMsg='TRUE' rettype='__m128i' name='_mm_cmpistrm'>
+	<CPUID>SSE4.2</CPUID>
+	<category>String Compare</category>
+	<parameter varname='a' type='__m128i'/>
+	<parameter varname='b' type='__m128i'/>
+	<parameter varname="imm8" type='const int'/>
+	<description>Compare packed strings with implicit lengths in "a" and "b" using the control in "imm8", and store the generated mask in "dst".
+	[strcmp_note]
+	</description>
+	<operation>
+size := (imm8[0] ? 16 : 8) // 8 or 16-bit characters
+UpperBound := (128 / size) - 1
+
+// compare all characters
+aInvalid := 0
+bInvalid := 0
+FOR i := 0 to UpperBound
+	m := i*size
+	FOR j := 0 to UpperBound
+		n := j*size
+		BoolRes[i][j] := (a[m+size-1:m] == b[n+size-1:n])
+		
+		// invalidate characters after EOS
+		IF a[m+size-1:m] == 0
+			aInvalid := 1
+		FI
+		IF b[n+size-1:n] == 0
+			bInvalid := 1
+		FI
+		
+		// override comparisons for invalid characters
+		CASE (imm8[3:2]) OF
+		0:  // equal any
+			IF (!aInvalid &amp;&amp; bInvalid)
+				BoolRes[i][j] := 0
+			ELSE IF (aInvalid &amp;&amp; !bInvalid)
+				BoolRes[i][j] := 0
+			ELSE If (aInvalid &amp;&amp; bInvalid)
+				BoolRes[i][j] := 0
+			FI
+		1:  // ranges
+			IF (!aInvalid &amp;&amp; bInvalid)
+				BoolRes[i][j] := 0
+			ELSE IF (aInvalid &amp;&amp; !bInvalid)
+				BoolRes[i][j] := 0
+			ELSE If (aInvalid &amp;&amp; bInvalid)
+				BoolRes[i][j] := 0
+			FI
+		2:  // equal each
+			IF (!aInvalid &amp;&amp; bInvalid)
+				BoolRes[i][j] := 0
+			ELSE IF (aInvalid &amp;&amp; !bInvalid)
+				BoolRes[i][j] := 0
+			ELSE If (aInvalid &amp;&amp; bInvalid)
+				BoolRes[i][j] := 1
+			FI
+		3:  // equal ordered
+			IF (!aInvalid &amp;&amp; bInvalid)
+				BoolRes[i][j] := 0
+			ELSE IF (aInvalid &amp;&amp; !bInvalid)
+				BoolRes[i][j] := 1
+			ELSE If (aInvalid &amp;&amp; bInvalid)
+				BoolRes[i][j] := 1
+			FI
+		ESAC
+	ENDFOR
+ENDFOR
+
+// aggregate results
+CASE (imm8[3:2]) OF
+0:  // equal any
+	IntRes1 := 0
+	FOR i := 0 to UpperBound
+		FOR j := 0 to UpperBound
+			IntRes1[i] := IntRes1[i] OR BoolRes[i][j]
+		ENDFOR
+	ENDFOR
+1:  // ranges
+	IntRes1 := 0
+	FOR i := 0 to UpperBound
+		FOR j := 0 to UpperBound, j += 2
+			IntRes1[i] := IntRes1[i] OR (BoolRes[i][j] AND BoolRes[i][j+1])
+		ENDFOR
+	ENDFOR
+2:  // equal each
+	IntRes1 := 0
+	FOR i := 0 to UpperBound
+		IntRes1[i] := BoolRes[i][i]
+	ENDFOR
+3:  // equal ordered
+	IntRes1 := (imm8[0] ? 0xFF : 0xFFFF)
+	FOR i := 0 to UpperBound
+		k := i
+		FOR j := 0 to UpperBound-i
+			IntRes1[i] := IntRes1[i] AND BoolRes[k][j]
+			k++
+		ENDFOR
+	ENDFOR
+ESAC
+
+// optionally negate results
+bInvalid := 0
+FOR i := 0 to UpperBound
+	IF imm8[4]
+		IF imm8[5] // only negate valid
+			IF b[n+size-1:n] == 0
+				bInvalid := 1
+			FI
+			IF bInvalid // invalid, don't negate
+				IntRes2[i] := IntRes1[i]
+			ELSE // valid, negate
+				IntRes2[i] := -1 XOR IntRes1[i]
+			FI
+		ELSE // negate all
+			IntRes2[i] := -1 XOR IntRes1[i]
+		FI
+	ELSE // don't negate
+		IntRes2[i] := IntRes1[i]
+	FI
+ENDFOR
+
+// output
+IF imm8[6] // byte / word mask
+	FOR i := 0 to UpperBound
+		j := i*size
+		IF IntRes2[i]
+			dst[j+size-1:j] := (imm8[0] ? 0xFF : 0xFFFF)
+		ELSE
+			dst[j+size-1:j] := 0
+		FI
+	ENDFOR
+ELSE // bit mask
+	dst[UpperBound:0] := IntRes2[UpperBound:0]
+	dst[127:UpperBound+1] := 0
+FI
+	</operation>
+	<instruction name='pcmpistrm' form='xmm, xmm, imm'/>
+	<header>nmmintrin.h</header>
+</intrinsic>
+<intrinsic tech='SSE4.2' vexEq='TRUE' dontShowZeroUnmodMsg='TRUE' rettype='int' name='_mm_cmpistri'>
+	<CPUID>SSE4.2</CPUID>
+	<category>String Compare</category>
+	<parameter varname='a' type='__m128i'/>
+	<parameter varname='b' type='__m128i'/>
+	<parameter varname="imm8" type='const int'/>
+	<description>Compare packed strings with implicit lengths in "a" and "b" using the control in "imm8", and store the generated index in "dst".
+	[strcmp_note]
+	</description>
+	<operation>
+size := (imm8[0] ? 16 : 8) // 8 or 16-bit characters
+UpperBound := (128 / size) - 1
+
+// compare all characters
+aInvalid := 0
+bInvalid := 0
+FOR i := 0 to UpperBound
+	m := i*size
+	FOR j := 0 to UpperBound
+		n := j*size
+		BoolRes[i][j] := (a[m+size-1:m] == b[n+size-1:n])
+		
+		// invalidate characters after EOS
+		IF a[m+size-1:m] == 0
+			aInvalid := 1
+		FI
+		IF b[n+size-1:n] == 0
+			bInvalid := 1
+		FI
+		
+		// override comparisons for invalid characters
+		CASE (imm8[3:2]) OF
+		0:  // equal any
+			IF (!aInvalid &amp;&amp; bInvalid)
+				BoolRes[i][j] := 0
+			ELSE IF (aInvalid &amp;&amp; !bInvalid)
+				BoolRes[i][j] := 0
+			ELSE If (aInvalid &amp;&amp; bInvalid)
+				BoolRes[i][j] := 0
+			FI
+		1:  // ranges
+			IF (!aInvalid &amp;&amp; bInvalid)
+				BoolRes[i][j] := 0
+			ELSE IF (aInvalid &amp;&amp; !bInvalid)
+				BoolRes[i][j] := 0
+			ELSE If (aInvalid &amp;&amp; bInvalid)
+				BoolRes[i][j] := 0
+			FI
+		2:  // equal each
+			IF (!aInvalid &amp;&amp; bInvalid)
+				BoolRes[i][j] := 0
+			ELSE IF (aInvalid &amp;&amp; !bInvalid)
+				BoolRes[i][j] := 0
+			ELSE If (aInvalid &amp;&amp; bInvalid)
+				BoolRes[i][j] := 1
+			FI
+		3:  // equal ordered
+			IF (!aInvalid &amp;&amp; bInvalid)
+				BoolRes[i][j] := 0
+			ELSE IF (aInvalid &amp;&amp; !bInvalid)
+				BoolRes[i][j] := 1
+			ELSE If (aInvalid &amp;&amp; bInvalid)
+				BoolRes[i][j] := 1
+			FI
+		ESAC
+	ENDFOR
+ENDFOR
+
+// aggregate results
+CASE (imm8[3:2]) OF
+0:  // equal any
+	IntRes1 := 0
+	FOR i := 0 to UpperBound
+		FOR j := 0 to UpperBound
+			IntRes1[i] := IntRes1[i] OR BoolRes[i][j]
+		ENDFOR
+	ENDFOR
+1:  // ranges
+	IntRes1 := 0
+	FOR i := 0 to UpperBound
+		FOR j := 0 to UpperBound, j += 2
+			IntRes1[i] := IntRes1[i] OR (BoolRes[i][j] AND BoolRes[i][j+1])
+		ENDFOR
+	ENDFOR
+2:  // equal each
+	IntRes1 := 0
+	FOR i := 0 to UpperBound
+		IntRes1[i] := BoolRes[i][i]
+	ENDFOR
+3:  // equal ordered
+	IntRes1 := (imm8[0] ? 0xFF : 0xFFFF)
+	FOR i := 0 to UpperBound
+		k := i
+		FOR j := 0 to UpperBound-i
+			IntRes1[i] := IntRes1[i] AND BoolRes[k][j]
+			k++
+		ENDFOR
+	ENDFOR
+ESAC
+
+// optionally negate results
+bInvalid := 0
+FOR i := 0 to UpperBound
+	IF imm8[4]
+		IF imm8[5] // only negate valid
+			IF b[n+size-1:n] == 0
+				bInvalid := 1
+			FI
+			IF bInvalid // invalid, don't negate
+				IntRes2[i] := IntRes1[i]
+			ELSE // valid, negate
+				IntRes2[i] := -1 XOR IntRes1[i]
+			FI
+		ELSE // negate all
+			IntRes2[i] := -1 XOR IntRes1[i]
+		FI
+	ELSE // don't negate
+		IntRes2[i] := IntRes1[i]
+	FI
+ENDFOR
+
+// output
+IF imm8[6] // most significant bit
+	tmp := UpperBound
+	dst := tmp
+	DO WHILE ((tmp &gt;= 0) AND IntRes2[tmp] = 0)
+		tmp := tmp - 1
+		dst := tmp
+	OD
+ELSE // least significant bit
+	tmp := 0
+	dst := tmp
+	DO WHILE ((tmp &lt;= UpperBound) AND IntRes2[tmp] = 0)
+		tmp := tmp + 1
+		dst := tmp
+	OD
+FI
+	</operation>
+	<instruction name='pcmpistri' form='xmm, xmm, imm'/>
+	<header>nmmintrin.h</header>
+</intrinsic>
+<intrinsic tech='SSE4.2' vexEq='TRUE' dontShowZeroUnmodMsg='TRUE' rettype='int' name='_mm_cmpistrz'>
+	<CPUID>SSE4.2</CPUID>
+	<category>String Compare</category>
+	<parameter varname='a' type='__m128i'/>
+	<parameter varname='b' type='__m128i'/>
+	<parameter varname="imm8" type='const int'/>
+	<description>Compare packed strings with implicit lengths in "a" and "b" using the control in "imm8", and returns 1 if any character in "b" was null, and 0 otherwise.
+	[strcmp_note]
+	</description>
+	<operation>
+size := (imm8[0] ? 16 : 8) // 8 or 16-bit characters
+UpperBound := (128 / size) - 1
+
+bInvalid := 0
+FOR j := 0 to UpperBound
+	n := j*size
+	IF b[n+size-1:n] == 0
+		bInvalid := 1
+	FI
+ENDFOR
+
+dst := bInvalid
+	</operation>
+	<instruction name='pcmpistri' form='xmm, xmm, imm'/>
+	<header>nmmintrin.h</header>
+</intrinsic>
+<intrinsic tech='SSE4.2' vexEq='TRUE' dontShowZeroUnmodMsg='TRUE' rettype='int' name='_mm_cmpistrc'>
+	<CPUID>SSE4.2</CPUID>
+	<category>String Compare</category>
+	<parameter varname='a' type='__m128i'/>
+	<parameter varname='b' type='__m128i'/>
+	<parameter varname="imm8" type='const int'/>
+	<description>Compare packed strings with implicit lengths in "a" and "b" using the control in "imm8", and returns 1 if the resulting mask was non-zero, and 0 otherwise.
+	[strcmp_note]
+	</description>
+	<operation>
+size := (imm8[0] ? 16 : 8) // 8 or 16-bit characters
+UpperBound := (128 / size) - 1
+
+// compare all characters
+aInvalid := 0
+bInvalid := 0
+FOR i := 0 to UpperBound
+	m := i*size
+	FOR j := 0 to UpperBound
+		n := j*size
+		BoolRes[i][j] := (a[m+size-1:m] == b[n+size-1:n])
+		
+		// invalidate characters after EOS
+		IF a[m+size-1:m] == 0
+			aInvalid := 1
+		FI
+		IF b[n+size-1:n] == 0
+			bInvalid := 1
+		FI
+		
+		// override comparisons for invalid characters
+		CASE (imm8[3:2]) OF
+		0:  // equal any
+			IF (!aInvalid &amp;&amp; bInvalid)
+				BoolRes[i][j] := 0
+			ELSE IF (aInvalid &amp;&amp; !bInvalid)
+				BoolRes[i][j] := 0
+			ELSE If (aInvalid &amp;&amp; bInvalid)
+				BoolRes[i][j] := 0
+			FI
+		1:  // ranges
+			IF (!aInvalid &amp;&amp; bInvalid)
+				BoolRes[i][j] := 0
+			ELSE IF (aInvalid &amp;&amp; !bInvalid)
+				BoolRes[i][j] := 0
+			ELSE If (aInvalid &amp;&amp; bInvalid)
+				BoolRes[i][j] := 0
+			FI
+		2:  // equal each
+			IF (!aInvalid &amp;&amp; bInvalid)
+				BoolRes[i][j] := 0
+			ELSE IF (aInvalid &amp;&amp; !bInvalid)
+				BoolRes[i][j] := 0
+			ELSE If (aInvalid &amp;&amp; bInvalid)
+				BoolRes[i][j] := 1
+			FI
+		3:  // equal ordered
+			IF (!aInvalid &amp;&amp; bInvalid)
+				BoolRes[i][j] := 0
+			ELSE IF (aInvalid &amp;&amp; !bInvalid)
+				BoolRes[i][j] := 1
+			ELSE If (aInvalid &amp;&amp; bInvalid)
+				BoolRes[i][j] := 1
+			FI
+		ESAC
+	ENDFOR
+ENDFOR
+
+// aggregate results
+CASE (imm8[3:2]) OF
+0:  // equal any
+	IntRes1 := 0
+	FOR i := 0 to UpperBound
+		FOR j := 0 to UpperBound
+			IntRes1[i] := IntRes1[i] OR BoolRes[i][j]
+		ENDFOR
+	ENDFOR
+1:  // ranges
+	IntRes1 := 0
+	FOR i := 0 to UpperBound
+		FOR j := 0 to UpperBound, j += 2
+			IntRes1[i] := IntRes1[i] OR (BoolRes[i][j] AND BoolRes[i][j+1])
+		ENDFOR
+	ENDFOR
+2:  // equal each
+	IntRes1 := 0
+	FOR i := 0 to UpperBound
+		IntRes1[i] := BoolRes[i][i]
+	ENDFOR
+3:  // equal ordered
+	IntRes1 := (imm8[0] ? 0xFF : 0xFFFF)
+	FOR i := 0 to UpperBound
+		k := i
+		FOR j := 0 to UpperBound-i
+			IntRes1[i] := IntRes1[i] AND BoolRes[k][j]
+			k++
+		ENDFOR
+	ENDFOR
+ESAC
+
+// optionally negate results
+bInvalid := 0
+FOR i := 0 to UpperBound
+	IF imm8[4]
+		IF imm8[5] // only negate valid
+			IF b[n+size-1:n] == 0
+				bInvalid := 1
+			FI
+			IF bInvalid // invalid, don't negate
+				IntRes2[i] := IntRes1[i]
+			ELSE // valid, negate
+				IntRes2[i] := -1 XOR IntRes1[i]
+			FI
+		ELSE // negate all
+			IntRes2[i] := -1 XOR IntRes1[i]
+		FI
+	ELSE // don't negate
+		IntRes2[i] := IntRes1[i]
+	FI
+ENDFOR
+
+// output
+dst := (IntRes2 != 0)
+	</operation>
+	<instruction name='pcmpistri' form='xmm, xmm, imm'/>
+	<header>nmmintrin.h</header>
+</intrinsic>
+<intrinsic tech='SSE4.2' vexEq='TRUE' dontShowZeroUnmodMsg='TRUE' rettype='int' name='_mm_cmpistrs'>
+	<CPUID>SSE4.2</CPUID>
+	<category>String Compare</category>
+	<parameter varname='a' type='__m128i'/>
+	<parameter varname='b' type='__m128i'/>
+	<parameter varname="imm8" type='const int'/>
+	<description>Compare packed strings with implicit lengths in "a" and "b" using the control in "imm8", and returns 1 if any character in "a" was null, and 0 otherwise.
+	[strcmp_note]
+	</description>
+	<operation>
+size := (imm8[0] ? 16 : 8) // 8 or 16-bit characters
+UpperBound := (128 / size) - 1
+
+aInvalid := 0
+FOR i := 0 to UpperBound
+	m := i*size
+	IF a[m+size-1:m] == 0
+		aInvalid := 1
+	FI
+ENDFOR
+
+dst := aInvalid
+	</operation>
+	<instruction name='pcmpistri' form='xmm, xmm, imm'/>
+	<header>nmmintrin.h</header>
+</intrinsic>
+<intrinsic tech='SSE4.2' vexEq='TRUE' dontShowZeroUnmodMsg='TRUE' rettype='int' name='_mm_cmpistro'>
+	<CPUID>SSE4.2</CPUID>
+	<category>String Compare</category>
+	<parameter varname='a' type='__m128i'/>
+	<parameter varname='b' type='__m128i'/>
+	<parameter varname="imm8" type='const int'/>
+	<description>Compare packed strings with implicit lengths in "a" and "b" using the control in "imm8", and returns bit 0 of the resulting bit mask.
+	[strcmp_note]
+	</description>
+	<operation>
+size := (imm8[0] ? 16 : 8) // 8 or 16-bit characters
+UpperBound := (128 / size) - 1
+
+// compare all characters
+aInvalid := 0
+bInvalid := 0
+FOR i := 0 to UpperBound
+	m := i*size
+	FOR j := 0 to UpperBound
+		n := j*size
+		BoolRes[i][j] := (a[m+size-1:m] == b[n+size-1:n])
+		
+		// invalidate characters after EOS
+		IF a[m+size-1:m] == 0
+			aInvalid := 1
+		FI
+		IF b[n+size-1:n] == 0
+			bInvalid := 1
+		FI
+		
+		// override comparisons for invalid characters
+		CASE (imm8[3:2]) OF
+		0:  // equal any
+			IF (!aInvalid &amp;&amp; bInvalid)
+				BoolRes[i][j] := 0
+			ELSE IF (aInvalid &amp;&amp; !bInvalid)
+				BoolRes[i][j] := 0
+			ELSE If (aInvalid &amp;&amp; bInvalid)
+				BoolRes[i][j] := 0
+			FI
+		1:  // ranges
+			IF (!aInvalid &amp;&amp; bInvalid)
+				BoolRes[i][j] := 0
+			ELSE IF (aInvalid &amp;&amp; !bInvalid)
+				BoolRes[i][j] := 0
+			ELSE If (aInvalid &amp;&amp; bInvalid)
+				BoolRes[i][j] := 0
+			FI
+		2:  // equal each
+			IF (!aInvalid &amp;&amp; bInvalid)
+				BoolRes[i][j] := 0
+			ELSE IF (aInvalid &amp;&amp; !bInvalid)
+				BoolRes[i][j] := 0
+			ELSE If (aInvalid &amp;&amp; bInvalid)
+				BoolRes[i][j] := 1
+			FI
+		3:  // equal ordered
+			IF (!aInvalid &amp;&amp; bInvalid)
+				BoolRes[i][j] := 0
+			ELSE IF (aInvalid &amp;&amp; !bInvalid)
+				BoolRes[i][j] := 1
+			ELSE If (aInvalid &amp;&amp; bInvalid)
+				BoolRes[i][j] := 1
+			FI
+		ESAC
+	ENDFOR
+ENDFOR
+
+// aggregate results
+CASE (imm8[3:2]) OF
+0:  // equal any
+	IntRes1 := 0
+	FOR i := 0 to UpperBound
+		FOR j := 0 to UpperBound
+			IntRes1[i] := IntRes1[i] OR BoolRes[i][j]
+		ENDFOR
+	ENDFOR
+1:  // ranges
+	IntRes1 := 0
+	FOR i := 0 to UpperBound
+		FOR j := 0 to UpperBound, j += 2
+			IntRes1[i] := IntRes1[i] OR (BoolRes[i][j] AND BoolRes[i][j+1])
+		ENDFOR
+	ENDFOR
+2:  // equal each
+	IntRes1 := 0
+	FOR i := 0 to UpperBound
+		IntRes1[i] := BoolRes[i][i]
+	ENDFOR
+3:  // equal ordered
+	IntRes1 := (imm8[0] ? 0xFF : 0xFFFF)
+	FOR i := 0 to UpperBound
+		k := i
+		FOR j := 0 to UpperBound-i
+			IntRes1[i] := IntRes1[i] AND BoolRes[k][j]
+			k++
+		ENDFOR
+	ENDFOR
+ESAC
+
+// optionally negate results
+bInvalid := 0
+FOR i := 0 to UpperBound
+	IF imm8[4]
+		IF imm8[5] // only negate valid
+			IF b[n+size-1:n] == 0
+				bInvalid := 1
+			FI
+			IF bInvalid // invalid, don't negate
+				IntRes2[i] := IntRes1[i]
+			ELSE // valid, negate
+				IntRes2[i] := -1 XOR IntRes1[i]
+			FI
+		ELSE // negate all
+			IntRes2[i] := -1 XOR IntRes1[i]
+		FI
+	ELSE // don't negate
+		IntRes2[i] := IntRes1[i]
+	FI
+ENDFOR
+
+// output
+dst := IntRes2[0]
+	</operation>
+	<instruction name='pcmpistri' form='xmm, xmm, imm'/>
+	<header>nmmintrin.h</header>
+</intrinsic>
+<intrinsic tech='SSE4.2' vexEq='TRUE' dontShowZeroUnmodMsg='TRUE' rettype='int' name='_mm_cmpistra'>
+	<CPUID>SSE4.2</CPUID>
+	<category>String Compare</category>
+	<parameter varname='a' type='__m128i'/>
+	<parameter varname='b' type='__m128i'/>
+	<parameter varname="imm8" type='const int'/>
+	<description>Compare packed strings with implicit lengths in "a" and "b" using the control in "imm8", and returns 1 if "b" did not contain a null character and the resulting mask was zero, and 0 otherwise.
+	[strcmp_note]
+	</description>
+	<operation>
+size := (imm8[0] ? 16 : 8) // 8 or 16-bit characters
+UpperBound := (128 / size) - 1
+
+// compare all characters
+aInvalid := 0
+bInvalid := 0
+FOR i := 0 to UpperBound
+	m := i*size
+	FOR j := 0 to UpperBound
+		n := j*size
+		BoolRes[i][j] := (a[m+size-1:m] == b[n+size-1:n])
+		
+		// invalidate characters after EOS
+		IF a[m+size-1:m] == 0
+			aInvalid := 1
+		FI
+		IF b[n+size-1:n] == 0
+			bInvalid := 1
+		FI
+		
+		// override comparisons for invalid characters
+		CASE (imm8[3:2]) OF
+		0:  // equal any
+			IF (!aInvalid &amp;&amp; bInvalid)
+				BoolRes[i][j] := 0
+			ELSE IF (aInvalid &amp;&amp; !bInvalid)
+				BoolRes[i][j] := 0
+			ELSE If (aInvalid &amp;&amp; bInvalid)
+				BoolRes[i][j] := 0
+			FI
+		1:  // ranges
+			IF (!aInvalid &amp;&amp; bInvalid)
+				BoolRes[i][j] := 0
+			ELSE IF (aInvalid &amp;&amp; !bInvalid)
+				BoolRes[i][j] := 0
+			ELSE If (aInvalid &amp;&amp; bInvalid)
+				BoolRes[i][j] := 0
+			FI
+		2:  // equal each
+			IF (!aInvalid &amp;&amp; bInvalid)
+				BoolRes[i][j] := 0
+			ELSE IF (aInvalid &amp;&amp; !bInvalid)
+				BoolRes[i][j] := 0
+			ELSE If (aInvalid &amp;&amp; bInvalid)
+				BoolRes[i][j] := 1
+			FI
+		3:  // equal ordered
+			IF (!aInvalid &amp;&amp; bInvalid)
+				BoolRes[i][j] := 0
+			ELSE IF (aInvalid &amp;&amp; !bInvalid)
+				BoolRes[i][j] := 1
+			ELSE If (aInvalid &amp;&amp; bInvalid)
+				BoolRes[i][j] := 1
+			FI
+		ESAC
+	ENDFOR
+ENDFOR
+
+// aggregate results
+CASE (imm8[3:2]) OF
+0:  // equal any
+	IntRes1 := 0
+	FOR i := 0 to UpperBound
+		FOR j := 0 to UpperBound
+			IntRes1[i] := IntRes1[i] OR BoolRes[i][j]
+		ENDFOR
+	ENDFOR
+1:  // ranges
+	IntRes1 := 0
+	FOR i := 0 to UpperBound
+		FOR j := 0 to UpperBound, j += 2
+			IntRes1[i] := IntRes1[i] OR (BoolRes[i][j] AND BoolRes[i][j+1])
+		ENDFOR
+	ENDFOR
+2:  // equal each
+	IntRes1 := 0
+	FOR i := 0 to UpperBound
+		IntRes1[i] := BoolRes[i][i]
+	ENDFOR
+3:  // equal ordered
+	IntRes1 := (imm8[0] ? 0xFF : 0xFFFF)
+	FOR i := 0 to UpperBound
+		k := i
+		FOR j := 0 to UpperBound-i
+			IntRes1[i] := IntRes1[i] AND BoolRes[k][j]
+			k++
+		ENDFOR
+	ENDFOR
+ESAC
+
+// optionally negate results
+bInvalid := 0
+FOR i := 0 to UpperBound
+	IF imm8[4]
+		IF imm8[5] // only negate valid
+			IF b[n+size-1:n] == 0
+				bInvalid := 1
+			FI
+			IF bInvalid // invalid, don't negate
+				IntRes2[i] := IntRes1[i]
+			ELSE // valid, negate
+				IntRes2[i] := -1 XOR IntRes1[i]
+			FI
+		ELSE // negate all
+			IntRes2[i] := -1 XOR IntRes1[i]
+		FI
+	ELSE // don't negate
+		IntRes2[i] := IntRes1[i]
+	FI
+ENDFOR
+
+// output
+dst := (IntRes2 == 0) AND (bInvalid == 0)
+	</operation>
+	<instruction name='pcmpistri' form='xmm, xmm, imm'/>
+	<header>nmmintrin.h</header>
+</intrinsic>
+<intrinsic tech='SSE4.2' vexEq='TRUE' dontShowZeroUnmodMsg='TRUE' rettype='__m128i' name='_mm_cmpestrm'>
+	<CPUID>SSE4.2</CPUID>
+	<category>String Compare</category>
+	<parameter varname='a' type='__m128i'/>
+	<parameter varname='la' type='int'/>
+	<parameter varname='b' type='__m128i'/>
+	<parameter varname='lb' type='int'/>
+	<parameter varname="imm8" type='const int'/>
+	<description>Compare packed strings in "a" and "b" with lengths "la" and "lb" using the control in "imm8", and store the generated mask in "dst".
+	[strcmp_note]
+	</description>
+	<operation>
+size := (imm8[0] ? 16 : 8) // 8 or 16-bit characters
+UpperBound := (128 / size) - 1
+
+// compare all characters
+aInvalid := 0
+bInvalid := 0
+FOR i := 0 to UpperBound
+	m := i*size
+	FOR j := 0 to UpperBound
+		n := j*size
+		BoolRes[i][j] := (a[m+size-1:m] == b[n+size-1:n])
+		
+		// invalidate characters after EOS
+		IF i == la
+			aInvalid := 1
+		FI
+		IF j == lb
+			bInvalid := 1
+		FI
+		
+		// override comparisons for invalid characters
+		CASE (imm8[3:2]) OF
+		0:  // equal any
+			IF (!aInvalid &amp;&amp; bInvalid)
+				BoolRes[i][j] := 0
+			ELSE IF (aInvalid &amp;&amp; !bInvalid)
+				BoolRes[i][j] := 0
+			ELSE If (aInvalid &amp;&amp; bInvalid)
+				BoolRes[i][j] := 0
+			FI
+		1:  // ranges
+			IF (!aInvalid &amp;&amp; bInvalid)
+				BoolRes[i][j] := 0
+			ELSE IF (aInvalid &amp;&amp; !bInvalid)
+				BoolRes[i][j] := 0
+			ELSE If (aInvalid &amp;&amp; bInvalid)
+				BoolRes[i][j] := 0
+			FI
+		2:  // equal each
+			IF (!aInvalid &amp;&amp; bInvalid)
+				BoolRes[i][j] := 0
+			ELSE IF (aInvalid &amp;&amp; !bInvalid)
+				BoolRes[i][j] := 0
+			ELSE If (aInvalid &amp;&amp; bInvalid)
+				BoolRes[i][j] := 1
+			FI
+		3:  // equal ordered
+			IF (!aInvalid &amp;&amp; bInvalid)
+				BoolRes[i][j] := 0
+			ELSE IF (aInvalid &amp;&amp; !bInvalid)
+				BoolRes[i][j] := 1
+			ELSE If (aInvalid &amp;&amp; bInvalid)
+				BoolRes[i][j] := 1
+			FI
+		ESAC
+	ENDFOR
+ENDFOR
+
+// aggregate results
+CASE (imm8[3:2]) OF
+0:  // equal any
+	IntRes1 := 0
+	FOR i := 0 to UpperBound
+		FOR j := 0 to UpperBound
+			IntRes1[i] := IntRes1[i] OR BoolRes[i][j]
+		ENDFOR
+	ENDFOR
+1:  // ranges
+	IntRes1 := 0
+	FOR i := 0 to UpperBound
+		FOR j := 0 to UpperBound, j += 2
+			IntRes1[i] := IntRes1[i] OR (BoolRes[i][j] AND BoolRes[i][j+1])
+		ENDFOR
+	ENDFOR
+2:  // equal each
+	IntRes1 := 0
+	FOR i := 0 to UpperBound
+		IntRes1[i] := BoolRes[i][i]
+	ENDFOR
+3:  // equal ordered
+	IntRes1 := (imm8[0] ? 0xFF : 0xFFFF)
+	FOR i := 0 to UpperBound
+		k := i
+		FOR j := 0 to UpperBound-i
+			IntRes1[i] := IntRes1[i] AND BoolRes[k][j]
+			k++
+		ENDFOR
+	ENDFOR
+ESAC
+
+// optionally negate results
+FOR i := 0 to UpperBound
+	IF imm8[4]
+		IF imm8[5] // only negate valid
+			IF i &gt;= lb // invalid, don't negate
+				IntRes2[i] := IntRes1[i]
+			ELSE // valid, negate
+				IntRes2[i] := -1 XOR IntRes1[i]
+			FI
+		ELSE // negate all
+			IntRes2[i] := -1 XOR IntRes1[i]
+		FI
+	ELSE // don't negate
+		IntRes2[i] := IntRes1[i]
+	FI
+ENDFOR
+
+// output
+IF imm8[6] // byte / word mask
+	FOR i := 0 to UpperBound
+		j := i*size
+		IF IntRes2[i]
+			dst[j+size-1:j] := (imm8[0] ? 0xFF : 0xFFFF)
+		ELSE
+			dst[j+size-1:j] := 0
+		FI
+	ENDFOR
+ELSE // bit mask
+	dst[UpperBound:0] := IntRes2[UpperBound:0]
+	dst[127:UpperBound+1] := 0
+FI
+	</operation>
+	<instruction name='pcmpestrm' form='xmm, xmm, imm'/>
+	<header>nmmintrin.h</header>
+</intrinsic>
+<intrinsic tech='SSE4.2' vexEq='TRUE' dontShowZeroUnmodMsg='TRUE' rettype='int' name='_mm_cmpestri'>
+	<CPUID>SSE4.2</CPUID>
+	<category>String Compare</category>
+	<parameter varname='a' type='__m128i'/>
+	<parameter varname='la' type='int'/>
+	<parameter varname='b' type='__m128i'/>
+	<parameter varname='lb' type='int'/>
+	<parameter varname="imm8" type='const int'/>
+	<description>Compare packed strings in "a" and "b" with lengths "la" and "lb" using the control in "imm8", and store the generated index in "dst".
+	[strcmp_note]
+	</description>
+	<operation>
+size := (imm8[0] ? 16 : 8) // 8 or 16-bit characters
+UpperBound := (128 / size) - 1
+
+// compare all characters
+aInvalid := 0
+bInvalid := 0
+FOR i := 0 to UpperBound
+	m := i*size
+	FOR j := 0 to UpperBound
+		n := j*size
+		BoolRes[i][j] := (a[m+size-1:m] == b[n+size-1:n])
+		
+		// invalidate characters after EOS
+		IF i == la
+			aInvalid := 1
+		FI
+		IF j == lb
+			bInvalid := 1
+		FI
+		
+		// override comparisons for invalid characters
+		CASE (imm8[3:2]) OF
+		0:  // equal any
+			IF (!aInvalid &amp;&amp; bInvalid)
+				BoolRes[i][j] := 0
+			ELSE IF (aInvalid &amp;&amp; !bInvalid)
+				BoolRes[i][j] := 0
+			ELSE If (aInvalid &amp;&amp; bInvalid)
+				BoolRes[i][j] := 0
+			FI
+		1:  // ranges
+			IF (!aInvalid &amp;&amp; bInvalid)
+				BoolRes[i][j] := 0
+			ELSE IF (aInvalid &amp;&amp; !bInvalid)
+				BoolRes[i][j] := 0
+			ELSE If (aInvalid &amp;&amp; bInvalid)
+				BoolRes[i][j] := 0
+			FI
+		2:  // equal each
+			IF (!aInvalid &amp;&amp; bInvalid)
+				BoolRes[i][j] := 0
+			ELSE IF (aInvalid &amp;&amp; !bInvalid)
+				BoolRes[i][j] := 0
+			ELSE If (aInvalid &amp;&amp; bInvalid)
+				BoolRes[i][j] := 1
+			FI
+		3:  // equal ordered
+			IF (!aInvalid &amp;&amp; bInvalid)
+				BoolRes[i][j] := 0
+			ELSE IF (aInvalid &amp;&amp; !bInvalid)
+				BoolRes[i][j] := 1
+			ELSE If (aInvalid &amp;&amp; bInvalid)
+				BoolRes[i][j] := 1
+			FI
+		ESAC
+	ENDFOR
+ENDFOR
+
+// aggregate results
+CASE (imm8[3:2]) OF
+0:  // equal any
+	IntRes1 := 0
+	FOR i := 0 to UpperBound
+		FOR j := 0 to UpperBound
+			IntRes1[i] := IntRes1[i] OR BoolRes[i][j]
+		ENDFOR
+	ENDFOR
+1:  // ranges
+	IntRes1 := 0
+	FOR i := 0 to UpperBound
+		FOR j := 0 to UpperBound, j += 2
+			IntRes1[i] := IntRes1[i] OR (BoolRes[i][j] AND BoolRes[i][j+1])
+		ENDFOR
+	ENDFOR
+2:  // equal each
+	IntRes1 := 0
+	FOR i := 0 to UpperBound
+		IntRes1[i] := BoolRes[i][i]
+	ENDFOR
+3:  // equal ordered
+	IntRes1 := (imm8[0] ? 0xFF : 0xFFFF)
+	FOR i := 0 to UpperBound
+		k := i
+		FOR j := 0 to UpperBound-i
+			IntRes1[i] := IntRes1[i] AND BoolRes[k][j]
+			k++
+		ENDFOR
+	ENDFOR
+ESAC
+
+// optionally negate results
+FOR i := 0 to UpperBound
+	IF imm8[4]
+		IF imm8[5] // only negate valid
+			IF i &gt;= lb // invalid, don't negate
+				IntRes2[i] := IntRes1[i]
+			ELSE // valid, negate
+				IntRes2[i] := -1 XOR IntRes1[i]
+			FI
+		ELSE // negate all
+			IntRes2[i] := -1 XOR IntRes1[i]
+		FI
+	ELSE // don't negate
+		IntRes2[i] := IntRes1[i]
+	FI
+ENDFOR
+
+// output
+IF imm8[6] // most significant bit
+	tmp := UpperBound
+	dst := tmp
+	DO WHILE ((tmp &gt;= 0) AND IntRes2[tmp] = 0)
+		tmp := tmp - 1
+		dst := tmp
+	OD
+ELSE // least significant bit
+	tmp := 0
+	dst := tmp
+	DO WHILE ((tmp &lt;= UpperBound) AND IntRes2[tmp] = 0)
+		tmp := tmp + 1
+		dst := tmp
+	OD
+FI
+	</operation>
+	<instruction name='pcmpestri' form='xmm, xmm, imm'/>
+	<header>nmmintrin.h</header>
+</intrinsic>
+<intrinsic tech='SSE4.2' vexEq='TRUE' dontShowZeroUnmodMsg='TRUE' rettype='int' name='_mm_cmpestrz'>
+	<CPUID>SSE4.2</CPUID>
+	<category>String Compare</category>
+	<parameter varname='a' type='__m128i'/>
+	<parameter varname='la' type='int'/>
+	<parameter varname='b' type='__m128i'/>
+	<parameter varname='lb' type='int'/>
+	<parameter varname="imm8" type='const int'/>
+	<description>Compare packed strings in "a" and "b" with lengths "la" and "lb" using the control in "imm8", and returns 1 if any character in "b" was null, and 0 otherwise.
+	[strcmp_note]
+	</description>
+	<operation>
+size := (imm8[0] ? 16 : 8) // 8 or 16-bit characters
+UpperBound := (128 / size) - 1
+
+dst := (lb &lt;= UpperBound)
+	</operation>
+	<instruction name='pcmpestri' form='xmm, xmm, imm'/>
+	<header>nmmintrin.h</header>
+</intrinsic>
+<intrinsic tech='SSE4.2' vexEq='TRUE' dontShowZeroUnmodMsg='TRUE' rettype='int' name='_mm_cmpestrc'>
+	<CPUID>SSE4.2</CPUID>
+	<category>String Compare</category>
+	<parameter varname='a' type='__m128i'/>
+	<parameter varname='la' type='int'/>
+	<parameter varname='b' type='__m128i'/>
+	<parameter varname='lb' type='int'/>
+	<parameter varname="imm8" type='const int'/>
+	<description>Compare packed strings in "a" and "b" with lengths "la" and "lb" using the control in "imm8", and returns 1 if the resulting mask was non-zero, and 0 otherwise.
+	[strcmp_note]
+	</description>
+	<operation>
+size := (imm8[0] ? 16 : 8) // 8 or 16-bit characters
+UpperBound := (128 / size) - 1
+
+// compare all characters
+aInvalid := 0
+bInvalid := 0
+FOR i := 0 to UpperBound
+	m := i*size
+	FOR j := 0 to UpperBound
+		n := j*size
+		BoolRes[i][j] := (a[m+size-1:m] == b[n+size-1:n])
+		
+		// invalidate characters after EOS
+		IF i == la
+			aInvalid := 1
+		FI
+		IF j == lb
+			bInvalid := 1
+		FI
+		
+		// override comparisons for invalid characters
+		CASE (imm8[3:2]) OF
+			0:  // equal any
+				IF (!aInvalid &amp;&amp; bInvalid)
+					BoolRes[i][j] := 0
+				ELSE IF (aInvalid &amp;&amp; !bInvalid)
+					BoolRes[i][j] := 0
+				ELSE If (aInvalid &amp;&amp; bInvalid)
+					BoolRes[i][j] := 0
+				FI
+			1:  // ranges
+				IF (!aInvalid &amp;&amp; bInvalid)
+					BoolRes[i][j] := 0
+				ELSE IF (aInvalid &amp;&amp; !bInvalid)
+					BoolRes[i][j] := 0
+				ELSE If (aInvalid &amp;&amp; bInvalid)
+					BoolRes[i][j] := 0
+				FI
+			2:  // equal each
+				IF (!aInvalid &amp;&amp; bInvalid)
+					BoolRes[i][j] := 0
+				ELSE IF (aInvalid &amp;&amp; !bInvalid)
+					BoolRes[i][j] := 0
+				ELSE If (aInvalid &amp;&amp; bInvalid)
+					BoolRes[i][j] := 1
+				FI
+			3:  // equal ordered
+				IF (!aInvalid &amp;&amp; bInvalid)
+					BoolRes[i][j] := 0
+				ELSE IF (aInvalid &amp;&amp; !bInvalid)
+					BoolRes[i][j] := 1
+				ELSE If (aInvalid &amp;&amp; bInvalid)
+					BoolRes[i][j] := 1
+				FI
+		ESAC
+	ENDFOR
+ENDFOR
+
+// aggregate results
+CASE (imm8[3:2]) OF
+	0:  // equal any
+		IntRes1 := 0
+		FOR i := 0 to UpperBound
+			FOR j := 0 to UpperBound
+				IntRes1[i] := IntRes1[i] OR BoolRes[i][j]
+			ENDFOR
+		ENDFOR
+	1:  // ranges
+		IntRes1 := 0
+		FOR i := 0 to UpperBound
+			FOR j := 0 to UpperBound, j += 2
+				IntRes1[i] := IntRes1[i] OR (BoolRes[i][j] AND BoolRes[i][j+1])
+			ENDFOR
+		ENDFOR
+	2:  // equal each
+		IntRes1 := 0
+		FOR i := 0 to UpperBound
+			IntRes1[i] := BoolRes[i][i]
+		ENDFOR
+	3:  // equal ordered
+		IntRes1 := (imm8[0] ? 0xFF : 0xFFFF)
+		FOR i := 0 to UpperBound
+			k := i
+			FOR j := 0 to UpperBound-i
+				IntRes1[i] := IntRes1[i] AND BoolRes[k][j]
+				k++
+			ENDFOR
+		ENDFOR
+ESAC
+
+// optionally negate results
+FOR i := 0 to UpperBound
+	IF imm8[4]
+		IF imm8[5] // only negate valid
+			IF i &gt;= lb // invalid, don't negate
+				IntRes2[i] := IntRes1[i]
+			ELSE // valid, negate
+				IntRes2[i] := -1 XOR IntRes1[i]
+			FI
+		ELSE // negate all
+			IntRes2[i] := -1 XOR IntRes1[i]
+		FI
+	ELSE // don't negate
+		IntRes2[i] := IntRes1[i]
+	FI
+ENDFOR
+
+// output
+dst := (IntRes2 != 0)
+	</operation>
+	<instruction name='pcmpestri' form='xmm, xmm, imm'/>
+	<header>nmmintrin.h</header>
+</intrinsic>
+<intrinsic tech='SSE4.2' vexEq='TRUE' dontShowZeroUnmodMsg='TRUE' rettype='int' name='_mm_cmpestrs'>
+	<CPUID>SSE4.2</CPUID>
+	<category>String Compare</category>
+	<parameter varname='a' type='__m128i'/>
+	<parameter varname='la' type='int'/>
+	<parameter varname='b' type='__m128i'/>
+	<parameter varname='lb' type='int'/>
+	<parameter varname="imm8" type='const int'/>
+	<description>Compare packed strings in "a" and "b" with lengths "la" and "lb" using the control in "imm8", and returns 1 if any character in "a" was null, and 0 otherwise.
+	[strcmp_note]
+	</description>
+	<operation>
+size := (imm8[0] ? 16 : 8) // 8 or 16-bit characters
+UpperBound := (128 / size) - 1
+
+dst := (la &lt;= UpperBound)
+	</operation>
+	<instruction name='pcmpestri' form='xmm, xmm, imm'/>
+	<header>nmmintrin.h</header>
+</intrinsic>
+<intrinsic tech='SSE4.2' vexEq='TRUE' dontShowZeroUnmodMsg='TRUE' rettype='int' name='_mm_cmpestro'>
+	<CPUID>SSE4.2</CPUID>
+	<category>String Compare</category>
+	<parameter varname='a' type='__m128i'/>
+	<parameter varname='la' type='int'/>
+	<parameter varname='b' type='__m128i'/>
+	<parameter varname='lb' type='int'/>
+	<parameter varname="imm8" type='const int'/>
+	<description>Compare packed strings in "a" and "b" with lengths "la" and "lb" using the control in "imm8", and returns bit 0 of the resulting bit mask.
+	[strcmp_note]
+	</description>
+	<operation>
+size := (imm8[0] ? 16 : 8) // 8 or 16-bit characters
+UpperBound := (128 / size) - 1
+
+// compare all characters
+aInvalid := 0
+bInvalid := 0
+FOR i := 0 to UpperBound
+	m := i*size
+	FOR j := 0 to UpperBound
+		n := j*size
+		BoolRes[i][j] := (a[m+size-1:m] == b[n+size-1:n])
+		
+		// invalidate characters after EOS
+		IF i == la
+			aInvalid := 1
+		FI
+		IF j == lb
+			bInvalid := 1
+		FI
+		
+		// override comparisons for invalid characters
+		CASE (imm8[3:2]) OF
+			0:  // equal any
+				IF (!aInvalid &amp;&amp; bInvalid)
+					BoolRes[i][j] := 0
+				ELSE IF (aInvalid &amp;&amp; !bInvalid)
+					BoolRes[i][j] := 0
+				ELSE If (aInvalid &amp;&amp; bInvalid)
+					BoolRes[i][j] := 0
+				FI
+			1:  // ranges
+				IF (!aInvalid &amp;&amp; bInvalid)
+					BoolRes[i][j] := 0
+				ELSE IF (aInvalid &amp;&amp; !bInvalid)
+					BoolRes[i][j] := 0
+				ELSE If (aInvalid &amp;&amp; bInvalid)
+					BoolRes[i][j] := 0
+				FI
+			2:  // equal each
+				IF (!aInvalid &amp;&amp; bInvalid)
+					BoolRes[i][j] := 0
+				ELSE IF (aInvalid &amp;&amp; !bInvalid)
+					BoolRes[i][j] := 0
+				ELSE If (aInvalid &amp;&amp; bInvalid)
+					BoolRes[i][j] := 1
+				FI
+			3:  // equal ordered
+				IF (!aInvalid &amp;&amp; bInvalid)
+					BoolRes[i][j] := 0
+				ELSE IF (aInvalid &amp;&amp; !bInvalid)
+					BoolRes[i][j] := 1
+				ELSE If (aInvalid &amp;&amp; bInvalid)
+					BoolRes[i][j] := 1
+				FI
+		ESAC
+	ENDFOR
+ENDFOR
+
+// aggregate results
+CASE (imm8[3:2]) OF
+	0:  // equal any
+		IntRes1 := 0
+		FOR i := 0 to UpperBound
+			FOR j := 0 to UpperBound
+				IntRes1[i] := IntRes1[i] OR BoolRes[i][j]
+			ENDFOR
+		ENDFOR
+	1:  // ranges
+		IntRes1 := 0
+		FOR i := 0 to UpperBound
+			FOR j := 0 to UpperBound, j += 2
+				IntRes1[i] := IntRes1[i] OR (BoolRes[i][j] AND BoolRes[i][j+1])
+			ENDFOR
+		ENDFOR
+	2:  // equal each
+		IntRes1 := 0
+		FOR i := 0 to UpperBound
+			IntRes1[i] := BoolRes[i][i]
+		ENDFOR
+	3:  // equal ordered
+		IntRes1 := (imm8[0] ? 0xFF : 0xFFFF)
+		FOR i := 0 to UpperBound
+			k := i
+			FOR j := 0 to UpperBound-i
+				IntRes1[i] := IntRes1[i] AND BoolRes[k][j]
+				k++
+			ENDFOR
+		ENDFOR
+ESAC
+
+// optionally negate results
+FOR i := 0 to UpperBound
+	IF imm8[4]
+		IF imm8[5] // only negate valid
+			IF i &gt;= lb // invalid, don't negate
+				IntRes2[i] := IntRes1[i]
+			ELSE // valid, negate
+				IntRes2[i] := -1 XOR IntRes1[i]
+			FI
+		ELSE // negate all
+			IntRes2[i] := -1 XOR IntRes1[i]
+		FI
+	ELSE // don't negate
+		IntRes2[i] := IntRes1[i]
+	FI
+ENDFOR
+
+// output
+dst := IntRes2[0]
+	</operation>
+	<instruction name='pcmpestri' form='xmm, xmm, imm'/>
+	<header>nmmintrin.h</header>
+</intrinsic>
+<intrinsic tech='SSE4.2' vexEq='TRUE' dontShowZeroUnmodMsg='TRUE' rettype='int' name='_mm_cmpestra'>
+	<CPUID>SSE4.2</CPUID>
+	<category>String Compare</category>
+	<parameter varname='a' type='__m128i'/>
+	<parameter varname='la' type='int'/>
+	<parameter varname='b' type='__m128i'/>
+	<parameter varname='lb' type='int'/>
+	<parameter varname="imm8" type='const int'/>
+	<description>Compare packed strings in "a" and "b" with lengths "la" and "lb" using the control in "imm8", and returns 1 if "b" did not contain a null character and the resulting mask was zero, and 0 otherwise.
+	[strcmp_note]
+	</description>
+	<operation>
+size := (imm8[0] ? 16 : 8) // 8 or 16-bit characters
+UpperBound := (128 / size) - 1
+
+// compare all characters
+aInvalid := 0
+bInvalid := 0
+FOR i := 0 to UpperBound
+	m := i*size
+	FOR j := 0 to UpperBound
+		n := j*size
+		BoolRes[i][j] := (a[m+size-1:m] == b[n+size-1:n])
+		
+		// invalidate characters after EOS
+		IF i == la
+			aInvalid := 1
+		FI
+		IF j == lb
+			bInvalid := 1
+		FI
+		
+		// override comparisons for invalid characters
+		CASE (imm8[3:2]) OF
+			0:  // equal any
+				IF (!aInvalid &amp;&amp; bInvalid)
+					BoolRes[i][j] := 0
+				ELSE IF (aInvalid &amp;&amp; !bInvalid)
+					BoolRes[i][j] := 0
+				ELSE If (aInvalid &amp;&amp; bInvalid)
+					BoolRes[i][j] := 0
+				FI
+			1:  // ranges
+				IF (!aInvalid &amp;&amp; bInvalid)
+					BoolRes[i][j] := 0
+				ELSE IF (aInvalid &amp;&amp; !bInvalid)
+					BoolRes[i][j] := 0
+				ELSE If (aInvalid &amp;&amp; bInvalid)
+					BoolRes[i][j] := 0
+				FI
+			2:  // equal each
+				IF (!aInvalid &amp;&amp; bInvalid)
+					BoolRes[i][j] := 0
+				ELSE IF (aInvalid &amp;&amp; !bInvalid)
+					BoolRes[i][j] := 0
+				ELSE If (aInvalid &amp;&amp; bInvalid)
+					BoolRes[i][j] := 1
+				FI
+			3:  // equal ordered
+				IF (!aInvalid &amp;&amp; bInvalid)
+					BoolRes[i][j] := 0
+				ELSE IF (aInvalid &amp;&amp; !bInvalid)
+					BoolRes[i][j] := 1
+				ELSE If (aInvalid &amp;&amp; bInvalid)
+					BoolRes[i][j] := 1
+				FI
+		ESAC
+	ENDFOR
+ENDFOR
+
+// aggregate results
+CASE (imm8[3:2]) OF
+	0:  // equal any
+		IntRes1 := 0
+		FOR i := 0 to UpperBound
+			FOR j := 0 to UpperBound
+				IntRes1[i] := IntRes1[i] OR BoolRes[i][j]
+			ENDFOR
+		ENDFOR
+	1:  // ranges
+		IntRes1 := 0
+		FOR i := 0 to UpperBound
+			FOR j := 0 to UpperBound, j += 2
+				IntRes1[i] := IntRes1[i] OR (BoolRes[i][j] AND BoolRes[i][j+1])
+			ENDFOR
+		ENDFOR
+	2:  // equal each
+		IntRes1 := 0
+		FOR i := 0 to UpperBound
+			IntRes1[i] := BoolRes[i][i]
+		ENDFOR
+	3:  // equal ordered
+		IntRes1 := (imm8[0] ? 0xFF : 0xFFFF)
+		FOR i := 0 to UpperBound
+			k := i
+			FOR j := 0 to UpperBound-i
+				IntRes1[i] := IntRes1[i] AND BoolRes[k][j]
+				k++
+			ENDFOR
+		ENDFOR
+ESAC
+
+// optionally negate results
+FOR i := 0 to UpperBound
+	IF imm8[4]
+		IF imm8[5] // only negate valid
+			IF i &gt;= lb // invalid, don't negate
+				IntRes2[i] := IntRes1[i]
+			ELSE // valid, negate
+				IntRes2[i] := -1 XOR IntRes1[i]
+			FI
+		ELSE // negate all
+			IntRes2[i] := -1 XOR IntRes1[i]
+		FI
+	ELSE // don't negate
+		IntRes2[i] := IntRes1[i]
+	FI
+ENDFOR
+
+// output
+dst := (IntRes2 == 0) AND (lb &gt; UpperBound)
+	</operation>
+	<instruction name='pcmpestri' form='xmm, xmm, imm'/>
+	<header>nmmintrin.h</header>
+</intrinsic>
+<intrinsic tech='SSE4.2' vexEq='TRUE' rettype='__m128i' name='_mm_cmpgt_epi64'>
+	<type>Integer</type>
+	<CPUID>SSE4.2</CPUID>
+	<category>Compare</category>
+	<parameter varname='a' type='__m128i'/>
+	<parameter varname='b' type='__m128i'/>
+	<description>Compare packed 64-bit integers in "a" and "b" for greater-than, and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 1
+	i := j*64
+	dst[i+63:i] := ( a[i+63:i] &gt; b[i+63:i] ) ? 0xFFFFFFFFFFFFFFFF : 0
+ENDFOR
+	</operation>
+	<instruction name='pcmpgtq' form='xmm, xmm'/>
+	<header>nmmintrin.h</header>
+</intrinsic>
+<intrinsic tech='SSE4.2' rettype='unsigned int' name='_mm_crc32_u8'>
+	<type>Integer</type>
+	<CPUID>SSE4.2</CPUID>
+	<category>Cryptography</category>
+	<parameter varname='crc' type='unsigned int'/>
+	<parameter varname='v' type='unsigned char'/>
+	<description>Starting with the initial value in "crc", accumulates a CRC32 value for unsigned 8-bit integer "v", and stores the result in "dst".</description>
+	<operation>
+tmp1[7:0] := v[0:7] // bit reflection
+tmp2[31:0] := crc[0:31] // bit reflection
+tmp3[39:0] := tmp1[7:0] &lt;&lt; 32 
+tmp4[39:0] := tmp2[31:0] &lt;&lt; 8
+tmp5[39:0] := tmp3[39:0] XOR tmp4[39:0]
+tmp6[31:0] := tmp5[39:0] MOD2 0x11EDC6F41
+dst[31:0] := tmp6[0:31] // bit reflection
+	</operation>
+	<instruction name='crc32' form='r32, r8'/>
+	<header>nmmintrin.h</header>
+</intrinsic>
+<intrinsic tech='SSE4.2' rettype='unsigned int' name='_mm_crc32_u16'>
+	<type>Integer</type>
+	<CPUID>SSE4.2</CPUID>
+	<category>Cryptography</category>
+	<parameter varname='crc' type='unsigned int'/>
+	<parameter varname='v' type='unsigned short'/>
+	<description>Starting with the initial value in "crc", accumulates a CRC32 value for unsigned 16-bit integer "v", and stores the result in "dst".</description>
+	<operation>
+tmp1[15:0] := v[0:15] // bit reflection
+tmp2[31:0] := crc[0:31] // bit reflection
+tmp3[47:0] := tmp1[15:0] &lt;&lt; 32
+tmp4[47:0] := tmp2[31:0] &lt;&lt; 16
+tmp5[47:0] := tmp3[47:0] XOR tmp4[47:0]
+tmp6[31:0] := tmp5[47:0] MOD2 0x11EDC6F41
+dst[31:0] := tmp6[0:31] // bit reflection
+	</operation>
+	<instruction name='crc32' form='r32, r16'/>
+	<header>nmmintrin.h</header>
+</intrinsic>
+<intrinsic tech='SSE4.2' rettype='unsigned int' name='_mm_crc32_u32'>
+	<type>Integer</type>
+	<CPUID>SSE4.2</CPUID>
+	<category>Cryptography</category>
+	<parameter varname='crc' type='unsigned int'/>
+	<parameter varname='v' type='unsigned int'/>
+	<description>Starting with the initial value in "crc", accumulates a CRC32 value for unsigned 32-bit integer "v", and stores the result in "dst".</description>
+	<operation>
+tmp1[31:0] := v[0:31] // bit reflection
+tmp2[31:0] := crc[0:31] // bit reflection
+tmp3[63:0] := tmp1[31:0] &lt;&lt; 32
+tmp4[63:0] := tmp2[31:0] &lt;&lt; 32
+tmp5[63:0] := tmp3[63:0] XOR tmp4[63:0]
+tmp6[31:0] := tmp5[63:0] MOD2 0x11EDC6F41
+dst[31:0] := tmp6[0:31] // bit reflection
+	</operation>
+	<instruction name='crc32' form='r32, r32'/>
+	<header>nmmintrin.h</header>
+</intrinsic>
+<intrinsic tech='SSE4.2' rettype='unsigned __int64' name='_mm_crc32_u64'>
+	<type>Integer</type>
+	<CPUID>SSE4.2</CPUID>
+	<category>Cryptography</category>
+	<parameter varname='crc' type='unsigned __int64'/>
+	<parameter varname='v' type='unsigned __int64'/>
+	<description>Starting with the initial value in "crc", accumulates a CRC32 value for unsigned 64-bit integer "v", and stores the result in "dst".</description>
+	<operation>
+tmp1[63:0] := v[0:63] // bit reflection
+tmp2[31:0] := crc[0:31] // bit reflection
+tmp3[95:0] := tmp1[63:0] &lt;&lt; 32
+tmp4[95:0] := tmp2[31:0] &lt;&lt; 64
+tmp5[95:0] := tmp3[95:0] XOR tmp4[95:0]
+tmp6[31:0] := tmp5[95:0] MOD2 0x11EDC6F41
+dst[31:0] := tmp6[0:31] // bit reflection
+	</operation>
+	<instruction name='crc32' form='r64, r64'/>
+	<header>nmmintrin.h</header>
+</intrinsic>
+<intrinsic tech="Other" rettype='int' name='_mm_popcnt_u32'>
+	<type>Integer</type>
+	<CPUID>POPCNT</CPUID>
+	<category>Bit Manipulation</category>
+	<parameter varname='a' type='unsigned int'/>
+	<description>
+		Count the number of bits set to 1 in unsigned 32-bit integer "a", and return that count in "dst". 
+	</description>
+	<operation>
+dst := 0
+FOR i := 0 to 31
+	IF a[i]
+		dst := dst + 1
+	FI
+ENDFOR
+	</operation>
+	<instruction name='popcnt' form='r32, r32'/>
+	<header>nmmintrin.h</header>
+</intrinsic>
+<intrinsic tech="Other" rettype='__int64' name='_mm_popcnt_u64'>
+	<type>Integer</type>
+	<CPUID>POPCNT</CPUID>
+	<category>Bit Manipulation</category>
+	<parameter varname='a' type='unsigned __int64'/>
+	<description>
+		Count the number of bits set to 1 in unsigned 64-bit integer "a", and return that count in "dst". 
+	</description>
+	<operation>
+dst := 0
+FOR i := 0 to 63
+	IF a[i]
+		dst := dst + 1
+	FI
+ENDFOR
+	</operation>
+	<instruction name='popcnt' form='r64, r64'/>
+	<header>nmmintrin.h</header>
+</intrinsic>
+
+<intrinsic tech='Other' vexEq='TRUE' rettype='__m128i' name='_mm_aesenc_si128'>
+	<type>Integer</type>
+	<CPUID>AES</CPUID>
+	<category>Cryptography</category>
+	<parameter varname='a' type='__m128i'/>
+	<parameter varname='RoundKey' type='__m128i'/>
+	<description>Perform one round of an AES encryption flow on data (state) in "a" using the round key in "RoundKey", and store the result in "dst".</description>
+	<operation>state := a
+a[127:0] := ShiftRows(a[127:0])
+a[127:0] := SubBytes(a[127:0])
+a[127:0] := MixColumns(a[127:0])
+dst[127:0] := a[127:0] XOR RoundKey[127:0]
+	</operation>
+	<instruction name='aesenc' form='xmm, xmm'/>
+	<header>wmmintrin.h</header>
+</intrinsic>
+<intrinsic tech='Other' vexEq='TRUE' rettype='__m128i' name='_mm_aesenclast_si128'>
+	<type>Integer</type>
+	<CPUID>AES</CPUID>
+	<category>Cryptography</category>
+	<parameter varname='a' type='__m128i'/>
+	<parameter varname='RoundKey' type='__m128i'/>
+	<description>Perform the last round of an AES encryption flow on data (state) in "a" using the round key in "RoundKey", and store the result in "dst".</description>
+	<operation>state := a
+a[127:0] := ShiftRows(a[127:0])
+a[127:0] := SubBytes(a[127:0])
+dst[127:0] := a[127:0] XOR RoundKey[127:0]
+	</operation>
+	<instruction name='aesenclast' form='xmm, xmm'/>
+	<header>wmmintrin.h</header>
+</intrinsic>
+<intrinsic tech='Other' vexEq='TRUE' rettype='__m128i' name='_mm_aesdec_si128'>
+	<type>Integer</type>
+	<CPUID>AES</CPUID>
+	<category>Cryptography</category>
+	<parameter varname='a' type='__m128i'/>
+	<parameter varname='RoundKey' type='__m128i'/>
+	<description>Perform one round of an AES decryption flow on data (state) in "a" using the round key in "RoundKey", and store the result in "dst".</description>
+	<operation>state := a
+a[127:0] := InvShiftRows(a[127:0])
+a[127:0] := InvSubBytes(a[127:0])
+a[127:0] := InvMixColumns(a[127:0])
+dst[127:0] := a[127:0] XOR RoundKey[127:0]
+	</operation>
+	<instruction name='aesdec' form='xmm, xmm'/>
+	<header>wmmintrin.h</header>
+</intrinsic>
+<intrinsic tech='Other' vexEq='TRUE' rettype='__m128i' name='_mm_aesdeclast_si128'>
+	<type>Integer</type>
+	<CPUID>AES</CPUID>
+	<category>Cryptography</category>
+	<parameter varname='a' type='__m128i'/>
+	<parameter varname='RoundKey' type='__m128i'/>
+	<description>Perform the last round of an AES decryption flow on data (state) in "a" using the round key in "RoundKey", and store the result in "dst".</description>
+	<operation>state := a
+a[127:0] := InvShiftRows(a[127:0])
+a[127:0] := InvSubBytes(a[127:0])
+dst[127:0] := a[127:0] XOR RoundKey[127:0]
+	</operation>
+	<instruction name='aesdeclast' form='xmm, xmm'/>
+	<header>wmmintrin.h</header>
+</intrinsic>
+<intrinsic tech='Other' vexEq='TRUE' rettype='__m128i' name='_mm_aesimc_si128'>
+	<type>Integer</type>
+	<CPUID>AES</CPUID>
+	<category>Cryptography</category>
+	<parameter varname='a' type='__m128i'/>
+	<description>Perform the InvMixColumns transformation on "a" and store the result in "dst".</description>
+	<operation>
+dst[127:0] := InvMixColumns(a[127:0])
+	</operation>
+	<instruction name='aesimc' form='xmm, xmm'/>
+	<header>wmmintrin.h</header>
+</intrinsic>
+<intrinsic tech='Other' vexEq='TRUE' rettype='__m128i' name='_mm_aeskeygenassist_si128'>
+	<type>Integer</type>
+	<CPUID>AES</CPUID>
+	<category>Cryptography</category>
+	<parameter varname='a' type='__m128i'/>
+	<parameter varname="imm8" type='const int'/>
+	<description>Assist in expanding the AES cipher key by computing steps towards generating a round key for encryption cipher using data from "a" and an 8-bit round constant specified in "imm8", and store the result in "dst".
+	</description>
+	<operation>
+X3[31:0] := a[127:96]
+X2[31:0] := a[95:64]
+X1[31:0] := a[63:32]
+X0[31:0] := a[31:0]
+RCON[31:0] := ZeroExtend(imm8[7:0]);
+dst[31:0] := SubWord(X1)
+dst[63:32] := RotWord(SubWord(X1)) XOR RCON;
+dst[95:64] := SubWord(X3)
+dst[127:96] := RotWord(SubWord(X3)) XOR RCON;
+	</operation>
+	<instruction name='aeskeygenassist' form='xmm, xmm, imm'/>
+	<header>wmmintrin.h</header>
+</intrinsic>
+<intrinsic tech='Other' vexEq='TRUE' rettype='__m128i' name='_mm_clmulepi64_si128'>
+	<type>Integer</type>
+	<CPUID>PCLMULQDQ</CPUID>
+	<category>Application-Targeted</category>
+	<parameter varname='a' type='__m128i'/>
+	<parameter varname='b' type='__m128i'/>
+	<parameter varname='imm8' type='const int'/>
+	<description>Perform a carry-less multiplication of two 64-bit integers, selected from "a" and "b" according to "imm8", and store the results in "dst".
+	</description>
+	<operation>
+IF (imm8[0] = 0)
+	TEMP1 := a[63:0];
+ELSE
+	TEMP1 := a[127:64];
+FI 
+IF (imm8[4] = 0)
+	TEMP2 := b[63:0];
+ELSE 
+	TEMP2 := b[127:64];
+FI
+
+FOR i := 0 to 63
+	TEMP[i] := (TEMP1[0] AND TEMP2[i]);
+	FOR j := 1 to i
+		TEMP [i] := TEMP [i] XOR (TEMP1[j] AND TEMP2[i-j])
+	ENDFOR 
+	dst[i] := TEMP[i];
+ENDFOR
+FOR i := 64 to 127
+	TEMP [i] := 0;
+	FOR j := (i - 63) to 63
+		TEMP [i] := TEMP [i] XOR (TEMP1[j] AND TEMP2[i-j])
+	ENDFOR
+	dst[i] := TEMP[i];
+ENDFOR
+dst[127] := 0
+	</operation>
+	<instruction name='pclmulqdq' form='xmm, xmm, imm'/>
+	<header>wmmintrin.h</header>
+</intrinsic>
+
+<intrinsic tech='AVX' rettype='__m256d' name='_mm256_add_pd'>
+	<type>Floating Point</type>
+	<CPUID>AVX</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname='a' type='__m256d'/>
+	<parameter varname='b' type='__m256d'/>
+	<description>Add packed double-precision (64-bit) floating-point elements in "a" and "b", and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 3
+	i := j*64
+	dst[i+63:i] := a[i+63:i] + b[i+63:i]
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name='vaddpd' form='ymm, ymm, ymm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='AVX' rettype='__m256' name='_mm256_add_ps'>
+	<type>Floating Point</type>
+	<CPUID>AVX</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname='a' type='__m256'/>
+	<parameter varname='b' type='__m256'/>
+	<description>Add packed single-precision (32-bit) floating-point elements in "a" and "b", and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*32
+	dst[i+31:i] := a[i+31:i] + b[i+31:i]
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name='vaddps' form='ymm, ymm, ymm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='AVX' rettype='__m256d' name='_mm256_addsub_pd'>
+	<type>Floating Point</type>
+	<CPUID>AVX</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname='a' type='__m256d'/>
+	<parameter varname='b' type='__m256d'/>
+	<description>Alternatively add and subtract packed double-precision (64-bit) floating-point elements in "a" to/from packed elements in "b", and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 3
+	i := j*64
+	IF (j is even) 
+		dst[i+63:i] := a[i+63:i] - b[i+63:i]
+	ELSE
+		dst[i+63:i] := a[i+63:i] + b[i+63:i]
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name='vaddsubpd' form='ymm, ymm, ymm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='AVX' rettype='__m256' name='_mm256_addsub_ps'>
+	<type>Floating Point</type>
+	<CPUID>AVX</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname='a' type='__m256'/>
+	<parameter varname='b' type='__m256'/>
+	<description>Alternatively add and subtract packed single-precision (32-bit) floating-point elements in "a" to/from packed elements in "b", and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*32
+	IF (j is even) 
+		dst[i+31:i] := a[i+31:i] - b[i+31:i]
+	ELSE
+		dst[i+31:i] := a[i+31:i] + b[i+31:i]
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name='vaddsubps' form='ymm, ymm, ymm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='AVX' rettype='__m256d' name='_mm256_and_pd'>
+	<type>Floating Point</type>
+	<CPUID>AVX</CPUID>
+	<category>Logical</category>
+	<parameter varname='a' type='__m256d'/>
+	<parameter varname='b' type='__m256d'/>
+		<description>Compute the bitwise AND of packed double-precision (64-bit) floating-point elements in "a" and "b", and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 3
+	i := j*64
+	dst[i+63:i] := (a[i+63:i] AND b[i+63:i])
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name='vandpd' form='ymm, ymm, ymm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='AVX' rettype='__m256' name='_mm256_and_ps'>
+	<type>Floating Point</type>
+	<CPUID>AVX</CPUID>
+	<category>Logical</category>
+	<parameter varname='a' type='__m256'/>
+	<parameter varname='b' type='__m256'/>
+	<description>Compute the bitwise AND of packed single-precision (32-bit) floating-point elements in "a" and "b", and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*32
+	dst[i+31:i] := (a[i+31:i] AND b[i+31:i])
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name='vandps' form='ymm, ymm, ymm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='AVX' rettype='__m256d' name='_mm256_andnot_pd'>
+	<type>Floating Point</type>
+	<CPUID>AVX</CPUID>
+	<category>Logical</category>
+	<parameter varname='a' type='__m256d'/>
+	<parameter varname='b' type='__m256d'/>
+	<description>Compute the bitwise NOT of packed double-precision (64-bit) floating-point elements in "a" and then AND with "b", and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 3
+	i := j*64
+	dst[i+63:i] := ((NOT a[i+63:i]) AND b[i+63:i])
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name='vandnpd' form='ymm, ymm, ymm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='AVX' rettype='__m256' name='_mm256_andnot_ps'>
+	<type>Floating Point</type>
+	<CPUID>AVX</CPUID>
+	<category>Logical</category>
+	<parameter varname='a' type='__m256'/>
+	<parameter varname='b' type='__m256'/>
+	<description>Compute the bitwise NOT of packed single-precision (32-bit) floating-point elements in "a" and then AND with "b", and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*32
+	dst[i+31:i] := ((NOT a[i+31:i]) AND b[i+31:i])
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name='vandnps' form='ymm, ymm, ymm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='AVX' rettype='__m256d' name='_mm256_blend_pd'>
+	<type>Floating Point</type>
+	<CPUID>AVX</CPUID>
+	<category>Swizzle</category>
+	<parameter varname='a' type='__m256d'/>
+	<parameter varname='b' type='__m256d'/>
+	<parameter varname="imm8" type='const int'/>
+	<description>Blend packed double-precision (64-bit) floating-point elements from "a" and "b" using control mask "imm8", and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 3
+	i := j*64
+	IF imm8[j%8]
+		dst[i+63:i] := b[i+63:i]
+	ELSE
+		dst[i+63:i] := a[i+63:i]
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name='vblendpd' form='ymm, ymm, ymm, imm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='AVX' rettype='__m256' name='_mm256_blend_ps'>
+	<type>Floating Point</type>
+	<CPUID>AVX</CPUID>
+	<category>Swizzle</category>
+	<parameter varname='a' type='__m256'/>
+	<parameter varname='b' type='__m256'/>
+	<parameter varname="imm8" type='const int'/>
+	<description>Blend packed single-precision (32-bit) floating-point elements from "a" and "b" using control mask "imm8", and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*32
+	IF imm8[j%8]
+		dst[i+31:i] := b[i+31:i]
+	ELSE
+		dst[i+31:i] := a[i+31:i]
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name='vblendps' form='ymm, ymm, ymm, imm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='AVX' rettype='__m256d' name='_mm256_blendv_pd'>
+	<type>Floating Point</type>
+	<CPUID>AVX</CPUID>
+	<category>Swizzle</category>
+	<parameter varname='a' type='__m256d'/>
+	<parameter varname='b' type='__m256d'/>
+	<parameter varname='mask' type='__m256d'/>
+	<description>Blend packed double-precision (64-bit) floating-point elements from "a" and "b" using "mask", and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 3
+	i := j*64
+	IF mask[i+63]
+		dst[i+63:i] := b[i+63:i]
+	ELSE
+		dst[i+63:i] := a[i+63:i]
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name='vblendvpd' form='ymm, ymm, ymm, ymm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='AVX' rettype='__m256' name='_mm256_blendv_ps'>
+	<type>Floating Point</type>
+	<CPUID>AVX</CPUID>
+	<category>Swizzle</category>
+	<parameter varname='a' type='__m256'/>
+	<parameter varname='b' type='__m256'/>
+	<parameter varname='mask' type='__m256'/>
+	<description>Blend packed single-precision (32-bit) floating-point elements from "a" and "b" using "mask", and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*32
+	IF mask[i+31]
+		dst[i+31:i] := b[i+31:i]
+	ELSE
+		dst[i+31:i] := a[i+31:i]
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name='vblendvps' form='ymm, ymm, ymm, ymm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='AVX' rettype='__m256d' name='_mm256_div_pd'>
+	<type>Floating Point</type>
+	<CPUID>AVX</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname='a' type='__m256d'/>
+	<parameter varname='b' type='__m256d'/>
+	<description>Divide packed double-precision (64-bit) floating-point elements in "a" by packed elements in "b", and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 3
+	i := 64*j
+	dst[i+63:i] := a[i+63:i] / b[i+63:i]
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name='vdivpd' form='ymm, ymm, ymm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='AVX' rettype='__m256' name='_mm256_div_ps'>
+	<type>Floating Point</type>
+	<CPUID>AVX</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname='a' type='__m256'/>
+	<parameter varname='b' type='__m256'/>
+	<description>Divide packed single-precision (32-bit) floating-point elements in "a" by packed elements in "b", and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 7
+	i := 32*j
+	dst[i+31:i] := a[i+31:i] / b[i+31:i]
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name='vdivps' form='ymm, ymm, ymm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='AVX' rettype='__m256' name='_mm256_dp_ps'>
+	<type>Floating Point</type>
+	<CPUID>AVX</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname='a' type='__m256'/>
+	<parameter varname='b' type='__m256'/>
+	<parameter varname="imm8" type='const int'/>
+	<description>Conditionally multiply the packed single-precision (32-bit) floating-point elements in "a" and "b" using the high 4 bits in "imm8", sum the four products, and conditionally store the sum in "dst" using the low 4 bits of "imm8".</description>
+	<operation>
+DP(a[127:0], b[127:0], imm8[7:0]) {
+	FOR j := 0 to 3
+		i := j*32
+		IF imm8[(4+j)%8]
+			temp[i+31:i] := a[i+31:i] * b[i+31:i]
+		ELSE
+			temp[i+31:i] := 0
+		FI
+	ENDFOR
+	
+	sum[31:0] := (temp[127:96] + temp[95:64]) + (temp[63:32] + temp[31:0])
+	
+	FOR j := 0 to 3
+		i := j*32
+		IF imm8[j%8]
+			tmpdst[i+31:i] := sum[31:0]
+		ELSE
+			tmpdst[i+31:i] := 0
+		FI
+	ENDFOR
+	RETURN tmpdst[127:0]
+}
+
+dst[127:0] := DP(a[127:0], b[127:0], imm8[7:0])
+dst[255:128] := DP(a[255:128], b[255:128], imm8[7:0])
+dst[MAX:256] := 0
+	</operation>
+	<instruction name='vdpps' form='ymm, ymm, ymm, imm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='AVX' rettype='__m256d' name='_mm256_hadd_pd'>
+	<type>Floating Point</type>
+	<CPUID>AVX</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname='a' type='__m256d'/>
+	<parameter varname='b' type='__m256d'/>
+	<description>Horizontally add adjacent pairs of double-precision (64-bit) floating-point elements in "a" and "b", and pack the results in "dst".</description>
+	<operation>
+dst[63:0] := a[127:64] + a[63:0]
+dst[127:64] := b[127:64] + b[63:0]
+dst[191:128] := a[255:192] + a[191:128]
+dst[255:192] := b[255:192] + b[191:128]
+dst[MAX:256] := 0
+	</operation>
+	<instruction name='vhaddpd' form='ymm, ymm, ymm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='AVX' rettype='__m256' name='_mm256_hadd_ps'>
+	<type>Floating Point</type>
+	<CPUID>AVX</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname='a' type='__m256'/>
+	<parameter varname='b' type='__m256'/>
+	<description>Horizontally add adjacent pairs of single-precision (32-bit) floating-point elements in "a" and "b", and pack the results in "dst".</description>
+	<operation>
+dst[31:0] := a[63:32] + a[31:0]
+dst[63:32] := a[127:96] + a[95:64]
+dst[95:64] := b[63:32] + b[31:0]
+dst[127:96] := b[127:96] + b[95:64]
+dst[159:128] := a[191:160] + a[159:128]
+dst[191:160] := a[255:224] + a[223:192]
+dst[223:192] := b[191:160] + b[159:128]
+dst[255:224] := b[255:224] + b[223:192]
+dst[MAX:256] := 0
+	</operation>
+	<instruction name='vhaddps' form='ymm, ymm, ymm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='AVX' rettype='__m256d' name='_mm256_hsub_pd'>
+	<type>Floating Point</type>
+	<CPUID>AVX</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname='a' type='__m256d'/>
+	<parameter varname='b' type='__m256d'/>
+	<description>Horizontally subtract adjacent pairs of double-precision (64-bit) floating-point elements in "a" and "b", and pack the results in "dst".</description>
+	<operation>
+dst[63:0] := a[63:0] - a[127:64]
+dst[127:64] := b[63:0] - b[127:64]
+dst[191:128] := a[191:128] - a[255:192]
+dst[255:192] := b[191:128] - b[255:192]
+dst[MAX:256] := 0
+	</operation>
+	<instruction name='vhsubpd' form='ymm, ymm, ymm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='AVX' rettype='__m256' name='_mm256_hsub_ps'>
+	<type>Floating Point</type>
+	<CPUID>AVX</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname='a' type='__m256'/>
+	<parameter varname='b' type='__m256'/>
+	<description>Horizontally subtract adjacent pairs of single-precision (32-bit) floating-point elements in "a" and "b", and pack the results in "dst".</description>
+	<operation>
+dst[31:0] := a[31:0] - a[63:32]
+dst[63:32] := a[95:64] - a[127:96]
+dst[95:64] := b[31:0] - b[63:32]
+dst[127:96] := b[95:64] - b[127:96]
+dst[159:128] := a[159:128] - a[191:160]
+dst[191:160] := a[223:192] - a[255:224]
+dst[223:192] := b[159:128] - b[191:160]
+dst[255:224] := b[223:192] - b[255:224]
+dst[MAX:256] := 0
+	</operation>
+	<instruction name='vhsubps' form='ymm, ymm, ymm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='AVX' rettype='__m256d' name='_mm256_max_pd'>
+	<type>Floating Point</type>
+	<CPUID>AVX</CPUID>
+	<category>Special Math Functions</category>
+	<parameter varname='a' type='__m256d'/>
+	<parameter varname='b' type='__m256d'/>
+	<description>Compare packed double-precision (64-bit) floating-point elements in "a" and "b", and store packed maximum values in "dst".</description>
+	<operation>
+FOR j := 0 to 3
+	i := j*64
+	dst[i+63:i] := MAX(a[i+63:i], b[i+63:i])
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name='vmaxpd' form='ymm, ymm, ymm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='AVX' rettype='__m256' name='_mm256_max_ps'>
+	<type>Floating Point</type>
+	<CPUID>AVX</CPUID>
+	<category>Special Math Functions</category>
+	<parameter varname='a' type='__m256'/>
+	<parameter varname='b' type='__m256'/>
+	<description>Compare packed single-precision (32-bit) floating-point elements in "a" and "b", and store packed maximum values in "dst".</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*32
+	dst[i+31:i] := MAX(a[i+31:i], b[i+31:i])
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name='vmaxps' form='ymm, ymm, ymm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='AVX' rettype='__m256d' name='_mm256_min_pd'>
+	<type>Floating Point</type>
+	<CPUID>AVX</CPUID>
+	<category>Special Math Functions</category>
+	<parameter varname='a' type='__m256d'/>
+	<parameter varname='b' type='__m256d'/>
+	<description>Compare packed double-precision (64-bit) floating-point elements in "a" and "b", and store packed minimum values in "dst".
+	</description>
+	<operation>
+FOR j := 0 to 3
+	i := j*64
+	dst[i+63:i] := MIN(a[i+63:i], b[i+63:i])
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name='vminpd' form='ymm, ymm, ymm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='AVX' rettype='__m256' name='_mm256_min_ps'>
+	<type>Floating Point</type>
+	<CPUID>AVX</CPUID>
+	<category>Special Math Functions</category>
+	<parameter varname='a' type='__m256'/>
+	<parameter varname='b' type='__m256'/>
+	<description>Compare packed single-precision (32-bit) floating-point elements in "a" and "b", and store packed minimum values in "dst".
+	</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*32
+	dst[i+31:i] := MIN(a[i+31:i], b[i+31:i])
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name='vminps' form='ymm, ymm, ymm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='AVX' rettype='__m256d' name='_mm256_mul_pd'>
+	<type>Floating Point</type>
+	<CPUID>AVX</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname='a' type='__m256d'/>
+	<parameter varname='b' type='__m256d'/>
+	<description>Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", and store the results in "dst".
+	</description>
+	<operation>
+FOR j := 0 to 3
+	i := j*64
+	dst[i+63:i] := a[i+63:i] * b[i+63:i]
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name='vmulpd' form='ymm, ymm, ymm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='AVX' rettype='__m256' name='_mm256_mul_ps'>
+	<type>Floating Point</type>
+	<CPUID>AVX</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname='a' type='__m256'/>
+	<parameter varname='b' type='__m256'/>
+	<description>Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", and store the results in "dst".
+	</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*32
+	dst[i+31:i] := a[i+31:i] * b[i+31:i]
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name='vmulps' form='ymm, ymm, ymm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='AVX' rettype='__m256d' name='_mm256_or_pd'>
+	<type>Floating Point</type>
+	<CPUID>AVX</CPUID>
+	<category>Logical</category>
+	<parameter varname='a' type='__m256d'/>
+	<parameter varname='b' type='__m256d'/>
+	<description>Compute the bitwise OR of packed double-precision (64-bit) floating-point elements in "a" and "b", and store the results in "dst".
+	</description>
+	<operation>
+FOR j := 0 to 3
+	i := j*64
+	dst[i+63:i] := a[i+63:i] BITWISE OR b[i+63:i]
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name='vorpd' form='ymm, ymm, ymm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='AVX' rettype='__m256' name='_mm256_or_ps'>
+	<type>Floating Point</type>
+	<CPUID>AVX</CPUID>
+	<category>Logical</category>
+	<parameter varname='a' type='__m256'/>
+	<parameter varname='b' type='__m256'/>
+	<description>Compute the bitwise OR of packed single-precision (32-bit) floating-point elements in "a" and "b", and store the results in "dst".
+	</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*32
+	dst[i+31:i] := a[i+31:i] BITWISE OR b[i+31:i]
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name='vorps' form='ymm, ymm, ymm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='AVX' rettype='__m256d' name='_mm256_shuffle_pd'>
+	<type>Floating Point</type>
+	<CPUID>AVX</CPUID>
+	<category>Swizzle</category>
+	<parameter varname='a' type='__m256d'/>
+	<parameter varname='b' type='__m256d'/>
+	<parameter varname="imm8" type='const int'/>
+	<description>Shuffle double-precision (64-bit) floating-point elements within 128-bit lanes using the control in "imm8", and store the results in "dst". </description>
+	<operation>
+dst[63:0] := (imm8[0] == 0) ? a[63:0] : a[127:64]
+dst[127:64] := (imm8[1] == 0) ? b[63:0] : b[127:64]
+dst[191:128] := (imm8[2] == 0) ? a[191:128] : a[255:192]
+dst[255:192] := (imm8[3] == 0) ? b[191:128] : b[255:192]
+dst[MAX:256] := 0
+	</operation>
+	<instruction name='vshufpd' form='ymm, ymm, ymm, imm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='AVX' rettype='__m256' name='_mm256_shuffle_ps'>
+	<type>Floating Point</type>
+	<CPUID>AVX</CPUID>
+	<category>Swizzle</category>
+	<parameter varname='a' type='__m256'/>
+	<parameter varname='b' type='__m256'/>
+	<parameter varname="imm8" type='const int'/>
+	<description>Shuffle single-precision (32-bit) floating-point elements in "a" and "b" within 128-bit lanes using the control in "imm8", and store the results in "dst".</description>
+	<operation>
+SELECT4(src, control){
+	CASE(control[1:0])
+	0:	tmp[31:0] := src[31:0]
+	1:	tmp[31:0] := src[63:32]
+	2:	tmp[31:0] := src[95:64]
+	3:	tmp[31:0] := src[127:96]
+	ESAC
+	RETURN tmp[31:0]
+}
+
+dst[31:0] := SELECT4(a[127:0], imm8[1:0])
+dst[63:32] := SELECT4(a[127:0], imm8[3:2])
+dst[95:64] := SELECT4(b[127:0], imm8[5:4])
+dst[127:96] := SELECT4(b[127:0], imm8[7:6])
+dst[159:128] := SELECT4(a[255:128], imm8[1:0])
+dst[191:160] := SELECT4(a[255:128], imm8[3:2])
+dst[223:192] := SELECT4(b[255:128], imm8[5:4])
+dst[255:224] := SELECT4(b[255:128], imm8[7:6])
+dst[MAX:256] := 0
+	</operation>
+	<instruction name='vshufps' form='ymm, ymm, ymm, imm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='AVX' rettype='__m256d' name='_mm256_sub_pd'>
+	<type>Floating Point</type>
+	<CPUID>AVX</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname='a' type='__m256d'/>
+	<parameter varname='b' type='__m256d'/>
+	<description>Subtract packed double-precision (64-bit) floating-point elements in "b" from packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 3
+	i := j*64
+	dst[i+63:i] := a[i+63:i] - b[i+63:i]
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name='vsubpd' form='ymm, ymm, ymm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='AVX' rettype='__m256' name='_mm256_sub_ps'>
+	<type>Floating Point</type>
+	<CPUID>AVX</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname='a' type='__m256'/>
+	<parameter varname='b' type='__m256'/>
+	<description>Subtract packed single-precision (32-bit) floating-point elements in "b" from packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*32
+	dst[i+31:i] := a[i+31:i] - b[i+31:i]
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name='vsubps' form='ymm, ymm, ymm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='AVX' rettype='__m256d' name='_mm256_xor_pd'>
+	<type>Floating Point</type>
+	<CPUID>AVX</CPUID>
+	<category>Logical</category>
+	<parameter varname='a' type='__m256d'/>
+	<parameter varname='b' type='__m256d'/>
+	<description>Compute the bitwise XOR of packed double-precision (64-bit) floating-point elements in "a" and "b", and store the results in "dst".
+	</description>
+	<operation>
+FOR j := 0 to 3
+	i := j*64
+	dst[i+63:i] := a[i+63:i] XOR b[i+63:i]
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name='vxorpd' form='ymm, ymm, ymm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='AVX' rettype='__m256' name='_mm256_xor_ps'>
+	<type>Floating Point</type>
+	<CPUID>AVX</CPUID>
+	<category>Logical</category>
+	<parameter varname='a' type='__m256'/>
+	<parameter varname='b' type='__m256'/>
+	<description>Compute the bitwise XOR of packed single-precision (32-bit) floating-point elements in "a" and "b", and store the results in "dst".
+	</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*32
+	dst[i+31:i] := a[i+31:i] XOR b[i+31:i]
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name='vxorps' form='ymm, ymm, ymm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='AVX' rettype='__m128d' name='_mm_cmp_pd'>
+	<type>Floating Point</type>
+	<CPUID>AVX</CPUID>
+	<category>Compare</category>
+	<parameter varname='a' type='__m128d'/>
+	<parameter varname='b' type='__m128d'/>
+	<parameter varname="imm8" type='const int'/>
+	<description>Compare packed double-precision (64-bit) floating-point elements in "a" and "b" based on the comparison operand specified by "imm8", and store the results in "dst".</description>
+	<operation>
+CASE (imm8[7:0]) OF
+0: OP := _CMP_EQ_OQ
+1: OP := _CMP_LT_OS
+2: OP := _CMP_LE_OS
+3: OP := _CMP_UNORD_Q 
+4: OP := _CMP_NEQ_UQ
+5: OP := _CMP_NLT_US
+6: OP := _CMP_NLE_US
+7: OP := _CMP_ORD_Q
+8: OP := _CMP_EQ_UQ
+9: OP := _CMP_NGE_US
+10: OP := _CMP_NGT_US
+11: OP := _CMP_FALSE_OQ
+12: OP := _CMP_NEQ_OQ
+13: OP := _CMP_GE_OS
+14: OP := _CMP_GT_OS
+15: OP := _CMP_TRUE_UQ
+16: OP := _CMP_EQ_OS
+17: OP := _CMP_LT_OQ
+18: OP := _CMP_LE_OQ
+19: OP := _CMP_UNORD_S
+20: OP := _CMP_NEQ_US
+21: OP := _CMP_NLT_UQ
+22: OP := _CMP_NLE_UQ
+23: OP := _CMP_ORD_S
+24: OP := _CMP_EQ_US
+25: OP := _CMP_NGE_UQ 
+26: OP := _CMP_NGT_UQ 
+27: OP := _CMP_FALSE_OS 
+28: OP := _CMP_NEQ_OS 
+29: OP := _CMP_GE_OQ
+30: OP := _CMP_GT_OQ
+31: OP := _CMP_TRUE_US
+ESAC
+FOR j := 0 to 1
+	i := j*64
+	dst[i+63:i] := ( a[i+63:i] OP b[i+63:i] ) ? 0xFFFFFFFFFFFFFFFF : 0
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name='vcmppd' form='xmm, xmm, xmm, imm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='AVX' rettype='__m256d' name='_mm256_cmp_pd'>
+	<type>Floating Point</type>
+	<CPUID>AVX</CPUID>
+	<category>Compare</category>
+	<parameter varname='a' type='__m256d'/>
+	<parameter varname='b' type='__m256d'/>
+	<parameter varname="imm8" type='const int'/>
+	<description>Compare packed double-precision (64-bit) floating-point elements in "a" and "b" based on the comparison operand specified by "imm8", and store the results in "dst".</description>
+	<operation>
+CASE (imm8[7:0]) OF
+0: OP := _CMP_EQ_OQ
+1: OP := _CMP_LT_OS
+2: OP := _CMP_LE_OS
+3: OP := _CMP_UNORD_Q 
+4: OP := _CMP_NEQ_UQ
+5: OP := _CMP_NLT_US
+6: OP := _CMP_NLE_US
+7: OP := _CMP_ORD_Q
+8: OP := _CMP_EQ_UQ
+9: OP := _CMP_NGE_US
+10: OP := _CMP_NGT_US
+11: OP := _CMP_FALSE_OQ
+12: OP := _CMP_NEQ_OQ
+13: OP := _CMP_GE_OS
+14: OP := _CMP_GT_OS
+15: OP := _CMP_TRUE_UQ
+16: OP := _CMP_EQ_OS
+17: OP := _CMP_LT_OQ
+18: OP := _CMP_LE_OQ
+19: OP := _CMP_UNORD_S
+20: OP := _CMP_NEQ_US
+21: OP := _CMP_NLT_UQ
+22: OP := _CMP_NLE_UQ
+23: OP := _CMP_ORD_S
+24: OP := _CMP_EQ_US
+25: OP := _CMP_NGE_UQ 
+26: OP := _CMP_NGT_UQ 
+27: OP := _CMP_FALSE_OS 
+28: OP := _CMP_NEQ_OS 
+29: OP := _CMP_GE_OQ
+30: OP := _CMP_GT_OQ
+31: OP := _CMP_TRUE_US
+ESAC
+FOR j := 0 to 3
+	i := j*64
+	dst[i+63:i] := ( a[i+63:i] OP b[i+63:i] ) ? 0xFFFFFFFFFFFFFFFF : 0
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name='vcmppd' form='ymm, ymm, ymm, imm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='AVX' rettype='__m128' name='_mm_cmp_ps'>
+	<type>Floating Point</type>
+	<CPUID>AVX</CPUID>
+	<category>Compare</category>
+	<parameter varname='a' type='__m128'/>
+	<parameter varname='b' type='__m128'/>
+	<parameter varname="imm8" type='const int'/>
+	<description>Compare packed single-precision (32-bit) floating-point elements in "a" and "b" based on the comparison operand specified by "imm8", and store the results in "dst".</description>
+	<operation>
+CASE (imm8[7:0]) OF
+0: OP := _CMP_EQ_OQ
+1: OP := _CMP_LT_OS
+2: OP := _CMP_LE_OS
+3: OP := _CMP_UNORD_Q 
+4: OP := _CMP_NEQ_UQ
+5: OP := _CMP_NLT_US
+6: OP := _CMP_NLE_US
+7: OP := _CMP_ORD_Q
+8: OP := _CMP_EQ_UQ
+9: OP := _CMP_NGE_US
+10: OP := _CMP_NGT_US
+11: OP := _CMP_FALSE_OQ
+12: OP := _CMP_NEQ_OQ
+13: OP := _CMP_GE_OS
+14: OP := _CMP_GT_OS
+15: OP := _CMP_TRUE_UQ
+16: OP := _CMP_EQ_OS
+17: OP := _CMP_LT_OQ
+18: OP := _CMP_LE_OQ
+19: OP := _CMP_UNORD_S
+20: OP := _CMP_NEQ_US
+21: OP := _CMP_NLT_UQ
+22: OP := _CMP_NLE_UQ
+23: OP := _CMP_ORD_S
+24: OP := _CMP_EQ_US
+25: OP := _CMP_NGE_UQ 
+26: OP := _CMP_NGT_UQ 
+27: OP := _CMP_FALSE_OS 
+28: OP := _CMP_NEQ_OS 
+29: OP := _CMP_GE_OQ
+30: OP := _CMP_GT_OQ
+31: OP := _CMP_TRUE_US
+ESAC
+FOR j := 0 to 3
+	i := j*32
+	dst[i+31:i] := ( a[i+31:i] OP b[i+31:i] ) ? 0xFFFFFFFF : 0
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name='vcmpps' form='xmm, xmm, xmm, imm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='AVX' rettype='__m256' name='_mm256_cmp_ps'>
+	<type>Floating Point</type>
+	<CPUID>AVX</CPUID>
+	<category>Compare</category>
+	<parameter varname='a' type='__m256'/>
+	<parameter varname='b' type='__m256'/>
+	<parameter varname="imm8" type='const int'/>
+	<description>Compare packed single-precision (32-bit) floating-point elements in "a" and "b" based on the comparison operand specified by "imm8", and store the results in "dst".</description>
+	<operation>
+CASE (imm8[7:0]) OF
+0: OP := _CMP_EQ_OQ
+1: OP := _CMP_LT_OS
+2: OP := _CMP_LE_OS
+3: OP := _CMP_UNORD_Q 
+4: OP := _CMP_NEQ_UQ
+5: OP := _CMP_NLT_US
+6: OP := _CMP_NLE_US
+7: OP := _CMP_ORD_Q
+8: OP := _CMP_EQ_UQ
+9: OP := _CMP_NGE_US
+10: OP := _CMP_NGT_US
+11: OP := _CMP_FALSE_OQ
+12: OP := _CMP_NEQ_OQ
+13: OP := _CMP_GE_OS
+14: OP := _CMP_GT_OS
+15: OP := _CMP_TRUE_UQ
+16: OP := _CMP_EQ_OS
+17: OP := _CMP_LT_OQ
+18: OP := _CMP_LE_OQ
+19: OP := _CMP_UNORD_S
+20: OP := _CMP_NEQ_US
+21: OP := _CMP_NLT_UQ
+22: OP := _CMP_NLE_UQ
+23: OP := _CMP_ORD_S
+24: OP := _CMP_EQ_US
+25: OP := _CMP_NGE_UQ 
+26: OP := _CMP_NGT_UQ 
+27: OP := _CMP_FALSE_OS 
+28: OP := _CMP_NEQ_OS 
+29: OP := _CMP_GE_OQ
+30: OP := _CMP_GT_OQ
+31: OP := _CMP_TRUE_US
+ESAC
+FOR j := 0 to 7
+	i := j*32
+	dst[i+31:i] := ( a[i+31:i] OP b[i+31:i] ) ? 0xFFFFFFFF : 0
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name='vcmpps' form='ymm, ymm, ymm, imm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='AVX' rettype='__m128d' name='_mm_cmp_sd'>
+	<type>Floating Point</type>
+	<CPUID>AVX</CPUID>
+	<category>Compare</category>
+	<parameter varname='a' type='__m128d'/>
+	<parameter varname='b' type='__m128d'/>
+	<parameter varname="imm8" type='const int'/>
+	<description>Compare the lower double-precision (64-bit) floating-point element in "a" and "b" based on the comparison operand specified by "imm8", store the result in the lower element of "dst", and copy the upper element from "a" to the upper element of "dst".</description>
+	<operation>
+CASE (imm8[7:0]) OF
+0: OP := _CMP_EQ_OQ
+1: OP := _CMP_LT_OS
+2: OP := _CMP_LE_OS
+3: OP := _CMP_UNORD_Q 
+4: OP := _CMP_NEQ_UQ
+5: OP := _CMP_NLT_US
+6: OP := _CMP_NLE_US
+7: OP := _CMP_ORD_Q
+8: OP := _CMP_EQ_UQ
+9: OP := _CMP_NGE_US
+10: OP := _CMP_NGT_US
+11: OP := _CMP_FALSE_OQ
+12: OP := _CMP_NEQ_OQ
+13: OP := _CMP_GE_OS
+14: OP := _CMP_GT_OS
+15: OP := _CMP_TRUE_UQ
+16: OP := _CMP_EQ_OS
+17: OP := _CMP_LT_OQ
+18: OP := _CMP_LE_OQ
+19: OP := _CMP_UNORD_S
+20: OP := _CMP_NEQ_US
+21: OP := _CMP_NLT_UQ
+22: OP := _CMP_NLE_UQ
+23: OP := _CMP_ORD_S
+24: OP := _CMP_EQ_US
+25: OP := _CMP_NGE_UQ 
+26: OP := _CMP_NGT_UQ 
+27: OP := _CMP_FALSE_OS 
+28: OP := _CMP_NEQ_OS 
+29: OP := _CMP_GE_OQ
+30: OP := _CMP_GT_OQ
+31: OP := _CMP_TRUE_US
+ESAC
+
+dst[63:0] := ( a[63:0] OP b[63:0] ) ? 0xFFFFFFFFFFFFFFFF : 0
+dst[127:64] := a[127:64]
+dst[MAX:128] := 0
+	</operation>
+	<instruction name='vcmpsd' form='xmm, xmm, xmm, imm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='AVX' rettype='__m128' name='_mm_cmp_ss'>
+	<type>Floating Point</type>
+	<CPUID>AVX</CPUID>
+	<category>Compare</category>
+	<parameter varname='a' type='__m128'/>
+	<parameter varname='b' type='__m128'/>
+	<parameter varname="imm8" type='const int'/>
+	<description>Compare the lower single-precision (32-bit) floating-point element in "a" and "b" based on the comparison operand specified by "imm8", store the result in the lower element of "dst", and copy the upper 3 packed elements from "a" to the upper elements of "dst". </description>
+	<operation>
+CASE (imm8[7:0]) OF
+0: OP := _CMP_EQ_OQ
+1: OP := _CMP_LT_OS
+2: OP := _CMP_LE_OS
+3: OP := _CMP_UNORD_Q 
+4: OP := _CMP_NEQ_UQ
+5: OP := _CMP_NLT_US
+6: OP := _CMP_NLE_US
+7: OP := _CMP_ORD_Q
+8: OP := _CMP_EQ_UQ
+9: OP := _CMP_NGE_US
+10: OP := _CMP_NGT_US
+11: OP := _CMP_FALSE_OQ
+12: OP := _CMP_NEQ_OQ
+13: OP := _CMP_GE_OS
+14: OP := _CMP_GT_OS
+15: OP := _CMP_TRUE_UQ
+16: OP := _CMP_EQ_OS
+17: OP := _CMP_LT_OQ
+18: OP := _CMP_LE_OQ
+19: OP := _CMP_UNORD_S
+20: OP := _CMP_NEQ_US
+21: OP := _CMP_NLT_UQ
+22: OP := _CMP_NLE_UQ
+23: OP := _CMP_ORD_S
+24: OP := _CMP_EQ_US
+25: OP := _CMP_NGE_UQ 
+26: OP := _CMP_NGT_UQ 
+27: OP := _CMP_FALSE_OS 
+28: OP := _CMP_NEQ_OS 
+29: OP := _CMP_GE_OQ
+30: OP := _CMP_GT_OQ
+31: OP := _CMP_TRUE_US
+ESAC
+
+dst[31:0] := ( a[31:0] OP b[31:0] ) ? 0xFFFFFFFF : 0
+dst[127:32] := a[127:32]
+dst[MAX:128] := 0
+	</operation>
+	<instruction name='vcmpss' form='xmm, xmm, xmm, imm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='AVX' rettype='__m256d' name='_mm256_cvtepi32_pd'>
+	<type>Floating Point</type>
+	<type>Integer</type>
+	<CPUID>AVX</CPUID>
+	<category>Convert</category>
+	<parameter varname='a' type='__m128i'/>
+	<description>Convert packed 32-bit integers in "a" to packed double-precision (64-bit) floating-point elements, and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 3
+	i := j*32
+	m := j*64
+	dst[m+63:m] := Convert_Int32_To_FP64(a[i+31:i])
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name='vcvtdq2pd' form='ymm, xmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='AVX' rettype='__m256' name='_mm256_cvtepi32_ps'>
+	<type>Floating Point</type>
+	<type>Integer</type>
+	<CPUID>AVX</CPUID>
+	<category>Convert</category>
+	<parameter varname='a' type='__m256i'/>
+	<description>Convert packed 32-bit integers in "a" to packed single-precision (32-bit) floating-point elements, and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 7
+	i := 32*j
+	dst[i+31:i] := Convert_Int32_To_FP32(a[i+31:i])
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name='vcvtdq2ps' form='ymm, ymm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='AVX' rettype='__m128' name='_mm256_cvtpd_ps'>
+	<type>Floating Point</type>
+	<CPUID>AVX</CPUID>
+	<category>Convert</category>
+	<parameter varname='a' type='__m256d'/>
+	<description>Convert packed double-precision (64-bit) floating-point elements in "a" to packed single-precision (32-bit) floating-point elements, and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 3
+	i := 32*j
+	k := 64*j
+	dst[i+31:i] := Convert_FP64_To_FP32(a[k+63:k])
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name='vcvtpd2ps' form='xmm, ymm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='AVX' rettype='__m256i' name='_mm256_cvtps_epi32'>
+	<type>Floating Point</type>
+	<type>Integer</type>
+	<CPUID>AVX</CPUID>
+	<category>Convert</category>
+	<parameter varname='a' type='__m256'/>
+	<description>Convert packed single-precision (32-bit) floating-point elements in "a" to packed 32-bit integers, and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 7
+	i := 32*j
+	dst[i+31:i] := Convert_FP32_To_Int32(a[i+31:i])
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name='vcvtps2dq' form='ymm, ymm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='AVX' rettype='__m256d' name='_mm256_cvtps_pd'>
+	<type>Floating Point</type>
+	<CPUID>AVX</CPUID>
+	<category>Convert</category>
+	<parameter varname='a' type='__m128'/>
+	<description>Convert packed single-precision (32-bit) floating-point elements in "a" to packed double-precision (64-bit) floating-point elements, and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 3
+	i := 64*j
+	k := 32*j
+	dst[i+63:i] := Convert_FP32_To_FP64(a[k+31:k])
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name='vcvtps2pd' form='ymm, xmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='AVX' rettype='__m128i' name='_mm256_cvttpd_epi32'>
+	<type>Floating Point</type>
+	<type>Integer</type>
+	<CPUID>AVX</CPUID>
+	<category>Convert</category>
+	<parameter varname='a' type='__m256d'/>
+	<description>Convert packed double-precision (64-bit) floating-point elements in "a" to packed 32-bit integers with truncation, and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 3
+	i := 32*j
+	k := 64*j
+	dst[i+31:i] := Convert_FP64_To_Int32_Truncate(a[k+63:k])
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name='vcvttpd2dq' form='xmm, ymm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='AVX' rettype='__m128i' name='_mm256_cvtpd_epi32'>
+	<type>Floating Point</type>
+	<type>Integer</type>
+	<CPUID>AVX</CPUID>
+	<category>Convert</category>
+	<parameter varname='a' type='__m256d'/>
+	<description>Convert packed double-precision (64-bit) floating-point elements in "a" to packed 32-bit integers, and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 3
+	i := 32*j
+	k := 64*j
+	dst[i+31:i] := Convert_FP64_To_Int32(a[k+63:k])
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name='vcvtpd2dq' form='xmm, ymm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='AVX' rettype='__m256i' name='_mm256_cvttps_epi32'>
+	<type>Floating Point</type>
+	<type>Integer</type>
+	<CPUID>AVX</CPUID>
+	<category>Convert</category>
+	<parameter varname='a' type='__m256'/>
+	<description>Convert packed single-precision (32-bit) floating-point elements in "a" to packed 32-bit integers with truncation, and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 7
+	i := 32*j
+	dst[i+31:i] := Convert_FP32_To_Int32_Truncate(a[i+31:i])
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name='vcvttps2dq' form='ymm, ymm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='AVX' rettype='__m128' name='_mm256_extractf128_ps'>
+	<type>Floating Point</type>
+	<CPUID>AVX</CPUID>
+	<category>Swizzle</category>
+	<parameter varname='a' type='__m256'/>
+	<parameter varname="imm8" type='const int'/>
+	<description>Extract 128 bits (composed of 4 packed single-precision (32-bit) floating-point elements) from "a", selected with "imm8", and store the result in "dst".</description>
+	<operation>
+CASE imm8[7:0] of
+0: dst[127:0] := a[127:0]
+1: dst[127:0] := a[255:128]
+ESAC
+dst[MAX:128] := 0
+	</operation>
+	<instruction name='vextractf128' form='xmm, ymm, imm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='AVX' rettype='__m128d' name='_mm256_extractf128_pd'>
+	<type>Floating Point</type>
+	<CPUID>AVX</CPUID>
+	<category>Swizzle</category>
+	<parameter varname='a' type='__m256d'/>
+	<parameter varname="imm8" type='const int'/>
+	<description>Extract 128 bits (composed of 2 packed double-precision (64-bit) floating-point elements) from "a", selected with "imm8", and store the result in "dst".</description>
+	<operation>
+CASE imm8[7:0] of
+0: dst[127:0] := a[127:0]
+1: dst[127:0] := a[255:128]
+ESAC
+dst[MAX:128] := 0
+	</operation>
+	<instruction name='vextractf128' form='xmm, ymm, imm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='AVX' rettype='__m128i' name='_mm256_extractf128_si256'>
+	<type>Integer</type>
+	<CPUID>AVX</CPUID>
+	<category>Swizzle</category>
+	<parameter varname='a' type='__m256i'/>
+	<parameter varname="imm8" type='const int'/>
+	<description>Extract 128 bits (composed of integer data) from "a", selected with "imm8", and store the result in "dst".</description>
+	<operation>
+CASE imm8[7:0] of
+0: dst[127:0] := a[127:0]
+1: dst[127:0] := a[255:128]
+ESAC
+dst[MAX:128] := 0
+	</operation>
+	<instruction name='vextractf128' form='xmm, ymm, imm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='AVX2' sequence='true' rettype='__int8' name='_mm256_extract_epi8'>
+	<type>Integer</type>
+	<CPUID>AVX2</CPUID>
+	<category>Swizzle</category>
+	<parameter varname='a' type='__m256i'/>
+	<parameter varname="index" type='const int'/>
+	<description>Extract an 8-bit integer from "a", selected with "index", and store the result in "dst".</description>
+	<operation>
+dst[7:0] := (a[255:0] &gt;&gt; (index * 8))[7:0]
+	</operation>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='AVX2' sequence='true' rettype='__int16' name='_mm256_extract_epi16'>
+	<type>Integer</type>
+	<CPUID>AVX2</CPUID>
+	<category>Swizzle</category>
+	<parameter varname='a' type='__m256i'/>
+	<parameter varname="index" type='const int'/>
+	<description>Extract a 16-bit integer from "a", selected with "index", and store the result in "dst".</description>
+	<operation>
+dst[15:0] := (a[255:0] &gt;&gt; (index * 16))[15:0]
+	</operation>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='AVX' sequence='true' rettype='__int32' name='_mm256_extract_epi32'>
+	<type>Integer</type>
+	<CPUID>AVX</CPUID>
+	<category>Swizzle</category>
+	<parameter varname='a' type='__m256i'/>
+	<parameter varname="index" type='const int'/>
+	<description>Extract a 32-bit integer from "a", selected with "index", and store the result in "dst".</description>
+	<operation>
+dst[31:0] := (a[255:0] &gt;&gt; (index * 32))[31:0]
+	</operation>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='AVX' sequence='true' rettype='__int64' name='_mm256_extract_epi64'>
+	<type>Integer</type>
+	<CPUID>AVX</CPUID>
+	<category>Swizzle</category>
+	<parameter varname='a' type='__m256i'/>
+	<parameter varname="index" type='const int'/>
+	<description>Extract a 64-bit integer from "a", selected with "index", and store the result in "dst".</description>
+	<operation>
+dst[63:0] := (a[255:0] &gt;&gt; (index * 64))[63:0]
+	</operation>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='AVX' rettype='void' name='_mm256_zeroall'>
+	<CPUID>AVX</CPUID>
+	<category>General Support</category>
+	<parameter varname='' type='void'/>
+	<description>Zero the contents of all XMM or YMM registers.</description>
+	<operation>
+YMM0[MAX:0] := 0
+YMM1[MAX:0] := 0
+YMM2[MAX:0] := 0
+YMM3[MAX:0] := 0
+YMM4[MAX:0] := 0
+YMM5[MAX:0] := 0
+YMM6[MAX:0] := 0
+YMM7[MAX:0] := 0
+IF 64-bit mode
+	YMM8[MAX:0] := 0
+	YMM9[MAX:0] := 0
+	YMM10[MAX:0] := 0
+	YMM11[MAX:0] := 0
+	YMM12[MAX:0] := 0
+	YMM13[MAX:0] := 0
+	YMM14[MAX:0] := 0
+	YMM15[MAX:0] := 0
+FI
+</operation>
+	<instruction name='vzeroall' form=''/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='AVX' rettype='void' name='_mm256_zeroupper'>
+	<CPUID>AVX</CPUID>
+	<category>General Support</category>
+	<parameter varname='' type='void'/>
+	<description>Zero the upper 128 bits of all YMM registers; the lower 128 bits of the registers are unmodified.</description>
+	<operation>
+YMM0[MAX:128] := 0
+YMM1[MAX:128] := 0
+YMM2[MAX:128] := 0
+YMM3[MAX:128] := 0
+YMM4[MAX:128] := 0
+YMM5[MAX:128] := 0
+YMM6[MAX:128] := 0
+YMM7[MAX:128] := 0
+IF 64-bit mode
+	YMM8[MAX:128] := 0
+	YMM9[MAX:128] := 0
+	YMM10[MAX:128] := 0
+	YMM11[MAX:128] := 0
+	YMM12[MAX:128] := 0
+	YMM13[MAX:128] := 0
+	YMM14[MAX:128] := 0
+	YMM15[MAX:128] := 0
+FI
+</operation>
+	<instruction name='vzeroupper' form=''/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='AVX' rettype='__m256' name='_mm256_permutevar_ps'>
+	<type>Floating Point</type>
+	<CPUID>AVX</CPUID>
+	<category>Swizzle</category>
+	<parameter varname='a' type='__m256'/>
+	<parameter varname='b' type='__m256i'/>
+	<description>Shuffle single-precision (32-bit) floating-point elements in "a" within 128-bit lanes using the control in "b", and store the results in "dst".</description>
+	<operation>
+SELECT4(src, control){
+	CASE(control[1:0])
+	0:	tmp[31:0] := src[31:0]
+	1:	tmp[31:0] := src[63:32]
+	2:	tmp[31:0] := src[95:64]
+	3:	tmp[31:0] := src[127:96]
+	ESAC
+	RETURN tmp[31:0]
+}
+
+dst[31:0] := SELECT4(a[127:0], b[1:0])
+dst[63:32] := SELECT4(a[127:0], b[33:32])
+dst[95:64] := SELECT4(a[127:0], b[65:64])
+dst[127:96] := SELECT4(a[127:0], b[97:96])
+dst[159:128] := SELECT4(a[255:128], b[129:128])
+dst[191:160] := SELECT4(a[255:128], b[161:160])
+dst[223:192] := SELECT4(a[255:128], b[193:192])
+dst[255:224] := SELECT4(a[255:128], b[225:224])
+dst[MAX:256] := 0
+	</operation>
+	<instruction name='vpermilps' form='ymm, ymm, ymm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='AVX' rettype='__m128' name='_mm_permutevar_ps'>
+	<type>Floating Point</type>
+	<CPUID>AVX</CPUID>
+	<category>Swizzle</category>
+	<parameter varname='a' type='__m128'/>
+	<parameter varname='b' type='__m128i'/>
+	<description>Shuffle single-precision (32-bit) floating-point elements in "a" using the control in "b", and store the results in "dst".</description>
+	<operation>
+SELECT4(src, control){
+	CASE(control[1:0])
+	0:	tmp[31:0] := src[31:0]
+	1:	tmp[31:0] := src[63:32]
+	2:	tmp[31:0] := src[95:64]
+	3:	tmp[31:0] := src[127:96]
+	ESAC
+	RETURN tmp[31:0]
+}
+
+dst[31:0] := SELECT4(a[127:0], b[1:0])
+dst[63:32] := SELECT4(a[127:0], b[33:32])
+dst[95:64] := SELECT4(a[127:0], b[65:64])
+dst[127:96] := SELECT4(a[127:0], b[97:96])
+dst[MAX:128] := 0
+	</operation>
+	<instruction name='vpermilps' form='xmm, xmm, xmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='AVX' rettype='__m256' name='_mm256_permute_ps'>
+	<type>Floating Point</type>
+	<CPUID>AVX</CPUID>
+	<category>Swizzle</category>
+	<parameter varname='a' type='__m256'/>
+	<parameter varname="imm8" type='int'/>
+	<description>Shuffle single-precision (32-bit) floating-point elements in "a" within 128-bit lanes using the control in "imm8", and store the results in "dst".</description>
+	<operation>
+SELECT4(src, control){
+	CASE(control[1:0])
+	0:	tmp[31:0] := src[31:0]
+	1:	tmp[31:0] := src[63:32]
+	2:	tmp[31:0] := src[95:64]
+	3:	tmp[31:0] := src[127:96]
+	ESAC
+	RETURN tmp[31:0]
+}
+
+dst[31:0] := SELECT4(a[127:0], imm8[1:0])
+dst[63:32] := SELECT4(a[127:0], imm8[3:2])
+dst[95:64] := SELECT4(a[127:0], imm8[5:4])
+dst[127:96] := SELECT4(a[127:0], imm8[7:6])
+dst[159:128] := SELECT4(a[255:128], imm8[1:0])
+dst[191:160] := SELECT4(a[255:128], imm8[3:2])
+dst[223:192] := SELECT4(a[255:128], imm8[5:4])
+dst[255:224] := SELECT4(a[255:128], imm8[7:6])
+dst[MAX:256] := 0
+	</operation>
+	<instruction name='vpermilps' form='ymm, ymm, imm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='AVX' rettype='__m128' name='_mm_permute_ps'>
+	<type>Floating Point</type>
+	<CPUID>AVX</CPUID>
+	<category>Swizzle</category>
+	<parameter varname='a' type='__m128'/>
+	<parameter varname="imm8" type='int'/>
+	<description>Shuffle single-precision (32-bit) floating-point elements in "a" using the control in "imm8", and store the results in "dst".</description>
+	<operation>
+SELECT4(src, control){
+	CASE(control[1:0])
+	0:	tmp[31:0] := src[31:0]
+	1:	tmp[31:0] := src[63:32]
+	2:	tmp[31:0] := src[95:64]
+	3:	tmp[31:0] := src[127:96]
+	ESAC
+	RETURN tmp[31:0]
+}
+
+dst[31:0] := SELECT4(a[127:0], imm8[1:0])
+dst[63:32] := SELECT4(a[127:0], imm8[3:2])
+dst[95:64] := SELECT4(a[127:0], imm8[5:4])
+dst[127:96] := SELECT4(a[127:0], imm8[7:6])
+dst[MAX:128] := 0
+	</operation>
+	<instruction name='vpermilps' form='xmm, xmm, imm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='AVX' rettype='__m256d' name='_mm256_permutevar_pd'>
+	<type>Floating Point</type>
+	<CPUID>AVX</CPUID>
+	<category>Swizzle</category>
+	<parameter varname='a' type='__m256d'/>
+	<parameter varname='b' type='__m256i'/>
+	<description>Shuffle double-precision (64-bit) floating-point elements in "a" within 128-bit lanes using the control in "b", and store the results in "dst".</description>
+	<operation>
+IF (b[1] == 0) dst[63:0] := a[63:0]
+IF (b[1] == 1) dst[63:0] := a[127:64]
+IF (b[65] == 0) dst[127:64] := a[63:0]
+IF (b[65] == 1) dst[127:64] := a[127:64]
+IF (b[129] == 0) dst[191:128] := a[191:128]
+IF (b[129] == 1) dst[191:128] := a[255:192]
+IF (b[193] == 0) dst[255:192] := a[191:128]
+IF (b[193] == 1) dst[255:192] := a[255:192]
+dst[MAX:256] := 0
+	</operation>
+	<instruction name='vpermilpd' form='ymm, ymm, ymm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='AVX' rettype='__m128d' name='_mm_permutevar_pd'>
+	<type>Floating Point</type>
+	<CPUID>AVX</CPUID>
+	<category>Swizzle</category>
+	<parameter varname='a' type='__m128d'/>
+	<parameter varname='b' type='__m128i'/>
+	<description>Shuffle double-precision (64-bit) floating-point elements in "a" using the control in "b", and store the results in "dst".</description>
+	<operation>
+IF (b[1] == 0) dst[63:0] := a[63:0]
+IF (b[1] == 1) dst[63:0] := a[127:64]
+IF (b[65] == 0) dst[127:64] := a[63:0]
+IF (b[65] == 1) dst[127:64] := a[127:64]
+dst[MAX:128] := 0
+	</operation>
+	<instruction name='vpermilpd' form='xmm, xmm, xmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='AVX' rettype='__m256d' name='_mm256_permute_pd'>
+	<type>Floating Point</type>
+	<CPUID>AVX</CPUID>
+	<category>Swizzle</category>
+	<parameter varname='a' type='__m256d'/>
+	<parameter varname="imm8" type='int'/>
+	<description>Shuffle double-precision (64-bit) floating-point elements in "a" within 128-bit lanes using the control in "imm8", and store the results in "dst".</description>
+	<operation>
+IF (imm8[0] == 0) dst[63:0] := a[63:0]
+IF (imm8[0] == 1) dst[63:0] := a[127:64]
+IF (imm8[1] == 0) dst[127:64] := a[63:0]
+IF (imm8[1] == 1) dst[127:64] := a[127:64]
+IF (imm8[2] == 0) dst[191:128] := a[191:128]
+IF (imm8[2] == 1) dst[191:128] := a[255:192]
+IF (imm8[3] == 0) dst[255:192] := a[191:128]
+IF (imm8[3] == 1) dst[255:192] := a[255:192]
+dst[MAX:256] := 0
+	</operation>
+	<instruction name='vpermilpd' form='ymm, ymm, imm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='AVX' rettype='__m128d' name='_mm_permute_pd'>
+	<type>Floating Point</type>
+	<CPUID>AVX</CPUID>
+	<category>Swizzle</category>
+	<parameter varname='a' type='__m128d'/>
+	<parameter varname="imm8" type='int'/>
+	<description>Shuffle double-precision (64-bit) floating-point elements in "a" using the control in "imm8", and store the results in "dst".</description>
+	<operation>
+IF (imm8[0] == 0) dst[63:0] := a[63:0]
+IF (imm8[0] == 1) dst[63:0] := a[127:64]
+IF (imm8[1] == 0) dst[127:64] := a[63:0]
+IF (imm8[1] == 1) dst[127:64] := a[127:64]
+dst[MAX:128] := 0
+	</operation>
+	<instruction name='vpermilpd' form='xmm, xmm, imm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='AVX' rettype='__m256' name='_mm256_permute2f128_ps'>
+	<type>Floating Point</type>
+	<CPUID>AVX</CPUID>
+	<category>Swizzle</category>
+	<parameter varname='a' type='__m256'/>
+	<parameter varname='b' type='__m256'/>
+	<parameter varname="imm8" type='int'/>
+	<description>Shuffle 128-bits (composed of 4 packed single-precision (32-bit) floating-point elements) selected by "imm8" from "a" and "b", and store the results in "dst". </description>
+	<operation>
+SELECT4(src1, src2, control){
+	CASE(control[1:0])
+	0:	tmp[127:0] := src1[127:0]
+	1:	tmp[127:0] := src1[255:128]
+	2:	tmp[127:0] := src2[127:0]
+	3:	tmp[127:0] := src2[255:128]
+	ESAC
+	IF control[3]
+		tmp[127:0] := 0
+	FI
+	RETURN tmp[127:0]
+}
+
+dst[127:0] := SELECT4(a[255:0], b[255:0], imm8[3:0])
+dst[255:128] := SELECT4(a[255:0], b[255:0], imm8[7:4])
+dst[MAX:256] := 0
+	</operation>
+	<instruction name='vperm2f128' form='ymm, ymm, ymm, imm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='AVX' rettype='__m256d' name='_mm256_permute2f128_pd'>
+	<type>Floating Point</type>
+	<CPUID>AVX</CPUID>
+	<category>Swizzle</category>
+	<parameter varname='a' type='__m256d'/>
+	<parameter varname='b' type='__m256d'/>
+	<parameter varname="imm8" type='int'/>
+	<description>Shuffle 128-bits (composed of 2 packed double-precision (64-bit) floating-point elements) selected by "imm8" from "a" and "b", and store the results in "dst". </description>
+	<operation>
+SELECT4(src1, src2, control){
+	CASE(control[1:0])
+	0:	tmp[127:0] := src1[127:0]
+	1:	tmp[127:0] := src1[255:128]
+	2:	tmp[127:0] := src2[127:0]
+	3:	tmp[127:0] := src2[255:128]
+	ESAC
+	IF control[3]
+		tmp[127:0] := 0
+	FI
+	RETURN tmp[127:0]
+}
+
+dst[127:0] := SELECT4(a[255:0], b[255:0], imm8[3:0])
+dst[255:128] := SELECT4(a[255:0], b[255:0], imm8[7:4])
+dst[MAX:256] := 0
+	</operation>
+	<instruction name='vperm2f128' form='ymm, ymm, ymm, imm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='AVX' rettype='__m256i' name='_mm256_permute2f128_si256'>
+	<type>Integer</type>
+	<CPUID>AVX</CPUID>
+	<category>Swizzle</category>
+	<parameter varname='a' type='__m256i'/>
+	<parameter varname='b' type='__m256i'/>
+	<parameter varname="imm8" type='int'/>
+	<description>Shuffle 128-bits (composed of integer data) selected by "imm8" from "a" and "b", and store the results in "dst". </description>
+	<operation>
+SELECT4(src1, src2, control){
+	CASE(control[1:0])
+	0:	tmp[127:0] := src1[127:0]
+	1:	tmp[127:0] := src1[255:128]
+	2:	tmp[127:0] := src2[127:0]
+	3:	tmp[127:0] := src2[255:128]
+	ESAC
+	IF control[3]
+		tmp[127:0] := 0
+	FI
+	RETURN tmp[127:0]
+}
+
+dst[127:0] := SELECT4(a[255:0], b[255:0], imm8[3:0])
+dst[255:128] := SELECT4(a[255:0], b[255:0], imm8[7:4])
+dst[MAX:256] := 0
+	</operation>
+	<instruction name='vperm2f128' form='ymm, ymm, ymm, imm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='AVX' rettype='__m256' name='_mm256_broadcast_ss'>
+	<type>Floating Point</type>
+	<CPUID>AVX</CPUID>
+	<category>Load</category>
+	<parameter varname='mem_addr' type='float const *'/>
+	<description>Broadcast a single-precision (32-bit) floating-point element from memory to all elements of "dst".</description>
+	<operation>
+tmp[31:0] := MEM[mem_addr+31:mem_addr]
+FOR j := 0 to 7
+	i := j*32
+	dst[i+31:i] := tmp[31:0]
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name='vbroadcastss' form='ymm, m32'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='AVX' rettype='__m128' name='_mm_broadcast_ss'>
+	<type>Floating Point</type>
+	<CPUID>AVX</CPUID>
+	<category>Load</category>
+	<category>Swizzle</category>
+	<parameter varname='mem_addr' type='float const *'/>
+	<description>Broadcast a single-precision (32-bit) floating-point element from memory to all elements of "dst".</description>
+	<operation>
+tmp[31:0] := MEM[mem_addr+31:mem_addr]
+FOR j := 0 to 3
+	i := j*32
+	dst[i+31:i] := tmp[31:0]
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name='vbroadcastss' form='xmm, m32'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='AVX' rettype='__m256d' name='_mm256_broadcast_sd'>
+	<type>Floating Point</type>
+	<CPUID>AVX</CPUID>
+	<category>Load</category>
+	<category>Swizzle</category>
+	<parameter varname='mem_addr' type='double const *'/>
+	<description>Broadcast a double-precision (64-bit) floating-point element from memory to all elements of "dst".</description>
+	<operation>
+tmp[63:0] := MEM[mem_addr+63:mem_addr]
+FOR j := 0 to 3
+	i := j*64
+	dst[i+63:i] := tmp[63:0]
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name='vbroadcastsd' form='ymm, m64'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='AVX' rettype='__m256' name='_mm256_broadcast_ps'>
+	<type>Floating Point</type>
+	<CPUID>AVX</CPUID>
+	<category>Load</category>
+	<category>Swizzle</category>
+	<parameter varname='mem_addr' type='__m128 const *'/>
+	<description>Broadcast 128 bits from memory (composed of 4 packed single-precision (32-bit) floating-point elements) to all elements of "dst".</description>
+	<operation>
+tmp[127:0] := MEM[mem_addr+127:mem_addr]
+dst[127:0] := tmp[127:0]
+dst[255:128] := tmp[127:0]
+dst[MAX:256] := 0
+	</operation>
+	<instruction name='vbroadcastf128' form='ymm, m128'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='AVX' rettype='__m256d' name='_mm256_broadcast_pd'>
+	<type>Floating Point</type>
+	<CPUID>AVX</CPUID>
+	<category>Load</category>
+	<category>Swizzle</category>
+	<parameter varname='mem_addr' type='__m128d const *'/>
+	<description>Broadcast 128 bits from memory (composed of 2 packed double-precision (64-bit) floating-point elements) to all elements of "dst".</description>
+	<operation>
+tmp[127:0] := MEM[mem_addr+127:mem_addr]
+dst[127:0] := tmp[127:0]
+dst[255:128] := tmp[127:0]
+dst[MAX:256] := 0
+	</operation>
+	<instruction name='vbroadcastf128' form='ymm, m128'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='AVX' rettype='__m256' name='_mm256_insertf128_ps'>
+	<type>Floating Point</type>
+	<CPUID>AVX</CPUID>
+	<category>Swizzle</category>
+	<parameter varname='a' type='__m256'/>
+	<parameter varname='b' type='__m128'/>
+	<parameter varname="imm8" type='int'/>
+	<description>Copy "a" to "dst", then insert 128 bits (composed of 4 packed single-precision (32-bit) floating-point elements) from "b" into "dst" at the location specified by "imm8".</description>
+	<operation>
+dst[255:0] := a[255:0]
+CASE (imm8[0]) of
+0: dst[127:0] := b[127:0]
+1: dst[255:128] := b[127:0]
+ESAC
+dst[MAX:256] := 0
+	</operation>
+	<instruction name='vinsertf128' form='ymm, ymm, xmm, imm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='AVX' rettype='__m256d' name='_mm256_insertf128_pd'>
+	<type>Floating Point</type>
+	<CPUID>AVX</CPUID>
+	<category>Swizzle</category>
+	<parameter varname='a' type='__m256d'/>
+	<parameter varname='b' type='__m128d'/>
+	<parameter varname="imm8" type='int'/>
+	<description>Copy "a" to "dst", then insert 128 bits (composed of 2 packed double-precision (64-bit) floating-point elements) from "b" into "dst" at the location specified by "imm8".</description>
+	<operation>
+dst[255:0] := a[255:0]
+CASE (imm8[0]) of
+0: dst[127:0] := b[127:0]
+1: dst[255:128] := b[127:0]
+ESAC
+dst[MAX:256] := 0
+	</operation>
+	<instruction name='vinsertf128' form='ymm, ymm, xmm, imm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='AVX' rettype='__m256i' name='_mm256_insertf128_si256'>
+	<type>Integer</type>
+	<CPUID>AVX</CPUID>
+	<category>Swizzle</category>
+	<parameter varname='a' type='__m256i'/>
+	<parameter varname='b' type='__m128i'/>
+	<parameter varname="imm8" type='int'/>
+	<description>Copy "a" to "dst", then insert 128 bits from "b" into "dst" at the location specified by "imm8".</description>
+	<operation>
+dst[255:0] := a[255:0]
+CASE (imm8[0]) of
+0: dst[127:0] := b[127:0]
+1: dst[255:128] := b[127:0]
+ESAC
+dst[MAX:256] := 0
+	</operation>
+	<instruction name='vinsertf128' form='ymm, ymm, xmm, imm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='AVX' sequence='true' rettype='__m256i' name='_mm256_insert_epi8'>
+	<type>Integer</type>
+	<CPUID>AVX</CPUID>
+	<category>Swizzle</category>
+	<parameter varname='a' type='__m256i'/>
+	<parameter varname='i' type='__int8'/>
+	<parameter varname="index" type='const int'/>
+	<description>Copy "a" to "dst", and insert the 8-bit integer "i" into "dst" at the location specified by "index". </description>
+	<operation>
+dst[255:0] := a[255:0]
+sel := index*8
+dst[sel+7:sel] := i[7:0]
+	</operation>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='AVX' sequence='true' rettype='__m256i' name='_mm256_insert_epi16'>
+	<type>Integer</type>
+	<CPUID>AVX</CPUID>
+	<category>Swizzle</category>
+	<parameter varname='a' type='__m256i'/>
+	<parameter varname='i' type='__int16'/>
+	<parameter varname="index" type='const int'/>
+	<description>Copy "a" to "dst", and insert the 16-bit integer "i" into "dst" at the location specified by "index". </description>
+	<operation>
+dst[255:0] := a[255:0]
+sel := index*16
+dst[sel+15:sel] := i[15:0]
+	</operation>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='AVX' sequence='true' rettype='__m256i' name='_mm256_insert_epi32'>
+	<type>Integer</type>
+	<CPUID>AVX</CPUID>
+	<category>Swizzle</category>
+	<parameter varname='a' type='__m256i'/>
+	<parameter varname='i' type='__int32'/>
+	<parameter varname="index" type='const int'/>
+	<description>Copy "a" to "dst", and insert the 32-bit integer "i" into "dst" at the location specified by "index". </description>
+	<operation>
+dst[255:0] := a[255:0]
+sel := index*32
+dst[sel+31:sel] := i[31:0]
+	</operation>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='AVX' sequence='true' rettype='__m256i' name='_mm256_insert_epi64'>
+	<type>Integer</type>
+	<CPUID>AVX</CPUID>
+	<category>Swizzle</category>
+	<parameter varname='a' type='__m256i'/>
+	<parameter varname='i' type='__int64'/>
+	<parameter varname="index" type='const int'/>
+	<description>Copy "a" to "dst", and insert the 64-bit integer "i" into "dst" at the location specified by "index". </description>
+	<operation>
+dst[255:0] := a[255:0]
+sel := index*64
+dst[sel+63:sel] := i[63:0]
+	</operation>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='AVX' rettype='__m256d' name='_mm256_load_pd'>
+	<type>Floating Point</type>
+	<CPUID>AVX</CPUID>
+	<category>Load</category>
+	<parameter varname='mem_addr' type='double const *'/>
+	<description>Load 256-bits (composed of 4 packed double-precision (64-bit) floating-point elements) from memory into "dst".
+	"mem_addr" must be aligned on a 32-byte boundary or a general-protection exception may be generated.</description>
+	<operation>
+dst[255:0] := MEM[mem_addr+255:mem_addr]
+dst[MAX:256] := 0
+	</operation>
+	<instruction name='vmovapd' form='ymm, m256'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='AVX' rettype='void' name='_mm256_store_pd'>
+	<type>Floating Point</type>
+	<CPUID>AVX</CPUID>
+	<category>Store</category>
+	<parameter varname='mem_addr' type='double *'/>
+	<parameter varname='a' type='__m256d'/>
+	<description>Store 256-bits (composed of 4 packed double-precision (64-bit) floating-point elements) from "a" into memory.
+	"mem_addr" must be aligned on a 32-byte boundary or a general-protection exception may be generated.</description>
+	<operation>
+MEM[mem_addr+255:mem_addr] := a[255:0]
+	</operation>
+	<instruction name='vmovapd' form='m256, ymm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='AVX' rettype='__m256' name='_mm256_load_ps'>
+	<type>Floating Point</type>
+	<CPUID>AVX</CPUID>
+	<category>Load</category>
+	<parameter varname='mem_addr' type='float const *'/>
+	<description>Load 256-bits (composed of 8 packed single-precision (32-bit) floating-point elements) from memory into "dst".
+	"mem_addr" must be aligned on a 32-byte boundary or a general-protection exception may be generated.</description>
+	<operation>
+dst[255:0] := MEM[mem_addr+255:mem_addr]
+dst[MAX:256] := 0
+	</operation>
+	<instruction name='vmovaps' form='ymm, m256'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='AVX' rettype='void' name='_mm256_store_ps'>
+	<type>Floating Point</type>
+	<CPUID>AVX</CPUID>
+	<category>Store</category>
+	<parameter varname='mem_addr' type='float *'/>
+	<parameter varname='a' type='__m256'/>
+	<description>Store 256-bits (composed of 8 packed single-precision (32-bit) floating-point elements) from "a" into memory.
+	"mem_addr" must be aligned on a 32-byte boundary or a general-protection exception may be generated.</description>
+	<operation>
+MEM[mem_addr+255:mem_addr] := a[255:0]
+	</operation>
+	<instruction name='vmovaps' form='m256, ymm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='AVX' rettype='__m256d' name='_mm256_loadu_pd'>
+	<type>Floating Point</type>
+	<CPUID>AVX</CPUID>
+	<category>Load</category>
+	<parameter varname='mem_addr' type='double const *'/>
+	<description>Load 256-bits (composed of 4 packed double-precision (64-bit) floating-point elements) from memory into "dst".
+	"mem_addr" does not need to be aligned on any particular boundary.</description>
+	<operation>
+dst[255:0] := MEM[mem_addr+255:mem_addr]
+dst[MAX:256] := 0
+	</operation>
+	<instruction name='vmovupd' form='ymm, m256'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='AVX' rettype='void' name='_mm256_storeu_pd'>
+	<type>Floating Point</type>
+	<CPUID>AVX</CPUID>
+	<category>Store</category>
+	<parameter varname='mem_addr' type='double *'/>
+	<parameter varname='a' type='__m256d'/>
+	<description>Store 256-bits (composed of 4 packed double-precision (64-bit) floating-point elements) from "a" into memory.
+	"mem_addr" does not need to be aligned on any particular boundary.</description>
+	<operation>
+MEM[mem_addr+255:mem_addr] := a[255:0]
+	</operation>
+	<instruction name='vmovupd' form='m256, ymm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='AVX' rettype='__m256' name='_mm256_loadu_ps'>
+	<type>Floating Point</type>
+	<CPUID>AVX</CPUID>
+	<category>Load</category>
+	<parameter varname='mem_addr' type='float const *'/>
+	<description>Load 256-bits (composed of 8 packed single-precision (32-bit) floating-point elements) from memory into "dst".
+	"mem_addr" does not need to be aligned on any particular boundary.</description>
+	<operation>
+dst[255:0] := MEM[mem_addr+255:mem_addr]
+dst[MAX:256] := 0
+	</operation>
+	<instruction name='vmovups' form='ymm, m256'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='AVX' rettype='void' name='_mm256_storeu_ps'>
+	<type>Floating Point</type>
+	<CPUID>AVX</CPUID>
+	<category>Store</category>
+	<parameter varname='mem_addr' type='float *'/>
+	<parameter varname='a' type='__m256'/>
+	<description>Store 256-bits (composed of 8 packed single-precision (32-bit) floating-point elements) from "a" into memory.
+	"mem_addr" does not need to be aligned on any particular boundary.</description>
+	<operation>
+MEM[mem_addr+255:mem_addr] := a[255:0]
+	</operation>
+	<instruction name='vmovups' form='m256, ymm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='AVX' rettype='__m256i' name='_mm256_load_si256'>
+	<type>Integer</type>
+	<CPUID>AVX</CPUID>
+	<category>Load</category>
+	<parameter varname='mem_addr' type='__m256i const *'/>
+	<description>Load 256-bits of integer data from memory into "dst".
+	"mem_addr" must be aligned on a 32-byte boundary or a general-protection exception may be generated.</description>
+	<operation>
+dst[255:0] := MEM[mem_addr+255:mem_addr]
+dst[MAX:256] := 0
+	</operation>
+	<instruction name='vmovdqa' form='ymm, m256'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='AVX' rettype='void' name='_mm256_store_si256'>
+	<type>Integer</type>
+	<CPUID>AVX</CPUID>
+	<category>Store</category>
+	<parameter varname='mem_addr' type='__m256i *'/>
+	<parameter varname='a' type='__m256i'/>
+	<description>Store 256-bits of integer data from "a" into memory.
+	"mem_addr" must be aligned on a 32-byte boundary or a general-protection exception may be generated.</description>
+	<operation>
+MEM[mem_addr+255:mem_addr] := a[255:0]
+	</operation>
+	<instruction name='vmovdqa' form='m256, ymm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='AVX' rettype='__m256i' name='_mm256_loadu_si256'>
+	<type>Integer</type>
+	<CPUID>AVX</CPUID>
+	<category>Load</category>
+	<parameter varname='mem_addr' type='__m256i const *'/>
+	<description>Load 256-bits of integer data from memory into "dst".
+	"mem_addr" does not need to be aligned on any particular boundary.
+	</description>
+	<operation>
+dst[255:0] := MEM[mem_addr+255:mem_addr]
+dst[MAX:256] := 0
+	</operation>
+	<instruction name='vmovdqu' form='ymm, m256'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='AVX' rettype='void' name='_mm256_storeu_si256'>
+	<type>Integer</type>
+	<CPUID>AVX</CPUID>
+	<category>Store</category>
+	<parameter varname='mem_addr' type='__m256i *'/>
+	<parameter varname='a' type='__m256i'/>
+	<description>Store 256-bits of integer data from "a" into memory.
+	"mem_addr" does not need to be aligned on any particular boundary.
+	</description>
+	<operation>
+MEM[mem_addr+255:mem_addr] := a[255:0]
+	</operation>
+	<instruction name='vmovdqu' form='m256, ymm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='AVX' rettype='__m256d' name='_mm256_maskload_pd'>
+	<type>Floating Point</type>
+	<CPUID>AVX</CPUID>
+	<category>Load</category>
+	<parameter varname='mem_addr' type='double const *'/>
+	<parameter varname='mask' type='__m256i'/>
+	<description>Load packed double-precision (64-bit) floating-point elements from memory into "dst" using "mask" (elements are zeroed out when the high bit of the corresponding element is not set).</description>
+	<operation>
+FOR j := 0 to 3
+	i := j*64
+	IF mask[i+63]
+		dst[i+63:i] := MEM[mem_addr+i+63:mem_addr+i]
+	ELSE
+		dst[i+63:i] := 0
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name='vmaskmovpd' form='ymm, ymm, m256'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='AVX' rettype='void' name='_mm256_maskstore_pd'>
+	<type>Floating Point</type>
+	<CPUID>AVX</CPUID>
+	<category>Store</category>
+	<parameter varname='mem_addr' type='double *'/>
+	<parameter varname='mask' type='__m256i'/>
+	<parameter varname='a' type='__m256d'/>
+	<description>Store packed double-precision (64-bit) floating-point elements from "a" into memory using "mask".</description>
+	<operation>
+FOR j := 0 to 3
+	i := j*64
+	IF mask[i+63]
+		MEM[mem_addr+i+63:mem_addr+i] := a[i+63:i]
+	FI
+ENDFOR
+	</operation>
+	<instruction name='vmaskmovpd' form='m256, ymm, ymm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='AVX' rettype='__m128d' name='_mm_maskload_pd'>
+	<type>Floating Point</type>
+	<CPUID>AVX</CPUID>
+	<category>Load</category>
+	<parameter varname='mem_addr' type='double const *'/>
+	<parameter varname='mask' type='__m128i'/>
+	<description>Load packed double-precision (64-bit) floating-point elements from memory into "dst" using "mask" (elements are zeroed out when the high bit of the corresponding element is not set).</description>
+	<operation>
+FOR j := 0 to 1
+	i := j*64
+	IF mask[i+63]
+		dst[i+63:i] := MEM[mem_addr+i+63:mem_addr+i]
+	ELSE
+		dst[i+63:i] := 0
+	FI
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name='vmaskmovpd' form='xmm, xmm, m128'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='AVX' rettype='void' name='_mm_maskstore_pd'>
+	<type>Floating Point</type>
+	<CPUID>AVX</CPUID>
+	<category>Store</category>
+	<parameter varname='mem_addr' type='double *'/>
+	<parameter varname='mask' type='__m128i'/>
+	<parameter varname='a' type='__m128d'/>
+	<description>Store packed double-precision (64-bit) floating-point elements from "a" into memory using "mask".</description>
+	<operation>
+FOR j := 0 to 1
+	i := j*64
+	IF mask[i+63]
+		MEM[mem_addr+i+63:mem_addr+i] := a[i+63:i]
+	FI
+ENDFOR
+	</operation>
+	<instruction name='vmaskmovpd' form='m128, xmm, xmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='AVX' rettype='__m256' name='_mm256_maskload_ps'>
+	<type>Floating Point</type>
+	<CPUID>AVX</CPUID>
+	<category>Load</category>
+	<parameter varname='mem_addr' type='float const *'/>
+	<parameter varname='mask' type='__m256i'/>
+	<description>Load packed single-precision (32-bit) floating-point elements from memory into "dst" using "mask" (elements are zeroed out when the high bit of the corresponding element is not set).</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*32
+	IF mask[i+31]
+		dst[i+31:i] := MEM[mem_addr+i+31:mem_addr+i]
+	ELSE
+		dst[i+31:i] := 0
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name='vmaskmovps' form='ymm, ymm, m256'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='AVX' rettype='void' name='_mm256_maskstore_ps'>
+	<type>Floating Point</type>
+	<CPUID>AVX</CPUID>
+	<category>Store</category>
+	<parameter varname='mem_addr' type='float *'/>
+	<parameter varname='mask' type='__m256i'/>
+	<parameter varname='a' type='__m256'/>
+	<description>Store packed single-precision (32-bit) floating-point elements from "a" into memory using "mask".</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*32
+	IF mask[i+31]
+		MEM[mem_addr+i+31:mem_addr+i] := a[i+31:i]
+	FI
+ENDFOR
+	</operation>
+	<instruction name='vmaskmovps' form='m256, ymm, ymm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='AVX' rettype='__m128' name='_mm_maskload_ps'>
+	<type>Floating Point</type>
+	<CPUID>AVX</CPUID>
+	<category>Load</category>
+	<parameter varname='mem_addr' type='float const *'/>
+	<parameter varname='mask' type='__m128i'/>
+	<description>Load packed single-precision (32-bit) floating-point elements from memory into "dst" using "mask" (elements are zeroed out when the high bit of the corresponding element is not set).</description>
+	<operation>
+FOR j := 0 to 3
+	i := j*32
+	IF mask[i+31]
+		dst[i+31:i] := MEM[mem_addr+i+31:mem_addr+i]
+	ELSE
+		dst[i+31:i] := 0
+	FI
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name='vmaskmovps' form='xmm, xmm, m128'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='AVX' rettype='void' name='_mm_maskstore_ps'>
+	<type>Floating Point</type>
+	<CPUID>AVX</CPUID>
+	<category>Store</category>
+	<parameter varname='mem_addr' type='float *'/>
+	<parameter varname='mask' type='__m128i'/>
+	<parameter varname='a' type='__m128'/>
+	<description>Store packed single-precision (32-bit) floating-point elements from "a" into memory using "mask".</description>
+	<operation>
+FOR j := 0 to 3
+	i := j*32
+	IF mask[i+31]
+		MEM[mem_addr+i+31:mem_addr+i] := a[i+31:i]
+	FI
+ENDFOR
+	</operation>
+	<instruction name='vmaskmovps' form='m128, xmm, xmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='AVX' rettype='__m256' name='_mm256_movehdup_ps'>
+	<type>Floating Point</type>
+	<CPUID>AVX</CPUID>
+	<category>Move</category>
+	<parameter varname='a' type='__m256'/>
+	<description>Duplicate odd-indexed single-precision (32-bit) floating-point elements from "a", and store the results in "dst".
+	</description>
+	<operation>
+dst[31:0] := a[63:32] 
+dst[63:32] := a[63:32] 
+dst[95:64] := a[127:96] 
+dst[127:96] := a[127:96]
+dst[159:128] := a[191:160] 
+dst[191:160] := a[191:160] 
+dst[223:192] := a[255:224] 
+dst[255:224] := a[255:224]
+dst[MAX:256] := 0
+	</operation>
+	<instruction name='vmovshdup' form='ymm, ymm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='AVX' rettype='__m256' name='_mm256_moveldup_ps'>
+	<type>Floating Point</type>
+	<CPUID>AVX</CPUID>
+	<category>Move</category>
+	<parameter varname='a' type='__m256'/>
+	<description>Duplicate even-indexed single-precision (32-bit) floating-point elements from "a", and store the results in "dst".
+	</description>
+	<operation>
+dst[31:0] := a[31:0] 
+dst[63:32] := a[31:0] 
+dst[95:64] := a[95:64] 
+dst[127:96] := a[95:64]
+dst[159:128] := a[159:128] 
+dst[191:160] := a[159:128] 
+dst[223:192] := a[223:192] 
+dst[255:224] := a[223:192]
+dst[MAX:256] := 0
+	</operation>
+	<instruction name='vmovsldup' form='ymm, ymm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='AVX' rettype='__m256d' name='_mm256_movedup_pd'>
+	<type>Floating Point</type>
+	<CPUID>AVX</CPUID>
+	<category>Move</category>
+	<parameter varname='a' type='__m256d'/>
+	<description>Duplicate even-indexed double-precision (64-bit) floating-point elements from "a", and store the results in "dst".
+	</description>
+	<operation>
+dst[63:0] := a[63:0]
+dst[127:64] := a[63:0]
+dst[191:128] := a[191:128]
+dst[255:192] := a[191:128]
+dst[MAX:256] := 0
+	</operation>
+	<instruction name='vmovddup' form='ymm, ymm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='AVX' rettype='__m256i' name='_mm256_lddqu_si256'>
+	<type>Integer</type>
+	<CPUID>AVX</CPUID>
+	<category>Load</category>
+	<parameter varname='mem_addr' type='__m256i const *'/>
+	<description>Load 256-bits of integer data from unaligned memory into "dst". This intrinsic may perform better than "_mm256_loadu_si256" when the data crosses a cache line boundary.</description>
+	<operation>
+dst[255:0] := MEM[mem_addr+255:mem_addr]
+dst[MAX:256] := 0
+	</operation>
+	<instruction name='vlddqu' form='ymm, m256'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='AVX' rettype='void' name='_mm256_stream_si256'>
+	<type>Integer</type>
+	<CPUID>AVX</CPUID>
+	<category>Store</category>
+	<parameter varname='mem_addr' type='__m256i *'/>
+	<parameter varname='a' type='__m256i'/>
+	<description>Store 256-bits of integer data from "a" into memory using a non-temporal memory hint.
+	"mem_addr" must be aligned on a 32-byte boundary or a general-protection exception may be generated.</description>
+	<operation>
+MEM[mem_addr+255:mem_addr] := a[255:0]
+	</operation>
+	<instruction name='vmovntdq' form='m256, ymm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='AVX' rettype='void' name='_mm256_stream_pd'>
+	<type>Floating Point</type>
+	<CPUID>AVX</CPUID>
+	<category>Store</category>
+	<parameter varname='mem_addr' type='double *'/>
+	<parameter varname='a' type='__m256d'/>
+	<description>Store 256-bits (composed of 4 packed double-precision (64-bit) floating-point elements) from "a" into memory using a non-temporal memory hint.
+	"mem_addr" must be aligned on a 32-byte boundary or a general-protection exception may be generated.</description>
+	<operation>
+MEM[mem_addr+255:mem_addr] := a[255:0]
+	</operation>
+	<instruction name='vmovntpd' form='m256, ymm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='AVX' rettype='void' name='_mm256_stream_ps'>
+	<type>Floating Point</type>
+	<CPUID>AVX</CPUID>
+	<category>Store</category>
+	<parameter varname='mem_addr' type='float *'/>
+	<parameter varname='a' type='__m256'/>
+	<description>Store 256-bits (composed of 8 packed single-precision (32-bit) floating-point elements) from "a" into memory using a non-temporal memory hint.
+	"mem_addr" must be aligned on a 32-byte boundary or a general-protection exception may be generated.</description>
+	<operation>
+MEM[mem_addr+255:mem_addr] := a[255:0]
+	</operation>
+	<instruction name='vmovntps' form='m256, ymm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='AVX' rettype='__m256' name='_mm256_rcp_ps'>
+	<type>Floating Point</type>
+	<CPUID>AVX</CPUID>
+	<category>Elementary Math Functions</category>
+	<parameter varname='a' type='__m256'/>
+	<description>Compute the approximate reciprocal of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst". The maximum relative error for this approximation is less than 1.5*2^-12.</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*32
+	dst[i+31:i] := APPROXIMATE(1.0/a[i+31:i])
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name='vrcpps' form='ymm, ymm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='AVX' rettype='__m256' name='_mm256_rsqrt_ps'>
+	<type>Floating Point</type>
+	<CPUID>AVX</CPUID>
+	<category>Elementary Math Functions</category>
+	<parameter varname='a' type='__m256'/>
+	<description>Compute the approximate reciprocal square root of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst". The maximum relative error for this approximation is less than 1.5*2^-12.</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*32
+	dst[i+31:i] := APPROXIMATE(1.0 / SQRT(a[i+31:i]))
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name='vrsqrtps' form='ymm, ymm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='AVX' rettype='__m256d' name='_mm256_sqrt_pd'>
+	<type>Floating Point</type>
+	<CPUID>AVX</CPUID>
+	<category>Elementary Math Functions</category>
+	<parameter varname='a' type='__m256d'/>
+	<description>Compute the square root of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 3
+	i := j*64
+	dst[i+63:i] := SQRT(a[i+63:i])
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name='vsqrtpd' form='ymm, ymm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='AVX' rettype='__m256' name='_mm256_sqrt_ps'>
+	<type>Floating Point</type>
+	<CPUID>AVX</CPUID>
+	<category>Elementary Math Functions</category>
+	<parameter varname='a' type='__m256'/>
+	<description>Compute the square root of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*32
+	dst[i+31:i] := SQRT(a[i+31:i])
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name='vsqrtps' form='ymm, ymm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='AVX' rettype='__m256d' name='_mm256_round_pd'>
+	<type>Floating Point</type>
+	<CPUID>AVX</CPUID>
+	<category>Special Math Functions</category>
+	<parameter varname='a' type='__m256d'/>
+	<parameter varname='rounding' type='int'/>
+	<description>Round the packed double-precision (64-bit) floating-point elements in "a" using the "rounding" parameter, and store the results as packed double-precision floating-point elements in "dst".
+	[round_note]
+	</description>
+	<operation>
+FOR j := 0 to 3
+	i := j*64
+	dst[i+63:i] := ROUND(a[i+63:i], rounding)
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name='vroundpd' form='ymm, ymm, imm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='AVX' rettype='__m256' name='_mm256_round_ps'>
+	<type>Floating Point</type>
+	<CPUID>AVX</CPUID>
+	<category>Special Math Functions</category>
+	<parameter varname='a' type='__m256'/>
+	<parameter varname='rounding' type='int'/>
+	<description>Round the packed single-precision (32-bit) floating-point elements in "a" using the "rounding" parameter, and store the results as packed single-precision floating-point elements in "dst".
+	[round_note]
+	</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*32
+	dst[i+31:i] := ROUND(a[i+31:i], rounding)
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name='vroundps' form='ymm, ymm, imm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='AVX' rettype='__m256d' name='_mm256_unpackhi_pd'>
+	<type>Floating Point</type>
+	<CPUID>AVX</CPUID>
+	<category>Swizzle</category>
+	<parameter varname='a' type='__m256d'/>
+	<parameter varname='b' type='__m256d'/>
+	<description>Unpack and interleave double-precision (64-bit) floating-point elements from the high half of each 128-bit lane in "a" and "b", and store the results in "dst". </description>
+	<operation>
+INTERLEAVE_HIGH_QWORDS(src1[127:0], src2[127:0]){
+	dst[63:0] := src1[127:64] 
+	dst[127:64] := src2[127:64] 
+	RETURN dst[127:0]
+}	
+
+dst[127:0] := INTERLEAVE_HIGH_QWORDS(a[127:0], b[127:0])
+dst[255:128] := INTERLEAVE_HIGH_QWORDS(a[255:128], b[255:128])
+dst[MAX:256] := 0
+	</operation>
+	<instruction name='vunpckhpd' form='ymm, ymm, ymm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='AVX' rettype='__m256' name='_mm256_unpackhi_ps'>
+	<type>Floating Point</type>
+	<CPUID>AVX</CPUID>
+	<category>Swizzle</category>
+	<parameter varname='a' type='__m256'/>
+	<parameter varname='b' type='__m256'/>
+	<description>Unpack and interleave single-precision (32-bit) floating-point elements from the high half of each 128-bit lane in "a" and "b", and store the results in "dst". </description>
+	<operation>
+INTERLEAVE_HIGH_DWORDS(src1[127:0], src2[127:0]){
+	dst[31:0] := src1[95:64] 
+	dst[63:32] := src2[95:64] 
+	dst[95:64] := src1[127:96] 
+	dst[127:96] := src2[127:96] 
+	RETURN dst[127:0]
+}	
+
+dst[127:0] := INTERLEAVE_HIGH_DWORDS(a[127:0], b[127:0])
+dst[255:128] := INTERLEAVE_HIGH_DWORDS(a[255:128], b[255:128])
+dst[MAX:256] := 0
+	</operation>
+	<instruction name='vunpckhps' form='ymm, ymm, ymm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='AVX' rettype='__m256d' name='_mm256_unpacklo_pd'>
+	<type>Floating Point</type>
+	<CPUID>AVX</CPUID>
+	<category>Swizzle</category>
+	<parameter varname='a' type='__m256d'/>
+	<parameter varname='b' type='__m256d'/>
+	<description>Unpack and interleave double-precision (64-bit) floating-point elements from the low half of each 128-bit lane in "a" and "b", and store the results in "dst". </description>
+	<operation>
+INTERLEAVE_QWORDS(src1[127:0], src2[127:0]){
+	dst[63:0] := src1[63:0] 
+	dst[127:64] := src2[63:0] 
+	RETURN dst[127:0]
+}
+
+dst[127:0] := INTERLEAVE_QWORDS(a[127:0], b[127:0])
+dst[255:128] := INTERLEAVE_QWORDS(a[255:128], b[255:128])
+dst[MAX:256] := 0
+	</operation>
+	<instruction name='vunpcklpd' form='ymm, ymm, ymm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='AVX' rettype='__m256' name='_mm256_unpacklo_ps'>
+	<type>Floating Point</type>
+	<CPUID>AVX</CPUID>
+	<category>Swizzle</category>
+	<parameter varname='a' type='__m256'/>
+	<parameter varname='b' type='__m256'/>
+	<description>Unpack and interleave single-precision (32-bit) floating-point elements from the low half of each 128-bit lane in "a" and "b", and store the results in "dst". </description>
+	<operation>
+INTERLEAVE_DWORDS(src1[127:0], src2[127:0]){
+	dst[31:0] := src1[31:0] 
+	dst[63:32] := src2[31:0] 
+	dst[95:64] := src1[63:32] 
+	dst[127:96] := src2[63:32] 
+	RETURN dst[127:0]
+}	
+
+dst[127:0] := INTERLEAVE_DWORDS(a[127:0], b[127:0])
+dst[255:128] := INTERLEAVE_DWORDS(a[255:128], b[255:128])
+dst[MAX:256] := 0
+	</operation>
+	<instruction name='vunpcklps' form='ymm, ymm, ymm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='AVX' rettype='int' name='_mm256_testz_si256'>
+	<type>Integer</type>
+	<CPUID>AVX</CPUID>
+	<category>Logical</category>
+	<parameter varname='a' type='__m256i'/>
+	<parameter varname='b' type='__m256i'/>
+	<description>Compute the bitwise AND of 256 bits (representing integer data) in "a" and "b", and set "ZF" to 1 if the result is zero, otherwise set "ZF" to 0. Compute the bitwise NOT of "a" and then AND with "b", and set "CF" to 1 if the result is zero, otherwise set "CF" to 0. Return the "ZF" value.</description>
+	<operation>
+IF (a[255:0] AND b[255:0] == 0)
+	ZF := 1
+ELSE
+	ZF := 0
+FI
+IF ((NOT a[255:0]) AND b[255:0] == 0)
+	CF := 1
+ELSE
+	CF := 0
+FI
+RETURN ZF
+	</operation>
+	<instruction name='vptest' form='ymm, ymm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='AVX' rettype='int' name='_mm256_testc_si256'>
+	<type>Integer</type>
+	<CPUID>AVX</CPUID>
+	<category>Logical</category>
+	<parameter varname='a' type='__m256i'/>
+	<parameter varname='b' type='__m256i'/>
+	<description>Compute the bitwise AND of 256 bits (representing integer data) in "a" and "b", and set "ZF" to 1 if the result is zero, otherwise set "ZF" to 0. Compute the bitwise NOT of "a" and then AND with "b", and set "CF" to 1 if the result is zero, otherwise set "CF" to 0. Return the "CF" value.</description>
+	<operation>
+IF (a[255:0] AND b[255:0] == 0)
+	ZF := 1
+ELSE
+	ZF := 0
+FI
+IF ((NOT a[255:0]) AND b[255:0] == 0)
+	CF := 1
+ELSE
+	CF := 0
+FI
+RETURN CF
+	</operation>
+	<instruction name='vptest' form='ymm, ymm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='AVX' rettype='int' name='_mm256_testnzc_si256'>
+	<type>Integer</type>
+	<CPUID>AVX</CPUID>
+	<category>Logical</category>
+	<parameter varname='a' type='__m256i'/>
+	<parameter varname='b' type='__m256i'/>
+	<description>Compute the bitwise AND of 256 bits (representing integer data) in "a" and "b", and set "ZF" to 1 if the result is zero, otherwise set "ZF" to 0. Compute the bitwise NOT of "a" and then AND with "b", and set "CF" to 1 if the result is zero, otherwise set "CF" to 0. Return 1 if both the "ZF" and "CF" values are zero, otherwise return 0.</description>
+	<operation>
+IF (a[255:0] AND b[255:0] == 0)
+	ZF := 1
+ELSE
+	ZF := 0
+FI
+IF ((NOT a[255:0]) AND b[255:0] == 0)
+	CF := 1
+ELSE
+	CF := 0
+FI
+IF (ZF == 0 &amp;&amp; CF == 0)
+	RETURN 1
+ELSE
+	RETURN 0
+FI
+	</operation>
+	<instruction name='vptest' form='ymm, ymm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='AVX' rettype='int' name='_mm256_testz_pd'>
+	<type>Floating Point</type>
+	<CPUID>AVX</CPUID>
+	<category>Logical</category>
+	<parameter varname='a' type='__m256d'/>
+	<parameter varname='b' type='__m256d'/>
+	<description>Compute the bitwise AND of 256 bits (representing double-precision (64-bit) floating-point elements) in "a" and "b", producing an intermediate 256-bit value, and set "ZF" to 1 if the sign bit of each 64-bit element in the intermediate value is zero, otherwise set "ZF" to 0. Compute the bitwise NOT of "a" and then AND with "b", producing an intermediate value, and set "CF" to 1 if the sign bit of each 64-bit element in the intermediate value is zero, otherwise set "CF" to 0. Return the "ZF" value.</description>
+	<operation>
+tmp[255:0] := a[255:0] AND b[255:0]
+IF (tmp[63] == tmp[127] == tmp[191] == tmp[255] == 0)
+	ZF := 1
+ELSE
+	ZF := 0
+FI
+tmp[255:0] := (NOT a[255:0]) AND b[255:0]
+IF (tmp[63] == tmp[127] == tmp[191] == tmp[255] == 0)
+	CF := 1
+ELSE
+	CF := 0
+FI
+RETURN ZF
+	</operation>
+	<instruction name='vtestpd' form='ymm, ymm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='AVX' rettype='int' name='_mm256_testc_pd'>
+	<type>Floating Point</type>
+	<CPUID>AVX</CPUID>
+	<category>Logical</category>
+	<parameter varname='a' type='__m256d'/>
+	<parameter varname='b' type='__m256d'/>
+	<description>Compute the bitwise AND of 256 bits (representing double-precision (64-bit) floating-point elements) in "a" and "b", producing an intermediate 256-bit value, and set "ZF" to 1 if the sign bit of each 64-bit element in the intermediate value is zero, otherwise set "ZF" to 0. Compute the bitwise NOT of "a" and then AND with "b", producing an intermediate value, and set "CF" to 1 if the sign bit of each 64-bit element in the intermediate value is zero, otherwise set "CF" to 0. Return the "CF" value.</description>
+	<operation>
+tmp[255:0] := a[255:0] AND b[255:0]
+IF (tmp[63] == tmp[127] == tmp[191] == tmp[255] == 0)
+	ZF := 1
+ELSE
+	ZF := 0
+FI
+tmp[255:0] := (NOT a[255:0]) AND b[255:0]
+IF (tmp[63] == tmp[127] == tmp[191] == tmp[255] == 0)
+	CF := 1
+ELSE
+	CF := 0
+FI
+RETURN CF
+	</operation>
+	<instruction name='vtestpd' form='ymm, ymm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='AVX' rettype='int' name='_mm256_testnzc_pd'>
+	<type>Floating Point</type>
+	<CPUID>AVX</CPUID>
+	<category>Logical</category>
+	<parameter varname='a' type='__m256d'/>
+	<parameter varname='b' type='__m256d'/>
+	<description>Compute the bitwise AND of 256 bits (representing double-precision (64-bit) floating-point elements) in "a" and "b", producing an intermediate 256-bit value, and set "ZF" to 1 if the sign bit of each 64-bit element in the intermediate value is zero, otherwise set "ZF" to 0. Compute the bitwise NOT of "a" and then AND with "b", producing an intermediate value, and set "CF" to 1 if the sign bit of each 64-bit element in the intermediate value is zero, otherwise set "CF" to 0. Return 1 if both the "ZF" and "CF" values are zero, otherwise return 0.</description>
+	<operation>
+tmp[255:0] := a[255:0] AND b[255:0]
+IF (tmp[63] == tmp[127] == tmp[191] == tmp[255] == 0)
+	ZF := 1
+ELSE
+	ZF := 0
+FI
+tmp[255:0] := (NOT a[255:0]) AND b[255:0]
+IF (tmp[63] == tmp[127] == tmp[191] == tmp[255] == 0)
+	CF := 1
+ELSE
+	CF := 0
+FI
+IF (ZF == 0 &amp;&amp; CF == 0)
+	RETURN 1
+ELSE
+	RETURN 0
+FI
+	</operation>
+	<instruction name='vtestpd' form='ymm, ymm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='AVX' rettype='int' name='_mm_testz_pd'>
+	<type>Floating Point</type>
+	<CPUID>AVX</CPUID>
+	<category>Logical</category>
+	<parameter varname='a' type='__m128d'/>
+	<parameter varname='b' type='__m128d'/>
+	<description>Compute the bitwise AND of 128 bits (representing double-precision (64-bit) floating-point elements) in "a" and "b", producing an intermediate 128-bit value, and set "ZF" to 1 if the sign bit of each 64-bit element in the intermediate value is zero, otherwise set "ZF" to 0. Compute the bitwise NOT of "a" and then AND with "b", producing an intermediate value, and set "CF" to 1 if the sign bit of each 64-bit element in the intermediate value is zero, otherwise set "CF" to 0. Return the "ZF" value.</description>
+	<operation>
+tmp[127:0] := a[127:0] AND b[127:0]
+IF (tmp[63] == tmp[127] == 0)
+	ZF := 1
+ELSE
+	ZF := 0
+FI
+tmp[127:0] := (NOT a[127:0]) AND b[127:0]
+IF (tmp[63] == tmp[127] == 0)
+	CF := 1
+ELSE
+	CF := 0
+FI
+RETURN ZF
+	</operation>
+	<instruction name='vtestpd' form='xmm, xmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='AVX' rettype='int' name='_mm_testc_pd'>
+	<type>Floating Point</type>
+	<CPUID>AVX</CPUID>
+	<category>Logical</category>
+	<parameter varname='a' type='__m128d'/>
+	<parameter varname='b' type='__m128d'/>
+	<description>Compute the bitwise AND of 128 bits (representing double-precision (64-bit) floating-point elements) in "a" and "b", producing an intermediate 128-bit value, and set "ZF" to 1 if the sign bit of each 64-bit element in the intermediate value is zero, otherwise set "ZF" to 0. Compute the bitwise NOT of "a" and then AND with "b", producing an intermediate value, and set "CF" to 1 if the sign bit of each 64-bit element in the intermediate value is zero, otherwise set "CF" to 0. Return the "CF" value.</description>
+	<operation>
+tmp[127:0] := a[127:0] AND b[127:0]
+IF (tmp[63] == tmp[127] == 0)
+	ZF := 1
+ELSE
+	ZF := 0
+FI
+tmp[127:0] := (NOT a[127:0]) AND b[127:0]
+IF (tmp[63] == tmp[127] == 0)
+	CF := 1
+ELSE
+	CF := 0
+FI
+RETURN CF
+	</operation>
+	<instruction name='vtestpd' form='xmm, xmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='AVX' rettype='int' name='_mm_testnzc_pd'>
+	<type>Floating Point</type>
+	<CPUID>AVX</CPUID>
+	<category>Logical</category>
+	<parameter varname='a' type='__m128d'/>
+	<parameter varname='b' type='__m128d'/>
+	<description>Compute the bitwise AND of 128 bits (representing double-precision (64-bit) floating-point elements) in "a" and "b", producing an intermediate 128-bit value, and set "ZF" to 1 if the sign bit of each 64-bit element in the intermediate value is zero, otherwise set "ZF" to 0. Compute the bitwise NOT of "a" and then AND with "b", producing an intermediate value, and set "CF" to 1 if the sign bit of each 64-bit element in the intermediate value is zero, otherwise set "CF" to 0. Return 1 if both the "ZF" and "CF" values are zero, otherwise return 0.</description>
+	<operation>
+tmp[127:0] := a[127:0] AND b[127:0]
+IF (tmp[63] == tmp[127] == 0)
+	ZF := 1
+ELSE
+	ZF := 0
+FI
+tmp[127:0] := (NOT a[127:0]) AND b[127:0]
+IF (tmp[63] == tmp[127] == 0)
+	CF := 1
+ELSE
+	CF := 0
+FI
+IF (ZF == 0 &amp;&amp; CF == 0)
+	RETURN 1
+ELSE
+	RETURN 0
+FI
+	</operation>
+	<instruction name='vtestpd' form='xmm, xmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='AVX' rettype='int' name='_mm256_testz_ps'>
+	<type>Floating Point</type>
+	<CPUID>AVX</CPUID>
+	<category>Logical</category>
+	<parameter varname='a' type='__m256'/>
+	<parameter varname='b' type='__m256'/>
+	<description>Compute the bitwise AND of 256 bits (representing single-precision (32-bit) floating-point elements) in "a" and "b", producing an intermediate 256-bit value, and set "ZF" to 1 if the sign bit of each 32-bit element in the intermediate value is zero, otherwise set "ZF" to 0. Compute the bitwise NOT of "a" and then AND with "b", producing an intermediate value, and set "CF" to 1 if the sign bit of each 32-bit element in the intermediate value is zero, otherwise set "CF" to 0. Return the "ZF" value.</description>
+	<operation>
+tmp[255:0] := a[255:0] AND b[255:0]
+IF (tmp[31] == tmp[63] == tmp[95] == tmp[127] == tmp[159] == tmp[191] == tmp[223] == tmp[255] == 0)
+	ZF := 1
+ELSE
+	ZF := 0
+FI
+tmp[255:0] := (NOT a[255:0]) AND b[255:0]
+IF (tmp[31] == tmp[63] == tmp[95] == tmp[127] == tmp[159] == tmp[191] == tmp[223] == tmp[255] == 0)
+	CF := 1
+ELSE
+	CF := 0
+FI
+RETURN ZF
+	</operation>
+	<instruction name='vtestps' form='ymm, ymm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='AVX' rettype='int' name='_mm256_testc_ps'>
+	<type>Floating Point</type>
+	<CPUID>AVX</CPUID>
+	<category>Logical</category>
+	<parameter varname='a' type='__m256'/>
+	<parameter varname='b' type='__m256'/>
+	<description>Compute the bitwise AND of 256 bits (representing single-precision (32-bit) floating-point elements) in "a" and "b", producing an intermediate 256-bit value, and set "ZF" to 1 if the sign bit of each 32-bit element in the intermediate value is zero, otherwise set "ZF" to 0. Compute the bitwise NOT of "a" and then AND with "b", producing an intermediate value, and set "CF" to 1 if the sign bit of each 32-bit element in the intermediate value is zero, otherwise set "CF" to 0. Return the "CF" value.</description>
+	<operation>
+tmp[255:0] := a[255:0] AND b[255:0]
+IF (tmp[31] == tmp[63] == tmp[95] == tmp[127] == tmp[159] == tmp[191] == tmp[223] == tmp[255] == 0)
+	ZF := 1
+ELSE
+	ZF := 0
+FI
+tmp[255:0] := (NOT a[255:0]) AND b[255:0]
+IF (tmp[31] == tmp[63] == tmp[95] == tmp[127] == tmp[159] == tmp[191] == tmp[223] == tmp[255] == 0)
+	CF := 1
+ELSE
+	CF := 0
+FI
+RETURN CF
+	</operation>
+	<instruction name='vtestps' form='ymm, ymm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='AVX' rettype='int' name='_mm256_testnzc_ps'>
+	<type>Floating Point</type>
+	<CPUID>AVX</CPUID>
+	<category>Logical</category>
+	<parameter varname='a' type='__m256'/>
+	<parameter varname='b' type='__m256'/>
+	<description>Compute the bitwise AND of 256 bits (representing single-precision (32-bit) floating-point elements) in "a" and "b", producing an intermediate 256-bit value, and set "ZF" to 1 if the sign bit of each 32-bit element in the intermediate value is zero, otherwise set "ZF" to 0. Compute the bitwise NOT of "a" and then AND with "b", producing an intermediate value, and set "CF" to 1 if the sign bit of each 32-bit element in the intermediate value is zero, otherwise set "CF" to 0. Return 1 if both the "ZF" and "CF" values are zero, otherwise return 0.</description>
+	<operation>
+tmp[255:0] := a[255:0] AND b[255:0]
+IF (tmp[31] == tmp[63] == tmp[95] == tmp[127] == tmp[159] == tmp[191] == tmp[223] == tmp[255]  == 0)
+	ZF := 1
+ELSE
+	ZF := 0
+FI
+tmp[255:0] := (NOT a[255:0]) AND b[255:0]
+IF (tmp[31] == tmp[63] == tmp[95] == tmp[127] == tmp[159] == tmp[191] == tmp[223] == tmp[255]  == 0)
+	CF := 1
+ELSE
+	CF := 0
+FI
+IF (ZF == 0 &amp;&amp; CF == 0)
+	RETURN 1
+ELSE
+	RETURN 0
+FI
+	</operation>
+	<instruction name='vtestps' form='ymm, ymm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='AVX' rettype='int' name='_mm_testz_ps'>
+	<type>Floating Point</type>
+	<CPUID>AVX</CPUID>
+	<category>Logical</category>
+	<parameter varname='a' type='__m128'/>
+	<parameter varname='b' type='__m128'/>
+	<description>Compute the bitwise AND of 128 bits (representing single-precision (32-bit) floating-point elements) in "a" and "b", producing an intermediate 128-bit value, and set "ZF" to 1 if the sign bit of each 32-bit element in the intermediate value is zero, otherwise set "ZF" to 0. Compute the bitwise NOT of "a" and then AND with "b", producing an intermediate value, and set "CF" to 1 if the sign bit of each 32-bit element in the intermediate value is zero, otherwise set "CF" to 0. Return the "ZF" value.</description>
+	<operation>
+tmp[127:0] := a[127:0] AND b[127:0]
+IF (tmp[31] == tmp[63] == tmp[95] == tmp[127] == 0)
+	ZF := 1
+ELSE
+	ZF := 0
+FI
+tmp[127:0] := (NOT a[127:0]) AND b[127:0]
+IF (tmp[31] == tmp[63] == tmp[95] == tmp[127] == 0)
+	CF := 1
+ELSE
+	CF := 0
+FI
+RETURN ZF
+	</operation>
+	<instruction name='vtestps' form='xmm, xmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='AVX' rettype='int' name='_mm_testc_ps'>
+	<type>Floating Point</type>
+	<CPUID>AVX</CPUID>
+	<category>Logical</category>
+	<parameter varname='a' type='__m128'/>
+	<parameter varname='b' type='__m128'/>
+	<description>Compute the bitwise AND of 128 bits (representing single-precision (32-bit) floating-point elements) in "a" and "b", producing an intermediate 128-bit value, and set "ZF" to 1 if the sign bit of each 32-bit element in the intermediate value is zero, otherwise set "ZF" to 0. Compute the bitwise NOT of "a" and then AND with "b", producing an intermediate value, and set "CF" to 1 if the sign bit of each 32-bit element in the intermediate value is zero, otherwise set "CF" to 0. Return the "CF" value.</description>
+	<operation>
+tmp[127:0] := a[127:0] AND b[127:0]
+IF (tmp[31] == tmp[63] == tmp[95] == tmp[127] == 0)
+	ZF := 1
+ELSE
+	ZF := 0
+FI
+tmp[127:0] := (NOT a[127:0]) AND b[127:0]
+IF (tmp[31] == tmp[63] == tmp[95] == tmp[127] == 0)
+	CF := 1
+ELSE
+	CF := 0
+FI
+RETURN CF
+	</operation>
+	<instruction name='vtestps' form='xmm, xmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='AVX' rettype='int' name='_mm_testnzc_ps'>
+	<type>Floating Point</type>
+	<CPUID>AVX</CPUID>
+	<category>Logical</category>
+	<parameter varname='a' type='__m128'/>
+	<parameter varname='b' type='__m128'/>
+	<description>Compute the bitwise AND of 128 bits (representing single-precision (32-bit) floating-point elements) in "a" and "b", producing an intermediate 128-bit value, and set "ZF" to 1 if the sign bit of each 32-bit element in the intermediate value is zero, otherwise set "ZF" to 0. Compute the bitwise NOT of "a" and then AND with "b", producing an intermediate value, and set "CF" to 1 if the sign bit of each 32-bit element in the intermediate value is zero, otherwise set "CF" to 0. Return 1 if both the "ZF" and "CF" values are zero, otherwise return 0.</description>
+	<operation>
+tmp[127:0] := a[127:0] AND b[127:0]
+IF (tmp[31] == tmp[63] == tmp[95] == tmp[127] == 0)
+	ZF := 1
+ELSE
+	ZF := 0
+FI
+tmp[127:0] := (NOT a[127:0]) AND b[127:0]
+IF (tmp[31] == tmp[63] == tmp[95] == tmp[127] == 0)
+	CF := 1
+ELSE
+	CF := 0
+FI
+IF (ZF == 0 &amp;&amp; CF == 0)
+	RETURN 1
+ELSE
+	RETURN 0
+FI
+	</operation>
+	<instruction name='vtestps' form='xmm, xmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='AVX' rettype='int' name='_mm256_movemask_pd'>
+	<type>Floating Point</type>
+	<CPUID>AVX</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname='a' type='__m256d'/>
+	<description>Set each bit of mask "dst" based on the most significant bit of the corresponding packed double-precision (64-bit) floating-point element in "a".</description>
+	<operation>
+FOR j := 0 to 3
+	i := j*64
+	IF a[i+63]
+		dst[j] := 1
+	ELSE
+		dst[j] := 0
+	FI
+ENDFOR
+dst[MAX:4] := 0
+	</operation>
+	<instruction name='vmovmskpd' form='r32, ymm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='AVX' rettype='int' name='_mm256_movemask_ps'>
+	<type>Floating Point</type>
+	<CPUID>AVX</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname='a' type='__m256'/>
+	<description>Set each bit of mask "dst" based on the most significant bit of the corresponding packed single-precision (32-bit) floating-point element in "a".</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*32
+	IF a[i+31]
+		dst[j] := 1
+	ELSE
+		dst[j] := 0
+	FI
+ENDFOR
+dst[MAX:8] := 0
+	</operation>
+	<instruction name='vmovmskps' form='r32, ymm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='AVX' rettype='__m256d' name='_mm256_setzero_pd'>
+	<type>Floating Point</type>
+	<CPUID>AVX</CPUID>
+	<category>Set</category>
+	<parameter varname='' type='void'/>
+	<description>Return vector of type __m256d with all elements set to zero.</description>
+	<operation>
+dst[MAX:0] := 0
+	</operation>
+	<instruction name='vxorpd' form='ymm, ymm, ymm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='AVX' rettype='__m256' name='_mm256_setzero_ps'>
+	<type>Floating Point</type>
+	<CPUID>AVX</CPUID>
+	<category>Set</category>
+	<parameter varname='' type='void'/>
+	<description>Return vector of type __m256 with all elements set to zero.</description>
+	<operation>
+dst[MAX:0] := 0
+	</operation>
+	<instruction name='vxorps' form='ymm, ymm, ymm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='AVX' rettype='__m256i' name='_mm256_setzero_si256'>
+	<type>Integer</type>
+	<CPUID>AVX</CPUID>
+	<category>Set</category>
+	<parameter varname='' type='void'/>
+	<description>Return vector of type __m256i with all elements set to zero.</description>
+	<operation>
+dst[MAX:0] := 0
+	</operation>
+	<instruction name='vpxor' form='ymm, ymm, ymm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='AVX' sequence='true' rettype='__m256d' name='_mm256_set_pd'>
+	<type>Floating Point</type>
+	<CPUID>AVX</CPUID>
+	<category>Set</category>
+	<parameter varname='e3' type='double'/>
+	<parameter varname='e2' type='double'/>
+	<parameter varname='e1' type='double'/>
+	<parameter varname='e0' type='double'/>
+	<description>Set packed double-precision (64-bit) floating-point elements in "dst" with the supplied values.</description>
+	<operation>
+dst[63:0] := e0
+dst[127:64] := e1
+dst[191:128] := e2
+dst[255:192] := e3
+dst[MAX:256] := 0
+	</operation>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='AVX' sequence='true' rettype='__m256' name='_mm256_set_ps'>
+	<type>Floating Point</type>
+	<CPUID>AVX</CPUID>
+	<category>Set</category>
+	<parameter varname='e7' type='float'/>
+	<parameter varname='e6' type='float'/>
+	<parameter varname='e5' type='float'/>
+	<parameter varname='e4' type='float'/>
+	<parameter varname='e3' type='float'/>
+	<parameter varname='e2' type='float'/>
+	<parameter varname='e1' type='float'/>
+	<parameter varname='e0' type='float'/>
+	<description>Set packed single-precision (32-bit) floating-point elements in "dst" with the supplied values.</description>
+	<operation>
+dst[31:0] := e0
+dst[63:32] := e1
+dst[95:64] := e2
+dst[127:96] := e3
+dst[159:128] := e4
+dst[191:160] := e5
+dst[223:192] := e6
+dst[255:224] := e7
+dst[MAX:256] := 0
+	</operation>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='AVX' sequence='true' rettype='__m256i' name='_mm256_set_epi8'>
+	<type>Integer</type>
+	<CPUID>AVX</CPUID>
+	<category>Set</category>
+	<parameter varname='e31' type='char'/>
+	<parameter varname='e30' type='char'/>
+	<parameter varname='e29' type='char'/>
+	<parameter varname='e28' type='char'/>
+	<parameter varname='e27' type='char'/>
+	<parameter varname='e26' type='char'/>
+	<parameter varname='e25' type='char'/>
+	<parameter varname='e24' type='char'/>
+	<parameter varname='e23' type='char'/>
+	<parameter varname='e22' type='char'/>
+	<parameter varname='e21' type='char'/>
+	<parameter varname='e20' type='char'/>
+	<parameter varname='e19' type='char'/>
+	<parameter varname='e18' type='char'/>
+	<parameter varname='e17' type='char'/>
+	<parameter varname='e16' type='char'/>
+	<parameter varname='e15' type='char'/>
+	<parameter varname='e14' type='char'/>
+	<parameter varname='e13' type='char'/>
+	<parameter varname='e12' type='char'/>
+	<parameter varname='e11' type='char'/>
+	<parameter varname='e10' type='char'/>
+	<parameter varname='e9' type='char'/>
+	<parameter varname='e8' type='char'/>
+	<parameter varname='e7' type='char'/>
+	<parameter varname='e6' type='char'/>
+	<parameter varname='e5' type='char'/>
+	<parameter varname='e4' type='char'/>
+	<parameter varname='e3' type='char'/>
+	<parameter varname='e2' type='char'/>
+	<parameter varname='e1' type='char'/>
+	<parameter varname='e0' type='char'/>
+	<description>Set packed 8-bit integers in "dst" with the supplied values.</description>
+	<operation>
+dst[7:0] := e0
+dst[15:8] := e1
+dst[23:16] := e2
+dst[31:24] := e3
+dst[39:32] := e4
+dst[47:40] := e5
+dst[55:48] := e6
+dst[63:56] := e7
+dst[71:64] := e8
+dst[79:72] := e9
+dst[87:80] := e10
+dst[95:88] := e11
+dst[103:96] := e12
+dst[111:104] := e13
+dst[119:112] := e14
+dst[127:120] := e15
+dst[135:128] := e16
+dst[143:136] := e17
+dst[151:144] := e18
+dst[159:152] := e19
+dst[167:160] := e20
+dst[175:168] := e21
+dst[183:176] := e22
+dst[191:184] := e23
+dst[199:192] := e24
+dst[207:200] := e25
+dst[215:208] := e26
+dst[223:216] := e27
+dst[231:224] := e28
+dst[239:232] := e29
+dst[247:240] := e30
+dst[255:248] := e31
+dst[MAX:256] := 0
+	</operation>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='AVX' sequence='true' rettype='__m256i' name='_mm256_set_epi16'>
+	<type>Integer</type>
+	<CPUID>AVX</CPUID>
+	<category>Set</category>
+	<parameter varname='e15' type='short'/>
+	<parameter varname='e14' type='short'/>
+	<parameter varname='e13' type='short'/>
+	<parameter varname='e12' type='short'/>
+	<parameter varname='e11' type='short'/>
+	<parameter varname='e10' type='short'/>
+	<parameter varname='e9' type='short'/>
+	<parameter varname='e8' type='short'/>
+	<parameter varname='e7' type='short'/>
+	<parameter varname='e6' type='short'/>
+	<parameter varname='e5' type='short'/>
+	<parameter varname='e4' type='short'/>
+	<parameter varname='e3' type='short'/>
+	<parameter varname='e2' type='short'/>
+	<parameter varname='e1' type='short'/>
+	<parameter varname='e0' type='short'/>
+	<description>Set packed 16-bit integers in "dst" with the supplied values.</description>
+	<operation>
+dst[15:0] := e0
+dst[31:16] := e1
+dst[47:32] := e2
+dst[63:48] := e3
+dst[79:64] := e4
+dst[95:80] := e5
+dst[111:96] := e6
+dst[127:112] := e7
+dst[143:128] := e8
+dst[159:144] := e9
+dst[175:160] := e10
+dst[191:176] := e11
+dst[207:192] := e12
+dst[223:208] := e13
+dst[239:224] := e14
+dst[255:240] := e15
+dst[MAX:256] := 0
+	</operation>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='AVX' sequence='true' rettype='__m256i' name='_mm256_set_epi32'>
+	<type>Integer</type>
+	<CPUID>AVX</CPUID>
+	<category>Set</category>
+	<parameter varname='e7' type='int'/>
+	<parameter varname='e6' type='int'/>
+	<parameter varname='e5' type='int'/>
+	<parameter varname='e4' type='int'/>
+	<parameter varname='e3' type='int'/>
+	<parameter varname='e2' type='int'/>
+	<parameter varname='e1' type='int'/>
+	<parameter varname='e0' type='int'/>
+	<description>Set packed 32-bit integers in "dst" with the supplied values.</description>
+	<operation>
+dst[31:0] := e0
+dst[63:32] := e1
+dst[95:64] := e2
+dst[127:96] := e3
+dst[159:128] := e4
+dst[191:160] := e5
+dst[223:192] := e6
+dst[255:224] := e7
+dst[MAX:256] := 0
+	</operation>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='AVX' sequence='true' rettype='__m256i' name='_mm256_set_epi64x'>
+	<type>Integer</type>
+	<CPUID>AVX</CPUID>
+	<category>Set</category>
+	<parameter varname='e3' type='__int64'/>
+	<parameter varname='e2' type='__int64'/>
+	<parameter varname='e1' type='__int64'/>
+	<parameter varname='e0' type='__int64'/>
+	<description>Set packed 64-bit integers in "dst" with the supplied values.</description>
+	<operation>
+dst[63:0] := e0
+dst[127:64] := e1
+dst[191:128] := e2
+dst[255:192] := e3
+dst[MAX:256] := 0
+	</operation>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='AVX' sequence='true' rettype='__m256d' name='_mm256_setr_pd'>
+	<type>Floating Point</type>
+	<CPUID>AVX</CPUID>
+	<category>Set</category>
+	<parameter varname='e3' type='double'/>
+	<parameter varname='e2' type='double'/>
+	<parameter varname='e1' type='double'/>
+	<parameter varname='e0' type='double'/>
+	<description>Set packed double-precision (64-bit) floating-point elements in "dst" with the supplied values in reverse order.</description>
+	<operation>
+dst[63:0] := e3
+dst[127:64] := e2
+dst[191:128] := e1
+dst[255:192] := e0
+dst[MAX:256] := 0
+	</operation>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='AVX' sequence='true' rettype='__m256' name='_mm256_setr_ps'>
+	<type>Floating Point</type>
+	<CPUID>AVX</CPUID>
+	<category>Set</category>
+	<parameter varname='e7' type='float'/>
+	<parameter varname='e6' type='float'/>
+	<parameter varname='e5' type='float'/>
+	<parameter varname='e4' type='float'/>
+	<parameter varname='e3' type='float'/>
+	<parameter varname='e2' type='float'/>
+	<parameter varname='e1' type='float'/>
+	<parameter varname='e0' type='float'/>
+	<description>Set packed single-precision (32-bit) floating-point elements in "dst" with the supplied values in reverse order.</description>
+	<operation>
+dst[31:0] := e7
+dst[63:32] := e6
+dst[95:64] := e5
+dst[127:96] := e4
+dst[159:128] := e3
+dst[191:160] := e2
+dst[223:192] := e1
+dst[255:224] := e0
+dst[MAX:256] := 0
+	</operation>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='AVX' sequence='true' rettype='__m256i' name='_mm256_setr_epi8'>
+	<type>Integer</type>
+	<CPUID>AVX</CPUID>
+	<category>Set</category>
+	<parameter varname='e31' type='char'/>
+	<parameter varname='e30' type='char'/>
+	<parameter varname='e29' type='char'/>
+	<parameter varname='e28' type='char'/>
+	<parameter varname='e27' type='char'/>
+	<parameter varname='e26' type='char'/>
+	<parameter varname='e25' type='char'/>
+	<parameter varname='e24' type='char'/>
+	<parameter varname='e23' type='char'/>
+	<parameter varname='e22' type='char'/>
+	<parameter varname='e21' type='char'/>
+	<parameter varname='e20' type='char'/>
+	<parameter varname='e19' type='char'/>
+	<parameter varname='e18' type='char'/>
+	<parameter varname='e17' type='char'/>
+	<parameter varname='e16' type='char'/>
+	<parameter varname='e15' type='char'/>
+	<parameter varname='e14' type='char'/>
+	<parameter varname='e13' type='char'/>
+	<parameter varname='e12' type='char'/>
+	<parameter varname='e11' type='char'/>
+	<parameter varname='e10' type='char'/>
+	<parameter varname='e9' type='char'/>
+	<parameter varname='e8' type='char'/>
+	<parameter varname='e7' type='char'/>
+	<parameter varname='e6' type='char'/>
+	<parameter varname='e5' type='char'/>
+	<parameter varname='e4' type='char'/>
+	<parameter varname='e3' type='char'/>
+	<parameter varname='e2' type='char'/>
+	<parameter varname='e1' type='char'/>
+	<parameter varname='e0' type='char'/>
+	<description>Set packed 8-bit integers in "dst" with the supplied values in reverse order.</description>
+	<operation>
+dst[7:0] := e31
+dst[15:8] := e30
+dst[23:16] := e29
+dst[31:24] := e28
+dst[39:32] := e27
+dst[47:40] := e26
+dst[55:48] := e25
+dst[63:56] := e24
+dst[71:64] := e23
+dst[79:72] := e22
+dst[87:80] := e21
+dst[95:88] := e20
+dst[103:96] := e19
+dst[111:104] := e18
+dst[119:112] := e17
+dst[127:120] := e16
+dst[135:128] := e15
+dst[143:136] := e14
+dst[151:144] := e13
+dst[159:152] := e12
+dst[167:160] := e11
+dst[175:168] := e10
+dst[183:176] := e9
+dst[191:184] := e8
+dst[199:192] := e7
+dst[207:200] := e6
+dst[215:208] := e5
+dst[223:216] := e4
+dst[231:224] := e3
+dst[239:232] := e2
+dst[247:240] := e1
+dst[255:248] := e0
+dst[MAX:256] := 0
+	</operation>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='AVX' sequence='true' rettype='__m256i' name='_mm256_setr_epi16'>
+	<type>Integer</type>
+	<CPUID>AVX</CPUID>
+	<category>Set</category>
+	<parameter varname='e15' type='short'/>
+	<parameter varname='e14' type='short'/>
+	<parameter varname='e13' type='short'/>
+	<parameter varname='e12' type='short'/>
+	<parameter varname='e11' type='short'/>
+	<parameter varname='e10' type='short'/>
+	<parameter varname='e9' type='short'/>
+	<parameter varname='e8' type='short'/>
+	<parameter varname='e7' type='short'/>
+	<parameter varname='e6' type='short'/>
+	<parameter varname='e5' type='short'/>
+	<parameter varname='e4' type='short'/>
+	<parameter varname='e3' type='short'/>
+	<parameter varname='e2' type='short'/>
+	<parameter varname='e1' type='short'/>
+	<parameter varname='e0' type='short'/>
+	<description>Set packed 16-bit integers in "dst" with the supplied values in reverse order.</description>
+	<operation>
+dst[15:0] := e15
+dst[31:16] := e14
+dst[47:32] := e13
+dst[63:48] := e12
+dst[79:64] := e11
+dst[95:80] := e10
+dst[111:96] := e9
+dst[127:112] := e8
+dst[143:128] := e7
+dst[159:144] := e6
+dst[175:160] := e5
+dst[191:176] := e4
+dst[207:192] := e3
+dst[223:208] := e2
+dst[239:224] := e1
+dst[255:240] := e0
+dst[MAX:256] := 0
+	</operation>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='AVX' sequence='true' rettype='__m256i' name='_mm256_setr_epi32'>
+	<type>Integer</type>
+	<CPUID>AVX</CPUID>
+	<category>Set</category>
+	<parameter varname='e7' type='int'/>
+	<parameter varname='e6' type='int'/>
+	<parameter varname='e5' type='int'/>
+	<parameter varname='e4' type='int'/>
+	<parameter varname='e3' type='int'/>
+	<parameter varname='e2' type='int'/>
+	<parameter varname='e1' type='int'/>
+	<parameter varname='e0' type='int'/>
+	<description>Set packed 32-bit integers in "dst" with the supplied values in reverse order.</description>
+	<operation>
+dst[31:0] := e7
+dst[63:32] := e6
+dst[95:64] := e5
+dst[127:96] := e4
+dst[159:128] := e3
+dst[191:160] := e2
+dst[223:192] := e1
+dst[255:224] := e0
+dst[MAX:256] := 0
+	</operation>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='AVX' sequence='true' rettype='__m256i' name='_mm256_setr_epi64x'>
+	<type>Integer</type>
+	<CPUID>AVX</CPUID>
+	<category>Set</category>
+	<parameter varname='e3' type='__int64'/>
+	<parameter varname='e2' type='__int64'/>
+	<parameter varname='e1' type='__int64'/>
+	<parameter varname='e0' type='__int64'/>
+	<description>Set packed 64-bit integers in "dst" with the supplied values in reverse order.</description>
+	<operation>
+dst[63:0] := e3
+dst[127:64] := e2
+dst[191:128] := e1
+dst[255:192] := e0
+dst[MAX:256] := 0
+	</operation>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='AVX' sequence='true' rettype='__m256d' name='_mm256_set1_pd'>
+	<type>Floating Point</type>
+	<CPUID>AVX</CPUID>
+	<category>Set</category>
+	<parameter varname='a' type='double'/>
+	<description>Broadcast double-precision (64-bit) floating-point value "a" to all elements of "dst".</description>
+	<operation>
+FOR j := 0 to 3
+	i := j*64
+	dst[i+63:i] := a[63:0]
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='AVX' sequence='true' rettype='__m256' name='_mm256_set1_ps'>
+	<type>Floating Point</type>
+	<CPUID>AVX</CPUID>
+	<category>Set</category>
+	<parameter varname='a' type='float'/>
+	<description>Broadcast single-precision (32-bit) floating-point value "a" to all elements of "dst".</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*32
+	dst[i+31:i] := a[31:0]
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='AVX' sequence='true' rettype='__m256i' name='_mm256_set1_epi8'>
+	<type>Integer</type>
+	<CPUID>AVX</CPUID>
+	<category>Set</category>
+	<parameter varname='a' type='char'/>
+	<description>Broadcast 8-bit integer "a" to all elements of "dst". This intrinsic may generate the "vpbroadcastb".</description>
+	<operation>
+FOR j := 0 to 31
+	i := j*8
+	dst[i+7:i] := a[7:0]
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='AVX' sequence='true' rettype='__m256i' name='_mm256_set1_epi16'>
+	<type>Integer</type>
+	<CPUID>AVX</CPUID>
+	<category>Set</category>
+	<parameter varname='a' type='short'/>
+	<description>Broadcast 16-bit integer "a" to all elements of "dst". This intrinsic may generate the "vpbroadcastw".</description>
+	<operation>
+FOR j := 0 to 15
+	i := j*16
+	dst[i+15:i] := a[15:0]
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='AVX' sequence='true' rettype='__m256i' name='_mm256_set1_epi32'>
+	<type>Integer</type>
+	<CPUID>AVX</CPUID>
+	<category>Set</category>
+	<parameter varname='a' type='int'/>
+	<description>Broadcast 32-bit integer "a" to all elements of "dst". This intrinsic may generate the "vpbroadcastd".</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*32
+	dst[i+31:i] := a[31:0]
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='AVX' sequence='true' rettype='__m256i' name='_mm256_set1_epi64x'>
+	<type>Integer</type>
+	<CPUID>AVX</CPUID>
+	<category>Set</category>
+	<parameter varname='a' type='long long'/>
+	<description>Broadcast 64-bit integer "a" to all elements of "dst". This intrinsic may generate the "vpbroadcastq".</description>
+	<operation>
+FOR j := 0 to 3
+	i := j*64
+	dst[i+63:i] := a[63:0]
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='AVX' rettype='__m256' name='_mm256_castpd_ps'>
+	<type>Floating Point</type>
+	<CPUID>AVX</CPUID>
+	<category>Cast</category>
+	<parameter varname='a' type='__m256d'/>
+	<description>Cast vector of type __m256d to type __m256.
+	This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency.</description>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='AVX' rettype='__m256d' name='_mm256_castps_pd'>
+	<type>Floating Point</type>
+	<CPUID>AVX</CPUID>
+	<category>Cast</category>
+	<parameter varname='a' type='__m256'/>
+	<description>Cast vector of type __m256 to type __m256d.
+	This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency.</description>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='AVX' rettype='__m256i' name='_mm256_castps_si256'>
+	<type>Floating Point</type>
+	<type>Integer</type>
+	<CPUID>AVX</CPUID>
+	<category>Cast</category>
+	<parameter varname='a' type='__m256'/>
+	<description>Casts vector of type __m256 to type __m256i. This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency.
+	</description>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='AVX' rettype='__m256i' name='_mm256_castpd_si256'>
+	<type>Floating Point</type>
+	<type>Integer</type>
+	<CPUID>AVX</CPUID>
+	<category>Cast</category>
+	<parameter varname='a' type='__m256d'/>
+	<description>Casts vector of type __m256d to type __m256i. This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency.
+	</description>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='AVX' rettype='__m256' name='_mm256_castsi256_ps'>
+	<type>Floating Point</type>
+	<CPUID>AVX</CPUID>
+	<category>Cast</category>
+	<parameter varname='a' type='__m256i'/>
+	<description>Casts vector of type __m256i to type __m256. This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency.
+	</description>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='AVX' rettype='__m256d' name='_mm256_castsi256_pd'>
+	<type>Floating Point</type>
+	<CPUID>AVX</CPUID>
+	<category>Cast</category>
+	<parameter varname='a' type='__m256i'/>
+	<description>Casts vector of type __m256i to type __m256d. This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency.
+	</description>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='AVX' rettype='__m128' name='_mm256_castps256_ps128'>
+	<type>Floating Point</type>
+	<CPUID>AVX</CPUID>
+	<category>Cast</category>
+	<parameter varname='a' type='__m256'/>
+	<description>Casts vector of type __m256 to type __m128. This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency.
+	</description>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='AVX' rettype='__m128d' name='_mm256_castpd256_pd128'>
+	<type>Floating Point</type>
+	<CPUID>AVX</CPUID>
+	<category>Cast</category>
+	<parameter varname='a' type='__m256d'/>
+	<description>Casts vector of type __m256d to type __m128d. This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency.
+	</description>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='AVX' rettype='__m128i' name='_mm256_castsi256_si128'>
+	<type>Integer</type>
+	<CPUID>AVX</CPUID>
+	<category>Cast</category>
+	<parameter varname='a' type='__m256i'/>
+	<description>Casts vector of type __m256i to type __m128i. This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency.
+	</description>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='AVX' rettype='__m256' name='_mm256_castps128_ps256'>
+	<type>Floating Point</type>
+	<CPUID>AVX</CPUID>
+	<category>Cast</category>
+	<parameter varname='a' type='__m128'/>
+	<description>Casts vector of type __m128 to type __m256; the upper 128 bits of the result are undefined. This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency.
+	</description>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='AVX' rettype='__m256d' name='_mm256_castpd128_pd256'>
+	<type>Floating Point</type>
+	<CPUID>AVX</CPUID>
+	<category>Cast</category>
+	<parameter varname='a' type='__m128d'/>
+	<description>Casts vector of type __m128d to type __m256d; the upper 128 bits of the result are undefined. This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency.
+	</description>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='AVX' rettype='__m256i' name='_mm256_castsi128_si256'>
+	<type>Integer</type>
+	<CPUID>AVX</CPUID>
+	<category>Cast</category>
+	<parameter varname='a' type='__m128i'/>
+	<description>Casts vector of type __m128i to type __m256i; the upper 128 bits of the result are undefined. This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency.
+	</description>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='AVX' rettype='__m256' name='_mm256_zextps128_ps256'>
+	<type>Floating Point</type>
+	<CPUID>AVX</CPUID>
+	<category>Cast</category>
+	<parameter varname='a' type='__m128'/>
+	<description>Casts vector of type __m128 to type __m256; the upper 128 bits of the result are zeroed. This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency.
+	</description>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='AVX' rettype='__m256d' name='_mm256_zextpd128_pd256'>
+	<type>Floating Point</type>
+	<CPUID>AVX</CPUID>
+	<category>Cast</category>
+	<parameter varname='a' type='__m128d'/>
+	<description>Casts vector of type __m128d to type __m256d; the upper 128 bits of the result are zeroed. This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency.
+	</description>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='AVX' rettype='__m256i' name='_mm256_zextsi128_si256'>
+	<type>Integer</type>
+	<CPUID>AVX</CPUID>
+	<category>Cast</category>
+	<parameter varname='a' type='__m128i'/>
+	<description>Casts vector of type __m128i to type __m256i; the upper 128 bits of the result are zeroed. This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency.
+	</description>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='AVX' rettype='__m256' name='_mm256_floor_ps'>
+	<type>Floating Point</type>
+	<CPUID>AVX</CPUID>
+	<category>Special Math Functions</category>
+	<parameter varname='a' type='__m256'/>
+	<description>Round the packed single-precision (32-bit) floating-point elements in "a" down to an integer value, and store the results as packed single-precision floating-point elements in "dst".</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*32
+	dst[i+31:i] := FLOOR(a[i+31:i])
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name='vroundps' form='ymm, ymm, imm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='AVX' rettype='__m256' name='_mm256_ceil_ps'>
+	<type>Floating Point</type>
+	<CPUID>AVX</CPUID>
+	<category>Special Math Functions</category>
+	<parameter varname='a' type='__m256'/>
+	<description>Round the packed single-precision (32-bit) floating-point elements in "a" up to an integer value, and store the results as packed single-precision floating-point elements in "dst".</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*32
+	dst[i+31:i] := CEIL(a[i+31:i])
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name='vroundps' form='ymm, ymm, imm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='AVX' rettype='__m256d' name='_mm256_floor_pd'>
+	<type>Floating Point</type>
+	<CPUID>AVX</CPUID>
+	<category>Special Math Functions</category>
+	<parameter varname='a' type='__m256d'/>
+	<description>Round the packed double-precision (64-bit) floating-point elements in "a" down to an integer value, and store the results as packed double-precision floating-point elements in "dst".</description>
+	<operation>
+FOR j := 0 to 3
+	i := j*64
+	dst[i+63:i] := FLOOR(a[i+63:i])
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name='vroundpd' form='ymm, ymm, imm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='AVX' rettype='__m256d' name='_mm256_ceil_pd'>
+	<type>Floating Point</type>
+	<CPUID>AVX</CPUID>
+	<category>Special Math Functions</category>
+	<parameter varname='a' type='__m256d'/>
+	<description>Round the packed double-precision (64-bit) floating-point elements in "a" up to an integer value, and store the results as packed double-precision floating-point elements in "dst".</description>
+	<operation>
+FOR j := 0 to 3
+	i := j*64
+	dst[i+63:i] := CEIL(a[i+63:i])
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name='vroundpd' form='ymm, ymm, imm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='SSE' rettype='__m128' name='_mm_undefined_ps'>
+	<type>Floating Point</type>
+	<CPUID>SSE</CPUID>
+	<category>General Support</category>
+	<parameter varname='' type='void'/>
+	<description>Return vector of type __m128 with undefined elements.</description>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='SSE2' rettype='__m128d' name='_mm_undefined_pd'>
+	<type>Floating Point</type>
+	<CPUID>SSE2</CPUID>
+	<category>General Support</category>
+	<parameter varname='' type='void'/>
+	<description>Return vector of type __m128d with undefined elements.</description>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='SSE2' rettype='__m128i' name='_mm_undefined_si128'>
+	<type>Integer</type>
+	<CPUID>SSE2</CPUID>
+	<category>General Support</category>
+	<parameter varname='' type='void'/>
+	<description>Return vector of type __m128i with undefined elements.</description>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='AVX' rettype='__m256' name='_mm256_undefined_ps'>
+	<type>Floating Point</type>
+	<CPUID>AVX</CPUID>
+	<category>General Support</category>
+	<parameter varname='' type='void'/>
+	<description>Return vector of type __m256 with undefined elements.</description>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='AVX' rettype='__m256d' name='_mm256_undefined_pd'>
+	<type>Floating Point</type>
+	<CPUID>AVX</CPUID>
+	<category>General Support</category>
+	<parameter varname='' type='void'/>
+	<description>Return vector of type __m256d with undefined elements.</description>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='AVX' rettype='__m256i' name='_mm256_undefined_si256'>
+	<type>Integer</type>
+	<CPUID>AVX</CPUID>
+	<category>General Support</category>
+	<parameter varname='' type='void'/>
+	<description>Return vector of type __m256i with undefined elements.</description>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='Other' rettype='void' name='_mm_clflushopt'>
+	<CPUID>CLFLUSHOPT</CPUID>
+	<category>General Support</category>
+	<parameter varname='p' type='void const *'/>
+	<description>Invalidate and flush the cache line that contains "p" from all levels of the cache hierarchy.</description>
+	<instruction name='clflushopt' />
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='Other' rettype='void' name='_mm_clwb'>
+	<CPUID>CLWB</CPUID>
+	<category>General Support</category>
+	<parameter varname='p' type='void const *'/>
+	<description>Write back to memory the cache line that contains "p" from any level of the cache hierarchy in the cache coherence domain.</description>
+	<instruction name='clwb' />
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='AVX' rettype='__m256' name='_mm256_set_m128'>
+	<type>Floating Point</type>
+	<CPUID>AVX</CPUID>
+	<category>Set</category>
+	<parameter varname='hi' type='__m128'/>
+	<parameter varname='lo' type='__m128'/>
+	<description>Set packed __m256 vector "dst" with the supplied values.</description>
+	<operation>
+dst[127:0] := lo[127:0]
+dst[255:128] := hi[127:0]
+dst[MAX:256] := 0
+	</operation>
+	<instruction name='vinsertf128' form='ymm, ymm, xmm, imm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='AVX' rettype='__m256d' name='_mm256_set_m128d'>
+	<type>Floating Point</type>
+	<CPUID>AVX</CPUID>
+	<category>Set</category>
+	<parameter varname='hi' type='__m128d'/>
+	<parameter varname='lo' type='__m128d'/>
+	<description>Set packed __m256d vector "dst" with the supplied values.</description>
+	<operation>
+dst[127:0] := lo[127:0]
+dst[255:128] := hi[127:0]
+dst[MAX:256] := 0
+	</operation>
+	<instruction name='vinsertf128' form='ymm, ymm, xmm, imm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='AVX' rettype='__m256i' name='_mm256_set_m128i'>
+	<type>Integer</type>
+	<CPUID>AVX</CPUID>
+	<category>Set</category>
+	<parameter varname='hi' type='__m128i'/>
+	<parameter varname='lo' type='__m128i'/>
+	<description>Set packed __m256i vector "dst" with the supplied values.</description>
+	<operation>
+dst[127:0] := lo[127:0]
+dst[255:128] := hi[127:0]
+dst[MAX:256] := 0
+	</operation>
+	<instruction name='vinsertf128' form='ymm, ymm, xmm, imm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='AVX' rettype='__m256' name='_mm256_setr_m128'>
+	<type>Floating Point</type>
+	<CPUID>AVX</CPUID>
+	<category>Set</category>
+	<parameter varname='lo' type='__m128'/>
+	<parameter varname='hi' type='__m128'/>
+	<description>Set packed __m256 vector "dst" with the supplied values.</description>
+	<operation>
+dst[127:0] := lo[127:0]
+dst[255:128] := hi[127:0]
+dst[MAX:256] := 0
+	</operation>
+	<instruction name='vinsertf128' form='ymm, ymm, xmm, imm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='AVX' rettype='__m256d' name='_mm256_setr_m128d'>
+	<type>Floating Point</type>
+	<CPUID>AVX</CPUID>
+	<category>Set</category>
+	<parameter varname='lo' type='__m128d'/>
+	<parameter varname='hi' type='__m128d'/>
+	<description>Set packed __m256d vector "dst" with the supplied values.</description>
+	<operation>
+dst[127:0] := lo[127:0]
+dst[255:128] := hi[127:0]
+dst[MAX:256] := 0
+	</operation>
+	<instruction name='vinsertf128' form='ymm, ymm, xmm, imm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='AVX' rettype='__m256i' name='_mm256_setr_m128i'>
+	<type>Integer</type>
+	<CPUID>AVX</CPUID>
+	<category>Set</category>
+	<parameter varname='lo' type='__m128i'/>
+	<parameter varname='hi' type='__m128i'/>
+	<description>Set packed __m256i vector "dst" with the supplied values.</description>
+	<operation>
+dst[127:0] := lo[127:0]
+dst[255:128] := hi[127:0]
+dst[MAX:256] := 0
+	</operation>
+	<instruction name='vinsertf128' form='ymm, ymm, xmm, imm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='AVX' sequence='true' rettype='__m256' name='_mm256_loadu2_m128'>
+	<type>Floating Point</type>
+	<CPUID>AVX</CPUID>
+	<category>Load</category>
+	<parameter varname='hiaddr' type='float const*'/>
+	<parameter varname='loaddr' type='float const*'/>
+	<description>Load two 128-bit values (composed of 4 packed single-precision (32-bit) floating-point elements) from memory, and combine them into a 256-bit value in "dst".
+	"hiaddr" and "loaddr" do not need to be aligned on any particular boundary.</description>
+	<operation>
+dst[127:0] := MEM[loaddr+127:loaddr]
+dst[255:128] := MEM[hiaddr+127:hiaddr]
+dst[MAX:256] := 0
+	</operation>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='AVX' sequence='true' rettype='__m256d' name='_mm256_loadu2_m128d'>
+	<type>Floating Point</type>
+	<CPUID>AVX</CPUID>
+	<category>Load</category>
+	<parameter varname='hiaddr' type='double const*'/>
+	<parameter varname='loaddr' type='double const*'/>
+	<description>Load two 128-bit values (composed of 2 packed double-precision (64-bit) floating-point elements) from memory, and combine them into a 256-bit value in "dst".
+	"hiaddr" and "loaddr" do not need to be aligned on any particular boundary.</description>
+	<operation>
+dst[127:0] := MEM[loaddr+127:loaddr]
+dst[255:128] := MEM[hiaddr+127:hiaddr]
+dst[MAX:256] := 0
+	</operation>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='AVX' sequence='true' rettype='__m256i' name='_mm256_loadu2_m128i'>
+	<type>Integer</type>
+	<CPUID>AVX</CPUID>
+	<category>Load</category>
+	<parameter varname='hiaddr' type='__m128i const*'/>
+	<parameter varname='loaddr' type='__m128i const*'/>
+	<description>Load two 128-bit values (composed of integer data) from memory, and combine them into a 256-bit value in "dst".
+	"hiaddr" and "loaddr" do not need to be aligned on any particular boundary.</description>
+	<operation>
+dst[127:0] := MEM[loaddr+127:loaddr]
+dst[255:128] := MEM[hiaddr+127:hiaddr]
+dst[MAX:256] := 0
+	</operation>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='AVX' sequence='true' rettype='void' name='_mm256_storeu2_m128'>
+	<type>Floating Point</type>
+	<CPUID>AVX</CPUID>
+	<category>Store</category>
+	<parameter varname='hiaddr' type='float*'/>
+	<parameter varname='loaddr' type='float*'/>
+	<parameter varname='a' type='__m256' />
+	<description>Store the high and low 128-bit halves (each composed of 4 packed single-precision (32-bit) floating-point elements) from "a" into two different 128-bit memory locations.
+	"hiaddr" and "loaddr" do not need to be aligned on any particular boundary.</description>
+	<operation>
+MEM[loaddr+127:loaddr] := a[127:0]
+MEM[hiaddr+127:hiaddr] := a[255:128]
+	</operation>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='AVX' sequence='true' rettype='void' name='_mm256_storeu2_m128d'>
+	<type>Floating Point</type>
+	<CPUID>AVX</CPUID>
+	<category>Store</category>
+	<parameter varname='hiaddr' type='double*'/>
+	<parameter varname='loaddr' type='double*'/>
+	<parameter varname='a' type='__m256d' />
+	<description>Store the high and low 128-bit halves (each composed of 2 packed double-precision (64-bit) floating-point elements) from "a" into two different 128-bit memory locations.
+	"hiaddr" and "loaddr" do not need to be aligned on any particular boundary.</description>
+	<operation>
+MEM[loaddr+127:loaddr] := a[127:0]
+MEM[hiaddr+127:hiaddr] := a[255:128]
+	</operation>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='AVX' sequence='true' rettype='void' name='_mm256_storeu2_m128i'>
+	<type>Integer</type>
+	<CPUID>AVX</CPUID>
+	<category>Store</category>
+	<parameter varname='hiaddr' type='__m128i*'/>
+	<parameter varname='loaddr' type='__m128i*'/>
+	<parameter varname='a' type='__m256i' />
+	<description>Store the high and low 128-bit halves (each composed of integer data) from "a" into two different 128-bit memory locations.
+	"hiaddr" and "loaddr" do not need to be aligned on any particular boundary.</description>
+	<operation>
+MEM[loaddr+127:loaddr] := a[127:0]
+MEM[hiaddr+127:hiaddr] := a[255:128]
+	</operation>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='AVX2' rettype='__m256i' name='_mm256_abs_epi8'>
+	<type>Integer</type>
+	<CPUID>AVX2</CPUID>
+	<category>Special Math Functions</category>
+	<parameter varname='a' type='__m256i'/>
+	<description>Compute the absolute value of packed 8-bit integers in "a", and store the unsigned results in "dst". </description>
+	<operation>
+FOR j := 0 to 31
+	i := j*8
+	dst[i+7:i] := ABS(a[i+7:i])
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name='vpabsb' form='ymm, ymm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='AVX2' rettype='__m256i' name='_mm256_abs_epi16'>
+	<type>Integer</type>
+	<CPUID>AVX2</CPUID>
+	<category>Special Math Functions</category>
+	<parameter varname='a' type='__m256i'/>
+	<description>Compute the absolute value of packed 16-bit integers in "a", and store the unsigned results in "dst". </description>
+	<operation>
+FOR j := 0 to 15
+	i := j*16
+	dst[i+15:i] := ABS(a[i+15:i])
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name='vpabsw' form='ymm, ymm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='AVX2' rettype='__m256i' name='_mm256_abs_epi32'>
+	<type>Integer</type>
+	<CPUID>AVX2</CPUID>
+	<category>Special Math Functions</category>
+	<parameter varname='a' type='__m256i'/>
+	<description>Compute the absolute value of packed 32-bit integers in "a", and store the unsigned results in "dst". </description>
+	<operation>
+FOR j := 0 to 7
+	i := j*32
+	dst[i+31:i] := ABS(a[i+31:i])
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name='vpabsd' form='ymm, ymm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='AVX2' rettype='__m256i' name='_mm256_add_epi8'>
+	<type>Integer</type>
+	<CPUID>AVX2</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname='a' type='__m256i'/>
+	<parameter varname='b' type='__m256i'/>
+	<description>Add packed 8-bit integers in "a" and "b", and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 31
+	i := j*8
+	dst[i+7:i] := a[i+7:i] + b[i+7:i]
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name='vpaddb' form='ymm, ymm, ymm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='AVX2' rettype='__m256i' name='_mm256_add_epi16'>
+	<type>Integer</type>
+	<CPUID>AVX2</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname='a' type='__m256i'/>
+	<parameter varname='b' type='__m256i'/>
+	<description>Add packed 16-bit integers in "a" and "b", and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 15
+	i := j*16
+	dst[i+15:i] := a[i+15:i] + b[i+15:i]
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name='vpaddw' form='ymm, ymm, ymm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='AVX2' rettype='__m256i' name='_mm256_add_epi32'>
+	<type>Integer</type>
+	<CPUID>AVX2</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname='a' type='__m256i'/>
+	<parameter varname='b' type='__m256i'/>
+	<description>Add packed 32-bit integers in "a" and "b", and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*32
+	dst[i+31:i] := a[i+31:i] + b[i+31:i]
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name='vpaddd' form='ymm, ymm, ymm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='AVX2' rettype='__m256i' name='_mm256_add_epi64'>
+	<type>Integer</type>
+	<CPUID>AVX2</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname='a' type='__m256i'/>
+	<parameter varname='b' type='__m256i'/>
+	<description>Add packed 64-bit integers in "a" and "b", and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 3
+	i := j*64
+	dst[i+63:i] := a[i+63:i] + b[i+63:i]
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name='vpaddq' form='ymm, ymm, ymm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='AVX2' rettype='__m256i' name='_mm256_adds_epi8'>
+	<type>Integer</type>
+	<CPUID>AVX2</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname='a' type='__m256i'/>
+	<parameter varname='b' type='__m256i'/>
+	<description>Add packed 8-bit integers in "a" and "b" using saturation, and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 31
+	i := j*8
+	dst[i+7:i] := Saturate_To_Int8( a[i+7:i] + b[i+7:i] )
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name='vpaddsb' form='ymm, ymm, ymm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='AVX2' rettype='__m256i' name='_mm256_adds_epi16'>
+	<type>Integer</type>
+	<CPUID>AVX2</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname='a' type='__m256i'/>
+	<parameter varname='b' type='__m256i'/>
+	<description>Add packed 16-bit integers in "a" and "b" using saturation, and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 15
+	i := j*16
+	dst[i+15:i] := Saturate_To_Int16( a[i+15:i] + b[i+15:i] )
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name='vpaddsw' form='ymm, ymm, ymm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='AVX2' rettype='__m256i' name='_mm256_adds_epu8'>
+	<type>Integer</type>
+	<CPUID>AVX2</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname='a' type='__m256i'/>
+	<parameter varname='b' type='__m256i'/>
+	<description>Add packed unsigned 8-bit integers in "a" and "b" using saturation, and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 31
+	i := j*8
+	dst[i+7:i] := Saturate_To_UnsignedInt8( a[i+7:i] + b[i+7:i] )
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name='vpaddusb' form='ymm, ymm, ymm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='AVX2' rettype='__m256i' name='_mm256_adds_epu16'>
+	<type>Integer</type>
+	<CPUID>AVX2</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname='a' type='__m256i'/>
+	<parameter varname='b' type='__m256i'/>
+	<description>Add packed unsigned 16-bit integers in "a" and "b" using saturation, and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 15
+	i := j*16
+	dst[i+15:i] := Saturate_To_UnsignedInt16( a[i+15:i] + b[i+15:i] )
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name='vpaddusw' form='ymm, ymm, ymm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='AVX2' rettype='__m256i' name='_mm256_alignr_epi8'>
+	<type>Integer</type>
+	<CPUID>AVX2</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname='a' type='__m256i'/>
+	<parameter varname='b' type='__m256i'/>
+	<parameter varname='count' type='const int'/>
+	<description>Concatenate pairs of 16-byte blocks in "a" and "b" into a 32-byte temporary result, shift the result right by "count" bytes, and store the low 16 bytes in "dst". </description>
+	<operation>
+FOR j := 0 to 1
+	i := j*128
+	tmp[255:0] := ((a[i+127:i] &lt;&lt; 128) OR b[i+127:i]) &gt;&gt; (count[7:0]*8)
+	dst[i+127:i] := tmp[127:0]
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name='vpalignr' form='ymm, ymm, ymm, imm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='AVX2' rettype='__m256i' name='_mm256_and_si256'>
+	<type>Integer</type>
+	<CPUID>AVX2</CPUID>
+	<category>Logical</category>
+	<parameter varname='a' type='__m256i'/>
+	<parameter varname='b' type='__m256i'/>
+	<description>Compute the bitwise AND of 256 bits (representing integer data) in "a" and "b", and store the result in "dst".</description>
+	<operation>
+dst[255:0] := (a[255:0] AND b[255:0])
+dst[MAX:256] := 0
+	</operation>
+	<instruction name='vpand' form='ymm, ymm, ymm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='AVX2' rettype='__m256i' name='_mm256_andnot_si256'>
+	<type>Integer</type>
+	<CPUID>AVX2</CPUID>
+	<category>Logical</category>
+	<parameter varname='a' type='__m256i'/>
+	<parameter varname='b' type='__m256i'/>
+	<description>Compute the bitwise NOT of 256 bits (representing integer data) in "a" and then AND with "b", and store the result in "dst".</description>
+	<operation>
+dst[255:0] := ((NOT a[255:0]) AND b[255:0])
+dst[MAX:256] := 0
+	</operation>
+	<instruction name='vpandn' form='ymm, ymm, ymm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='AVX2' rettype='__m256i' name='_mm256_avg_epu8'>
+	<type>Integer</type>
+	<CPUID>AVX2</CPUID>
+	<category>Probability/Statistics</category>
+	<parameter varname='a' type='__m256i'/>
+	<parameter varname='b' type='__m256i'/>
+	<description>Average packed unsigned 8-bit integers in "a" and "b", and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 31
+	i := j*8
+	dst[i+7:i] := (a[i+7:i] + b[i+7:i] + 1) &gt;&gt; 1
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name='vpavgb' form='ymm, ymm, ymm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='AVX2' rettype='__m256i' name='_mm256_avg_epu16'>
+	<type>Integer</type>
+	<CPUID>AVX2</CPUID>
+	<category>Probability/Statistics</category>
+	<parameter varname='a' type='__m256i'/>
+	<parameter varname='b' type='__m256i'/>
+	<description>Average packed unsigned 16-bit integers in "a" and "b", and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 15
+	i := j*16
+	dst[i+15:i] := (a[i+15:i] + b[i+15:i] + 1) &gt;&gt; 1
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name='vpavgw' form='ymm, ymm, ymm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='AVX2' rettype='__m256i' name='_mm256_blend_epi16'>
+	<type>Integer</type>
+	<CPUID>AVX2</CPUID>
+	<category>Swizzle</category>
+	<parameter varname='a' type='__m256i'/>
+	<parameter varname='b' type='__m256i'/>
+	<parameter varname="imm8" type='const int'/>
+	<description>Blend packed 16-bit integers from "a" and "b" using control mask "imm8", and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 15
+	i := j*16
+	IF imm8[j%8]
+		dst[i+15:i] := b[i+15:i]
+	ELSE
+		dst[i+15:i] := a[i+15:i]
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name='vpblendw' form='ymm, ymm, ymm, imm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='AVX2' rettype='__m128i' name='_mm_blend_epi32'>
+	<type>Integer</type>
+	<CPUID>AVX2</CPUID>
+	<category>Swizzle</category>
+	<parameter varname='a' type='__m128i'/>
+	<parameter varname='b' type='__m128i'/>
+	<parameter varname="imm8" type='const int'/>
+	<description>Blend packed 32-bit integers from "a" and "b" using control mask "imm8", and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 3
+	i := j*32
+	IF imm8[j%8]
+		dst[i+31:i] := b[i+31:i]
+	ELSE
+		dst[i+31:i] := a[i+31:i]
+	FI
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name='vpblendd' form='xmm, xmm, xmm, imm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='AVX2' rettype='__m256i' name='_mm256_blend_epi32'>
+	<type>Integer</type>
+	<CPUID>AVX2</CPUID>
+	<category>Swizzle</category>
+	<parameter varname='a' type='__m256i'/>
+	<parameter varname='b' type='__m256i'/>
+	<parameter varname="imm8" type='const int'/>
+	<description>Blend packed 32-bit integers from "a" and "b" using control mask "imm8", and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*32
+	IF imm8[j%8]
+		dst[i+31:i] := b[i+31:i]
+	ELSE
+		dst[i+31:i] := a[i+31:i]
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name='vpblendd' form='ymm, ymm, ymm, imm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='AVX2' rettype='__m256i' name='_mm256_blendv_epi8'>
+	<type>Integer</type>
+	<CPUID>AVX2</CPUID>
+	<category>Swizzle</category>
+	<parameter varname='a' type='__m256i'/>
+	<parameter varname='b' type='__m256i'/>
+	<parameter varname='mask' type='__m256i'/>
+	<description>Blend packed 8-bit integers from "a" and "b" using "mask", and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 31
+	i := j*8
+	IF mask[i+7]
+		dst[i+7:i] := b[i+7:i]
+	ELSE
+		dst[i+7:i] := a[i+7:i]
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name='vpblendvb' form='ymm, ymm, ymm, ymm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='AVX2' rettype='__m128i' name='_mm_broadcastb_epi8'>
+	<type>Integer</type>
+	<CPUID>AVX2</CPUID>
+	<category>Swizzle</category>
+	<parameter varname='a' type='__m128i'/>
+	<description>Broadcast the low packed 8-bit integer from "a" to all elements of "dst".</description>
+	<operation>
+FOR j := 0 to 15
+	i := j*8
+	dst[i+7:i] := a[7:0]
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name='vpbroadcastb' form='xmm, xmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='AVX2' rettype='__m256i' name='_mm256_broadcastb_epi8'>
+	<type>Integer</type>
+	<CPUID>AVX2</CPUID>
+	<category>Swizzle</category>
+	<parameter varname='a' type='__m128i'/>
+	<description>Broadcast the low packed 8-bit integer from "a" to all elements of "dst".</description>
+	<operation>
+FOR j := 0 to 31
+	i := j*8
+	dst[i+7:i] := a[7:0]
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name='vpbroadcastb' form='ymm, xmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='AVX2' rettype='__m128i' name='_mm_broadcastd_epi32'>
+	<type>Integer</type>
+	<CPUID>AVX2</CPUID>
+	<category>Swizzle</category>
+	<parameter varname='a' type='__m128i'/>
+	<description>Broadcast the low packed 32-bit integer from "a" to all elements of "dst".</description>
+	<operation>
+FOR j := 0 to 3
+	i := j*32
+	dst[i+31:i] := a[31:0]
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name='vpbroadcastd' form='xmm, xmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='AVX2' rettype='__m256i' name='_mm256_broadcastd_epi32'>
+	<type>Integer</type>
+	<CPUID>AVX2</CPUID>
+	<category>Swizzle</category>
+	<parameter varname='a' type='__m128i'/>
+	<description>Broadcast the low packed 32-bit integer from "a" to all elements of "dst".</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*32
+	dst[i+31:i] := a[31:0]
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name='vpbroadcastd' form='ymm, xmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='AVX2' rettype='__m128i' name='_mm_broadcastq_epi64'>
+	<type>Integer</type>
+	<CPUID>AVX2</CPUID>
+	<category>Swizzle</category>
+	<parameter varname='a' type='__m128i'/>
+	<description>Broadcast the low packed 64-bit integer from "a" to all elements of "dst". </description>
+	<operation>
+FOR j := 0 to 1
+	i := j*64
+	dst[i+63:i] := a[63:0]
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name='vpbroadcastq' form='xmm, xmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='AVX2' rettype='__m256i' name='_mm256_broadcastq_epi64'>
+	<type>Integer</type>
+	<CPUID>AVX2</CPUID>
+	<category>Swizzle</category>
+	<parameter varname='a' type='__m128i'/>
+	<description>Broadcast the low packed 64-bit integer from "a" to all elements of "dst". </description>
+	<operation>
+FOR j := 0 to 3
+	i := j*64
+	dst[i+63:i] := a[63:0]
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name='vpbroadcastq' form='ymm, xmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='AVX2' vexEq='TRUE' rettype='__m128d' name='_mm_broadcastsd_pd'>
+	<type>Floating Point</type>
+	<CPUID>AVX2</CPUID>
+	<category>Swizzle</category>
+	<parameter varname='a' type='__m128d'/>
+	<description>Broadcast the low double-precision (64-bit) floating-point element from "a" to all elements of "dst".</description>
+	<operation>
+FOR j := 0 to 1
+	i := j*64
+	dst[i+63:i] := a[63:0]
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name='movddup' form='xmm, xmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='AVX2' rettype='__m256d' name='_mm256_broadcastsd_pd'>
+	<type>Floating Point</type>
+	<CPUID>AVX2</CPUID>
+	<category>Swizzle</category>
+	<parameter varname='a' type='__m128d'/>
+	<description>Broadcast the low double-precision (64-bit) floating-point element from "a" to all elements of "dst".</description>
+	<operation>
+FOR j := 0 to 3
+	i := j*64
+	dst[i+63:i] := a[63:0]
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name='vbroadcastsd' form='ymm, xmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='AVX2' rettype='__m256i' name='_mm_broadcastsi128_si256'>
+	<type>Integer</type>
+	<CPUID>AVX2</CPUID>
+	<category>Swizzle</category>
+	<parameter varname='a' type='__m128i'/>
+	<description>Broadcast 128 bits of integer data from "a" to all 128-bit lanes in "dst".
+	</description>
+	<operation>
+dst[127:0] := a[127:0]
+dst[255:128] := a[127:0]
+dst[MAX:256] := 0
+	</operation>
+	<instruction name='vbroadcasti128' form='ymm, m128'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='AVX2' rettype='__m256i' name='_mm256_broadcastsi128_si256'>
+	<type>Integer</type>
+	<CPUID>AVX2</CPUID>
+	<category>Swizzle</category>
+	<parameter varname='a' type='__m128i'/>
+	<description>Broadcast 128 bits of integer data from "a" to all 128-bit lanes in "dst".
+	</description>
+	<operation>
+dst[127:0] := a[127:0]
+dst[255:128] := a[127:0]
+dst[MAX:256] := 0
+	</operation>
+	<instruction name='vbroadcasti128' form='ymm, m128'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='AVX2' rettype='__m128' name='_mm_broadcastss_ps'>
+	<type>Floating Point</type>
+	<CPUID>AVX2</CPUID>
+	<category>Swizzle</category>
+	<parameter varname='a' type='__m128'/>
+	<description>Broadcast the low single-precision (32-bit) floating-point element from "a" to all elements of "dst".</description>
+	<operation>
+FOR j := 0 to 3
+	i := j*32
+	dst[i+31:i] := a[31:0]
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name='vbroadcastss' form='xmm, xmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='AVX2' rettype='__m256' name='_mm256_broadcastss_ps'>
+	<type>Floating Point</type>
+	<CPUID>AVX2</CPUID>
+	<category>Swizzle</category>
+	<parameter varname='a' type='__m128'/>
+	<description>Broadcast the low single-precision (32-bit) floating-point element from "a" to all elements of "dst".</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*32
+	dst[i+31:i] := a[31:0]
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name='vbroadcastss' form='ymm, xmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='AVX2' rettype='__m128i' name='_mm_broadcastw_epi16'>
+	<type>Integer</type>
+	<CPUID>AVX2</CPUID>
+	<category>Swizzle</category>
+	<parameter varname='a' type='__m128i'/>
+	<description>Broadcast the low packed 16-bit integer from "a" to all elements of "dst".</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*16
+	dst[i+15:i] := a[15:0]
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name='vpbroadcastw' form='xmm, xmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='AVX2' rettype='__m256i' name='_mm256_broadcastw_epi16'>
+	<type>Integer</type>
+	<CPUID>AVX2</CPUID>
+	<category>Swizzle</category>
+	<parameter varname='a' type='__m128i'/>
+	<description>Broadcast the low packed 16-bit integer from "a" to all elements of "dst".</description>
+	<operation>
+FOR j := 0 to 15
+	i := j*16
+	dst[i+15:i] := a[15:0]
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name='vpbroadcastw' form='ymm, xmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='AVX2' rettype='__m256i' name='_mm256_cmpeq_epi8'>
+	<type>Integer</type>
+	<CPUID>AVX2</CPUID>
+	<category>Compare</category>
+	<parameter varname='a' type='__m256i'/>
+	<parameter varname='b' type='__m256i'/>
+	<description>Compare packed 8-bit integers in "a" and "b" for equality, and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 31
+	i := j*8
+	dst[i+7:i] := ( a[i+7:i] == b[i+7:i] ) ? 0xFF : 0
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name='vpcmpeqb' form='ymm, ymm, ymm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='AVX2' rettype='__m256i' name='_mm256_cmpeq_epi16'>
+	<type>Integer</type>
+	<CPUID>AVX2</CPUID>
+	<category>Compare</category>
+	<parameter varname='a' type='__m256i'/>
+	<parameter varname='b' type='__m256i'/>
+	<description>Compare packed 16-bit integers in "a" and "b" for equality, and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 15
+	i := j*16
+	dst[i+15:i] := ( a[i+15:i] == b[i+15:i] ) ? 0xFFFF : 0
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name='vpcmpeqw' form='ymm, ymm, ymm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='AVX2' rettype='__m256i' name='_mm256_cmpeq_epi32'>
+	<type>Integer</type>
+	<CPUID>AVX2</CPUID>
+	<category>Compare</category>
+	<parameter varname='a' type='__m256i'/>
+	<parameter varname='b' type='__m256i'/>
+	<description>Compare packed 32-bit integers in "a" and "b" for equality, and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*32
+	dst[i+31:i] := ( a[i+31:i] == b[i+31:i] ) ? 0xFFFFFFFF : 0
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name='vpcmpeqd' form='ymm, ymm, ymm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='AVX2' rettype='__m256i' name='_mm256_cmpeq_epi64'>
+	<type>Integer</type>
+	<CPUID>AVX2</CPUID>
+	<category>Compare</category>
+	<parameter varname='a' type='__m256i'/>
+	<parameter varname='b' type='__m256i'/>
+	<description>Compare packed 64-bit integers in "a" and "b" for equality, and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 3
+	i := j*64
+	dst[i+63:i] := ( a[i+63:i] == b[i+63:i] ) ? 0xFFFFFFFFFFFFFFFF : 0
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name='vpcmpeqq' form='ymm, ymm, ymm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='AVX2' rettype='__m256i' name='_mm256_cmpgt_epi8'>
+	<type>Integer</type>
+	<CPUID>AVX2</CPUID>
+	<category>Compare</category>
+	<parameter varname='a' type='__m256i'/>
+	<parameter varname='b' type='__m256i'/>
+	<description>Compare packed 8-bit integers in "a" and "b" for greater-than, and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 31
+	i := j*8
+	dst[i+7:i] := ( a[i+7:i] &gt; b[i+7:i] ) ? 0xFF : 0
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name='vpcmpgtb' form='ymm, ymm, ymm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='AVX2' rettype='__m256i' name='_mm256_cmpgt_epi16'>
+	<type>Integer</type>
+	<CPUID>AVX2</CPUID>
+	<category>Compare</category>
+	<parameter varname='a' type='__m256i'/>
+	<parameter varname='b' type='__m256i'/>
+	<description>Compare packed 16-bit integers in "a" and "b" for greater-than, and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 15
+	i := j*16
+	dst[i+15:i] := ( a[i+15:i] &gt; b[i+15:i] ) ? 0xFFFF : 0
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name='vpcmpgtw' form='ymm, ymm, ymm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='AVX2' rettype='__m256i' name='_mm256_cmpgt_epi32'>
+	<type>Integer</type>
+	<CPUID>AVX2</CPUID>
+	<category>Compare</category>
+	<parameter varname='a' type='__m256i'/>
+	<parameter varname='b' type='__m256i'/>
+	<description>Compare packed 32-bit integers in "a" and "b" for greater-than, and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*32
+	dst[i+31:i] := ( a[i+31:i] &gt; b[i+31:i] ) ? 0xFFFFFFFF : 0
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name='vpcmpgtd' form='ymm, ymm, ymm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='AVX2' rettype='__m256i' name='_mm256_cmpgt_epi64'>
+	<type>Integer</type>
+	<CPUID>AVX2</CPUID>
+	<category>Compare</category>
+	<parameter varname='a' type='__m256i'/>
+	<parameter varname='b' type='__m256i'/>
+	<description>Compare packed 64-bit integers in "a" and "b" for greater-than, and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 3
+	i := j*64
+	dst[i+63:i] := ( a[i+63:i] &gt; b[i+63:i] ) ? 0xFFFFFFFFFFFFFFFF : 0
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name='vpcmpgtq' form='ymm, ymm, ymm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='AVX2' rettype='__m256i' name='_mm256_cvtepi16_epi32'>
+	<type>Integer</type>
+	<CPUID>AVX2</CPUID>
+	<category>Convert</category>
+	<parameter varname='a' type='__m128i'/>
+	<description>Sign extend packed 16-bit integers in "a" to packed 32-bit integers, and store the results in "dst".</description>
+	<operation>
+FOR j:= 0 to 7
+	i := 32*j
+	k := 16*j
+	dst[i+31:i] := SignExtend(a[k+15:k])
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name='vpmovsxwd' form='ymm, xmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='AVX2' rettype='__m256i' name='_mm256_cvtepi16_epi64'>
+	<type>Integer</type>
+	<CPUID>AVX2</CPUID>
+	<category>Convert</category>
+	<parameter varname='a' type='__m128i'/>
+	<description>Sign extend packed 16-bit integers in "a" to packed 64-bit integers, and store the results in "dst".</description>
+	<operation>
+FOR j:= 0 to 3
+	i := 64*j
+	k := 16*j
+	dst[i+63:i] := SignExtend(a[k+15:k])
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name='vpmovsxwq' form='ymm, xmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='AVX2' rettype='__m256i' name='_mm256_cvtepi32_epi64'>
+	<type>Integer</type>
+	<CPUID>AVX2</CPUID>
+	<category>Convert</category>
+	<parameter varname='a' type='__m128i'/>
+	<description>Sign extend packed 32-bit integers in "a" to packed 64-bit integers, and store the results in "dst".</description>
+	<operation>
+FOR j:= 0 to 3
+	i := 64*j
+	k := 32*j
+	dst[i+63:i] := SignExtend(a[k+31:k])
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name='vpmovsxdq' form='ymm, xmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='AVX2' rettype='__m256i' name='_mm256_cvtepi8_epi16'>
+	<type>Integer</type>
+	<CPUID>AVX2</CPUID>
+	<category>Convert</category>
+	<parameter varname='a' type='__m128i'/>
+	<description>Sign extend packed 8-bit integers in "a" to packed 16-bit integers, and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 15
+	i := j*8
+	l := j*16
+	dst[l+15:l] := SignExtend(a[i+7:i])
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name='vpmovsxbw' form='ymm, xmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='AVX2' rettype='__m256i' name='_mm256_cvtepi8_epi32'>
+	<type>Integer</type>
+	<CPUID>AVX2</CPUID>
+	<category>Convert</category>
+	<parameter varname='a' type='__m128i'/>
+	<description>Sign extend packed 8-bit integers in "a" to packed 32-bit integers, and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 7
+	i := 32*j
+	k := 8*j
+	dst[i+31:i] := SignExtend(a[k+7:k])
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name='vpmovsxbd' form='ymm, xmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='AVX2' rettype='__m256i' name='_mm256_cvtepi8_epi64'>
+	<type>Integer</type>
+	<CPUID>AVX2</CPUID>
+	<category>Convert</category>
+	<parameter varname='a' type='__m128i'/>
+	<description>Sign extend packed 8-bit integers in the low 4 bytes of "a" to packed 64-bit integers, and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 3
+	i := 64*j
+	k := 8*j
+	dst[i+63:i] := SignExtend(a[k+7:k])
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name='vpmovsxbq' form='ymm, xmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='AVX2' rettype='__m256i' name='_mm256_cvtepu16_epi32'>
+	<type>Integer</type>
+	<CPUID>AVX2</CPUID>
+	<category>Convert</category>
+	<parameter varname='a' type='__m128i'/>
+	<description>Zero extend packed unsigned 16-bit integers in "a" to packed 32-bit integers, and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 7
+	i := 32*j
+	k := 16*j
+	dst[i+31:i] := ZeroExtend(a[k+15:k])
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name='vpmovzxwd' form='ymm, xmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='AVX2' rettype='__m256i' name='_mm256_cvtepu16_epi64'>
+	<type>Integer</type>
+	<CPUID>AVX2</CPUID>
+	<category>Convert</category>
+	<parameter varname='a' type='__m128i'/>
+	<description>Zero extend packed unsigned 16-bit integers in "a" to packed 64-bit integers, and store the results in "dst".</description>
+	<operation>
+FOR j:= 0 to 3
+	i := 64*j
+	k := 16*j
+	dst[i+63:i] := ZeroExtend(a[k+15:k])
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name='vpmovzxwq' form='ymm, xmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='AVX2' rettype='__m256i' name='_mm256_cvtepu32_epi64'>
+	<type>Integer</type>
+	<CPUID>AVX2</CPUID>
+	<category>Convert</category>
+	<parameter varname='a' type='__m128i'/>
+	<description>Zero extend packed unsigned 32-bit integers in "a" to packed 64-bit integers, and store the results in "dst".</description>
+	<operation>
+FOR j:= 0 to 3
+	i := 64*j
+	k := 32*j
+	dst[i+63:i] := ZeroExtend(a[k+31:k])
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name='vpmovzxdq' form='ymm, xmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='AVX2' rettype='__m256i' name='_mm256_cvtepu8_epi16'>
+	<type>Integer</type>
+		 <CPUID>AVX2</CPUID>
+	<category>Convert</category>
+	<parameter varname='a' type='__m128i'/>
+	<description>Zero extend packed unsigned 8-bit integers in "a" to packed 16-bit integers, and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 15
+	i := j*8
+	l := j*16
+	dst[l+15:l] := ZeroExtend(a[i+7:i])
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name='vpmovzxbw' form='ymm, xmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='AVX2' rettype='__m256i' name='_mm256_cvtepu8_epi32'>
+	<type>Integer</type>
+	<CPUID>AVX2</CPUID>
+	<category>Convert</category>
+	<parameter varname='a' type='__m128i'/>
+	<description>Zero extend packed unsigned 8-bit integers in "a" to packed 32-bit integers, and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 7
+	i := 32*j
+	k := 8*j
+	dst[i+31:i] := ZeroExtend(a[k+7:k])
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name='vpmovzxbd' form='ymm, xmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='AVX2' rettype='__m256i' name='_mm256_cvtepu8_epi64'>
+	<type>Integer</type>
+	<CPUID>AVX2</CPUID>
+	<category>Convert</category>
+	<parameter varname='a' type='__m128i'/>
+	<description>Zero extend packed unsigned 8-bit integers in the low 4 bytes of "a" to packed 64-bit integers, and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 3
+	i := 64*j
+	k := 8*j
+	dst[i+63:i] := ZeroExtend(a[k+7:k])
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name='vpmovzxbq' form='ymm, xmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='AVX2' rettype='__m128i' name='_mm256_extracti128_si256'>
+	<type>Integer</type>
+	<CPUID>AVX2</CPUID>
+	<category>Swizzle</category>
+	<parameter varname='a' type='__m256i'/>
+	<parameter varname="imm8" type='const int'/>
+	<description>Extract 128 bits (composed of integer data) from "a", selected with "imm8", and store the result in "dst".</description>
+	<operation>
+CASE imm8[7:0] of
+0: dst[127:0] := a[127:0]
+1: dst[127:0] := a[255:128]
+ESAC
+dst[MAX:128] := 0
+	</operation>
+	<instruction name='vextracti128' form='xmm, ymm, imm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='AVX2' rettype='__m256i' name='_mm256_hadd_epi16'>
+	<type>Integer</type>
+	<CPUID>AVX2</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname='a' type='__m256i'/>
+	<parameter varname='b' type='__m256i'/>
+	<description>Horizontally add adjacent pairs of 16-bit integers in "a" and "b", and pack the signed 16-bit results in "dst".</description>
+	<operation>
+dst[15:0] := a[31:16] + a[15:0]
+dst[31:16] := a[63:48] + a[47:32]
+dst[47:32] := a[95:80] + a[79:64]
+dst[63:48] := a[127:112] + a[111:96]
+dst[79:64] := b[31:16] + b[15:0]
+dst[95:80] := b[63:48] + b[47:32]
+dst[111:96] := b[95:80] + b[79:64]
+dst[127:112] := b[127:112] + b[111:96]
+dst[143:128] := a[159:144] + a[143:128]
+dst[159:144] := a[191:176] + a[175:160]
+dst[175:160] := a[223:208] + a[207:192]
+dst[191:176] := a[255:240] + a[239:224]
+dst[207:192] := b[159:144] + b[143:128]
+dst[223:208] := b[191:176] + b[175:160]
+dst[239:224] := b[223:208] + b[207:192]
+dst[255:240] := b[255:240] + b[239:224]
+dst[MAX:256] := 0
+	</operation>
+	<instruction name='vphaddw' form='ymm, ymm, ymm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='AVX2' rettype='__m256i' name='_mm256_hadd_epi32'>
+	<type>Integer</type>
+	<CPUID>AVX2</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname='a' type='__m256i'/>
+	<parameter varname='b' type='__m256i'/>
+	<description>Horizontally add adjacent pairs of 32-bit integers in "a" and "b", and pack the signed 32-bit results in "dst".</description>
+	<operation>
+dst[31:0] := a[63:32] + a[31:0]
+dst[63:32] := a[127:96] + a[95:64]
+dst[95:64] := b[63:32] + b[31:0]
+dst[127:96] := b[127:96] + b[95:64]
+dst[159:128] := a[191:160] + a[159:128]
+dst[191:160] := a[255:224] + a[223:192]
+dst[223:192] := b[191:160] + b[159:128]
+dst[255:224] := b[255:224] + b[223:192]
+dst[MAX:256] := 0
+	</operation>
+	<instruction name='vphaddd' form='ymm, ymm, ymm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='AVX2' rettype='__m256i' name='_mm256_hadds_epi16'>
+	<type>Integer</type>
+	<CPUID>AVX2</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname='a' type='__m256i'/>
+	<parameter varname='b' type='__m256i'/>
+	<description>Horizontally add adjacent pairs of 16-bit integers in "a" and "b" using saturation, and pack the signed 16-bit results in "dst".</description>
+	<operation>
+dst[15:0]= Saturate_To_Int16(a[31:16] + a[15:0])
+dst[31:16] = Saturate_To_Int16(a[63:48] + a[47:32])
+dst[47:32] = Saturate_To_Int16(a[95:80] + a[79:64])
+dst[63:48] = Saturate_To_Int16(a[127:112] + a[111:96])
+dst[79:64] = Saturate_To_Int16(b[31:16] + b[15:0])
+dst[95:80] = Saturate_To_Int16(b[63:48] + b[47:32])
+dst[111:96] = Saturate_To_Int16(b[95:80] + b[79:64])
+dst[127:112] = Saturate_To_Int16(b[127:112] + b[111:96])
+dst[143:128] = Saturate_To_Int16(a[159:144] + a[143:128])
+dst[159:144] = Saturate_To_Int16(a[191:176] + a[175:160])
+dst[175:160] = Saturate_To_Int16( a[223:208] + a[207:192])
+dst[191:176] = Saturate_To_Int16(a[255:240] + a[239:224])
+dst[207:192] = Saturate_To_Int16(b[159:144] + b[143:128])
+dst[223:208] = Saturate_To_Int16(b[191:176] + b[175:160])
+dst[239:224] = Saturate_To_Int16(b[223:208] + b[207:192])
+dst[255:240] = Saturate_To_Int16(b[255:240] + b[239:224])
+dst[MAX:256] := 0
+	</operation>
+	<instruction name='vphaddsw' form='ymm, ymm, ymm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='AVX2' rettype='__m256i' name='_mm256_hsub_epi16'>
+	<type>Integer</type>
+	<CPUID>AVX2</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname='a' type='__m256i'/>
+	<parameter varname='b' type='__m256i'/>
+	<description>Horizontally subtract adjacent pairs of 16-bit integers in "a" and "b", and pack the signed 16-bit results in "dst".</description>
+	<operation>
+dst[15:0] := a[15:0] - a[31:16]
+dst[31:16] := a[47:32] - a[63:48]
+dst[47:32] := a[79:64] - a[95:80]
+dst[63:48] := a[111:96] - a[127:112]
+dst[79:64] := b[15:0] - b[31:16]
+dst[95:80] := b[47:32] - b[63:48]
+dst[111:96] := b[79:64] - b[95:80]
+dst[127:112] := b[111:96] - b[127:112]
+dst[143:128] := a[143:128] - a[159:144]
+dst[159:144] := a[175:160] - a[191:176]
+dst[175:160] := a[207:192] - a[223:208]
+dst[191:176] := a[239:224] - a[255:240]
+dst[207:192] := b[143:128] - b[159:144]
+dst[223:208] := b[175:160] - b[191:176]
+dst[239:224] := b[207:192] - b[223:208]
+dst[255:240] := b[239:224] - b[255:240]
+dst[MAX:256] := 0
+	</operation>
+	<instruction name='vphsubw' form='ymm, ymm, ymm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='AVX2' rettype='__m256i' name='_mm256_hsub_epi32'>
+	<type>Integer</type>
+	<CPUID>AVX2</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname='a' type='__m256i'/>
+	<parameter varname='b' type='__m256i'/>
+	<description>Horizontally subtract adjacent pairs of 32-bit integers in "a" and "b", and pack the signed 32-bit results in "dst".</description>
+	<operation>
+dst[31:0] := a[31:0] - a[63:32]
+dst[63:32] := a[95:64] - a[127:96]
+dst[95:64] := b[31:0] - b[63:32]
+dst[127:96] := b[95:64] - b[127:96]
+dst[159:128] := a[159:128] - a[191:160]
+dst[191:160] := a[223:192] - a[255:224]
+dst[223:192] := b[159:128] - b[191:160]
+dst[255:224] := b[223:192] - b[255:224]
+dst[MAX:256] := 0
+	</operation>
+	<instruction name='vphsubd' form='ymm, ymm, ymm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='AVX2' rettype='__m256i' name='_mm256_hsubs_epi16'>
+	<type>Integer</type>
+	<CPUID>AVX2</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname='a' type='__m256i'/>
+	<parameter varname='b' type='__m256i'/>
+	<description>Horizontally subtract adjacent pairs of 16-bit integers in "a" and "b" using saturation, and pack the signed 16-bit results in "dst".</description>
+	<operation>
+dst[15:0] := Saturate_To_Int16(a[15:0] - a[31:16])
+dst[31:16] := Saturate_To_Int16(a[47:32] - a[63:48])
+dst[47:32] := Saturate_To_Int16(a[79:64] - a[95:80])
+dst[63:48] := Saturate_To_Int16(a[111:96] - a[127:112])
+dst[79:64] := Saturate_To_Int16(b[15:0] - b[31:16])
+dst[95:80] := Saturate_To_Int16(b[47:32] - b[63:48])
+dst[111:96] := Saturate_To_Int16(b[79:64] - b[95:80])
+dst[127:112] := Saturate_To_Int16(b[111:96] - b[127:112])
+dst[143:128] := Saturate_To_Int16(a[143:128] - a[159:144])
+dst[159:144] := Saturate_To_Int16(a[175:160] - a[191:176])
+dst[175:160] := Saturate_To_Int16(a[207:192] - a[223:208])
+dst[191:176] := Saturate_To_Int16(a[239:224] - a[255:240])
+dst[207:192] := Saturate_To_Int16(b[143:128] - b[159:144])
+dst[223:208] := Saturate_To_Int16(b[175:160] - b[191:176])
+dst[239:224] := Saturate_To_Int16(b[207:192] - b[223:208])
+dst[255:240] := Saturate_To_Int16(b[239:224] - b[255:240])
+dst[MAX:256] := 0
+	</operation>
+	<instruction name='vphsubsw' form='ymm, ymm, ymm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='AVX2' rettype='__m128d' name='_mm_i32gather_pd'>
+	<type>Floating Point</type>
+	<CPUID>AVX2</CPUID>
+	<category>Load</category>
+	<parameter varname='base_addr' type='double const*'/>
+	<parameter varname='vindex' type='__m128i'/>
+	<parameter varname='scale' type='const int'/>
+	<description>
+	Gather double-precision (64-bit) floating-point elements from memory using 32-bit indices. 64-bit elements are loaded from addresses starting at "base_addr" and offset by each 32-bit element in "vindex" (each index is scaled by the factor in "scale"). Gathered elements are merged into "dst". "scale" should be 1, 2, 4 or 8.
+	</description>
+	<operation>
+FOR j := 0 to 1
+	i := j*64
+	m := j*32
+	dst[i+63:i] := MEM[base_addr + SignExtend(vindex[m+31:m])*scale]
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name='vgatherdpd' form='xmm, vm32x, xmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='AVX2' rettype='__m256d' name='_mm256_i32gather_pd'>
+	<type>Floating Point</type>
+	<CPUID>AVX2</CPUID>
+	<category>Load</category>
+	<parameter varname='base_addr' type='double const*'/>
+	<parameter varname='vindex' type='__m128i'/>
+	<parameter varname='scale' type='const int'/>
+	<description>
+	Gather double-precision (64-bit) floating-point elements from memory using 32-bit indices. 64-bit elements are loaded from addresses starting at "base_addr" and offset by each 32-bit element in "vindex" (each index is scaled by the factor in "scale"). Gathered elements are merged into "dst". "scale" should be 1, 2, 4 or 8.
+	</description>
+	<operation>
+FOR j := 0 to 3
+	i := j*64
+	m := j*32
+	dst[i+63:i] := MEM[base_addr + SignExtend(vindex[m+31:m])*scale]
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name='vgatherdpd' form='ymm, vm32x, ymm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='AVX2' rettype='__m128' name='_mm_i32gather_ps'>
+	<type>Floating Point</type>
+	<CPUID>AVX2</CPUID>
+	<category>Load</category>
+	<parameter varname='base_addr' type='float const*'/>
+	<parameter varname='vindex' type='__m128i'/>
+	<parameter varname='scale' type='const int'/>
+	<description>Gather single-precision (32-bit) floating-point elements from memory using 32-bit indices. 32-bit elements are loaded from addresses starting at "base_addr" and offset by each 32-bit element in "vindex" (each index is scaled by the factor in "scale"). Gathered elements are merged into "dst". "scale" should be 1, 2, 4 or 8.</description>
+	<operation>
+FOR j := 0 to 3
+	i := j*32
+	dst[i+31:i] := MEM[base_addr + SignExtend(vindex[i+31:i])*scale]
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name='vgatherdps' form='xmm, vm32x, xmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='AVX2' rettype='__m256' name='_mm256_i32gather_ps'>
+	<type>Floating Point</type>
+	<CPUID>AVX2</CPUID>
+	<category>Load</category>
+	<parameter varname='base_addr' type='float const*'/>
+	<parameter varname='vindex' type='__m256i'/>
+	<parameter varname='scale' type='const int'/>
+	<description>Gather single-precision (32-bit) floating-point elements from memory using 32-bit indices. 32-bit elements are loaded from addresses starting at "base_addr" and offset by each 32-bit element in "vindex" (each index is scaled by the factor in "scale"). Gathered elements are merged into "dst". "scale" should be 1, 2, 4 or 8.</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*32
+	dst[i+31:i] := MEM[base_addr + SignExtend(vindex[i+31:i])*scale]
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name='vgatherdps' form='ymm, vm32x, ymm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='AVX2' rettype='__m128i' name='_mm_i32gather_epi32'>
+	<type>Integer</type>
+	<CPUID>AVX2</CPUID>
+	<category>Load</category>
+	<parameter varname='base_addr' type='int const*'/>
+	<parameter varname='vindex' type='__m128i'/>
+	<parameter varname='scale' type='const int'/>
+	<description>
+	Gather 32-bit integers from memory using 32-bit indices. 32-bit elements are loaded from addresses starting at "base_addr" and offset by each 32-bit element in "vindex" (each index is scaled by the factor in "scale"). Gathered elements are merged into "dst". "scale" should be 1, 2, 4 or 8.
+	</description>
+	<operation>
+FOR j := 0 to 3
+	i := j*32
+	dst[i+31:i] := MEM[base_addr + SignExtend(vindex[i+31:i])*scale]
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name='vpgatherdd' form='xmm, vm32x, xmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='AVX2' rettype='__m256i' name='_mm256_i32gather_epi32'>
+	<type>Integer</type>
+	<CPUID>AVX2</CPUID>
+	<category>Load</category>
+	<parameter varname='base_addr' type='int const*'/>
+	<parameter varname='vindex' type='__m256i'/>
+	<parameter varname='scale' type='const int'/>
+	<description>
+	Gather 32-bit integers from memory using 32-bit indices. 32-bit elements are loaded from addresses starting at "base_addr" and offset by each 32-bit element in "vindex" (each index is scaled by the factor in "scale"). Gathered elements are merged into "dst". "scale" should be 1, 2, 4 or 8.
+	</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*32
+	dst[i+31:i] := MEM[base_addr + SignExtend(vindex[i+31:i])*scale]
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name='vpgatherdd' form='ymm, vm32x, ymm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='AVX2' rettype='__m128i' name='_mm_i32gather_epi64'>
+	<type>Integer</type>
+	<CPUID>AVX2</CPUID>
+	<category>Load</category>
+	<parameter varname='base_addr' type='__int64 const*'/>
+	<parameter varname='vindex' type='__m128i'/>
+	<parameter varname='scale' type='const int'/>
+	<description>Gather 64-bit integers from memory using 32-bit indices. 64-bit elements are loaded from addresses starting at "base_addr" and offset by each 32-bit element in "vindex" (each index is scaled by the factor in "scale"). Gathered elements are merged into "dst". "scale" should be 1, 2, 4 or 8.
+	</description>
+	<operation>
+FOR j := 0 to 1
+	i := j*64
+	m := j*32
+	dst[i+63:i] := MEM[base_addr + SignExtend(vindex[m+31:m])*scale]
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name='vpgatherdq' form='xmm, vm32x, xmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='AVX2' rettype='__m256i' name='_mm256_i32gather_epi64'>
+	<type>Integer</type>
+	<CPUID>AVX2</CPUID>
+	<category>Load</category>
+	<parameter varname='base_addr' type='__int64 const*'/>
+	<parameter varname='vindex' type='__m128i'/>
+	<parameter varname='scale' type='const int'/>
+	<description>Gather 64-bit integers from memory using 32-bit indices. 64-bit elements are loaded from addresses starting at "base_addr" and offset by each 32-bit element in "vindex" (each index is scaled by the factor in "scale"). Gathered elements are merged into "dst". "scale" should be 1, 2, 4 or 8.
+	</description>
+	<operation>
+FOR j := 0 to 3
+	i := j*64
+	m := j*32
+	dst[i+63:i] := MEM[base_addr + SignExtend(vindex[m+31:m])*scale]
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name='vpgatherdq' form='ymm, vm32x, ymm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='AVX2' rettype='__m128d' name='_mm_i64gather_pd'>
+	<type>Floating Point</type>
+	<CPUID>AVX2</CPUID>
+	<category>Load</category>
+	<parameter varname='base_addr' type='double const*'/>
+	<parameter varname='vindex' type='__m128i'/>
+	<parameter varname='scale' type='const int'/>
+	<description>
+	Gather double-precision (64-bit) floating-point elements from memory using 64-bit indices. 64-bit elements are loaded from addresses starting at "base_addr" and offset by each 64-bit element in "vindex" (each index is scaled by the factor in "scale"). Gathered elements are merged into "dst". "scale" should be 1, 2, 4 or 8.
+	</description>
+	<operation>
+FOR j := 0 to 1
+	i := j*64
+	dst[i+63:i] := MEM[base_addr + SignExtend(vindex[i+63:i])*scale]
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name='vgatherqpd' form='xmm, vm64x, xmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='AVX2' rettype='__m256d' name='_mm256_i64gather_pd'>
+	<type>Floating Point</type>
+	<CPUID>AVX2</CPUID>
+	<category>Load</category>
+	<parameter varname='base_addr' type='double const*'/>
+	<parameter varname='vindex' type='__m256i'/>
+	<parameter varname='scale' type='const int'/>
+	<description>
+	Gather double-precision (64-bit) floating-point elements from memory using 64-bit indices. 64-bit elements are loaded from addresses starting at "base_addr" and offset by each 64-bit element in "vindex" (each index is scaled by the factor in "scale"). Gathered elements are merged into "dst". "scale" should be 1, 2, 4 or 8.
+	</description>
+	<operation>
+FOR j := 0 to 3
+	i := j*64
+	dst[i+63:i] := MEM[base_addr + SignExtend(vindex[i+63:i])*scale]
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name='vgatherqpd' form='ymm, vm64x, ymm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='AVX2' rettype='__m128' name='_mm_i64gather_ps'>
+	<type>Floating Point</type>
+	<CPUID>AVX2</CPUID>
+	<category>Load</category>
+	<parameter varname='base_addr' type='float const*'/>
+	<parameter varname='vindex' type='__m128i'/>
+	<parameter varname='scale' type='const int'/>
+	<description>
+	Gather single-precision (32-bit) floating-point elements from memory using 64-bit indices. 32-bit elements are loaded from addresses starting at "base_addr" and offset by each 64-bit element in "vindex" (each index is scaled by the factor in "scale"). Gathered elements are merged into "dst". "scale" should be 1, 2, 4 or 8.
+	</description>
+	<operation>
+FOR j := 0 to 1
+	i := j*32
+	m := j*64
+	dst[i+31:i] := MEM[base_addr + SignExtend(vindex[m+63:m])*scale]
+ENDFOR
+dst[MAX:64] := 0
+	</operation>
+	<instruction name='vgatherqps' form='xmm, vm64x, xmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='AVX2' rettype='__m128' name='_mm256_i64gather_ps'>
+	<type>Floating Point</type>
+	<CPUID>AVX2</CPUID>
+	<category>Load</category>
+	<parameter varname='base_addr' type='float const*'/>
+	<parameter varname='vindex' type='__m256i'/>
+	<parameter varname='scale' type='const int'/>
+	<description>
+	Gather single-precision (32-bit) floating-point elements from memory using 64-bit indices. 32-bit elements are loaded from addresses starting at "base_addr" and offset by each 64-bit element in "vindex" (each index is scaled by the factor in "scale"). Gathered elements are merged into "dst". "scale" should be 1, 2, 4 or 8.
+	</description>
+	<operation>
+FOR j := 0 to 3
+	i := j*32
+	m := j*64
+	dst[i+31:i] := MEM[base_addr + SignExtend(vindex[m+63:m])*scale]
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name='vgatherqps' form='ymm, vm64x, ymm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='AVX2' rettype='__m128i' name='_mm_i64gather_epi32'>
+	<type>Integer</type>
+	<CPUID>AVX2</CPUID>
+	<category>Load</category>
+	<parameter varname='base_addr' type='int const*'/>
+	<parameter varname='vindex' type='__m128i'/>
+	<parameter varname='scale' type='const int'/>
+	<description>Gather 32-bit integers from memory using 64-bit indices. 32-bit elements are loaded from addresses starting at "base_addr" and offset by each 64-bit element in "vindex" (each index is scaled by the factor in "scale"). Gathered elements are merged into "dst". "scale" should be 1, 2, 4 or 8.
+	</description>
+	<operation>
+FOR j := 0 to 1
+	i := j*32
+	m := j*64
+	dst[i+31:i] := MEM[base_addr + SignExtend(vindex[m+63:m])*scale]
+ENDFOR
+dst[MAX:64] := 0
+	</operation>
+	<instruction name='vpgatherqd' form='xmm, vm64x, xmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='AVX2' rettype='__m128i' name='_mm256_i64gather_epi32'>
+	<type>Integer</type>
+	<CPUID>AVX2</CPUID>
+	<category>Load</category>
+	<parameter varname='base_addr' type='int const*'/>
+	<parameter varname='vindex' type='__m256i'/>
+	<parameter varname='scale' type='const int'/>
+	<description>Gather 32-bit integers from memory using 64-bit indices. 32-bit elements are loaded from addresses starting at "base_addr" and offset by each 64-bit element in "vindex" (each index is scaled by the factor in "scale"). Gathered elements are merged into "dst". "scale" should be 1, 2, 4 or 8.
+	</description>
+	<operation>
+FOR j := 0 to 3
+	i := j*32
+	m := j*64
+	dst[i+31:i] := MEM[base_addr + SignExtend(vindex[m+63:m])*scale]
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name='vpgatherqd' form='ymm, vm64x, ymm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='AVX2' rettype='__m128i' name='_mm_i64gather_epi64'>
+	<type>Integer</type>
+	<CPUID>AVX2</CPUID>
+	<category>Load</category>
+	<parameter varname='base_addr' type='__int64 const*'/>
+	<parameter varname='vindex' type='__m128i'/>
+	<parameter varname='scale' type='const int'/>
+	<description>Gather 64-bit integers from memory using 64-bit indices. 64-bit elements are loaded from addresses starting at "base_addr" and offset by each 64-bit element in "vindex" (each index is scaled by the factor in "scale"). Gathered elements are merged into "dst". "scale" should be 1, 2, 4 or 8.
+	</description>
+	<operation>
+FOR j := 0 to 1
+	i := j*64
+	dst[i+63:i] := MEM[base_addr + SignExtend(vindex[i+63:i])*scale]
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name='vpgatherqq' form='xmm, vm64x, xmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='AVX2' rettype='__m256i' name='_mm256_i64gather_epi64'>
+	<type>Integer</type>
+	<CPUID>AVX2</CPUID>
+	<category>Load</category>
+	<parameter varname='base_addr' type='__int64 const*'/>
+	<parameter varname='vindex' type='__m256i'/>
+	<parameter varname='scale' type='const int'/>
+	<description>Gather 64-bit integers from memory using 64-bit indices. 64-bit elements are loaded from addresses starting at "base_addr" and offset by each 64-bit element in "vindex" (each index is scaled by the factor in "scale"). Gathered elements are merged into "dst". "scale" should be 1, 2, 4 or 8.
+	</description>
+	<operation>
+FOR j := 0 to 3
+	i := j*64
+	dst[i+63:i] := MEM[base_addr + SignExtend(vindex[i+63:i])*scale]
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name='vpgatherqq' form='ymm, vm64x, ymm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='AVX2' rettype='__m256i' name='_mm256_inserti128_si256'>
+	<type>Integer</type>
+	<CPUID>AVX2</CPUID>
+	<category>Swizzle</category>
+	<parameter varname='a' type='__m256i'/>
+	<parameter varname='b' type='__m128i'/>
+	<parameter varname="imm8" type='const int'/>
+	<description>Copy "a" to "dst", then insert 128 bits (composed of integer data) from "b" into "dst" at the location specified by "imm8".</description>
+	<operation>
+dst[255:0] := a[255:0]
+CASE (imm8[0]) of
+0: dst[127:0] := b[127:0]
+1: dst[255:128] := b[127:0]
+ESAC
+dst[MAX:256] := 0
+	</operation>
+	<instruction name='vinserti128' form='ymm, ymm, xmm, imm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='AVX2' rettype='__m256i' name='_mm256_madd_epi16'>
+	<type>Integer</type>
+	<CPUID>AVX2</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname='a' type='__m256i'/>
+	<parameter varname='b' type='__m256i'/>
+	<description>Multiply packed signed 16-bit integers in "a" and "b", producing intermediate signed 32-bit integers. Horizontally add adjacent pairs of intermediate 32-bit integers, and pack the results in "dst".
+	</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*32
+	dst[i+31:i] := a[i+31:i+16]*b[i+31:i+16] + a[i+15:i]*b[i+15:i]
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name='vpmaddwd' form='ymm, ymm, ymm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='AVX2' rettype='__m256i' name='_mm256_maddubs_epi16'>
+	<type>Integer</type>
+	<CPUID>AVX2</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname='a' type='__m256i'/>
+	<parameter varname='b' type='__m256i'/>
+	<description>Vertically multiply each unsigned 8-bit integer from "a" with the corresponding signed 8-bit integer from "b", producing intermediate signed 16-bit integers. Horizontally add adjacent pairs of intermediate signed 16-bit integers, and pack the saturated results in "dst".
+	</description>
+	<operation>
+FOR j := 0 to 15
+	i := j*16
+	dst[i+15:i] := Saturate_To_Int16( a[i+15:i+8]*b[i+15:i+8] + a[i+7:i]*b[i+7:i] )
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name='vpmaddubsw' form='ymm, ymm, ymm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='AVX2' rettype='__m128d' name='_mm_mask_i32gather_pd'>
+	<type>Floating Point</type>
+	<CPUID>AVX2</CPUID>
+	<category>Load</category>
+	<parameter varname='src' type='__m128d'/>
+	<parameter varname='base_addr' type='double const*'/>
+	<parameter varname='vindex' type='__m128i'/>
+	<parameter varname='mask' type='__m128d'/>
+	<parameter varname='scale' type='const int'/>
+	<description>
+	Gather double-precision (64-bit) floating-point elements from memory using 32-bit indices. 64-bit elements are loaded from addresses starting at "base_addr" and offset by each 32-bit element in "vindex" (each index is scaled by the factor in "scale"). Gathered elements are merged into "dst" using "mask" (elements are copied from "src" when the highest bit is not set in the corresponding element). "scale" should be 1, 2, 4 or 8.
+	</description>
+	<operation>
+FOR j := 0 to 1
+	i := j*64
+	m := j*32
+	IF mask[i+63]
+		dst[i+63:i] := MEM[base_addr + SignExtend(vindex[m+31:m])*scale]
+		mask[i+63] := 0
+	ELSE
+		dst[i+63:i] := src[i+63:i]
+	FI
+ENDFOR
+mask[MAX:128] := 0
+dst[MAX:128] := 0
+	</operation>
+	<instruction name='vgatherdpd' form='xmm, vm32x, xmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='AVX2' rettype='__m256d' name='_mm256_mask_i32gather_pd'>
+	<type>Floating Point</type>
+	<CPUID>AVX2</CPUID>
+	<category>Load</category>
+	<parameter varname='src' type='__m256d'/>
+	<parameter varname='base_addr' type='double const*'/>
+	<parameter varname='vindex' type='__m128i'/>
+	<parameter varname='mask' type='__m256d'/>
+	<parameter varname='scale' type='const int'/>
+	<description>
+	Gather double-precision (64-bit) floating-point elements from memory using 32-bit indices. 64-bit elements are loaded from addresses starting at "base_addr" and offset by each 32-bit element in "vindex" (each index is scaled by the factor in "scale"). Gathered elements are merged into "dst" using "mask" (elements are copied from "src" when the highest bit is not set in the corresponding element). "scale" should be 1, 2, 4 or 8.
+	</description>
+	<operation>
+FOR j := 0 to 3
+	i := j*64
+	m := j*32
+	IF mask[i+63]
+		dst[i+63:i] := MEM[base_addr + SignExtend(vindex[m+31:m])*scale]
+		mask[i+63] := 0
+	ELSE
+		dst[i+63:i] := src[i+63:i]
+	FI
+ENDFOR
+mask[MAX:256] := 0
+dst[MAX:256] := 0
+	</operation>
+	<instruction name='vgatherdpd' form='ymm, vm32x, ymm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='AVX2' rettype='__m128' name='_mm_mask_i32gather_ps'>
+	<type>Floating Point</type>
+	<CPUID>AVX2</CPUID>
+	<category>Load</category>
+	<parameter varname='src' type='__m128'/>
+	<parameter varname='base_addr' type='float const*'/>
+	<parameter varname='vindex' type='__m128i'/>
+	<parameter varname='mask' type='__m128'/>
+	<parameter varname='scale' type='const int'/>
+	<description>
+	Gather single-precision (32-bit) floating-point elements from memory using 32-bit indices. 32-bit elements are loaded from addresses starting at "base_addr" and offset by each 32-bit element in "vindex" (each index is scaled by the factor in "scale"). Gathered elements are merged into "dst" using "mask" (elements are copied from "src" when the highest bit is not set in the corresponding element). "scale" should be 1, 2, 4 or 8.
+	</description>
+	<operation>
+FOR j := 0 to 3
+	i := j*32
+	IF mask[i+31]
+		dst[i+31:i] := MEM[base_addr + SignExtend(vindex[i+31:i])*scale]
+		mask[i+31] := 0
+	ELSE
+		dst[i+31:i] := src[i+31:i]
+	FI
+ENDFOR
+mask[MAX:128] := 0
+dst[MAX:128] := 0
+	</operation>
+	<instruction name='vgatherdps' form='xmm, vm32x, xmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='AVX2' rettype='__m256' name='_mm256_mask_i32gather_ps'>
+	<type>Floating Point</type>
+	<CPUID>AVX2</CPUID>
+	<category>Load</category>
+	<parameter varname='src' type='__m256'/>
+	<parameter varname='base_addr' type='float const*'/>
+	<parameter varname='vindex' type='__m256i'/>
+	<parameter varname='mask' type='__m256'/>
+	<parameter varname='scale' type='const int'/>
+	<description>
+	Gather single-precision (32-bit) floating-point elements from memory using 32-bit indices. 32-bit elements are loaded from addresses starting at "base_addr" and offset by each 32-bit element in "vindex" (each index is scaled by the factor in "scale"). Gathered elements are merged into "dst" using "mask" (elements are copied from "src" when the highest bit is not set in the corresponding element). "scale" should be 1, 2, 4 or 8.
+	</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*32
+	IF mask[i+31]
+		dst[i+31:i] := MEM[base_addr + SignExtend(vindex[i+31:i])*scale]
+		mask[i+31] := 0
+	ELSE
+		dst[i+31:i] := src[i+31:i]
+	FI
+ENDFOR
+mask[MAX:256] := 0
+dst[MAX:256] := 0
+	</operation>
+	<instruction name='vgatherdps' form='ymm, vm32x, ymm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='AVX2' rettype='__m128i' name='_mm_mask_i32gather_epi32'>
+	<type>Integer</type>
+	<CPUID>AVX2</CPUID>
+	<category>Load</category>
+	<parameter varname='src' type='__m128i'/>
+	<parameter varname='base_addr' type='int const*'/>
+	<parameter varname='vindex' type='__m128i'/>
+	<parameter varname='mask' type='__m128i'/>
+	<parameter varname='scale' type='const int'/>
+	<description>
+	Gather 32-bit integers from memory using 32-bit indices. 32-bit elements are loaded from addresses starting at "base_addr" and offset by each 32-bit element in "vindex" (each index is scaled by the factor in "scale"). Gathered elements are merged into "dst" using "mask" (elements are copied from "src" when the highest bit is not set in the corresponding element). "scale" should be 1, 2, 4 or 8.
+	</description>
+	<operation>
+FOR j := 0 to 3
+	i := j*32
+	IF mask[i+31]
+		dst[i+31:i] := MEM[base_addr + SignExtend(vindex[i+31:i])*scale]
+		mask[i+31] := 0
+	ELSE
+		dst[i+31:i] := src[i+31:i]
+	FI
+ENDFOR
+mask[MAX:128] := 0
+dst[MAX:128] := 0
+	</operation>
+	<instruction name='vpgatherdd' form='xmm, vm32x, xmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='AVX2' rettype='__m256i' name='_mm256_mask_i32gather_epi32'>
+	<type>Integer</type>
+	<CPUID>AVX2</CPUID>
+	<category>Load</category>
+	<parameter varname='src' type='__m256i'/>
+	<parameter varname='base_addr' type='int const*'/>
+	<parameter varname='vindex' type='__m256i'/>
+	<parameter varname='mask' type='__m256i'/>
+	<parameter varname='scale' type='const int'/>
+	<description>
+	Gather 32-bit integers from memory using 32-bit indices. 32-bit elements are loaded from addresses starting at "base_addr" and offset by each 32-bit element in "vindex" (each index is scaled by the factor in "scale"). Gathered elements are merged into "dst" using "mask" (elements are copied from "src" when the highest bit is not set in the corresponding element). "scale" should be 1, 2, 4 or 8.
+	</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*32
+	IF mask[i+31]
+		dst[i+31:i] := MEM[base_addr + SignExtend(vindex[i+31:i])*scale]
+		mask[i+31] := 0
+	ELSE
+		dst[i+31:i] := src[i+31:i]
+	FI
+ENDFOR
+mask[MAX:256] := 0
+dst[MAX:256] := 0
+	</operation>
+	<instruction name='vpgatherdd' form='ymm, vm32x, ymm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='AVX2' rettype='__m128i' name='_mm_mask_i32gather_epi64'>
+	<type>Integer</type>
+	<CPUID>AVX2</CPUID>
+	<category>Load</category>
+	<parameter varname='src' type='__m128i'/>
+	<parameter varname='base_addr' type='__int64 const*'/>
+	<parameter varname='vindex' type='__m128i'/>
+	<parameter varname='mask' type='__m128i'/>
+	<parameter varname='scale' type='const int'/>
+	<description>
+	Gather 64-bit integers from memory using 32-bit indices. 64-bit elements are loaded from addresses starting at "base_addr" and offset by each 32-bit element in "vindex" (each index is scaled by the factor in "scale"). Gathered elements are merged into "dst" using "mask" (elements are copied from "src" when the highest bit is not set in the corresponding element). "scale" should be 1, 2, 4 or 8.
+	</description>
+	<operation>
+FOR j := 0 to 1
+	i := j*64
+	m := j*32
+	IF mask[i+63]
+		dst[i+63:i] := MEM[base_addr + SignExtend(vindex[m+31:m])*scale]
+		mask[i+63] := 0
+	ELSE
+		dst[i+63:i] := src[i+63:i]
+	FI
+ENDFOR
+mask[MAX:128] := 0
+dst[MAX:128] := 0
+	</operation>
+	<instruction name='vpgatherdq' form='xmm, vm32x, xmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='AVX2' rettype='__m256i' name='_mm256_mask_i32gather_epi64'>
+	<type>Integer</type>
+	<CPUID>AVX2</CPUID>
+	<category>Load</category>
+	<parameter varname='src' type='__m256i'/>
+	<parameter varname='base_addr' type='__int64 const*'/>
+	<parameter varname='vindex' type='__m128i'/>
+	<parameter varname='mask' type='__m256i'/>
+	<parameter varname='scale' type='const int'/>
+	<description>
+	Gather 64-bit integers from memory using 32-bit indices. 64-bit elements are loaded from addresses starting at "base_addr" and offset by each 32-bit element in "vindex" (each index is scaled by the factor in "scale"). Gathered elements are merged into "dst" using "mask" (elements are copied from "src" when the highest bit is not set in the corresponding element). "scale" should be 1, 2, 4 or 8.
+	</description>
+	<operation>
+FOR j := 0 to 3
+	i := j*64
+	m := j*32
+	IF mask[i+63]
+		dst[i+63:i] := MEM[base_addr + SignExtend(vindex[m+31:m])*scale]
+		mask[i+63] := 0
+	ELSE
+		dst[i+63:i] := src[i+63:i]
+	FI
+ENDFOR
+mask[MAX:256] := 0
+dst[MAX:256] := 0
+	</operation>
+	<instruction name='vpgatherdq' form='ymm, vm32x, ymm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='AVX2' rettype='__m128d' name='_mm_mask_i64gather_pd'>
+	<type>Floating Point</type>
+	<CPUID>AVX2</CPUID>
+	<category>Load</category>
+	<parameter varname='src' type='__m128d'/>
+	<parameter varname='base_addr' type='double const*'/>
+	<parameter varname='vindex' type='__m128i'/>
+	<parameter varname='mask' type='__m128d'/>
+	<parameter varname='scale' type='const int'/>
+	<description>
+	Gather double-precision (64-bit) floating-point elements from memory using 64-bit indices. 64-bit elements are loaded from addresses starting at "base_addr" and offset by each 64-bit element in "vindex" (each index is scaled by the factor in "scale"). Gathered elements are merged into "dst" using "mask" (elements are copied from "src" when the highest bit is not set in the corresponding element). "scale" should be 1, 2, 4 or 8.
+	</description>
+	<operation>
+FOR j := 0 to 1
+	i := j*64
+	IF mask[i+63]
+		dst[i+63:i] := MEM[base_addr + SignExtend(vindex[i+63:i])*scale]
+		mask[i+63] := 0
+	ELSE
+		dst[i+63:i] := src[i+63:i]
+	FI
+ENDFOR
+mask[MAX:128] := 0
+dst[MAX:128] := 0
+	</operation>
+	<instruction name='vgatherqpd' form='xmm, vm64x, xmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='AVX2' rettype='__m256d' name='_mm256_mask_i64gather_pd'>
+	<type>Floating Point</type>
+	<CPUID>AVX2</CPUID>
+	<category>Load</category>
+	<parameter varname='src' type='__m256d'/>
+	<parameter varname='base_addr' type='double const*'/>
+	<parameter varname='vindex' type='__m256i'/>
+	<parameter varname='mask' type='__m256d'/>
+	<parameter varname='scale' type='const int'/>
+	<description>
+	Gather double-precision (64-bit) floating-point elements from memory using 64-bit indices. 64-bit elements are loaded from addresses starting at "base_addr" and offset by each 64-bit element in "vindex" (each index is scaled by the factor in "scale"). Gathered elements are merged into "dst" using "mask" (elements are copied from "src" when the highest bit is not set in the corresponding element). "scale" should be 1, 2, 4 or 8.
+	</description>
+	<operation>
+FOR j := 0 to 3
+	i := j*64
+	IF mask[i+63]
+		dst[i+63:i] := MEM[base_addr + SignExtend(vindex[i+63:i])*scale]
+		mask[i+63] := 0
+	ELSE
+		dst[i+63:i] := src[i+63:i]
+	FI
+ENDFOR
+mask[MAX:256] := 0
+dst[MAX:256] := 0
+	</operation>
+	<instruction name='vgatherqpd' form='ymm, vm64x, ymm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='AVX2' rettype='__m128' name='_mm_mask_i64gather_ps'>
+	<type>Floating Point</type>
+	<CPUID>AVX2</CPUID>
+	<category>Load</category>
+	<parameter varname='src' type='__m128'/>
+	<parameter varname='base_addr' type='float const*'/>
+	<parameter varname='vindex' type='__m128i'/>
+	<parameter varname='mask' type='__m128'/>
+	<parameter varname='scale' type='const int'/>
+	<description>
+	Gather single-precision (32-bit) floating-point elements from memory using 64-bit indices. 32-bit elements are loaded from addresses starting at "base_addr" and offset by each 64-bit element in "vindex" (each index is scaled by the factor in "scale"). Gathered elements are merged into "dst" using "mask" (elements are copied from "src" when the highest bit is not set in the corresponding element). "scale" should be 1, 2, 4 or 8.
+	</description>
+	<operation>
+FOR j := 0 to 1
+	i := j*32
+	m := j*64
+	IF mask[i+31]
+		dst[i+31:i] := MEM[base_addr + SignExtend(vindex[m+63:m])*scale]
+		mask[i+31] := 0
+	ELSE
+		dst[i+31:i] := src[i+31:i]
+	FI
+ENDFOR
+mask[MAX:64] := 0
+dst[MAX:64] := 0
+	</operation>
+	<instruction name='vgatherqps' form='xmm, vm64x, xmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='AVX2' rettype='__m128' name='_mm256_mask_i64gather_ps'>
+	<type>Floating Point</type>
+	<CPUID>AVX2</CPUID>
+	<category>Load</category>
+	<parameter varname='src' type='__m128'/>
+	<parameter varname='base_addr' type='float const*'/>
+	<parameter varname='vindex' type='__m256i'/>
+	<parameter varname='mask' type='__m128'/>
+	<parameter varname='scale' type='const int'/>
+	<description>
+	Gather single-precision (32-bit) floating-point elements from memory using 64-bit indices. 32-bit elements are loaded from addresses starting at "base_addr" and offset by each 64-bit element in "vindex" (each index is scaled by the factor in "scale"). Gathered elements are merged into "dst" using "mask" (elements are copied from "src" when the highest bit is not set in the corresponding element). "scale" should be 1, 2, 4 or 8.
+	</description>
+	<operation>
+FOR j := 0 to 3
+	i := j*32
+	m := j*64
+	IF mask[i+31]
+		dst[i+31:i] := MEM[base_addr + SignExtend(vindex[m+63:m])*scale]
+		mask[i+31] := 0
+	ELSE
+		dst[i+31:i] := src[i+31:i]
+	FI
+ENDFOR
+mask[MAX:128] := 0
+dst[MAX:128] := 0
+	</operation>
+	<instruction name='vgatherqps' form='ymm, vm64x, ymm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='AVX2' rettype='__m128i' name='_mm_mask_i64gather_epi32'>
+	<type>Integer</type>
+	<CPUID>AVX2</CPUID>
+	<category>Load</category>
+	<parameter varname='src' type='__m128i'/>
+	<parameter varname='base_addr' type='int const*'/>
+	<parameter varname='vindex' type='__m128i'/>
+	<parameter varname='mask' type='__m128i'/>
+	<parameter varname='scale' type='const int'/>
+	<description>
+	Gather 32-bit integers from memory using 64-bit indices. 32-bit elements are loaded from addresses starting at "base_addr" and offset by each 64-bit element in "vindex" (each index is scaled by the factor in "scale"). Gathered elements are merged into "dst" using "mask" (elements are copied from "src" when the highest bit is not set in the corresponding element). "scale" should be 1, 2, 4 or 8.
+	</description>
+	<operation>
+FOR j := 0 to 1
+	i := j*32
+	m := j*64
+	IF mask[i+31]
+		dst[i+31:i] := MEM[base_addr + SignExtend(vindex[m+63:m])*scale]
+		mask[i+31] := 0
+	ELSE
+		dst[i+31:i] := src[i+31:i]
+	FI
+ENDFOR
+mask[MAX:64] := 0
+dst[MAX:64] := 0
+	</operation>
+	<instruction name='vpgatherqd' form='xmm, vm64x, xmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='AVX2' rettype='__m128i' name='_mm256_mask_i64gather_epi32'>
+	<type>Integer</type>
+	<CPUID>AVX2</CPUID>
+	<category>Load</category>
+	<parameter varname='src' type='__m128i'/>
+	<parameter varname='base_addr' type='int const*'/>
+	<parameter varname='vindex' type='__m256i'/>
+	<parameter varname='mask' type='__m128i'/>
+	<parameter varname='scale' type='const int'/>
+	<description>
+	Gather 32-bit integers from memory using 64-bit indices. 32-bit elements are loaded from addresses starting at "base_addr" and offset by each 64-bit element in "vindex" (each index is scaled by the factor in "scale"). Gathered elements are merged into "dst" using "mask" (elements are copied from "src" when the highest bit is not set in the corresponding element). "scale" should be 1, 2, 4 or 8.
+	</description>
+	<operation>
+FOR j := 0 to 3
+	i := j*32
+	m := j*64
+	IF mask[i+31]
+		dst[i+31:i] := MEM[base_addr + SignExtend(vindex[m+63:m])*scale]
+		mask[i+31] := 0
+	ELSE
+		dst[i+31:i] := src[i+31:i]
+	FI
+ENDFOR
+mask[MAX:128] := 0
+dst[MAX:128] := 0
+	</operation>
+	<instruction name='vpgatherqd' form='ymm, vm64x, ymm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='AVX2' rettype='__m128i' name='_mm_mask_i64gather_epi64'>
+	<type>Integer</type>
+	<CPUID>AVX2</CPUID>
+	<category>Load</category>
+	<parameter varname='src' type='__m128i'/>
+	<parameter varname='base_addr' type='__int64 const*'/>
+	<parameter varname='vindex' type='__m128i'/>
+	<parameter varname='mask' type='__m128i'/>
+	<parameter varname='scale' type='const int'/>
+	<description>
+	Gather 64-bit integers from memory using 64-bit indices. 64-bit elements are loaded from addresses starting at "base_addr" and offset by each 64-bit element in "vindex" (each index is scaled by the factor in "scale"). Gathered elements are merged into "dst" using "mask" (elements are copied from "src" when the highest bit is not set in the corresponding element). "scale" should be 1, 2, 4 or 8.
+	</description>
+	<operation>
+FOR j := 0 to 1
+	i := j*64
+	IF mask[i+63]
+		dst[i+63:i] := MEM[base_addr + SignExtend(vindex[i+63:i])*scale]
+		mask[i+63] := 0
+	ELSE
+		dst[i+63:i] := src[i+63:i]
+	FI
+ENDFOR
+mask[MAX:128] := 0
+dst[MAX:128] := 0
+	</operation>
+	<instruction name='vpgatherqq' form='xmm, vm64x, xmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='AVX2' rettype='__m256i' name='_mm256_mask_i64gather_epi64'>
+	<type>Integer</type>
+	<CPUID>AVX2</CPUID>
+	<category>Load</category>
+	<parameter varname='src' type='__m256i'/>
+	<parameter varname='base_addr' type='__int64 const*'/>
+	<parameter varname='vindex' type='__m256i'/>
+	<parameter varname='mask' type='__m256i'/>
+	<parameter varname='scale' type='const int'/>
+	<description>
+	Gather 64-bit integers from memory using 64-bit indices. 64-bit elements are loaded from addresses starting at "base_addr" and offset by each 64-bit element in "vindex" (each index is scaled by the factor in "scale"). Gathered elements are merged into "dst" using "mask" (elements are copied from "src" when the highest bit is not set in the corresponding element). "scale" should be 1, 2, 4 or 8.
+	</description>
+	<operation>
+FOR j := 0 to 3
+	i := j*64
+	IF mask[i+63]
+		dst[i+63:i] := MEM[base_addr + SignExtend(vindex[i+63:i])*scale]
+		mask[i+63] := 0
+	ELSE
+		dst[i+63:i] := src[i+63:i]
+	FI
+ENDFOR
+mask[MAX:256] := 0
+dst[MAX:256] := 0
+	</operation>
+	<instruction name='vpgatherqq' form='ymm, vm64x, ymm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='AVX2' rettype='__m128i' name='_mm_maskload_epi32'>
+	<type>Integer</type>
+	<CPUID>AVX2</CPUID>
+	<category>Load</category>
+	<parameter varname='mem_addr' type='int const*'/>
+	<parameter varname='mask' type='__m128i'/>
+	<description>Load packed 32-bit integers from memory into "dst" using "mask" (elements are zeroed out when the highest bit is not set in the corresponding element).</description>
+	<operation>
+FOR j := 0 to 3
+	i := j*32
+	IF mask[i+31]
+		dst[i+31:i] := MEM[mem_addr+i+31:mem_addr+i]
+	ELSE
+		dst[i+31:i] := 0
+	FI
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name='vpmaskmovd' form='xmm, xmm, m128'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='AVX2' rettype='__m256i' name='_mm256_maskload_epi32'>
+	<type>Integer</type>
+	<CPUID>AVX2</CPUID>
+	<category>Load</category>
+	<parameter varname='mem_addr' type='int const*'/>
+	<parameter varname='mask' type='__m256i'/>
+	<description>Load packed 32-bit integers from memory into "dst" using "mask" (elements are zeroed out when the highest bit is not set in the corresponding element).</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*32
+	IF mask[i+31]
+		dst[i+31:i] := MEM[mem_addr+i+31:mem_addr+i]
+	ELSE
+		dst[i+31:i] := 0
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name='vpmaskmovd' form='ymm, ymm, m256'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='AVX2' rettype='__m128i' name='_mm_maskload_epi64'>
+	<type>Integer</type>
+	<CPUID>AVX2</CPUID>
+	<category>Load</category>
+	<parameter varname='mem_addr' type='__int64 const*'/>
+	<parameter varname='mask' type='__m128i'/>
+	<description>Load packed 64-bit integers from memory into "dst" using "mask" (elements are zeroed out when the highest bit is not set in the corresponding element).</description>
+	<operation>
+FOR j := 0 to 1
+	i := j*64
+	IF mask[i+63]
+		dst[i+63:i] := MEM[mem_addr+i+63:mem_addr+i]
+	ELSE
+		dst[i+63:i] := 0
+	FI
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name='vpmaskmovq' form='xmm, xmm, m128'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='AVX2' rettype='__m256i' name='_mm256_maskload_epi64'>
+	<type>Integer</type>
+	<CPUID>AVX2</CPUID>
+	<category>Load</category>
+	<parameter varname='mem_addr' type='__int64 const*'/>
+	<parameter varname='mask' type='__m256i'/>
+	<description>Load packed 64-bit integers from memory into "dst" using "mask" (elements are zeroed out when the highest bit is not set in the corresponding element).</description>
+	<operation>
+FOR j := 0 to 3
+	i := j*64
+	IF mask[i+63]
+		dst[i+63:i] := MEM[mem_addr+i+63:mem_addr+i]
+	ELSE
+		dst[i+63:i] := 0
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name='vpmaskmovq' form='ymm, ymm, m256'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='AVX2' rettype='void' name='_mm_maskstore_epi32'>
+	<type>Integer</type>
+	<CPUID>AVX2</CPUID>
+	<category>Store</category>
+	<parameter varname='mem_addr' type='int*'/>
+	<parameter varname='mask' type='__m128i'/>
+	<parameter varname='a' type='__m128i'/>
+	<description>Store packed 32-bit integers from "a" into memory using "mask" (elements are not stored when the highest bit is not set in the corresponding element).
+	</description>
+	<operation>
+FOR j := 0 to 3
+	i := j*32
+	IF mask[i+31]
+		MEM[mem_addr+i+31:mem_addr+i] := a[i+31:i]
+	FI
+ENDFOR
+	</operation>
+	<instruction name='vpmaskmovd' form='m128, xmm, xmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='AVX2' rettype='void' name='_mm256_maskstore_epi32'>
+	<type>Integer</type>
+	<CPUID>AVX2</CPUID>
+	<category>Store</category>
+	<parameter varname='mem_addr' type='int*'/>
+	<parameter varname='mask' type='__m256i'/>
+	<parameter varname='a' type='__m256i'/>
+	<description>Store packed 32-bit integers from "a" into memory using "mask" (elements are not stored when the highest bit is not set in the corresponding element).
+	</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*32
+	IF mask[i+31]
+		MEM[mem_addr+i+31:mem_addr+i] := a[i+31:i]
+	FI
+ENDFOR
+	</operation>
+	<instruction name='vpmaskmovd' form='m256, ymm, ymm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='AVX2' rettype='void' name='_mm_maskstore_epi64'>
+	<type>Integer</type>
+	<CPUID>AVX2</CPUID>
+	<category>Store</category>
+	<parameter varname='mem_addr' type='__int64*'/>
+	<parameter varname='mask' type='__m128i'/>
+	<parameter varname='a' type='__m128i'/>
+	<description>Store packed 64-bit integers from "a" into memory using "mask" (elements are not stored when the highest bit is not set in the corresponding element).
+	</description>
+	<operation>
+FOR j := 0 to 1
+	i := j*64
+	IF mask[i+63]
+		MEM[mem_addr+i+63:mem_addr+i] := a[i+63:i]
+	FI
+ENDFOR
+	</operation>
+	<instruction name='vpmaskmovq' form='m128, xmm, xmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='AVX2' rettype='void' name='_mm256_maskstore_epi64'>
+	<type>Integer</type>
+	<CPUID>AVX2</CPUID>
+	<category>Store</category>
+	<parameter varname='mem_addr' type='__int64*'/>
+	<parameter varname='mask' type='__m256i'/>
+	<parameter varname='a' type='__m256i'/>
+	<description>Store packed 64-bit integers from "a" into memory using "mask" (elements are not stored when the highest bit is not set in the corresponding element).
+	</description>
+	<operation>
+FOR j := 0 to 3
+	i := j*64
+	IF mask[i+63]
+		MEM[mem_addr+i+63:mem_addr+i] := a[i+63:i]
+	FI
+ENDFOR
+	</operation>
+	<instruction name='vpmaskmovq' form='m256, ymm, ymm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='AVX2' rettype='__m256i' name='_mm256_max_epi8'>
+	<type>Integer</type>
+	<CPUID>AVX2</CPUID>
+	<category>Special Math Functions</category>
+	<parameter varname='a' type='__m256i'/>
+	<parameter varname='b' type='__m256i'/>
+	<description>Compare packed 8-bit integers in "a" and "b", and store packed maximum values in "dst". </description>
+	<operation>
+FOR j := 0 to 31
+	i := j*8
+	IF a[i+7:i] &gt; b[i+7:i]
+		dst[i+7:i] := a[i+7:i]
+	ELSE
+		dst[i+7:i] := b[i+7:i]
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name='vpmaxsb' form='ymm, ymm, ymm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='AVX2' rettype='__m256i' name='_mm256_max_epi16'>
+	<type>Integer</type>
+	<CPUID>AVX2</CPUID>
+	<category>Special Math Functions</category>
+	<parameter varname='a' type='__m256i'/>
+	<parameter varname='b' type='__m256i'/>
+	<description>Compare packed 16-bit integers in "a" and "b", and store packed maximum values in "dst".
+	</description>
+	<operation>
+FOR j := 0 to 15
+	i := j*16
+	IF a[i+15:i] &gt; b[i+15:i]
+		dst[i+15:i] := a[i+15:i]
+	ELSE
+		dst[i+15:i] := b[i+15:i]
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name='vpmaxsw' form='ymm, ymm, ymm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='AVX2' rettype='__m256i' name='_mm256_max_epi32'>
+	<type>Integer</type>
+	<CPUID>AVX2</CPUID>
+	<category>Special Math Functions</category>
+	<parameter varname='a' type='__m256i'/>
+	<parameter varname='b' type='__m256i'/>
+	<description>Compare packed 32-bit integers in "a" and "b", and store packed maximum values in "dst".</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*32
+	IF a[i+31:i] &gt; b[i+31:i]
+		dst[i+31:i] := a[i+31:i]
+	ELSE
+		dst[i+31:i] := b[i+31:i]
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name='vpmaxsd' form='ymm, ymm, ymm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='AVX2' rettype='__m256i' name='_mm256_max_epu8'>
+	<type>Integer</type>
+	<CPUID>AVX2</CPUID>
+	<category>Special Math Functions</category>
+	<parameter varname='a' type='__m256i'/>
+	<parameter varname='b' type='__m256i'/>
+	<description>Compare packed unsigned 8-bit integers in "a" and "b", and store packed maximum values in "dst".
+	</description>
+	<operation>
+FOR j := 0 to 31
+	i := j*8
+	IF a[i+7:i] &gt; b[i+7:i]
+		dst[i+7:i] := a[i+7:i]
+	ELSE
+		dst[i+7:i] := b[i+7:i]
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name='vpmaxub' form='ymm, ymm, ymm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='AVX2' rettype='__m256i' name='_mm256_max_epu16'>
+	<type>Integer</type>
+	<CPUID>AVX2</CPUID>
+	<category>Special Math Functions</category>
+	<parameter varname='a' type='__m256i'/>
+	<parameter varname='b' type='__m256i'/>
+	<description>Compare packed unsigned 16-bit integers in "a" and "b", and store packed maximum values in "dst".
+	</description>
+	<operation>
+FOR j := 0 to 15
+	i := j*16
+	IF a[i+15:i] &gt; b[i+15:i]
+		dst[i+15:i] := a[i+15:i]
+	ELSE
+		dst[i+15:i] := b[i+15:i]
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name='vpmaxuw' form='ymm, ymm, ymm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='AVX2' rettype='__m256i' name='_mm256_max_epu32'>
+	<type>Integer</type>
+	<CPUID>AVX2</CPUID>
+	<category>Special Math Functions</category>
+	<parameter varname='a' type='__m256i'/>
+	<parameter varname='b' type='__m256i'/>
+	<description>Compare packed unsigned 32-bit integers in "a" and "b", and store packed maximum values in "dst".</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*32
+	IF a[i+31:i] &gt; b[i+31:i]
+		dst[i+31:i] := a[i+31:i]
+	ELSE
+		dst[i+31:i] := b[i+31:i]
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name='vpmaxud' form='ymm, ymm, ymm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='AVX2' rettype='__m256i' name='_mm256_min_epi8'>
+	<type>Integer</type>
+	<CPUID>AVX2</CPUID>
+	<category>Special Math Functions</category>
+	<parameter varname='a' type='__m256i'/>
+	<parameter varname='b' type='__m256i'/>
+	<description>Compare packed 8-bit integers in "a" and "b", and store packed minimum values in "dst".
+	</description>
+	<operation>
+FOR j := 0 to 31
+	i := j*8
+	IF a[i+7:i] &lt; b[i+7:i]
+		dst[i+7:i] := a[i+7:i]
+	ELSE
+		dst[i+7:i] := b[i+7:i]
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name='vpminsb' form='ymm, ymm, ymm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='AVX2' rettype='__m256i' name='_mm256_min_epi16'>
+	<type>Integer</type>
+	<CPUID>AVX2</CPUID>
+	<category>Special Math Functions</category>
+	<parameter varname='a' type='__m256i'/>
+	<parameter varname='b' type='__m256i'/>
+	<description>Compare packed 16-bit integers in "a" and "b", and store packed minimum values in "dst".
+	</description>
+	<operation>
+FOR j := 0 to 15
+	i := j*16
+	IF a[i+15:i] &lt; b[i+15:i]
+		dst[i+15:i] := a[i+15:i]
+	ELSE
+		dst[i+15:i] := b[i+15:i]
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name='vpminsw' form='ymm, ymm, ymm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='AVX2' rettype='__m256i' name='_mm256_min_epi32'>
+	<type>Integer</type>
+	<CPUID>AVX2</CPUID>
+	<category>Special Math Functions</category>
+	<parameter varname='a' type='__m256i'/>
+	<parameter varname='b' type='__m256i'/>
+	<description>Compare packed 32-bit integers in "a" and "b", and store packed minimum values in "dst".
+	</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*32
+	IF a[i+31:i] &lt; b[i+31:i]
+		dst[i+31:i] := a[i+31:i]
+	ELSE
+		dst[i+31:i] := b[i+31:i]
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name='vpminsd' form='ymm, ymm, ymm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='AVX2' rettype='__m256i' name='_mm256_min_epu8'>
+	<type>Integer</type>
+	<CPUID>AVX2</CPUID>
+	<category>Special Math Functions</category>
+	<parameter varname='a' type='__m256i'/>
+	<parameter varname='b' type='__m256i'/>
+	<description>Compare packed unsigned 8-bit integers in "a" and "b", and store packed minimum values in "dst".
+	</description>
+	<operation>
+FOR j := 0 to 31
+	i := j*8
+	IF a[i+7:i] &lt; b[i+7:i]
+		dst[i+7:i] := a[i+7:i]
+	ELSE
+		dst[i+7:i] := b[i+7:i]
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name='vpminub' form='ymm, ymm, ymm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='AVX2' rettype='__m256i' name='_mm256_min_epu16'>
+	<type>Integer</type>
+	<CPUID>AVX2</CPUID>
+	<category>Special Math Functions</category>
+	<parameter varname='a' type='__m256i'/>
+	<parameter varname='b' type='__m256i'/>
+	<description>Compare packed unsigned 16-bit integers in "a" and "b", and store packed minimum values in "dst".
+	</description>
+	<operation>
+FOR j := 0 to 15
+	i := j*16
+	IF a[i+15:i] &lt; b[i+15:i]
+		dst[i+15:i] := a[i+15:i]
+	ELSE
+		dst[i+15:i] := b[i+15:i]
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name='vpminuw' form='ymm, ymm, ymm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='AVX2' rettype='__m256i' name='_mm256_min_epu32'>
+	<type>Integer</type>
+	<CPUID>AVX2</CPUID>
+	<category>Special Math Functions</category>
+	<parameter varname='a' type='__m256i'/>
+	<parameter varname='b' type='__m256i'/>
+	<description>Compare packed unsigned 32-bit integers in "a" and "b", and store packed minimum values in "dst".
+	</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*32
+	IF a[i+31:i] &lt; b[i+31:i]
+		dst[i+31:i] := a[i+31:i]
+	ELSE
+		dst[i+31:i] := b[i+31:i]
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name='vpminud' form='ymm, ymm, ymm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='AVX2' rettype='int' name='_mm256_movemask_epi8'>
+	<type>Integer</type>
+	<CPUID>AVX2</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname='a' type='__m256i'/>
+	<description>
+Create mask from the most significant bit of each 8-bit element in "a", and store the result in "dst".
+	</description>
+	<operation>
+FOR j := 0 to 31
+	i := j*8
+	dst[j] := a[i+7]
+ENDFOR
+	</operation>
+	<instruction name='vpmovmskb' form='r32, ymm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='AVX2' rettype='__m256i' name='_mm256_mpsadbw_epu8'>
+	<type>Integer</type>
+	<CPUID>AVX2</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname='a' type='__m256i'/>
+	<parameter varname='b' type='__m256i'/>
+	<parameter varname="imm8" type='const int'/>
+	<description>Compute the sum of absolute differences (SADs) of quadruplets of unsigned 8-bit integers in "a" compared to those in "b", and store the 16-bit results in "dst".
+	Eight SADs are performed for each 128-bit lane using one quadruplet from "b" and eight quadruplets from "a". One quadruplet is selected from "b" starting at the offset specified in "imm8". Eight quadruplets are formed from sequential 8-bit integers selected from "a" starting at the offset specified in "imm8".</description>
+	<operation>
+MPSADBW(a[127:0], b[127:0], imm8[2:0]) {
+	a_offset := imm8[2]*32
+	b_offset := imm8[1:0]*32
+	FOR j := 0 to 7
+		i := j*8
+		k := a_offset+i
+		l := b_offset
+		tmp[i*2+15:i*2] := ABS(a[k+7:k] - b[l+7:l]) + ABS(a[k+15:k+8] - b[l+15:l+8]) + ABS(a[k+23:k+16] - b[l+23:l+16]) + ABS(a[k+31:k+24] - b[l+31:l+24])
+	ENDFOR
+	RETURN tmp[127:0]
+}
+
+dst[127:0] := MPSADBW(a[127:0], b[127:0], imm8[2:0])
+dst[255:128] := MPSADBW(a[255:128], b[255:128], imm8[5:3])
+dst[MAX:256] := 0
+	</operation>
+	<instruction name='vmpsadbw' form='ymm, ymm, ymm, imm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='AVX2' rettype='__m256i' name='_mm256_mul_epi32'>
+	<type>Integer</type>
+	<CPUID>AVX2</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname='a' type='__m256i'/>
+	<parameter varname='b' type='__m256i'/>
+	<description>Multiply the low 32-bit integers from each packed 64-bit element in "a" and "b", and store the signed 64-bit results in "dst". </description>
+	<operation>
+FOR j := 0 to 3
+	i := j*64
+	dst[i+63:i] := a[i+31:i] * b[i+31:i]
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name='vpmuldq' form='ymm, ymm, ymm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='AVX2' rettype='__m256i' name='_mm256_mul_epu32'>
+	<type>Integer</type>
+	<CPUID>AVX2</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname='a' type='__m256i'/>
+	<parameter varname='b' type='__m256i'/>
+	<description>Multiply the low unsigned 32-bit integers from each packed 64-bit element in "a" and "b", and store the unsigned 64-bit results in "dst". </description>
+	<operation>
+FOR j := 0 to 3
+	i := j*64
+	dst[i+63:i] := a[i+31:i] * b[i+31:i]
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name='vpmuludq' form='ymm, ymm, ymm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='AVX2' rettype='__m256i' name='_mm256_mulhi_epi16'>
+	<type>Integer</type>
+	<CPUID>AVX2</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname='a' type='__m256i'/>
+	<parameter varname='b' type='__m256i'/>
+	<description>Multiply the packed 16-bit integers in "a" and "b", producing intermediate 32-bit integers, and store the high 16 bits of the intermediate integers in "dst". </description>
+	<operation>
+FOR j := 0 to 15
+	i := j*16
+	tmp[31:0] := a[i+15:i] * b[i+15:i]
+	dst[i+15:i] := tmp[31:16]
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name='vpmulhw' form='ymm, ymm, ymm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='AVX2' rettype='__m256i' name='_mm256_mulhi_epu16'>
+	<type>Integer</type>
+	<CPUID>AVX2</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname='a' type='__m256i'/>
+	<parameter varname='b' type='__m256i'/>
+	<description>Multiply the packed unsigned 16-bit integers in "a" and "b", producing intermediate 32-bit integers, and store the high 16 bits of the intermediate integers in "dst". </description>
+	<operation>
+FOR j := 0 to 15
+	i := j*16
+	tmp[31:0] := a[i+15:i] * b[i+15:i]
+	dst[i+15:i] := tmp[31:16]
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name='vpmulhuw' form='ymm, ymm, ymm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='AVX2' rettype='__m256i' name='_mm256_mulhrs_epi16'>
+	<type>Integer</type>
+	<CPUID>AVX2</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname='a' type='__m256i'/>
+	<parameter varname='b' type='__m256i'/>
+	<description>Multiply packed 16-bit integers in "a" and "b", producing intermediate signed 32-bit integers. Truncate each intermediate integer to the 18 most significant bits, round by adding 1, and store bits [16:1] to "dst". </description>
+	<operation>
+FOR j := 0 to 15
+	i := j*16
+	tmp[31:0] := ((a[i+15:i] * b[i+15:i]) &gt;&gt; 14) + 1
+	dst[i+15:i] := tmp[16:1]
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name='vpmulhrsw' form='ymm, ymm, ymm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='AVX2' rettype='__m256i' name='_mm256_mullo_epi16'>
+	<type>Integer</type>
+	<CPUID>AVX2</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname='a' type='__m256i'/>
+	<parameter varname='b' type='__m256i'/>
+	<description>Multiply the packed 16-bit integers in "a" and "b", producing intermediate 32-bit integers, and store the low 16 bits of the intermediate integers in "dst". </description>
+	<operation>
+FOR j := 0 to 15
+	i := j*16
+	tmp[31:0] := a[i+15:i] * b[i+15:i]
+	dst[i+15:i] := tmp[15:0]
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name='vpmullw' form='ymm, ymm, ymm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='AVX2' rettype='__m256i' name='_mm256_mullo_epi32'>
+	<type>Integer</type>
+	<CPUID>AVX2</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname='a' type='__m256i'/>
+	<parameter varname='b' type='__m256i'/>
+	<description>Multiply the packed 32-bit integers in "a" and "b", producing intermediate 64-bit integers, and store the low 32 bits of the intermediate integers in "dst". </description>
+	<operation>
+FOR j := 0 to 7
+	i := j*32
+	tmp[63:0] := a[i+31:i] * b[i+31:i]
+	dst[i+31:i] := tmp[31:0]
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name='vpmulld' form='ymm, ymm, ymm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='AVX2' rettype='__m256i' name='_mm256_or_si256'>
+	<type>Integer</type>
+	<CPUID>AVX2</CPUID>
+	<category>Logical</category>
+	<parameter varname='a' type='__m256i'/>
+	<parameter varname='b' type='__m256i'/>
+	<description>Compute the bitwise OR of 256 bits (representing integer data) in "a" and "b", and store the result in "dst".</description>
+	<operation>
+dst[255:0] := (a[255:0] OR b[255:0])
+dst[MAX:256] := 0
+	</operation>
+	<instruction name='vpor' form='ymm, ymm, ymm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='AVX2' rettype='__m256i' name='_mm256_packs_epi16'>
+	<type>Integer</type>
+	<CPUID>AVX2</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname='a' type='__m256i'/>
+	<parameter varname='b' type='__m256i'/>
+	<description>Convert packed 16-bit integers from "a" and "b" to packed 8-bit integers using signed saturation, and store the results in "dst".
+	</description>
+	<operation>
+dst[7:0] := Saturate_Int16_To_Int8 (a[15:0])
+dst[15:8] := Saturate_Int16_To_Int8 (a[31:16])
+dst[23:16] := Saturate_Int16_To_Int8 (a[47:32])
+dst[31:24] := Saturate_Int16_To_Int8 (a[63:48])
+dst[39:32] := Saturate_Int16_To_Int8 (a[79:64])
+dst[47:40] := Saturate_Int16_To_Int8 (a[95:80])
+dst[55:48] := Saturate_Int16_To_Int8 (a[111:96])
+dst[63:56] := Saturate_Int16_To_Int8 (a[127:112])
+dst[71:64] := Saturate_Int16_To_Int8 (b[15:0])
+dst[79:72] := Saturate_Int16_To_Int8 (b[31:16])
+dst[87:80] := Saturate_Int16_To_Int8 (b[47:32])
+dst[95:88] := Saturate_Int16_To_Int8 (b[63:48])
+dst[103:96] := Saturate_Int16_To_Int8 (b[79:64])
+dst[111:104] := Saturate_Int16_To_Int8 (b[95:80])
+dst[119:112] := Saturate_Int16_To_Int8 (b[111:96])
+dst[127:120] := Saturate_Int16_To_Int8 (b[127:112])
+dst[135:128] := Saturate_Int16_To_Int8 (a[143:128])
+dst[143:136] := Saturate_Int16_To_Int8 (a[159:144])
+dst[151:144] := Saturate_Int16_To_Int8 (a[175:160])
+dst[159:152] := Saturate_Int16_To_Int8 (a[191:176])
+dst[167:160] := Saturate_Int16_To_Int8 (a[207:192])
+dst[175:168] := Saturate_Int16_To_Int8 (a[223:208])
+dst[183:176] := Saturate_Int16_To_Int8 (a[239:224])
+dst[191:184] := Saturate_Int16_To_Int8 (a[255:240])
+dst[199:192] := Saturate_Int16_To_Int8 (b[143:128])
+dst[207:200] := Saturate_Int16_To_Int8 (b[159:144])
+dst[215:208] := Saturate_Int16_To_Int8 (b[175:160])
+dst[223:216] := Saturate_Int16_To_Int8 (b[191:176])
+dst[231:224] := Saturate_Int16_To_Int8 (b[207:192])
+dst[239:232] := Saturate_Int16_To_Int8 (b[223:208])
+dst[247:240] := Saturate_Int16_To_Int8 (b[239:224])
+dst[255:248] := Saturate_Int16_To_Int8 (b[255:240])
+dst[MAX:256] := 0
+	</operation>
+	<instruction name='vpacksswb' form='ymm, ymm, ymm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='AVX2' rettype='__m256i' name='_mm256_packs_epi32'>
+	<type>Integer</type>
+	<CPUID>AVX2</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname='a' type='__m256i'/>
+	<parameter varname='b' type='__m256i'/>
+	<description>Convert packed 32-bit integers from "a" and "b" to packed 16-bit integers using signed saturation, and store the results in "dst".</description>
+	<operation>
+dst[15:0] := Saturate_Int32_To_Int16 (a[31:0])
+dst[31:16] := Saturate_Int32_To_Int16 (a[63:32])
+dst[47:32] := Saturate_Int32_To_Int16 (a[95:64])
+dst[63:48] := Saturate_Int32_To_Int16 (a[127:96])
+dst[79:64] := Saturate_Int32_To_Int16 (b[31:0])
+dst[95:80] := Saturate_Int32_To_Int16 (b[63:32])
+dst[111:96] := Saturate_Int32_To_Int16 (b[95:64])
+dst[127:112] := Saturate_Int32_To_Int16 (b[127:96])
+dst[143:128] := Saturate_Int32_To_Int16 (a[159:128])
+dst[159:144] := Saturate_Int32_To_Int16 (a[191:160])
+dst[175:160] := Saturate_Int32_To_Int16 (a[223:192])
+dst[191:176] := Saturate_Int32_To_Int16 (a[255:224])
+dst[207:192] := Saturate_Int32_To_Int16 (b[159:128])
+dst[223:208] := Saturate_Int32_To_Int16 (b[191:160])
+dst[239:224] := Saturate_Int32_To_Int16 (b[223:192])
+dst[255:240] := Saturate_Int32_To_Int16 (b[255:224])
+dst[MAX:256] := 0
+	</operation>
+	<instruction name='vpackssdw' form='ymm, ymm, ymm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='AVX2' rettype='__m256i' name='_mm256_packus_epi16'>
+	<type>Integer</type>
+	<CPUID>AVX2</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname='a' type='__m256i'/>
+	<parameter varname='b' type='__m256i'/>
+	<description>Convert packed 16-bit integers from "a" and "b" to packed 8-bit integers using unsigned saturation, and store the results in "dst".</description>
+	<operation>
+dst[7:0] := Saturate_Int16_To_UnsignedInt8 (a[15:0])
+dst[15:8] := Saturate_Int16_To_UnsignedInt8 (a[31:16])
+dst[23:16] := Saturate_Int16_To_UnsignedInt8 (a[47:32])
+dst[31:24] := Saturate_Int16_To_UnsignedInt8 (a[63:48])
+dst[39:32] := Saturate_Int16_To_UnsignedInt8 (a[79:64])
+dst[47:40] := Saturate_Int16_To_UnsignedInt8 (a[95:80])
+dst[55:48] := Saturate_Int16_To_UnsignedInt8 (a[111:96])
+dst[63:56] := Saturate_Int16_To_UnsignedInt8 (a[127:112])
+dst[71:64] := Saturate_Int16_To_UnsignedInt8 (b[15:0])
+dst[79:72] := Saturate_Int16_To_UnsignedInt8 (b[31:16])
+dst[87:80] := Saturate_Int16_To_UnsignedInt8 (b[47:32])
+dst[95:88] := Saturate_Int16_To_UnsignedInt8 (b[63:48])
+dst[103:96] := Saturate_Int16_To_UnsignedInt8 (b[79:64])
+dst[111:104] := Saturate_Int16_To_UnsignedInt8 (b[95:80])
+dst[119:112] := Saturate_Int16_To_UnsignedInt8 (b[111:96])
+dst[127:120] := Saturate_Int16_To_UnsignedInt8 (b[127:112])
+dst[135:128] := Saturate_Int16_To_UnsignedInt8 (a[143:128])
+dst[143:136] := Saturate_Int16_To_UnsignedInt8 (a[159:144])
+dst[151:144] := Saturate_Int16_To_UnsignedInt8 (a[175:160])
+dst[159:152] := Saturate_Int16_To_UnsignedInt8 (a[191:176])
+dst[167:160] := Saturate_Int16_To_UnsignedInt8 (a[207:192])
+dst[175:168] := Saturate_Int16_To_UnsignedInt8 (a[223:208])
+dst[183:176] := Saturate_Int16_To_UnsignedInt8 (a[239:224])
+dst[191:184] := Saturate_Int16_To_UnsignedInt8 (a[255:240])
+dst[199:192] := Saturate_Int16_To_UnsignedInt8 (b[143:128])
+dst[207:200] := Saturate_Int16_To_UnsignedInt8 (b[159:144])
+dst[215:208] := Saturate_Int16_To_UnsignedInt8 (b[175:160])
+dst[223:216] := Saturate_Int16_To_UnsignedInt8 (b[191:176])
+dst[231:224] := Saturate_Int16_To_UnsignedInt8 (b[207:192])
+dst[239:232] := Saturate_Int16_To_UnsignedInt8 (b[223:208])
+dst[247:240] := Saturate_Int16_To_UnsignedInt8 (b[239:224])
+dst[255:248] := Saturate_Int16_To_UnsignedInt8 (b[255:240])
+dst[MAX:256] := 0
+	</operation>
+	<instruction name='vpackuswb' form='ymm, ymm, ymm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='AVX2' rettype='__m256i' name='_mm256_packus_epi32'>
+	<type>Integer</type>
+	<CPUID>AVX2</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname='a' type='__m256i'/>
+	<parameter varname='b' type='__m256i'/>
+	<description>Convert packed 32-bit integers from "a" and "b" to packed 16-bit integers using unsigned saturation, and store the results in "dst".</description>
+	<operation>
+dst[15:0] := Saturate_Int32_To_UnsignedInt16 (a[31:0])
+dst[31:16] := Saturate_Int32_To_UnsignedInt16 (a[63:32])
+dst[47:32] := Saturate_Int32_To_UnsignedInt16 (a[95:64])
+dst[63:48] := Saturate_Int32_To_UnsignedInt16 (a[127:96])
+dst[79:64] := Saturate_Int32_To_UnsignedInt16 (b[31:0])
+dst[95:80] := Saturate_Int32_To_UnsignedInt16 (b[63:32])
+dst[111:96] := Saturate_Int32_To_UnsignedInt16 (b[95:64])
+dst[127:112] := Saturate_Int32_To_UnsignedInt16 (b[127:96])
+dst[143:128] := Saturate_Int32_To_UnsignedInt16 (a[159:128])
+dst[159:144] := Saturate_Int32_To_UnsignedInt16 (a[191:160])
+dst[175:160] := Saturate_Int32_To_UnsignedInt16 (a[223:192])
+dst[191:176] := Saturate_Int32_To_UnsignedInt16 (a[255:224])
+dst[207:192] := Saturate_Int32_To_UnsignedInt16 (b[159:128])
+dst[223:208] := Saturate_Int32_To_UnsignedInt16 (b[191:160])
+dst[239:224] := Saturate_Int32_To_UnsignedInt16 (b[223:192])
+dst[255:240] := Saturate_Int32_To_UnsignedInt16 (b[255:224])
+dst[MAX:256] := 0
+	</operation>
+	<instruction name='vpackusdw' form='ymm, ymm, ymm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='AVX2' rettype='__m256i' name='_mm256_permute2x128_si256'>
+	<type>Integer</type>
+	<CPUID>AVX2</CPUID>
+	<category>Swizzle</category>
+	<parameter varname='a' type='__m256i'/>
+	<parameter varname='b' type='__m256i'/>
+	<parameter varname="imm8" type='const int'/>
+	<description>Shuffle 128-bits (composed of integer data) selected by "imm8" from "a" and "b", and store the results in "dst". </description>
+	<operation>
+SELECT4(src1, src2, control){
+	CASE(control[1:0])
+	0:	tmp[127:0] := src1[127:0]
+	1:	tmp[127:0] := src1[255:128]
+	2:	tmp[127:0] := src2[127:0]
+	3:	tmp[127:0] := src2[255:128]
+	ESAC
+	IF control[3]
+		tmp[127:0] := 0
+	FI
+	RETURN tmp[127:0]
+}
+
+dst[127:0] := SELECT4(a[255:0], b[255:0], imm8[3:0])
+dst[255:128] := SELECT4(a[255:0], b[255:0], imm8[7:4])
+dst[MAX:256] := 0
+	</operation>
+	<instruction name='vperm2i128' form='ymm, ymm, ymm, imm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='AVX2' rettype='__m256i' name='_mm256_permute4x64_epi64'>
+	<type>Integer</type>
+	<CPUID>AVX2</CPUID>
+	<category>Swizzle</category>
+	<parameter varname='a' type='__m256i'/>
+	<parameter varname="imm8" type='const int'/>
+	<description>Shuffle 64-bit integers in "a" across lanes using the control in "imm8", and store the results in "dst".</description>
+	<operation>
+SELECT4(src, control){
+	CASE(control[1:0])
+	0:	tmp[63:0] := src[63:0]
+	1:	tmp[63:0] := src[127:64]
+	2:	tmp[63:0] := src[191:128]
+	3:	tmp[63:0] := src[255:192]
+	ESAC
+	RETURN tmp[63:0]
+}
+
+dst[63:0] := SELECT4(a[255:0], imm8[1:0])
+dst[127:64] := SELECT4(a[255:0], imm8[3:2])
+dst[191:128] := SELECT4(a[255:0], imm8[5:4])
+dst[255:192] := SELECT4(a[255:0], imm8[7:6])
+dst[MAX:256] := 0
+	</operation>
+	<instruction name='vpermq' form='ymm, ymm, imm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='AVX2' rettype='__m256d' name='_mm256_permute4x64_pd'>
+	<type>Floating Point</type>
+	<CPUID>AVX2</CPUID>
+	<category>Swizzle</category>
+	<parameter varname='a' type='__m256d'/>
+	<parameter varname="imm8" type='const int'/>
+	<description>Shuffle double-precision (64-bit) floating-point elements in "a" across lanes using the control in "imm8", and store the results in "dst".</description>
+	<operation>
+SELECT4(src, control){
+	CASE(control[1:0])
+	0:	tmp[63:0] := src[63:0]
+	1:	tmp[63:0] := src[127:64]
+	2:	tmp[63:0] := src[191:128]
+	3:	tmp[63:0] := src[255:192]
+	ESAC
+	RETURN tmp[63:0]
+}
+
+dst[63:0] := SELECT4(a[255:0], imm8[1:0])
+dst[127:64] := SELECT4(a[255:0], imm8[3:2])
+dst[191:128] := SELECT4(a[255:0], imm8[5:4])
+dst[255:192] := SELECT4(a[255:0], imm8[7:6])
+dst[MAX:256] := 0
+	</operation>
+	<instruction name='vpermpd' form='ymm, ymm, imm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='AVX2' rettype='__m256i' name='_mm256_permutevar8x32_epi32'>
+	<type>Integer</type>
+	<CPUID>AVX2</CPUID>
+	<category>Swizzle</category>
+	<parameter varname='a' type='__m256i'/>
+	<parameter varname='idx' type='__m256i'/>
+	<description>Shuffle 32-bit integers in "a" across lanes using the corresponding index in "idx", and store the results in "dst".
+	</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*32
+	id := idx[i+2:i]*32
+	dst[i+31:i] := a[id+31:id]
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name='vpermd' form='ymm, ymm, ymm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='AVX2' rettype='__m256' name='_mm256_permutevar8x32_ps'>
+	<type>Floating Point</type>
+	<CPUID>AVX2</CPUID>
+	<category>Swizzle</category>
+	<parameter varname='a' type='__m256'/>
+	<parameter varname='idx' type='__m256i'/>
+	<description>Shuffle single-precision (32-bit) floating-point elements in "a" across lanes using the corresponding index in "idx", and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*32
+	id := idx[i+2:i]*32
+	dst[i+31:i] := a[id+31:id]
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name='vpermps' form='ymm, ymm, ymm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='AVX2' rettype='__m256i' name='_mm256_sad_epu8'>
+	<type>Integer</type>
+	<CPUID>AVX2</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname='a' type='__m256i'/>
+	<parameter varname='b' type='__m256i'/>
+	<description>Compute the absolute differences of packed unsigned 8-bit integers in "a" and "b", then horizontally sum each consecutive 8 differences to produce four unsigned 16-bit integers, and pack these unsigned 16-bit integers in the low 16 bits of 64-bit elements in "dst".</description>
+	<operation>
+FOR j := 0 to 31
+	i := j*8
+	tmp[i+7:i] := ABS(a[i+7:i] - b[i+7:i])
+ENDFOR
+FOR j := 0 to 3
+	i := j*64
+	dst[i+15:i] := tmp[i+7:i] + tmp[i+15:i+8] + tmp[i+23:i+16] + tmp[i+31:i+24] + tmp[i+39:i+32] + tmp[i+47:i+40] + tmp[i+55:i+48] + tmp[i+63:i+56]
+	dst[i+63:i+16] := 0
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name='vpsadbw' form='ymm, ymm, ymm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='AVX2' rettype='__m256i' name='_mm256_shuffle_epi32'>
+	<type>Integer</type>
+	<CPUID>AVX2</CPUID>
+	<category>Swizzle</category>
+	<parameter varname='a' type='__m256i'/>
+	<parameter varname="imm8" type='const int'/>
+	<description>Shuffle 32-bit integers in "a" within 128-bit lanes using the control in "imm8", and store the results in "dst".</description>
+	<operation>
+SELECT4(src, control){
+	CASE(control[1:0])
+	0:	tmp[31:0] := src[31:0]
+	1:	tmp[31:0] := src[63:32]
+	2:	tmp[31:0] := src[95:64]
+	3:	tmp[31:0] := src[127:96]
+	ESAC
+	RETURN tmp[31:0]
+}
+
+dst[31:0] := SELECT4(a[127:0], imm8[1:0])
+dst[63:32] := SELECT4(a[127:0], imm8[3:2])
+dst[95:64] := SELECT4(a[127:0], imm8[5:4])
+dst[127:96] := SELECT4(a[127:0], imm8[7:6])
+dst[159:128] := SELECT4(a[255:128], imm8[1:0])
+dst[191:160] := SELECT4(a[255:128], imm8[3:2])
+dst[223:192] := SELECT4(a[255:128], imm8[5:4])
+dst[255:224] := SELECT4(a[255:128], imm8[7:6])
+dst[MAX:256] := 0
+	</operation>
+	<instruction name='vpshufd' form='ymm, ymm, imm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='AVX2' rettype='__m256i' name='_mm256_shuffle_epi8'>
+	<type>Integer</type>
+	<CPUID>AVX2</CPUID>
+	<category>Swizzle</category>
+	<parameter varname='a' type='__m256i'/>
+	<parameter varname='b' type='__m256i'/>
+	<description>Shuffle 8-bit integers in "a" within 128-bit lanes according to shuffle control mask in the corresponding 8-bit element of "b", and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 15
+	i := j*8
+	IF b[i+7] == 1
+		dst[i+7:i] := 0
+	ELSE
+		index[3:0] := b[i+3:i]
+		dst[i+7:i] := a[index*8+7:index*8]
+	FI
+	IF b[128+i+7] == 1
+		dst[128+i+7:128+i] := 0
+	ELSE
+		index[3:0] := b[128+i+3:128+i]
+		dst[128+i+7:128+i] := a[128+index*8+7:128+index*8]
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name='vpshufb' form='ymm, ymm, ymm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='AVX2' rettype='__m256i' name='_mm256_shufflehi_epi16'>
+	<type>Integer</type>
+	<CPUID>AVX2</CPUID>
+	<category>Swizzle</category>
+	<parameter varname='a' type='__m256i'/>
+	<parameter varname="imm8" type='const int'/>
+	<description>Shuffle 16-bit integers in the high 64 bits of 128-bit lanes of "a" using the control in "imm8". Store the results in the high 64 bits of 128-bit lanes of "dst", with the low 64 bits of 128-bit lanes being copied from "a" to "dst".</description>
+	<operation>
+dst[63:0] := a[63:0]
+dst[79:64] := (a >> (imm8[1:0] * 16))[79:64]
+dst[95:80] := (a >> (imm8[3:2] * 16))[79:64]
+dst[111:96] := (a >> (imm8[5:4] * 16))[79:64]
+dst[127:112] := (a >> (imm8[7:6] * 16))[79:64]
+dst[191:128] := a[191:128]
+dst[207:192] := (a >> (imm8[1:0] * 16))[207:192]
+dst[223:208] := (a >> (imm8[3:2] * 16))[207:192]
+dst[239:224] := (a >> (imm8[5:4] * 16))[207:192]
+dst[255:240] := (a >> (imm8[7:6] * 16))[207:192]
+dst[MAX:256] := 0
+	</operation>
+	<instruction name='vpshufhw' form='ymm, ymm, imm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='AVX2' rettype='__m256i' name='_mm256_shufflelo_epi16'>
+	<type>Integer</type>
+	<CPUID>AVX2</CPUID>
+	<category>Swizzle</category>
+	<parameter varname='a' type='__m256i'/>
+	<parameter varname="imm8" type='const int'/>
+	<description>Shuffle 16-bit integers in the low 64 bits of 128-bit lanes of "a" using the control in "imm8". Store the results in the low 64 bits of 128-bit lanes of "dst", with the high 64 bits of 128-bit lanes being copied from "a" to "dst".</description>
+	<operation>
+dst[15:0] := (a >> (imm8[1:0] * 16))[15:0]
+dst[31:16] := (a >> (imm8[3:2] * 16))[15:0]
+dst[47:32] := (a >> (imm8[5:4] * 16))[15:0]
+dst[63:48] := (a >> (imm8[7:6] * 16))[15:0]
+dst[127:64] := a[127:64]
+dst[143:128] := (a >> (imm8[1:0] * 16))[143:128]
+dst[159:144] := (a >> (imm8[3:2] * 16))[143:128]
+dst[175:160] := (a >> (imm8[5:4] * 16))[143:128]
+dst[191:176] := (a >> (imm8[7:6] * 16))[143:128]
+dst[255:192] := a[255:192]
+dst[MAX:256] := 0
+	</operation>
+	<instruction name='vpshuflw' form='ymm, ymm, imm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='AVX2' rettype='__m256i' name='_mm256_sign_epi8'>
+	<type>Integer</type>
+	<CPUID>AVX2</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname='a' type='__m256i'/>
+	<parameter varname='b' type='__m256i'/>
+	<description>Negate packed 8-bit integers in "a" when the corresponding signed 8-bit integer in "b" is negative, and store the results in "dst". Elements in "dst" are zeroed out when the corresponding element in "b" is zero.</description>
+	<operation>
+FOR j := 0 to 31
+	i := j*8
+	IF b[i+7:i] &lt; 0
+		dst[i+7:i] := NEG(a[i+7:i])
+	ELSE IF b[i+7:i] = 0
+		dst[i+7:i] := 0
+	ELSE
+		dst[i+7:i] := a[i+7:i]
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name='vpsignb' form='ymm, ymm, ymm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='AVX2' rettype='__m256i' name='_mm256_sign_epi16'>
+	<type>Integer</type>
+	<CPUID>AVX2</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname='a' type='__m256i'/>
+	<parameter varname='b' type='__m256i'/>
+	<description>Negate packed 16-bit integers in "a" when the corresponding signed 16-bit integer in "b" is negative, and store the results in "dst". Elements in "dst" are zeroed out when the corresponding element in "b" is zero.</description>
+	<operation>
+FOR j := 0 to 15
+	i := j*16
+	IF b[i+15:i] &lt; 0
+		dst[i+15:i] := NEG(a[i+15:i])
+	ELSE IF b[i+15:i] = 0
+		dst[i+15:i] := 0
+	ELSE
+		dst[i+15:i] := a[i+15:i]
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name='vpsignw' form='ymm, ymm, ymm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='AVX2' rettype='__m256i' name='_mm256_sign_epi32'>
+	<type>Integer</type>
+	<CPUID>AVX2</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname='a' type='__m256i'/>
+	<parameter varname='b' type='__m256i'/>
+	<description>Negate packed 32-bit integers in "a" when the corresponding signed 32-bit integer in "b" is negative, and store the results in "dst". Elements in "dst" are zeroed out when the corresponding element in "b" is zero.</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*32
+	IF b[i+31:i] &lt; 0
+		dst[i+31:i] := NEG(a[i+31:i])
+	ELSE IF b[i+31:i] = 0
+		dst[i+31:i] := 0
+	ELSE
+		dst[i+31:i] := a[i+31:i]
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name='vpsignd' form='ymm, ymm, ymm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='AVX2' rettype='__m256i' name='_mm256_slli_si256'>
+	<type>Integer</type>
+	<CPUID>AVX2</CPUID>
+	<category>Shift</category>
+	<parameter varname='a' type='__m256i'/>
+	<parameter varname="imm8" type='const int'/>
+	<description>Shift 128-bit lanes in "a" left by "imm8" bytes while shifting in zeros, and store the results in "dst".</description>
+	<operation>
+tmp := imm8[7:0]
+IF tmp &gt; 15
+	tmp := 16
+FI
+dst[127:0] := a[127:0] &lt;&lt; (tmp*8)
+dst[255:128] := a[255:128] &lt;&lt; (tmp*8)
+dst[MAX:256] := 0
+	</operation>
+	<instruction name='vpslldq' form='ymm, ymm, imm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='AVX2' rettype='__m256i' name='_mm256_bslli_epi128'>
+	<type>Integer</type>
+	<CPUID>AVX2</CPUID>
+	<category>Shift</category>
+	<parameter varname='a' type='__m256i'/>
+	<parameter varname="imm8" type='const int'/>
+	<description>Shift 128-bit lanes in "a" left by "imm8" bytes while shifting in zeros, and store the results in "dst".</description>
+	<operation>
+tmp := imm8[7:0]
+IF tmp &gt; 15
+	tmp := 16
+FI
+dst[127:0] := a[127:0] &lt;&lt; (tmp*8)
+dst[255:128] := a[255:128] &lt;&lt; (tmp*8)
+dst[MAX:256] := 0
+	</operation>
+	<instruction name='vpslldq' form='ymm, ymm, imm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='AVX2' rettype='__m256i' name='_mm256_sll_epi16'>
+	<type>Integer</type>
+	<CPUID>AVX2</CPUID>
+	<category>Shift</category>
+	<parameter varname='a' type='__m256i'/>
+	<parameter varname='count' type='__m128i'/>
+	<description>Shift packed 16-bit integers in "a" left by "count" while shifting in zeros, and store the results in "dst". </description>
+	<operation>
+FOR j := 0 to 15
+	i := j*16
+	IF count[63:0] &gt; 15
+		dst[i+15:i] := 0
+	ELSE
+		dst[i+15:i] := ZeroExtend(a[i+15:i] &lt;&lt; count[63:0])
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name='vpsllw' form='ymm, ymm, xmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='AVX2' rettype='__m256i' name='_mm256_slli_epi16'>
+	<type>Integer</type>
+	<CPUID>AVX2</CPUID>
+	<category>Shift</category>
+	<parameter varname='a' type='__m256i'/>
+	<parameter varname="imm8" type='int'/>
+	<description>Shift packed 16-bit integers in "a" left by "imm8" while shifting in zeros, and store the results in "dst". </description>
+	<operation>
+FOR j := 0 to 15
+	i := j*16
+	IF imm8[7:0] &gt; 15
+		dst[i+15:i] := 0
+	ELSE
+		dst[i+15:i] := ZeroExtend(a[i+15:i] &lt;&lt; imm8[7:0])
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name='vpsllw' form='ymm, ymm, imm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='AVX2' rettype='__m256i' name='_mm256_sll_epi32'>
+	<type>Integer</type>
+	<CPUID>AVX2</CPUID>
+	<category>Shift</category>
+	<parameter varname='a' type='__m256i'/>
+	<parameter varname='count' type='__m128i'/>
+	<description>Shift packed 32-bit integers in "a" left by "count" while shifting in zeros, and store the results in "dst". </description>
+	<operation>
+FOR j := 0 to 7
+	i := j*32
+	IF count[63:0] &gt; 31
+		dst[i+31:i] := 0
+	ELSE
+		dst[i+31:i] := ZeroExtend(a[i+31:i] &lt;&lt; count[63:0])
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name='vpslld' form='ymm, ymm, xmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='AVX2' rettype='__m256i' name='_mm256_slli_epi32'>
+	<type>Integer</type>
+	<CPUID>AVX2</CPUID>
+	<category>Shift</category>
+	<parameter varname='a' type='__m256i'/>
+	<parameter varname="imm8" type='int'/>
+	<description>Shift packed 32-bit integers in "a" left by "imm8" while shifting in zeros, and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*32
+	IF imm8[7:0] &gt; 31
+		dst[i+31:i] := 0
+	ELSE
+		dst[i+31:i] := ZeroExtend(a[i+31:i] &lt;&lt; imm8[7:0])
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name='vpslld' form='ymm, ymm, imm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='AVX2' rettype='__m256i' name='_mm256_sll_epi64'>
+	<type>Integer</type>
+	<CPUID>AVX2</CPUID>
+	<category>Shift</category>
+	<parameter varname='a' type='__m256i'/>
+	<parameter varname='count' type='__m128i'/>
+	<description>Shift packed 64-bit integers in "a" left by "count" while shifting in zeros, and store the results in "dst". </description>
+	<operation>
+FOR j := 0 to 3
+	i := j*64
+	IF count[63:0] &gt; 63
+		dst[i+63:i] := 0
+	ELSE
+		dst[i+63:i] := ZeroExtend(a[i+63:i] &lt;&lt; count[63:0])
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name='vpsllq' form='ymm, ymm, xmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='AVX2' rettype='__m256i' name='_mm256_slli_epi64'>
+	<type>Integer</type>
+	<CPUID>AVX2</CPUID>
+	<category>Shift</category>
+	<parameter varname='a' type='__m256i'/>
+	<parameter varname="imm8" type='int'/>
+	<description>Shift packed 64-bit integers in "a" left by "imm8" while shifting in zeros, and store the results in "dst". </description>
+	<operation>
+FOR j := 0 to 3
+	i := j*64
+	IF imm8[7:0] &gt; 63
+		dst[i+63:i] := 0
+	ELSE
+		dst[i+63:i] := ZeroExtend(a[i+63:i] &lt;&lt; imm8[7:0])
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name='vpsllq' form='ymm, ymm, imm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='AVX2' rettype='__m128i' name='_mm_sllv_epi32'>
+	<type>Integer</type>
+	<CPUID>AVX2</CPUID>
+	<category>Shift</category>
+	<parameter varname='a' type='__m128i'/>
+	<parameter varname='count' type='__m128i'/>
+	<description>Shift packed 32-bit integers in "a" left by the amount specified by the corresponding element in "count" while shifting in zeros, and store the results in "dst". </description>
+	<operation>
+FOR j := 0 to 3
+	i := j*32
+	dst[i+31:i] := ZeroExtend(a[i+31:i] &lt;&lt; count[i+31:i])
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name='vpsllvd' form='xmm, xmm, xmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='AVX2' rettype='__m256i' name='_mm256_sllv_epi32'>
+	<type>Integer</type>
+	<CPUID>AVX2</CPUID>
+	<category>Shift</category>
+	<parameter varname='a' type='__m256i'/>
+	<parameter varname='count' type='__m256i'/>
+	<description>Shift packed 32-bit integers in "a" left by the amount specified by the corresponding element in "count" while shifting in zeros, and store the results in "dst". </description>
+	<operation>
+FOR j := 0 to 7
+	i := j*32
+	dst[i+31:i] := ZeroExtend(a[i+31:i] &lt;&lt; count[i+31:i])
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name='vpsllvd' form='ymm, ymm, ymm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='AVX2' rettype='__m128i' name='_mm_sllv_epi64'>
+	<type>Integer</type>
+	<CPUID>AVX2</CPUID>
+	<category>Shift</category>
+	<parameter varname='a' type='__m128i'/>
+	<parameter varname='count' type='__m128i'/>
+	<description>Shift packed 64-bit integers in "a" left by the amount specified by the corresponding element in "count" while shifting in zeros, and store the results in "dst". </description>
+	<operation>
+FOR j := 0 to 1
+	i := j*64
+	dst[i+63:i] := ZeroExtend(a[i+63:i] &lt;&lt; count[i+63:i])
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name='vpsllvq' form='xmm, xmm, xmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='AVX2' rettype='__m256i' name='_mm256_sllv_epi64'>
+	<type>Integer</type>
+	<CPUID>AVX2</CPUID>
+	<category>Shift</category>
+	<parameter varname='a' type='__m256i'/>
+	<parameter varname='count' type='__m256i'/>
+	<description>Shift packed 64-bit integers in "a" left by the amount specified by the corresponding element in "count" while shifting in zeros, and store the results in "dst". </description>
+	<operation>
+FOR j := 0 to 3
+	i := j*64
+	dst[i+63:i] := ZeroExtend(a[i+63:i] &lt;&lt; count[i+63:i])
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name='vpsllvq' form='ymm, ymm, ymm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='AVX2' rettype='__m256i' name='_mm256_sra_epi16'>
+	<type>Integer</type>
+	<CPUID>AVX2</CPUID>
+	<category>Shift</category>
+	<parameter varname='a' type='__m256i'/>
+	<parameter varname='count' type='__m128i'/>
+	<description>Shift packed 16-bit integers in "a" right by "count" while shifting in sign bits, and store the results in "dst". </description>
+	<operation>
+FOR j := 0 to 15
+	i := j*16
+	IF count[63:0] &gt; 15
+		dst[i+15:i] := SignBit
+	ELSE
+		dst[i+15:i] := SignExtend(a[i+15:i] &gt;&gt; count[63:0])
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name='vpsraw' form='ymm, ymm, xmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='AVX2' rettype='__m256i' name='_mm256_srai_epi16'>
+	<type>Integer</type>
+	<CPUID>AVX2</CPUID>
+	<category>Shift</category>
+	<parameter varname='a' type='__m256i'/>
+	<parameter varname="imm8" type='int'/>
+	<description>Shift packed 16-bit integers in "a" right by "imm8" while shifting in sign bits, and store the results in "dst". </description>
+	<operation>
+FOR j := 0 to 15
+	i := j*16
+	IF imm8[7:0] &gt; 15
+		dst[i+15:i] := SignBit
+	ELSE
+		dst[i+15:i] := SignExtend(a[i+15:i] &gt;&gt; imm8[7:0])
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name='vpsraw' form='ymm, ymm, imm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='AVX2' rettype='__m256i' name='_mm256_sra_epi32'>
+	<type>Integer</type>
+	<CPUID>AVX2</CPUID>
+	<category>Shift</category>
+	<parameter varname='a' type='__m256i'/>
+	<parameter varname='count' type='__m128i'/>
+	<description>Shift packed 32-bit integers in "a" right by "count" while shifting in sign bits, and store the results in "dst". </description>
+	<operation>
+FOR j := 0 to 7
+	i := j*32
+	IF count[63:0] &gt; 31
+		dst[i+31:i] := SignBit
+	ELSE
+		dst[i+31:i] := SignExtend(a[i+31:i] &gt;&gt; count[63:0])
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name='vpsrad' form='ymm, ymm, xmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='AVX2' rettype='__m256i' name='_mm256_srai_epi32'>
+	<type>Integer</type>
+	<CPUID>AVX2</CPUID>
+	<category>Shift</category>
+	<parameter varname='a' type='__m256i'/>
+	<parameter varname="imm8" type='int'/>
+	<description>Shift packed 32-bit integers in "a" right by "imm8" while shifting in sign bits, and store the results in "dst". </description>
+	<operation>
+FOR j := 0 to 7
+	i := j*32
+	IF imm8[7:0] &gt; 31
+		dst[i+31:i] := SignBit
+	ELSE
+		dst[i+31:i] := SignExtend(a[i+31:i] &gt;&gt; imm8[7:0])
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name='vpsrad' form='ymm, ymm, imm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='AVX2' rettype='__m128i' name='_mm_srav_epi32'>
+	<type>Integer</type>
+	<CPUID>AVX2</CPUID>
+	<category>Shift</category>
+	<parameter varname='a' type='__m128i'/>
+	<parameter varname='count' type='__m128i'/>
+	<description>Shift packed 32-bit integers in "a" right by the amount specified by the corresponding element in "count" while shifting in sign bits, and store the results in "dst". </description>
+	<operation>
+FOR j := 0 to 3
+	i := j*32
+	dst[i+31:i] := SignExtend(a[i+31:i] &gt;&gt; count[i+31:i])
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name='vpsravd' form='xmm, xmm, xmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='AVX2' rettype='__m256i' name='_mm256_srav_epi32'>
+	<type>Integer</type>
+	<CPUID>AVX2</CPUID>
+	<category>Shift</category>
+	<parameter varname='a' type='__m256i'/>
+	<parameter varname='count' type='__m256i'/>
+	<description>Shift packed 32-bit integers in "a" right by the amount specified by the corresponding element in "count" while shifting in sign bits, and store the results in "dst". </description>
+	<operation>
+FOR j := 0 to 7
+	i := j*32
+	dst[i+31:i] := SignExtend(a[i+31:i] &gt;&gt; count[i+31:i])
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name='vpsravd' form='ymm, ymm, ymm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='AVX2' rettype='__m256i' name='_mm256_srli_si256'>
+	<type>Integer</type>
+	<CPUID>AVX2</CPUID>
+	<category>Shift</category>
+	<parameter varname='a' type='__m256i'/>
+	<parameter varname="imm8" type='const int'/>
+	<description>Shift 128-bit lanes in "a" right by "imm8" bytes while shifting in zeros, and store the results in "dst".</description>
+	<operation>
+tmp := imm8[7:0]
+IF tmp &gt; 15
+	tmp := 16
+FI
+dst[127:0] := a[127:0] &gt;&gt; (tmp*8)
+dst[255:128] := a[255:128] &gt;&gt; (tmp*8)
+dst[MAX:256] := 0
+	</operation>
+	<instruction name='vpsrldq' form='ymm, ymm, imm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='AVX2' rettype='__m256i' name='_mm256_bsrli_epi128'>
+	<type>Integer</type>
+	<CPUID>AVX2</CPUID>
+	<category>Shift</category>
+	<parameter varname='a' type='__m256i'/>
+	<parameter varname="imm8" type='const int'/>
+	<description>Shift 128-bit lanes in "a" right by "imm8" bytes while shifting in zeros, and store the results in "dst".</description>
+	<operation>
+tmp := imm8[7:0]
+IF tmp &gt; 15
+	tmp := 16
+FI
+dst[127:0] := a[127:0] &gt;&gt; (tmp*8)
+dst[255:128] := a[255:128] &gt;&gt; (tmp*8)
+dst[MAX:256] := 0
+	</operation>
+	<instruction name='vpsrldq' form='ymm, ymm, imm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='AVX2' rettype='__m256i' name='_mm256_srl_epi16'>
+	<type>Integer</type>
+	<CPUID>AVX2</CPUID>
+	<category>Shift</category>
+	<parameter varname='a' type='__m256i'/>
+	<parameter varname='count' type='__m128i'/>
+	<description>Shift packed 16-bit integers in "a" right by "count" while shifting in zeros, and store the results in "dst". </description>
+	<operation>
+FOR j := 0 to 15
+	i := j*16
+	IF count[63:0] &gt; 15
+		dst[i+15:i] := 0
+	ELSE
+		dst[i+15:i] := ZeroExtend(a[i+15:i] &gt;&gt; count[63:0])
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name='vpsrlw' form='ymm, ymm, xmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='AVX2' rettype='__m256i' name='_mm256_srli_epi16'>
+	<type>Integer</type>
+	<CPUID>AVX2</CPUID>
+	<category>Shift</category>
+	<parameter varname='a' type='__m256i'/>
+	<parameter varname="imm8" type='int'/>
+	<description>Shift packed 16-bit integers in "a" right by "imm8" while shifting in zeros, and store the results in "dst". </description>
+	<operation>
+FOR j := 0 to 15
+	i := j*16
+	IF imm8[7:0] &gt; 15
+		dst[i+15:i] := 0
+	ELSE
+		dst[i+15:i] := ZeroExtend(a[i+15:i] &gt;&gt; imm8[7:0])
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name='vpsrlw' form='ymm, ymm, imm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='AVX2' rettype='__m256i' name='_mm256_srl_epi32'>
+	<type>Integer</type>
+	<CPUID>AVX2</CPUID>
+	<category>Shift</category>
+	<parameter varname='a' type='__m256i'/>
+	<parameter varname='count' type='__m128i'/>
+	<description>Shift packed 32-bit integers in "a" right by "count" while shifting in zeros, and store the results in "dst". </description>
+	<operation>
+FOR j := 0 to 7
+	i := j*32
+	IF count[63:0] &gt; 31
+		dst[i+31:i] := 0
+	ELSE
+		dst[i+31:i] := ZeroExtend(a[i+31:i] &gt;&gt; count[63:0])
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name='vpsrld' form='ymm, ymm, xmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='AVX2' rettype='__m256i' name='_mm256_srli_epi32'>
+	<type>Integer</type>
+	<CPUID>AVX2</CPUID>
+	<category>Shift</category>
+	<parameter varname='a' type='__m256i'/>
+	<parameter varname="imm8" type='int'/>
+	<description>Shift packed 32-bit integers in "a" right by "imm8" while shifting in zeros, and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*32
+	IF imm8[7:0] &gt; 31
+		dst[i+31:i] := 0
+	ELSE
+		dst[i+31:i] := ZeroExtend(a[i+31:i] &gt;&gt; imm8[7:0])
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name='vpsrld' form='ymm, ymm, imm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='AVX2' rettype='__m256i' name='_mm256_srl_epi64'>
+	<type>Integer</type>
+	<CPUID>AVX2</CPUID>
+	<category>Shift</category>
+	<parameter varname='a' type='__m256i'/>
+	<parameter varname='count' type='__m128i'/>
+	<description>Shift packed 64-bit integers in "a" right by "count" while shifting in zeros, and store the results in "dst". </description>
+	<operation>
+FOR j := 0 to 3
+	i := j*64
+	IF count[63:0] &gt; 63
+		dst[i+63:i] := 0
+	ELSE
+		dst[i+63:i] := ZeroExtend(a[i+63:i] &gt;&gt; count[63:0])
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name='vpsrlq' form='ymm, ymm, xmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='AVX2' rettype='__m256i' name='_mm256_srli_epi64'>
+	<type>Integer</type>
+	<CPUID>AVX2</CPUID>
+	<category>Shift</category>
+	<parameter varname='a' type='__m256i'/>
+	<parameter varname="imm8" type='int'/>
+	<description>Shift packed 64-bit integers in "a" right by "imm8" while shifting in zeros, and store the results in "dst". </description>
+	<operation>
+FOR j := 0 to 3
+	i := j*64
+	IF imm8[7:0] &gt; 63
+		dst[i+63:i] := 0
+	ELSE
+		dst[i+63:i] := ZeroExtend(a[i+63:i] &gt;&gt; imm8[7:0])
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name='vpsrlq' form='ymm, ymm, imm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='AVX2' rettype='__m128i' name='_mm_srlv_epi32'>
+	<type>Integer</type>
+	<CPUID>AVX2</CPUID>
+	<category>Shift</category>
+	<parameter varname='a' type='__m128i'/>
+	<parameter varname='count' type='__m128i'/>
+	<description>Shift packed 32-bit integers in "a" right by the amount specified by the corresponding element in "count" while shifting in zeros, and store the results in "dst". </description>
+	<operation>
+FOR j := 0 to 3
+	i := j*32
+	dst[i+31:i] := ZeroExtend(a[i+31:i] &gt;&gt; count[i+31:i])
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name='vpsrlvd' form='xmm, xmm, xmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='AVX2' rettype='__m256i' name='_mm256_srlv_epi32'>
+	<type>Integer</type>
+	<CPUID>AVX2</CPUID>
+	<category>Shift</category>
+	<parameter varname='a' type='__m256i'/>
+	<parameter varname='count' type='__m256i'/>
+	<description>Shift packed 32-bit integers in "a" right by the amount specified by the corresponding element in "count" while shifting in zeros, and store the results in "dst". </description>
+	<operation>
+FOR j := 0 to 7
+	i := j*32
+	dst[i+31:i] := ZeroExtend(a[i+31:i] &gt;&gt; count[i+31:i])
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name='vpsrlvd' form='ymm, ymm, ymm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='AVX2' rettype='__m128i' name='_mm_srlv_epi64'>
+	<type>Integer</type>
+	<CPUID>AVX2</CPUID>
+	<category>Shift</category>
+	<parameter varname='a' type='__m128i'/>
+	<parameter varname='count' type='__m128i'/>
+	<description>Shift packed 64-bit integers in "a" right by the amount specified by the corresponding element in "count" while shifting in zeros, and store the results in "dst". </description>
+	<operation>
+FOR j := 0 to 1
+	i := j*64
+	dst[i+63:i] := ZeroExtend(a[i+63:i] &gt;&gt; count[i+63:i])
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name='vpsrlvq' form='xmm, xmm, xmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='AVX2' rettype='__m256i' name='_mm256_srlv_epi64'>
+	<type>Integer</type>
+	<CPUID>AVX2</CPUID>
+	<category>Shift</category>
+	<parameter varname='a' type='__m256i'/>
+	<parameter varname='count' type='__m256i'/>
+	<description>Shift packed 64-bit integers in "a" right by the amount specified by the corresponding element in "count" while shifting in zeros, and store the results in "dst". </description>
+	<operation>
+FOR j := 0 to 3
+	i := j*64
+	dst[i+63:i] := ZeroExtend(a[i+63:i] &gt;&gt; count[i+63:i])
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name='vpsrlvq' form='ymm, ymm, ymm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='AVX2' rettype='__m256i' name='_mm256_stream_load_si256'>
+	<type>Integer</type>
+	<CPUID>AVX2</CPUID>
+	<category>Load</category>
+	<parameter varname='mem_addr' type='__m256i const*'/>
+	<description>Load 256 bits of integer data from memory into "dst" using a non-temporal memory hint.
+	"mem_addr" must be aligned on a 32-byte boundary or a general-protection exception may be generated.</description>
+	<operation>
+dst[255:0] := MEM[mem_addr+255:mem_addr]
+dst[MAX:256] := 0
+	</operation>
+	<instruction name='vmovntdqa' form='ymm, m256'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='AVX2' rettype='__m256i' name='_mm256_sub_epi8'>
+	<type>Integer</type>
+	<CPUID>AVX2</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname='a' type='__m256i'/>
+	<parameter varname='b' type='__m256i'/>
+	<description>Subtract packed 8-bit integers in "b" from packed 8-bit integers in "a", and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 31
+	i := j*8
+	dst[i+7:i] := a[i+7:i] - b[i+7:i]
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name='vpsubb' form='ymm, ymm, ymm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='AVX2' rettype='__m256i' name='_mm256_sub_epi16'>
+	<type>Integer</type>
+	<CPUID>AVX2</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname='a' type='__m256i'/>
+	<parameter varname='b' type='__m256i'/>
+	<description>Subtract packed 16-bit integers in "b" from packed 16-bit integers in "a", and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 15
+	i := j*16
+	dst[i+15:i] := a[i+15:i] - b[i+15:i]
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name='vpsubw' form='ymm, ymm, ymm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='AVX2' rettype='__m256i' name='_mm256_sub_epi32'>
+	<type>Integer</type>
+	<CPUID>AVX2</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname='a' type='__m256i'/>
+	<parameter varname='b' type='__m256i'/>
+	<description>Subtract packed 32-bit integers in "b" from packed 32-bit integers in "a", and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*32
+	dst[i+31:i] := a[i+31:i] - b[i+31:i]
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name='vpsubd' form='ymm, ymm, ymm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='AVX2' rettype='__m256i' name='_mm256_sub_epi64'>
+	<type>Integer</type>
+	<CPUID>AVX2</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname='a' type='__m256i'/>
+	<parameter varname='b' type='__m256i'/>
+	<description>Subtract packed 64-bit integers in "b" from packed 64-bit integers in "a", and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 3
+	i := j*64
+	dst[i+63:i] := a[i+63:i] - b[i+63:i]
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name='vpsubq' form='ymm, ymm, ymm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='AVX2' rettype='__m256i' name='_mm256_subs_epi8'>
+	<type>Integer</type>
+	<CPUID>AVX2</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname='a' type='__m256i'/>
+	<parameter varname='b' type='__m256i'/>
+	<description>Subtract packed 8-bit integers in "b" from packed 8-bit integers in "a" using saturation, and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 31
+	i := j*8
+	dst[i+7:i] := Saturate_To_Int8(a[i+7:i] - b[i+7:i])	
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name='vpsubsb' form='ymm, ymm, ymm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='AVX2' rettype='__m256i' name='_mm256_subs_epi16'>
+	<type>Integer</type>
+	<CPUID>AVX2</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname='a' type='__m256i'/>
+	<parameter varname='b' type='__m256i'/>
+	<description>Subtract packed 16-bit integers in "b" from packed 16-bit integers in "a" using saturation, and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 15
+	i := j*16
+	dst[i+15:i] := Saturate_To_Int16(a[i+15:i] - b[i+15:i])
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name='vpsubsw' form='ymm, ymm, ymm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='AVX2' rettype='__m256i' name='_mm256_subs_epu8'>
+	<type>Integer</type>
+	<CPUID>AVX2</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname='a' type='__m256i'/>
+	<parameter varname='b' type='__m256i'/>
+	<description>Subtract packed unsigned 8-bit integers in "b" from packed unsigned 8-bit integers in "a" using saturation, and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 31
+	i := j*8
+	dst[i+7:i] := Saturate_To_UnsignedInt8(a[i+7:i] - b[i+7:i])	
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name='vpsubusb' form='ymm, ymm, ymm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='AVX2' rettype='__m256i' name='_mm256_subs_epu16'>
+	<type>Integer</type>
+	<CPUID>AVX2</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname='a' type='__m256i'/>
+	<parameter varname='b' type='__m256i'/>
+	<description>Subtract packed unsigned 16-bit integers in "b" from packed unsigned 16-bit integers in "a" using saturation, and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 15
+	i := j*16
+	dst[i+15:i] := Saturate_To_UnsignedInt16(a[i+15:i] - b[i+15:i])	
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name='vpsubusw' form='ymm, ymm, ymm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='AVX2' rettype='__m256i' name='_mm256_xor_si256'>
+	<type>Integer</type>
+	<CPUID>AVX2</CPUID>
+	<category>Logical</category>
+	<parameter varname='a' type='__m256i'/>
+	<parameter varname='b' type='__m256i'/>
+	<description>Compute the bitwise XOR of 256 bits (representing integer data) in "a" and "b", and store the result in "dst".</description>
+	<operation>
+dst[255:0] := (a[255:0] XOR b[255:0])
+dst[MAX:256] := 0
+	</operation>
+	<instruction name='vpxor' form='ymm, ymm, ymm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='AVX2' rettype='__m256i' name='_mm256_unpackhi_epi8'>
+	<type>Integer</type>
+	<CPUID>AVX2</CPUID>
+	<category>Swizzle</category>
+	<parameter varname='a' type='__m256i'/>
+	<parameter varname='b' type='__m256i'/>
+	<description>Unpack and interleave 8-bit integers from the high half of each 128-bit lane in "a" and "b", and store the results in "dst". </description>
+	<operation>
+INTERLEAVE_HIGH_BYTES(src1[127:0], src2[127:0]){
+	dst[7:0] := src1[71:64] 
+	dst[15:8] := src2[71:64] 
+	dst[23:16] := src1[79:72] 
+	dst[31:24] := src2[79:72] 
+	dst[39:32] := src1[87:80] 
+	dst[47:40] := src2[87:80] 
+	dst[55:48] := src1[95:88] 
+	dst[63:56] := src2[95:88] 
+	dst[71:64] := src1[103:96] 
+	dst[79:72] := src2[103:96] 
+	dst[87:80] := src1[111:104] 
+	dst[95:88] := src2[111:104] 
+	dst[103:96] := src1[119:112] 
+	dst[111:104] := src2[119:112] 
+	dst[119:112] := src1[127:120] 
+	dst[127:120] := src2[127:120] 
+	RETURN dst[127:0]
+}	
+
+dst[127:0] := INTERLEAVE_HIGH_BYTES(a[127:0], b[127:0])
+dst[255:128] := INTERLEAVE_HIGH_BYTES(a[255:128], b[255:128])
+dst[MAX:256] := 0
+	</operation>
+	<instruction name='vpunpckhbw' form='ymm, ymm, ymm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='AVX2' rettype='__m256i' name='_mm256_unpackhi_epi16'>
+	<type>Integer</type>
+	<CPUID>AVX2</CPUID>
+	<category>Swizzle</category>
+	<parameter varname='a' type='__m256i'/>
+	<parameter varname='b' type='__m256i'/>
+	<description>Unpack and interleave 16-bit integers from the high half of each 128-bit lane in "a" and "b", and store the results in "dst". </description>
+	<operation>
+INTERLEAVE_HIGH_WORDS(src1[127:0], src2[127:0]){
+	dst[15:0] := src1[79:64]
+	dst[31:16] := src2[79:64] 
+	dst[47:32] := src1[95:80] 
+	dst[63:48] := src2[95:80] 
+	dst[79:64] := src1[111:96] 
+	dst[95:80] := src2[111:96] 
+	dst[111:96] := src1[127:112] 
+	dst[127:112] := src2[127:112] 
+	RETURN dst[127:0]
+}
+
+dst[127:0] := INTERLEAVE_HIGH_WORDS(a[127:0], b[127:0])
+dst[255:128] := INTERLEAVE_HIGH_WORDS(a[255:128], b[255:128])
+dst[MAX:256] := 0
+	</operation>
+	<instruction name='vpunpckhwd' form='ymm, ymm, ymm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='AVX2' rettype='__m256i' name='_mm256_unpackhi_epi32'>
+	<type>Integer</type>
+	<CPUID>AVX2</CPUID>
+	<category>Swizzle</category>
+	<parameter varname='a' type='__m256i'/>
+	<parameter varname='b' type='__m256i'/>
+	<description>Unpack and interleave 32-bit integers from the high half of each 128-bit lane in "a" and "b", and store the results in "dst". </description>
+	<operation>
+INTERLEAVE_HIGH_DWORDS(src1[127:0], src2[127:0]){
+	dst[31:0] := src1[95:64] 
+	dst[63:32] := src2[95:64] 
+	dst[95:64] := src1[127:96] 
+	dst[127:96] := src2[127:96] 
+	RETURN dst[127:0]
+}	
+
+dst[127:0] := INTERLEAVE_HIGH_DWORDS(a[127:0], b[127:0])
+dst[255:128] := INTERLEAVE_HIGH_DWORDS(a[255:128], b[255:128])
+dst[MAX:256] := 0
+	</operation>
+	<instruction name='vpunpckhdq' form='ymm, ymm, ymm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='AVX2' rettype='__m256i' name='_mm256_unpackhi_epi64'>
+	<type>Integer</type>
+	<CPUID>AVX2</CPUID>
+	<category>Swizzle</category>
+	<parameter varname='a' type='__m256i'/>
+	<parameter varname='b' type='__m256i'/>
+	<description>Unpack and interleave 64-bit integers from the high half of each 128-bit lane in "a" and "b", and store the results in "dst". </description>
+	<operation>
+INTERLEAVE_HIGH_QWORDS(src1[127:0], src2[127:0]){
+	dst[63:0] := src1[127:64] 
+	dst[127:64] := src2[127:64] 
+	RETURN dst[127:0]
+}	
+
+dst[127:0] := INTERLEAVE_HIGH_QWORDS(a[127:0], b[127:0])
+dst[255:128] := INTERLEAVE_HIGH_QWORDS(a[255:128], b[255:128])
+dst[MAX:256] := 0
+	</operation>
+	<instruction name='vpunpckhqdq' form='ymm, ymm, ymm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='AVX2' rettype='__m256i' name='_mm256_unpacklo_epi8'>
+	<type>Integer</type>
+	<CPUID>AVX2</CPUID>
+	<category>Swizzle</category>
+	<parameter varname='a' type='__m256i'/>
+	<parameter varname='b' type='__m256i'/>
+	<description>Unpack and interleave 8-bit integers from the low half of each 128-bit lane in "a" and "b", and store the results in "dst". </description>
+	<operation>
+INTERLEAVE_BYTES(src1[127:0], src2[127:0]){
+	dst[7:0] := src1[7:0] 
+	dst[15:8] := src2[7:0] 
+	dst[23:16] := src1[15:8] 
+	dst[31:24] := src2[15:8] 
+	dst[39:32] := src1[23:16] 
+	dst[47:40] := src2[23:16] 
+	dst[55:48] := src1[31:24] 
+	dst[63:56] := src2[31:24] 
+	dst[71:64] := src1[39:32]
+	dst[79:72] := src2[39:32] 
+	dst[87:80] := src1[47:40] 
+	dst[95:88] := src2[47:40] 
+	dst[103:96] := src1[55:48] 
+	dst[111:104] := src2[55:48] 
+	dst[119:112] := src1[63:56] 
+	dst[127:120] := src2[63:56] 
+	RETURN dst[127:0]
+}
+
+dst[127:0] := INTERLEAVE_BYTES(a[127:0], b[127:0])
+dst[255:128] := INTERLEAVE_BYTES(a[255:128], b[255:128])
+dst[MAX:256] := 0
+	</operation>
+	<instruction name='vpunpcklbw' form='ymm, ymm, ymm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='AVX2' rettype='__m256i' name='_mm256_unpacklo_epi16'>
+	<type>Integer</type>
+	<CPUID>AVX2</CPUID>
+	<category>Swizzle</category>
+	<parameter varname='a' type='__m256i'/>
+	<parameter varname='b' type='__m256i'/>
+	<description>Unpack and interleave 16-bit integers from the low half of each 128-bit lane in "a" and "b", and store the results in "dst". </description>
+	<operation>
+INTERLEAVE_WORDS(src1[127:0], src2[127:0]){
+	dst[15:0] := src1[15:0] 
+	dst[31:16] := src2[15:0] 
+	dst[47:32] := src1[31:16] 
+	dst[63:48] := src2[31:16] 
+	dst[79:64] := src1[47:32] 
+	dst[95:80] := src2[47:32] 
+	dst[111:96] := src1[63:48] 
+	dst[127:112] := src2[63:48] 
+	RETURN dst[127:0]
+}	
+
+dst[127:0] := INTERLEAVE_WORDS(a[127:0], b[127:0])
+dst[255:128] := INTERLEAVE_WORDS(a[255:128], b[255:128])
+dst[MAX:256] := 0
+	</operation>
+	<instruction name='vpunpcklwd' form='ymm, ymm, ymm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='AVX2' rettype='__m256i' name='_mm256_unpacklo_epi32'>
+	<type>Integer</type>
+	<CPUID>AVX2</CPUID>
+	<category>Swizzle</category>
+	<parameter varname='a' type='__m256i'/>
+	<parameter varname='b' type='__m256i'/>
+	<description>Unpack and interleave 32-bit integers from the low half of each 128-bit lane in "a" and "b", and store the results in "dst". </description>
+	<operation>
+INTERLEAVE_DWORDS(src1[127:0], src2[127:0]){
+	dst[31:0] := src1[31:0] 
+	dst[63:32] := src2[31:0] 
+	dst[95:64] := src1[63:32] 
+	dst[127:96] := src2[63:32] 
+	RETURN dst[127:0]
+}	
+
+dst[127:0] := INTERLEAVE_DWORDS(a[127:0], b[127:0])
+dst[255:128] := INTERLEAVE_DWORDS(a[255:128], b[255:128])
+dst[MAX:256] := 0
+	</operation>
+	<instruction name='vpunpckldq' form='ymm, ymm, ymm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='AVX2' rettype='__m256i' name='_mm256_unpacklo_epi64'>
+	<type>Integer</type>
+	<CPUID>AVX2</CPUID>
+	<category>Swizzle</category>
+	<parameter varname='a' type='__m256i'/>
+	<parameter varname='b' type='__m256i'/>
+	<description>Unpack and interleave 64-bit integers from the low half of each 128-bit lane in "a" and "b", and store the results in "dst". </description>
+	<operation>
+INTERLEAVE_QWORDS(src1[127:0], src2[127:0]){
+	dst[63:0] := src1[63:0] 
+	dst[127:64] := src2[63:0] 
+	RETURN dst[127:0]
+}
+
+dst[127:0] := INTERLEAVE_QWORDS(a[127:0], b[127:0])
+dst[255:128] := INTERLEAVE_QWORDS(a[255:128], b[255:128])
+dst[MAX:256] := 0
+	</operation>
+	<instruction name='vpunpcklqdq' form='ymm, ymm, ymm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='FMA' rettype='__m128d' name='_mm_fmadd_pd'>
+	<type>Floating Point</type>
+	<CPUID>FMA</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname='a' type='__m128d'/>
+	<parameter varname='b' type='__m128d'/>
+	<parameter varname='c' type='__m128d'/>
+	<description>Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", add the intermediate result to packed elements in "c", and store the results in "dst". 
+	</description>
+	<operation>
+FOR j := 0 to 1
+	i := j*64
+	dst[i+63:i] := (a[i+63:i] * b[i+63:i]) + c[i+63:i]
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name='vfmadd132pd' form='xmm, xmm, xmm'/>
+	<instruction name='vfmadd213pd' form='xmm, xmm, xmm'/>
+	<instruction name='vfmadd231pd' form='xmm, xmm, xmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='FMA' rettype='__m256d' name='_mm256_fmadd_pd'>
+	<type>Floating Point</type>
+	<CPUID>FMA</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname='a' type='__m256d'/>
+	<parameter varname='b' type='__m256d'/>
+	<parameter varname='c' type='__m256d'/>
+	<description>Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", add the intermediate result to packed elements in "c", and store the results in "dst". 
+	</description>
+	<operation>
+FOR j := 0 to 3
+	i := j*64
+	dst[i+63:i] := (a[i+63:i] * b[i+63:i]) + c[i+63:i]
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name='vfmadd132pd' form='ymm, ymm, ymm'/>
+	<instruction name='vfmadd213pd' form='ymm, ymm, ymm'/>
+	<instruction name='vfmadd231pd' form='ymm, ymm, ymm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='FMA' rettype='__m128' name='_mm_fmadd_ps'>
+	<type>Floating Point</type>
+	<CPUID>FMA</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname='a' type='__m128'/>
+	<parameter varname='b' type='__m128'/>
+	<parameter varname='c' type='__m128'/>
+		<description>Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", add the intermediate result to packed elements in "c", and store the results in "dst". 
+	</description>
+	<operation>
+FOR j := 0 to 3
+	i := j*32
+	dst[i+31:i] := (a[i+31:i] * b[i+31:i]) + c[i+31:i]
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name='vfmadd132ps' form='xmm, xmm, xmm'/>
+	<instruction name='vfmadd213ps' form='xmm, xmm, xmm'/>
+	<instruction name='vfmadd231ps' form='xmm, xmm, xmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='FMA' rettype='__m256' name='_mm256_fmadd_ps'>
+	<type>Floating Point</type>
+	<CPUID>FMA</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname='a' type='__m256'/>
+	<parameter varname='b' type='__m256'/>
+	<parameter varname='c' type='__m256'/>
+	<description>Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", add the intermediate result to packed elements in "c", and store the results in "dst". 
+	</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*32
+	dst[i+31:i] := (a[i+31:i] * b[i+31:i]) + c[i+31:i]
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name='vfmadd132ps' form='ymm, ymm, ymm'/>
+	<instruction name='vfmadd213ps' form='ymm, ymm, ymm'/>
+	<instruction name='vfmadd231ps' form='ymm, ymm, ymm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='FMA' rettype='__m128d' name='_mm_fmadd_sd'>
+	<type>Floating Point</type>
+	<CPUID>FMA</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname='a' type='__m128d'/>
+	<parameter varname='b' type='__m128d'/>
+	<parameter varname='c' type='__m128d'/>
+	<description>Multiply the lower double-precision (64-bit) floating-point elements in "a" and "b", and add the intermediate result to the lower element in "c". Store the result in the lower element of "dst", and copy the upper element from "a" to the upper element of "dst".
+	</description>
+	<operation>
+dst[63:0] := (a[63:0] * b[63:0]) + c[63:0]
+dst[127:64] := a[127:64]
+dst[MAX:128] := 0
+	</operation>
+	<instruction name='vfmadd132sd' form='xmm, xmm, xmm'/>
+	<instruction name='vfmadd213sd' form='xmm, xmm, xmm'/>
+	<instruction name='vfmadd231sd' form='xmm, xmm, xmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='FMA' rettype='__m128' name='_mm_fmadd_ss'>
+	<type>Floating Point</type>
+	<CPUID>FMA</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname='a' type='__m128'/>
+	<parameter varname='b' type='__m128'/>
+	<parameter varname='c' type='__m128'/>
+	<description>Multiply the lower single-precision (32-bit) floating-point elements in "a" and "b", and add the intermediate result to the lower element in "c". Store the result in the lower element of "dst", and copy the upper 3 packed elements from "a" to the upper elements of "dst".
+	</description>
+	<operation>
+dst[31:0] := (a[31:0] * b[31:0]) + c[31:0]
+dst[127:32] := a[127:32]
+dst[MAX:128] := 0
+	</operation>
+	<instruction name='vfmadd132ss' form='xmm, xmm, xmm'/>
+	<instruction name='vfmadd213ss' form='xmm, xmm, xmm'/>
+	<instruction name='vfmadd231ss' form='xmm, xmm, xmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='FMA' rettype='__m128d' name='_mm_fmaddsub_pd'>
+	<type>Floating Point</type>
+	<CPUID>FMA</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname='a' type='__m128d'/>
+	<parameter varname='b' type='__m128d'/>
+	<parameter varname='c' type='__m128d'/>
+	<description>Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", alternately add and subtract packed elements in "c" to/from the intermediate result, and store the results in "dst". 
+	</description>
+	<operation>
+FOR j := 0 to 1
+	i := j*64
+	IF (j is even) 
+		dst[i+63:i] := (a[i+63:i] * b[i+63:i]) - c[i+63:i]
+	ELSE
+		dst[i+63:i] := (a[i+63:i] * b[i+63:i]) + c[i+63:i]
+	FI
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name='vfmaddsub132pd' form='xmm, xmm, xmm'/>
+	<instruction name='vfmaddsub213pd' form='xmm, xmm, xmm'/>
+	<instruction name='vfmaddsub231pd' form='xmm, xmm, xmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='FMA' rettype='__m256d' name='_mm256_fmaddsub_pd'>
+	<type>Floating Point</type>
+	<CPUID>FMA</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname='a' type='__m256d'/>
+	<parameter varname='b' type='__m256d'/>
+	<parameter varname='c' type='__m256d'/>
+	<description>Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", alternately add and subtract packed elements in "c" to/from the intermediate result, and store the results in "dst". 
+	</description>
+	<operation>
+FOR j := 0 to 3
+	i := j*64
+	IF (j is even) 
+		dst[i+63:i] := (a[i+63:i] * b[i+63:i]) - c[i+63:i]
+	ELSE
+		dst[i+63:i] := (a[i+63:i] * b[i+63:i]) + c[i+63:i]
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name='vfmaddsub132pd' form='ymm, ymm, ymm'/>
+	<instruction name='vfmaddsub213pd' form='ymm, ymm, ymm'/>
+	<instruction name='vfmaddsub231pd' form='ymm, ymm, ymm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='FMA' rettype='__m128' name='_mm_fmaddsub_ps'>
+	<type>Floating Point</type>
+	<CPUID>FMA</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname='a' type='__m128'/>
+	<parameter varname='b' type='__m128'/>
+	<parameter varname='c' type='__m128'/>
+	<description>Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", alternately add and subtract packed elements in "c" to/from the intermediate result, and store the results in "dst". 
+	</description>
+	<operation>
+FOR j := 0 to 3
+	i := j*32
+	IF (j is even) 
+		dst[i+31:i] := (a[i+31:i] * b[i+31:i]) - c[i+31:i]
+	ELSE
+		dst[i+31:i] := (a[i+31:i] * b[i+31:i]) + c[i+31:i]
+	FI
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name='vfmaddsub132ps' form='xmm, xmm, xmm'/>
+	<instruction name='vfmaddsub213ps' form='xmm, xmm, xmm'/>
+	<instruction name='vfmaddsub231ps' form='xmm, xmm, xmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='FMA' rettype='__m256' name='_mm256_fmaddsub_ps'>
+	<type>Floating Point</type>
+	<CPUID>FMA</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname='a' type='__m256'/>
+	<parameter varname='b' type='__m256'/>
+	<parameter varname='c' type='__m256'/>
+	<description>Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", alternately add and subtract packed elements in "c" to/from the intermediate result, and store the results in "dst". 
+	</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*32
+	IF (j is even) 
+		dst[i+31:i] := (a[i+31:i] * b[i+31:i]) - c[i+31:i]
+	ELSE
+		dst[i+31:i] := (a[i+31:i] * b[i+31:i]) + c[i+31:i]
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name='vfmaddsub132ps' form='ymm, ymm, ymm'/>
+	<instruction name='vfmaddsub213ps' form='ymm, ymm, ymm'/>
+	<instruction name='vfmaddsub231ps' form='ymm, ymm, ymm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='FMA' rettype='__m128d' name='_mm_fmsub_pd'>
+	<type>Floating Point</type>
+	<CPUID>FMA</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname='a' type='__m128d'/>
+	<parameter varname='b' type='__m128d'/>
+	<parameter varname='c' type='__m128d'/>
+	<description>Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the intermediate result, and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 1
+	i := j*64
+	dst[i+63:i] := (a[i+63:i] * b[i+63:i]) - c[i+63:i]
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name='vfmsub132pd' form='xmm, xmm, xmm'/>
+	<instruction name='vfmsub213pd' form='xmm, xmm, xmm'/>
+	<instruction name='vfmsub231pd' form='xmm, xmm, xmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='FMA' rettype='__m256d' name='_mm256_fmsub_pd'>
+	<type>Floating Point</type>
+	<CPUID>FMA</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname='a' type='__m256d'/>
+	<parameter varname='b' type='__m256d'/>
+	<parameter varname='c' type='__m256d'/>
+	<description>Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the intermediate result, and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 3
+	i := j*64
+	dst[i+63:i] := (a[i+63:i] * b[i+63:i]) - c[i+63:i]
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name='vfmsub132pd' form='ymm, ymm, ymm'/>
+	<instruction name='vfmsub213pd' form='ymm, ymm, ymm'/>
+	<instruction name='vfmsub231pd' form='ymm, ymm, ymm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='FMA' rettype='__m128' name='_mm_fmsub_ps'>
+	<type>Floating Point</type>
+	<CPUID>FMA</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname='a' type='__m128'/>
+	<parameter varname='b' type='__m128'/>
+	<parameter varname='c' type='__m128'/>
+	<description>Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the intermediate result, and store the results in "dst". 
+	</description>
+	<operation>
+FOR j := 0 to 3
+	i := j*32
+	dst[i+31:i] := (a[i+31:i] * b[i+31:i]) - c[i+31:i]
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name='vfmsub132ps' form='xmm, xmm, xmm'/>
+	<instruction name='vfmsub213ps' form='xmm, xmm, xmm'/>
+	<instruction name='vfmsub231ps' form='xmm, xmm, xmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='FMA' rettype='__m256' name='_mm256_fmsub_ps'>
+	<type>Floating Point</type>
+	<CPUID>FMA</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname='a' type='__m256'/>
+	<parameter varname='b' type='__m256'/>
+	<parameter varname='c' type='__m256'/>
+	<description>Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the intermediate result, and store the results in "dst". 
+	</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*32
+	dst[i+31:i] := (a[i+31:i] * b[i+31:i]) - c[i+31:i]
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name='vfmsub132ps' form='ymm, ymm, ymm'/>
+	<instruction name='vfmsub213ps' form='ymm, ymm, ymm'/>
+	<instruction name='vfmsub231ps' form='ymm, ymm, ymm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='FMA' rettype='__m128d' name='_mm_fmsub_sd'>
+	<type>Floating Point</type>
+	<CPUID>FMA</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname='a' type='__m128d'/>
+	<parameter varname='b' type='__m128d'/>
+	<parameter varname='c' type='__m128d'/>
+	<description>Multiply the lower double-precision (64-bit) floating-point elements in "a" and "b", and subtract the lower element in "c" from the intermediate result. Store the result in the lower element of "dst", and copy the upper element from "a" to the upper element of "dst".</description>
+	<operation>
+dst[63:0] := (a[63:0] * b[63:0]) - c[63:0]
+dst[127:64] := a[127:64]
+dst[MAX:128] := 0
+	</operation>
+	<instruction name='vfmsub132sd' form='xmm, xmm, xmm'/>
+	<instruction name='vfmsub213sd' form='xmm, xmm, xmm'/>
+	<instruction name='vfmsub231sd' form='xmm, xmm, xmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='FMA' rettype='__m128' name='_mm_fmsub_ss'>
+	<type>Floating Point</type>
+	<CPUID>FMA</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname='a' type='__m128'/>
+	<parameter varname='b' type='__m128'/>
+	<parameter varname='c' type='__m128'/>
+	<description>Multiply the lower single-precision (32-bit) floating-point elements in "a" and "b", and subtract the lower element in "c" from the intermediate result. Store the result in the lower element of "dst", and copy the upper 3 packed elements from "a" to the upper elements of "dst".</description>
+	<operation>
+dst[31:0] := (a[31:0] * b[31:0]) - c[31:0]
+dst[127:32] := a[127:32]
+dst[MAX:128] := 0
+	</operation>
+	<instruction name='vfmsub132ss' form='xmm, xmm, xmm'/>
+	<instruction name='vfmsub213ss' form='xmm, xmm, xmm'/>
+	<instruction name='vfmsub231ss' form='xmm, xmm, xmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='FMA' rettype='__m128d' name='_mm_fmsubadd_pd'>
+	<type>Floating Point</type>
+	<CPUID>FMA</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname='a' type='__m128d'/>
+	<parameter varname='b' type='__m128d'/>
+	<parameter varname='c' type='__m128d'/>
+	<description>Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", alternately subtract and add packed elements in "c" from/to the intermediate result, and store the results in "dst". 
+	</description>
+	<operation>
+FOR j := 0 to 1
+	i := j*64
+	IF (j is even) 
+		dst[i+63:i] := (a[i+63:i] * b[i+63:i]) + c[i+63:i]
+	ELSE
+		dst[i+63:i] := (a[i+63:i] * b[i+63:i]) - c[i+63:i]
+	FI
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name='vfmsubadd132pd' form='xmm, xmm, xmm'/>
+	<instruction name='vfmsubadd213pd' form='xmm, xmm, xmm'/>
+	<instruction name='vfmsubadd231pd' form='xmm, xmm, xmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='FMA' rettype='__m256d' name='_mm256_fmsubadd_pd'>
+	<type>Floating Point</type>
+	<CPUID>FMA</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname='a' type='__m256d'/>
+	<parameter varname='b' type='__m256d'/>
+	<parameter varname='c' type='__m256d'/>
+	<description>Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", alternately subtract and add packed elements in "c" from/to the intermediate result, and store the results in "dst". 
+	</description>
+	<operation>
+FOR j := 0 to 3
+	i := j*64
+	IF (j is even) 
+		dst[i+63:i] := (a[i+63:i] * b[i+63:i]) + c[i+63:i]
+	ELSE
+		dst[i+63:i] := (a[i+63:i] * b[i+63:i]) - c[i+63:i]
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name='vfmsubadd132pd' form='ymm, ymm, ymm'/>
+	<instruction name='vfmsubadd213pd' form='ymm, ymm, ymm'/>
+	<instruction name='vfmsubadd231pd' form='ymm, ymm, ymm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='FMA' rettype='__m128' name='_mm_fmsubadd_ps'>
+	<type>Floating Point</type>
+	<CPUID>FMA</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname='a' type='__m128'/>
+	<parameter varname='b' type='__m128'/>
+	<parameter varname='c' type='__m128'/>
+	<description>Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", alternately subtract and add packed elements in "c" from/to the intermediate result, and store the results in "dst". 
+	</description>
+	<operation>
+FOR j := 0 to 3
+	i := j*32
+	IF (j is even) 
+		dst[i+31:i] := (a[i+31:i] * b[i+31:i]) + c[i+31:i]
+	ELSE
+		dst[i+31:i] := (a[i+31:i] * b[i+31:i]) - c[i+31:i]
+	FI
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name='vfmsubadd132ps' form='xmm, xmm, xmm'/>
+	<instruction name='vfmsubadd213ps' form='xmm, xmm, xmm'/>
+	<instruction name='vfmsubadd231ps' form='xmm, xmm, xmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='FMA' rettype='__m256' name='_mm256_fmsubadd_ps'>
+	<type>Floating Point</type>
+	<CPUID>FMA</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname='a' type='__m256'/>
+	<parameter varname='b' type='__m256'/>
+	<parameter varname='c' type='__m256'/>
+	<description>Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", alternately subtract and add packed elements in "c" from/to the intermediate result, and store the results in "dst". 
+	</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*32
+	IF (j is even) 
+		dst[i+31:i] := (a[i+31:i] * b[i+31:i]) + c[i+31:i]
+	ELSE
+		dst[i+31:i] := (a[i+31:i] * b[i+31:i]) - c[i+31:i]
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name='vfmsubadd132ps' form='ymm, ymm, ymm'/>
+	<instruction name='vfmsubadd213ps' form='ymm, ymm, ymm'/>
+	<instruction name='vfmsubadd231ps' form='ymm, ymm, ymm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='FMA' rettype='__m128d' name='_mm_fnmadd_pd'>
+	<type>Floating Point</type>
+	<CPUID>FMA</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname='a' type='__m128d'/>
+	<parameter varname='b' type='__m128d'/>
+	<parameter varname='c' type='__m128d'/>
+	<description>Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", add the negated intermediate result to packed elements in "c", and store the results in "dst". 
+	</description>
+	<operation>
+FOR j := 0 to 1
+	i := j*64
+	dst[i+63:i] := -(a[i+63:i] * b[i+63:i]) + c[i+63:i]
+ENDFOR	
+dst[MAX:128] := 0
+	</operation>
+	<instruction name='vfnmadd132pd' form='xmm, xmm, xmm'/>
+	<instruction name='vfnmadd213pd' form='xmm, xmm, xmm'/>
+	<instruction name='vfnmadd231pd' form='xmm, xmm, xmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='FMA' rettype='__m256d' name='_mm256_fnmadd_pd'>
+	<type>Floating Point</type>
+	<CPUID>FMA</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname='a' type='__m256d'/>
+	<parameter varname='b' type='__m256d'/>
+	<parameter varname='c' type='__m256d'/>
+	<description>Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", add the negated intermediate result to packed elements in "c", and store the results in "dst". 
+	</description>
+	<operation>
+FOR j := 0 to 3
+	i := j*64
+	dst[i+63:i] := -(a[i+63:i] * b[i+63:i]) + c[i+63:i]
+ENDFOR	
+dst[MAX:256] := 0
+	</operation>
+	<instruction name='vfnmadd132pd' form='ymm, ymm, ymm'/>
+	<instruction name='vfnmadd213pd' form='ymm, ymm, ymm'/>
+	<instruction name='vfnmadd231pd' form='ymm, ymm, ymm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='FMA' rettype='__m128' name='_mm_fnmadd_ps'>
+	<type>Floating Point</type>
+	<CPUID>FMA</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname='a' type='__m128'/>
+	<parameter varname='b' type='__m128'/>
+	<parameter varname='c' type='__m128'/>
+	<description>Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", add the negated intermediate result to packed elements in "c", and store the results in "dst". 
+	</description>
+	<operation>
+FOR j := 0 to 3
+	i := j*32
+	dst[i+31:i] := -(a[i+31:i] * b[i+31:i]) + c[i+31:i]
+ENDFOR	
+dst[MAX:128] := 0
+	</operation>
+	<instruction name='vfnmadd132ps' form='xmm, xmm, xmm'/>
+	<instruction name='vfnmadd213ps' form='xmm, xmm, xmm'/>
+	<instruction name='vfnmadd231ps' form='xmm, xmm, xmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='FMA' rettype='__m256' name='_mm256_fnmadd_ps'>
+	<type>Floating Point</type>
+	<CPUID>FMA</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname='a' type='__m256'/>
+	<parameter varname='b' type='__m256'/>
+	<parameter varname='c' type='__m256'/>
+	<description>Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", add the negated intermediate result to packed elements in "c", and store the results in "dst". 
+	</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*32
+	dst[i+31:i] := -(a[i+31:i] * b[i+31:i]) + c[i+31:i]
+ENDFOR	
+dst[MAX:256] := 0
+	</operation>
+	<instruction name='vfnmadd132ps' form='ymm, ymm, ymm'/>
+	<instruction name='vfnmadd213ps' form='ymm, ymm, ymm'/>
+	<instruction name='vfnmadd231ps' form='ymm, ymm, ymm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='FMA' rettype='__m128d' name='_mm_fnmadd_sd'>
+	<type>Floating Point</type>
+	<CPUID>FMA</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname='a' type='__m128d'/>
+	<parameter varname='b' type='__m128d'/>
+	<parameter varname='c' type='__m128d'/>
+	<description>Multiply the lower double-precision (64-bit) floating-point elements in "a" and "b", and add the negated intermediate result to the lower element in "c". Store the result in the lower element of "dst", and copy the upper element from "a" to the upper element of "dst".</description>
+	<operation>
+dst[63:0] := -(a[63:0] * b[63:0]) + c[63:0]
+dst[127:64] := a[127:64]
+dst[MAX:128] := 0
+	</operation>
+	<instruction name='vfnmadd132sd' form='xmm, xmm, xmm'/>
+	<instruction name='vfnmadd213sd' form='xmm, xmm, xmm'/>
+	<instruction name='vfnmadd231sd' form='xmm, xmm, xmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='FMA' rettype='__m128' name='_mm_fnmadd_ss'>
+	<type>Floating Point</type>
+	<CPUID>FMA</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname='a' type='__m128'/>
+	<parameter varname='b' type='__m128'/>
+	<parameter varname='c' type='__m128'/>
+	<description>Multiply the lower single-precision (32-bit) floating-point elements in "a" and "b", and add the negated intermediate result to the lower element in "c". Store the result in the lower element of "dst", and copy the upper 3 packed elements from "a" to the upper elements of "dst".</description>
+	<operation>
+dst[31:0] := -(a[31:0] * b[31:0]) + c[31:0]
+dst[127:32] := a[127:32]
+dst[MAX:128] := 0
+	</operation>
+	<instruction name='vfnmadd132ss' form='xmm, xmm, xmm'/>
+	<instruction name='vfnmadd213ss' form='xmm, xmm, xmm'/>
+	<instruction name='vfnmadd231ss' form='xmm, xmm, xmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='FMA' rettype='__m128d' name='_mm_fnmsub_pd'>
+	<type>Floating Point</type>
+	<CPUID>FMA</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname='a' type='__m128d'/>
+	<parameter varname='b' type='__m128d'/>
+	<parameter varname='c' type='__m128d'/>
+	<description>Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the negated intermediate result, and store the results in "dst". </description>
+	<operation>
+FOR j := 0 to 1
+	i := j*64
+	dst[i+63:i] := -(a[i+63:i] * b[i+63:i]) - c[i+63:i]
+ENDFOR	
+dst[MAX:128] := 0
+	</operation>
+	<instruction name='vfnmsub132pd' form='xmm, xmm, xmm'/>
+	<instruction name='vfnmsub213pd' form='xmm, xmm, xmm'/>
+	<instruction name='vfnmsub231pd' form='xmm, xmm, xmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='FMA' rettype='__m256d' name='_mm256_fnmsub_pd'>
+	<type>Floating Point</type>
+	<CPUID>FMA</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname='a' type='__m256d'/>
+	<parameter varname='b' type='__m256d'/>
+	<parameter varname='c' type='__m256d'/>
+	<description>Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the negated intermediate result, and store the results in "dst". </description>
+	<operation>
+FOR j := 0 to 3
+	i := j*64
+	dst[i+63:i] := -(a[i+63:i] * b[i+63:i]) - c[i+63:i]
+ENDFOR	
+dst[MAX:256] := 0
+	</operation>
+	<instruction name='vfnmsub132pd' form='ymm, ymm, ymm'/>
+	<instruction name='vfnmsub213pd' form='ymm, ymm, ymm'/>
+	<instruction name='vfnmsub231pd' form='ymm, ymm, ymm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='FMA' rettype='__m128' name='_mm_fnmsub_ps'>
+	<type>Floating Point</type>
+	<CPUID>FMA</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname='a' type='__m128'/>
+	<parameter varname='b' type='__m128'/>
+	<parameter varname='c' type='__m128'/>
+	<description>Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the negated intermediate result, and store the results in "dst". </description>
+	<operation>
+FOR j := 0 to 3
+	i := j*32
+	dst[i+31:i] := -(a[i+31:i] * b[i+31:i]) - c[i+31:i]
+ENDFOR	
+dst[MAX:128] := 0
+	</operation>
+	<instruction name='vfnmsub132ps' form='xmm, xmm, xmm'/>
+	<instruction name='vfnmsub213ps' form='xmm, xmm, xmm'/>
+	<instruction name='vfnmsub231ps' form='xmm, xmm, xmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='FMA' rettype='__m256' name='_mm256_fnmsub_ps'>
+	<type>Floating Point</type>
+	<CPUID>FMA</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname='a' type='__m256'/>
+	<parameter varname='b' type='__m256'/>
+	<parameter varname='c' type='__m256'/>
+	<description>Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the negated intermediate result, and store the results in "dst". </description>
+	<operation>
+FOR j := 0 to 7
+	i := j*32
+	dst[i+31:i] := -(a[i+31:i] * b[i+31:i]) - c[i+31:i]
+ENDFOR	
+dst[MAX:256] := 0
+	</operation>
+	<instruction name='vfnmsub132ps' form='ymm, ymm, ymm'/>
+	<instruction name='vfnmsub213ps' form='ymm, ymm, ymm'/>
+	<instruction name='vfnmsub231ps' form='ymm, ymm, ymm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='FMA' rettype='__m128d' name='_mm_fnmsub_sd'>
+	<type>Floating Point</type>
+	<CPUID>FMA</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname='a' type='__m128d'/>
+	<parameter varname='b' type='__m128d'/>
+	<parameter varname='c' type='__m128d'/>
+	<description>Multiply the lower double-precision (64-bit) floating-point elements in "a" and "b", and subtract the lower element in "c" from the negated intermediate result. Store the result in the lower element of "dst", and copy the upper element from "a" to the upper element of "dst".</description>
+	<operation>
+dst[63:0] := -(a[63:0] * b[63:0]) - c[63:0]
+dst[127:64] := a[127:64]
+dst[MAX:128] := 0
+	</operation>
+	<instruction name='vfnmsub132sd' form='xmm, xmm, xmm'/>
+	<instruction name='vfnmsub213sd' form='xmm, xmm, xmm'/>
+	<instruction name='vfnmsub231sd' form='xmm, xmm, xmm'/>
+	<perfdata arch='Haswell' lat='5' tpt='0.5'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='FMA' rettype='__m128' name='_mm_fnmsub_ss'>
+	<type>Floating Point</type>
+	<CPUID>FMA</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname='a' type='__m128'/>
+	<parameter varname='b' type='__m128'/>
+	<parameter varname='c' type='__m128'/>
+	<description>Multiply the lower single-precision (32-bit) floating-point elements in "a" and "b", and subtract the lower element in "c" from the negated intermediate result. Store the result in the lower element of "dst", and copy the upper 3 packed elements from "a" to the upper elements of "dst".</description>
+	<operation>
+dst[31:0] := -(a[31:0] * b[31:0]) - c[31:0]
+dst[127:32] := a[127:32]
+dst[MAX:128] := 0
+	</operation>
+	<instruction name='vfnmsub132ss' form='xmm, xmm, xmm'/>
+	<instruction name='vfnmsub213ss' form='xmm, xmm, xmm'/>
+	<instruction name='vfnmsub231ss' form='xmm, xmm, xmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="Other" rettype='unsigned int' name='_bextr_u32'>
+	<type>Integer</type>
+	<CPUID>BMI1</CPUID>
+	<category>Bit Manipulation</category>
+	<parameter type='unsigned int' varname='a' />
+	<parameter type='unsigned int' varname='start' />
+	<parameter type='unsigned int' varname='len' />
+	<description>Extract contiguous bits from unsigned 32-bit integer "a", and store the result in "dst". Extract the number of bits specified by "len", starting at the bit specified by "start".</description>
+	<operation>
+tmp := ZERO_EXTEND_TO_512(a)
+dst := ZERO_EXTEND(tmp[start+len-1:start])
+	</operation>
+	<instruction name='bextr' form='r32, r32, r32'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="Other" rettype='unsigned int' name='_bextr2_u32'>
+	<type>Integer</type>
+	<CPUID>BMI1</CPUID>
+	<category>Bit Manipulation</category>
+	<parameter type='unsigned int' varname='a' />
+	<parameter type='unsigned int' varname='control' />
+	<description>Extract contiguous bits from unsigned 32-bit integer "a", and store the result in "dst". Extract the number of bits specified by bits 15:8 of "control", starting at the bit specified by bits 7:0 of "control".</description>
+	<operation>
+start := control[7:0]
+len := control[15:8]
+tmp := ZERO_EXTEND_TO_512(a)
+dst := ZERO_EXTEND(tmp[start+len-1:start])
+	</operation>
+	<instruction name='bextr' form='r32, r32, r32'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="Other" rettype='unsigned __int64' name='_bextr_u64'>
+	<type>Integer</type>
+	<CPUID>BMI1</CPUID>
+	<category>Bit Manipulation</category>
+	<parameter type='unsigned __int64' varname='a' />
+	<parameter type='unsigned int' varname='start' />
+	<parameter type='unsigned int' varname='len' />
+	<description>Extract contiguous bits from unsigned 64-bit integer "a", and store the result in "dst". Extract the number of bits specified by "len", starting at the bit specified by "start".</description>
+	<operation>
+tmp := ZERO_EXTEND_TO_512(a)
+dst := ZERO_EXTEND(tmp[start+len-1:start])
+	</operation>
+	<instruction name='bextr' form='r64, r64, r64'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="Other" rettype='unsigned __int64' name='_bextr2_u64'>
+	<type>Integer</type>
+	<CPUID>BMI1</CPUID>
+	<category>Bit Manipulation</category>
+	<parameter type='unsigned __int64' varname='a' />
+	<parameter type='unsigned __int64' varname='control' />
+	<description>Extract contiguous bits from unsigned 64-bit integer "a", and store the result in "dst". Extract the number of bits specified by bits 15:8 of "control", starting at the bit specified by bits 7:0 of "control".</description>
+	<operation>
+start := control[7:0]
+len := control[15:8]
+tmp := ZERO_EXTEND_TO_512(a)
+dst := ZERO_EXTEND(tmp[start+len-1:start])
+	</operation>
+	<instruction name='bextr' form='r64, r64, r64'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="Other" rettype='unsigned int' name='_blsi_u32'>
+	<type>Integer</type>
+	<CPUID>BMI1</CPUID>
+	<category>Bit Manipulation</category>
+	<parameter type='unsigned int' varname='a' />
+	<description>Extract the lowest set bit from unsigned 32-bit integer "a" and set the corresponding bit in "dst". All other bits in "dst" are zeroed, and all bits are zeroed if no bits are set in "a".</description>
+	<operation>
+dst := (-a) BITWISE AND a
+	</operation>
+	<instruction name='blsi' form='r32, r32'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="Other" rettype='unsigned __int64' name='_blsi_u64'>
+	<type>Integer</type>
+	<CPUID>BMI1</CPUID>
+	<category>Bit Manipulation</category>
+	<parameter type='unsigned __int64' varname='a' />
+	<description>Extract the lowest set bit from unsigned 64-bit integer "a" and set the corresponding bit in "dst". All other bits in "dst" are zeroed, and all bits are zeroed if no bits are set in "a".</description>
+	<operation>
+dst := (-a) BITWISE AND a
+	</operation>
+	<instruction name='blsi' form='r64, r64'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="Other" rettype='unsigned int' name='_blsmsk_u32'>
+	<type>Integer</type>
+	<CPUID>BMI1</CPUID>
+	<category>Bit Manipulation</category>
+	<parameter type='unsigned int' varname='a' />
+	<description>Set all the lower bits of "dst" up to and including the lowest set bit in unsigned 32-bit integer "a".</description>
+	<operation>
+dst := (a - 1) XOR a
+	</operation>
+	<instruction name='blsmsk' form='r32, r32'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="Other" rettype='unsigned __int64' name='_blsmsk_u64'>
+	<type>Integer</type>
+	<CPUID>BMI1</CPUID>
+	<category>Bit Manipulation</category>
+	<parameter type='unsigned __int64' varname='a' />
+	<description>Set all the lower bits of "dst" up to and including the lowest set bit in unsigned 64-bit integer "a".</description>
+	<operation>
+dst := (a - 1) XOR a
+	</operation>
+	<instruction name='blsmsk' form='r64, r64'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="Other" rettype='unsigned int' name='_blsr_u32'>
+	<type>Integer</type>
+	<CPUID>BMI1</CPUID>
+	<category>Bit Manipulation</category>
+	<parameter type='unsigned int' varname='a' />
+	<description>Copy all bits from unsigned 32-bit integer "a" to "dst", and reset (set to 0) the bit in "dst" that corresponds to the lowest set bit in "a".</description>
+	<operation>
+dst := (a - 1) BITWISE AND a
+	</operation>
+	<instruction name='blsr' form='r32, r32'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="Other" rettype='unsigned __int64' name='_blsr_u64'>
+	<type>Integer</type>
+	<CPUID>BMI1</CPUID>
+	<category>Bit Manipulation</category>
+	<parameter type='unsigned __int64' varname='a' />
+	<description>Copy all bits from unsigned 64-bit integer "a" to "dst", and reset (set to 0) the bit in "dst" that corresponds to the lowest set bit in "a".</description>
+	<operation>
+dst := (a - 1) BITWISE AND a
+	</operation>
+	<instruction name='blsr' form='r64, r64'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="Other" rettype='unsigned int' name='_bzhi_u32'>
+	<type>Integer</type>
+	<CPUID>BMI2</CPUID>
+	<category>Bit Manipulation</category>
+	<parameter type='unsigned int' varname='a' />
+	<parameter type='unsigned int' varname='index' />
+	<description>Copy all bits from unsigned 32-bit integer "a" to "dst", and reset (set to 0) the high bits in "dst" starting at "index".</description>
+	<operation>
+n := index[7:0]
+dst := a
+IF (n &lt; 32)
+	dst[31:n] := 0
+FI
+	</operation>
+	<instruction name='bzhi' form='r32, r32, r32'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="Other" rettype='unsigned __int64' name='_bzhi_u64'>
+	<type>Integer</type>
+	<CPUID>BMI2</CPUID>
+	<category>Bit Manipulation</category>
+	<parameter type='unsigned __int64' varname='a' />
+	<parameter type='unsigned int' varname='index' />
+	<description>Copy all bits from unsigned 64-bit integer "a" to "dst", and reset (set to 0) the high bits in "dst" starting at "index".</description>
+	<operation>
+n := index[7:0]
+dst := a
+IF (n &lt; 64)
+	dst[63:n] := 0
+FI
+	</operation>
+	<instruction name='bzhi' form='r64, r64, r64'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="Other" rettype='void' name='_invpcid'>
+	<CPUID>INVPCID</CPUID>
+	<category>OS-Targeted</category>
+	<parameter type='unsigned int' varname='type' />
+	<parameter type='void*' varname='descriptor' />
+	<description>
+	Invalidate mappings in the Translation Lookaside Buffers (TLBs) and paging-structure caches for the processor context identifier (PCID) specified by "descriptor" based on the invalidation type specified in "type". 
+	The PCID "descriptor" is specified as a 16-byte memory operand (with no alignment restrictions) where bits [11:0] specify the PCID, and bits [127:64] specify the linear address; bits [63:12] are reserved.
+	The types supported are:
+		0) Individual-address invalidation: If "type" is 0, the logical processor invalidates mappings for a single linear address and tagged with the PCID specified in "descriptor", except global translations. The instruction may also invalidate global translations, mappings for other linear addresses, or mappings tagged with other PCIDs.
+		1) Single-context invalidation: If "type" is 1, the logical processor invalidates all mappings tagged with the PCID specified in "descriptor" except global translations. In some cases, it may invalidate mappings for other PCIDs as well.
+		2) All-context invalidation: If "type" is 2, the logical processor invalidates all mappings tagged with any PCID.
+		3) All-context invalidation, retaining global translations: If "type" is 3, the logical processor invalidates all mappings tagged with any PCID except global translations, ignoring "descriptor". The instruction may also invalidate global translations as well.
+	</description>
+	<operation>
+CASE type OF
+0: // individual-address invalidation retaining global translations
+	OP_PCID := descriptor[11:0]
+	ADDR := descriptor[127:64]
+	BREAK
+1: // single PCID invalidation retaining globals
+	OP_PCID := descriptor[11:0]
+	// invalidate all mappings tagged with OP_PCID except global translations
+	BREAK
+2: // all PCID invalidation
+	// invalidate all mappings tagged with any PCID
+	BREAK
+3: // all PCID invalidation retaining global translations
+	// invalidate all mappings tagged with any PCID except global translations
+	BREAK
+ESAC
+	</operation>
+	<instruction name='invpcid' form='r32, m128'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="Other" rettype='unsigned int' name='_lzcnt_u32'>
+	<type>Integer</type>
+	<CPUID>LZCNT</CPUID>
+	<category>Bit Manipulation</category>
+	<parameter type='unsigned int' varname='a' />
+	<description>Count the number of leading zero bits in unsigned 32-bit integer "a", and return that count in "dst".</description>
+	<operation>
+tmp := 31
+dst := 0
+DO WHILE (tmp &gt;= 0 AND a[tmp] = 0)
+	tmp := tmp - 1
+	dst := dst + 1
+OD	
+	</operation>
+	<instruction name='lzcnt' form='r32, r32'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="Other" rettype='unsigned __int64' name='_lzcnt_u64'>
+	<type>Integer</type>
+	<CPUID>LZCNT</CPUID>
+	<category>Bit Manipulation</category>
+	<parameter type='unsigned __int64' varname='a' />
+	<description>Count the number of leading zero bits in unsigned 64-bit integer "a", and return that count in "dst".</description>
+	<operation>
+tmp := 63
+dst := 0
+DO WHILE (tmp &gt;= 0 AND a[tmp] = 0)
+	tmp := tmp - 1
+	dst := dst + 1
+OD	
+	</operation>
+	<instruction name='lzcnt' form='r64, r64'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="Other" rettype='unsigned int' name='_pdep_u32'>
+	<type>Integer</type>
+	<CPUID>BMI2</CPUID>
+	<category>Bit Manipulation</category>
+	<parameter type='unsigned int' varname='a' />
+	<parameter type='unsigned int' varname='mask' />
+	<description>Deposit contiguous low bits from unsigned 32-bit integer "a" to "dst" at the corresponding bit locations specified by "mask"; all other bits in "dst" are set to zero.</description>
+	<operation>
+tmp := a
+dst := 0
+m := 0
+k := 0
+DO WHILE m &lt; 32
+	IF mask[m] = 1
+		dst[m] := tmp[k]
+		k := k + 1
+	FI
+	m := m + 1
+OD
+	</operation>
+	<instruction name='pdep' form='r32, r32, r32'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="Other" rettype='unsigned __int64' name='_pdep_u64'>
+	<type>Integer</type>
+	<CPUID>BMI2</CPUID>
+	<category>Bit Manipulation</category>
+	<parameter type='unsigned __int64' varname='a' />
+	<parameter type='unsigned __int64' varname='mask' />
+	<description>Deposit contiguous low bits from unsigned 64-bit integer "a" to "dst" at the corresponding bit locations specified by "mask"; all other bits in "dst" are set to zero.</description>
+	<operation>
+tmp := a
+dst := 0
+m := 0
+k := 0
+DO WHILE m &lt; 64
+	IF mask[m] = 1
+		dst[m] := tmp[k]
+		k := k + 1
+	FI
+	m := m + 1
+OD
+	</operation>
+	<instruction name='pdep' form='r64, r64, r64'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="Other" rettype='unsigned int' name='_pext_u32'>
+	<type>Integer</type>
+	<CPUID>BMI2</CPUID>
+	<category>Bit Manipulation</category>
+	<parameter type='unsigned int' varname='a' />
+	<parameter type='unsigned int' varname='mask' />
+	<description>Extract bits from unsigned 32-bit integer "a" at the corresponding bit locations specified by "mask" to contiguous low bits in "dst"; the remaining upper bits in "dst" are set to zero.</description>
+	<operation>
+tmp := a
+dst := 0
+m := 0
+k := 0
+DO WHILE m &lt; 32
+	IF mask[m] = 1
+		dst[k] := tmp[m]
+		k := k + 1
+	FI
+	m := m + 1
+OD
+	</operation>
+	<instruction name='pext' form='r32, r32, r32'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="Other" rettype='unsigned __int64' name='_pext_u64'>
+	<type>Integer</type>
+	<CPUID>BMI2</CPUID>
+	<category>Bit Manipulation</category>
+	<parameter type='unsigned __int64' varname='a' />
+	<parameter type='unsigned __int64' varname='mask' />
+	<description>Extract bits from unsigned 64-bit integer "a" at the corresponding bit locations specified by "mask" to contiguous low bits in "dst"; the remaining upper bits in "dst" are set to zero.</description>
+	<operation>
+tmp := a
+dst := 0
+m := 0
+k := 0
+DO WHILE m &lt; 64
+	IF mask[m] = 1
+		dst[k] := tmp[m]
+		k := k + 1
+	FI
+	m := m + 1
+OD
+	</operation>
+	<instruction name='pext' form='r64, r64, r64'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="Other" rettype='unsigned int' name='_andn_u32'>
+	<type>Integer</type>
+	<CPUID>BMI1</CPUID>
+	<category>Bit Manipulation</category>
+	<parameter type='unsigned int' varname='a' />
+	<parameter type='unsigned int' varname='b' />
+	<description>Compute the bitwise NOT of 32-bit integer "a" and then AND with "b", and store the results in "dst".</description>
+	<operation>
+dst[31:0] := ((NOT a[31:0]) AND b[31:0])
+	</operation>
+	<instruction name='andn' form='r32, r32' />
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="Other" rettype='unsigned __int64' name='_andn_u64'>
+	<type>Integer</type>
+	<CPUID>BMI1</CPUID>
+	<category>Bit Manipulation</category>
+	<parameter type='unsigned __int64' varname='a' />
+	<parameter type='unsigned __int64' varname='b' />
+	<description>Compute the bitwise NOT of 64-bit integer "a" and then AND with "b", and store the results in "dst".</description>
+	<operation>
+dst[63:0] := ((NOT a[63:0]) AND b[63:0])
+	</operation>
+	<instruction name='andn' form='r64, r64' />
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="Other" rettype='unsigned int' name='_mulx_u32'>
+	<type>Integer</type>
+	<CPUID>BMI2</CPUID>
+	<category>Arithmetic</category>
+	<parameter type='unsigned int' varname='a' />
+	<parameter type='unsigned int' varname='b' />
+	<parameter type='unsigned int*' varname='hi' />
+	<description>Multiply unsigned 32-bit integers "a" and "b", store the low 32-bits of the result in "dst", and store the high 32-bits in "hi". This does not read or write arithmetic flags.</description>
+	<operation>
+dst[31:0] := (a * b)[31:0]
+hi[31:0] := (a * b)[63:32]
+	</operation>
+	<instruction name='mulx' form='r32, r32, m32' />
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="Other" rettype='unsigned __int64' name='_mulx_u64'>
+	<type>Integer</type>
+	<CPUID>BMI2</CPUID>
+	<category>Arithmetic</category>
+	<parameter type='unsigned __int64' varname='a' />
+	<parameter type='unsigned __int64' varname='b' />
+	<parameter type='unsigned __int64*' varname='hi' />
+	<description>Multiply unsigned 64-bit integers "a" and "b", store the low 64-bits of the result in "dst", and store the high 64-bits in "hi". This does not read or write arithmetic flags.</description>
+	<operation>
+dst[63:0] := (a * b)[63:0]
+hi[63:0] := (a * b)[127:64]
+	</operation>
+	<instruction name='mulx' form='r64, r64, m64' />
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="Other" rettype='unsigned int' name='_tzcnt_u32'>
+	<type>Integer</type>
+	<CPUID>BMI1</CPUID>
+	<category>Bit Manipulation</category>
+	<parameter type='unsigned int' varname='a' />
+	<description>Count the number of trailing zero bits in unsigned 32-bit integer "a", and return that count in "dst".</description>
+	<operation>
+tmp := 0
+dst := 0
+DO WHILE ((tmp &lt; 32) AND a[tmp] = 0)
+	tmp := tmp + 1
+	dst := dst + 1
+OD	
+	</operation>
+	<instruction name='tzcnt' form='r32, r32'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="Other" rettype='unsigned __int64' name='_tzcnt_u64'>
+	<type>Integer</type>
+	<CPUID>BMI1</CPUID>
+	<category>Bit Manipulation</category>
+	<parameter type='unsigned __int64' varname='a' />
+	<description>Count the number of trailing zero bits in unsigned 64-bit integer "a", and return that count in "dst".</description>
+	<operation>
+tmp := 0
+dst := 0
+DO WHILE ((tmp &lt; 64) AND a[tmp] = 0)
+	tmp := tmp + 1
+	dst := dst + 1
+OD	
+	</operation>
+	<instruction name='tzcnt' form='r64, r64'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='Other' rettype='void' name='_xabort'>
+	<CPUID>RTM</CPUID>
+	<category>General Support</category>
+	<parameter type='const unsigned int' varname="imm8" />
+	<description>
+	Force an RTM abort. The EAX register is updated to reflect that an XABORT instruction caused the abort, and the "imm8" parameter will be provided in bits [31:24] of EAX.
+	Following an RTM abort, the logical processor resumes execution at the fallback address computed through the outermost XBEGIN instruction. 
+	</description>
+	<operation>
+IF RTM_ACTIVE = 0
+	// nop
+ELSE
+	// restore architectural register state
+	// discard memory updates performed in transaction
+	// update EAX with status and imm8 value
+	RTM_NEST_COUNT := 0
+	RTM_ACTIVE := 0
+	IF 64-bit Mode
+		RIP := fallbackRIP
+	ELSE
+		EIP := fallbackEIP
+	FI
+FI
+	</operation>
+	<instruction name='xabort' form='imm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='Other' rettype='unsigned int' name='_xbegin'>
+	<CPUID>RTM</CPUID>
+	<category>General Support</category>
+	<parameter type='void' varname='' />
+	<description>
+	Specify the start of an RTM code region. 
+	If the logical processor was not already in transactional execution, then this call causes the logical processor to transition into transactional execution. 
+	On an RTM abort, the logical processor discards all architectural register and memory updates performed during the RTM execution, restores architectural state, and starts execution beginning at the fallback address computed from the outermost XBEGIN instruction. Return status of ~0 (0xFFFFFFFF) if continuing inside transaction; all other codes are aborts.
+	</description>
+	<operation>
+IF RTM_NEST_COUNT &lt; MAX_RTM_NEST_COUNT
+	RTM_NEST_COUNT := RTM_NEST_COUNT + 1
+	IF RTM_NEST_COUNT = 1
+		IF 64-bit Mode
+			fallbackRIP := RIP + SignExtend(IMM)
+		ELSE IF 32-bit Mode
+			fallbackEIP := EIP + SignExtend(IMM)
+		ELSE // 16-bit Mode
+			fallbackEIP := (EIP + SignExtend(IMM)) AND 0x0000FFFF
+		FI
+		
+		RTM_ACTIVE := 1
+		// enter RTM execution, record register state, start tracking memory state
+	FI
+ELSE
+	// RTM abort (see _xabort)
+FI			
+	</operation>
+	<instruction name='xbegin' form=''/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='Other' rettype='void' name='_xend'>
+	<CPUID>RTM</CPUID>
+	<category>General Support</category>
+	<parameter type='void' varname='' />
+	<description>
+	Specify the end of an RTM code region.
+	If this corresponds to the outermost scope, the logical processor will attempt to commit the logical processor state atomically. 
+	If the commit fails, the logical processor will perform an RTM abort.
+	</description>
+	<operation>
+IF RTM_ACTIVE = 1
+	RTM_NEST_COUNT := RTM_NEST_COUNT - 1
+	IF RTM_NEST_COUNT = 0
+		// try to commit transaction
+		IF fail to commit transaction
+			// RTM abort (see _xabort)
+		ELSE
+			RTM_ACTIVE := 0
+		FI
+	FI
+FI
+	</operation>
+	<instruction name='xend' form=''/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='Other' rettype='unsigned char' name='_xtest'>
+	<CPUID>RTM</CPUID>
+	<category>General Support</category>
+	<parameter type='void' varname='' />
+	<description>Query the transactional execution status, return 1 if inside a transactionally executing RTM or HLE region, and return 0 otherwise.</description>
+	<operation>
+IF (RTM_ACTIVE = 1 OR HLE_ACTIVE = 1)
+	dst := 1
+ELSE
+	dst := 0
+FI
+	</operation>
+	<instruction name='xtest' form=''/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='Other' rettype='unsigned __int64' name='__rdtscp'>
+	<CPUID>RDTSCP</CPUID>
+	<category>General Support</category>
+	<parameter varname='mem_addr' type='unsigned int *'/>
+	<description>Copy the current 64-bit value of the processor's time-stamp counter into "dst", and store the IA32_TSC_AUX MSR (signature value) into memory at "mem_addr".</description>
+	<operation>
+dst[63:0] := TimeStampCounter
+MEM[mem_addr+31:mem_addr] := IA32_TSC_AUX[31:0]
+	</operation>
+	<instruction name='rdtscp' form=''/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='Other' rettype='unsigned int' name='_rdpid_u32'>
+	<CPUID>RDPID</CPUID>
+	<category>General Support</category>
+	<parameter varname="" type="void"/>
+	<description>Copy the IA32_TSC_AUX MSR (signature value) into "dst".</description>
+	<operation>
+dst[31:0] := IA32_TSC_AUX[31:0]
+	</operation>
+	<instruction name='rdpid' form=''/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="Other" rettype='int' name='_bit_scan_forward'>
+	<type>Integer</type>
+	<category>Bit Manipulation</category>
+	<parameter varname='a' type='int'/>
+	<description>Set "dst" to the index of the lowest set bit in 32-bit integer "a". If no bits are set in "a" then "dst" is undefined.</description>
+	<operation>
+tmp := 0
+IF a = 0
+	dst := undefined
+ELSE
+	DO WHILE ((tmp &lt; 32) AND a[tmp] = 0)
+		tmp := tmp + 1
+	OD
+	dst := tmp
+FI
+	</operation>
+	<instruction name='bsf' form='r32, r32'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="Other" rettype='int' name='_bit_scan_reverse'>
+	<type>Integer</type>
+	<category>Bit Manipulation</category>
+	<parameter varname='a' type='int'/>
+	<description>Set "dst" to the index of the highest set bit in 32-bit integer "a". If no bits are set in "a" then "dst" is undefined.</description>
+	<operation>
+tmp := 31
+IF a = 0
+	dst := undefined
+ELSE
+	DO WHILE ((tmp &gt; 0) AND a[tmp] = 0)
+		tmp := tmp - 1
+	OD
+	dst := tmp
+FI
+	</operation>
+	<instruction name='bsr' form='r32, r32'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="Other" rettype='unsigned char' name='_BitScanForward'>
+	<type>Integer</type>
+	<category>Bit Manipulation</category>
+	<parameter varname='index' type='unsigned __int32*'/>
+	<parameter varname='mask' type='unsigned __int32'/>
+	<description>Set "index" to the index of the lowest set bit in 32-bit integer "mask". If no bits are set in "mask", then set "dst" to 0, otherwise set "dst" to 1.</description>
+	<operation>
+tmp := 0
+IF mask = 0
+	dst := 0
+ELSE
+	DO WHILE ((tmp &lt; 32) AND mask[tmp] = 0)
+		tmp := tmp + 1
+	OD
+	index := tmp
+	dst := 1
+FI
+	</operation>
+	<instruction name='bsf' form='r32, r32'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="Other" rettype='unsigned char' name='_BitScanReverse'>
+	<type>Integer</type>
+	<category>Bit Manipulation</category>
+	<parameter varname='index' type='unsigned __int32*'/>
+	<parameter varname='mask' type='unsigned __int32'/>
+	<description>Set "index" to the index of the highest set bit in 32-bit integer "mask". If no bits are set in "mask", then set "dst" to 0, otherwise set "dst" to 1.</description>
+	<operation>
+tmp := 31
+IF mask = 0
+	dst := 0
+ELSE
+	DO WHILE ((tmp &gt; 0) AND mask[tmp] = 0)
+		tmp := tmp - 1
+	OD
+	index := tmp
+	dst := 1
+FI
+	</operation>
+	<instruction name='bsr' form='r32, r32'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="Other" rettype='unsigned char' name='_BitScanForward64'>
+	<type>Integer</type>
+	<category>Bit Manipulation</category>
+	<parameter varname='index' type='unsigned __int32*'/>
+	<parameter varname='mask' type='unsigned __int64'/>
+	<description>Set "index" to the index of the lowest set bit in 64-bit integer "mask". If no bits are set in "mask", then set "dst" to 0, otherwise set "dst" to 1.</description>
+	<operation>
+tmp := 0
+IF mask = 0
+	dst := 0
+ELSE
+	DO WHILE ((tmp &lt; 64) AND mask[tmp] = 0)
+		tmp := tmp + 1
+	OD
+	index := tmp
+	dst := 1
+FI
+	</operation>
+	<instruction name='bsf' form='r64, r64'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="Other" rettype='unsigned char' name='_BitScanReverse64'>
+	<type>Integer</type>
+	<category>Bit Manipulation</category>
+	<parameter varname='index' type='unsigned __int32*'/>
+	<parameter varname='mask' type='unsigned __int64'/>
+	<description>Set "index" to the index of the highest set bit in 64-bit integer "mask". If no bits are set in "mask", then set "dst" to 0, otherwise set "dst" to 1.</description>
+	<operation>
+tmp := 63
+IF mask = 0
+	dst := 0
+ELSE
+	DO WHILE ((tmp &gt; 0) AND mask[tmp] = 0)
+		tmp := tmp - 1
+	OD
+	index := tmp
+	dst := 1
+FI
+	</operation>
+	<instruction name='bsr' form='r64, r64'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="Other" rettype='unsigned char' name='_bittest'>
+	<type>Integer</type>
+	<category>Bit Manipulation</category>
+	<parameter varname='a' type='__int32*'/>
+	<parameter varname='b' type='__int32'/>
+	<description>Return the bit at index "b" of 32-bit integer "a".</description>
+	<operation>
+dst := a[b]
+	</operation>
+	<instruction name='bt' form='r32, r32'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="Other" rettype='unsigned char' name='_bittestandcomplement'>
+	<type>Integer</type>
+	<category>Bit Manipulation</category>
+	<parameter varname='a' type='__int32*'/>
+	<parameter varname='b' type='__int32'/>
+	<description>Return the bit at index "b" of 32-bit integer "a", and set that bit to its complement.</description>
+	<operation>
+dst := a[b]
+a[b] := ~a[b]
+	</operation>
+	<instruction name='btc' form='r32, r32'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="Other" rettype='unsigned char' name='_bittestandreset'>
+	<type>Integer</type>
+	<category>Bit Manipulation</category>
+	<parameter varname='a' type='__int32*'/>
+	<parameter varname='b' type='__int32'/>
+	<description>Return the bit at index "b" of 32-bit integer "a", and set that bit to zero.</description>
+	<operation>
+dst := a[b]
+a[b] := 0
+	</operation>
+	<instruction name='btr' form='r32, r32'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="Other" rettype='unsigned char' name='_bittestandset'>
+	<type>Integer</type>
+	<category>Bit Manipulation</category>
+	<parameter varname='a' type='__int32*'/>
+	<parameter varname='b' type='__int32'/>
+	<description>Return the bit at index "b" of 32-bit integer "a", and set that bit to one.</description>
+	<operation>
+dst := a[b]
+a[b] := 1
+	</operation>
+	<instruction name='bts' form='r32, r32'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="Other" rettype='unsigned char' name='_bittest64'>
+	<type>Integer</type>
+	<category>Bit Manipulation</category>
+	<parameter varname='a' type='__int64*'/>
+	<parameter varname='b' type='__int64'/>
+	<description>Return the bit at index "b" of 64-bit integer "a".</description>
+	<operation>
+dst := a[b]
+	</operation>
+	<instruction name='bt' form='r64, r64'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="Other" rettype='unsigned char' name='_bittestandcomplement64'>
+	<type>Integer</type>
+	<category>Bit Manipulation</category>
+	<parameter varname='a' type='__int64*'/>
+	<parameter varname='b' type='__int64'/>
+	<description>Return the bit at index "b" of 64-bit integer "a", and set that bit to its complement.</description>
+	<operation>
+dst := a[b]
+a[b] := ~a[b]
+	</operation>
+	<instruction name='btc' form='r64, r64'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="Other" rettype='unsigned char' name='_bittestandreset64'>
+	<type>Integer</type>
+	<category>Bit Manipulation</category>
+	<parameter varname='a' type='__int64*'/>
+	<parameter varname='b' type='__int64'/>
+	<description>Return the bit at index "b" of 64-bit integer "a", and set that bit to zero.</description>
+	<operation>
+dst := a[b]
+a[b] := 0
+	</operation>
+	<instruction name='btr' form='r64, r64'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="Other" rettype='unsigned char' name='_bittestandset64'>
+	<type>Integer</type>
+	<category>Bit Manipulation</category>
+	<parameter varname='a' type='__int64*'/>
+	<parameter varname='b' type='__int64'/>
+	<description>Return the bit at index "b" of 64-bit integer "a", and set that bit to one.</description>
+	<operation>
+dst := a[b]
+a[b] := 1
+	</operation>
+	<instruction name='bts' form='r64, r64'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="Other" rettype='int' name='_bswap'>
+	<type>Integer</type>
+	<category>Bit Manipulation</category>
+	<parameter varname='a' type='int'/>
+	<description>Reverse the byte order of 32-bit integer "a", and store the result in "dst". This intrinsic is provided for conversion between little and big endian values.</description>
+	<operation>
+dst[7:0] := a[31:24]
+dst[15:8] := a[23:16]
+dst[23:16] := a[15:8]
+dst[31:24] := a[7:0]
+	</operation>
+	<instruction name='bswap' form='r32'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="Other" rettype='__int64' name='_bswap64'>
+	<type>Integer</type>
+	<category>Bit Manipulation</category>
+	<parameter varname='a' type='__int64'/>
+	<description>Reverse the byte order of 64-bit integer "a", and store the result in "dst". This intrinsic is provided for conversion between little and big endian values.</description>
+	<operation>
+dst[7:0] := a[63:56]
+dst[15:8] := a[55:48]
+dst[23:16] := a[47:40]
+dst[31:24] := a[39:32]
+dst[39:32] := a[31:24]
+dst[47:40] := a[23:16]
+dst[55:48] := a[15:8]
+dst[63:56] := a[7:0]
+	</operation>
+	<instruction name='bswap' form='r64'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='Other' rettype='unsigned __int32' name='_castf32_u32'>
+	<type>Floating Point</type>
+	<type>Integer</type>
+	<category>Cast</category>
+	<parameter varname='a' type='float'/>
+	<description>Cast from type float to type unsigned __int32 without conversion.
+	This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency.</description>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='Other' rettype='unsigned __int64' name='_castf64_u64'>
+	<type>Floating Point</type>
+	<type>Integer</type>
+	<category>Cast</category>
+	<parameter varname='a' type='double'/>
+	<description>Cast from type double to type unsigned __int64 without conversion.
+	This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency.</description>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='Other' rettype='float' name='_castu32_f32'>
+	<type>Floating Point</type>
+	<type>Integer</type>
+	<category>Cast</category>
+	<parameter varname='a' type='unsigned __int32'/>
+	<description>Cast from type unsigned __int32 to type float without conversion.
+	This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency.</description>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='Other' rettype='double' name='_castu64_f64'>
+	<type>Floating Point</type>
+	<type>Integer</type>
+	<category>Cast</category>
+	<parameter varname='a' type='unsigned __int64'/>
+	<description>Cast from type unsigned __int64 to type double without conversion.
+	This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency.</description>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="Other" rettype='void' name='_fxrstor'>
+	<CPUID>FXSR</CPUID>
+	<category>OS-Targeted</category>
+	<parameter varname='mem_addr' type='void *'/>
+	<description>Reload the x87 FPU, MMX technology, XMM, and MXCSR registers from the 512-byte memory image at "mem_addr". This data should have been written to memory previously using the FXSAVE instruction, and in the same format as required by the operating mode. "mem_addr" must be aligned on a 16-byte boundary.</description>
+	<operation>
+(x87 FPU, MMX, XMM7-XMM0, MXCSR) := Load(MEM[mem_addr])
+	</operation>
+	<instruction name='fxrstor' form='MEMmfpxenv'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="Other" rettype='void' name='_fxrstor64'>
+	<CPUID>FXSR</CPUID>
+	<category>OS-Targeted</category>
+	<parameter varname='mem_addr' type='void *'/>
+	<description>Reload the x87 FPU, MMX technology, XMM, and MXCSR registers from the 512-byte memory image at "mem_addr". This data should have been written to memory previously using the FXSAVE64 instruction, and in the same format as required by the operating mode. "mem_addr" must be aligned on a 16-byte boundary.</description>
+	<operation>
+(x87 FPU, MMX, XMM7-XMM0, MXCSR) := Load(MEM[mem_addr])
+	</operation>
+	<instruction name='fxrstor64' form='MEMmfpxenv'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="Other" rettype='void' name='_fxsave'>
+	<CPUID>FXSR</CPUID>
+	<category>OS-Targeted</category>
+	<parameter varname='mem_addr' type='void *'/>
+	<description>Save the current state of the x87 FPU, MMX technology, XMM, and MXCSR registers to a 512-byte memory location at "mem_addr". The layout of the 512-byte region depends on the operating mode. Bytes [511:464] are available for software use and will not be overwritten by the processor.</description>
+	<operation>
+MEM[mem_addr+511*8:mem_addr] := Fxsave(x87 FPU, MMX, XMM7-XMM0, MXCSR)
+	</operation>
+	<instruction name='fxsave' form='MEMmfpxenv'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="Other" rettype='void' name='_fxsave64'>
+	<CPUID>FXSR</CPUID>
+	<category>OS-Targeted</category>
+	<parameter varname='mem_addr' type='void *'/>
+	<description>Save the current state of the x87 FPU, MMX technology, XMM, and MXCSR registers to a 512-byte memory location at "mem_addr". The layout of the 512-byte region depends on the operating mode. Bytes [511:464] are available for software use and will not be overwritten by the processor.</description>
+	<operation>
+MEM[mem_addr+511*8:mem_addr] := Fxsave64(x87 FPU, MMX, XMM7-XMM0, MXCSR)
+	</operation>
+	<instruction name='fxsave64' form='MEMmfpxenv'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="Other" rettype='unsigned long' name='_lrotl'>
+	<type>Integer</type>
+	<category>Shift</category>
+	<parameter varname='a' type='unsigned long'/>
+	<parameter varname='shift' type='int'/>
+	<description>Shift the bits of unsigned 64-bit integer "a" left by the number of bits specified in "shift", rotating the most-significant bit to the least-significant bit location, and store the unsigned result in "dst".</description>
+	<operation>
+dst := a
+count := shift BITWISE AND 63
+DO WHILE (count &gt; 0)
+	tmp[0] := dst[63]
+	dst := (dst &lt;&lt; 1) OR tmp[0]
+	count := count - 1
+OD
+	</operation>
+	<instruction name='rol' form='r64, imm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="Other" rettype='unsigned long' name='_lrotr'>
+	<type>Integer</type>
+	<category>Shift</category>
+	<parameter varname='a' type='unsigned long'/>
+	<parameter varname='shift' type='int'/>
+	<description>Shift the bits of unsigned 64-bit integer "a" right by the number of bits specified in "shift", rotating the least-significant bit to the most-significant bit location, and store the unsigned result in "dst".</description>
+	<operation>
+dst := a
+count := shift BITWISE AND 63
+DO WHILE (count &gt; 0)
+	tmp[63] := dst[0]
+	dst := (dst &gt;&gt; 1) OR tmp[63]
+	count := count - 1
+OD
+	</operation>
+	<instruction name='ror' form='r64, imm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+
+
+
+<intrinsic tech='Other' sequence='true' rettype='void' name='_allow_cpu_features'>
+	<category>General Support</category>
+	<parameter varname='a' type='unsigned __int64'/>
+	<description>Treat the processor-specific feature(s) specified in "a" as available. Multiple features may be OR'd together. See the valid feature flags below:</description>
+	<operation>
+_FEATURE_GENERIC_IA32
+_FEATURE_FPU
+_FEATURE_CMOV
+_FEATURE_MMX
+_FEATURE_FXSAVE
+_FEATURE_SSE
+_FEATURE_SSE2
+_FEATURE_SSE3
+_FEATURE_SSSE3
+_FEATURE_SSE4_1
+_FEATURE_SSE4_2
+_FEATURE_MOVBE
+_FEATURE_POPCNT
+_FEATURE_PCLMULQDQ
+_FEATURE_AES
+_FEATURE_F16C
+_FEATURE_AVX
+_FEATURE_RDRND
+_FEATURE_FMA
+_FEATURE_BMI
+_FEATURE_LZCNT
+_FEATURE_HLE
+_FEATURE_RTM
+_FEATURE_AVX2
+_FEATURE_KNCNI
+_FEATURE_AVX512F
+_FEATURE_ADX
+_FEATURE_RDSEED
+_FEATURE_AVX512ER
+_FEATURE_AVX512PF
+_FEATURE_AVX512CD
+_FEATURE_SHA
+_FEATURE_MPX
+	</operation>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='Other' sequence='true' rettype='int' name='_may_i_use_cpu_feature'>
+	<category>General Support</category>
+	<parameter varname='a' type='unsigned __int64'/>
+	<description>Dynamically query the processor to determine if the processor-specific feature(s) specified in "a" are available, and return true or false (1 or 0) if the set of features is available. Multiple features may be OR'd together. This intrinsic does not check the processor vendor. See the valid feature flags below:</description>
+	<operation>
+_FEATURE_GENERIC_IA32
+_FEATURE_FPU
+_FEATURE_CMOV
+_FEATURE_MMX
+_FEATURE_FXSAVE
+_FEATURE_SSE
+_FEATURE_SSE2
+_FEATURE_SSE3
+_FEATURE_SSSE3
+_FEATURE_SSE4_1
+_FEATURE_SSE4_2
+_FEATURE_MOVBE
+_FEATURE_POPCNT
+_FEATURE_PCLMULQDQ
+_FEATURE_AES
+_FEATURE_F16C
+_FEATURE_AVX
+_FEATURE_RDRND
+_FEATURE_FMA
+_FEATURE_BMI
+_FEATURE_LZCNT
+_FEATURE_HLE
+_FEATURE_RTM
+_FEATURE_AVX2
+_FEATURE_KNCNI
+_FEATURE_AVX512F
+_FEATURE_ADX
+_FEATURE_RDSEED
+_FEATURE_AVX512ER
+_FEATURE_AVX512PF
+_FEATURE_AVX512CD
+_FEATURE_SHA
+_FEATURE_MPX
+	</operation>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='SVML' sequence='true' rettype='__m128d' name='_mm_acos_pd'>
+	<type>Floating Point</type>
+	<CPUID>SSE</CPUID>
+	<category>Trigonometry</category>
+	<parameter varname='a' type='__m128d'/>
+	<description>Compute the inverse cosine of packed double-precision (64-bit) floating-point elements in "a" expressed in radians, and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 1
+	i := j*64
+	dst[i+63:i] := ACOS(a[i+63:i])
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='SVML' sequence='true' rettype='__m128' name='_mm_acos_ps'>
+	<type>Floating Point</type>
+	<CPUID>SSE</CPUID>
+	<category>Trigonometry</category>
+	<parameter varname='a' type='__m128'/>
+	<description>Compute the inverse cosine of packed single-precision (32-bit) floating-point elements in "a" expressed in radians, and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 3
+	i := j*32
+	dst[i+31:i] := ACOS(a[i+31:i])
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='SVML' sequence='true' rettype='__m128d' name='_mm_acosh_pd'>
+	<type>Floating Point</type>
+	<CPUID>SSE</CPUID>
+	<category>Trigonometry</category>
+	<parameter varname='a' type='__m128d'/>
+	<description>Compute the inverse hyperbolic cosine of packed double-precision (64-bit) floating-point elements in "a" expressed in radians, and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 1
+	i := j*64
+	dst[i+63:i] := ACOSH(a[i+63:i])
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='SVML' sequence='true' rettype='__m128' name='_mm_acosh_ps'>
+	<type>Floating Point</type>
+	<CPUID>SSE</CPUID>
+	<category>Trigonometry</category>
+	<parameter varname='a' type='__m128'/>
+	<description>Compute the inverse hyperbolic cosine of packed single-precision (32-bit) floating-point elements in "a" expressed in radians, and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 3
+	i := j*32
+	dst[i+31:i] := ACOSH(a[i+31:i])
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='SVML' sequence='true' rettype='__m128d' name='_mm_asin_pd'>
+	<type>Floating Point</type>
+	<CPUID>SSE</CPUID>
+	<category>Trigonometry</category>
+	<parameter varname='a' type='__m128d'/>
+	<description>Compute the inverse sine of packed double-precision (64-bit) floating-point elements in "a" expressed in radians, and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 1
+	i := j*64
+	dst[i+63:i] := ASIN(a[i+63:i])
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='SVML' sequence='true' rettype='__m128' name='_mm_asin_ps'>
+	<type>Floating Point</type>
+	<CPUID>SSE</CPUID>
+	<category>Trigonometry</category>
+	<parameter varname='a' type='__m128'/>
+	<description>Compute the inverse sine of packed single-precision (32-bit) floating-point elements in "a" expressed in radians, and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 3
+	i := j*32
+	dst[i+31:i] := ASIN(a[i+31:i])
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='SVML' sequence='true' rettype='__m128d' name='_mm_asinh_pd'>
+	<type>Floating Point</type>
+	<CPUID>SSE</CPUID>
+	<category>Trigonometry</category>
+	<parameter varname='a' type='__m128d'/>
+	<description>Compute the inverse hyperbolic sine of packed double-precision (64-bit) floating-point elements in "a" expressed in radians, and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 1
+	i := j*64
+	dst[i+63:i] := ASINH(a[i+63:i])
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='SVML' sequence='true' rettype='__m128' name='_mm_asinh_ps'>
+	<type>Floating Point</type>
+	<CPUID>SSE</CPUID>
+	<category>Trigonometry</category>
+	<parameter varname='a' type='__m128'/>
+	<description>Compute the inverse hyperbolic sine of packed single-precision (32-bit) floating-point elements in "a" expressed in radians, and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 3
+	i := j*32
+	dst[i+31:i] := ASINH(a[i+31:i])
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='SVML' sequence='true' rettype='__m128d' name='_mm_atan_pd'>
+	<type>Floating Point</type>
+	<CPUID>SSE</CPUID>
+	<category>Trigonometry</category>
+	<parameter varname='a' type='__m128d'/>
+	<description>Compute the inverse tangent of packed double-precision (64-bit) floating-point elements in "a" expressed in radians, and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 1
+	i := j*64
+	dst[i+63:i] := ATAN(a[i+63:i])
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='SVML' sequence='true' rettype='__m128' name='_mm_atan_ps'>
+	<type>Floating Point</type>
+	<CPUID>SSE</CPUID>
+	<category>Trigonometry</category>
+	<parameter varname='a' type='__m128'/>
+	<description>Compute the inverse tangent of packed single-precision (32-bit) floating-point elements in "a" expressed in radians, and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 3
+	i := j*32
+	dst[i+31:i] := ATAN(a[i+31:i])
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='SVML' sequence='true' rettype='__m128d' name='_mm_atan2_pd'>
+	<type>Floating Point</type>
+	<CPUID>SSE</CPUID>
+	<category>Trigonometry</category>
+	<parameter varname='a' type='__m128d'/>
+	<parameter varname='b' type='__m128d'/>
+	<description>Compute the inverse tangent of packed double-precision (64-bit) floating-point elements in "a" divided by packed elements in "b", and store the results in "dst" expressed in radians.</description>
+	<operation>
+FOR j := 0 to 1
+	i := j*64
+	dst[i+63:i] := ATAN(a[i+63:i] / b[i+63:i])
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='SVML' sequence='true' rettype='__m128' name='_mm_atan2_ps'>
+	<type>Floating Point</type>
+	<CPUID>SSE</CPUID>
+	<category>Trigonometry</category>
+	<parameter varname='a' type='__m128'/>
+	<parameter varname='b' type='__m128'/>
+	<description>Compute the inverse tangent of packed single-precision (32-bit) floating-point elements in "a" divided by packed elements in "b", and store the results in "dst" expressed in radians.</description>
+	<operation>
+FOR j := 0 to 3
+	i := j*32
+	dst[i+31:i] := ATAN(a[i+31:i] / b[i+31:i])
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='SVML' sequence='true' rettype='__m128d' name='_mm_atanh_pd'>
+	<type>Floating Point</type>
+	<CPUID>SSE</CPUID>
+	<category>Trigonometry</category>
+	<parameter varname='a' type='__m128d'/>
+	<description>Compute the inverse hyperbolic tangent of packed double-precision (64-bit) floating-point elements in "a" expressed in radians, and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 1
+	i := j*64
+	dst[i+63:i] := ATANH(a[i+63:i])
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='SVML' sequence='true' rettype='__m128' name='_mm_atanh_ps'>
+	<type>Floating Point</type>
+	<CPUID>SSE</CPUID>
+	<category>Trigonometry</category>
+	<parameter varname='a' type='__m128'/>
+	<description>Compute the inverse hyperbolic tangent of packed single-precision (32-bit) floating-point elements in "a" expressed in radians, and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 3
+	i := j*32
+	dst[i+31:i] := ATANH(a[i+31:i])
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='SVML' sequence='true' rettype='__m128d' name='_mm_cbrt_pd'>
+	<type>Floating Point</type>
+	<CPUID>SSE</CPUID>
+	<category>Elementary Math Functions</category>
+	<parameter varname='a' type='__m128d'/>
+	<description>Compute the cube root of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 1
+	i := j*64
+	dst[i+63:i] := CubeRoot(a[i+63:i])
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='SVML' sequence='true' rettype='__m128' name='_mm_cbrt_ps'>
+	<type>Floating Point</type>
+	<CPUID>SSE</CPUID>
+	<category>Elementary Math Functions</category>
+	<parameter varname='a' type='__m128'/>
+	<description>Compute the cube root of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 3
+	i := j*32
+	dst[i+31:i] := CubeRoot(a[i+31:i])
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='SVML' sequence='true' rettype='__m128d' name='_mm_cdfnorm_pd'>
+	<type>Floating Point</type>
+	<CPUID>SSE</CPUID>
+	<category>Probability/Statistics</category>
+	<parameter varname='a' type='__m128d'/>
+	<description>Compute the cumulative distribution function of packed double-precision (64-bit) floating-point elements in "a" using the normal distribution, and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 1
+	i := j*64
+	dst[i+63:i] := CDFNormal(a[i+63:i])
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='SVML' sequence='true' rettype='__m128' name='_mm_cdfnorm_ps'>
+	<type>Floating Point</type>
+	<CPUID>SSE</CPUID>
+	<category>Probability/Statistics</category>
+	<parameter varname='a' type='__m128'/>
+	<description>Compute the cumulative distribution function of packed single-precision (32-bit) floating-point elements in "a" using the normal distribution, and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 3
+	i := j*32
+	dst[i+31:i] := CDFNormal(a[i+31:i])
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='SVML' sequence='true' rettype='__m128d' name='_mm_cdfnorminv_pd'>
+	<type>Floating Point</type>
+	<CPUID>SSE</CPUID>
+	<category>Probability/Statistics</category>
+	<parameter varname='a' type='__m128d'/>
+	<description>Compute the inverse cumulative distribution function of packed double-precision (64-bit) floating-point elements in "a" using the normal distribution, and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 1
+	i := j*64
+	dst[i+63:i] := InverseCDFNormal(a[i+63:i])
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='SVML' sequence='true' rettype='__m128' name='_mm_cdfnorminv_ps'>
+	<type>Floating Point</type>
+	<CPUID>SSE</CPUID>
+	<category>Probability/Statistics</category>
+	<parameter varname='a' type='__m128'/>
+	<description>Compute the inverse cumulative distribution function of packed single-precision (32-bit) floating-point elements in "a" using the normal distribution, and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 3
+	i := j*32
+	dst[i+31:i] := InverseCDFNormal(a[i+31:i])
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='SVML' sequence='true' rettype='__m128' name='_mm_cexp_ps'>
+	<type>Floating Point</type>
+	<CPUID>SSE</CPUID>
+	<category>Elementary Math Functions</category>
+	<parameter varname='a' type='__m128'/>
+	<description>Compute the exponential value of "e" raised to the power of packed complex single-precision (32-bit) floating-point elements in "a", and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 3
+	i := j*32
+	dst[i+31:i] := e^(a[i+31:i])
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='SVML' sequence='true' rettype='__m128' name='_mm_clog_ps'>
+	<type>Floating Point</type>
+	<CPUID>SSE</CPUID>
+	<category>Elementary Math Functions</category>
+	<parameter varname='a' type='__m128'/>
+	<description>Compute the natural logarithm of packed complex single-precision (32-bit) floating-point elements in "a", and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 3
+	i := j*32
+	dst[i+31:i] := ln(a[i+31:i])
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='SVML' sequence='true' rettype='__m128d' name='_mm_cos_pd'>
+	<type>Floating Point</type>
+	<CPUID>SSE</CPUID>
+	<category>Trigonometry</category>
+	<parameter varname='a' type='__m128d'/>
+	<description>Compute the cosine of packed double-precision (64-bit) floating-point elements in "a" expressed in radians, and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 1
+	i := j*64
+	dst[i+63:i] := COS(a[i+63:i])
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='SVML' sequence='true' rettype='__m128' name='_mm_cos_ps'>
+	<type>Floating Point</type>
+	<CPUID>SSE</CPUID>
+	<category>Trigonometry</category>
+	<parameter varname='a' type='__m128'/>
+	<description>Compute the cosine of packed single-precision (32-bit) floating-point elements in "a" expressed in radians, and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 3
+	i := j*32
+	dst[i+31:i] := COS(a[i+31:i])
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='SVML' sequence='true' rettype='__m128d' name='_mm_cosd_pd'>
+	<type>Floating Point</type>
+	<CPUID>SSE</CPUID>
+	<category>Trigonometry</category>
+	<parameter varname='a' type='__m128d'/>
+	<description>Compute the cosine of packed double-precision (64-bit) floating-point elements in "a" expressed in degrees, and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 1
+	i := j*64
+	dst[i+63:i] := COSD(a[i+63:i])
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='SVML' sequence='true' rettype='__m128' name='_mm_cosd_ps'>
+	<type>Floating Point</type>
+	<CPUID>SSE</CPUID>
+	<category>Trigonometry</category>
+	<parameter varname='a' type='__m128'/>
+	<description>Compute the cosine of packed single-precision (32-bit) floating-point elements in "a" expressed in degrees, and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 3
+	i := j*32
+	dst[i+31:i] := COSD(a[i+31:i])
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='SVML' sequence='true' rettype='__m128d' name='_mm_cosh_pd'>
+	<type>Floating Point</type>
+	<CPUID>SSE</CPUID>
+	<category>Trigonometry</category>
+	<parameter varname='a' type='__m128d'/>
+	<description>Compute the hyperbolic cosine of packed double-precision (64-bit) floating-point elements in "a" expressed in radians, and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 1
+	i := j*64
+	dst[i+63:i] := COSH(a[i+63:i])
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='SVML' sequence='true' rettype='__m128' name='_mm_cosh_ps'>
+	<type>Floating Point</type>
+	<CPUID>SSE</CPUID>
+	<category>Trigonometry</category>
+	<parameter varname='a' type='__m128'/>
+	<description>Compute the hyperbolic cosine of packed single-precision (32-bit) floating-point elements in "a" expressed in radians, and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 3
+	i := j*32
+	dst[i+31:i] := COSH(a[i+31:i])
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='SVML' sequence='true' rettype='__m128' name='_mm_csqrt_ps'>
+	<type>Floating Point</type>
+	<CPUID>SSE</CPUID>
+	<category>Elementary Math Functions</category>
+	<parameter varname='a' type='__m128'/>
+	<description>Compute the square root of packed complex single-precision (32-bit) floating-point elements in "a", and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 3
+	i := j*32
+	dst[i+31:i] := SQRT(a[i+31:i])
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='SVML' sequence='true' rettype='__m128i' name='_mm_div_epi8'>
+	<type>Integer</type>
+	<CPUID>SSE</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname='a' type='__m128i'/>
+	<parameter varname='b' type='__m128i'/>
+	<description>Divide packed 8-bit integers in "a" by packed elements in "b", and store the truncated results in "dst".</description>
+	<operation>
+FOR j := 0 to 15
+	i := 8*j
+	dst[i+7:i] := TRUNCATE(a[i+7:i] / b[i+7:i])
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='SVML' sequence='true' rettype='__m128i' name='_mm_div_epi16'>
+	<type>Integer</type>
+	<CPUID>SSE</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname='a' type='__m128i'/>
+	<parameter varname='b' type='__m128i'/>
+	<description>Divide packed 16-bit integers in "a" by packed elements in "b", and store the truncated results in "dst".</description>
+	<operation>
+FOR j := 0 to 7
+	i := 16*j
+	dst[i+15:i] := TRUNCATE(a[i+15:i] / b[i+15:i])
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='SVML' sequence='true' rettype='__m128i' name='_mm_div_epi32'>
+	<type>Integer</type>
+	<CPUID>SSE</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname='a' type='__m128i'/>
+	<parameter varname='b' type='__m128i'/>
+	<description>Divide packed 32-bit integers in "a" by packed elements in "b", and store the truncated results in "dst".</description>
+	<operation>
+FOR j := 0 to 3
+	i := 32*j
+	dst[i+31:i] := TRUNCATE(a[i+31:i] / b[i+31:i])
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='SVML' sequence='true' rettype='__m128i' name='_mm_div_epi64'>
+	<type>Integer</type>
+	<CPUID>SSE</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname='a' type='__m128i'/>
+	<parameter varname='b' type='__m128i'/>
+	<description>Divide packed 64-bit integers in "a" by packed elements in "b", and store the truncated results in "dst".</description>
+	<operation>
+FOR j := 0 to 1
+	i := 64*j
+	dst[i+63:i] := TRUNCATE(a[i+63:i] / b[i+63:i])
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='SVML' sequence='true' rettype='__m128i' name='_mm_div_epu8'>
+	<type>Integer</type>
+	<CPUID>SSE</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname='a' type='__m128i'/>
+	<parameter varname='b' type='__m128i'/>
+	<description>Divide packed unsigned 8-bit integers in "a" by packed elements in "b", and store the truncated results in "dst".</description>
+	<operation>
+FOR j := 0 to 15
+	i := 8*j
+	dst[i+7:i] := TRUNCATE(a[i+7:i] / b[i+7:i])
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='SVML' sequence='true' rettype='__m128i' name='_mm_div_epu16'>
+	<type>Integer</type>
+	<CPUID>SSE</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname='a' type='__m128i'/>
+	<parameter varname='b' type='__m128i'/>
+	<description>Divide packed unsigned 16-bit integers in "a" by packed elements in "b", and store the truncated results in "dst".</description>
+	<operation>
+FOR j := 0 to 7
+	i := 16*j
+	dst[i+15:i] := TRUNCATE(a[i+15:i] / b[i+15:i])
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='SVML' sequence='true' rettype='__m128i' name='_mm_div_epu32'>
+	<type>Integer</type>
+	<CPUID>SSE</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname='a' type='__m128i'/>
+	<parameter varname='b' type='__m128i'/>
+	<description>Divide packed unsigned 32-bit integers in "a" by packed elements in "b", and store the truncated results in "dst".</description>
+	<operation>
+FOR j := 0 to 3
+	i := 32*j
+	dst[i+31:i] := TRUNCATE(a[i+31:i] / b[i+31:i])
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='SVML' sequence='true' rettype='__m128i' name='_mm_div_epu64'>
+	<type>Integer</type>
+	<CPUID>SSE</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname='a' type='__m128i'/>
+	<parameter varname='b' type='__m128i'/>
+	<description>Divide packed unsigned 64-bit integers in "a" by packed elements in "b", and store the truncated results in "dst".</description>
+	<operation>
+FOR j := 0 to 1
+	i := 64*j
+	dst[i+63:i] := TRUNCATE(a[i+63:i] / b[i+63:i])
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='SVML' sequence='true' rettype='__m128d' name='_mm_erf_pd'>
+	<type>Floating Point</type>
+	<CPUID>SSE</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname='a' type='__m128d'/>
+	<description>Compute the error function of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 1
+	i := j*64
+	dst[i+63:i] := ERF(a[i+63:i])
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='SVML' sequence='true' rettype='__m128' name='_mm_erf_ps'>
+	<type>Floating Point</type>
+	<CPUID>SSE</CPUID>
+	<category>Probability/Statistics</category>
+	<parameter varname='a' type='__m128'/>
+	<description>Compute the error function of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 3
+	i := j*32
+	dst[i+31:i] := ERF(a[i+31:i])
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='SVML' sequence='true' rettype='__m128d' name='_mm_erfc_pd'>
+	<type>Floating Point</type>
+	<CPUID>SSE</CPUID>
+	<category>Probability/Statistics</category>
+	<parameter varname='a' type='__m128d'/>
+	<description>Compute the complementary error function of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 1
+	i := j*64
+	dst[i+63:i] := 1.0 - ERF(a[i+63:i])
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='SVML' sequence='true' rettype='__m128' name='_mm_erfc_ps'>
+	<type>Floating Point</type>
+	<CPUID>SSE</CPUID>
+	<category>Probability/Statistics</category>
+	<parameter varname='a' type='__m128'/>
+	<description>Compute the complementary error function of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 3
+	i := j*32
+	dst[i+31:i] := 1.0 - ERF(a[i+31:i])
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='SVML' sequence='true' rettype='__m128d' name='_mm_erfcinv_pd'>
+	<type>Floating Point</type>
+	<CPUID>SSE</CPUID>
+	<category>Probability/Statistics</category>
+	<parameter varname='a' type='__m128d'/>
+	<description>Compute the inverse complementary error function of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 1
+	i := j*64
+	dst[i+63:i] := 1.0 / (1.0 - ERF(a[i+63:i]))
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='SVML' sequence='true' rettype='__m128' name='_mm_erfcinv_ps'>
+	<type>Floating Point</type>
+	<CPUID>SSE</CPUID>
+	<category>Probability/Statistics</category>
+	<parameter varname='a' type='__m128'/>
+	<description>Compute the inverse complementary error function of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 3
+	i := j*32
+	dst[i+31:i] := 1.0 / (1.0 - ERF(a[i+31:i]))
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='SVML' sequence='true' rettype='__m128d' name='_mm_erfinv_pd'>
+	<type>Floating Point</type>
+	<CPUID>SSE</CPUID>
+	<category>Probability/Statistics</category>
+	<parameter varname='a' type='__m128d'/>
+	<description>Compute the inverse error function of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 1
+	i := j*64
+	dst[i+63:i] := 1.0 / ERF(a[i+63:i])
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='SVML' sequence='true' rettype='__m128' name='_mm_erfinv_ps'>
+	<type>Floating Point</type>
+	<CPUID>SSE</CPUID>
+	<category>Probability/Statistics</category>
+	<parameter varname='a' type='__m128'/>
+	<description>Compute the inverse error function of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 3
+	i := j*32
+	dst[i+31:i] := 1.0 / ERF(a[i+31:i])
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='SVML' sequence='true' rettype='__m128d' name='_mm_exp_pd'>
+	<type>Floating Point</type>
+	<CPUID>SSE</CPUID>
+	<category>Elementary Math Functions</category>
+	<parameter varname='a' type='__m128d'/>
+	<description>Compute the exponential value of "e" raised to the power of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 1
+	i := j*64
+	dst[i+63:i] := e^(a[i+63:i])
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='SVML' sequence='true' rettype='__m128' name='_mm_exp_ps'>
+	<type>Floating Point</type>
+	<CPUID>SSE</CPUID>
+	<category>Elementary Math Functions</category>
+	<parameter varname='a' type='__m128'/>
+	<description>Compute the exponential value of "e" raised to the power of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 3
+	i := j*32
+	dst[i+31:i] := e^(a[i+31:i])
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='SVML' sequence='true' rettype='__m128d' name='_mm_exp10_pd'>
+	<type>Floating Point</type>
+	<CPUID>SSE</CPUID>
+	<category>Elementary Math Functions</category>
+	<parameter varname='a' type='__m128d'/>
+	<description>Compute the exponential value of 10 raised to the power of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 1
+	i := j*64
+	dst[i+63:i] := 10^(a[i+63:i])
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='SVML' sequence='true' rettype='__m128' name='_mm_exp10_ps'>
+	<type>Floating Point</type>
+	<CPUID>SSE</CPUID>
+	<category>Elementary Math Functions</category>
+	<parameter varname='a' type='__m128'/>
+	<description>Compute the exponential value of 10 raised to the power of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 3
+	i := j*32
+	dst[i+31:i] := 10^(a[i+31:i])
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='SVML' sequence='true' rettype='__m128d' name='_mm_exp2_pd'>
+	<type>Floating Point</type>
+	<CPUID>SSE</CPUID>
+	<category>Elementary Math Functions</category>
+	<parameter varname='a' type='__m128d'/>
+	<description>Compute the exponential value of 2 raised to the power of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 1
+	i := j*64
+	dst[i+63:i] := 2^(a[i+63:i])
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='SVML' sequence='true' rettype='__m128' name='_mm_exp2_ps'>
+	<type>Floating Point</type>
+	<CPUID>SSE</CPUID>
+	<category>Elementary Math Functions</category>
+	<parameter varname='a' type='__m128'/>
+	<description>Compute the exponential value of 2 raised to the power of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 3
+	i := j*32
+	dst[i+31:i] := 2^(a[i+31:i])
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='SVML' sequence='true' rettype='__m128d' name='_mm_expm1_pd'>
+	<type>Floating Point</type>
+	<CPUID>SSE</CPUID>
+	<category>Elementary Math Functions</category>
+	<parameter varname='a' type='__m128d'/>
+	<description>Compute the exponential value of "e" raised to the power of packed double-precision (64-bit) floating-point elements in "a", subtract one from each element, and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 1
+	i := j*64
+	dst[i+63:i] := e^(a[i+63:i]) - 1.0
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='SVML' sequence='true' rettype='__m128' name='_mm_expm1_ps'>
+	<type>Floating Point</type>
+	<CPUID>SSE</CPUID>
+	<category>Elementary Math Functions</category>
+	<parameter varname='a' type='__m128'/>
+	<description>Compute the exponential value of "e" raised to the power of packed single-precision (32-bit) floating-point elements in "a", subtract one from each element, and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 3
+	i := j*32
+	dst[i+31:i] := e^(a[i+31:i]) - 1.0
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='SVML' sequence='true' rettype='__m128d' name='_mm_hypot_pd'>
+	<type>Floating Point</type>
+	<CPUID>SSE</CPUID>
+	<category>Trigonometry</category>
+	<parameter varname='a' type='__m128d'/>
+	<parameter varname='b' type='__m128d'/>
+	<description>Compute the length of the hypotenuse of a right triangle, with the lengths of the other two sides of the triangle stored as packed double-precision (64-bit) floating-point elements in "a" and "b", and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 1
+	i := j*64
+	dst[i+63:i] := SQRT(a[i+63:i]^2 + b[i+63:i]^2)
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='SVML' sequence='true' rettype='__m128' name='_mm_hypot_ps'>
+	<type>Floating Point</type>
+	<CPUID>SSE</CPUID>
+	<category>Trigonometry</category>
+	<parameter varname='a' type='__m128'/>
+	<parameter varname='b' type='__m128'/>
+	<description>Compute the length of the hypotenuse of a right triangle, with the lengths of the other two sides of the triangle stored as packed single-precision (32-bit) floating-point elements in "a" and "b", and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 3
+	i := j*32
+	dst[i+31:i] := SQRT(a[i+31:i]^2 + b[i+31:i]^2)
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='SVML' sequence='true' rettype='__m128i' name='_mm_idiv_epi32'>
+	<type>Integer</type>
+	<CPUID>SSE</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname='a' type='__m128i'/>
+	<parameter varname='b' type='__m128i'/>
+	<description>Divide packed 32-bit integers in "a" by packed elements in "b", and store the truncated results in "dst".</description>
+	<operation>
+FOR j := 0 to 3
+	i := 32*j
+	dst[i+31:i] := TRUNCATE(a[i+31:i] / b[i+31:i])
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='SVML' sequence='true' rettype='__m128i' name='_mm_idivrem_epi32'>
+	<type>Integer</type>
+	<CPUID>SSE</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname='mem_addr' type='__m128i *'/>
+	<parameter varname='a' type='__m128i'/>
+	<parameter varname='b' type='__m128i'/>
+	<description>Divide packed 32-bit integers in "a" by packed elements in "b", store the truncated results in "dst", and store the remainders as packed 32-bit integers into memory at "mem_addr".</description>
+	<operation>
+FOR j := 0 to 3
+	i := 32*j
+	dst[i+31:i] := TRUNCATE(a[i+31:i] / b[i+31:i])
+	MEM[mem_addr+i+31:mem_addr+i] := REMAINDER(a[i+31:i] / b[i+31:i])
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='SVML' sequence='true' rettype='__m128d' name='_mm_invcbrt_pd'>
+	<type>Floating Point</type>
+	<CPUID>SSE</CPUID>
+	<category>Elementary Math Functions</category>
+	<parameter varname='a' type='__m128d'/>
+	<description>Compute the inverse cube root of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 1
+	i := j*64
+	dst[i+63:i] := InvCubeRoot(a[i+63:i])
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='SVML' sequence='true' rettype='__m128' name='_mm_invcbrt_ps'>
+	<type>Floating Point</type>
+	<CPUID>SSE</CPUID>
+	<category>Elementary Math Functions</category>
+	<parameter varname='a' type='__m128'/>
+	<description>Compute the inverse cube root of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 3
+	i := j*32
+	dst[i+31:i] := InvCubeRoot(a[i+31:i])
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='SVML' sequence='true' rettype='__m128d' name='_mm_invsqrt_pd'>
+	<type>Floating Point</type>
+	<CPUID>SSE</CPUID>
+	<category>Elementary Math Functions</category>
+	<parameter varname='a' type='__m128d'/>
+	<description>Compute the inverse square root of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 1
+	i := j*64
+	dst[i+63:i] := InvSQRT(a[i+63:i])
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='SVML' sequence='true' rettype='__m128' name='_mm_invsqrt_ps'>
+	<type>Floating Point</type>
+	<CPUID>SSE</CPUID>
+	<category>Elementary Math Functions</category>
+	<parameter varname='a' type='__m128'/>
+	<description>Compute the inverse square root of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 3
+	i := j*32
+	dst[i+31:i] := InvSQRT(a[i+31:i])
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='SVML' sequence='true' rettype='__m128i' name='_mm_irem_epi32'>
+	<type>Integer</type>
+	<CPUID>SSE</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname='a' type='__m128i'/>
+	<parameter varname='b' type='__m128i'/>
+	<description>Divide packed 32-bit integers in "a" by packed elements in "b", and store the remainders as packed 32-bit integers in "dst".</description>
+	<operation>
+FOR j := 0 to 3
+	i := 32*j
+	dst[i+31:i] := REMAINDER(a[i+31:i] / b[i+31:i])
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='SVML' sequence='true' rettype='__m128d' name='_mm_log_pd'>
+	<type>Floating Point</type>
+	<CPUID>SSE</CPUID>
+	<category>Elementary Math Functions</category>
+	<parameter varname='a' type='__m128d'/>
+	<description>Compute the natural logarithm of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 1
+	i := j*64
+	dst[i+63:i] := ln(a[i+63:i])
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='SVML' sequence='true' rettype='__m128' name='_mm_log_ps'>
+	<type>Floating Point</type>
+	<CPUID>SSE</CPUID>
+	<category>Elementary Math Functions</category>
+	<parameter varname='a' type='__m128'/>
+	<description>Compute the natural logarithm of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 3
+	i := j*32
+	dst[i+31:i] := ln(a[i+31:i])
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='SVML' sequence='true' rettype='__m128d' name='_mm_log10_pd'>
+	<type>Floating Point</type>
+	<CPUID>SSE</CPUID>
+	<category>Elementary Math Functions</category>
+	<parameter varname='a' type='__m128d'/>
+	<description>Compute the base-10 logarithm of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 1
+	i := j*64
+	dst[i+63:i] := log10(a[i+63:i])
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='SVML' sequence='true' rettype='__m128' name='_mm_log10_ps'>
+	<type>Floating Point</type>
+	<CPUID>SSE</CPUID>
+	<category>Elementary Math Functions</category>
+	<parameter varname='a' type='__m128'/>
+	<description>Compute the base-10 logarithm of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 3
+	i := j*32
+	dst[i+31:i] := log10(a[i+31:i])
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='SVML' sequence='true' rettype='__m128d' name='_mm_log1p_pd'>
+	<type>Floating Point</type>
+	<CPUID>SSE</CPUID>
+	<category>Elementary Math Functions</category>
+	<parameter varname='a' type='__m128d'/>
+	<description>Compute the natural logarithm of one plus packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 1
+	i := j*64
+	dst[i+63:i] := ln(1.0 + a[i+63:i])
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='SVML' sequence='true' rettype='__m128' name='_mm_log1p_ps'>
+	<type>Floating Point</type>
+	<CPUID>SSE</CPUID>
+	<category>Elementary Math Functions</category>
+	<parameter varname='a' type='__m128'/>
+	<description>Compute the natural logarithm of one plus packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 3
+	i := j*32
+	dst[i+31:i] := ln(1.0 + a[i+31:i])
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='SVML' sequence='true' rettype='__m128d' name='_mm_log2_pd'>
+	<type>Floating Point</type>
+	<CPUID>SSE</CPUID>
+	<category>Elementary Math Functions</category>
+	<parameter varname='a' type='__m128d'/>
+	<description>Compute the base-2 logarithm of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 1
+	i := j*64
+	dst[i+63:i] := log2(a[i+63:i])
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='SVML' sequence='true' rettype='__m128' name='_mm_log2_ps'>
+	<type>Floating Point</type>
+	<CPUID>SSE</CPUID>
+	<category>Elementary Math Functions</category>
+	<parameter varname='a' type='__m128'/>
+	<description>Compute the base-2 logarithm of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 3
+	i := j*32
+	dst[i+31:i] := log2(a[i+31:i])
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='SVML' sequence='true' rettype='__m128d' name='_mm_logb_pd'>
+	<type>Floating Point</type>
+	<CPUID>SSE</CPUID>
+	<category>Elementary Math Functions</category>
+	<parameter varname='a' type='__m128d'/>
+	<description>Convert the exponent of each packed double-precision (64-bit) floating-point element in "a" to a double-precision floating-point number representing the integer exponent, and store the results in "dst". This intrinsic essentially calculates "floor(log2(x))" for each element.</description>
+	<operation>
+FOR j := 0 to 1
+	i := j*64
+	dst[i+63:i] := ConvertExpFP64(a[i+63:i])
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='SVML' sequence='true' rettype='__m128' name='_mm_logb_ps'>
+	<type>Floating Point</type>
+	<CPUID>SSE</CPUID>
+	<category>Elementary Math Functions</category>
+	<parameter varname='a' type='__m128'/>
+	<description>Convert the exponent of each packed single-precision (32-bit) floating-point element in "a" to a single-precision floating-point number representing the integer exponent, and store the results in "dst". This intrinsic essentially calculates "floor(log2(x))" for each element.</description>
+	<operation>
+FOR j := 0 to 3
+	i := j*32
+	dst[i+31:i] := ConvertExpFP32(a[i+31:i])
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='SVML' sequence='true' rettype='__m128d' name='_mm_pow_pd'>
+	<type>Floating Point</type>
+	<CPUID>SSE</CPUID>
+	<category>Elementary Math Functions</category>
+	<parameter varname='a' type='__m128d'/>
+	<parameter varname='b' type='__m128d'/>
+	<description>Compute the exponential value of packed double-precision (64-bit) floating-point elements in "a" raised to the power of packed elements in "b", and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 1
+	i := j*64
+	dst[i+63:i] := (a[i+63:i])^(b[i+63:i])
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='SVML' sequence='true' rettype='__m128' name='_mm_pow_ps'>
+	<type>Floating Point</type>
+	<CPUID>SSE</CPUID>
+	<category>Elementary Math Functions</category>
+	<parameter varname='a' type='__m128'/>
+	<parameter varname='b' type='__m128'/>
+	<description>Compute the exponential value of packed single-precision (32-bit) floating-point elements in "a" raised to the power of packed elements in "b", and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 3
+	i := j*32
+	dst[i+31:i] := (a[i+31:i])^(b[i+31:i])
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='SVML' sequence='true' rettype='__m128i' name='_mm_rem_epi8'>
+	<type>Integer</type>
+	<CPUID>SSE</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname='a' type='__m128i'/>
+	<parameter varname='b' type='__m128i'/>
+	<description>Divide packed 8-bit integers in "a" by packed elements in "b", and store the remainders as packed 8-bit integers in "dst".</description>
+	<operation>
+FOR j := 0 to 15
+	i := 8*j
+	dst[i+7:i] := REMAINDER(a[i+7:i] / b[i+7:i])
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='SVML' sequence='true' rettype='__m128i' name='_mm_rem_epi16'>
+	<type>Integer</type>
+	<CPUID>SSE</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname='a' type='__m128i'/>
+	<parameter varname='b' type='__m128i'/>
+	<description>Divide packed 16-bit integers in "a" by packed elements in "b", and store the remainders as packed 16-bit integers in "dst".</description>
+	<operation>
+FOR j := 0 to 7
+	i := 16*j
+	dst[i+15:i] := REMAINDER(a[i+15:i] / b[i+15:i])
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='SVML' sequence='true' rettype='__m128i' name='_mm_rem_epi32'>
+	<type>Integer</type>
+	<CPUID>SSE</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname='a' type='__m128i'/>
+	<parameter varname='b' type='__m128i'/>
+	<description>Divide packed 32-bit integers in "a" by packed elements in "b", and store the remainders as packed 32-bit integers in "dst".</description>
+	<operation>
+FOR j := 0 to 3
+	i := 32*j
+	dst[i+31:i] := REMAINDER(a[i+31:i] / b[i+31:i])
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='SVML' sequence='true' rettype='__m128i' name='_mm_rem_epi64'>
+	<type>Integer</type>
+	<CPUID>SSE</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname='a' type='__m128i'/>
+	<parameter varname='b' type='__m128i'/>
+	<description>Divide packed 64-bit integers in "a" by packed elements in "b", and store the remainders as packed 64-bit integers in "dst".</description>
+	<operation>
+FOR j := 0 to 1
+	i := 64*j
+	dst[i+63:i] := REMAINDER(a[i+63:i] / b[i+63:i])
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='SVML' sequence='true' rettype='__m128i' name='_mm_rem_epu8'>
+	<type>Integer</type>
+	<CPUID>SSE</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname='a' type='__m128i'/>
+	<parameter varname='b' type='__m128i'/>
+	<description>Divide packed unsigned 8-bit integers in "a" by packed elements in "b", and store the remainders as packed unsigned 8-bit integers in "dst".</description>
+	<operation>
+FOR j := 0 to 15
+	i := 8*j
+	dst[i+7:i] := REMAINDER(a[i+7:i] / b[i+7:i])
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='SVML' sequence='true' rettype='__m128i' name='_mm_rem_epu16'>
+	<type>Integer</type>
+	<CPUID>SSE</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname='a' type='__m128i'/>
+	<parameter varname='b' type='__m128i'/>
+	<description>Divide packed unsigned 16-bit integers in "a" by packed elements in "b", and store the remainders as packed unsigned 16-bit integers in "dst".</description>
+	<operation>
+FOR j := 0 to 7
+	i := 16*j
+	dst[i+15:i] := REMAINDER(a[i+15:i] / b[i+15:i])
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='SVML' sequence='true' rettype='__m128i' name='_mm_rem_epu32'>
+	<type>Integer</type>
+	<CPUID>SSE</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname='a' type='__m128i'/>
+	<parameter varname='b' type='__m128i'/>
+	<description>Divide packed unsigned 32-bit integers in "a" by packed elements in "b", and store the remainders as packed unsigned 32-bit integers in "dst".</description>
+	<operation>
+FOR j := 0 to 3
+	i := 32*j
+	dst[i+31:i] := REMAINDER(a[i+31:i] / b[i+31:i])
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='SVML' sequence='true' rettype='__m128i' name='_mm_rem_epu64'>
+	<type>Integer</type>
+	<CPUID>SSE</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname='a' type='__m128i'/>
+	<parameter varname='b' type='__m128i'/>
+	<description>Divide packed unsigned 64-bit integers in "a" by packed elements in "b", and store the remainders as packed unsigned 64-bit integers in "dst".</description>
+	<operation>
+FOR j := 0 to 1
+	i := 64*j
+	dst[i+63:i] := REMAINDER(a[i+63:i] / b[i+63:i])
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='SVML' sequence='true' rettype='__m128d' name='_mm_sin_pd'>
+	<type>Floating Point</type>
+	<CPUID>SSE</CPUID>
+	<category>Trigonometry</category>
+	<parameter varname='a' type='__m128d'/>
+	<description>Compute the sine of packed double-precision (64-bit) floating-point elements in "a" expressed in radians, and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 1
+	i := j*64
+	dst[i+63:i] := SIN(a[i+63:i])
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='SVML' sequence='true' rettype='__m128' name='_mm_sin_ps'>
+	<type>Floating Point</type>
+	<CPUID>SSE</CPUID>
+	<category>Trigonometry</category>
+	<parameter varname='a' type='__m128'/>
+	<description>Compute the sine of packed single-precision (32-bit) floating-point elements in "a" expressed in radians, and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 3
+	i := j*32
+	dst[i+31:i] := SIN(a[i+31:i])
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='SVML' sequence='true' rettype='__m128d' name='_mm_sincos_pd'>
+	<type>Floating Point</type>
+	<CPUID>SSE</CPUID>
+	<category>Trigonometry</category>
+	<parameter varname='mem_addr' type='__m128d *'/>
+	<parameter varname='a' type='__m128d'/>
+	<description>Compute the sine and cosine of packed double-precision (64-bit) floating-point elements in "a" expressed in radians, store the sine in "dst", and store the cosine into memory at "mem_addr".</description>
+	<operation>
+FOR j := 0 to 1
+	i := j*64
+	dst[i+63:i] := SIN(a[i+63:i])
+	MEM[mem_addr+i+63:mem_addr+i] := COS(a[i+63:i])
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='SVML' sequence='true' rettype='__m128' name='_mm_sincos_ps'>
+	<type>Floating Point</type>
+	<CPUID>SSE</CPUID>
+	<category>Trigonometry</category>
+	<parameter varname='mem_addr' type='__m128 *'/>
+	<parameter varname='a' type='__m128'/>
+	<description>Compute the sine and cosine of packed single-precision (32-bit) floating-point elements in "a" expressed in radians, store the sine in "dst", and store the cosine into memory at "mem_addr".</description>
+	<operation>
+FOR j := 0 to 3
+	i := j*32
+	dst[i+31:i] := SIN(a[i+31:i])
+	MEM[mem_addr+i+31:mem_addr+i] := COS(a[i+31:i])
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='SVML' sequence='true' rettype='__m128d' name='_mm_sind_pd'>
+	<type>Floating Point</type>
+	<CPUID>SSE</CPUID>
+	<category>Trigonometry</category>
+	<parameter varname='a' type='__m128d'/>
+	<description>Compute the sine of packed double-precision (64-bit) floating-point elements in "a" expressed in degrees, and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 1
+	i := j*64
+	dst[i+63:i] := SIND(a[i+63:i])
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='SVML' sequence='true' rettype='__m128' name='_mm_sind_ps'>
+	<type>Floating Point</type>
+	<CPUID>SSE</CPUID>
+	<category>Trigonometry</category>
+	<parameter varname='a' type='__m128'/>
+	<description>Compute the sine of packed single-precision (32-bit) floating-point elements in "a" expressed in degrees, and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 3
+	i := j*32
+	dst[i+31:i] := SIND(a[i+31:i])
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='SVML' sequence='true' rettype='__m128d' name='_mm_sinh_pd'>
+	<type>Floating Point</type>
+	<CPUID>SSE</CPUID>
+	<category>Trigonometry</category>
+	<parameter varname='a' type='__m128d'/>
+	<description>Compute the hyperbolic sine of packed double-precision (64-bit) floating-point elements in "a" expressed in radians, and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 1
+	i := j*64
+	dst[i+63:i] := SINH(a[i+63:i])
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='SVML' sequence='true' rettype='__m128' name='_mm_sinh_ps'>
+	<type>Floating Point</type>
+	<CPUID>SSE</CPUID>
+	<category>Trigonometry</category>
+	<parameter varname='a' type='__m128'/>
+	<description>Compute the hyperbolic sine of packed single-precision (32-bit) floating-point elements in "a" expressed in radians, and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 3
+	i := j*32
+	dst[i+31:i] := SINH(a[i+31:i])
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='SVML' sequence='true' rettype='__m128d' name='_mm_svml_ceil_pd'>
+	<type>Floating Point</type>
+	<CPUID>SSE</CPUID>
+	<category>Special Math Functions</category>
+	<parameter varname='a' type='__m128d'/>
+	<description>Round the packed double-precision (64-bit) floating-point elements in "a" up to an integer value, and store the results as packed double-precision floating-point elements in "dst". This intrinsic may generate the "roundpd"/"vroundpd" instruction.</description>
+	<operation>
+FOR j := 0 to 1
+	i := j*64
+	dst[i+63:i] := CEIL(a[i+63:i])
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='SVML' sequence='true' rettype='__m128' name='_mm_svml_ceil_ps'>
+	<type>Floating Point</type>
+	<CPUID>SSE</CPUID>
+	<category>Special Math Functions</category>
+	<parameter varname='a' type='__m128'/>
+	<description>Round the packed single-precision (32-bit) floating-point elements in "a" up to an integer value, and store the results as packed single-precision floating-point elements in "dst". This intrinsic may generate the "roundps"/"vroundps" instruction.</description>
+	<operation>
+FOR j := 0 to 3
+	i := j*32
+	dst[i+31:i] := CEIL(a[i+31:i])
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='SVML' sequence='true' rettype='__m128d' name='_mm_svml_floor_pd'>
+	<type>Floating Point</type>
+	<CPUID>SSE</CPUID>
+	<category>Special Math Functions</category>
+	<parameter varname='a' type='__m128d'/>
+	<description>Round the packed double-precision (64-bit) floating-point elements in "a" down to an integer value, and store the results as packed double-precision floating-point elements in "dst". This intrinsic may generate the "roundpd"/"vroundpd" instruction.</description>
+	<operation>
+FOR j := 0 to 1
+	i := j*64
+	dst[i+63:i] := FLOOR(a[i+63:i])
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='SVML' sequence='true' rettype='__m128' name='_mm_svml_floor_ps'>
+	<type>Floating Point</type>
+	<CPUID>SSE</CPUID>
+	<category>Special Math Functions</category>
+	<parameter varname='a' type='__m128'/>
+	<description>Round the packed single-precision (32-bit) floating-point elements in "a" down to an integer value, and store the results as packed single-precision floating-point elements in "dst". This intrinsic may generate the "roundps"/"vroundps" instruction.</description>
+	<operation>
+FOR j := 0 to 3
+	i := j*32
+	dst[i+31:i] := FLOOR(a[i+31:i])
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='SVML' sequence='true' rettype='__m128d' name='_mm_svml_round_pd'>
+	<type>Floating Point</type>
+	<CPUID>SSE</CPUID>
+	<category>Special Math Functions</category>
+	<parameter varname='a' type='__m128d'/>
+	<description>Round the packed double-precision (64-bit) floating-point elements in "a" to the nearest integer value, and store the results as packed double-precision floating-point elements in "dst". This intrinsic may generate the "roundpd"/"vroundpd" instruction.</description>
+	<operation>
+FOR j := 0 to 1
+	i := j*64
+	dst[i+63:i] := ROUND(a[i+63:i])
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='SVML' sequence='true' rettype='__m128' name='_mm_svml_round_ps'>
+	<type>Floating Point</type>
+	<CPUID>SSE</CPUID>
+	<category>Special Math Functions</category>
+	<parameter varname='a' type='__m128'/>
+	<description>Round the packed single-precision (32-bit) floating-point elements in "a" to the nearest integer value, and store the results as packed single-precision floating-point elements in "dst". This intrinsic may generate the "roundps"/"vroundps" instruction.</description>
+	<operation>
+FOR j := 0 to 3
+	i := j*32
+	dst[i+31:i] := ROUND(a[i+31:i])
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='SVML' sequence='true' rettype='__m128d' name='_mm_svml_sqrt_pd'>
+	<type>Floating Point</type>
+	<CPUID>SSE</CPUID>
+	<category>Elementary Math Functions</category>
+	<parameter varname='a' type='__m128d'/>
+	<description>Compute the square root of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst". Note that this intrinsic is less efficient than "_mm_sqrt_pd".</description>
+	<operation>
+FOR j := 0 to 1
+	i := j*64
+	dst[i+63:i] := SQRT(a[i+63:i])
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='SVML' sequence='true' rettype='__m128' name='_mm_svml_sqrt_ps'>
+	<type>Floating Point</type>
+	<CPUID>SSE</CPUID>
+	<category>Elementary Math Functions</category>
+	<parameter varname='a' type='__m128'/>
+	<description>Compute the square root of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst". Note that this intrinsic is less efficient than "_mm_sqrt_ps".</description>
+	<operation>
+FOR j := 0 to 3
+	i := j*32
+	dst[i+31:i] := SQRT(a[i+31:i])
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='SVML' sequence='true' rettype='__m128d' name='_mm_tan_pd'>
+	<type>Floating Point</type>
+	<CPUID>SSE</CPUID>
+	<category>Trigonometry</category>
+	<parameter varname='a' type='__m128d'/>
+	<description>Compute the tangent of packed double-precision (64-bit) floating-point elements in "a" expressed in radians, and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 1
+	i := j*64
+	dst[i+63:i] := TAN(a[i+63:i])
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='SVML' sequence='true' rettype='__m128' name='_mm_tan_ps'>
+	<type>Floating Point</type>
+	<CPUID>SSE</CPUID>
+	<category>Trigonometry</category>
+	<parameter varname='a' type='__m128'/>
+	<description>Compute the tangent of packed single-precision (32-bit) floating-point elements in "a" expressed in radians, and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 3
+	i := j*32
+	dst[i+31:i] := TAN(a[i+31:i])
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='SVML' sequence='true' rettype='__m128d' name='_mm_tand_pd'>
+	<type>Floating Point</type>
+	<CPUID>SSE</CPUID>
+	<category>Trigonometry</category>
+	<parameter varname='a' type='__m128d'/>
+	<description>Compute the tangent of packed double-precision (64-bit) floating-point elements in "a" expressed in degrees, and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 1
+	i := j*64
+	dst[i+63:i] := TAND(a[i+63:i])
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='SVML' sequence='true' rettype='__m128' name='_mm_tand_ps'>
+	<type>Floating Point</type>
+	<CPUID>SSE</CPUID>
+	<category>Trigonometry</category>
+	<parameter varname='a' type='__m128'/>
+	<description>Compute the tangent of packed single-precision (32-bit) floating-point elements in "a" expressed in degrees, and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 3
+	i := j*32
+	dst[i+31:i] := TAND(a[i+31:i])
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='SVML' sequence='true' rettype='__m128d' name='_mm_tanh_pd'>
+	<type>Floating Point</type>
+	<CPUID>SSE</CPUID>
+	<category>Trigonometry</category>
+	<parameter varname='a' type='__m128d'/>
+	<description>Compute the hyperbolic tangent of packed double-precision (64-bit) floating-point elements in "a" expressed in radians, and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 1
+	i := j*64
+	dst[i+63:i] := TANH(a[i+63:i])
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='SVML' sequence='true' rettype='__m128' name='_mm_tanh_ps'>
+	<type>Floating Point</type>
+	<CPUID>SSE</CPUID>
+	<category>Trigonometry</category>
+	<parameter varname='a' type='__m128'/>
+	<description>Compute the hyperbolic tangent of packed single-precision (32-bit) floating-point elements in "a" expressed in radians, and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 3
+	i := j*32
+	dst[i+31:i] := TANH(a[i+31:i])
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='SVML' sequence='true' rettype='__m128d' name='_mm_trunc_pd'>
+	<type>Floating Point</type>
+	<CPUID>SSE</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname='a' type='__m128d'/>
+	<description>Truncate the packed double-precision (64-bit) floating-point elements in "a", and store the results as packed double-precision floating-point elements in "dst". This intrinsic may generate the "roundpd"/"vroundpd" instruction.</description>
+	<operation>
+FOR j := 0 to 1
+	i := j*64
+	dst[i+63:i] := TRUNCATE(a[i+63:i])
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='SVML' sequence='true' rettype='__m128' name='_mm_trunc_ps'>
+	<type>Floating Point</type>
+	<CPUID>SSE</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname='a' type='__m128'/>
+	<description>Truncate the packed single-precision (32-bit) floating-point elements in "a", and store the results as packed single-precision floating-point elements in "dst". This intrinsic may generate the "roundps"/"vroundps" instruction.</description>
+	<operation>
+FOR j := 0 to 3
+	i := j*32
+	dst[i+31:i] := TRUNCATE(a[i+31:i])
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='SVML' sequence='true' rettype='__m128i' name='_mm_udiv_epi32'>
+	<type>Integer</type>
+	<CPUID>SSE</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname='a' type='__m128i'/>
+	<parameter varname='b' type='__m128i'/>
+	<description>Divide packed unsigned 32-bit integers in "a" by packed elements in "b", and store the truncated results in "dst".</description>
+	<operation>
+FOR j := 0 to 3
+	i := 32*j
+	dst[i+31:i] := TRUNCATE(a[i+31:i] / b[i+31:i])
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='SVML' sequence='true' rettype='__m128i' name='_mm_udivrem_epi32'>
+	<type>Integer</type>
+	<CPUID>SSE</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname='mem_addr' type='__m128i *'/>
+	<parameter varname='a' type='__m128i'/>
+	<parameter varname='b' type='__m128i'/>
+	<description>Divide packed unsigned 32-bit integers in "a" by packed elements in "b", store the truncated results in "dst", and store the remainders as packed unsigned 32-bit integers into memory at "mem_addr".</description>
+	<operation>
+FOR j := 0 to 3
+	i := 32*j
+	dst[i+31:i] := TRUNCATE(a[i+31:i] / b[i+31:i])
+	MEM[mem_addr+i+31:mem_addr+i] := REMAINDER(a[i+31:i] / b[i+31:i])
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='SVML' sequence='true' rettype='__m128i' name='_mm_urem_epi32'>
+	<type>Integer</type>
+	<CPUID>SSE</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname='a' type='__m128i'/>
+	<parameter varname='b' type='__m128i'/>
+	<description>Divide packed unsigned 32-bit integers in "a" by packed elements in "b", and store the remainders as packed unsigned 32-bit integers in "dst".</description>
+	<operation>
+FOR j := 0 to 3
+	i := 32*j
+	dst[i+31:i] := REMAINDER(a[i+31:i] / b[i+31:i])
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='SVML' sequence='true' rettype='__m256d' name='_mm256_acos_pd'>
+	<type>Floating Point</type>
+	<CPUID>AVX</CPUID>
+	<category>Trigonometry</category>
+	<parameter varname='a' type='__m256d'/>
+	<description>Compute the inverse cosine of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 3
+	i := j*64
+	dst[i+63:i] := ACOS(a[i+63:i])
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='SVML' sequence='true' rettype='__m256' name='_mm256_acos_ps'>
+	<type>Floating Point</type>
+	<CPUID>AVX</CPUID>
+	<category>Trigonometry</category>
+	<parameter varname='a' type='__m256'/>
+	<description>Compute the inverse cosine of packed single-precision (32-bit) floating-point elements in "a" expressed in radians, and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*32
+	dst[i+31:i] := ACOS(a[i+31:i])
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='SVML' sequence='true' rettype='__m256d' name='_mm256_acosh_pd'>
+	<type>Floating Point</type>
+	<CPUID>AVX</CPUID>
+	<category>Trigonometry</category>
+	<parameter varname='a' type='__m256d'/>
+	<description>Compute the inverse hyperbolic cosine of packed double-precision (64-bit) floating-point elements in "a" expressed in radians, and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 3
+	i := j*64
+	dst[i+63:i] := ACOSH(a[i+63:i])
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='SVML' sequence='true' rettype='__m256' name='_mm256_acosh_ps'>
+	<type>Floating Point</type>
+	<CPUID>AVX</CPUID>
+	<category>Trigonometry</category>
+	<parameter varname='a' type='__m256'/>
+	<description>Compute the inverse hyperbolic cosine of packed single-precision (32-bit) floating-point elements in "a" expressed in radians, and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*32
+	dst[i+31:i] := ACOSH(a[i+31:i])
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='SVML' sequence='true' rettype='__m256d' name='_mm256_asin_pd'>
+	<type>Floating Point</type>
+	<CPUID>AVX</CPUID>
+	<category>Trigonometry</category>
+	<parameter varname='a' type='__m256d'/>
+	<description>Compute the inverse sine of packed double-precision (64-bit) floating-point elements in "a" expressed in radians, and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 3
+	i := j*64
+	dst[i+63:i] := ASIN(a[i+63:i])
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='SVML' sequence='true' rettype='__m256' name='_mm256_asin_ps'>
+	<type>Floating Point</type>
+	<CPUID>AVX</CPUID>
+	<category>Trigonometry</category>
+	<parameter varname='a' type='__m256'/>
+	<description>Compute the inverse sine of packed single-precision (32-bit) floating-point elements in "a" expressed in radians, and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*32
+	dst[i+31:i] := ASIN(a[i+31:i])
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='SVML' sequence='true' rettype='__m256d' name='_mm256_asinh_pd'>
+	<type>Floating Point</type>
+	<CPUID>AVX</CPUID>
+	<category>Trigonometry</category>
+	<parameter varname='a' type='__m256d'/>
+	<description>Compute the inverse hyperbolic sine of packed double-precision (64-bit) floating-point elements in "a" expressed in radians, and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 3
+	i := j*64
+	dst[i+63:i] := ASINH(a[i+63:i])
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='SVML' sequence='true' rettype='__m256' name='_mm256_asinh_ps'>
+	<type>Floating Point</type>
+	<CPUID>AVX</CPUID>
+	<category>Trigonometry</category>
+	<parameter varname='a' type='__m256'/>
+	<description>Compute the inverse hyperbolic sine of packed single-precision (32-bit) floating-point elements in "a" expressed in radians, and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*32
+	dst[i+31:i] := ASINH(a[i+31:i])
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='SVML' sequence='true' rettype='__m256d' name='_mm256_atan_pd'>
+	<type>Floating Point</type>
+	<CPUID>AVX</CPUID>
+	<category>Trigonometry</category>
+	<parameter varname='a' type='__m256d'/>
+	<description>Compute the inverse tangent of packed double-precision (64-bit) floating-point elements in "a" expressed in radians, and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 3
+	i := j*64
+	dst[i+63:i] := ATAN(a[i+63:i])
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='SVML' sequence='true' rettype='__m256' name='_mm256_atan_ps'>
+	<type>Floating Point</type>
+	<CPUID>AVX</CPUID>
+	<category>Trigonometry</category>
+	<parameter varname='a' type='__m256'/>
+	<description>Compute the inverse tangent of packed single-precision (32-bit) floating-point elements in "a" expressed in radians, and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*32
+	dst[i+31:i] := ATAN(a[i+31:i])
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='SVML' sequence='true' rettype='__m256d' name='_mm256_atan2_pd'>
+	<type>Floating Point</type>
+	<CPUID>AVX</CPUID>
+	<category>Trigonometry</category>
+	<parameter varname='a' type='__m256d'/>
+	<parameter varname='b' type='__m256d'/>
+	<description>Compute the inverse tangent of packed double-precision (64-bit) floating-point elements in "a" divided by packed elements in "b", and store the results in "dst" expressed in radians.</description>
+	<operation>
+FOR j := 0 to 3
+	i := j*64
+	dst[i+63:i] := ATAN(a[i+63:i] / b[i+63:i])
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='SVML' sequence='true' rettype='__m256' name='_mm256_atan2_ps'>
+	<type>Floating Point</type>
+	<CPUID>AVX</CPUID>
+	<category>Trigonometry</category>
+	<parameter varname='a' type='__m256'/>
+	<parameter varname='b' type='__m256'/>
+	<description>Compute the inverse tangent of packed single-precision (32-bit) floating-point elements in "a" divided by packed elements in "b", and store the results in "dst" expressed in radians.</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*32
+	dst[i+31:i] := ATAN(a[i+31:i] / b[i+31:i])
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='SVML' sequence='true' rettype='__m256d' name='_mm256_atanh_pd'>
+	<type>Floating Point</type>
+	<CPUID>AVX</CPUID>
+	<category>Trigonometry</category>
+	<parameter varname='a' type='__m256d'/>
+	<description>Compute the inverse hyperbolic tangent of packed double-precision (64-bit) floating-point elements in "a" expressed in radians, and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 3
+	i := j*64
+	dst[i+63:i] := ATANH(a[i+63:i])
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='SVML' sequence='true' rettype='__m256' name='_mm256_atanh_ps'>
+	<type>Floating Point</type>
+	<CPUID>AVX</CPUID>
+	<category>Trigonometry</category>
+	<parameter varname='a' type='__m256'/>
+	<description>Compute the inverse hyperbolic tangent of packed single-precision (32-bit) floating-point elements in "a" expressed in radians, and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*32
+	dst[i+31:i] := ATANH(a[i+31:i])
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='SVML' sequence='true' rettype='__m256d' name='_mm256_cbrt_pd'>
+	<type>Floating Point</type>
+	<CPUID>AVX</CPUID>
+	<category>Elementary Math Functions</category>
+	<parameter varname='a' type='__m256d'/>
+	<description>Compute the cube root of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 3
+	i := j*64
+	dst[i+63:i] := CubeRoot(a[i+63:i])
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='SVML' sequence='true' rettype='__m256' name='_mm256_cbrt_ps'>
+	<type>Floating Point</type>
+	<CPUID>AVX</CPUID>
+	<category>Elementary Math Functions</category>
+	<parameter varname='a' type='__m256'/>
+	<description>Compute the cube root of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*32
+	dst[i+31:i] := CubeRoot(a[i+31:i])
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='SVML' sequence='true' rettype='__m256d' name='_mm256_cdfnorm_pd'>
+	<type>Floating Point</type>
+	<CPUID>AVX</CPUID>
+	<category>Probability/Statistics</category>
+	<parameter varname='a' type='__m256d'/>
+	<description>Compute the cumulative distribution function of packed double-precision (64-bit) floating-point elements in "a" using the normal distribution, and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 3
+	i := j*64
+	dst[i+63:i] := CDFNormal(a[i+63:i])
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='SVML' sequence='true' rettype='__m256' name='_mm256_cdfnorm_ps'>
+	<type>Floating Point</type>
+	<CPUID>AVX</CPUID>
+	<category>Probability/Statistics</category>
+	<parameter varname='a' type='__m256'/>
+	<description>Compute the cumulative distribution function of packed single-precision (32-bit) floating-point elements in "a" using the normal distribution, and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*32
+	dst[i+31:i] := CDFNormal(a[i+31:i])
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='SVML' sequence='true' rettype='__m256d' name='_mm256_cdfnorminv_pd'>
+	<type>Floating Point</type>
+	<CPUID>AVX</CPUID>
+	<category>Probability/Statistics</category>
+	<parameter varname='a' type='__m256d'/>
+	<description>Compute the inverse cumulative distribution function of packed double-precision (64-bit) floating-point elements in "a" using the normal distribution, and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 3
+	i := j*64
+	dst[i+63:i] := InverseCDFNormal(a[i+63:i])
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='SVML' sequence='true' rettype='__m256' name='_mm256_cdfnorminv_ps'>
+	<type>Floating Point</type>
+	<CPUID>AVX</CPUID>
+	<category>Probability/Statistics</category>
+	<parameter varname='a' type='__m256'/>
+	<description>Compute the inverse cumulative distribution function of packed single-precision (32-bit) floating-point elements in "a" using the normal distribution, and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*32
+	dst[i+31:i] := InverseCDFNormal(a[i+31:i])
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='SVML' sequence='true' rettype='__m256' name='_mm256_cexp_ps'>
+	<type>Floating Point</type>
+	<CPUID>AVX</CPUID>
+	<category>Elementary Math Functions</category>
+	<parameter varname='a' type='__m256'/>
+	<description>Compute the exponential value of "e" raised to the power of packed complex single-precision (32-bit) floating-point elements in "a", and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*32
+	dst[i+31:i] := e^(a[i+31:i])
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='SVML' sequence='true' rettype='__m256' name='_mm256_clog_ps'>
+	<type>Floating Point</type>
+	<CPUID>AVX</CPUID>
+	<category>Elementary Math Functions</category>
+	<parameter varname='a' type='__m256'/>
+	<description>Compute the natural logarithm of packed complex single-precision (32-bit) floating-point elements in "a", and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*32
+	dst[i+31:i] := ln(a[i+31:i])
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='SVML' sequence='true' rettype='__m256d' name='_mm256_cos_pd'>
+	<type>Floating Point</type>
+	<CPUID>AVX</CPUID>
+	<category>Trigonometry</category>
+	<parameter varname='a' type='__m256d'/>
+	<description>Compute the cosine of packed double-precision (64-bit) floating-point elements in "a" expressed in radians, and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 3
+	i := j*64
+	dst[i+63:i] := COS(a[i+63:i])
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='SVML' sequence='true' rettype='__m256' name='_mm256_cos_ps'>
+	<type>Floating Point</type>
+	<CPUID>AVX</CPUID>
+	<category>Trigonometry</category>
+	<parameter varname='a' type='__m256'/>
+	<description>Compute the cosine of packed single-precision (32-bit) floating-point elements in "a" expressed in radians, and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*32
+	dst[i+31:i] := COS(a[i+31:i])
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='SVML' sequence='true' rettype='__m256d' name='_mm256_cosd_pd'>
+	<type>Floating Point</type>
+	<CPUID>AVX</CPUID>
+	<category>Trigonometry</category>
+	<parameter varname='a' type='__m256d'/>
+	<description>Compute the cosine of packed double-precision (64-bit) floating-point elements in "a" expressed in degrees, and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 3
+	i := j*64
+	dst[i+63:i] := COSD(a[i+63:i])
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='SVML' sequence='true' rettype='__m256' name='_mm256_cosd_ps'>
+	<type>Floating Point</type>
+	<CPUID>AVX</CPUID>
+	<category>Trigonometry</category>
+	<parameter varname='a' type='__m256'/>
+	<description>Compute the cosine of packed single-precision (32-bit) floating-point elements in "a" expressed in degrees, and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*32
+	dst[i+31:i] := COSD(a[i+31:i])
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='SVML' sequence='true' rettype='__m256d' name='_mm256_cosh_pd'>
+	<type>Floating Point</type>
+	<CPUID>AVX</CPUID>
+	<category>Trigonometry</category>
+	<parameter varname='a' type='__m256d'/>
+	<description>Compute the hyperbolic cosine of packed double-precision (64-bit) floating-point elements in "a" expressed in radians, and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 3
+	i := j*64
+	dst[i+63:i] := COSH(a[i+63:i])
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='SVML' sequence='true' rettype='__m256' name='_mm256_cosh_ps'>
+	<type>Floating Point</type>
+	<CPUID>AVX</CPUID>
+	<category>Trigonometry</category>
+	<parameter varname='a' type='__m256'/>
+	<description>Compute the hyperbolic cosine of packed single-precision (32-bit) floating-point elements in "a" expressed in radians, and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*32
+	dst[i+31:i] := COSH(a[i+31:i])
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='SVML' sequence='true' rettype='__m256' name='_mm256_csqrt_ps'>
+	<type>Floating Point</type>
+	<CPUID>AVX</CPUID>
+	<category>Elementary Math Functions</category>
+	<parameter varname='a' type='__m256'/>
+	<description>Compute the square root of packed complex single-precision (32-bit) floating-point elements in "a", and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*32
+	dst[i+31:i] := SQRT(a[i+31:i])
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='SVML' sequence='true' rettype='__m256i' name='_mm256_div_epi8'>
+	<type>Integer</type>
+	<CPUID>AVX</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname='a' type='__m256i'/>
+	<parameter varname='b' type='__m256i'/>
+	<description>Divide packed 8-bit integers in "a" by packed elements in "b", and store the truncated results in "dst".</description>
+	<operation>
+FOR j := 0 to 31
+	i := 8*j
+	dst[i+7:i] := TRUNCATE(a[i+7:i] / b[i+7:i])
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='SVML' sequence='true' rettype='__m256i' name='_mm256_div_epi16'>
+	<type>Integer</type>
+	<CPUID>AVX</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname='a' type='__m256i'/>
+	<parameter varname='b' type='__m256i'/>
+	<description>Divide packed 16-bit integers in "a" by packed elements in "b", and store the truncated results in "dst".</description>
+	<operation>
+FOR j := 0 to 15
+	i := 16*j
+	dst[i+15:i] := TRUNCATE(a[i+15:i] / b[i+15:i])
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='SVML' sequence='true' rettype='__m256i' name='_mm256_div_epi32'>
+	<type>Integer</type>
+	<CPUID>AVX</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname='a' type='__m256i'/>
+	<parameter varname='b' type='__m256i'/>
+	<description>Divide packed 32-bit integers in "a" by packed elements in "b", and store the truncated results in "dst".</description>
+	<operation>
+FOR j := 0 to 7
+	i := 32*j
+	dst[i+31:i] := TRUNCATE(a[i+31:i] / b[i+31:i])
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='SVML' sequence='true' rettype='__m256i' name='_mm256_div_epi64'>
+	<type>Integer</type>
+	<CPUID>AVX</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname='a' type='__m256i'/>
+	<parameter varname='b' type='__m256i'/>
+	<description>Divide packed 64-bit integers in "a" by packed elements in "b", and store the truncated results in "dst".</description>
+	<operation>
+FOR j := 0 to 3
+	i := 64*j
+	dst[i+63:i] := TRUNCATE(a[i+63:i] / b[i+63:i])
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='SVML' sequence='true' rettype='__m256i' name='_mm256_div_epu8'>
+	<type>Integer</type>
+	<CPUID>AVX</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname='a' type='__m256i'/>
+	<parameter varname='b' type='__m256i'/>
+	<description>Divide packed unsigned 8-bit integers in "a" by packed elements in "b", and store the truncated results in "dst".</description>
+	<operation>
+FOR j := 0 to 31
+	i := 8*j
+	dst[i+7:i] := TRUNCATE(a[i+7:i] / b[i+7:i])
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='SVML' sequence='true' rettype='__m256i' name='_mm256_div_epu16'>
+	<type>Integer</type>
+	<CPUID>AVX</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname='a' type='__m256i'/>
+	<parameter varname='b' type='__m256i'/>
+	<description>Divide packed unsigned 16-bit integers in "a" by packed elements in "b", and store the truncated results in "dst".</description>
+	<operation>
+FOR j := 0 to 15
+	i := 16*j
+	dst[i+15:i] := TRUNCATE(a[i+15:i] / b[i+15:i])
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='SVML' sequence='true' rettype='__m256i' name='_mm256_div_epu32'>
+	<type>Integer</type>
+	<CPUID>AVX</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname='a' type='__m256i'/>
+	<parameter varname='b' type='__m256i'/>
+	<description>Divide packed unsigned 32-bit integers in "a" by packed elements in "b", and store the truncated results in "dst".</description>
+	<operation>
+FOR j := 0 to 7
+	i := 32*j
+	dst[i+31:i] := TRUNCATE(a[i+31:i] / b[i+31:i])
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='SVML' sequence='true' rettype='__m256i' name='_mm256_div_epu64'>
+	<type>Integer</type>
+	<CPUID>AVX</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname='a' type='__m256i'/>
+	<parameter varname='b' type='__m256i'/>
+	<description>Divide packed unsigned 64-bit integers in "a" by packed elements in "b", and store the truncated results in "dst".</description>
+	<operation>
+FOR j := 0 to 3
+	i := 64*j
+	dst[i+63:i] := TRUNCATE(a[i+63:i] / b[i+63:i])
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='SVML' sequence='true' rettype='__m256d' name='_mm256_erf_pd'>
+	<type>Floating Point</type>
+	<CPUID>AVX</CPUID>
+	<category>Probability/Statistics</category>
+	<parameter varname='a' type='__m256d'/>
+	<description>Compute the error function of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 3
+	i := j*64
+	dst[i+63:i] := ERF(a[i+63:i])
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='SVML' sequence='true' rettype='__m256' name='_mm256_erf_ps'>
+	<type>Floating Point</type>
+	<CPUID>AVX</CPUID>
+	<category>Probability/Statistics</category>
+	<parameter varname='a' type='__m256'/>
+	<description>Compute the error function of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*32
+	dst[i+31:i] := ERF(a[i+31:i])
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='SVML' sequence='true' rettype='__m256d' name='_mm256_erfc_pd'>
+	<type>Floating Point</type>
+	<CPUID>AVX</CPUID>
+	<category>Probability/Statistics</category>
+	<parameter varname='a' type='__m256d'/>
+	<description>Compute the complementary error function of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 3
+	i := j*64
+	dst[i+63:i] := 1.0 - ERF(a[i+63:i])
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='SVML' sequence='true' rettype='__m256' name='_mm256_erfc_ps'>
+	<type>Floating Point</type>
+	<CPUID>AVX</CPUID>
+	<category>Probability/Statistics</category>
+	<parameter varname='a' type='__m256'/>
+	<description>Compute the complementary error function of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*32
+	dst[i+31:i] := 1.0 - ERF(a[i+31:i])
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='SVML' sequence='true' rettype='__m256d' name='_mm256_erfcinv_pd'>
+	<type>Floating Point</type>
+	<CPUID>AVX</CPUID>
+	<category>Probability/Statistics</category>
+	<parameter varname='a' type='__m256d'/>
+	<description>Compute the inverse complementary error function of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 3
+	i := j*64
+	dst[i+63:i] := 1.0 / (1.0 - ERF(a[i+63:i]))
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='SVML' sequence='true' rettype='__m256' name='_mm256_erfcinv_ps'>
+	<type>Floating Point</type>
+	<CPUID>AVX</CPUID>
+	<category>Probability/Statistics</category>
+	<parameter varname='a' type='__m256'/>
+	<description>Compute the inverse complementary error function of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*32
+	dst[i+31:i] := 1.0 / (1.0 - ERF(a[i+31:i]))
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='SVML' sequence='true' rettype='__m256d' name='_mm256_erfinv_pd'>
+	<type>Floating Point</type>
+	<CPUID>AVX</CPUID>
+	<category>Probability/Statistics</category>
+	<parameter varname='a' type='__m256d'/>
+	<description>Compute the inverse error function of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 3
+	i := j*64
+	dst[i+63:i] := 1.0 / ERF(a[i+63:i])
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='SVML' sequence='true' rettype='__m256' name='_mm256_erfinv_ps'>
+	<type>Floating Point</type>
+	<CPUID>AVX</CPUID>
+	<category>Probability/Statistics</category>
+	<parameter varname='a' type='__m256'/>
+	<description>Compute the inverse error function of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*32
+	dst[i+31:i] := 1.0 / ERF(a[i+31:i])
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='SVML' sequence='true' rettype='__m256d' name='_mm256_exp_pd'>
+	<type>Floating Point</type>
+	<CPUID>AVX</CPUID>
+	<category>Elementary Math Functions</category>
+	<parameter varname='a' type='__m256d'/>
+	<description>Compute the exponential value of "e" raised to the power of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 3
+	i := j*64
+	dst[i+63:i] := e^(a[i+63:i])
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='SVML' sequence='true' rettype='__m256' name='_mm256_exp_ps'>
+	<type>Floating Point</type>
+	<CPUID>AVX</CPUID>
+	<category>Elementary Math Functions</category>
+	<parameter varname='a' type='__m256'/>
+	<description>Compute the exponential value of "e" raised to the power of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*32
+	dst[i+31:i] := e^(a[i+31:i])
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='SVML' sequence='true' rettype='__m256d' name='_mm256_exp10_pd'>
+	<type>Floating Point</type>
+	<CPUID>AVX</CPUID>
+	<category>Elementary Math Functions</category>
+	<parameter varname='a' type='__m256d'/>
+	<description>Compute the exponential value of 10 raised to the power of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 3
+	i := j*64
+	dst[i+63:i] := 10^(a[i+63:i])
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='SVML' sequence='true' rettype='__m256' name='_mm256_exp10_ps'>
+	<type>Floating Point</type>
+	<CPUID>AVX</CPUID>
+	<category>Elementary Math Functions</category>
+	<parameter varname='a' type='__m256'/>
+	<description>Compute the exponential value of 10 raised to the power of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*32
+	dst[i+31:i] := 10^(a[i+31:i])
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='SVML' sequence='true' rettype='__m256d' name='_mm256_exp2_pd'>
+	<type>Floating Point</type>
+	<CPUID>AVX</CPUID>
+	<category>Elementary Math Functions</category>
+	<parameter varname='a' type='__m256d'/>
+	<description>Compute the exponential value of 2 raised to the power of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 3
+	i := j*64
+	dst[i+63:i] := 2^(a[i+63:i])
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='SVML' sequence='true' rettype='__m256' name='_mm256_exp2_ps'>
+	<type>Floating Point</type>
+	<CPUID>AVX</CPUID>
+	<category>Elementary Math Functions</category>
+	<parameter varname='a' type='__m256'/>
+	<description>Compute the exponential value of 2 raised to the power of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*32
+	dst[i+31:i] := 2^(a[i+31:i])
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='SVML' sequence='true' rettype='__m256d' name='_mm256_expm1_pd'>
+	<type>Floating Point</type>
+	<CPUID>AVX</CPUID>
+	<category>Elementary Math Functions</category>
+	<parameter varname='a' type='__m256d'/>
+	<description>Compute the exponential value of "e" raised to the power of packed double-precision (64-bit) floating-point elements in "a", subtract one from each element, and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 3
+	i := j*64
+	dst[i+63:i] := e^(a[i+63:i]) - 1.0
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='SVML' sequence='true' rettype='__m256' name='_mm256_expm1_ps'>
+	<type>Floating Point</type>
+	<CPUID>AVX</CPUID>
+	<category>Elementary Math Functions</category>
+	<parameter varname='a' type='__m256'/>
+	<description>Compute the exponential value of "e" raised to the power of packed single-precision (32-bit) floating-point elements in "a", subtract one from each element, and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*32
+	dst[i+31:i] := e^(a[i+31:i]) - 1.0
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='SVML' sequence='true' rettype='__m256d' name='_mm256_hypot_pd'>
+	<type>Floating Point</type>
+	<CPUID>AVX</CPUID>
+	<category>Trigonometry</category>
+	<parameter varname='a' type='__m256d'/>
+	<parameter varname='b' type='__m256d'/>
+	<description>Compute the length of the hypotenuse of a right triangle, with the lengths of the other two sides of the triangle stored as packed double-precision (64-bit) floating-point elements in "a" and "b", and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 3
+	i := j*64
+	dst[i+63:i] := SQRT(a[i+63:i]^2 + b[i+63:i]^2)
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='SVML' sequence='true' rettype='__m256' name='_mm256_hypot_ps'>
+	<type>Floating Point</type>
+	<CPUID>AVX</CPUID>
+	<category>Trigonometry</category>
+	<parameter varname='a' type='__m256'/>
+	<parameter varname='b' type='__m256'/>
+	<description>Compute the length of the hypotenuse of a right triangle, with the lengths of the other two sides of the triangle stored as packed single-precision (32-bit) floating-point elements in "a" and "b", and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*32
+	dst[i+31:i] := SQRT(a[i+31:i]^2 + b[i+31:i]^2)
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='SVML' sequence='true' rettype='__m256i' name='_mm256_idiv_epi32'>
+	<type>Integer</type>
+	<CPUID>AVX</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname='a' type='__m256i'/>
+	<parameter varname='b' type='__m256i'/>
+	<description>Divide packed 32-bit integers in "a" by packed elements in "b", and store the truncated results in "dst".</description>
+	<operation>
+FOR j := 0 to 7
+	i := 32*j
+	dst[i+31:i] := TRUNCATE(a[i+31:i] / b[i+31:i])
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='SVML' sequence='true' rettype='__m256i' name='_mm256_idivrem_epi32'>
+	<type>Integer</type>
+	<CPUID>AVX</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname='mem_addr' type='__m256i *'/>
+	<parameter varname='a' type='__m256i'/>
+	<parameter varname='b' type='__m256i'/>
+	<description>Divide packed 32-bit integers in "a" by packed elements in "b", store the truncated results in "dst", and store the remainders as packed 32-bit integers into memory at "mem_addr".</description>
+	<operation>
+FOR j := 0 to 7
+	i := 32*j
+	dst[i+31:i] := TRUNCATE(a[i+31:i] / b[i+31:i])
+	MEM[mem_addr+i+31:mem_addr+i] := REMAINDER(a[i+31:i] / b[i+31:i])
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='SVML' sequence='true' rettype='__m256d' name='_mm256_invcbrt_pd'>
+	<type>Floating Point</type>
+	<CPUID>AVX</CPUID>
+	<category>Elementary Math Functions</category>
+	<parameter varname='a' type='__m256d'/>
+	<description>Compute the inverse cube root of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 3
+	i := j*64
+	dst[i+63:i] := InvCubeRoot(a[i+63:i])
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='SVML' sequence='true' rettype='__m256' name='_mm256_invcbrt_ps'>
+	<type>Floating Point</type>
+	<CPUID>AVX</CPUID>
+	<category>Elementary Math Functions</category>
+	<parameter varname='a' type='__m256'/>
+	<description>Compute the inverse cube root of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*32
+	dst[i+31:i] := InvCubeRoot(a[i+31:i])
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='SVML' sequence='true' rettype='__m256d' name='_mm256_invsqrt_pd'>
+	<type>Floating Point</type>
+	<CPUID>AVX</CPUID>
+	<category>Elementary Math Functions</category>
+	<parameter varname='a' type='__m256d'/>
+	<description>Compute the inverse square root of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 3
+	i := j*64
+	dst[i+63:i] := InvSQRT(a[i+63:i])
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='SVML' sequence='true' rettype='__m256' name='_mm256_invsqrt_ps'>
+	<type>Floating Point</type>
+	<CPUID>AVX</CPUID>
+	<category>Elementary Math Functions</category>
+	<parameter varname='a' type='__m256'/>
+	<description>Compute the inverse square root of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*32
+	dst[i+31:i] := InvSQRT(a[i+31:i])
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='SVML' sequence='true' rettype='__m256i' name='_mm256_irem_epi32'>
+	<type>Integer</type>
+	<CPUID>AVX</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname='a' type='__m256i'/>
+	<parameter varname='b' type='__m256i'/>
+	<description>Divide packed 32-bit integers in "a" by packed elements in "b", and store the remainders as packed 32-bit integers in "dst".</description>
+	<operation>
+FOR j := 0 to 7
+	i := 32*j
+	dst[i+31:i] := REMAINDER(a[i+31:i] / b[i+31:i])
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='SVML' sequence='true' rettype='__m256d' name='_mm256_log_pd'>
+	<type>Floating Point</type>
+	<CPUID>AVX</CPUID>
+	<category>Elementary Math Functions</category>
+	<parameter varname='a' type='__m256d'/>
+	<description>Compute the natural logarithm of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 3
+	i := j*64
+	dst[i+63:i] := ln(a[i+63:i])
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='SVML' sequence='true' rettype='__m256' name='_mm256_log_ps'>
+	<type>Floating Point</type>
+	<CPUID>AVX</CPUID>
+	<category>Elementary Math Functions</category>
+	<parameter varname='a' type='__m256'/>
+	<description>Compute the natural logarithm of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*32
+	dst[i+31:i] := ln(a[i+31:i])
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='SVML' sequence='true' rettype='__m256d' name='_mm256_log10_pd'>
+	<type>Floating Point</type>
+	<CPUID>AVX</CPUID>
+	<category>Elementary Math Functions</category>
+	<parameter varname='a' type='__m256d'/>
+	<description>Compute the base-10 logarithm of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 3
+	i := j*64
+	dst[i+63:i] := log10(a[i+63:i])
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='SVML' sequence='true' rettype='__m256' name='_mm256_log10_ps'>
+	<type>Floating Point</type>
+	<CPUID>AVX</CPUID>
+	<category>Elementary Math Functions</category>
+	<parameter varname='a' type='__m256'/>
+	<description>Compute the base-10 logarithm of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*32
+	dst[i+31:i] := log10(a[i+31:i])
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='SVML' sequence='true' rettype='__m256d' name='_mm256_log1p_pd'>
+	<type>Floating Point</type>
+	<CPUID>AVX</CPUID>
+	<category>Elementary Math Functions</category>
+	<parameter varname='a' type='__m256d'/>
+	<description>Compute the natural logarithm of one plus packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 3
+	i := j*64
+	dst[i+63:i] := ln(1.0 + a[i+63:i])
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='SVML' sequence='true' rettype='__m256' name='_mm256_log1p_ps'>
+	<type>Floating Point</type>
+	<CPUID>AVX</CPUID>
+	<category>Elementary Math Functions</category>
+	<parameter varname='a' type='__m256'/>
+	<description>Compute the natural logarithm of one plus packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*32
+	dst[i+31:i] := ln(1.0 + a[i+31:i])
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='SVML' sequence='true' rettype='__m256d' name='_mm256_log2_pd'>
+	<type>Floating Point</type>
+	<CPUID>AVX</CPUID>
+	<category>Elementary Math Functions</category>
+	<parameter varname='a' type='__m256d'/>
+	<description>Compute the base-2 logarithm of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 3
+	i := j*64
+	dst[i+63:i] := log2(a[i+63:i])
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='SVML' sequence='true' rettype='__m256' name='_mm256_log2_ps'>
+	<type>Floating Point</type>
+	<CPUID>AVX</CPUID>
+	<category>Elementary Math Functions</category>
+	<parameter varname='a' type='__m256'/>
+	<description>Compute the base-2 logarithm of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*32
+	dst[i+31:i] := log2(a[i+31:i])
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='SVML' sequence='true' rettype='__m256d' name='_mm256_logb_pd'>
+	<type>Floating Point</type>
+	<CPUID>AVX</CPUID>
+	<category>Elementary Math Functions</category>
+	<parameter varname='a' type='__m256d'/>
+	<description>Convert the exponent of each packed double-precision (64-bit) floating-point element in "a" to a double-precision floating-point number representing the integer exponent, and store the results in "dst". This intrinsic essentially calculates "floor(log2(x))" for each element.</description>
+	<operation>
+FOR j := 0 to 3
+	i := j*64
+	dst[i+63:i] := ConvertExpFP64(a[i+63:i])
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='SVML' sequence='true' rettype='__m256' name='_mm256_logb_ps'>
+	<type>Floating Point</type>
+	<CPUID>AVX</CPUID>
+	<category>Elementary Math Functions</category>
+	<parameter varname='a' type='__m256'/>
+	<description>Convert the exponent of each packed single-precision (32-bit) floating-point element in "a" to a single-precision floating-point number representing the integer exponent, and store the results in "dst". This intrinsic essentially calculates "floor(log2(x))" for each element.</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*32
+	dst[i+31:i] := ConvertExpFP32(a[i+31:i])
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='SVML' sequence='true' rettype='__m256d' name='_mm256_pow_pd'>
+	<type>Floating Point</type>
+	<CPUID>AVX</CPUID>
+	<category>Elementary Math Functions</category>
+	<parameter varname='a' type='__m256d'/>
+	<parameter varname='b' type='__m256d'/>
+	<description>Compute the exponential value of packed double-precision (64-bit) floating-point elements in "a" raised to the power of packed elements in "b", and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 3
+	i := j*64
+	dst[i+63:i] := (a[i+63:i])^(b[i+63:i])
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='SVML' sequence='true' rettype='__m256' name='_mm256_pow_ps'>
+	<type>Floating Point</type>
+	<CPUID>AVX</CPUID>
+	<category>Elementary Math Functions</category>
+	<parameter varname='a' type='__m256'/>
+	<parameter varname='b' type='__m256'/>
+	<description>Compute the exponential value of packed single-precision (32-bit) floating-point elements in "a" raised to the power of packed elements in "b", and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*32
+	dst[i+31:i] := (a[i+31:i])^(b[i+31:i])
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='SVML' sequence='true' rettype='__m256i' name='_mm256_rem_epi8'>
+	<type>Integer</type>
+	<CPUID>AVX</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname='a' type='__m256i'/>
+	<parameter varname='b' type='__m256i'/>
+	<description>Divide packed 8-bit integers in "a" by packed elements in "b", and store the remainders as packed 8-bit integers in "dst".</description>
+	<operation>
+FOR j := 0 to 31
+	i := 8*j
+	dst[i+7:i] := REMAINDER(a[i+7:i] / b[i+7:i])
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='SVML' sequence='true' rettype='__m256i' name='_mm256_rem_epi16'>
+	<type>Integer</type>
+	<CPUID>AVX</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname='a' type='__m256i'/>
+	<parameter varname='b' type='__m256i'/>
+	<description>Divide packed 16-bit integers in "a" by packed elements in "b", and store the remainders as packed 16-bit integers in "dst".</description>
+	<operation>
+FOR j := 0 to 15
+	i := 16*j
+	dst[i+15:i] := REMAINDER(a[i+15:i] / b[i+15:i])
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='SVML' sequence='true' rettype='__m256i' name='_mm256_rem_epi32'>
+	<type>Integer</type>
+	<CPUID>AVX</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname='a' type='__m256i'/>
+	<parameter varname='b' type='__m256i'/>
+	<description>Divide packed 32-bit integers in "a" by packed elements in "b", and store the remainders as packed 32-bit integers in "dst".</description>
+	<operation>
+FOR j := 0 to 7
+	i := 32*j
+	dst[i+31:i] := REMAINDER(a[i+31:i] / b[i+31:i])
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='SVML' sequence='true' rettype='__m256i' name='_mm256_rem_epi64'>
+	<type>Integer</type>
+	<CPUID>AVX</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname='a' type='__m256i'/>
+	<parameter varname='b' type='__m256i'/>
+	<description>Divide packed 64-bit integers in "a" by packed elements in "b", and store the remainders as packed 64-bit integers in "dst".</description>
+	<operation>
+FOR j := 0 to 3
+	i := 64*j
+	dst[i+63:i] := REMAINDER(a[i+63:i] / b[i+63:i])
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='SVML' sequence='true' rettype='__m256i' name='_mm256_rem_epu8'>
+	<type>Integer</type>
+	<CPUID>AVX</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname='a' type='__m256i'/>
+	<parameter varname='b' type='__m256i'/>
+	<description>Divide packed unsigned 8-bit integers in "a" by packed elements in "b", and store the remainders as packed unsigned 8-bit integers in "dst".</description>
+	<operation>
+FOR j := 0 to 31
+	i := 8*j
+	dst[i+7:i] := REMAINDER(a[i+7:i] / b[i+7:i])
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='SVML' sequence='true' rettype='__m256i' name='_mm256_rem_epu16'>
+	<type>Integer</type>
+	<CPUID>AVX</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname='a' type='__m256i'/>
+	<parameter varname='b' type='__m256i'/>
+	<description>Divide packed unsigned 16-bit integers in "a" by packed elements in "b", and store the remainders as packed unsigned 16-bit integers in "dst".</description>
+	<operation>
+FOR j := 0 to 15
+	i := 16*j
+	dst[i+15:i] := REMAINDER(a[i+15:i] / b[i+15:i])
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='SVML' sequence='true' rettype='__m256i' name='_mm256_rem_epu32'>
+	<type>Integer</type>
+	<CPUID>AVX</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname='a' type='__m256i'/>
+	<parameter varname='b' type='__m256i'/>
+	<description>Divide packed unsigned 32-bit integers in "a" by packed elements in "b", and store the remainders as packed unsigned 32-bit integers in "dst".</description>
+	<operation>
+FOR j := 0 to 7
+	i := 32*j
+	dst[i+31:i] := REMAINDER(a[i+31:i] / b[i+31:i])
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='SVML' sequence='true' rettype='__m256i' name='_mm256_rem_epu64'>
+	<type>Integer</type>
+	<CPUID>AVX</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname='a' type='__m256i'/>
+	<parameter varname='b' type='__m256i'/>
+	<description>Divide packed unsigned 64-bit integers in "a" by packed elements in "b", and store the remainders as packed unsigned 64-bit integers in "dst".</description>
+	<operation>
+FOR j := 0 to 3
+	i := 64*j
+	dst[i+63:i] := REMAINDER(a[i+63:i] / b[i+63:i])
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='SVML' sequence='true' rettype='__m256d' name='_mm256_sin_pd'>
+	<type>Floating Point</type>
+	<CPUID>AVX</CPUID>
+	<category>Trigonometry</category>
+	<parameter varname='a' type='__m256d'/>
+	<description>Compute the sine of packed double-precision (64-bit) floating-point elements in "a" expressed in radians, and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 3
+	i := j*64
+	dst[i+63:i] := SIN(a[i+63:i])
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='SVML' sequence='true' rettype='__m256' name='_mm256_sin_ps'>
+	<type>Floating Point</type>
+	<CPUID>AVX</CPUID>
+	<category>Trigonometry</category>
+	<parameter varname='a' type='__m256'/>
+	<description>Compute the sine of packed single-precision (32-bit) floating-point elements in "a" expressed in radians, and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*32
+	dst[i+31:i] := SIN(a[i+31:i])
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='SVML' sequence='true' rettype='__m256d' name='_mm256_sincos_pd'>
+	<type>Floating Point</type>
+	<CPUID>AVX</CPUID>
+	<category>Trigonometry</category>
+	<parameter varname='mem_addr' type='__m256d *'/>
+	<parameter varname='a' type='__m256d'/>
+	<description>Compute the sine and cosine of packed double-precision (64-bit) floating-point elements in "a" expressed in radians, store the sine in "dst", and store the cosine into memory at "mem_addr".</description>
+	<operation>
+FOR j := 0 to 3
+	i := j*64
+	dst[i+63:i] := SIN(a[i+63:i])
+	MEM[mem_addr+i+63:mem_addr+i] := COS(a[i+63:i])
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='SVML' sequence='true' rettype='__m256' name='_mm256_sincos_ps'>
+	<type>Floating Point</type>
+	<CPUID>AVX</CPUID>
+	<category>Trigonometry</category>
+	<parameter varname='mem_addr' type='__m256 *'/>
+	<parameter varname='a' type='__m256'/>
+	<description>Compute the sine and cosine of packed single-precision (32-bit) floating-point elements in "a" expressed in radians, store the sine in "dst", and store the cosine into memory at "mem_addr".</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*32
+	dst[i+31:i] := SIN(a[i+31:i])
+	MEM[mem_addr+i+31:mem_addr+i] := COS(a[i+31:i])
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='SVML' sequence='true' rettype='__m256d' name='_mm256_sind_pd'>
+	<type>Floating Point</type>
+	<CPUID>AVX</CPUID>
+	<category>Trigonometry</category>
+	<parameter varname='a' type='__m256d'/>
+	<description>Compute the sine of packed double-precision (64-bit) floating-point elements in "a" expressed in degrees, and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 3
+	i := j*64
+	dst[i+63:i] := SIND(a[i+63:i])
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='SVML' sequence='true' rettype='__m256' name='_mm256_sind_ps'>
+	<type>Floating Point</type>
+	<CPUID>AVX</CPUID>
+	<category>Trigonometry</category>
+	<parameter varname='a' type='__m256'/>
+	<description>Compute the sine of packed single-precision (32-bit) floating-point elements in "a" expressed in degrees, and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*32
+	dst[i+31:i] := SIND(a[i+31:i])
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='SVML' sequence='true' rettype='__m256d' name='_mm256_sinh_pd'>
+	<type>Floating Point</type>
+	<CPUID>AVX</CPUID>
+	<category>Trigonometry</category>
+	<parameter varname='a' type='__m256d'/>
+	<description>Compute the hyperbolic sine of packed double-precision (64-bit) floating-point elements in "a" expressed in radians, and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 3
+	i := j*64
+	dst[i+63:i] := SINH(a[i+63:i])
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='SVML' sequence='true' rettype='__m256' name='_mm256_sinh_ps'>
+	<type>Floating Point</type>
+	<CPUID>AVX</CPUID>
+	<category>Trigonometry</category>
+	<parameter varname='a' type='__m256'/>
+	<description>Compute the hyperbolic sine of packed single-precision (32-bit) floating-point elements in "a" expressed in radians, and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*32
+	dst[i+31:i] := SINH(a[i+31:i])
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='SVML' sequence='true' rettype='__m256d' name='_mm256_svml_ceil_pd'>
+	<type>Floating Point</type>
+	<CPUID>AVX</CPUID>
+	<category>Special Math Functions</category>
+	<parameter varname='a' type='__m256d'/>
+	<description>Round the packed double-precision (64-bit) floating-point elements in "a" up to an integer value, and store the results as packed double-precision floating-point elements in "dst". This intrinsic may generate the "roundpd"/"vroundpd" instruction.</description>
+	<operation>
+FOR j := 0 to 3
+	i := j*64
+	dst[i+63:i] := CEIL(a[i+63:i])
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='SVML' sequence='true' rettype='__m256' name='_mm256_svml_ceil_ps'>
+	<type>Floating Point</type>
+	<CPUID>AVX</CPUID>
+	<category>Special Math Functions</category>
+	<parameter varname='a' type='__m256'/>
+	<description>Round the packed single-precision (32-bit) floating-point elements in "a" up to an integer value, and store the results as packed single-precision floating-point elements in "dst". This intrinsic may generate the "roundps"/"vroundps" instruction.</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*32
+	dst[i+31:i] := CEIL(a[i+31:i])
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='SVML' sequence='true' rettype='__m256d' name='_mm256_svml_floor_pd'>
+	<type>Floating Point</type>
+	<CPUID>AVX</CPUID>
+	<category>Special Math Functions</category>
+	<parameter varname='a' type='__m256d'/>
+	<description>Round the packed double-precision (64-bit) floating-point elements in "a" down to an integer value, and store the results as packed double-precision floating-point elements in "dst". This intrinsic may generate the "roundpd"/"vroundpd" instruction.</description>
+	<operation>
+FOR j := 0 to 3
+	i := j*64
+	dst[i+63:i] := FLOOR(a[i+63:i])
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='SVML' sequence='true' rettype='__m256' name='_mm256_svml_floor_ps'>
+	<type>Floating Point</type>
+	<CPUID>AVX</CPUID>
+	<category>Special Math Functions</category>
+	<parameter varname='a' type='__m256'/>
+	<description>Round the packed single-precision (32-bit) floating-point elements in "a" down to an integer value, and store the results as packed single-precision floating-point elements in "dst". This intrinsic may generate the "roundps"/"vroundps" instruction.</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*32
+	dst[i+31:i] := FLOOR(a[i+31:i])
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='SVML' sequence='true' rettype='__m256d' name='_mm256_svml_round_pd'>
+	<type>Floating Point</type>
+	<CPUID>AVX</CPUID>
+	<category>Special Math Functions</category>
+	<parameter varname='a' type='__m256d'/>
+	<description>Round the packed double-precision (64-bit) floating-point elements in "a" to the nearest integer value, and store the results as packed double-precision floating-point elements in "dst". This intrinsic may generate the "roundpd"/"vroundpd" instruction.</description>
+	<operation>
+FOR j := 0 to 3
+	i := j*64
+	dst[i+63:i] := ROUND(a[i+63:i])
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='SVML' sequence='true' rettype='__m256' name='_mm256_svml_round_ps'>
+	<type>Floating Point</type>
+	<CPUID>AVX</CPUID>
+	<category>Special Math Functions</category>
+	<parameter varname='a' type='__m256'/>
+	<description>Round the packed single-precision (32-bit) floating-point elements in "a" to the nearest integer value, and store the results as packed single-precision floating-point elements in "dst". This intrinsic may generate the "roundps"/"vroundps" instruction.</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*32
+	dst[i+31:i] := ROUND(a[i+31:i])
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='SVML' sequence='true' rettype='__m256d' name='_mm256_svml_sqrt_pd'>
+	<type>Floating Point</type>
+	<CPUID>AVX</CPUID>
+	<category>Elementary Math Functions</category>
+	<parameter varname='a' type='__m256d'/>
+	<description>Compute the square root of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst". Note that this intrinsic is less efficient than "_mm256_sqrt_pd".</description>
+	<operation>
+FOR j := 0 to 3
+	i := j*64
+	dst[i+63:i] := SQRT(a[i+63:i])
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='SVML' sequence='true' rettype='__m256' name='_mm256_svml_sqrt_ps'>
+	<type>Floating Point</type>
+	<CPUID>AVX</CPUID>
+	<category>Elementary Math Functions</category>
+	<parameter varname='a' type='__m256'/>
+	<description>Compute the square root of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst". Note that this intrinsic is less efficient than "_mm256_sqrt_ps".</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*32
+	dst[i+31:i] := SQRT(a[i+31:i])
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='SVML' sequence='true' rettype='__m256d' name='_mm256_tan_pd'>
+	<type>Floating Point</type>
+	<CPUID>AVX</CPUID>
+	<category>Trigonometry</category>
+	<parameter varname='a' type='__m256d'/>
+	<description>Compute the tangent of packed double-precision (64-bit) floating-point elements in "a" expressed in radians, and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 3
+	i := j*64
+	dst[i+63:i] := TAN(a[i+63:i])
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='SVML' sequence='true' rettype='__m256' name='_mm256_tan_ps'>
+	<type>Floating Point</type>
+	<CPUID>AVX</CPUID>
+	<category>Trigonometry</category>
+	<parameter varname='a' type='__m256'/>
+	<description>Compute the tangent of packed single-precision (32-bit) floating-point elements in "a" expressed in radians, and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*32
+	dst[i+31:i] := TAN(a[i+31:i])
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='SVML' sequence='true' rettype='__m256d' name='_mm256_tand_pd'>
+	<type>Floating Point</type>
+	<CPUID>AVX</CPUID>
+	<category>Trigonometry</category>
+	<parameter varname='a' type='__m256d'/>
+	<description>Compute the tangent of packed double-precision (64-bit) floating-point elements in "a" expressed in degrees, and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 3
+	i := j*64
+	dst[i+63:i] := TAND(a[i+63:i])
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='SVML' sequence='true' rettype='__m256' name='_mm256_tand_ps'>
+	<type>Floating Point</type>
+	<CPUID>AVX</CPUID>
+	<category>Trigonometry</category>
+	<parameter varname='a' type='__m256'/>
+	<description>Compute the tangent of packed single-precision (32-bit) floating-point elements in "a" expressed in degrees, and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*32
+	dst[i+31:i] := TAND(a[i+31:i])
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='SVML' sequence='true' rettype='__m256d' name='_mm256_tanh_pd'>
+	<type>Floating Point</type>
+	<CPUID>AVX</CPUID>
+	<category>Trigonometry</category>
+	<parameter varname='a' type='__m256d'/>
+	<description>Compute the hyperbolic tangent of packed double-precision (64-bit) floating-point elements in "a" expressed in radians, and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 3
+	i := j*64
+	dst[i+63:i] := TANH(a[i+63:i])
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='SVML' sequence='true' rettype='__m256' name='_mm256_tanh_ps'>
+	<type>Floating Point</type>
+	<CPUID>AVX</CPUID>
+	<category>Trigonometry</category>
+	<parameter varname='a' type='__m256'/>
+	<description>Compute the hyperbolic tangent of packed single-precision (32-bit) floating-point elements in "a" expressed in radians, and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*32
+	dst[i+31:i] := TANH(a[i+31:i])
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='SVML' sequence='true' rettype='__m256d' name='_mm256_trunc_pd'>
+	<type>Floating Point</type>
+	<CPUID>AVX</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname='a' type='__m256d'/>
+	<description>Truncate the packed double-precision (64-bit) floating-point elements in "a", and store the results as packed double-precision floating-point elements in "dst". This intrinsic may generate the "roundpd"/"vroundpd" instruction.</description>
+	<operation>
+FOR j := 0 to 3
+	i := j*64
+	dst[i+63:i] := TRUNCATE(a[i+63:i])
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='SVML' sequence='true' rettype='__m256' name='_mm256_trunc_ps'>
+	<type>Floating Point</type>
+	<CPUID>AVX</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname='a' type='__m256'/>
+	<description>Truncate the packed single-precision (32-bit) floating-point elements in "a", and store the results as packed single-precision floating-point elements in "dst". This intrinsic may generate the "roundps"/"vroundps" instruction.</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*32
+	dst[i+31:i] := TRUNCATE(a[i+31:i])
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='SVML' sequence='true' rettype='__m256i' name='_mm256_udiv_epi32'>
+	<type>Integer</type>
+	<CPUID>AVX</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname='a' type='__m256i'/>
+	<parameter varname='b' type='__m256i'/>
+	<description>Divide packed unsigned 32-bit integers in "a" by packed elements in "b", and store the truncated results in "dst".</description>
+	<operation>
+FOR j := 0 to 7
+	i := 32*j
+	dst[i+31:i] := TRUNCATE(a[i+31:i] / b[i+31:i])
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='SVML' sequence='true' rettype='__m256i' name='_mm256_udivrem_epi32'>
+	<type>Integer</type>
+	<CPUID>AVX</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname='mem_addr' type='__m256i *'/>
+	<parameter varname='a' type='__m256i'/>
+	<parameter varname='b' type='__m256i'/>
+	<description>Divide packed unsigned 32-bit integers in "a" by packed elements in "b", store the truncated results in "dst", and store the remainders as packed unsigned 32-bit integers into memory at "mem_addr".</description>
+	<operation>
+FOR j := 0 to 7
+	i := 32*j
+	dst[i+31:i] := TRUNCATE(a[i+31:i] / b[i+31:i])
+	MEM[mem_addr+i+31:mem_addr+i] := REMAINDER(a[i+31:i] / b[i+31:i])
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='SVML' sequence='true' rettype='__m256i' name='_mm256_urem_epi32'>
+	<type>Integer</type>
+	<CPUID>AVX</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname='a' type='__m256i'/>
+	<parameter varname='b' type='__m256i'/>
+	<description>Divide packed unsigned 32-bit integers in "a" by packed elements in "b", and store the remainders as packed unsigned 32-bit integers in "dst".</description>
+	<operation>
+FOR j := 0 to 7
+	i := 32*j
+	dst[i+31:i] := REMAINDER(a[i+31:i] / b[i+31:i])
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="Other" rettype='int' name='_popcnt32'>
+	<type>Integer</type>
+	<CPUID>POPCNT</CPUID>
+	<category>Bit Manipulation</category>
+	<parameter varname='a' type='int'/>
+	<description>
+		Count the number of bits set to 1 in 32-bit integer "a", and return that count in "dst". 
+	</description>
+	<operation>
+dst := 0
+FOR i := 0 to 31
+	IF a[i]
+		dst := dst + 1
+	FI
+ENDFOR
+	</operation>
+	<instruction name='popcnt' form='r32, r32'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="Other" rettype='int' name='_popcnt64'>
+	<type>Integer</type>
+	<CPUID>POPCNT</CPUID>
+	<category>Bit Manipulation</category>
+	<parameter varname='a' type='__int64'/>
+	<description>
+		Count the number of bits set to 1 in 64-bit integer "a", and return that count in "dst". 
+	</description>
+	<operation>
+dst := 0
+FOR i := 0 to 63
+	IF a[i]
+		dst := dst + 1
+	FI
+ENDFOR
+	</operation>
+	<instruction name='popcnt' form='r64, r64'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='Other' rettype='__int64' name='_rdpmc'>
+	<category>General Support</category>
+	<parameter varname='a' type='int'/>
+	<description>Read the Performance Monitor Counter (PMC) specified by "a", and store up to 64 bits in "dst". The width of performance counters is implementation specific.</description>
+	<operation>
+dst[63:0] := ReadPMC(a)
+	</operation>
+	<instruction name='rdpmc' form=''/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='Other' rettype='__int64' name='_rdtsc'>
+	<CPUID>TSC</CPUID>
+	<category>General Support</category>
+	<parameter varname='' type='void'/>
+	<description>Copy the current 64-bit value of the processor's time-stamp counter into "dst".</description>
+	<operation>
+dst[63:0] := TimeStampCounter
+	</operation>
+	<instruction name='rdtsc' form=''/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="Other" rettype='unsigned int' name='_rotl'>
+	<type>Integer</type>
+	<category>Shift</category>
+	<parameter varname='a' type='unsigned int'/>
+	<parameter varname='shift' type='int'/>
+	<description>Shift the bits of unsigned 32-bit integer "a" left by the number of bits specified in "shift", rotating the most-significant bit to the least-significant bit location, and store the unsigned result in "dst".</description>
+	<operation>
+dst := a
+count := shift BITWISE AND 31
+DO WHILE (count &gt; 0)
+	tmp[0] := dst[31]
+	dst := (dst &lt;&lt; 1) OR tmp[0]
+	count := count - 1
+OD
+	</operation>
+	<instruction name='rol' form='r32, int'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="Other" rettype='unsigned int' name='_rotr'>
+	<type>Integer</type>
+	<category>Shift</category>
+	<parameter varname='a' type='unsigned int'/>
+	<parameter varname='shift' type='int'/>
+	<description>Shift the bits of unsigned 32-bit integer "a" right by the number of bits specified in "shift", rotating the least-significant bit to the most-significant bit location, and store the unsigned result in "dst".</description>
+	<operation>
+dst := a
+count := shift BITWISE AND 31
+DO WHILE (count &gt; 0)
+	tmp[31] := dst[0]
+	dst := (dst &gt;&gt; 1) OR tmp[31]
+	count := count - 1
+OD
+	</operation>
+	<instruction name='ror' form='r32, imm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="Other" rettype='unsigned short' name='_rotwl'>
+	<type>Integer</type>
+	<category>Shift</category>
+	<parameter varname='a' type='unsigned short'/>
+	<parameter varname='shift' type='int'/>
+	<description>Shift the bits of unsigned 16-bit integer "a" left by the number of bits specified in "shift", rotating the most-significant bit to the least-significant bit location, and store the unsigned result in "dst".</description>
+	<operation>
+dst := a
+count := shift BITWISE AND 15
+DO WHILE (count &gt; 0)
+	tmp[0] := dst[15]
+	dst := (dst &lt;&lt; 1) OR tmp[0]
+	count := count - 1
+OD
+	</operation>
+	<instruction name='rol' form='r16, imm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="Other" rettype='unsigned short' name='_rotwr'>
+	<type>Integer</type>
+	<category>Shift</category>
+	<parameter varname='a' type='unsigned short'/>
+	<parameter varname='shift' type='int'/>
+	<description>Shift the bits of unsigned 16-bit integer "a" right by the number of bits specified in "shift", rotating the least-significant bit to the most-significant bit location, and store the unsigned result in "dst".</description>
+	<operation>
+dst := a
+count := shift BITWISE AND 15
+DO WHILE (count &gt; 0)
+	tmp[15] := dst[0]
+	dst := (dst &gt;&gt; 1) OR tmp[15]
+	count := count - 1
+OD
+	</operation>
+	<instruction name='ror' form='r16, imm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="Other" rettype='unsigned __int64' name='_rotl64'>
+	<type>Integer</type>
+	<category>Shift</category>
+	<parameter varname='a' type='unsigned __int64'/>
+	<parameter varname='shift' type='int'/>
+	<description>Shift the bits of unsigned 64-bit integer "a" left by the number of bits specified in "shift", rotating the most-significant bit to the least-significant bit location, and store the unsigned result in "dst".</description>
+	<operation>
+dst := a
+count := shift BITWISE AND 63
+DO WHILE (count &gt; 0)
+	tmp[0] := dst[63]
+	dst := (dst &lt;&lt; 1) OR tmp[0]
+	count := count - 1
+OD
+	</operation>
+	<instruction name='rol' form='r64, int'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="Other" rettype='unsigned __int64' name='_rotr64'>
+	<type>Integer</type>
+	<category>Shift</category>
+	<parameter varname='a' type='unsigned __int64'/>
+	<parameter varname='shift' type='int'/>
+	<description>Shift the bits of unsigned 64-bit integer "a" right by the number of bits specified in "shift", rotating the least-significant bit to the most-significant bit location, and store the unsigned result in "dst".</description>
+	<operation>
+dst := a
+count := shift BITWISE AND 63
+DO WHILE (count &gt; 0)
+	tmp[63] := dst[0]
+	dst := (dst &gt;&gt; 1) OR tmp[63]
+	count := count - 1
+OD
+	</operation>
+	<instruction name='ror' form='r64, imm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="Other" rettype='unsigned __int64' name='_xgetbv'>
+	<CPUID>XSAVE</CPUID>
+	<category>OS-Targeted</category>
+	<parameter varname='a' type='unsigned int'/>
+	<description>Copy up to 64 bits from the value of the extended control register (XCR) specified by "a" into "dst". Currently only XFEATURE_ENABLED_MASK XCR is supported.</description>
+	<operation>
+dst[63:0] := XCR[a]
+	</operation>
+	<instruction name='xgetbv' form=''/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="Other" rettype='void' name='_xrstor'>
+	<CPUID>XSAVE</CPUID>
+	<category>OS-Targeted</category>
+	<parameter varname='mem_addr' type='void *'/>
+	<parameter varname='rs_mask' type='unsigned __int64'/>
+	<description>Perform a full or partial restore of the enabled processor states using the state information stored in memory at "mem_addr". State is restored based on bits [62:0] in "rs_mask", "XCR0", and "mem_addr.HEADER.XSTATE_BV". "mem_addr" must be aligned on a 64-byte boundary.</description>
+	<operation>
+st_mask := mem_addr.HEADER.XSTATE_BV[62:0]
+FOR i := 0 to 62
+	IF (rs_mask[i] AND XCR0[i])
+		IF st_mask[i]
+			CASE (i) OF
+			0: ProcessorState[x87 FPU] := mem_addr.FPUSSESave_Area[FPU]
+			1: ProcessorState[SSE] := mem_addr.FPUSSESave_Area[SSE]
+			DEFAULT: ProcessorState[i] := mem_addr.Ext_Save_Area[i]
+			ESAC
+		ELSE
+			// ProcessorExtendedState := Processor Supplied Values
+			CASE (i) OF
+			1: MXCSR := mem_addr.FPUSSESave_Area[SSE]
+			ESAC
+		FI
+	FI
+	i := i + 1
+ENDFOR
+	</operation>
+	<instruction name='xrstor' form='MEMmxsave'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="Other" rettype='void' name='_xrstor64'>
+	<CPUID>XSAVE</CPUID>
+	<category>OS-Targeted</category>
+	<parameter varname='mem_addr' type='void *'/>
+	<parameter varname='rs_mask' type='unsigned __int64'/>
+	<description>Perform a full or partial restore of the enabled processor states using the state information stored in memory at "mem_addr". State is restored based on bits [62:0] in "rs_mask", "XCR0", and "mem_addr.HEADER.XSTATE_BV". "mem_addr" must be aligned on a 64-byte boundary.</description>
+	<operation>
+st_mask := mem_addr.HEADER.XSTATE_BV[62:0]
+FOR i := 0 to 62
+	IF (rs_mask[i] AND XCR0[i])
+		IF st_mask[i]
+			CASE (i) OF
+			0: ProcessorState[x87 FPU] := mem_addr.FPUSSESave_Area[FPU]
+			1: ProcessorState[SSE] := mem_addr.FPUSSESave_Area[SSE]
+			DEFAULT: ProcessorState[i] := mem_addr.Ext_Save_Area[i]
+			ESAC
+		ELSE
+			// ProcessorExtendedState := Processor Supplied Values
+			CASE (i) OF
+			1: MXCSR := mem_addr.FPUSSESave_Area[SSE]
+			ESAC
+		FI
+	FI
+	i := i + 1
+ENDFOR
+	</operation>
+	<instruction name='xrstor64' form='MEMmxsave'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="Other" rettype='void' name='_xsave'>
+	<CPUID>XSAVE</CPUID>
+	<category>OS-Targeted</category>
+	<parameter varname='mem_addr' type='void *'/>
+	<parameter varname='save_mask' type='unsigned __int64'/>
+	<description>Perform a full or partial save of the enabled processor states to memory at "mem_addr". State is saved based on bits [62:0] in "save_mask" and "XCR0". "mem_addr" must be aligned on a 64-byte boundary.</description>
+	<operation>
+mask[62:0] := save_mask[62:0] BITWISE AND XCR0[62:0]
+FOR i := 0 to 62
+	IF mask[i]
+		CASE (i) OF
+		0: mem_addr.FPUSSESave_Area[FPU] := ProcessorState[x87 FPU]
+		1: mem_addr.FPUSSESave_Area[SSE] := ProcessorState[SSE]
+		DEFAULT: mem_addr.Ext_Save_Area[i] := ProcessorState[i]
+		ESAC
+		mem_addr.HEADER.XSTATE_BV[i] := INIT_FUNCTION[i]
+	FI
+	i := i + 1
+ENDFOR
+	</operation>
+	<instruction name='xsave' form='MEMmxsave'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="Other" rettype='void' name='_xsave64'>
+	<CPUID>XSAVE</CPUID>
+	<category>OS-Targeted</category>
+	<parameter varname='mem_addr' type='void *'/>
+	<parameter varname='save_mask' type='unsigned __int64'/>
+	<description>Perform a full or partial save of the enabled processor states to memory at "mem_addr". State is saved based on bits [62:0] in "save_mask" and "XCR0". "mem_addr" must be aligned on a 64-byte boundary.</description>
+	<operation>
+mask[62:0] := save_mask[62:0] BITWISE AND XCR0[62:0]
+FOR i := 0 to 62
+	IF mask[i]
+		CASE (i) OF
+		0: mem_addr.FPUSSESave_Area[FPU] := ProcessorState[x87 FPU]
+		1: mem_addr.FPUSSESave_Area[SSE] := ProcessorState[SSE]
+		DEFAULT: mem_addr.Ext_Save_Area[i] := ProcessorState[i]
+		ESAC
+		mem_addr.HEADER.XSTATE_BV[i] := INIT_FUNCTION[i]
+	FI
+	i := i + 1
+ENDFOR
+	</operation>
+	<instruction name='xsave64' form='MEMmxsave'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="Other" rettype='void' name='_xsaveopt'>
+	<CPUID>XSAVE</CPUID>
+	<CPUID>XSAVEOPT</CPUID>
+	<category>OS-Targeted</category>
+	<parameter varname='mem_addr' type='void *'/>
+	<parameter varname='save_mask' type='unsigned __int64'/>
+	<description>Perform a full or partial save of the enabled processor states to memory at "mem_addr". State is saved based on bits [62:0] in "save_mask" and "XCR0". "mem_addr" must be aligned on a 64-byte boundary. The hardware may optimize the manner in which data is saved. The performance of this instruction will be equal to or better than using the XSAVE instruction.</description>
+	<operation>
+mask[62:0] := save_mask[62:0] BITWISE AND XCR0[62:0]
+FOR i := 0 to 62
+	IF mask[i]
+		CASE (i) OF
+		0: mem_addr.FPUSSESave_Area[FPU] := ProcessorState[x87 FPU]
+		1: mem_addr.FPUSSESave_Area[SSE] := ProcessorState[SSE]
+		2: mem_addr.EXT_SAVE_Area2[YMM] := ProcessorState[YMM]
+		DEFAULT: mem_addr.Ext_Save_Area[i] := ProcessorState[i]
+		ESAC
+		mem_addr.HEADER.XSTATE_BV[i] := INIT_FUNCTION[i]
+	FI
+	i := i + 1
+ENDFOR
+	</operation>
+	<instruction name='xsaveopt' form='MEMmxsave'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="Other" rettype='void' name='_xsaveopt64'>
+	<CPUID>XSAVE</CPUID>
+	<CPUID>XSAVEOPT</CPUID>
+	<category>OS-Targeted</category>
+	<parameter varname='mem_addr' type='void *'/>
+	<parameter varname='save_mask' type='unsigned __int64'/>
+	<description>Perform a full or partial save of the enabled processor states to memory at "mem_addr". State is saved based on bits [62:0] in "save_mask" and "XCR0". "mem_addr" must be aligned on a 64-byte boundary. The hardware may optimize the manner in which data is saved. The performance of this instruction will be equal to or better than using the XSAVE64 instruction.</description>
+	<operation>
+mask[62:0] := save_mask[62:0] BITWISE AND XCR0[62:0]
+FOR i := 0 to 62
+	IF mask[i]
+		CASE (i) OF
+		0: mem_addr.FPUSSESave_Area[FPU] := ProcessorState[x87 FPU]
+		1: mem_addr.FPUSSESave_Area[SSE] := ProcessorState[SSE]
+		2: mem_addr.EXT_SAVE_Area2[YMM] := ProcessorState[YMM]
+		DEFAULT: mem_addr.Ext_Save_Area[i] := ProcessorState[i]
+		ESAC
+		mem_addr.HEADER.XSTATE_BV[i] := INIT_FUNCTION[i]
+	FI
+	i := i + 1
+ENDFOR
+	</operation>
+	<instruction name='xsaveopt64' form='MEMmxsave'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="Other" rettype='void' name='_xsetbv'>
+	<CPUID>XSAVE</CPUID>
+	<category>OS-Targeted</category>
+	<parameter varname='a' type='unsigned int'/>
+	<parameter varname='val' type='unsigned __int64'/>
+	<description>Copy 64 bits from "val" to the extended control register (XCR) specified by "a". Currently only XFEATURE_ENABLED_MASK XCR is supported.</description>
+	<operation>
+XCR[a] := val[63:0]
+	</operation>
+	<instruction name='xsetbv' form=''/>
+	<header>immintrin.h</header>
+</intrinsic>
+	
+	
+
+<intrinsic tech='Other' rettype='__m128i' name='_mm_loadu_si32'>
+	<type>Integer</type>
+	<CPUID>SSE2</CPUID>
+	<category>Load</category>
+	<parameter varname='mem_addr' type='void const*'/>
+	<description>Load unaligned 32-bit integer from memory into the first element of "dst".</description>
+	<operation>
+dst[31:0] := MEM[mem_addr+31:mem_addr]
+dst[MAX:32] := 0
+	</operation>
+	<instruction name='movd' form='xmm, m32'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='Other' rettype='void' name='_mm_storeu_si32'>
+	<type>Integer</type>
+	<CPUID>SSE2</CPUID>
+	<category>Store</category>
+	<parameter varname='mem_addr' type='void*'/>
+	<parameter varname='a' type='__m128i'/>
+	<description>Store 32-bit integer from the first element of "a" into memory. "mem_addr" does not need to be aligned on any particular boundary.</description>
+	<operation>
+MEM[mem_addr+31:mem_addr] := a[31:0]
+	</operation>
+	<instruction name='movd' form='m32, xmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='Other' rettype='void' name='_mm_storeu_si16' sequence='true'>
+	<type>Integer</type>
+	<CPUID>SSE</CPUID>
+	<category>Store</category>
+	<parameter varname='mem_addr' type='void*'/>
+	<parameter varname='a' type='__m128i'/>
+	<description>Store 16-bit integer from the first element of "a" into memory. "mem_addr" does not need to be aligned on any particular boundary.</description>
+	<operation>
+MEM[mem_addr+15:mem_addr] := a[15:0]
+	</operation>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='Other' rettype='__m128i' name='_mm_loadu_si64'>
+	<type>Integer</type>
+	<CPUID>SSE</CPUID>
+	<category>Load</category>
+	<parameter varname='mem_addr' type='void const*'/>
+	<description>Load unaligned 64-bit integer from memory into the first element of "dst".</description>
+	<operation>
+dst[63:0] := MEM[mem_addr+63:mem_addr]
+dst[MAX:64] := 0
+	</operation>
+	<instruction name='movq' form='xmm, m64'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='Other' rettype='void' name='_mm_storeu_si64'>
+	<type>Integer</type>
+	<CPUID>SSE</CPUID>
+	<category>Store</category>
+	<parameter varname='mem_addr' type='void*'/>
+	<parameter varname='a' type='__m128i'/>
+	<description>Store 64-bit integer from the first element of "a" into memory. "mem_addr" does not need to be aligned on any particular boundary.</description>
+	<operation>
+MEM[mem_addr+63:mem_addr] := a[63:0]
+	</operation>
+	<instruction name='movq' form='m64, xmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='Other' rettype='__m128i' name='_mm_loadu_si16' sequence='true'>
+	<type>Integer</type>
+	<CPUID>SSE</CPUID>
+	<category>Load</category>
+	<parameter varname='mem_addr' type='void const*'/>
+	<description>Load unaligned 16-bit integer from memory into the first element of "dst".</description>
+	<operation>
+dst[15:0] := MEM[mem_addr+15:mem_addr]
+dst[MAX:16] := 0
+	</operation>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="Other" rettype='unsigned int' name='_readfsbase_u32'>
+	<type>Integer</type>
+	<CPUID>FSGSBASE</CPUID>
+	<category>General Support</category>
+	<description>Read the FS segment base register and store the 32-bit result in "dst".</description>
+	<operation>
+dst[31:0] := FS_Segment_Base_Register;
+dst[63:32] := 0
+	</operation>
+	<instruction name='rdfsbase' form='r32'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="Other" rettype='unsigned __int64' name='_readfsbase_u64'>
+	<type>Integer</type>
+	<CPUID>FSGSBASE</CPUID>
+	<category>General Support</category>
+	<description>Read the FS segment base register and store the 64-bit result in "dst".</description>
+	<operation>
+dst[63:0] := FS_Segment_Base_Register;
+	</operation>
+	<instruction name='rdfsbase' form='r64'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="Other" rettype='unsigned int' name='_readgsbase_u32'>
+	<type>Integer</type>
+	<CPUID>FSGSBASE</CPUID>
+	<category>General Support</category>
+	<description>Read the GS segment base register and store the 32-bit result in "dst".</description>
+	<operation>
+dst[31:0] := GS_Segment_Base_Register;
+dst[63:32] := 0
+	</operation>
+	<instruction name='rdgsbase' form='r32'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="Other" rettype='unsigned __int64' name='_readgsbase_u64'>
+	<type>Integer</type>
+	<CPUID>FSGSBASE</CPUID>
+	<category>General Support</category>
+	<description>Read the GS segment base register and store the 64-bit result in "dst".</description>
+	<operation>
+dst[63:0] := GS_Segment_Base_Register;
+	</operation>
+	<instruction name='rdgsbase' form='r64'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='Other' rettype='int' name='_rdrand16_step'>
+	<type>Integer</type>
+	<CPUID>RDRAND</CPUID>
+	<category>Random</category>
+	<parameter varname='val' type='unsigned short*'/>
+	<description>Read a hardware generated 16-bit random value and store the result in "val". Return 1 if a random value was generated, and 0 otherwise.
+</description>
+	<operation>
+IF HW_RND_GEN.ready = 1
+	val[15:0] := HW_RND_GEN.data;
+	RETURN 1;
+ELSE
+	val[15:0] := 0;
+	RETURN 0;
+FI
+	</operation>
+	<instruction name='rdrand' form='r16'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='Other' rettype='int' name='_rdrand32_step'>
+	<type>Integer</type>
+	<CPUID>RDRAND</CPUID>
+	<category>Random</category>
+	<parameter varname='val' type='unsigned int*'/>
+	<description>Read a hardware generated 32-bit random value and store the result in "val". Return 1 if a random value was generated, and 0 otherwise.
+</description>
+	<operation>
+IF HW_RND_GEN.ready = 1
+	val[31:0] := HW_RND_GEN.data;
+	RETURN 1;
+ELSE
+	val[31:0] := 0;
+	RETURN 0;
+FI
+	</operation>
+	<instruction name='rdrand' form='r32'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='Other' rettype='int' name='_rdrand64_step'>
+	<type>Integer</type>
+	<CPUID>RDRAND</CPUID>
+	<category>Random</category>
+	<parameter varname='val' type='unsigned __int64*'/>
+	<description>Read a hardware generated 64-bit random value and store the result in "val". Return 1 if a random value was generated, and 0 otherwise.
+</description>
+	<operation>
+IF HW_RND_GEN.ready = 1
+	val[63:0] := HW_RND_GEN.data;
+	RETURN 1;
+ELSE
+	val[63:0] := 0;
+	RETURN 0;
+FI
+	</operation>
+	<instruction name='rdrand' form='r64'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="Other" rettype='void' name='_writefsbase_u32'>
+	<type>Integer</type>
+	<CPUID>FSGSBASE</CPUID>
+	<category>General Support</category>
+	<parameter varname='a' type='unsigned int'/>
+	<description>Write the unsigned 32-bit integer "a" to the FS segment base register.</description>
+	<operation>
+FS_Segment_Base_Register[31:0] := a[31:0];
+FS_Segment_Base_Register[63:32] := 0
+	</operation>
+	<instruction name='wrfsbase' form='r32'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="Other" rettype='void' name='_writefsbase_u64'>
+	<type>Integer</type>
+	<CPUID>FSGSBASE</CPUID>
+	<category>General Support</category>
+	<parameter varname='a' type='unsigned __int64'/>
+	<description>Write the unsigned 64-bit integer "a" to the FS segment base register.</description>
+	<operation>
+FS_Segment_Base_Register[63:0] := a[63:0];
+	</operation>
+	<instruction name='wrfsbase' form='r64'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="Other" rettype='void' name='_writegsbase_u32'>
+	<type>Integer</type>
+	<CPUID>FSGSBASE</CPUID>
+	<category>General Support</category>
+	<parameter varname='a' type='unsigned int'/>
+	<description>Write the unsigned 32-bit integer "a" to the GS segment base register.</description>
+	<operation>
+GS_Segment_Base_Register[31:0] := a[31:0];
+GS_Segment_Base_Register[63:32] := 0
+	</operation>
+	<instruction name='wrgsbase' form='r32'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="Other" rettype='void' name='_writegsbase_u64'>
+	<type>Integer</type>
+	<CPUID>FSGSBASE</CPUID>
+	<category>General Support</category>
+	<parameter varname='a' type='unsigned __int64'/>
+	<description>Write the unsigned 64-bit integer "a" to the GS segment base register.</description>
+	<operation>
+GS_Segment_Base_Register[63:0] := a[63:0];
+	</operation>
+	<instruction name='wrgsbase' form='r64'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='Other' rettype='__m256' name='_mm256_cvtph_ps'>
+	<type>Floating Point</type>
+	<CPUID>FP16C</CPUID>
+	<category>Convert</category>
+	<parameter varname='a' type='__m128i'/>
+	<description>Convert packed half-precision (16-bit) floating-point elements in "a" to packed single-precision (32-bit) floating-point elements, and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*32
+	m := j*16
+	dst[i+31:i] := Convert_FP16_To_FP32(a[m+15:m])
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name='vcvtph2ps' form='ymm, xmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='Other' rettype='__m128i' name='_mm256_cvtps_ph'>
+	<type>Floating Point</type>
+	<CPUID>FP16C</CPUID>
+	<category>Convert</category>
+	<parameter varname='a' type='__m256'/>
+	<parameter varname='rounding' type='int'/>
+	<description>Convert packed single-precision (32-bit) floating-point elements in "a" to packed half-precision (16-bit) floating-point elements, and store the results in "dst".
+	[round_note]
+	</description>
+	<operation>
+FOR j := 0 to 7
+	i := 16*j
+	l := 32*j
+	dst[i+15:i] := Convert_FP32_To_FP16(a[l+31:l])
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name='vcvtps2ph' form='xmm, ymm, imm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="Other" rettype="int" name="_rdseed16_step">
+	<CPUID>RDSEED</CPUID>
+	<category>Random</category>
+	<parameter varname="val" type="unsigned short *"/>
+	<description>Read a 16-bit NIST SP800-90B and SP800-90C compliant random value and store in "val". Return 1 if a random value was generated, and 0 otherwise.</description>
+	<operation>
+IF HW_NRND_GEN.ready = 1 THEN
+	val[15:0] := HW_NRND_GEN.data
+	RETURN 1
+ELSE
+	val[15:0] := 0
+	RETURN 0
+FI
+	</operation>
+	<instruction name='rdseed' form='r16'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="Other" rettype="int" name="_rdseed32_step">
+	<CPUID>RDSEED</CPUID>
+	<category>Random</category>
+	<parameter varname="val" type="unsigned int *"/>
+	<description>Read a 32-bit NIST SP800-90B and SP800-90C compliant random value and store in "val". Return 1 if a random value was generated, and 0 otherwise.</description>
+	<operation>
+IF HW_NRND_GEN.ready = 1 THEN
+	val[31:0] := HW_NRND_GEN.data
+	RETURN 1
+ELSE
+	val[31:0] := 0
+	RETURN 0
+FI
+	</operation>
+	<instruction name='rdseed' form='r32'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="Other" rettype="int" name="_rdseed64_step">
+	<CPUID>RDSEED</CPUID>
+	<category>Random</category>
+	<parameter varname="val" type="unsigned __int64 *"/>
+	<description>Read a 64-bit NIST SP800-90B and SP800-90C compliant random value and store in "val". Return 1 if a random value was generated, and 0 otherwise.</description>
+	<operation>
+IF HW_NRND_GEN.ready = 1 THEN
+	val[63:0] := HW_NRND_GEN.data
+	RETURN 1
+ELSE
+	val[63:0] := 0
+	RETURN 0
+FI
+	</operation>
+	<instruction name='rdseed' form='r64'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="Other" rettype="unsigned char" name="_addcarry_u32">
+	<type>Integer</type>
+	<category>Arithmetic</category>
+	<parameter varname="c_in" type="unsigned char"/>
+	<parameter varname="a" type="unsigned int"/>
+	<parameter varname="b" type="unsigned int"/>
+	<parameter varname="out" type="unsigned int *"/>
+	<description>Add unsigned 32-bit integers "a" and "b" with unsigned 8-bit carry-in "c_in" (carry flag), and store the unsigned 32-bit result in "out", and the carry-out in "dst" (carry or overflow flag).</description>
+	<operation>
+dst:out[31:0] := a[31:0] + b[31:0] + c_in;
+	</operation>
+	<instruction name='adc' form='r32, r32'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="Other" rettype="unsigned char" name="_addcarry_u64">
+	<type>Integer</type>
+	<category>Arithmetic</category>
+	<parameter varname="c_in" type="unsigned char"/>
+	<parameter varname="a" type="unsigned __int64"/>
+	<parameter varname="b" type="unsigned __int64"/>
+	<parameter varname="out" type="unsigned __int64 *"/>
+		<description>Add unsigned 64-bit integers "a" and "b" with unsigned 8-bit carry-in "c_in" (carry flag), and store the unsigned 64-bit result in "out", and the carry-out in "dst" (carry or overflow flag).</description>
+	<operation>
+dst:out[63:0] := a[63:0] + b[63:0] + c_in;
+	</operation>
+	<instruction name='adc' form='r64, r64'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="Other" rettype="unsigned char" name="_subborrow_u32">
+	<type>Integer</type>
+	<category>Arithmetic</category>
+	<parameter varname="b_in" type="unsigned char"/>
+	<parameter varname="a" type="unsigned int"/>
+	<parameter varname="b" type="unsigned int"/>
+	<parameter varname="out" type="unsigned int *"/>
+	<description>Add unsigned 8-bit borrow "b_in" (carry flag) to unsigned 32-bit integer "b", and subtract the result from unsigned 32-bit integer "a". Store the unsigned 32-bit result in "out", and the carry-out in "dst" (carry or overflow flag).</description>
+	<operation>
+dst:out[31:0] := (a[31:0] - (b[31:0] + b_in));
+	</operation>
+	<instruction name='sbb' form='r32, r32'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="Other" rettype="unsigned char" name="_subborrow_u64">
+	<type>Integer</type>
+	<category>Arithmetic</category>
+	<parameter varname="b_in" type="unsigned char"/>
+	<parameter varname="a" type="unsigned __int64"/>
+	<parameter varname="b" type="unsigned __int64"/>
+	<parameter varname="out" type="unsigned __int64 *"/>
+	<description>Add unsigned 8-bit borrow "b_in" (carry flag) to unsigned 64-bit integer "b", and subtract the result from unsigned 64-bit integer "a". Store the unsigned 64-bit result in "out", and the carry-out in "dst" (carry or overflow flag).</description>
+	<operation>
+dst:out[63:0] := (a[63:0] - (b[63:0] + b_in));
+	</operation>
+	<instruction name='sbb' form='r64, r64'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="Other" rettype="unsigned char" name="_addcarryx_u32">
+	<type>Integer</type>
+	<CPUID>ADX</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="c_in" type="unsigned char"/>
+	<parameter varname="a" type="unsigned int"/>
+	<parameter varname="b" type="unsigned int"/>
+	<parameter varname="out" type="unsigned int *"/>
+		<description>Add unsigned 32-bit integers "a" and "b" with unsigned 8-bit carry-in "c_in" (carry or overflow flag), and store the unsigned 32-bit result in "out", and the carry-out in "dst" (carry or overflow flag).</description>
+	<operation>
+dst:out[31:0] := a[31:0] + b[31:0] + c_in;
+	</operation>
+	<instruction name='adcx' form='r32, r32'/>
+	<instruction name='adox' form='r32, r32'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="Other" rettype="unsigned char" name="_addcarryx_u64">
+	<type>Integer</type>
+	<CPUID>ADX</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="c_in" type="unsigned char"/>
+	<parameter varname="a" type="unsigned __int64"/>
+	<parameter varname="b" type="unsigned __int64"/>
+	<parameter varname="out" type="unsigned __int64 *"/>
+	<description>Add unsigned 64-bit integers "a" and "b" with unsigned 8-bit carry-in "c_in" (carry or overflow flag), and store the unsigned 64-bit result in "out", and the carry-out in "dst" (carry or overflow flag).</description>
+	<operation>
+dst:out[63:0] := a[63:0] + b[63:0] + c_in;
+	</operation>
+	<instruction name='adcx' form='r64, r64'/>
+	<instruction name='adox' form='r64, r64'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="Other" rettype="__m128i" name="_mm_sha1msg1_epu32">
+	<type>Integer</type>
+	<CPUID>SHA</CPUID>
+	<category>Cryptography</category>
+	<parameter varname="a" type="__m128i"/>
+	<parameter varname="b" type="__m128i"/>
+	<description>Perform an intermediate calculation for the next four SHA1 message values (unsigned 32-bit integers) using previous message values from "a" and "b", and store the result in "dst".</description>
+	<operation>
+W0 := a[127:96];
+W1 := a[95:64];
+W2 := a[63:32];
+W3 := a[31:0];
+W4 := b[127:96];
+W5 := b[95:64];
+
+dst[127:96] := W2 XOR W0;
+dst[95:64] := W3 XOR W1;
+dst[63:32] := W4 XOR W2;
+dst[31:0] := W5 XOR W3;
+	</operation>
+	<instruction name='sha1msg1' form='xmm, xmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="Other" rettype="__m128i" name="_mm_sha1msg2_epu32">
+	<type>Integer</type>
+	<CPUID>SHA</CPUID>
+	<category>Cryptography</category>
+	<parameter varname="a" type="__m128i"/>
+	<parameter varname="b" type="__m128i"/>
+	<description>Perform the final calculation for the next four SHA1 message values (unsigned 32-bit integers) using the intermediate result in "a" and the previous message values in "b", and store the result in "dst".</description>
+	<operation>
+W13 := b[95:64];
+W14 := b[63:32];
+W15 := b[31:0];
+W16 := (a[127:96] XOR W13) &lt;&lt;&lt; 1;
+W17 := (a[95:64] XOR W14) &lt;&lt;&lt; 1;
+W18 := (a[63:32] XOR W15) &lt;&lt;&lt; 1;
+W19 := (a[31:0] XOR W16) &lt;&lt;&lt; 1;
+
+dst[127:96] := W16;
+dst[95:64] := W17;
+dst[63:32] := W18;
+dst[31:0] := W19;
+	</operation>
+	<instruction name='sha1msg2' form='xmm, xmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="Other" rettype="__m128i" name="_mm_sha1nexte_epu32">
+	<type>Integer</type>
+	<CPUID>SHA</CPUID>
+	<category>Cryptography</category>
+	<parameter varname="a" type="__m128i"/>
+	<parameter varname="b" type="__m128i"/>
+	<description>Calculate SHA1 state variable E after four rounds of operation from the current SHA1 state variable "a", add that value to the scheduled values (unsigned 32-bit integers) in "b", and store the result in "dst".</description>
+	<operation>
+tmp := (a[127:96] &lt;&lt;&lt; 30);
+dst[127:96] := b[127:96] + tmp;
+dst[95:64] := b[95:64];
+dst[63:32] := b[63:32];
+dst[31:0] := b[31:0];
+	</operation>
+	<instruction name='sha1nexte' form='xmm, xmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="Other" rettype="__m128i" name="_mm_sha1rnds4_epu32">
+	<type>Integer</type>
+	<CPUID>SHA</CPUID>
+	<category>Cryptography</category>
+	<parameter varname="a" type="__m128i"/>
+	<parameter varname="b" type="__m128i"/>
+	<parameter varname="func" type="const int"/>
+	<description>Perform four rounds of SHA1 operation using an initial SHA1 state (A,B,C,D) from "a" and some pre-computed sum of the next 4 round message values (unsigned 32-bit integers), and state variable E from "b", and store the updated SHA1 state (A,B,C,D) in "dst". "func" contains the logic functions and round constants.</description>
+	<operation>
+IF (func[1:0] = 0) THEN
+	f() := f0(), K := K0;
+ELSE IF (func[1:0] = 1) THEN
+	f() := f1(), K := K1;
+ELSE IF (func[1:0] = 2) THEN
+	f() := f2(), K := K2;
+ELSE IF (func[1:0] = 3) THEN
+	f() := f3(), K := K3;
+FI;
+
+A := a[127:96];
+B := a[95:64];
+C := a[63:32];
+D := a[31:0];
+
+W[0] := b[127:96];
+W[1] := b[95:64];
+W[2] := b[63:32];
+W[3] := b[31:0];
+
+A[1] := f(B, C, D) + (A &lt;&lt;&lt; 5) + W[0] + K;
+B[1] := A;
+C[1] := B &lt;&lt;&lt; 30;
+D[1] := C;
+E[1] := D;
+
+FOR i := 1 to 3
+		A[i+1] := f(B[i], C[i], D[i]) + (A[i] &lt;&lt;&lt; 5) + W[i] + E[i] + K;
+		B[i+1] := A[i];
+		C[i+1] := B[i] &lt;&lt;&lt; 30;
+		D[i+1] := C[i];
+		E[i+1] := D[i];
+ENDFOR;
+
+dst[127:96] := A[4];
+dst[95:64] := B[4];
+dst[63:32] := C[4];
+dst[31:0] := D[4];
+	</operation>
+	<instruction name='sha1rnds4' form='xmm, xmm, imm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="Other" rettype="__m128i" name="_mm_sha256msg1_epu32">
+	<type>Integer</type>
+	<CPUID>SHA</CPUID>
+	<category>Cryptography</category>
+	<parameter varname="a" type="__m128i"/>
+	<parameter varname="b" type="__m128i"/>
+	<description>Perform an intermediate calculation for the next four SHA256 message values (unsigned 32-bit integers) using previous message values from "a" and "b", and store the result in "dst".</description>
+	<operation>
+W4 := b[31:0];
+W3 := a[127:96];
+W2 := a[95:64];
+W1 := a[63:32];
+W0 := a[31:0];
+
+dst[127:96] := W3 + sigma0(W4);
+dst[95:64] := W2 + sigma0(W3);
+dst[63:32] := W1 + sigma0(W2);
+dst[31:0] := W0 + sigma0(W1);
+	</operation>
+	<instruction name='sha256msg1' form='xmm, xmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="Other" rettype="__m128i" name="_mm_sha256msg2_epu32">
+	<type>Integer</type>
+	<CPUID>SHA</CPUID>
+	<category>Cryptography</category>
+	<parameter varname="a" type="__m128i"/>
+	<parameter varname="b" type="__m128i"/>
+	<description>Perform the final calculation for the next four SHA256 message values (unsigned 32-bit integers) using previous message values from "a" and "b", and store the result in "dst".</description>
+	<operation>
+W14 := b[95:64];
+W15 := b[127:96];
+W16 := a[31:0] + sigma1(W14);
+W17 := a[63:32] + sigma1(W15);
+W18 := a[95:64] + sigma1(W16);
+W19 := a[127:96] + sigma1(W17);
+
+dst[127:96] := W19;
+dst[95:64] := W18;
+dst[63:32] := W17;
+dst[31:0] := W16;
+	</operation>
+	<instruction name='sha256msg2' form='xmm, xmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="Other" rettype="__m128i" name="_mm_sha256rnds2_epu32">
+	<type>Integer</type>
+	<CPUID>SHA</CPUID>
+	<category>Cryptography</category>
+	<parameter varname="a" type="__m128i"/>
+	<parameter varname="b" type="__m128i"/>
+	<parameter varname="k" type="__m128i"/>
+	<description>Perform 2 rounds of SHA256 operation using an initial SHA256 state (C,D,G,H) from "a", an initial SHA256 state (A,B,E,F) from "b", and a pre-computed sum of the next 2 round message values (unsigned 32-bit integers) and the corresponding round constants from "k", and store the updated SHA256 state (A,B,E,F) in "dst".</description>
+	<operation>
+A[0] := b[127:96];
+B[0] := b[95:64];
+C[0] := a[127:96];
+D[0] := a[95:64];
+E[0] := b[63:32];
+F[0] := b[31:0];
+G[0] := a[63:32];
+H[0] := a[31:0];
+
+W_K[0] := k[31:0];
+W_K[1] := k[63:32];
+
+FOR i := 0 to 1
+		A[i+1] := Ch(E[i], F[i], G[i]) + sum1(E[i]) + W_K[i] + H[i] + Maj(A[i], B[i], C[i]) + sum0(A[i]);
+		B[i+1] := A[i];
+		C[i+1] := B[i];
+		D[i+1] := C[i];
+		E[i+1] := Ch(E[i], F[i], G[i]) + sum1(E[i]) + W_K[i] + H[i] + D[i];
+		F[i+1] := E[i];
+		G[i+1] := F[i];
+		H[i+1] := G[i];
+ENDFOR;
+
+dst[127:96] := A[2];
+dst[95:64] := B[2];
+dst[63:32] := E[2];
+dst[31:0] := F[2];
+	</operation>
+	<instruction name='sha256rnds2' form='xmm, xmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="Other" rettype="void *" name="_bnd_set_ptr_bounds">
+	<CPUID>MPX</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="srcmem" type="const void *"/>
+	<parameter varname="size" type="size_t"/>
+	<description>Make a pointer with the value of "srcmem" and bounds set to ["srcmem", "srcmem" + "size" - 1], and store the result in "dst".</description>
+	<operation>
+dst := srcmem;
+dst.LB := srcmem;
+dst.UB := srcmem + size - 1;
+	</operation>
+	<instruction name='bndmk' form='bnd, m32'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="Other" rettype="void *" name="_bnd_narrow_ptr_bounds" sequence="true">
+	<CPUID>MPX</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="q" type="const void *"/>
+	<parameter varname="r" type="const void *"/>
+	<parameter varname="size" type="size_t"/>
+	<description>Narrow the bounds for pointer "q" to the intersection of the bounds of "r" and the bounds ["q", "q" + "size" - 1], and store the result in "dst".</description>
+	<operation>
+dst := q;
+IF r.LB &gt; (q + size - 1) OR r.UB &lt; q THEN
+	dst.LB := 1;
+	dst.UB := 0;
+ELSE
+	dst.LB := MAX(r.LB, q);
+	dst.UB := MIN(r.UB, (q + size - 1));
+FI;
+	</operation>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="Other" rettype="void *" name="_bnd_copy_ptr_bounds" sequence="true">
+	<CPUID>MPX</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="q" type="const void *"/>
+	<parameter varname="r" type="const void *"/>
+	<description>Make a pointer with the value of "q" and bounds set to the bounds of "r" (i.e. copy the bounds of "r" to pointer "q"), and store the result in "dst".</description>
+	<operation>
+dst := q;
+dst.LB := r.LB;
+dst.UB := r.UB;
+	</operation>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="Other" rettype="void *" name="_bnd_init_ptr_bounds" sequence="true">
+	<CPUID>MPX</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="q" type="const void *"/>
+	<description>Make a pointer with the value of "q" and open bounds, which allow the pointer to access the entire virtual address space, and store the result in "dst".</description>
+	<operation>
+dst := q;
+dst.LB := 0;
+dst.UB := 0;
+	</operation>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="Other" rettype="void" name="_bnd_store_ptr_bounds">
+	<CPUID>MPX</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="ptr_addr" type="const void **"/>
+	<parameter varname="ptr_val" type="const void *"/>
+	<description>Stores the bounds of "ptr_val" pointer in memory at address "ptr_addr".</description>
+	<operation>
+MEM[ptr_addr].LB := ptr_val.LB;
+MEM[ptr_addr].UB := ptr_val.UB;
+	</operation>
+	<instruction name='bndstx' form='mib, bnd'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="Other" rettype="void" name="_bnd_chk_ptr_lbounds">
+	<CPUID>MPX</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="q" type="const void *"/>
+	<description>Checks if "q" is within its lower bound, and throws a #BR if not.</description>
+	<operation>
+IF q &lt; q.LB THEN
+	#BR;
+FI;
+	</operation>
+	<instruction name='bndcl' form='bnd, m32'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="Other" rettype="void" name="_bnd_chk_ptr_ubounds">
+	<CPUID>MPX</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="q" type="const void *"/>
+	<description>Checks if "q" is within its upper bound, and throws a #BR if not.</description>
+	<operation>
+IF q &gt; q.UB THEN
+	#BR;
+FI;
+	</operation>
+	<instruction name='bndcu' form='bnd, m32'/>
+	<instruction name='bndcn' form='bnd, m32'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="Other" rettype="void" name="_bnd_chk_ptr_bounds">
+	<CPUID>MPX</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="q" type="const void *"/>
+	<parameter varname="size" type="size_t"/>
+	<description>Checks if ["q", "q" + "size" - 1] is within the lower and upper bounds of "q" and throws a #BR if not.</description>
+	<operation>
+IF (q + size - 1) &lt; q.LB OR (q + size - 1) &gt; q.UB THEN
+	#BR;
+FI;
+	</operation>
+	<instruction name='bndcu' form='bnd, m32'/>
+	<instruction name='bndcn' form='bnd, m32'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="Other" rettype="const void *" name="_bnd_get_ptr_lbound" sequence="true">
+	<CPUID>MPX</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="q" type="const void *"/>
+	<description>Return the lower bound of "q".</description>
+	<operation>
+dst := q.LB
+	</operation>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="Other" rettype="const void *" name="_bnd_get_ptr_ubound" sequence="true">
+	<CPUID>MPX</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="q" type="const void *"/>
+	<description>Return the upper bound of "q".</description>
+	<operation>
+dst := q.UB
+	</operation>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="KNC" rettype="short" name="_loadbe_i16" sequence="true">
+	<category>Load</category>
+	<parameter varname="ptr" type="void const *"/>
+	<description>Loads a big-endian word (16-bit) value from address "ptr" and stores the result in "dst".
+	</description>
+	<operation>
+addr := MEM[ptr]
+FOR j := 0 to 1
+	i := j*8
+	dst[i+7:i] := addr[15-i:15-i-7]
+ENDFOR
+	</operation>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="KNC" rettype="int" name="_loadbe_i32" sequence="true">
+	<category>Load</category>
+	<parameter varname="ptr" type="void const *"/>
+	<description>Loads a big-endian double word (32-bit) value from address "ptr" and stores the result in "dst".
+	</description>
+	<operation>
+addr := MEM[ptr]
+FOR j := 0 to 3
+	i := j*8
+	dst[i+7:i] := addr[31-i:31-i-7]
+ENDFOR
+	</operation>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="KNC" rettype="__int64" name="_loadbe_i64" sequence="true">
+	<category>Load</category>
+	<parameter varname="ptr" type="void const *"/>
+	<description>Loads a big-endian quad word (64-bit) value from address "ptr" and stores the result in "dst".
+	</description>
+	<operation>
+addr := MEM[ptr]
+FOR j := 0 to 7
+	i := j*8
+	dst[i+7:i] := addr[63-i:63-i-7]
+ENDFOR
+	</operation>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="KNC" rettype="void" name="_storebe_i16" sequence="true">
+	<category>Store</category>
+	<parameter varname="ptr" type="void *"/>
+	<parameter varname="data" type="short"/>
+	<description>Stores word-sized (16-bit) "data" to address "ptr" in big-endian format.
+	</description>
+	<operation>
+addr := MEM[ptr]
+FOR j := 0 to 1
+	i := j*8
+	addr[i+7:i] := data[15-i:15-i-7]
+ENDFOR
+	</operation>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="KNC" rettype="void" name="_storebe_i32" sequence="true">
+	<category>Store</category>
+	<parameter varname="ptr" type="void *"/>
+	<parameter varname="data" type="int"/>
+	<description>Stores double word-sized (32-bit) "data" to address "ptr" in big-endian format.
+	</description>
+	<operation>
+addr := MEM[ptr]
+FOR j := 0 to 3
+	i := j*8
+	addr[i+7:i] := data[31-i:31-i-7]
+ENDFOR
+	</operation>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="KNC" rettype="void" name="_storebe_i64" sequence="true">
+	<category>Store</category>
+	<parameter varname="ptr" type="void *"/>
+	<parameter varname="data" type="__int64"/>
+	<description>Stores quad word-sized (64-bit) "data" to address "ptr" in big-endian format.
+	</description>
+	<operation>
+addr := MEM[ptr]
+FOR j := 0 to 7
+	i := j*8
+	addr[i+7:i] := data[63-i:63-i-7]
+ENDFOR
+	</operation>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="Other" rettype='void' name='_xsavec'>
+	<CPUID>XSAVE</CPUID>
+	<CPUID>XSAVEC</CPUID>
+	<category>OS-Targeted</category>
+	<parameter varname='mem_addr' type='void *'/>
+	<parameter varname='save_mask' type='unsigned __int64'/>
+	<description>Perform a full or partial save of the enabled processor states to memory at "mem_addr"; xsavec differs from xsave in that it uses compaction and that it may use init optimization. State is saved based on bits [62:0] in "save_mask" and "XCR0". "mem_addr" must be aligned on a 64-byte boundary.</description>
+	<operation>
+mask[62:0] := save_mask[62:0] BITWISE AND XCR0[62:0]
+FOR i := 0 to 62
+	IF mask[i]
+		CASE (i) OF
+		0: mem_addr.FPUSSESave_Area[FPU] := ProcessorState[x87 FPU]
+		1: mem_addr.FPUSSESave_Area[SSE] := ProcessorState[SSE]
+		DEFAULT: mem_addr.Ext_Save_Area[i] := ProcessorState[i]
+		ESAC
+		mem_addr.HEADER.XSTATE_BV[i] := INIT_FUNCTION[i]
+	FI
+	i := i + 1
+ENDFOR
+	</operation>
+	<instruction name='xsavec' form='MEMmxsave'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="Other" rettype='void' name='_xsaves'>
+	<CPUID>XSAVE</CPUID>
+	<CPUID>XSS</CPUID>
+	<category>OS-Targeted</category>
+	<parameter varname='mem_addr' type='void *'/>
+	<parameter varname='save_mask' type='unsigned __int64'/>
+	<description>Perform a full or partial save of the enabled processor states to memory at "mem_addr"; xsaves differs from xsave in that it can save state components corresponding to bits set in IA32_XSS MSR and that it may use the modified optimization. State is saved based on bits [62:0] in "save_mask" and "XCR0". "mem_addr" must be aligned on a 64-byte boundary.</description>
+	<operation>
+mask[62:0] := save_mask[62:0] BITWISE AND XCR0[62:0]
+FOR i := 0 to 62
+	IF mask[i]
+		CASE (i) OF
+		0: mem_addr.FPUSSESave_Area[FPU] := ProcessorState[x87 FPU]
+		1: mem_addr.FPUSSESave_Area[SSE] := ProcessorState[SSE]
+		DEFAULT: mem_addr.Ext_Save_Area[i] := ProcessorState[i]
+		ESAC
+		mem_addr.HEADER.XSTATE_BV[i] := INIT_FUNCTION[i]
+	FI
+	i := i + 1
+ENDFOR
+	</operation>
+	<instruction name='xsaves' form='MEMmxsave'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="Other" rettype='void' name='_xsavec64'>
+	<CPUID>XSAVE</CPUID>
+	<CPUID>XSAVEC</CPUID>
+	<category>OS-Targeted</category>
+	<parameter varname='mem_addr' type='void *'/>
+	<parameter varname='save_mask' type='unsigned __int64'/>
+	<description>Perform a full or partial save of the enabled processor states to memory at "mem_addr"; xsavec differs from xsave in that it uses compaction and that it may use init optimization. State is saved based on bits [62:0] in "save_mask" and "XCR0". "mem_addr" must be aligned on a 64-byte boundary.</description>
+	<operation>
+mask[62:0] := save_mask[62:0] BITWISE AND XCR0[62:0]
+FOR i := 0 to 62
+	IF mask[i]
+		CASE (i) OF
+		0: mem_addr.FPUSSESave_Area[FPU] := ProcessorState[x87 FPU]
+		1: mem_addr.FPUSSESave_Area[SSE] := ProcessorState[SSE]
+		DEFAULT: mem_addr.Ext_Save_Area[i] := ProcessorState[i]
+		ESAC
+		mem_addr.HEADER.XSTATE_BV[i] := INIT_FUNCTION[i]
+	FI
+	i := i + 1
+ENDFOR
+	</operation>
+	<instruction name='xsavec64' form='MEMmxsave'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="Other" rettype='void' name='_xsaves64'>
+	<CPUID>XSAVE</CPUID>
+	<CPUID>XSS</CPUID>
+	<category>OS-Targeted</category>
+	<parameter varname='mem_addr' type='void *'/>
+	<parameter varname='save_mask' type='unsigned __int64'/>
+	<description>Perform a full or partial save of the enabled processor states to memory at "mem_addr"; xsaves differs from xsave in that it can save state components corresponding to bits set in IA32_XSS MSR and that it may use the modified optimization. State is saved based on bits [62:0] in "save_mask" and "XCR0". "mem_addr" must be aligned on a 64-byte boundary.</description>
+	<operation>
+mask[62:0] := save_mask[62:0] BITWISE AND XCR0[62:0]
+FOR i := 0 to 62
+	IF mask[i]
+		CASE (i) OF
+		0: mem_addr.FPUSSESave_Area[FPU] := ProcessorState[x87 FPU]
+		1: mem_addr.FPUSSESave_Area[SSE] := ProcessorState[SSE]
+		DEFAULT: mem_addr.Ext_Save_Area[i] := ProcessorState[i]
+		ESAC
+		mem_addr.HEADER.XSTATE_BV[i] := INIT_FUNCTION[i]
+	FI
+	i := i + 1
+ENDFOR
+	</operation>
+	<instruction name='xsaves64' form='MEMmxsave'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="Other" rettype='void' name='_xrstors'>
+	<CPUID>XSAVE</CPUID>
+	<CPUID>XSS</CPUID>
+	<category>OS-Targeted</category>
+	<parameter varname='mem_addr' type='const void *'/>
+	<parameter varname='rs_mask' type='unsigned __int64'/>
+	<description>Perform a full or partial restore of the enabled processor states using the state information stored in memory at "mem_addr". xrstors differs from xrstor in that it can restore state components corresponding to bits set in the IA32_XSS MSR; xrstors cannot restore from an xsave area in which the extended region is in the standard form. State is restored based on bits [62:0] in "rs_mask", "XCR0", and "mem_addr.HEADER.XSTATE_BV". "mem_addr" must be aligned on a 64-byte boundary.</description>
+	<operation>
+st_mask := mem_addr.HEADER.XSTATE_BV[62:0]
+FOR i := 0 to 62
+	IF (rs_mask[i] AND XCR0[i])
+		IF st_mask[i]
+			CASE (i) OF
+			0: ProcessorState[x87 FPU] := mem_addr.FPUSSESave_Area[FPU]
+			1: ProcessorState[SSE] := mem_addr.FPUSSESave_Area[SSE]
+			DEFAULT: ProcessorState[i] := mem_addr.Ext_Save_Area[i]
+			ESAC
+		ELSE
+			// ProcessorExtendedState := Processor Supplied Values
+			CASE (i) OF
+			1: MXCSR := mem_addr.FPUSSESave_Area[SSE]
+			ESAC
+		FI
+	FI
+	i := i + 1
+ENDFOR
+	</operation>
+	<instruction name='xrstors' form='MEMmxsave'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="Other" rettype='void' name='_xrstors64'>
+	<CPUID>XSAVE</CPUID>
+	<CPUID>XSS</CPUID>
+	<category>OS-Targeted</category>
+	<parameter varname='mem_addr' type='const void *'/>
+	<parameter varname='rs_mask' type='unsigned __int64'/>
+	<description>Perform a full or partial restore of the enabled processor states using the state information stored in memory at "mem_addr". xrstors differs from xrstor in that it can restore state components corresponding to bits set in the IA32_XSS MSR; xrstors cannot restore from an xsave area in which the extended region is in the standard form. State is restored based on bits [62:0] in "rs_mask", "XCR0", and "mem_addr.HEADER.XSTATE_BV". "mem_addr" must be aligned on a 64-byte boundary.</description>
+	<operation>
+st_mask := mem_addr.HEADER.XSTATE_BV[62:0]
+FOR i := 0 to 62
+	IF (rs_mask[i] AND XCR0[i])
+		IF st_mask[i]
+			CASE (i) OF
+			0: ProcessorState[x87 FPU] := mem_addr.FPUSSESave_Area[FPU]
+			1: ProcessorState[SSE] := mem_addr.FPUSSESave_Area[SSE]
+			DEFAULT: ProcessorState[i] := mem_addr.Ext_Save_Area[i]
+			ESAC
+		ELSE
+			// ProcessorExtendedState := Processor Supplied Values
+			CASE (i) OF
+			1: MXCSR := mem_addr.FPUSSESave_Area[SSE]
+			ESAC
+		FI
+	FI
+	i := i + 1
+ENDFOR
+	</operation>
+	<instruction name='xrstors64' form='MEMmxsave'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='AVX' vexEq='TRUE' rettype='float' name='_mm256_cvtss_f32'>
+	<type>Floating Point</type>
+	<CPUID>AVX</CPUID>
+	<category>Convert</category>
+	<parameter varname='a' type='__m256' />
+	<description>Copy the lower single-precision (32-bit) floating-point element of "a" to "dst".</description>
+	<operation>dst[31:0] := a[31:0]</operation>
+	<instruction name='movss' form='m32, xmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='AVX2' vexEq='TRUE' rettype='double' name='_mm256_cvtsd_f64'>
+	<type>Floating Point</type>
+	<CPUID>AVX2</CPUID>
+	<category>Convert</category>
+	<parameter varname='a' type='__m256d'/>
+	<description>Copy the lower double-precision (64-bit) floating-point element of "a" to "dst".</description>
+	<operation>dst[63:0] := a[63:0]</operation>
+	<instruction name='movsd' form='m64, xmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='AVX2' vexEq='TRUE' rettype='int' name='_mm256_cvtsi256_si32'>
+	<type>Integer</type>
+	<CPUID>AVX2</CPUID>
+	<category>Convert</category>
+	<parameter varname='a' type='__m256i'/>
+	<description>Copy the lower 32-bit integer in "a" to "dst".</description>
+	<operation>
+dst[31:0] := a[31:0]
+	</operation>
+	<instruction name='movd' form='r32, xmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+
+
+<intrinsic tech="AVX-512" rettype="__mmask16" name="_mm512_kandn">
+	<type>Mask</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Mask</category>
+	<parameter varname="a" type="__mmask16"/>
+	<parameter varname="b" type="__mmask16"/>
+	<description>Compute the bitwise NOT of 16-bit mask "a" and then AND with "b", and store the result in "k".</description>
+	<operation>
+k[15:0] := (NOT a[15:0]) AND b[15:0]
+k[MAX:16] := 0
+	</operation>
+	<instruction name='kandnw' form='k, k, k'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="KNC" rettype="__mmask16" name="_mm512_kandn">
+	<type>Mask</type>
+	<CPUID>KNCNI</CPUID>
+	<category>Mask</category>
+	<parameter varname="a" type="__mmask16"/>
+	<parameter varname="b" type="__mmask16"/>
+	<description>Compute the bitwise NOT of 16-bit mask "a" and then AND with "b", and store the result in "k".</description>
+	<operation>
+k[15:0] := (NOT a[15:0]) AND b[15:0]
+k[MAX:16] := 0
+	</operation>
+	<instruction name='kandn' form='k, k'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__mmask16" name="_mm512_kand">
+	<type>Mask</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Mask</category>
+	<parameter varname="a" type="__mmask16"/>
+	<parameter varname="b" type="__mmask16"/>
+	<description>Compute the bitwise AND of 16-bit masks "a" and "b", and store the result in "k".</description>
+	<operation>
+k[15:0] := a[15:0] AND b[15:0]
+k[MAX:16] := 0
+	</operation>
+	<instruction name='kandw' form='k, k, k'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="KNC" rettype="__mmask16" name="_mm512_kand">
+	<type>Mask</type>
+	<CPUID>KNCNI</CPUID>
+	<category>Mask</category>
+	<parameter varname="a" type="__mmask16"/>
+	<parameter varname="b" type="__mmask16"/>
+	<description>Compute the bitwise AND of 16-bit masks "a" and "b", and store the result in "k".</description>
+	<operation>
+k[15:0] := a[15:0] AND b[15:0]
+k[MAX:16] := 0
+	</operation>
+	<instruction name='kand' form='k, k'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__mmask16" name="_mm512_kmov">
+	<type>Mask</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Mask</category>
+	<parameter varname="a" type="__mmask16"/>
+	<description>Copy 16-bit mask "a" to "k".</description>
+	<operation>
+k[15:0] := a[15:0]
+k[MAX:16] := 0
+	</operation>
+	<instruction name='kmovw' form='k, k'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="KNC" rettype="__mmask16" name="_mm512_kmov">
+	<type>Mask</type>
+	<CPUID>KNCNI</CPUID>
+	<category>Mask</category>
+	<parameter varname="a" type="__mmask16"/>
+	<description>Copy 16-bit mask "a" to "k".</description>
+	<operation>
+k[15:0] := a[15:0]
+k[MAX:16] := 0
+	</operation>
+	<instruction name='kmov' form='k, k'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__mmask16" name="_mm512_knot">
+	<type>Mask</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Mask</category>
+	<parameter varname="a" type="__mmask16"/>
+	<description>Compute the bitwise NOT of 16-bit mask "a", and store the result in "k".</description>
+	<operation>
+k[15:0] := NOT a[15:0]
+k[MAX:16] := 0
+	</operation>
+	<instruction name='knotw' form='k, k'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="KNC" rettype="__mmask16" name="_mm512_knot">
+	<type>Mask</type>
+	<CPUID>KNCNI</CPUID>
+	<category>Mask</category>
+	<parameter varname="a" type="__mmask16"/>
+	<description>Compute the bitwise NOT of 16-bit mask "a", and store the result in "k".</description>
+	<operation>
+k[15:0] := NOT a[15:0]
+k[MAX:16] := 0
+	</operation>
+	<instruction name='knot' form='k, k'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__mmask16" name="_mm512_kor">
+	<type>Mask</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Mask</category>
+	<parameter varname="a" type="__mmask16"/>
+	<parameter varname="b" type="__mmask16"/>
+	<description>Compute the bitwise OR of 16-bit masks "a" and "b", and store the result in "k".</description>
+	<operation>
+k[15:0] := a[15:0] OR b[15:0]
+k[MAX:16] := 0
+	</operation>
+	<instruction name='korw' form='k, k, k'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="KNC" rettype="__mmask16" name="_mm512_kor">
+	<type>Mask</type>
+	<CPUID>KNCNI</CPUID>
+	<category>Mask</category>
+	<parameter varname="a" type="__mmask16"/>
+	<parameter varname="b" type="__mmask16"/>
+	<description>Compute the bitwise OR of 16-bit masks "a" and "b", and store the result in "k".</description>
+	<operation>
+k[15:0] := a[15:0] OR b[15:0]
+k[MAX:16] := 0
+	</operation>
+	<instruction name='kor' form='k, k'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__mmask16" name="_mm512_kunpackb">
+	<type>Mask</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Mask</category>
+	<parameter varname="a" type="__mmask16"/>
+	<parameter varname="b" type="__mmask16"/>
+	<description>Unpack and interleave 8 bits from masks "a" and "b", and store the 16-bit result in "k".</description>
+	<operation>
+k[7:0] := b[7:0]
+k[15:8] := a[7:0]
+k[MAX:16] := 0
+	</operation>
+	<instruction name='kunpckbw' form='k, k, k'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__mmask16" name="_mm512_kxnor">
+	<type>Mask</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Mask</category>
+	<parameter varname="a" type="__mmask16"/>
+	<parameter varname="b" type="__mmask16"/>
+	<description>Compute the bitwise XNOR of 16-bit masks "a" and "b", and store the result in "k".</description>
+	<operation>
+k[15:0] := NOT (a[15:0] XOR b[15:0])
+k[MAX:16] := 0
+	</operation>
+	<instruction name='kxnorw' form='k, k, k'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="KNC" rettype="__mmask16" name="_mm512_kxnor">
+	<type>Mask</type>
+	<CPUID>KNCNI</CPUID>
+	<category>Mask</category>
+	<parameter varname="a" type="__mmask16"/>
+	<parameter varname="b" type="__mmask16"/>
+	<description>Compute the bitwise XNOR of 16-bit masks "a" and "b", and store the result in "k".</description>
+	<operation>
+k[15:0] := NOT (a[15:0] XOR b[15:0])
+k[MAX:16] := 0
+	</operation>
+	<instruction name='kxnor' form='k, k'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__mmask16" name="_mm512_kxor">
+	<type>Mask</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Mask</category>
+	<parameter varname="a" type="__mmask16"/>
+	<parameter varname="b" type="__mmask16"/>
+	<description>Compute the bitwise XOR of 16-bit masks "a" and "b", and store the result in "k".</description>
+	<operation>
+k[15:0] := a[15:0] XOR b[15:0]
+k[MAX:16] := 0
+	</operation>
+	<instruction name='kxorw' form='k, k, k'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="KNC" rettype="__mmask16" name="_mm512_kxor">
+	<type>Mask</type>
+	<CPUID>KNCNI</CPUID>
+	<category>Mask</category>
+	<parameter varname="a" type="__mmask16"/>
+	<parameter varname="b" type="__mmask16"/>
+	<description>Compute the bitwise XOR of 16-bit masks "a" and "b", and store the result in "k".</description>
+	<operation>
+k[15:0] := a[15:0] XOR b[15:0]
+k[MAX:16] := 0
+	</operation>
+	<instruction name='kxor' form='k, k'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" rettype="__m512d" name="_mm512_add_pd">
+	<type>Floating Point</type>
+	<CPUID>AVX512F/KNCNI</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="a" type="__m512d"/>
+	<parameter varname="b" type="__m512d"/>
+	<description>Add packed double-precision (64-bit) floating-point elements in "a" and "b", and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*64
+	dst[i+63:i] := a[i+63:i] + b[i+63:i]
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vaddpd' form='zmm {k}, zmm, zmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" rettype="__m512d" name="_mm512_add_round_pd">
+	<type>Floating Point</type>
+	<CPUID>AVX512F/KNCNI</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="a" type="__m512d"/>
+	<parameter varname="b" type="__m512d"/>
+	<parameter varname="rounding" type="int"/>
+	<description>Add packed double-precision (64-bit) floating-point elements in "a" and "b", and store the results in "dst".
+	[round_note]
+	</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*64
+	dst[i+63:i] := a[i+63:i] + b[i+63:i]
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vaddpd' form='zmm {k}, zmm, zmm {er}'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" rettype="__m512d" name="_mm512_mask_add_pd">
+	<type>Floating Point</type>
+	<CPUID>AVX512F/KNCNI</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="src" type="__m512d"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m512d"/>
+	<parameter varname="b" type="__m512d"/>
+	<description>Add packed double-precision (64-bit) floating-point elements in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 7
+	i := j*64
+	IF k[j]
+		dst[i+63:i] := a[i+63:i] + b[i+63:i]
+	ELSE
+		dst[i+63:i] := src[i+63:i]
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vaddpd' form='zmm {k}, zmm, zmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" rettype="__m512d" name="_mm512_mask_add_round_pd">
+	<type>Floating Point</type>
+	<CPUID>AVX512F/KNCNI</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="src" type="__m512d"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m512d"/>
+	<parameter varname="b" type="__m512d"/>
+	<parameter varname="rounding" type="int"/>
+	<description>Add packed double-precision (64-bit) floating-point elements in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). 
+	[round_note]
+	</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*64
+	IF k[j]
+		dst[i+63:i] := a[i+63:i] + b[i+63:i]
+	ELSE
+		dst[i+63:i] := src[i+63:i]
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vaddpd' form='zmm {k}, zmm, zmm {er}'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512d" name="_mm512_maskz_add_pd">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m512d"/>
+	<parameter varname="b" type="__m512d"/>
+	<description>Add packed double-precision (64-bit) floating-point elements in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*64
+	IF k[j]
+		dst[i+63:i] := a[i+63:i] + b[i+63:i]
+	ELSE
+		dst[i+63:i] := 0
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vaddpd' form='zmm {k}, zmm, zmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512d" name="_mm512_maskz_add_round_pd">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m512d"/>
+	<parameter varname="b" type="__m512d"/>
+	<parameter varname="rounding" type="int"/>
+	<description>Add packed double-precision (64-bit) floating-point elements in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).
+	[round_note]
+	</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*64
+	IF k[j]
+		dst[i+63:i] := a[i+63:i] + b[i+63:i]
+	ELSE
+		dst[i+63:i] := 0
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vaddpd' form='zmm {k}, zmm, zmm {er}'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" rettype="__m512" name="_mm512_add_ps">
+	<type>Floating Point</type>
+	<CPUID>AVX512F/KNCNI</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="a" type="__m512"/>
+	<parameter varname="b" type="__m512"/>
+	<description>Add packed single-precision (32-bit) floating-point elements in "a" and "b", and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 15
+	i := j*32
+	dst[i+31:i] := a[i+31:i] + b[i+31:i]
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vaddps' form='zmm {k}, zmm, zmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" rettype="__m512" name="_mm512_add_round_ps">
+	<type>Floating Point</type>
+	<CPUID>AVX512F/KNCNI</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="a" type="__m512"/>
+	<parameter varname="b" type="__m512"/>
+	<parameter varname="rounding" type="int"/>
+	<description>Add packed single-precision (32-bit) floating-point elements in "a" and "b", and store the results in "dst".
+	[round_note]
+	</description>
+	<operation>
+FOR j := 0 to 15
+	i := j*32
+	dst[i+31:i] := a[i+31:i] + b[i+31:i]
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vaddps' form='zmm {k}, zmm, zmm {er}'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" rettype="__m512" name="_mm512_mask_add_ps">
+	<type>Floating Point</type>
+	<CPUID>AVX512F/KNCNI</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="src" type="__m512"/>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="a" type="__m512"/>
+	<parameter varname="b" type="__m512"/>
+	<description>Add packed single-precision (32-bit) floating-point elements in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 15
+	i := j*32
+	IF k[j]
+		dst[i+31:i] := a[i+31:i] + b[i+31:i]
+	ELSE
+		dst[i+31:i] := src[i+31:i]
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vaddps' form='zmm {k}, zmm, zmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" rettype="__m512" name="_mm512_mask_add_round_ps">
+	<type>Floating Point</type>
+	<CPUID>AVX512F/KNCNI</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="src" type="__m512"/>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="a" type="__m512"/>
+	<parameter varname="b" type="__m512"/>
+	<parameter varname="rounding" type="int"/>
+	<description>Add packed single-precision (32-bit) floating-point elements in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). 
+	[round_note]
+	</description>
+	<operation>
+FOR j := 0 to 15
+	i := j*32
+	IF k[j]
+		dst[i+31:i] := a[i+31:i] + b[i+31:i]
+	ELSE
+		dst[i+31:i] := src[i+31:i]
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vaddps' form='zmm {k}, zmm, zmm {er}'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512" name="_mm512_maskz_add_ps">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="a" type="__m512"/>
+	<parameter varname="b" type="__m512"/>
+	<description>Add packed single-precision (32-bit) floating-point elements in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 15
+	i := j*32
+	IF k[j]
+		dst[i+31:i] := a[i+31:i] + b[i+31:i]
+	ELSE
+		dst[i+31:i] := 0
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vaddps' form='zmm {k}, zmm, zmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512" name="_mm512_maskz_add_round_ps">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="a" type="__m512"/>
+	<parameter varname="b" type="__m512"/>
+	<parameter varname="rounding" type="int"/>
+	<description>Add packed single-precision (32-bit) floating-point elements in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).
+	[round_note]
+	</description>
+	<operation>
+FOR j := 0 to 15
+	i := j*32
+	IF k[j]
+		dst[i+31:i] := a[i+31:i] + b[i+31:i]
+	ELSE
+		dst[i+31:i] := 0
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vaddps' form='zmm {k}, zmm, zmm {er}'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128d" name="_mm_add_round_sd">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="a" type="__m128d"/>
+	<parameter varname="b" type="__m128d"/>
+	<parameter varname="rounding" type="int"/>
+	<description>Add the lower double-precision (64-bit) floating-point element in "a" and "b", store the result in the lower element of "dst", and copy the upper element from "a" to the upper element of "dst".
+	[round_note]
+	</description>
+	<operation>
+dst[63:0] := a[63:0] + b[63:0]
+dst[127:64] := a[127:64]
+dst[MAX:128] := 0
+	</operation>
+	<instruction name='vaddsd' form='xmm {k}, xmm, xmm {er}'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128d" name="_mm_mask_add_round_sd">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="src" type="__m128d"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128d"/>
+	<parameter varname="b" type="__m128d"/>
+	<parameter varname="rounding" type="int"/>
+	<description>Add the lower double-precision (64-bit) floating-point element in "a" and "b", store the result in the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst".
+		[round_note]
+		</description>
+	<operation>
+IF k[0]
+	dst[63:0] := a[63:0] + b[63:0]
+ELSE
+	dst[63:0] := src[63:0]
+FI
+dst[127:64] := a[127:64]
+dst[MAX:128] := 0
+	</operation>
+	<instruction name='vaddsd' form='xmm {k}, xmm, xmm {er}'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128d" name="_mm_mask_add_sd">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="src" type="__m128d"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128d"/>
+	<parameter varname="b" type="__m128d"/>
+	<description>Add the lower double-precision (64-bit) floating-point element in "a" and "b", store the result in the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst". </description>
+	<operation>
+IF k[0]
+	dst[63:0] := a[63:0] + b[63:0]
+ELSE
+	dst[63:0] := src[63:0]
+FI
+dst[127:64] := a[127:64]
+dst[MAX:128] := 0
+	</operation>
+	<instruction name='vaddsd' form='xmm {k}, xmm, xmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128d" name="_mm_maskz_add_round_sd">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128d"/>
+	<parameter varname="b" type="__m128d"/>
+	<parameter varname="rounding" type="int"/>
+	<description>Add the lower double-precision (64-bit) floating-point element in "a" and "b", store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst".
+		[round_note]
+		</description>
+	<operation>
+IF k[0]
+	dst[63:0] := a[63:0] + b[63:0]
+ELSE
+	dst[63:0] := 0
+FI
+dst[127:64] := a[127:64]
+dst[MAX:128] := 0
+	</operation>
+	<instruction name='vaddsd' form='xmm {k}, xmm, xmm {er}'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128d" name="_mm_maskz_add_sd">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128d"/>
+	<parameter varname="b" type="__m128d"/>
+	<description>Add the lower double-precision (64-bit) floating-point element in "a" and "b", store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst".</description>
+	<operation>
+IF k[0]
+	dst[63:0] := a[63:0] + b[63:0]
+ELSE
+	dst[63:0] := 0
+FI
+dst[127:64] := a[127:64]
+dst[MAX:128] := 0
+	</operation>
+	<instruction name='vaddsd' form='xmm {k}, xmm, xmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128" name="_mm_add_round_ss">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="a" type="__m128"/>
+	<parameter varname="b" type="__m128"/>
+	<parameter varname="rounding" type="int"/>
+	<description>Add the lower single-precision (32-bit) floating-point element in "a" and "b", store the result in the lower element of "dst", and copy the upper 3 packed elements from "a" to the upper elements of "dst".
+		[round_note]
+		</description>
+	<operation>
+dst[31:0] := a[31:0] + b[31:0]
+dst[127:32] := a[127:32]
+dst[MAX:128] := 0
+	</operation>
+	<instruction name='vaddss' form='xmm {k}, xmm, xmm {er}'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128" name="_mm_mask_add_round_ss">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="src" type="__m128"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128"/>
+	<parameter varname="b" type="__m128"/>
+	<parameter varname="rounding" type="int"/>
+	<description>Add the lower single-precision (32-bit) floating-point element in "a" and "b", store the result in the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper 3 packed elements from "a" to the upper elements of "dst". 
+		[round_note]
+		</description>
+	<operation>
+IF k[0]
+	dst[31:0] := a[31:0] + b[31:0]
+ELSE
+	dst[31:0] := src[31:0]
+FI
+dst[127:32] := a[127:32]
+dst[MAX:128] := 0
+	</operation>
+	<instruction name='vaddss' form='xmm {k}, xmm, xmm {er}'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128" name="_mm_mask_add_ss">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="src" type="__m128"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128"/>
+	<parameter varname="b" type="__m128"/>
+	<description>Add the lower single-precision (32-bit) floating-point element in "a" and "b", store the result in the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper 3 packed elements from "a" to the upper elements of "dst". </description>
+	<operation>
+IF k[0]
+	dst[31:0] := a[31:0] + b[31:0]
+ELSE
+	dst[31:0] := src[31:0]
+FI
+dst[127:32] := a[127:32]
+dst[MAX:128] := 0
+	</operation>
+	<instruction name='vaddss' form='xmm {k}, xmm, xmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128" name="_mm_maskz_add_round_ss">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128"/>
+	<parameter varname="b" type="__m128"/>
+	<parameter varname="rounding" type="int"/>
+	<description>Add the lower single-precision (32-bit) floating-point element in "a" and "b", store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from "a" to the upper elements of "dst".
+	[round_note]
+	</description>
+	<operation>
+IF k[0]
+	dst[31:0] := a[31:0] + b[31:0]
+ELSE
+	dst[31:0] := 0
+FI
+dst[127:32] := a[127:32]
+dst[MAX:128] := 0
+	</operation>
+	<instruction name='vaddss' form='xmm {k}, xmm, xmm {er}'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128" name="_mm_maskz_add_ss">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128"/>
+	<parameter varname="b" type="__m128"/>
+	<description>Add the lower single-precision (32-bit) floating-point element in "a" and "b", store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from "a" to the upper elements of "dst".</description>
+	<operation>
+IF k[0]
+	dst[31:0] := a[31:0] + b[31:0]
+ELSE
+	dst[31:0] := 0
+FI
+dst[127:32] := a[127:32]
+dst[MAX:128] := 0
+	</operation>
+	<instruction name='vaddss' form='xmm {k}, xmm, xmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" rettype="__m512i" name="_mm512_alignr_epi32">
+	<type>Integer</type>
+	<CPUID>AVX512F/KNCNI</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="a" type="__m512i"/>
+	<parameter varname="b" type="__m512i"/>
+	<parameter varname="count" type="const int"/>
+	<description>Concatenate "a" and "b" into a 128-byte intermediate result, shift the result right by "count" 32-bit elements, and store the low 64 bytes (16 elements) in "dst".</description>
+	<operation>
+temp[1023:512] := a[511:0]
+temp[511:0] := b[511:0]
+temp[1023:0] := temp[1023:0] &gt;&gt; (32*count)
+dst[511:0] := temp[511:0]
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='valignd' form='zmm {k}, zmm, zmm, imm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" rettype="__m512i" name="_mm512_mask_alignr_epi32">
+	<type>Integer</type>
+	<CPUID>AVX512F/KNCNI</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="src" type="__m512i"/>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="a" type="__m512i"/>
+	<parameter varname="b" type="__m512i"/>
+	<parameter varname="count" type="const int"/>
+	<description>Concatenate "a" and "b" into a 128-byte intermediate result, shift the result right by "count" 32-bit elements, and store the low 64 bytes (16 elements) in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+	<operation>
+temp[1023:512] := a[511:0]
+temp[511:0] := b[511:0]
+temp[1023:0] := temp[1023:0] &gt;&gt; (32*count)
+FOR j := 0 to 15
+	i := j*32
+	IF k[j]
+		dst[i+31:i] := temp[i+31:i]
+	ELSE
+		dst[i+31:i] := src[i+31:i]
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='valignd' form='zmm {k}, zmm, zmm, imm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512i" name="_mm512_maskz_alignr_epi32">
+	<type>Integer</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="a" type="__m512i"/>
+	<parameter varname="b" type="__m512i"/>
+	<parameter varname="count" type="const int"/>
+	<description>Concatenate "a" and "b" into a 128-byte intermediate result, shift the result right by "count" 32-bit elements, and store the low 64 bytes (16 elements) in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+	<operation>
+temp[1023:512] := a[511:0]
+temp[511:0] := b[511:0]
+temp[1023:0] := temp[1023:0] &gt;&gt; (32*count)
+FOR j := 0 to 15
+	i := j*32
+	IF k[j]
+		dst[i+31:i] := temp[i+31:i]
+	ELSE
+		dst[i+31:i] := 0
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='valignd' form='zmm {k}, zmm, zmm, imm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512i" name="_mm512_alignr_epi64">
+	<type>Integer</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="a" type="__m512i"/>
+	<parameter varname="b" type="__m512i"/>
+	<parameter varname="count" type="const int"/>
+	<description>Concatenate "a" and "b" into a 128-byte intermediate result, shift the result right by "count" 64-bit elements, and store the low 64 bytes (8 elements) in "dst".</description>
+	<operation>
+temp[1023:512] := a[511:0]
+temp[511:0] := b[511:0]
+temp[1023:0] := temp[1023:0] &gt;&gt; (64*count)
+dst[511:0] := temp[511:0]
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='valignq' form='zmm {k}, zmm, zmm, imm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512i" name="_mm512_mask_alignr_epi64">
+	<type>Integer</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="src" type="__m512i"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m512i"/>
+	<parameter varname="b" type="__m512i"/>
+	<parameter varname="count" type="const int"/>
+	<description>Concatenate "a" and "b" into a 128-byte intermediate result, shift the result right by "count" 64-bit elements, and store the low 64 bytes (8 elements) in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+	<operation>
+temp[1023:512] := a[511:0]
+temp[511:0] := b[511:0]
+temp[1023:0] := temp[1023:0] &gt;&gt; (64*count)
+FOR j := 0 to 7
+	i := j*64
+	IF k[j]
+		dst[i+63:i] := temp[i+63:i]
+	ELSE
+		dst[i+63:i] := src[i+63:i]
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='valignq' form='zmm {k}, zmm, zmm, imm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512i" name="_mm512_maskz_alignr_epi64">
+	<type>Integer</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m512i"/>
+	<parameter varname="b" type="__m512i"/>
+	<parameter varname="count" type="const int"/>
+	<description>Concatenate "a" and "b" into a 128-byte intermediate result, shift the result right by "count" 64-bit elements, and store the low 64 bytes (8 elements) in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+	<operation>
+temp[1023:512] := a[511:0]
+temp[511:0] := b[511:0]
+temp[1023:0] := temp[1023:0] &gt;&gt; (64*count)
+FOR j := 0 to 7
+	i := j*64
+	IF k[j]
+		dst[i+63:i] := temp[i+63:i]
+	ELSE
+		dst[i+63:i] := 0
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='valignq' form='zmm {k}, zmm, zmm, imm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" rettype="__m512d" name="_mm512_mask_blend_pd">
+	<type>Floating Point</type>
+	<CPUID>AVX512F/KNCNI</CPUID>
+	<category>Swizzle</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m512d"/>
+	<parameter varname="b" type="__m512d"/>
+	<description>Blend packed double-precision (64-bit) floating-point elements from "a" and "b" using control mask "k", and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*64
+	IF k[j]
+		dst[i+63:i] := b[i+63:i]
+	ELSE
+		dst[i+63:i] := a[i+63:i]
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vblendmpd' form='zmm {k}, zmm, zmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" rettype="__m512" name="_mm512_mask_blend_ps">
+	<type>Floating Point</type>
+	<CPUID>AVX512F/KNCNI</CPUID>
+	<category>Swizzle</category>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="a" type="__m512"/>
+	<parameter varname="b" type="__m512"/>
+	<description>Blend packed single-precision (32-bit) floating-point elements from "a" and "b" using control mask "k", and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 15
+	i := j*32
+	IF k[j]
+		dst[i+31:i] := b[i+31:i]
+	ELSE
+		dst[i+31:i] := a[i+31:i]
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vblendmps' form='zmm {k}, zmm, zmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512" name="_mm512_broadcast_f32x4">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Swizzle</category>
+	<parameter varname="a" type="__m128"/>
+	<description>Broadcast the 4 packed single-precision (32-bit) floating-point elements from "a" to all elements of "dst".</description>
+	<operation>
+FOR j := 0 to 15
+	i := j*32
+	n := (j mod 4)*32
+	dst[i+31:i] := a[n+31:n]
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vbroadcastf32x4' form='zmm {k}, m128'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512" name="_mm512_mask_broadcast_f32x4">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Swizzle</category>
+	<parameter varname="src" type="__m512"/>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="a" type="__m128"/>
+	<description>Broadcast the 4 packed single-precision (32-bit) floating-point elements from "a" to all elements of "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 15
+	i := j*32
+	n := (j mod 4)*32
+	IF k[j]
+		dst[i+31:i] := a[n+31:n]
+	ELSE
+		dst[i+31:i] := src[i+31:i]
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vbroadcastf32x4' form='zmm {k}, m128'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512" name="_mm512_maskz_broadcast_f32x4">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Swizzle</category>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="a" type="__m128"/>
+	<description>Broadcast the 4 packed single-precision (32-bit) floating-point elements from "a" to all elements of "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 15
+	i := j*32
+	n := (j mod 4)*32
+	IF k[j]
+		dst[i+31:i] := a[n+31:n]
+	ELSE
+		dst[i+31:i] := 0
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vbroadcastf32x4' form='zmm {k}, m128'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512d" name="_mm512_broadcast_f64x4">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Swizzle</category>
+	<parameter varname="a" type="__m256d"/>
+	<description>Broadcast the 4 packed double-precision (64-bit) floating-point elements from "a" to all elements of "dst".</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*64
+	n := (j mod 4)*64
+	dst[i+63:i] := a[n+63:n]
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vbroadcastf64x4' form='zmm {k}, m256'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512d" name="_mm512_mask_broadcast_f64x4">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Swizzle</category>
+	<parameter varname="src" type="__m512d"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m256d"/>
+	<description>Broadcast the 4 packed double-precision (64-bit) floating-point elements from "a" to all elements of "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 7
+	i := j*64
+	n := (j mod 4)*64
+	IF k[j]
+		dst[i+63:i] := a[n+63:n]
+	ELSE
+		dst[i+63:i] := src[i+63:i]
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vbroadcastf64x4' form='zmm {k}, m256'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512d" name="_mm512_maskz_broadcast_f64x4">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Swizzle</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m256d"/>
+	<description>Broadcast the 4 packed double-precision (64-bit) floating-point elements from "a" to all elements of "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*64
+	n := (j mod 4)*64
+	IF k[j]
+		dst[i+63:i] := a[n+63:n]
+	ELSE
+		dst[i+63:i] := 0
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vbroadcastf64x4' form='zmm {k}, m256'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512i" name="_mm512_broadcast_i32x4">
+	<type>Integer</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Swizzle</category>
+	<parameter varname="a" type="__m128i"/>
+	<description>Broadcast the 4 packed 32-bit integers from "a" to all elements of "dst".</description>
+	<operation>
+FOR j := 0 to 15
+	i := j*32
+	n := (j mod 4)*32
+	dst[i+31:i] := a[n+31:n]
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vbroadcasti32x4' form='zmm {k}, m128'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512i" name="_mm512_mask_broadcast_i32x4">
+	<type>Integer</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Swizzle</category>
+	<parameter varname="src" type="__m512i"/>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="a" type="__m128i"/>
+	<description>Broadcast the 4 packed 32-bit integers from "a" to all elements of "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 15
+	i := j*32
+	n := (j mod 4)*32
+	IF k[j]
+		dst[i+31:i] := a[n+31:n]
+	ELSE
+		dst[i+31:i] := src[i+31:i]
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vbroadcasti32x4' form='zmm {k}, m128'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512i" name="_mm512_maskz_broadcast_i32x4">
+	<type>Integer</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Swizzle</category>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="a" type="__m128i"/>
+	<description>Broadcast the 4 packed 32-bit integers from "a" to all elements of "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 15
+	i := j*32
+	n := (j mod 4)*32
+	IF k[j]
+		dst[i+31:i] := a[n+31:n]
+	ELSE
+		dst[i+31:i] := 0
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vbroadcasti32x4' form='zmm {k}, m128'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512i" name="_mm512_broadcast_i64x4">
+	<type>Integer</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Swizzle</category>
+	<parameter varname="a" type="__m256i"/>
+	<description>Broadcast the 4 packed 64-bit integers from "a" to all elements of "dst".</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*64
+	n := (j mod 4)*64
+	dst[i+63:i] := a[n+63:n]
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vbroadcasti64x4' form='zmm {k}, m256'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512i" name="_mm512_mask_broadcast_i64x4">
+	<type>Integer</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Swizzle</category>
+	<parameter varname="src" type="__m512i"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m256i"/>
+	<description>Broadcast the 4 packed 64-bit integers from "a" to all elements of "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 7
+	i := j*64
+	n := (j mod 4)*64
+	IF k[j]
+		dst[i+63:i] := a[n+63:n]
+	ELSE
+		dst[i+63:i] := src[i+63:i]
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vbroadcasti64x4' form='zmm {k}, m256'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512i" name="_mm512_maskz_broadcast_i64x4">
+	<type>Integer</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Swizzle</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m256i"/>
+	<description>Broadcast the 4 packed 64-bit integers from "a" to all elements of "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*64
+	n := (j mod 4)*64
+	IF k[j]
+		dst[i+63:i] := a[n+63:n]
+	ELSE
+		dst[i+63:i] := 0
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vbroadcasti64x4' form='zmm {k}, m256'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512d" name="_mm512_broadcastsd_pd">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Swizzle</category>
+	<parameter varname="a" type="__m128d"/>
+	<description>Broadcast the low double-precision (64-bit) floating-point element from "a" to all elements of "dst".</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*64
+	dst[i+63:i] := a[63:0]
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vbroadcastsd' form='zmm {k}, xmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512d" name="_mm512_mask_broadcastsd_pd">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Swizzle</category>
+	<parameter varname="src" type="__m512d"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128d"/>
+	<description>Broadcast the low double-precision (64-bit) floating-point element from "a" to all elements of "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 7
+	i := j*64
+	IF k[j]
+		dst[i+63:i] := a[63:0]
+	ELSE
+		dst[i+63:i] := src[i+63:i]
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vbroadcastsd' form='zmm {k}, xmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512d" name="_mm512_maskz_broadcastsd_pd">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Swizzle</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128d"/>
+	<description>Broadcast the low double-precision (64-bit) floating-point element from "a" to all elements of "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*64
+	IF k[j]
+		dst[i+63:i] := a[63:0]
+	ELSE
+		dst[i+63:i] := 0
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vbroadcastsd' form='zmm {k}, xmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512" name="_mm512_broadcastss_ps">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Swizzle</category>
+	<parameter varname="a" type="__m128"/>
+	<description>Broadcast the low single-precision (32-bit) floating-point element from "a" to all elements of "dst".</description>
+	<operation>
+FOR j := 0 to 15
+	i := j*32
+	dst[i+31:i] := a[31:0]
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vbroadcastss' form='zmm {k}, xmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512" name="_mm512_mask_broadcastss_ps">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Swizzle</category>
+	<parameter varname="src" type="__m512"/>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="a" type="__m128"/>
+	<description>Broadcast the low single-precision (32-bit) floating-point element from "a" to all elements of "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 15
+	i := j*32
+	IF k[j]
+		dst[i+31:i] := a[31:0]
+	ELSE
+		dst[i+31:i] := src[i+31:i]
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vbroadcastss' form='zmm {k}, xmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512" name="_mm512_maskz_broadcastss_ps">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Swizzle</category>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="a" type="__m128"/>
+	<description>Broadcast the low single-precision (32-bit) floating-point element from "a" to all elements of "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 15
+	i := j*32
+	IF k[j]
+		dst[i+31:i] := a[31:0]
+	ELSE
+		dst[i+31:i] := 0
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vbroadcastss' form='zmm {k}, xmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" rettype="__mmask8" name="_mm512_cmp_pd_mask">
+	<type>Floating Point</type>
+	<type>Mask</type>
+	<CPUID>AVX512F/KNCNI</CPUID>
+	<category>Compare</category>
+	<parameter varname="a" type="__m512d"/>
+	<parameter varname="b" type="__m512d"/>
+	<parameter varname="imm8" type="const int"/>
+	<description>Compare packed double-precision (64-bit) floating-point elements in "a" and "b" based on the comparison operand specified by "imm8", and store the results in mask vector "k".</description>
+	<operation>
+CASE (imm8[7:0]) OF
+0: OP := _CMP_EQ_OQ
+1: OP := _CMP_LT_OS
+2: OP := _CMP_LE_OS
+3: OP := _CMP_UNORD_Q 
+4: OP := _CMP_NEQ_UQ
+5: OP := _CMP_NLT_US
+6: OP := _CMP_NLE_US
+7: OP := _CMP_ORD_Q
+8: OP := _CMP_EQ_UQ
+9: OP := _CMP_NGE_US
+10: OP := _CMP_NGT_US
+11: OP := _CMP_FALSE_OQ
+12: OP := _CMP_NEQ_OQ
+13: OP := _CMP_GE_OS
+14: OP := _CMP_GT_OS
+15: OP := _CMP_TRUE_UQ
+16: OP := _CMP_EQ_OS
+17: OP := _CMP_LT_OQ
+18: OP := _CMP_LE_OQ
+19: OP := _CMP_UNORD_S
+20: OP := _CMP_NEQ_US
+21: OP := _CMP_NLT_UQ
+22: OP := _CMP_NLE_UQ
+23: OP := _CMP_ORD_S
+24: OP := _CMP_EQ_US
+25: OP := _CMP_NGE_UQ 
+26: OP := _CMP_NGT_UQ 
+27: OP := _CMP_FALSE_OS 
+28: OP := _CMP_NEQ_OS 
+29: OP := _CMP_GE_OQ
+30: OP := _CMP_GT_OQ
+31: OP := _CMP_TRUE_US
+ESAC
+FOR j := 0 to 7
+	i := j*64
+	k[j] := (a[i+63:i] OP b[i+63:i]) ? 1 : 0
+ENDFOR
+k[MAX:8] := 0
+	</operation>
+	<instruction name='vcmppd' form='k {k}, zmm, zmm, imm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" rettype="__mmask8" name="_mm512_cmp_round_pd_mask">
+	<type>Floating Point</type>
+	<type>Mask</type>
+	<CPUID>AVX512F/KNCNI</CPUID>
+	<category>Compare</category>
+	<parameter varname="a" type="__m512d"/>
+	<parameter varname="b" type="__m512d"/>
+	<parameter varname="imm8" type="const int"/>
+	<parameter varname="sae" type="const int"/>
+	<description>Compare packed double-precision (64-bit) floating-point elements in "a" and "b" based on the comparison operand specified by "imm8", and store the results in mask vector "k".
+	Pass __MM_FROUND_NO_EXC to "sae" to suppress all exceptions.</description>
+	<operation>
+CASE (imm8[7:0]) OF
+0: OP := _CMP_EQ_OQ
+1: OP := _CMP_LT_OS
+2: OP := _CMP_LE_OS
+3: OP := _CMP_UNORD_Q 
+4: OP := _CMP_NEQ_UQ
+5: OP := _CMP_NLT_US
+6: OP := _CMP_NLE_US
+7: OP := _CMP_ORD_Q
+8: OP := _CMP_EQ_UQ
+9: OP := _CMP_NGE_US
+10: OP := _CMP_NGT_US
+11: OP := _CMP_FALSE_OQ
+12: OP := _CMP_NEQ_OQ
+13: OP := _CMP_GE_OS
+14: OP := _CMP_GT_OS
+15: OP := _CMP_TRUE_UQ
+16: OP := _CMP_EQ_OS
+17: OP := _CMP_LT_OQ
+18: OP := _CMP_LE_OQ
+19: OP := _CMP_UNORD_S
+20: OP := _CMP_NEQ_US
+21: OP := _CMP_NLT_UQ
+22: OP := _CMP_NLE_UQ
+23: OP := _CMP_ORD_S
+24: OP := _CMP_EQ_US
+25: OP := _CMP_NGE_UQ 
+26: OP := _CMP_NGT_UQ 
+27: OP := _CMP_FALSE_OS 
+28: OP := _CMP_NEQ_OS 
+29: OP := _CMP_GE_OQ
+30: OP := _CMP_GT_OQ
+31: OP := _CMP_TRUE_US
+ESAC
+FOR j := 0 to 7
+	i := j*64
+	k[j] := (a[i+63:i] OP b[i+63:i]) ? 1 : 0
+ENDFOR
+k[MAX:8] := 0
+	</operation>
+	<instruction name='vcmppd' form='k {k}, zmm, zmm {sae}, imm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" rettype="__mmask8" name="_mm512_cmpeq_pd_mask">
+	<type>Floating Point</type>
+	<type>Mask</type>
+	<CPUID>AVX512F/KNCNI</CPUID>
+	<category>Compare</category>
+	<parameter varname="a" type="__m512d"/>
+	<parameter varname="b" type="__m512d"/>
+	<description>Compare packed double-precision (64-bit) floating-point elements in "a" and "b" for equality, and store the results in mask vector "k".</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*64
+	k[j] := (a[i+63:i] == b[i+63:i]) ? 1 : 0
+ENDFOR	
+k[MAX:8] := 0
+	</operation>
+	<instruction name='vcmppd' form='k {k}, zmm, zmm, imm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" rettype="__mmask8" name="_mm512_cmple_pd_mask">
+	<type>Floating Point</type>
+	<type>Mask</type>
+	<CPUID>AVX512F/KNCNI</CPUID>
+	<category>Compare</category>
+	<parameter varname="a" type="__m512d"/>
+	<parameter varname="b" type="__m512d"/>
+	<description>Compare packed double-precision (64-bit) floating-point elements in "a" and "b" for less-than-or-equal, and store the results in mask vector "k".</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*64
+	k[j] := (a[i+63:i] &lt;= b[i+63:i]) ? 1 : 0
+ENDFOR
+k[MAX:8] := 0
+	</operation>
+	<instruction name='vcmppd' form='k {k}, zmm, zmm, imm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" rettype="__mmask8" name="_mm512_cmplt_pd_mask">
+	<type>Floating Point</type>
+	<type>Mask</type>
+	<CPUID>AVX512F/KNCNI</CPUID>
+	<category>Compare</category>
+	<parameter varname="a" type="__m512d"/>
+	<parameter varname="b" type="__m512d"/>
+	<description>Compare packed double-precision (64-bit) floating-point elements in "a" and "b" for less-than, and store the results in mask vector "k".</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*64
+	k[j] := (a[i+63:i] &lt; b[i+63:i]) ? 1 : 0
+ENDFOR
+k[MAX:8] := 0
+	</operation>
+	<instruction name='vcmppd' form='k {k}, zmm, zmm, imm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" rettype="__mmask8" name="_mm512_cmpneq_pd_mask">
+	<type>Floating Point</type>
+	<type>Mask</type>
+	<CPUID>AVX512F/KNCNI</CPUID>
+	<category>Compare</category>
+	<parameter varname="a" type="__m512d"/>
+	<parameter varname="b" type="__m512d"/>
+	<description>Compare packed double-precision (64-bit) floating-point elements in "a" and "b" for not-equal, and store the results in mask vector "k".</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*64
+	k[j] := (a[i+63:i] != b[i+63:i]) ? 1 : 0
+ENDFOR
+k[MAX:8] := 0
+	</operation>
+	<instruction name='vcmppd' form='k {k}, zmm, zmm, imm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" rettype="__mmask8" name="_mm512_cmpnle_pd_mask">
+	<type>Floating Point</type>
+	<type>Mask</type>
+	<CPUID>AVX512F/KNCNI</CPUID>
+	<category>Compare</category>
+	<parameter varname="a" type="__m512d"/>
+	<parameter varname="b" type="__m512d"/>
+	<description>Compare packed double-precision (64-bit) floating-point elements in "a" and "b" for not-less-than-or-equal, and store the results in mask vector "k".</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*64
+	k[j] := !(a[i+63:i] &lt;= b[i+63:i]) ? 1 : 0
+ENDFOR
+k[MAX:8] := 0
+	</operation>
+	<instruction name='vcmppd' form='k {k}, zmm, zmm, imm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" rettype="__mmask8" name="_mm512_cmpnlt_pd_mask">
+	<type>Floating Point</type>
+	<type>Mask</type>
+	<CPUID>AVX512F/KNCNI</CPUID>
+	<category>Compare</category>
+	<parameter varname="a" type="__m512d"/>
+	<parameter varname="b" type="__m512d"/>
+	<description>Compare packed double-precision (64-bit) floating-point elements in "a" and "b" for not-less-than, and store the results in mask vector "k".</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*64
+	k[j] := !(a[i+63:i] &lt; b[i+63:i]) ? 1 : 0
+ENDFOR
+k[MAX:8] := 0
+	</operation>
+	<instruction name='vcmppd' form='k {k}, zmm, zmm, imm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" rettype="__mmask8" name="_mm512_cmpord_pd_mask">
+	<type>Floating Point</type>
+	<type>Mask</type>
+	<CPUID>AVX512F/KNCNI</CPUID>
+	<category>Compare</category>
+	<parameter varname="a" type="__m512d"/>
+	<parameter varname="b" type="__m512d"/>
+	<description>Compare packed double-precision (64-bit) floating-point elements in "a" and "b" to see if neither is NaN, and store the results in mask vector "k".</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*64
+	k[j] := (a[i+63:i] != NaN AND b[i+63:i] != NaN) ? 1 : 0 
+ENDFOR
+k[MAX:8] := 0
+	</operation>
+	<instruction name='vcmppd' form='k {k}, zmm, zmm, imm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" rettype="__mmask8" name="_mm512_cmpunord_pd_mask">
+	<type>Floating Point</type>
+	<type>Mask</type>
+	<CPUID>AVX512F/KNCNI</CPUID>
+	<category>Compare</category>
+	<parameter varname="a" type="__m512d"/>
+	<parameter varname="b" type="__m512d"/>
+	<description>Compare packed double-precision (64-bit) floating-point elements in "a" and "b" to see if either is NaN, and store the results in mask vector "k".</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*64
+	k[j] := (a[i+63:i] == NaN OR b[i+63:i] == NaN) ? 1 : 0 
+ENDFOR
+k[MAX:8] := 0
+	</operation>
+	<instruction name='vcmppd' form='k {k}, zmm, zmm, imm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" rettype="__mmask8" name="_mm512_mask_cmp_pd_mask">
+	<type>Floating Point</type>
+	<type>Mask</type>
+	<CPUID>AVX512F/KNCNI</CPUID>
+	<category>Compare</category>
+	<parameter varname="k1" type="__mmask8"/>
+	<parameter varname="a" type="__m512d"/>
+	<parameter varname="b" type="__m512d"/>
+	<parameter varname="imm8" type="const int"/>
+	<description>Compare packed double-precision (64-bit) floating-point elements in "a" and "b" based on the comparison operand specified by "imm8", and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). </description>
+	<operation>
+CASE (imm8[7:0]) OF
+0: OP := _CMP_EQ_OQ
+1: OP := _CMP_LT_OS
+2: OP := _CMP_LE_OS
+3: OP := _CMP_UNORD_Q 
+4: OP := _CMP_NEQ_UQ
+5: OP := _CMP_NLT_US
+6: OP := _CMP_NLE_US
+7: OP := _CMP_ORD_Q
+8: OP := _CMP_EQ_UQ
+9: OP := _CMP_NGE_US
+10: OP := _CMP_NGT_US
+11: OP := _CMP_FALSE_OQ
+12: OP := _CMP_NEQ_OQ
+13: OP := _CMP_GE_OS
+14: OP := _CMP_GT_OS
+15: OP := _CMP_TRUE_UQ
+16: OP := _CMP_EQ_OS
+17: OP := _CMP_LT_OQ
+18: OP := _CMP_LE_OQ
+19: OP := _CMP_UNORD_S
+20: OP := _CMP_NEQ_US
+21: OP := _CMP_NLT_UQ
+22: OP := _CMP_NLE_UQ
+23: OP := _CMP_ORD_S
+24: OP := _CMP_EQ_US
+25: OP := _CMP_NGE_UQ 
+26: OP := _CMP_NGT_UQ 
+27: OP := _CMP_FALSE_OS 
+28: OP := _CMP_NEQ_OS 
+29: OP := _CMP_GE_OQ
+30: OP := _CMP_GT_OQ
+31: OP := _CMP_TRUE_US
+ESAC
+FOR j := 0 to 7
+	i := j*64
+	IF k1[j]
+		k[j] := ( a[i+63:i] OP b[i+63:i] ) ? 1 : 0
+	ELSE
+		k[j] := 0
+	FI
+ENDFOR
+k[MAX:8] := 0
+	</operation>
+	<instruction name='vcmppd' form='k {k}, zmm, zmm, imm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" rettype="__mmask8" name="_mm512_mask_cmp_round_pd_mask">
+	<type>Floating Point</type>
+	<type>Mask</type>
+	<CPUID>AVX512F/KNCNI</CPUID>
+	<category>Compare</category>
+	<parameter varname="k1" type="__mmask8"/>
+	<parameter varname="a" type="__m512d"/>
+	<parameter varname="b" type="__m512d"/>
+	<parameter varname="imm8" type="const int"/>
+	<parameter varname="sae" type="const int"/>
+	<description>Compare packed double-precision (64-bit) floating-point elements in "a" and "b" based on the comparison operand specified by "imm8", and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). 
+	Pass __MM_FROUND_NO_EXC to "sae" to suppress all exceptions.</description>
+	<operation>
+CASE (imm8[7:0]) OF
+0: OP := _CMP_EQ_OQ
+1: OP := _CMP_LT_OS
+2: OP := _CMP_LE_OS
+3: OP := _CMP_UNORD_Q 
+4: OP := _CMP_NEQ_UQ
+5: OP := _CMP_NLT_US
+6: OP := _CMP_NLE_US
+7: OP := _CMP_ORD_Q
+8: OP := _CMP_EQ_UQ
+9: OP := _CMP_NGE_US
+10: OP := _CMP_NGT_US
+11: OP := _CMP_FALSE_OQ
+12: OP := _CMP_NEQ_OQ
+13: OP := _CMP_GE_OS
+14: OP := _CMP_GT_OS
+15: OP := _CMP_TRUE_UQ
+16: OP := _CMP_EQ_OS
+17: OP := _CMP_LT_OQ
+18: OP := _CMP_LE_OQ
+19: OP := _CMP_UNORD_S
+20: OP := _CMP_NEQ_US
+21: OP := _CMP_NLT_UQ
+22: OP := _CMP_NLE_UQ
+23: OP := _CMP_ORD_S
+24: OP := _CMP_EQ_US
+25: OP := _CMP_NGE_UQ 
+26: OP := _CMP_NGT_UQ 
+27: OP := _CMP_FALSE_OS 
+28: OP := _CMP_NEQ_OS 
+29: OP := _CMP_GE_OQ
+30: OP := _CMP_GT_OQ
+31: OP := _CMP_TRUE_US
+ESAC
+FOR j := 0 to 7
+	i := j*64
+	IF k1[j]
+		k[j] := ( a[i+63:i] OP b[i+63:i] ) ? 1 : 0
+	ELSE 
+		k[j] := 0
+	FI
+ENDFOR
+k[MAX:8] := 0
+	</operation>
+	<instruction name='vcmppd' form='k {k}, zmm, zmm {sae}, imm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" rettype="__mmask8" name="_mm512_mask_cmpeq_pd_mask">
+	<type>Floating Point</type>
+	<type>Mask</type>
+	<CPUID>AVX512F/KNCNI</CPUID>
+	<category>Compare</category>
+	<parameter varname="k1" type="__mmask8"/>
+	<parameter varname="a" type="__m512d"/>
+	<parameter varname="b" type="__m512d"/>
+	<description>Compare packed double-precision (64-bit) floating-point elements in "a" and "b" for equality, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 7
+	i := j*64
+	IF k1[j]
+		k[j] := (a[i+63:i] == b[i+63:i]) ? 1 : 0
+	ELSE 
+		k[j] := 0
+	FI
+ENDFOR	
+k[MAX:8] := 0
+	</operation>
+	<instruction name='vcmppd' form='k {k}, zmm, zmm, imm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" rettype="__mmask8" name="_mm512_mask_cmple_pd_mask">
+	<type>Floating Point</type>
+	<type>Mask</type>
+	<CPUID>AVX512F/KNCNI</CPUID>
+	<category>Compare</category>
+	<parameter varname="k1" type="__mmask8"/>
+	<parameter varname="a" type="__m512d"/>
+	<parameter varname="b" type="__m512d"/>
+	<description>Compare packed double-precision (64-bit) floating-point elements in "a" and "b" for less-than-or-equal, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 7
+	i := j*64
+	IF k1[j]
+		k[j] := (a[i+63:i] &lt;= b[i+63:i]) ? 1 : 0
+	ELSE 
+		k[j] := 0
+	FI
+ENDFOR
+k[MAX:8] := 0
+	</operation>
+	<instruction name='vcmppd' form='k {k}, zmm, zmm, imm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" rettype="__mmask8" name="_mm512_mask_cmplt_pd_mask">
+	<type>Floating Point</type>
+	<type>Mask</type>
+	<CPUID>AVX512F/KNCNI</CPUID>
+	<category>Compare</category>
+	<parameter varname="k1" type="__mmask8"/>
+	<parameter varname="a" type="__m512d"/>
+	<parameter varname="b" type="__m512d"/>
+	<description>Compare packed double-precision (64-bit) floating-point elements in "a" and "b" for less-than, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 7
+	i := j*64
+	IF k1[j]
+		k[j] := (a[i+63:i] &lt; b[i+63:i]) ? 1 : 0
+	ELSE 
+		k[j] := 0
+	FI
+ENDFOR
+k[MAX:8] := 0
+	</operation>
+	<instruction name='vcmppd' form='k {k}, zmm, zmm, imm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" rettype="__mmask8" name="_mm512_mask_cmpneq_pd_mask">
+	<type>Floating Point</type>
+	<type>Mask</type>
+	<CPUID>AVX512F/KNCNI</CPUID>
+	<category>Compare</category>
+	<parameter varname="k1" type="__mmask8"/>
+	<parameter varname="a" type="__m512d"/>
+	<parameter varname="b" type="__m512d"/>
+	<description>Compare packed double-precision (64-bit) floating-point elements in "a" and "b" for not-equal, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 7
+	i := j*64
+	IF k1[j]
+		k[j] := (a[i+63:i] != b[i+63:i]) ? 1 : 0
+	ELSE 
+		k[j] := 0
+	FI
+ENDFOR
+k[MAX:8] := 0
+	</operation>
+	<instruction name='vcmppd' form='k {k}, zmm, zmm, imm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" rettype="__mmask8" name="_mm512_mask_cmpnle_pd_mask">
+	<type>Floating Point</type>
+	<type>Mask</type>
+	<CPUID>AVX512F/KNCNI</CPUID>
+	<category>Compare</category>
+	<parameter varname="k1" type="__mmask8"/>
+	<parameter varname="a" type="__m512d"/>
+	<parameter varname="b" type="__m512d"/>
+	<description>Compare packed double-precision (64-bit) floating-point elements in "a" and "b" for not-less-than-or-equal, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 7
+	i := j*64
+	IF k1[j]
+		k[j] := !(a[i+63:i] &lt;= b[i+63:i]) ? 1 : 0
+	ELSE 
+		k[j] := 0
+	FI
+ENDFOR
+k[MAX:8] := 0
+	</operation>
+	<instruction name='vcmppd' form='k {k}, zmm, zmm, imm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" rettype="__mmask8" name="_mm512_mask_cmpnlt_pd_mask">
+	<type>Floating Point</type>
+	<type>Mask</type>
+	<CPUID>AVX512F/KNCNI</CPUID>
+	<category>Compare</category>
+	<parameter varname="k1" type="__mmask8"/>
+	<parameter varname="a" type="__m512d"/>
+	<parameter varname="b" type="__m512d"/>
+	<description>Compare packed double-precision (64-bit) floating-point elements in "a" and "b" for not-less-than, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 7
+	i := j*64
+	IF k1[j]
+		k[j] := !(a[i+63:i] &lt; b[i+63:i]) ? 1 : 0
+	ELSE 
+		k[j] := 0
+	FI
+ENDFOR
+k[MAX:8] := 0
+	</operation>
+	<instruction name='vcmppd' form='k {k}, zmm, zmm, imm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" rettype="__mmask8" name="_mm512_mask_cmpord_pd_mask">
+	<type>Floating Point</type>
+	<type>Mask</type>
+	<CPUID>AVX512F/KNCNI</CPUID>
+	<category>Compare</category>
+	<parameter varname="k1" type="__mmask8"/>
+	<parameter varname="a" type="__m512d"/>
+	<parameter varname="b" type="__m512d"/>
+	<description>Compare packed double-precision (64-bit) floating-point elements in "a" and "b" to see if neither is NaN, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 7
+	i := j*64
+	IF k1[j]
+		k[j] := (a[i+63:i] != NaN AND b[i+63:i] != NaN) ? 1 : 0
+	ELSE 
+		k[j] := 0
+	FI
+ENDFOR
+k[MAX:8] := 0
+	</operation>
+	<instruction name='vcmppd' form='k {k}, zmm, zmm, imm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" rettype="__mmask8" name="_mm512_mask_cmpunord_pd_mask">
+	<type>Floating Point</type>
+	<type>Mask</type>
+	<CPUID>AVX512F/KNCNI</CPUID>
+	<category>Compare</category>
+	<parameter varname="k1" type="__mmask8"/>
+	<parameter varname="a" type="__m512d"/>
+	<parameter varname="b" type="__m512d"/>
+	<description>Compare packed double-precision (64-bit) floating-point elements in "a" and "b" to see if either is NaN, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 7
+	i := j*64
+	IF k1[j]
+		k[j] := (a[i+63:i] == NaN OR b[i+63:i] == NaN) ? 1 : 0
+	ELSE 
+		k[j] := 0
+	FI
+ENDFOR
+k[MAX:8] := 0
+	</operation>
+	<instruction name='vcmppd' form='k {k}, zmm, zmm, imm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" rettype="__mmask16" name="_mm512_cmp_ps_mask">
+	<type>Floating Point</type>
+	<type>Mask</type>
+	<CPUID>AVX512F/KNCNI</CPUID>
+	<category>Compare</category>
+	<parameter varname="a" type="__m512"/>
+	<parameter varname="b" type="__m512"/>
+	<parameter varname="imm8" type="const int"/>
+	<description>Compare packed single-precision (32-bit) floating-point elements in "a" and "b" based on the comparison operand specified by "imm8", and store the results in mask vector "k".</description>
+	<operation>
+CASE (imm8[7:0]) OF
+0: OP := _CMP_EQ_OQ
+1: OP := _CMP_LT_OS
+2: OP := _CMP_LE_OS
+3: OP := _CMP_UNORD_Q 
+4: OP := _CMP_NEQ_UQ
+5: OP := _CMP_NLT_US
+6: OP := _CMP_NLE_US
+7: OP := _CMP_ORD_Q
+8: OP := _CMP_EQ_UQ
+9: OP := _CMP_NGE_US
+10: OP := _CMP_NGT_US
+11: OP := _CMP_FALSE_OQ
+12: OP := _CMP_NEQ_OQ
+13: OP := _CMP_GE_OS
+14: OP := _CMP_GT_OS
+15: OP := _CMP_TRUE_UQ
+16: OP := _CMP_EQ_OS
+17: OP := _CMP_LT_OQ
+18: OP := _CMP_LE_OQ
+19: OP := _CMP_UNORD_S
+20: OP := _CMP_NEQ_US
+21: OP := _CMP_NLT_UQ
+22: OP := _CMP_NLE_UQ
+23: OP := _CMP_ORD_S
+24: OP := _CMP_EQ_US
+25: OP := _CMP_NGE_UQ 
+26: OP := _CMP_NGT_UQ 
+27: OP := _CMP_FALSE_OS 
+28: OP := _CMP_NEQ_OS 
+29: OP := _CMP_GE_OQ
+30: OP := _CMP_GT_OQ
+31: OP := _CMP_TRUE_US
+ESAC
+FOR j := 0 to 15
+	i := j*32
+	k[j] := (a[i+31:i] OP b[i+31:i]) ? 1 : 0
+ENDFOR
+k[MAX:16] := 0
+	</operation>
+	<instruction name='vcmpps' form='k {k}, zmm, zmm, imm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" rettype="__mmask16" name="_mm512_cmp_round_ps_mask">
+	<type>Floating Point</type>
+	<type>Mask</type>
+	<CPUID>AVX512F/KNCNI</CPUID>
+	<category>Compare</category>
+	<parameter varname="a" type="__m512"/>
+	<parameter varname="b" type="__m512"/>
+	<parameter varname="imm8" type="const int"/>
+	<parameter varname="sae" type="const int"/>
+	<description>Compare packed single-precision (32-bit) floating-point elements in "a" and "b" based on the comparison operand specified by "imm8", and store the results in mask vector "k".
+	Pass __MM_FROUND_NO_EXC to "sae" to suppress all exceptions.</description>
+	<operation>
+CASE (imm8[7:0]) OF
+0: OP := _CMP_EQ_OQ
+1: OP := _CMP_LT_OS
+2: OP := _CMP_LE_OS
+3: OP := _CMP_UNORD_Q 
+4: OP := _CMP_NEQ_UQ
+5: OP := _CMP_NLT_US
+6: OP := _CMP_NLE_US
+7: OP := _CMP_ORD_Q
+8: OP := _CMP_EQ_UQ
+9: OP := _CMP_NGE_US
+10: OP := _CMP_NGT_US
+11: OP := _CMP_FALSE_OQ
+12: OP := _CMP_NEQ_OQ
+13: OP := _CMP_GE_OS
+14: OP := _CMP_GT_OS
+15: OP := _CMP_TRUE_UQ
+16: OP := _CMP_EQ_OS
+17: OP := _CMP_LT_OQ
+18: OP := _CMP_LE_OQ
+19: OP := _CMP_UNORD_S
+20: OP := _CMP_NEQ_US
+21: OP := _CMP_NLT_UQ
+22: OP := _CMP_NLE_UQ
+23: OP := _CMP_ORD_S
+24: OP := _CMP_EQ_US
+25: OP := _CMP_NGE_UQ 
+26: OP := _CMP_NGT_UQ 
+27: OP := _CMP_FALSE_OS 
+28: OP := _CMP_NEQ_OS 
+29: OP := _CMP_GE_OQ
+30: OP := _CMP_GT_OQ
+31: OP := _CMP_TRUE_US
+ESAC
+FOR j := 0 to 15
+	i := j*32
+	k[j] := (a[i+31:i] OP b[i+31:i]) ? 1 : 0
+ENDFOR
+k[MAX:16] := 0
+	</operation>
+	<instruction name='vcmpps' form='k {k}, zmm, zmm {sae}, imm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" rettype="__mmask16" name="_mm512_cmpeq_ps_mask">
+	<type>Floating Point</type>
+	<type>Mask</type>
+	<CPUID>AVX512F/KNCNI</CPUID>
+	<category>Compare</category>
+	<parameter varname="a" type="__m512"/>
+	<parameter varname="b" type="__m512"/>
+	<description>Compare packed single-precision (32-bit) floating-point elements in "a" and "b" for equality, and store the results in mask vector "k".</description>
+	<operation>
+FOR j := 0 to 15
+	i := j*32
+	k[j] := (a[i+31:i] == b[i+31:i]) ? 1 : 0
+ENDFOR	
+k[MAX:16] := 0		
+	</operation>
+	<instruction name='vcmpps' form='k {k}, zmm, zmm, imm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" rettype="__mmask16" name="_mm512_cmple_ps_mask">
+	<type>Floating Point</type>
+	<type>Mask</type>
+	<CPUID>AVX512F/KNCNI</CPUID>
+	<category>Compare</category>
+	<parameter varname="a" type="__m512"/>
+	<parameter varname="b" type="__m512"/>
+	<description>Compare packed single-precision (32-bit) floating-point elements in "a" and "b" for less-than-or-equal, and store the results in mask vector "k".</description>
+	<operation>
+FOR j := 0 to 15
+	i := j*32
+	k[j] := (a[i+31:i] &lt;= b[i+31:i]) ? 1 : 0
+ENDFOR
+k[MAX:16] := 0
+	</operation>
+	<instruction name='vcmpps' form='k {k}, zmm, zmm, imm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" rettype="__mmask16" name="_mm512_cmplt_ps_mask">
+	<type>Floating Point</type>
+	<type>Mask</type>
+	<CPUID>AVX512F/KNCNI</CPUID>
+	<category>Compare</category>
+	<parameter varname="a" type="__m512"/>
+	<parameter varname="b" type="__m512"/>
+	<description>Compare packed single-precision (32-bit) floating-point elements in "a" and "b" for less-than, and store the results in mask vector "k".</description>
+	<operation>
+FOR j := 0 to 15
+	i := j*32
+	k[j] := (a[i+31:i] &lt; b[i+31:i]) ? 1 : 0
+ENDFOR
+k[MAX:16] := 0
+	</operation>
+	<instruction name='vcmpps' form='k {k}, zmm, zmm, imm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" rettype="__mmask16" name="_mm512_cmpneq_ps_mask">
+	<type>Floating Point</type>
+	<type>Mask</type>
+	<CPUID>AVX512F/KNCNI</CPUID>
+	<category>Compare</category>
+	<parameter varname="a" type="__m512"/>
+	<parameter varname="b" type="__m512"/>
+	<description>Compare packed single-precision (32-bit) floating-point elements in "a" and "b" for not-equal, and store the results in mask vector "k".</description>
+	<operation>
+FOR j := 0 to 15
+	i := j*32
+	k[j] := (a[i+31:i] != b[i+31:i]) ? 1 : 0
+ENDFOR
+k[MAX:16] := 0
+	</operation>
+	<instruction name='vcmpps' form='k {k}, zmm, zmm, imm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" rettype="__mmask16" name="_mm512_cmpnle_ps_mask">
+	<type>Floating Point</type>
+	<type>Mask</type>
+	<CPUID>AVX512F/KNCNI</CPUID>
+	<category>Compare</category>
+	<parameter varname="a" type="__m512"/>
+	<parameter varname="b" type="__m512"/>
+	<description>Compare packed single-precision (32-bit) floating-point elements in "a" and "b" for not-less-than-or-equal, and store the results in mask vector "k".</description>
+	<operation>
+FOR j := 0 to 15
+	i := j*32
+	k[j] := !(a[i+31:i] &lt;= b[i+31:i]) ? 1 : 0
+ENDFOR
+k[MAX:16] := 0
+	</operation>
+	<instruction name='vcmpps' form='k {k}, zmm, zmm, imm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" rettype="__mmask16" name="_mm512_cmpnlt_ps_mask">
+	<type>Floating Point</type>
+	<type>Mask</type>
+	<CPUID>AVX512F/KNCNI</CPUID>
+	<category>Compare</category>
+	<parameter varname="a" type="__m512"/>
+	<parameter varname="b" type="__m512"/>
+	<description>Compare packed single-precision (32-bit) floating-point elements in "a" and "b" for not-less-than, and store the results in mask vector "k".</description>
+	<operation>
+FOR j := 0 to 15
+	i := j*32
+	k[j] := !(a[i+31:i] &lt; b[i+31:i]) ? 1 : 0
+ENDFOR
+k[MAX:16] := 0
+	</operation>
+	<instruction name='vcmpps' form='k {k}, zmm, zmm, imm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" rettype="__mmask16" name="_mm512_cmpord_ps_mask">
+	<type>Floating Point</type>
+	<type>Mask</type>
+	<CPUID>AVX512F/KNCNI</CPUID>
+	<category>Compare</category>
+	<parameter varname="a" type="__m512"/>
+	<parameter varname="b" type="__m512"/>
+	<description>Compare packed single-precision (32-bit) floating-point elements in "a" and "b" to see if neither is NaN, and store the results in mask vector "k".</description>
+	<operation>
+FOR j := 0 to 15
+	i := j*32
+	k[j] := (a[i+31:i] != NaN AND b[i+31:i] != NaN) ? 1 : 0 
+ENDFOR
+k[MAX:16] := 0
+	</operation>
+	<instruction name='vcmpps' form='k {k}, zmm, zmm, imm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" rettype="__mmask16" name="_mm512_cmpunord_ps_mask">
+	<type>Floating Point</type>
+	<type>Mask</type>
+	<CPUID>AVX512F/KNCNI</CPUID>
+	<category>Compare</category>
+	<parameter varname="a" type="__m512"/>
+	<parameter varname="b" type="__m512"/>
+	<description>Compare packed single-precision (32-bit) floating-point elements in "a" and "b" to see if either is NaN, and store the results in mask vector "k".</description>
+	<operation>
+FOR j := 0 to 15
+	i := j*32
+	k[j] := (a[i+31:i] == NaN OR b[i+31:i] == NaN) ? 1 : 0 
+ENDFOR
+k[MAX:16] := 0
+	</operation>
+	<instruction name='vcmpps' form='k {k}, zmm, zmm, imm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" rettype="__mmask16" name="_mm512_mask_cmp_ps_mask">
+	<type>Floating Point</type>
+	<type>Mask</type>
+	<CPUID>AVX512F/KNCNI</CPUID>
+	<category>Compare</category>
+	<parameter varname="k1" type="__mmask16"/>
+	<parameter varname="a" type="__m512"/>
+	<parameter varname="b" type="__m512"/>
+	<parameter varname="imm8" type="const int"/>
+	<description>Compare packed single-precision (32-bit) floating-point elements in "a" and "b" based on the comparison operand specified by "imm8", and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). </description>
+	<operation>
+CASE (imm8[7:0]) OF
+0: OP := _CMP_EQ_OQ
+1: OP := _CMP_LT_OS
+2: OP := _CMP_LE_OS
+3: OP := _CMP_UNORD_Q 
+4: OP := _CMP_NEQ_UQ
+5: OP := _CMP_NLT_US
+6: OP := _CMP_NLE_US
+7: OP := _CMP_ORD_Q
+8: OP := _CMP_EQ_UQ
+9: OP := _CMP_NGE_US
+10: OP := _CMP_NGT_US
+11: OP := _CMP_FALSE_OQ
+12: OP := _CMP_NEQ_OQ
+13: OP := _CMP_GE_OS
+14: OP := _CMP_GT_OS
+15: OP := _CMP_TRUE_UQ
+16: OP := _CMP_EQ_OS
+17: OP := _CMP_LT_OQ
+18: OP := _CMP_LE_OQ
+19: OP := _CMP_UNORD_S
+20: OP := _CMP_NEQ_US
+21: OP := _CMP_NLT_UQ
+22: OP := _CMP_NLE_UQ
+23: OP := _CMP_ORD_S
+24: OP := _CMP_EQ_US
+25: OP := _CMP_NGE_UQ 
+26: OP := _CMP_NGT_UQ 
+27: OP := _CMP_FALSE_OS 
+28: OP := _CMP_NEQ_OS 
+29: OP := _CMP_GE_OQ
+30: OP := _CMP_GT_OQ
+31: OP := _CMP_TRUE_US
+ESAC
+FOR j := 0 to 15
+	i := j*32
+	IF k1[j]
+		k[j] := ( a[i+31:i] OP b[i+31:i] ) ? 1 : 0
+	ELSE 
+		k[j] := 0
+	FI
+ENDFOR
+k[MAX:16] := 0
+	</operation>
+	<instruction name='vcmpps' form='k {k}, zmm, zmm, imm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" rettype="__mmask16" name="_mm512_mask_cmp_round_ps_mask">
+	<type>Floating Point</type>
+	<type>Mask</type>
+	<CPUID>AVX512F/KNCNI</CPUID>
+	<category>Compare</category>
+	<parameter varname="k1" type="__mmask16"/>
+	<parameter varname="a" type="__m512"/>
+	<parameter varname="b" type="__m512"/>
+	<parameter varname="imm8" type="const int"/>
+	<parameter varname="sae" type="const int"/>
+	<description>Compare packed single-precision (32-bit) floating-point elements in "a" and "b" based on the comparison operand specified by "imm8", and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). 
+	Pass __MM_FROUND_NO_EXC to "sae" to suppress all exceptions.</description>
+	<operation>
+CASE (imm8[7:0]) OF
+0: OP := _CMP_EQ_OQ
+1: OP := _CMP_LT_OS
+2: OP := _CMP_LE_OS
+3: OP := _CMP_UNORD_Q 
+4: OP := _CMP_NEQ_UQ
+5: OP := _CMP_NLT_US
+6: OP := _CMP_NLE_US
+7: OP := _CMP_ORD_Q
+8: OP := _CMP_EQ_UQ
+9: OP := _CMP_NGE_US
+10: OP := _CMP_NGT_US
+11: OP := _CMP_FALSE_OQ
+12: OP := _CMP_NEQ_OQ
+13: OP := _CMP_GE_OS
+14: OP := _CMP_GT_OS
+15: OP := _CMP_TRUE_UQ
+16: OP := _CMP_EQ_OS
+17: OP := _CMP_LT_OQ
+18: OP := _CMP_LE_OQ
+19: OP := _CMP_UNORD_S
+20: OP := _CMP_NEQ_US
+21: OP := _CMP_NLT_UQ
+22: OP := _CMP_NLE_UQ
+23: OP := _CMP_ORD_S
+24: OP := _CMP_EQ_US
+25: OP := _CMP_NGE_UQ 
+26: OP := _CMP_NGT_UQ 
+27: OP := _CMP_FALSE_OS 
+28: OP := _CMP_NEQ_OS 
+29: OP := _CMP_GE_OQ
+30: OP := _CMP_GT_OQ
+31: OP := _CMP_TRUE_US
+ESAC
+FOR j := 0 to 15
+	i := j*32
+	IF k1[j]
+		k[j] := ( a[i+31:i] OP b[i+31:i] ) ? 1 : 0
+	ELSE 
+		k[j] := 0
+	FI
+ENDFOR
+k[MAX:16] := 0
+	</operation>
+	<instruction name='vcmpps' form='k {k}, zmm, zmm {sae}, imm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" rettype="__mmask16" name="_mm512_mask_cmpeq_ps_mask">
+	<type>Floating Point</type>
+	<type>Mask</type>
+	<CPUID>AVX512F/KNCNI</CPUID>
+	<category>Compare</category>
+	<parameter varname="k1" type="__mmask16"/>
+	<parameter varname="a" type="__m512"/>
+	<parameter varname="b" type="__m512"/>
+	<description>Compare packed single-precision (32-bit) floating-point elements in "a" and "b" for equality, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 15
+	i := j*32
+	IF k1[j]
+		k[j] := (a[i+31:i] == b[i+31:i]) ? 1 : 0
+	ELSE 
+		k[j] := 0
+	FI
+ENDFOR		
+k[MAX:16] := 0	
+	</operation>
+	<instruction name='vcmpps' form='k {k}, zmm, zmm, imm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" rettype="__mmask16" name="_mm512_mask_cmple_ps_mask">
+	<type>Floating Point</type>
+	<type>Mask</type>
+	<CPUID>AVX512F/KNCNI</CPUID>
+	<category>Compare</category>
+	<parameter varname="k1" type="__mmask16"/>
+	<parameter varname="a" type="__m512"/>
+	<parameter varname="b" type="__m512"/>
+	<description>Compare packed single-precision (32-bit) floating-point elements in "a" and "b" for less-than-or-equal, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 15
+	i := j*32
+	IF k1[j]
+		k[j] := (a[i+31:i] &lt;= b[i+31:i]) ? 1 : 0
+	ELSE 
+		k[j] := 0
+	FI
+ENDFOR
+k[MAX:16] := 0
+	</operation>
+	<instruction name='vcmpps' form='k {k}, zmm, zmm, imm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" rettype="__mmask16" name="_mm512_mask_cmplt_ps_mask">
+	<type>Floating Point</type>
+	<type>Mask</type>
+	<CPUID>AVX512F/KNCNI</CPUID>
+	<category>Compare</category>
+	<parameter varname="k1" type="__mmask16"/>
+	<parameter varname="a" type="__m512"/>
+	<parameter varname="b" type="__m512"/>
+	<description>Compare packed single-precision (32-bit) floating-point elements in "a" and "b" for less-than, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 15
+	i := j*32
+	IF k1[j]
+		k[j] := (a[i+31:i] &lt; b[i+31:i]) ? 1 : 0
+	ELSE 
+		k[j] := 0
+	FI
+ENDFOR
+k[MAX:16] := 0
+	</operation>
+	<instruction name='vcmpps' form='k {k}, zmm, zmm, imm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" rettype="__mmask16" name="_mm512_mask_cmpneq_ps_mask">
+	<type>Floating Point</type>
+	<type>Mask</type>
+	<CPUID>AVX512F/KNCNI</CPUID>
+	<category>Compare</category>
+	<parameter varname="k1" type="__mmask16"/>
+	<parameter varname="a" type="__m512"/>
+	<parameter varname="b" type="__m512"/>
+	<description>Compare packed single-precision (32-bit) floating-point elements in "a" and "b" for not-equal, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 15
+	i := j*32
+	IF k1[j]
+		k[j] := (a[i+31:i] != b[i+31:i]) ? 1 : 0
+	ELSE 
+		k[j] := 0
+	FI
+ENDFOR
+k[MAX:16] := 0
+	</operation>
+	<instruction name='vcmpps' form='k {k}, zmm, zmm, imm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" rettype="__mmask16" name="_mm512_mask_cmpnle_ps_mask">
+	<type>Floating Point</type>
+	<type>Mask</type>
+	<CPUID>AVX512F/KNCNI</CPUID>
+	<category>Compare</category>
+	<parameter varname="k1" type="__mmask16"/>
+	<parameter varname="a" type="__m512"/>
+	<parameter varname="b" type="__m512"/>
+	<description>Compare packed single-precision (32-bit) floating-point elements in "a" and "b" for not-less-than-or-equal, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 15
+	i := j*32
+	IF k1[j]
+		k[j] := !(a[i+31:i] &lt;= b[i+31:i]) ? 1 : 0
+	ELSE 
+		k[j] := 0
+	FI
+ENDFOR
+k[MAX:16] := 0
+	</operation>
+	<instruction name='vcmpps' form='k {k}, zmm, zmm, imm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" rettype="__mmask16" name="_mm512_mask_cmpnlt_ps_mask">
+	<type>Floating Point</type>
+	<type>Mask</type>
+	<CPUID>AVX512F/KNCNI</CPUID>
+	<category>Compare</category>
+	<parameter varname="k1" type="__mmask16"/>
+	<parameter varname="a" type="__m512"/>
+	<parameter varname="b" type="__m512"/>
+	<description>Compare packed single-precision (32-bit) floating-point elements in "a" and "b" for not-less-than, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 15
+	i := j*32
+	IF k1[j]
+		k[j] := !(a[i+31:i] &lt; b[i+31:i]) ? 1 : 0
+	ELSE 
+		k[j] := 0
+	FI
+ENDFOR
+k[MAX:16] := 0
+	</operation>
+	<instruction name='vcmpps' form='k {k}, zmm, zmm, imm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" rettype="__mmask16" name="_mm512_mask_cmpord_ps_mask">
+	<type>Floating Point</type>
+	<type>Mask</type>
+	<CPUID>AVX512F/KNCNI</CPUID>
+	<category>Compare</category>
+	<parameter varname="k1" type="__mmask16"/>
+	<parameter varname="a" type="__m512"/>
+	<parameter varname="b" type="__m512"/>
+	<description>Compare packed single-precision (32-bit) floating-point elements in "a" and "b" to see if neither is NaN, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 15
+	i := j*32
+	IF k1[j]
+		k[j] := (a[i+31:i] != NaN AND b[i+31:i] != NaN) ? 1 : 0
+	ELSE 
+		k[j] := 0
+	FI
+ENDFOR
+k[MAX:16] := 0
+	</operation>
+	<instruction name='vcmpps' form='k {k}, zmm, zmm, imm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" rettype="__mmask16" name="_mm512_mask_cmpunord_ps_mask">
+	<type>Floating Point</type>
+	<type>Mask</type>
+	<CPUID>AVX512F/KNCNI</CPUID>
+	<category>Compare</category>
+	<parameter varname="k1" type="__mmask16"/>
+	<parameter varname="a" type="__m512"/>
+	<parameter varname="b" type="__m512"/>
+	<description>Compare packed single-precision (32-bit) floating-point elements in "a" and "b" to see if either is NaN, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 15
+	i := j*32
+	IF k1[j]
+		k[j] := (a[i+31:i] == NaN OR b[i+31:i] == NaN) ? 1 : 0
+	ELSE 
+		k[j] := 0
+	FI
+ENDFOR
+k[MAX:16] := 0
+	</operation>
+	<instruction name='vcmpps' form='k {k}, zmm, zmm, imm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__mmask8" name="_mm_cmp_round_sd_mask">
+	<type>Floating Point</type>
+	<type>Mask</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Compare</category>
+	<parameter varname="a" type="__m128d"/>
+	<parameter varname="b" type="__m128d"/>
+	<parameter varname="imm8" type="const int"/>
+	<parameter varname="sae" type="const int"/>
+	<description>Compare the lower double-precision (64-bit) floating-point element in "a" and "b" based on the comparison operand specified by "imm8", and store the result in mask vector "k".
+	Pass __MM_FROUND_NO_EXC to "sae" to suppress all exceptions.</description>
+	<operation>
+CASE (imm8[7:0]) OF
+0: OP := _CMP_EQ_OQ
+1: OP := _CMP_LT_OS
+2: OP := _CMP_LE_OS
+3: OP := _CMP_UNORD_Q 
+4: OP := _CMP_NEQ_UQ
+5: OP := _CMP_NLT_US
+6: OP := _CMP_NLE_US
+7: OP := _CMP_ORD_Q
+8: OP := _CMP_EQ_UQ
+9: OP := _CMP_NGE_US
+10: OP := _CMP_NGT_US
+11: OP := _CMP_FALSE_OQ
+12: OP := _CMP_NEQ_OQ
+13: OP := _CMP_GE_OS
+14: OP := _CMP_GT_OS
+15: OP := _CMP_TRUE_UQ
+16: OP := _CMP_EQ_OS
+17: OP := _CMP_LT_OQ
+18: OP := _CMP_LE_OQ
+19: OP := _CMP_UNORD_S
+20: OP := _CMP_NEQ_US
+21: OP := _CMP_NLT_UQ
+22: OP := _CMP_NLE_UQ
+23: OP := _CMP_ORD_S
+24: OP := _CMP_EQ_US
+25: OP := _CMP_NGE_UQ 
+26: OP := _CMP_NGT_UQ 
+27: OP := _CMP_FALSE_OS 
+28: OP := _CMP_NEQ_OS 
+29: OP := _CMP_GE_OQ
+30: OP := _CMP_GT_OQ
+31: OP := _CMP_TRUE_US
+ESAC
+
+k[0] := ( a[63:0] OP b[63:0] ) ? 1 : 0
+
+k[MAX:1] := 0
+	</operation>
+	<instruction name='vcmpsd' form='k {k}, xmm, xmm {sae}, imm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__mmask8" name="_mm_cmp_sd_mask">
+	<type>Floating Point</type>
+	<type>Mask</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Compare</category>
+	<parameter varname="a" type="__m128d"/>
+	<parameter varname="b" type="__m128d"/>
+	<parameter varname="imm8" type="const int"/>
+	<description>Compare the lower double-precision (64-bit) floating-point element in "a" and "b" based on the comparison operand specified by "imm8", and store the result in mask vector "k".</description>
+	<operation>
+CASE (imm8[7:0]) OF
+0: OP := _CMP_EQ_OQ
+1: OP := _CMP_LT_OS
+2: OP := _CMP_LE_OS
+3: OP := _CMP_UNORD_Q 
+4: OP := _CMP_NEQ_UQ
+5: OP := _CMP_NLT_US
+6: OP := _CMP_NLE_US
+7: OP := _CMP_ORD_Q
+8: OP := _CMP_EQ_UQ
+9: OP := _CMP_NGE_US
+10: OP := _CMP_NGT_US
+11: OP := _CMP_FALSE_OQ
+12: OP := _CMP_NEQ_OQ
+13: OP := _CMP_GE_OS
+14: OP := _CMP_GT_OS
+15: OP := _CMP_TRUE_UQ
+16: OP := _CMP_EQ_OS
+17: OP := _CMP_LT_OQ
+18: OP := _CMP_LE_OQ
+19: OP := _CMP_UNORD_S
+20: OP := _CMP_NEQ_US
+21: OP := _CMP_NLT_UQ
+22: OP := _CMP_NLE_UQ
+23: OP := _CMP_ORD_S
+24: OP := _CMP_EQ_US
+25: OP := _CMP_NGE_UQ 
+26: OP := _CMP_NGT_UQ 
+27: OP := _CMP_FALSE_OS 
+28: OP := _CMP_NEQ_OS 
+29: OP := _CMP_GE_OQ
+30: OP := _CMP_GT_OQ
+31: OP := _CMP_TRUE_US
+ESAC
+
+k[0] := ( a[63:0] OP b[63:0] ) ? 1 : 0
+
+k[MAX:1] := 0
+	</operation>
+	<instruction name='vcmpsd' form='k {k}, xmm, xmm, imm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__mmask8" name="_mm_mask_cmp_round_sd_mask">
+	<type>Floating Point</type>
+	<type>Mask</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Compare</category>
+	<parameter varname="k1" type="__mmask8"/>
+	<parameter varname="a" type="__m128d"/>
+	<parameter varname="b" type="__m128d"/>
+	<parameter varname="imm8" type="const int"/>
+	<parameter varname="sae" type="const int"/>
+	<description>Compare the lower double-precision (64-bit) floating-point element in "a" and "b" based on the comparison operand specified by "imm8", and store the result in mask vector "k" using zeromask "k1" (the element is zeroed out when mask bit 0 is not set).
+	Pass __MM_FROUND_NO_EXC to "sae" to suppress all exceptions.</description>
+	<operation>
+CASE (imm8[7:0]) OF
+0: OP := _CMP_EQ_OQ
+1: OP := _CMP_LT_OS
+2: OP := _CMP_LE_OS
+3: OP := _CMP_UNORD_Q 
+4: OP := _CMP_NEQ_UQ
+5: OP := _CMP_NLT_US
+6: OP := _CMP_NLE_US
+7: OP := _CMP_ORD_Q
+8: OP := _CMP_EQ_UQ
+9: OP := _CMP_NGE_US
+10: OP := _CMP_NGT_US
+11: OP := _CMP_FALSE_OQ
+12: OP := _CMP_NEQ_OQ
+13: OP := _CMP_GE_OS
+14: OP := _CMP_GT_OS
+15: OP := _CMP_TRUE_UQ
+16: OP := _CMP_EQ_OS
+17: OP := _CMP_LT_OQ
+18: OP := _CMP_LE_OQ
+19: OP := _CMP_UNORD_S
+20: OP := _CMP_NEQ_US
+21: OP := _CMP_NLT_UQ
+22: OP := _CMP_NLE_UQ
+23: OP := _CMP_ORD_S
+24: OP := _CMP_EQ_US
+25: OP := _CMP_NGE_UQ 
+26: OP := _CMP_NGT_UQ 
+27: OP := _CMP_FALSE_OS 
+28: OP := _CMP_NEQ_OS 
+29: OP := _CMP_GE_OQ
+30: OP := _CMP_GT_OQ
+31: OP := _CMP_TRUE_US
+ESAC
+
+IF k1[0]
+	k[0] := ( a[63:0] OP b[63:0] ) ? 1 : 0
+ELSE
+	k[0] := 0
+FI
+k[MAX:1] := 0
+	</operation>
+	<instruction name='vcmpsd' form='k {k}, xmm, xmm {sae}, imm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__mmask8" name="_mm_mask_cmp_sd_mask">
+	<type>Floating Point</type>
+	<type>Mask</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Compare</category>
+	<parameter varname="k1" type="__mmask8"/>
+	<parameter varname="a" type="__m128d"/>
+	<parameter varname="b" type="__m128d"/>
+	<parameter varname="imm8" type="const int"/>
+	<description>Compare the lower double-precision (64-bit) floating-point element in "a" and "b" based on the comparison operand specified by "imm8", and store the result in mask vector "k" using zeromask "k1" (the element is zeroed out when mask bit 0 is not set). </description>
+	<operation>
+CASE (imm8[7:0]) OF
+0: OP := _CMP_EQ_OQ
+1: OP := _CMP_LT_OS
+2: OP := _CMP_LE_OS
+3: OP := _CMP_UNORD_Q 
+4: OP := _CMP_NEQ_UQ
+5: OP := _CMP_NLT_US
+6: OP := _CMP_NLE_US
+7: OP := _CMP_ORD_Q
+8: OP := _CMP_EQ_UQ
+9: OP := _CMP_NGE_US
+10: OP := _CMP_NGT_US
+11: OP := _CMP_FALSE_OQ
+12: OP := _CMP_NEQ_OQ
+13: OP := _CMP_GE_OS
+14: OP := _CMP_GT_OS
+15: OP := _CMP_TRUE_UQ
+16: OP := _CMP_EQ_OS
+17: OP := _CMP_LT_OQ
+18: OP := _CMP_LE_OQ
+19: OP := _CMP_UNORD_S
+20: OP := _CMP_NEQ_US
+21: OP := _CMP_NLT_UQ
+22: OP := _CMP_NLE_UQ
+23: OP := _CMP_ORD_S
+24: OP := _CMP_EQ_US
+25: OP := _CMP_NGE_UQ 
+26: OP := _CMP_NGT_UQ 
+27: OP := _CMP_FALSE_OS 
+28: OP := _CMP_NEQ_OS 
+29: OP := _CMP_GE_OQ
+30: OP := _CMP_GT_OQ
+31: OP := _CMP_TRUE_US
+ESAC
+
+IF k1[0]
+	k[0] := ( a[63:0] OP b[63:0] ) ? 1 : 0
+ELSE
+	k[0] := 0
+FI
+k[MAX:1] := 0
+	</operation>
+	<instruction name='vcmpsd' form='k {k}, xmm, xmm, imm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__mmask8" name="_mm_cmp_round_ss_mask">
+	<type>Floating Point</type>
+	<type>Mask</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Compare</category>
+	<parameter varname="a" type="__m128"/>
+	<parameter varname="b" type="__m128"/>
+	<parameter varname="imm8" type="const int"/>
+	<parameter varname="sae" type="const int"/>
+	<description>Compare the lower single-precision (32-bit) floating-point element in "a" and "b" based on the comparison operand specified by "imm8", and store the result in mask vector "k".
+	Pass __MM_FROUND_NO_EXC to "sae" to suppress all exceptions.</description>
+	<operation>
+CASE (imm8[7:0]) OF
+0: OP := _CMP_EQ_OQ
+1: OP := _CMP_LT_OS
+2: OP := _CMP_LE_OS
+3: OP := _CMP_UNORD_Q 
+4: OP := _CMP_NEQ_UQ
+5: OP := _CMP_NLT_US
+6: OP := _CMP_NLE_US
+7: OP := _CMP_ORD_Q
+8: OP := _CMP_EQ_UQ
+9: OP := _CMP_NGE_US
+10: OP := _CMP_NGT_US
+11: OP := _CMP_FALSE_OQ
+12: OP := _CMP_NEQ_OQ
+13: OP := _CMP_GE_OS
+14: OP := _CMP_GT_OS
+15: OP := _CMP_TRUE_UQ
+16: OP := _CMP_EQ_OS
+17: OP := _CMP_LT_OQ
+18: OP := _CMP_LE_OQ
+19: OP := _CMP_UNORD_S
+20: OP := _CMP_NEQ_US
+21: OP := _CMP_NLT_UQ
+22: OP := _CMP_NLE_UQ
+23: OP := _CMP_ORD_S
+24: OP := _CMP_EQ_US
+25: OP := _CMP_NGE_UQ 
+26: OP := _CMP_NGT_UQ 
+27: OP := _CMP_FALSE_OS 
+28: OP := _CMP_NEQ_OS 
+29: OP := _CMP_GE_OQ
+30: OP := _CMP_GT_OQ
+31: OP := _CMP_TRUE_US
+ESAC
+
+k[0] := ( a[31:0] OP b[31:0] ) ? 1 : 0
+
+k[MAX:1] := 0
+	</operation>
+	<instruction name='vcmpss' form='k {k}, xmm, xmm {sae}, imm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__mmask8" name="_mm_cmp_ss_mask">
+	<type>Floating Point</type>
+	<type>Mask</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Compare</category>
+	<parameter varname="a" type="__m128"/>
+	<parameter varname="b" type="__m128"/>
+	<parameter varname="imm8" type="const int"/>
+	<description>Compare the lower single-precision (32-bit) floating-point element in "a" and "b" based on the comparison operand specified by "imm8", and store the result in mask vector "k".</description>
+	<operation>
+CASE (imm8[7:0]) OF
+0: OP := _CMP_EQ_OQ
+1: OP := _CMP_LT_OS
+2: OP := _CMP_LE_OS
+3: OP := _CMP_UNORD_Q 
+4: OP := _CMP_NEQ_UQ
+5: OP := _CMP_NLT_US
+6: OP := _CMP_NLE_US
+7: OP := _CMP_ORD_Q
+8: OP := _CMP_EQ_UQ
+9: OP := _CMP_NGE_US
+10: OP := _CMP_NGT_US
+11: OP := _CMP_FALSE_OQ
+12: OP := _CMP_NEQ_OQ
+13: OP := _CMP_GE_OS
+14: OP := _CMP_GT_OS
+15: OP := _CMP_TRUE_UQ
+16: OP := _CMP_EQ_OS
+17: OP := _CMP_LT_OQ
+18: OP := _CMP_LE_OQ
+19: OP := _CMP_UNORD_S
+20: OP := _CMP_NEQ_US
+21: OP := _CMP_NLT_UQ
+22: OP := _CMP_NLE_UQ
+23: OP := _CMP_ORD_S
+24: OP := _CMP_EQ_US
+25: OP := _CMP_NGE_UQ 
+26: OP := _CMP_NGT_UQ 
+27: OP := _CMP_FALSE_OS 
+28: OP := _CMP_NEQ_OS 
+29: OP := _CMP_GE_OQ
+30: OP := _CMP_GT_OQ
+31: OP := _CMP_TRUE_US
+ESAC
+
+k[0] := ( a[31:0] OP b[31:0] ) ? 1 : 0
+
+k[MAX:1] := 0
+	</operation>
+	<instruction name='vcmpss' form='k {k}, xmm, xmm, imm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__mmask8" name="_mm_mask_cmp_round_ss_mask">
+	<type>Floating Point</type>
+	<type>Mask</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Compare</category>
+	<parameter varname="k1" type="__mmask8"/>
+	<parameter varname="a" type="__m128"/>
+	<parameter varname="b" type="__m128"/>
+	<parameter varname="imm8" type="const int"/>
+	<parameter varname="sae" type="const int"/>
+	<description>Compare the lower single-precision (32-bit) floating-point element in "a" and "b" based on the comparison operand specified by "imm8", and store the result in mask vector "k" using zeromask "k1" (the element is zeroed out when mask bit 0 is not set). 
+	Pass __MM_FROUND_NO_EXC to "sae" to suppress all exceptions.</description>
+	<operation>
+CASE (imm8[4:0]) OF
+0: OP := _CMP_EQ_OQ
+1: OP := _CMP_LT_OS
+2: OP := _CMP_LE_OS
+3: OP := _CMP_UNORD_Q 
+4: OP := _CMP_NEQ_UQ
+5: OP := _CMP_NLT_US
+6: OP := _CMP_NLE_US
+7: OP := _CMP_ORD_Q
+8: OP := _CMP_EQ_UQ
+9: OP := _CMP_NGE_US
+10: OP := _CMP_NGT_US
+11: OP := _CMP_FALSE_OQ
+12: OP := _CMP_NEQ_OQ
+13: OP := _CMP_GE_OS
+14: OP := _CMP_GT_OS
+15: OP := _CMP_TRUE_UQ
+16: OP := _CMP_EQ_OS
+17: OP := _CMP_LT_OQ
+18: OP := _CMP_LE_OQ
+19: OP := _CMP_UNORD_S
+20: OP := _CMP_NEQ_US
+21: OP := _CMP_NLT_UQ
+22: OP := _CMP_NLE_UQ
+23: OP := _CMP_ORD_S
+24: OP := _CMP_EQ_US
+25: OP := _CMP_NGE_UQ 
+26: OP := _CMP_NGT_UQ 
+27: OP := _CMP_FALSE_OS 
+28: OP := _CMP_NEQ_OS 
+29: OP := _CMP_GE_OQ
+30: OP := _CMP_GT_OQ
+31: OP := _CMP_TRUE_US
+ESAC
+
+IF k1[0]
+	k[0] := ( a[31:0] OP b[31:0] ) ? 1 : 0
+ELSE
+	k[0] := 0
+FI
+k[MAX:1] := 0
+	</operation>
+	<instruction name='vcmpss' form='k {k}, xmm, xmm {sae}, imm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__mmask8" name="_mm_mask_cmp_ss_mask">
+	<type>Floating Point</type>
+	<type>Mask</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Compare</category>
+	<parameter varname="k1" type="__mmask8"/>
+	<parameter varname="a" type="__m128"/>
+	<parameter varname="b" type="__m128"/>
+	<parameter varname="imm8" type="const int"/>
+	<description>Compare the lower single-precision (32-bit) floating-point element in "a" and "b" based on the comparison operand specified by "imm8", and store the result in mask vector "k" using zeromask "k1" (the element is zeroed out when mask bit 0 is not set). </description>
+	<operation>
+CASE (imm8[4:0]) OF
+0: OP := _CMP_EQ_OQ
+1: OP := _CMP_LT_OS
+2: OP := _CMP_LE_OS
+3: OP := _CMP_UNORD_Q 
+4: OP := _CMP_NEQ_UQ
+5: OP := _CMP_NLT_US
+6: OP := _CMP_NLE_US
+7: OP := _CMP_ORD_Q
+8: OP := _CMP_EQ_UQ
+9: OP := _CMP_NGE_US
+10: OP := _CMP_NGT_US
+11: OP := _CMP_FALSE_OQ
+12: OP := _CMP_NEQ_OQ
+13: OP := _CMP_GE_OS
+14: OP := _CMP_GT_OS
+15: OP := _CMP_TRUE_UQ
+16: OP := _CMP_EQ_OS
+17: OP := _CMP_LT_OQ
+18: OP := _CMP_LE_OQ
+19: OP := _CMP_UNORD_S
+20: OP := _CMP_NEQ_US
+21: OP := _CMP_NLT_UQ
+22: OP := _CMP_NLE_UQ
+23: OP := _CMP_ORD_S
+24: OP := _CMP_EQ_US
+25: OP := _CMP_NGE_UQ 
+26: OP := _CMP_NGT_UQ 
+27: OP := _CMP_FALSE_OS 
+28: OP := _CMP_NEQ_OS 
+29: OP := _CMP_GE_OQ
+30: OP := _CMP_GT_OQ
+31: OP := _CMP_TRUE_US
+ESAC
+
+IF k1[0]
+	k[0] := ( a[31:0] OP b[31:0] ) ? 1 : 0
+ELSE
+	k[0] := 0
+FI
+k[MAX:1] := 0
+	</operation>
+	<instruction name='vcmpss' form='k {k}, xmm, xmm, imm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="int" name="_mm_comi_round_sd">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Compare</category>
+	<parameter varname="a" type="__m128d"/>
+	<parameter varname="b" type="__m128d"/>
+	<parameter varname="imm8" type="const int"/>
+	<parameter varname="sae" type="const int"/>
+	<description>Compare the lower double-precision (64-bit) floating-point element in "a" and "b" based on the comparison operand specified by "imm8", and return the boolean result (0 or 1).
+	Pass __MM_FROUND_NO_EXC to "sae" to suppress all exceptions.</description>
+	<operation>
+CASE (imm8[4:0]) OF
+0: OP := _CMP_EQ_OQ
+1: OP := _CMP_LT_OS
+2: OP := _CMP_LE_OS
+3: OP := _CMP_UNORD_Q 
+4: OP := _CMP_NEQ_UQ
+5: OP := _CMP_NLT_US
+6: OP := _CMP_NLE_US
+7: OP := _CMP_ORD_Q
+8: OP := _CMP_EQ_UQ
+9: OP := _CMP_NGE_US
+10: OP := _CMP_NGT_US
+11: OP := _CMP_FALSE_OQ
+12: OP := _CMP_NEQ_OQ
+13: OP := _CMP_GE_OS
+14: OP := _CMP_GT_OS
+15: OP := _CMP_TRUE_UQ
+16: OP := _CMP_EQ_OS
+17: OP := _CMP_LT_OQ
+18: OP := _CMP_LE_OQ
+19: OP := _CMP_UNORD_S
+20: OP := _CMP_NEQ_US
+21: OP := _CMP_NLT_UQ
+22: OP := _CMP_NLE_UQ
+23: OP := _CMP_ORD_S
+24: OP := _CMP_EQ_US
+25: OP := _CMP_NGE_UQ 
+26: OP := _CMP_NGT_UQ 
+27: OP := _CMP_FALSE_OS 
+28: OP := _CMP_NEQ_OS 
+29: OP := _CMP_GE_OQ
+30: OP := _CMP_GT_OQ
+31: OP := _CMP_TRUE_US
+ESAC
+
+RETURN ( a[63:0] OP b[63:0] ) ? 1 : 0
+	</operation>
+	<instruction name='vcomisd' form='xmm, xmm {sae}'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="int" name="_mm_comi_round_ss">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Compare</category>
+	<parameter varname="a" type="__m128"/>
+	<parameter varname="b" type="__m128"/>
+	<parameter varname="imm8" type="const int"/>
+	<parameter varname="sae" type="const int"/>
+	<description>Compare the lower single-precision (32-bit) floating-point element in "a" and "b" based on the comparison operand specified by "imm8", and return the boolean result (0 or 1).
+	Pass __MM_FROUND_NO_EXC to "sae" to suppress all exceptions.</description>
+	<operation>
+CASE (imm8[4:0]) OF
+0: OP := _CMP_EQ_OQ
+1: OP := _CMP_LT_OS
+2: OP := _CMP_LE_OS
+3: OP := _CMP_UNORD_Q 
+4: OP := _CMP_NEQ_UQ
+5: OP := _CMP_NLT_US
+6: OP := _CMP_NLE_US
+7: OP := _CMP_ORD_Q
+8: OP := _CMP_EQ_UQ
+9: OP := _CMP_NGE_US
+10: OP := _CMP_NGT_US
+11: OP := _CMP_FALSE_OQ
+12: OP := _CMP_NEQ_OQ
+13: OP := _CMP_GE_OS
+14: OP := _CMP_GT_OS
+15: OP := _CMP_TRUE_UQ
+16: OP := _CMP_EQ_OS
+17: OP := _CMP_LT_OQ
+18: OP := _CMP_LE_OQ
+19: OP := _CMP_UNORD_S
+20: OP := _CMP_NEQ_US
+21: OP := _CMP_NLT_UQ
+22: OP := _CMP_NLE_UQ
+23: OP := _CMP_ORD_S
+24: OP := _CMP_EQ_US
+25: OP := _CMP_NGE_UQ 
+26: OP := _CMP_NGT_UQ 
+27: OP := _CMP_FALSE_OS 
+28: OP := _CMP_NEQ_OS 
+29: OP := _CMP_GE_OQ
+30: OP := _CMP_GT_OQ
+31: OP := _CMP_TRUE_US
+ESAC
+
+RETURN ( a[31:0] OP b[31:0] ) ? 1 : 0
+	</operation>
+	<instruction name='vcomiss' form='xmm, xmm {sae}'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512d" name="_mm512_mask_compress_pd">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Swizzle</category>
+	<parameter varname="src" type="__m512d"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m512d"/>
+	<description>Contiguously store the active double-precision (64-bit) floating-point elements in "a" (those with their respective bit set in writemask "k") to "dst", and pass through the remaining elements from "src".</description>
+	<operation>
+size := 64
+m := 0
+FOR j := 0 to 7
+	i := j*64
+	IF k[j]
+		dst[m+size-1:m] := a[i+63:i]
+		m := m + size
+	FI
+ENDFOR
+dst[511:m] := src[511:m]
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vcompresspd' form='zmm {k}, zmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="void" name="_mm512_mask_compressstoreu_pd">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Store</category>
+	<category>Swizzle</category>
+	<parameter varname="base_addr" type="void*"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m512d"/>
+	<description>Contiguously store the active double-precision (64-bit) floating-point elements in "a" (those with their respective bit set in writemask "k") to unaligned memory at "base_addr".</description>
+	<operation>
+size := 64
+m := base_addr
+FOR j := 0 to 7
+	i := j*64
+	IF k[j]
+		MEM[m+size-1:m] := a[i+63:i]
+		m := m + size
+	FI
+ENDFOR
+	</operation>
+	<instruction name='vcompresspd' form='m512 {k}, zmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512d" name="_mm512_maskz_compress_pd">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Swizzle</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m512d"/>
+	<description>Contiguously store the active double-precision (64-bit) floating-point elements in "a" (those with their respective bit set in zeromask "k") to "dst", and set the remaining elements to zero.</description>
+	<operation>
+size := 64
+m := 0
+FOR j := 0 to 7
+	i := j*64
+	IF k[j]
+		dst[m+size-1:m] := a[i+63:i]
+		m := m + size
+	FI
+ENDFOR
+dst[511:m] := 0
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vcompresspd' form='zmm {k}, zmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512" name="_mm512_mask_compress_ps">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Swizzle</category>
+	<parameter varname="src" type="__m512"/>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="a" type="__m512"/>
+	<description>Contiguously store the active single-precision (32-bit) floating-point elements in "a" (those with their respective bit set in writemask "k") to "dst", and pass through the remaining elements from "src".</description>
+	<operation>
+size := 32
+m := 0
+FOR j := 0 to 15
+	i := j*32
+	IF k[j]
+		dst[m+size-1:m] := a[i+31:i]
+		m := m + size
+	FI
+ENDFOR
+dst[511:m] := src[511:m]
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vcompressps' form='zmm {k}, zmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="void" name="_mm512_mask_compressstoreu_ps">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Store</category>
+	<category>Swizzle</category>
+	<parameter varname="base_addr" type="void*"/>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="a" type="__m512"/>
+	<description>Contiguously store the active single-precision (32-bit) floating-point elements in "a" (those with their respective bit set in writemask "k") to unaligned memory at "base_addr".</description>
+	<operation>
+size := 32
+m := base_addr
+FOR j := 0 to 15
+	i := j*32
+	IF k[j]
+		MEM[m+size-1:m] := a[i+31:i]
+		m := m + size
+	FI
+ENDFOR
+	</operation>
+	<instruction name='vcompressps' form='m512 {k}, zmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512" name="_mm512_maskz_compress_ps">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Swizzle</category>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="a" type="__m512"/>
+	<description>Contiguously store the active single-precision (32-bit) floating-point elements in "a" (those with their respective bit set in zeromask "k") to "dst", and set the remaining elements to zero.</description>
+	<operation>
+size := 32
+m := 0
+FOR j := 0 to 15
+	i := j*32
+	IF k[j]
+		dst[m+size-1:m] := a[i+31:i]
+		m := m + size
+	FI
+ENDFOR
+dst[511:m] := 0
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vcompressps' form='zmm {k}, zmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512d" name="_mm512_cvtepi32_pd">
+	<type>Floating Point</type>
+	<type>Integer</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Convert</category>
+	<parameter varname="a" type="__m256i"/>
+	<description>Convert packed 32-bit integers in "a" to packed double-precision (64-bit) floating-point elements, and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*32
+	m := j*64
+	dst[m+63:m] := Convert_Int32_To_FP64(a[i+31:i])
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vcvtdq2pd' form='zmm {k}, ymm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512d" name="_mm512_mask_cvtepi32_pd">
+	<type>Floating Point</type>
+	<type>Integer</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Convert</category>
+	<parameter varname="src" type="__m512d"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m256i"/>
+	<description>Convert packed 32-bit integers in "a" to packed double-precision (64-bit) floating-point elements, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 7
+	i := j*32
+	m := j*64
+	IF k[j]
+		dst[m+63:m] := Convert_Int32_To_FP64(a[i+31:i])
+	ELSE
+		dst[m+63:m] := src[m+63:m]
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vcvtdq2pd' form='zmm {k}, ymm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512d" name="_mm512_maskz_cvtepi32_pd">
+	<type>Floating Point</type>
+	<type>Integer</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Convert</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m256i"/>
+	<description>Convert packed 32-bit integers in "a" to packed double-precision (64-bit) floating-point elements, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*32
+	m := j*64
+	IF k[j]
+		dst[m+63:m] := Convert_Int32_To_FP64(a[i+31:i])
+	ELSE
+		dst[m+63:m] := 0
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vcvtdq2pd' form='zmm {k}, ymm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512" name="_mm512_cvt_roundepi32_ps">
+	<type>Floating Point</type>
+	<type>Integer</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Convert</category>
+	<parameter varname="a" type="__m512i"/>
+	<parameter varname="rounding" type="int"/>
+	<description>Convert packed 32-bit integers in "a" to packed single-precision (32-bit) floating-point elements, and store the results in "dst".
+	[round_note]
+	</description>
+	<operation>
+FOR j := 0 to 15
+	i := 32*j
+	dst[i+31:i] := Convert_Int32_To_FP32(a[i+31:i])
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vcvtdq2ps' form='zmm {k}, zmm {er}'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512" name="_mm512_cvtepi32_ps">
+	<type>Floating Point</type>
+	<type>Integer</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Convert</category>
+	<parameter varname="a" type="__m512i"/>
+	<description>Convert packed 32-bit integers in "a" to packed single-precision (32-bit) floating-point elements, and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 15
+	i := 32*j
+	dst[i+31:i] := Convert_Int32_To_FP32(a[i+31:i])
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vcvtdq2ps' form='zmm {k}, zmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512" name="_mm512_mask_cvt_roundepi32_ps">
+	<type>Floating Point</type>
+	<type>Integer</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Convert</category>
+	<parameter varname="src" type="__m512"/>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="a" type="__m512i"/>
+	<parameter varname="rounding" type="int"/>
+	<description>Convert packed 32-bit integers in "a" to packed single-precision (32-bit) floating-point elements, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). 
+	[round_note]
+	</description>
+	<operation>
+FOR j := 0 to 15
+	i := j*32
+	IF k[j]
+		dst[i+31:i] := Convert_Int32_To_FP32(a[i+31:i])
+	ELSE
+		dst[i+31:i] := src[i+31:i]
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vcvtdq2ps' form='zmm {k}, zmm {er}'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512" name="_mm512_mask_cvtepi32_ps">
+	<type>Floating Point</type>
+	<type>Integer</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Convert</category>
+	<parameter varname="src" type="__m512"/>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="a" type="__m512i"/>
+	<description>Convert packed 32-bit integers in "a" to packed single-precision (32-bit) floating-point elements, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 15
+	i := j*32
+	IF k[j]
+		dst[i+31:i] := Convert_Int32_To_FP32(a[i+31:i])
+	ELSE
+		dst[i+31:i] := src[i+31:i]
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vcvtdq2ps' form='zmm {k}, zmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512" name="_mm512_maskz_cvt_roundepi32_ps">
+	<type>Floating Point</type>
+	<type>Integer</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Convert</category>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="a" type="__m512i"/>
+	<parameter varname="rounding" type="int"/>
+	<description>Convert packed 32-bit integers in "a" to packed single-precision (32-bit) floating-point elements, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).
+	[round_note]
+	</description>
+	<operation>
+FOR j := 0 to 15
+	i := 32*j
+	IF k[j]
+		dst[i+31:i] := Convert_Int32_To_FP32(a[i+31:i])
+	ELSE
+		dst[i+31:i] := 0
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vcvtdq2ps' form='zmm {k}, zmm {er}'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512" name="_mm512_maskz_cvtepi32_ps">
+	<type>Floating Point</type>
+	<type>Integer</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Convert</category>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="a" type="__m512i"/>
+	<description>Convert packed 32-bit integers in "a" to packed single-precision (32-bit) floating-point elements, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 15
+	i := 32*j
+	IF k[j]
+		dst[i+31:i] := Convert_Int32_To_FP32(a[i+31:i])
+	ELSE
+		dst[i+31:i] := 0
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vcvtdq2ps' form='zmm {k}, zmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256i" name="_mm512_cvt_roundpd_epi32">
+	<type>Floating Point</type>
+	<type>Integer</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Convert</category>
+	<parameter varname="a" type="__m512d"/>
+	<parameter varname="rounding" type="int"/>
+	<description>Convert packed double-precision (64-bit) floating-point elements in "a" to packed 32-bit integers, and store the results in "dst".
+	[round_note]
+	</description>
+	<operation>
+FOR j := 0 to 7
+	i := 32*j
+	k := 64*j
+	dst[i+31:i] := Convert_FP64_To_Int32(a[k+63:k])
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name='vcvtpd2dq' form='ymm {k}, zmm {er}'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256i" name="_mm512_cvtpd_epi32">
+	<type>Floating Point</type>
+	<type>Integer</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Convert</category>
+	<parameter varname="a" type="__m512d"/>
+	<description>Convert packed double-precision (64-bit) floating-point elements in "a" to packed 32-bit integers, and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 7
+	i := 32*j
+	k := 64*j
+	dst[i+31:i] := Convert_FP64_To_Int32(a[k+63:k])
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name='vcvtpd2dq' form='ymm {k}, zmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256i" name="_mm512_mask_cvt_roundpd_epi32">
+	<type>Floating Point</type>
+	<type>Integer</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Convert</category>
+	<parameter varname="src" type="__m256i"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m512d"/>
+	<parameter varname="rounding" type="int"/>
+	<description>Convert packed double-precision (64-bit) floating-point elements in "a" to packed 32-bit integers, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). 
+	[round_note]
+	</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*32
+	l := j*64
+	IF k[j]
+		dst[i+31:i] := Convert_FP64_To_Int32(a[l+63:l])
+	ELSE
+		dst[i+31:i] := src[i+31:i]
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name='vcvtpd2dq' form='ymm {k}, zmm {er}'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256i" name="_mm512_mask_cvtpd_epi32">
+	<type>Floating Point</type>
+	<type>Integer</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Convert</category>
+	<parameter varname="src" type="__m256i"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m512d"/>
+	<description>Convert packed double-precision (64-bit) floating-point elements in "a" to packed 32-bit integers, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 7
+	i := j*32
+	l := j*64
+	IF k[j]
+		dst[i+31:i] := Convert_FP64_To_Int32(a[l+63:l])
+	ELSE
+		dst[i+31:i] := src[i+31:i]
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name='vcvtpd2dq' form='ymm {k}, zmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256i" name="_mm512_maskz_cvt_roundpd_epi32">
+	<type>Floating Point</type>
+	<type>Integer</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Convert</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m512d"/>
+	<parameter varname="rounding" type="int"/>
+	<description>Convert packed double-precision (64-bit) floating-point elements in "a" to packed 32-bit integers, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).
+	[round_note]
+	</description>
+	<operation>
+FOR j := 0 to 7
+	i := 32*j
+	l := 64*j
+	IF k[j]
+		dst[i+31:i] := Convert_FP64_To_Int32(a[l+63:l])
+	ELSE
+		dst[i+31:i] := 0
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name='vcvtpd2dq' form='ymm {k}, zmm {er}'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256i" name="_mm512_maskz_cvtpd_epi32">
+	<type>Floating Point</type>
+	<type>Integer</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Convert</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m512d"/>
+	<description>Convert packed double-precision (64-bit) floating-point elements in "a" to packed 32-bit integers, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 7
+	i := 32*j
+	l := 64*j
+	IF k[j]
+		dst[i+31:i] := Convert_FP64_To_Int32(a[l+63:l])
+	ELSE
+		dst[i+31:i] := 0
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name='vcvtpd2dq' form='ymm {k}, zmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256" name="_mm512_cvt_roundpd_ps">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Convert</category>
+	<parameter varname="a" type="__m512d"/>
+	<parameter varname="rounding" type="int"/>
+	<description>Convert packed double-precision (64-bit) floating-point elements in "a" to packed single-precision (32-bit) floating-point elements, and store the results in "dst".
+	[round_note]
+	</description>
+	<operation>
+FOR j := 0 to 7
+	i := 32*j
+	k := 64*j
+	dst[i+31:i] := Convert_FP64_To_FP32(a[k+63:k])
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name='vcvtpd2ps' form='ymm {k}, zmm {er}'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256" name="_mm512_cvtpd_ps">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Convert</category>
+	<parameter varname="a" type="__m512d"/>
+	<description>Convert packed double-precision (64-bit) floating-point elements in "a" to packed single-precision (32-bit) floating-point elements, and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 7
+	i := 32*j
+	k := 64*j
+	dst[i+31:i] := Convert_FP64_To_FP32(a[k+63:k])
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name='vcvtpd2ps' form='ymm {k}, zmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256" name="_mm512_mask_cvt_roundpd_ps">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Convert</category>
+	<parameter varname="src" type="__m256"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m512d"/>
+	<parameter varname="rounding" type="int"/>
+	<description>Convert packed double-precision (64-bit) floating-point elements in "a" to packed single-precision (32-bit) floating-point elements, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). 
+	[round_note]
+	</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*32
+	l := j*64
+	IF k[j]
+		dst[i+31:i] := Convert_FP64_To_FP32(a[l+63:l])
+	ELSE
+		dst[i+31:i] := src[i+31:i]
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name='vcvtpd2ps' form='ymm {k}, zmm {er}'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256" name="_mm512_mask_cvtpd_ps">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Convert</category>
+	<parameter varname="src" type="__m256"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m512d"/>
+	<description>Convert packed double-precision (64-bit) floating-point elements in "a" to packed single-precision (32-bit) floating-point elements, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 7
+	i := 32*j
+	l := 64*j
+	IF k[j]
+		dst[i+31:i] := Convert_FP64_To_FP32(a[l+63:l])
+	ELSE
+		dst[i+31:i] := src[i+31:i]
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name='vcvtpd2ps' form='ymm {k}, zmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256" name="_mm512_maskz_cvt_roundpd_ps">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Convert</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m512d"/>
+	<parameter varname="rounding" type="int"/>
+	<description>Convert packed double-precision (64-bit) floating-point elements in "a" to packed single-precision (32-bit) floating-point elements, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). 
+	[round_note]
+	</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*32
+	l := j*64
+	IF k[j]
+		dst[i+31:i] := Convert_FP64_To_FP32(a[l+63:l])
+	ELSE
+		dst[i+31:i] := 0
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name='vcvtpd2ps' form='ymm {k}, zmm {er}'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256" name="_mm512_maskz_cvtpd_ps">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Convert</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m512d"/>
+	<description>Convert packed double-precision (64-bit) floating-point elements in "a" to packed single-precision (32-bit) floating-point elements, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*32
+	l := j*64
+	IF k[j]
+		dst[i+31:i] := Convert_FP64_To_FP32(a[l+63:l])
+	ELSE
+		dst[i+31:i] := 0
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name='vcvtpd2ps' form='ymm {k}, zmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256i" name="_mm512_cvt_roundpd_epu32">
+	<type>Floating Point</type>
+	<type>Integer</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Convert</category>
+	<parameter varname="a" type="__m512d"/>
+	<parameter varname="rounding" type="int"/>
+	<description>Convert packed double-precision (64-bit) floating-point elements in "a" to packed unsigned 32-bit integers, and store the results in "dst".
+	[round_note]
+	</description>
+	<operation>
+FOR j := 0 to 7
+	i := 32*j
+	k := 64*j
+	dst[i+31:i] := Convert_FP64_To_UnsignedInt32(a[k+63:k])
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name='vcvtpd2udq' form='ymm {k}, zmm {er}'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256i" name="_mm512_cvtpd_epu32">
+	<type>Floating Point</type>
+	<type>Integer</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Convert</category>
+	<parameter varname="a" type="__m512d"/>
+	<description>Convert packed double-precision (64-bit) floating-point elements in "a" to packed unsigned 32-bit integers, and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 7
+	i := 32*j
+	k := 64*j
+	dst[i+31:i] := Convert_FP64_To_UnsignedInt32(a[k+63:k])
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name='vcvtpd2udq' form='ymm {k}, zmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256i" name="_mm512_mask_cvt_roundpd_epu32">
+	<type>Floating Point</type>
+	<type>Integer</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Convert</category>
+	<parameter varname="src" type="__m256i"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m512d"/>
+	<parameter varname="rounding" type="int"/>
+	<description>Convert packed double-precision (64-bit) floating-point elements in "a" to packed unsigned 32-bit integers, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). 
+	[round_note]
+	</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*32
+	l := j*64
+	IF k[j]
+		dst[i+31:i] := Convert_FP64_To_UnsignedInt32(a[l+63:l])
+	ELSE
+		dst[i+31:i] := src[i+31:i]
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name='vcvtpd2udq' form='ymm {k}, zmm {er}'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256i" name="_mm512_mask_cvtpd_epu32">
+	<type>Floating Point</type>
+	<type>Integer</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Convert</category>
+	<parameter varname="src" type="__m256i"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m512d"/>
+	<description>Convert packed double-precision (64-bit) floating-point elements in "a" to packed unsigned 32-bit integers, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 7
+	i := j*32
+	l := j*64
+	IF k[j]
+		dst[i+31:i] := Convert_FP64_To_UnsignedInt32(a[l+63:l])
+	ELSE
+		dst[i+31:i] := src[i+31:i]
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name='vcvtpd2udq' form='ymm {k}, zmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256i" name="_mm512_maskz_cvt_roundpd_epu32">
+	<type>Floating Point</type>
+	<type>Integer</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Convert</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m512d"/>
+	<parameter varname="rounding" type="int"/>
+	<description>Convert packed double-precision (64-bit) floating-point elements in "a" to packed unsigned 32-bit integers, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).
+	[round_note]
+	</description>
+	<operation>
+FOR j := 0 to 7
+	i := 32*j
+	l := 64*j
+	IF k[j]
+		dst[i+31:i] := Convert_FP64_To_UnsignedInt32(a[l+63:l])
+	ELSE
+		dst[i+31:i] := 0
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name='vcvtpd2udq' form='ymm {k}, zmm {er}'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256i" name="_mm512_maskz_cvtpd_epu32">
+	<type>Floating Point</type>
+	<type>Integer</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Convert</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m512d"/>
+	<description>Convert packed double-precision (64-bit) floating-point elements in "a" to packed unsigned 32-bit integers, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 7
+	i := 32*j
+	l := 64*j
+	IF k[j]
+		dst[i+31:i] := Convert_FP64_To_UnsignedInt32(a[l+63:l])
+	ELSE
+		dst[i+31:i] := 0
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name='vcvtpd2udq' form='ymm {k}, zmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512" name="_mm512_cvt_roundph_ps">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Convert</category>
+	<parameter varname="a" type="__m256i"/>
+	<parameter varname="sae" type="int"/>
+	<description>Convert packed half-precision (16-bit) floating-point elements in "a" to packed single-precision (32-bit) floating-point elements, and store the results in "dst". 
+	Pass __MM_FROUND_NO_EXC to "sae" to suppress all exceptions.
+	</description>
+	<operation>
+FOR j := 0 to 15
+	i := j*32
+	m := j*16
+	dst[i+31:i] := Convert_FP16_To_FP32(a[m+15:m])
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vcvtph2ps' form='zmm {k}, ymm {sae}'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512" name="_mm512_cvtph_ps">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Convert</category>
+	<parameter varname="a" type="__m256i"/>
+	<description>Convert packed half-precision (16-bit) floating-point elements in "a" to packed single-precision (32-bit) floating-point elements, and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 15
+	i := j*32
+	m := j*16
+	dst[i+31:i] := Convert_FP16_To_FP32(a[m+15:m])
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vcvtph2ps' form='zmm {k}, ymm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512" name="_mm512_mask_cvt_roundph_ps">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Convert</category>
+	<parameter varname="src" type="__m512"/>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="a" type="__m256i"/>
+	<parameter varname="sae" type="int"/>
+	<description>Convert packed half-precision (16-bit) floating-point elements in "a" to packed single-precision (32-bit) floating-point elements, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). 
+	Pass __MM_FROUND_NO_EXC to "sae" to suppress all exceptions.</description>
+	<operation>
+FOR j := 0 to 15
+	i := j*32
+	m := j*16
+	IF k[j]
+		dst[i+31:i] := Convert_FP16_To_FP32(a[m+15:m])
+	ELSE
+		dst[i+31:i] := src[i+31:i]
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vcvtph2ps' form='zmm {k}, ymm {sae}'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512" name="_mm512_mask_cvtph_ps">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Convert</category>
+	<parameter varname="src" type="__m512"/>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="a" type="__m256i"/>
+	<description>Convert packed half-precision (16-bit) floating-point elements in "a" to packed single-precision (32-bit) floating-point elements, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 15
+	i := j*32
+	m := j*16
+	IF k[j]
+		dst[i+31:i] := Convert_FP16_To_FP32(a[m+15:m])
+	ELSE
+		dst[i+31:i] := src[i+31:i]
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vcvtph2ps' form='zmm {k}, ymm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512" name="_mm512_maskz_cvt_roundph_ps">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Convert</category>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="a" type="__m256i"/>
+	<parameter varname="sae" type="int"/>
+	<description>Convert packed half-precision (16-bit) floating-point elements in "a" to packed single-precision (32-bit) floating-point elements, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). 
+	Pass __MM_FROUND_NO_EXC to "sae" to suppress all exceptions.
+	</description>
+	<operation>
+FOR j := 0 to 15
+	i := j*32
+	m := j*16
+	IF k[j]
+		dst[i+31:i] := Convert_FP16_To_FP32(a[m+15:m])
+	ELSE
+		dst[i+31:i] := 0
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vcvtph2ps' form='zmm {k}, ymm {sae}'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512" name="_mm512_maskz_cvtph_ps">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Convert</category>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="a" type="__m256i"/>
+	<description>Convert packed half-precision (16-bit) floating-point elements in "a" to packed single-precision (32-bit) floating-point elements, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 15
+	i := j*32
+	m := j*16
+	IF k[j]
+		dst[i+31:i] := Convert_FP16_To_FP32(a[m+15:m])
+	ELSE
+		dst[i+31:i] := 0
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vcvtph2ps' form='zmm {k}, ymm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512i" name="_mm512_cvt_roundps_epi32">
+	<type>Floating Point</type>
+	<type>Integer</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Convert</category>
+	<parameter varname="a" type="__m512"/>
+	<parameter varname="rounding" type="int"/>
+	<description>Convert packed single-precision (32-bit) floating-point elements in "a" to packed 32-bit integers, and store the results in "dst". 
+	[round_note]
+	</description>
+	<operation>
+FOR j := 0 to 15
+	i := 32*j
+	dst[i+31:i] := Convert_FP32_To_Int32(a[i+31:i])
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vcvtps2dq' form='zmm {k}, zmm {er}'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512i" name="_mm512_cvtps_epi32">
+	<type>Floating Point</type>
+	<type>Integer</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Convert</category>
+	<parameter varname="a" type="__m512"/>
+	<description>Convert packed single-precision (32-bit) floating-point elements in "a" to packed 32-bit integers, and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 15
+	i := 32*j
+	dst[i+31:i] := Convert_FP32_To_Int32(a[i+31:i])
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vcvtps2dq' form='zmm {k}, zmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512i" name="_mm512_mask_cvt_roundps_epi32">
+	<type>Floating Point</type>
+	<type>Integer</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Convert</category>
+	<parameter varname="src" type="__m512i"/>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="a" type="__m512"/>
+	<parameter varname="rounding" type="int"/>
+	<description>Convert packed single-precision (32-bit) floating-point elements in "a" to packed 32-bit integers, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). 
+	[round_note]
+	</description>
+	<operation>
+FOR j := 0 to 15
+	i := j*32
+	IF k[j]
+		dst[i+31:i] := Convert_FP32_To_Int32(a[i+31:i])
+	ELSE
+		dst[i+31:i] := src[i+31:i]
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vcvtps2dq' form='zmm {k}, zmm {er}'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512i" name="_mm512_mask_cvtps_epi32">
+	<type>Floating Point</type>
+	<type>Integer</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Convert</category>
+	<parameter varname="src" type="__m512i"/>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="a" type="__m512"/>
+	<description>Convert packed single-precision (32-bit) floating-point elements in "a" to packed 32-bit integers, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 15
+	i := j*32
+	IF k[j]
+		dst[i+31:i] := Convert_FP32_To_Int32(a[i+31:i])
+	ELSE
+		dst[i+31:i] := src[i+31:i]
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vcvtps2dq' form='zmm {k}, zmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512i" name="_mm512_maskz_cvt_roundps_epi32">
+	<type>Floating Point</type>
+	<type>Integer</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Convert</category>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="a" type="__m512"/>
+	<parameter varname="rounding" type="int"/>
+	<description>Convert packed single-precision (32-bit) floating-point elements in "a" to packed 32-bit integers, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). 
+	[round_note]
+	</description>
+	<operation>
+FOR j := 0 to 15
+	i := 32*j
+	IF k[j]
+		dst[i+31:i] := Convert_FP32_To_Int32(a[i+31:i])
+	ELSE
+		dst[i+31:i] := 0
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vcvtps2dq' form='zmm {k}, zmm {er}'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512i" name="_mm512_maskz_cvtps_epi32">
+	<type>Floating Point</type>
+	<type>Integer</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Convert</category>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="a" type="__m512"/>
+	<description>Convert packed single-precision (32-bit) floating-point elements in "a" to packed 32-bit integers, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 15
+	i := 32*j
+	IF k[j]
+		dst[i+31:i] := Convert_FP32_To_Int32(a[i+31:i])
+	ELSE
+		dst[i+31:i] := 0
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vcvtps2dq' form='zmm {k}, zmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512d" name="_mm512_cvt_roundps_pd">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Convert</category>
+	<parameter varname="a" type="__m256"/>
+	<parameter varname="sae" type="int"/>
+	<description>Convert packed single-precision (32-bit) floating-point elements in "a" to packed double-precision (64-bit) floating-point elements, and store the results in "dst".
+	Pass __MM_FROUND_NO_EXC to "sae" to suppress all exceptions.
+	</description>
+	<operation>
+FOR j := 0 to 7
+	i := 64*j
+	k := 32*j
+	dst[i+63:i] := Convert_FP32_To_FP64(a[k+31:k])
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vcvtps2pd' form='zmm {k}, ymm {sae}'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512d" name="_mm512_cvtps_pd">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Convert</category>
+	<parameter varname="a" type="__m256"/>
+	<description>Convert packed single-precision (32-bit) floating-point elements in "a" to packed double-precision (64-bit) floating-point elements, and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 7
+	i := 64*j
+	k := 32*j
+	dst[i+63:i] := Convert_FP32_To_FP64(a[k+31:k])
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vcvtps2pd' form='zmm {k}, ymm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512d" name="_mm512_mask_cvt_roundps_pd">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Convert</category>
+	<parameter varname="src" type="__m512d"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m256"/>
+	<parameter varname="sae" type="int"/>
+	<description>Convert packed single-precision (32-bit) floating-point elements in "a" to packed double-precision (64-bit) floating-point elements, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). 
+	Pass __MM_FROUND_NO_EXC to "sae" to suppress all exceptions.
+	</description>
+	<operation>
+FOR j := 0 to 7
+	i := 64*j
+	l := 32*j
+	IF k[j]
+		dst[i+63:i] := Convert_FP32_To_FP64(a[l+31:l])
+	ELSE
+		dst[i+63:i] := src[i+63:i]
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vcvtps2pd' form='zmm {k}, ymm {sae}'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512d" name="_mm512_mask_cvtps_pd">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Convert</category>
+	<parameter varname="src" type="__m512d"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m256"/>
+	<description>Convert packed single-precision (32-bit) floating-point elements in "a" to packed double-precision (64-bit) floating-point elements, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 7
+	i := 64*j
+	l := 32*j
+	IF k[j]
+		dst[i+63:i] := Convert_FP32_To_FP64(a[l+31:l])
+	ELSE
+		dst[i+63:i] := src[i+63:i]
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vcvtps2pd' form='zmm {k}, ymm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512d" name="_mm512_maskz_cvt_roundps_pd">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Convert</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m256"/>
+	<parameter varname="sae" type="int"/>
+	<description>Convert packed single-precision (32-bit) floating-point elements in "a" to packed double-precision (64-bit) floating-point elements, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).
+	Pass __MM_FROUND_NO_EXC to "sae" to suppress all exceptions.
+	</description>
+	<operation>
+FOR j := 0 to 7
+	i := 64*j
+	l := 32*j
+	IF k[j]
+		dst[i+63:i] := Convert_FP32_To_FP64(a[l+31:l])
+	ELSE
+		dst[i+63:i] := 0
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vcvtps2pd' form='zmm {k}, ymm {sae}'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512d" name="_mm512_maskz_cvtps_pd">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Convert</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m256"/>
+	<description>Convert packed single-precision (32-bit) floating-point elements in "a" to packed double-precision (64-bit) floating-point elements, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 7
+	i := 64*j
+	l := 32*j
+	IF k[j]
+		dst[i+63:i] := Convert_FP32_To_FP64(a[l+31:l])
+	ELSE
+		dst[i+63:i] := 0
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vcvtps2pd' form='zmm {k}, ymm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256i" name="_mm512_cvt_roundps_ph">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Convert</category>
+	<parameter varname="a" type="__m512"/>
+	<parameter varname="rounding" type="int"/>
+	<description>Convert packed single-precision (32-bit) floating-point elements in "a" to packed half-precision (16-bit) floating-point elements, and store the results in "dst".
+	[round_note]
+	</description>
+	<operation>
+FOR j := 0 to 15
+	i := 16*j
+	l := 32*j
+	dst[i+15:i] := Convert_FP32_To_FP16FP(a[l+31:l])
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name='vcvtps2ph' form='ymm {k}, zmm {sae}, imm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256i" name="_mm512_cvtps_ph">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Convert</category>
+	<parameter varname="a" type="__m512"/>
+	<parameter varname="rounding" type="int"/>
+	<description>Convert packed single-precision (32-bit) floating-point elements in "a" to packed half-precision (16-bit) floating-point elements, and store the results in "dst".
+	[round_note]
+	</description>
+	<operation>
+FOR j := 0 to 15
+	i := 16*j
+	l := 32*j
+	dst[i+15:i] := Convert_FP32_To_FP16FP(a[l+31:l])
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name='vcvtps2ph' form='ymm {k}, zmm {sae}, imm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256i" name="_mm512_mask_cvt_roundps_ph">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Convert</category>
+	<parameter varname="src" type="__m256i"/>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="a" type="__m512"/>
+	<parameter varname="rounding" type="int"/>
+	<description>Convert packed single-precision (32-bit) floating-point elements in "a" to packed half-precision (16-bit) floating-point elements, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). 
+	[round_note]
+	</description>
+	<operation>
+FOR j := 0 to 15
+	i := 16*j
+	l := 32*j
+	IF k[j]
+		dst[i+15:i] := Convert_FP32_To_FP16FP(a[l+31:l])
+	ELSE
+		dst[i+15:i] := src[i+15:i]
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name='vcvtps2ph' form='ymm {k}, zmm {sae}, imm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256i" name="_mm512_mask_cvtps_ph">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Convert</category>
+	<parameter varname="src" type="__m256i"/>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="a" type="__m512"/>
+	<parameter varname="rounding" type="int"/>
+	<description>Convert packed single-precision (32-bit) floating-point elements in "a" to packed half-precision (16-bit) floating-point elements, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). 
+	[round_note]
+	</description>
+	<operation>
+FOR j := 0 to 15
+	i := 16*j
+	l := 32*j
+	IF k[j]
+		dst[i+15:i] := Convert_FP32_To_FP16FP(a[l+31:l])
+	ELSE
+		dst[i+15:i] := src[i+15:i]
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name='vcvtps2ph' form='ymm {k}, zmm {sae}, imm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256i" name="_mm512_maskz_cvt_roundps_ph">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Convert</category>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="a" type="__m512"/>
+	<parameter varname="rounding" type="int"/>
+	<description>Convert packed single-precision (32-bit) floating-point elements in "a" to packed half-precision (16-bit) floating-point elements, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).
+	[round_note]
+	</description>
+	<operation>
+FOR j := 0 to 15
+	i := 16*j
+	l := 32*j
+	IF k[j]
+		dst[i+15:i] := Convert_FP32_To_FP16FP(a[l+31:l])
+	ELSE
+		dst[i+15:i] := 0
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name='vcvtps2ph' form='ymm {k}, zmm {sae}, imm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256i" name="_mm512_maskz_cvtps_ph">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Convert</category>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="a" type="__m512"/>
+	<parameter varname="rounding" type="int"/>
+	<description>Convert packed single-precision (32-bit) floating-point elements in "a" to packed half-precision (16-bit) floating-point elements, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).
+	[round_note]
+	</description>
+	<operation>
+FOR j := 0 to 15
+	i := 16*j
+	l := 32*j
+	IF k[j]
+		dst[i+15:i] := Convert_FP32_To_FP16FP(a[l+31:l])
+	ELSE
+		dst[i+15:i] := 0
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name='vcvtps2ph' form='ymm {k}, zmm {sae}, imm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512i" name="_mm512_cvt_roundps_epu32">
+	<type>Floating Point</type>
+	<type>Integer</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Convert</category>
+	<parameter varname="a" type="__m512"/>
+	<parameter varname="rounding" type="int"/>
+	<description>Convert packed single-precision (32-bit) floating-point elements in "a" to packed unsigned 32-bit integers, and store the results in "dst". 
+	[round_note]
+	</description>
+	<operation>
+FOR j := 0 to 15
+	i := 32*j
+	dst[i+31:i] := Convert_FP32_To_UnsignedInt32(a[i+31:i])
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vcvtps2udq' form='zmm {k}, zmm {er}'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512i" name="_mm512_cvtps_epu32">
+	<type>Floating Point</type>
+	<type>Integer</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Convert</category>
+	<parameter varname="a" type="__m512"/>
+	<description>Convert packed single-precision (32-bit) floating-point elements in "a" to packed unsigned 32-bit integers, and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 15
+	i := 32*j
+	dst[i+31:i] := Convert_FP32_To_UnsignedInt32(a[i+31:i])
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vcvtps2udq' form='zmm {k}, zmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512i" name="_mm512_mask_cvt_roundps_epu32">
+	<type>Floating Point</type>
+	<type>Integer</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Convert</category>
+	<parameter varname="src" type="__m512i"/>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="a" type="__m512"/>
+	<parameter varname="rounding" type="int"/>
+	<description>Convert packed single-precision (32-bit) floating-point elements in "a" to packed unsigned 32-bit integers, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). 
+	[round_note]
+	</description>
+	<operation>
+FOR j := 0 to 15
+	i := j*32
+	IF k[j]
+		dst[i+31:i] := Convert_FP32_To_UnsignedInt32(a[i+31:i])
+	ELSE
+		dst[i+31:i] := src[i+31:i]
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vcvtps2udq' form='zmm {k}, zmm {er}'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512i" name="_mm512_mask_cvtps_epu32">
+	<type>Floating Point</type>
+	<type>Integer</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Convert</category>
+	<parameter varname="src" type="__m512i"/>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="a" type="__m512"/>
+	<description>Convert packed single-precision (32-bit) floating-point elements in "a" to packed unsigned 32-bit integers, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 15
+	i := 32*j
+	IF k[j]
+		dst[i+31:i] := Convert_FP32_To_UnsignedInt32(a[i+31:i])
+	ELSE
+		dst[i+31:i] := src[i+31:i]
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vcvtps2udq' form='zmm {k}, zmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512i" name="_mm512_maskz_cvt_roundps_epu32">
+	<type>Floating Point</type>
+	<type>Integer</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Convert</category>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="a" type="__m512"/>
+	<parameter varname="rounding" type="int"/>
+	<description>Convert packed single-precision (32-bit) floating-point elements in "a" to packed unsigned 32-bit integers, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). 
+	[round_note]
+	</description>
+	<operation>
+FOR j := 0 to 15
+	i := 32*j
+	IF k[j]
+		dst[i+31:i] := Convert_FP32_To_UnsignedInt32(a[i+31:i])
+	ELSE
+		dst[i+31:i] := 0
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vcvtps2udq' form='zmm {k}, zmm {er}'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512i" name="_mm512_maskz_cvtps_epu32">
+	<type>Floating Point</type>
+	<type>Integer</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Convert</category>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="a" type="__m512"/>
+	<description>Convert packed single-precision (32-bit) floating-point elements in "a" to packed unsigned 32-bit integers, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 15
+	i := 32*j
+	IF k[j]
+		dst[i+31:i] := Convert_FP32_To_UnsignedInt32(a[i+31:i])
+	ELSE
+		dst[i+31:i] := 0
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vcvtps2udq' form='zmm {k}, zmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="int" name="_mm_cvt_roundsd_i32">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Convert</category>
+	<parameter varname="a" type="__m128d"/>
+	<parameter varname="rounding" type="int"/>
+	<description>Convert the lower double-precision (64-bit) floating-point element in "a" to a 32-bit integer, and store the result in "dst".
+	[round_note]
+	</description>
+	<operation>
+dst[31:0] := Convert_FP64_To_Int32(a[63:0])
+	</operation>
+	<instruction name='vcvtsd2si' form='r32, xmm {er}'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__int64" name="_mm_cvt_roundsd_i64">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Convert</category>
+	<parameter varname="a" type="__m128d"/>
+	<parameter varname="rounding" type="int"/>
+	<description>Convert the lower double-precision (64-bit) floating-point element in "a" to a 64-bit integer, and store the result in "dst".
+	[round_note]
+	</description>
+	<operation>
+dst[63:0] := Convert_FP64_To_Int64(a[63:0])
+	</operation>
+	<instruction name='vcvtsd2si' form='r64, xmm {er}'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="int" name="_mm_cvt_roundsd_si32">
+	<type>Floating Point</type>
+	<type>Integer</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Convert</category>
+	<parameter varname="a" type="__m128d"/>
+	<parameter varname="rounding" type="int"/>
+	<description>Convert the lower double-precision (64-bit) floating-point element in "a" to a 32-bit integer, and store the result in "dst".
+	[round_note]
+	</description>
+	<operation>
+dst[31:0] := Convert_FP64_To_Int32(a[63:0])
+	</operation>
+	<instruction name='vcvtsd2si' form='r32, xmm {er}'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__int64" name="_mm_cvt_roundsd_si64">
+	<type>Floating Point</type>
+	<type>Integer</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Convert</category>
+	<parameter varname="a" type="__m128d"/>
+	<parameter varname="rounding" type="int"/>
+	<description>Convert the lower double-precision (64-bit) floating-point element in "a" to a 64-bit integer, and store the result in "dst".
+	[round_note]
+	</description>
+	<operation>
+dst[63:0] := Convert_FP64_To_Int64(a[63:0])
+	</operation>
+	<instruction name='vcvtsd2si' form='r64, xmm {er}'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="int" name="_mm_cvtsd_i32">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Convert</category>
+	<parameter varname="a" type="__m128d"/>
+	<description>Convert the lower double-precision (64-bit) floating-point element in "a" to a 32-bit integer, and store the result in "dst".</description>
+	<operation>
+dst[31:0] := Convert_FP64_To_Int32(a[63:0])
+	</operation>
+	<instruction name='vcvtsd2si' form='r32, xmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__int64" name="_mm_cvtsd_i64">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Convert</category>
+	<parameter varname="a" type="__m128d"/>
+	<description>Convert the lower double-precision (64-bit) floating-point element in "a" to a 64-bit integer, and store the result in "dst".</description>
+	<operation>
+dst[63:0] := Convert_FP64_To_Int64(a[63:0])
+	</operation>
+	<instruction name='vcvtsd2si' form='r64, xmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128" name="_mm_cvt_roundsd_ss">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Convert</category>
+	<parameter varname="a" type="__m128"/>
+	<parameter varname="b" type="__m128d"/>
+	<parameter varname="rounding" type="int"/>
+	<description>Convert the lower double-precision (64-bit) floating-point element in "b" to a single-precision (32-bit) floating-point element, store the result in the lower element of "dst", and copy the upper 3 packed elements from "a" to the upper elements of "dst".
+	[round_note]
+	</description>
+	<operation>
+dst[31:0] := Convert_FP64_To_FP32(b[63:0])
+dst[127:32] := a[127:32]
+dst[MAX:128] := 0
+	</operation>
+	<instruction name='vcvtsd2ss' form='xmm {k}, xmm, xmm {er}'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128" name="_mm_mask_cvt_roundsd_ss">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Convert</category>
+	<parameter varname="src" type="__m128"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128"/>
+	<parameter varname="b" type="__m128d"/>
+	<parameter varname="rounding" type="int"/>
+	<description>Convert the lower double-precision (64-bit) floating-point element in "b" to a single-precision (32-bit) floating-point element, store the result in the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper 3 packed elements from "a" to the upper elements of "dst".
+	[round_note]
+	</description>
+	<operation>
+IF k[0]
+	dst[31:0] := Convert_FP64_To_FP32(b[63:0])
+ELSE
+	dst[31:0] := src[31:0]
+FI
+dst[127:32] := a[127:32]
+dst[MAX:128] := 0
+	</operation>
+	<instruction name='vcvtsd2ss' form='xmm {k}, xmm, xmm {er}'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128" name="_mm_mask_cvtsd_ss">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Convert</category>
+	<parameter varname="src" type="__m128"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128"/>
+	<parameter varname="b" type="__m128d"/>
+	<description>Convert the lower double-precision (64-bit) floating-point element in "b" to a single-precision (32-bit) floating-point element, store the result in the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper 3 packed elements from "a" to the upper elements of "dst".
+	</description>
+	<operation>
+IF k[0]
+	dst[31:0] := Convert_FP64_To_FP32(b[63:0])
+ELSE
+	dst[31:0] := src[31:0]
+FI
+dst[127:32] := a[127:32]
+dst[MAX:128] := 0
+	</operation>
+	<instruction name='vcvtsd2ss' form='xmm {k}, xmm, xmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128" name="_mm_maskz_cvt_roundsd_ss">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Convert</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128"/>
+	<parameter varname="b" type="__m128d"/>
+	<parameter varname="rounding" type="int"/>
+	<description>Convert the lower double-precision (64-bit) floating-point element in "b" to a single-precision (32-bit) floating-point element, store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from "a" to the upper elements of "dst". 
+	[round_note]
+	</description>
+	<operation>
+IF k[0]
+	dst[31:0] := Convert_FP64_To_FP32(b[63:0])
+ELSE
+	dst[31:0] := 0
+FI
+dst[127:32] := a[127:32]
+dst[MAX:128] := 0
+	</operation>
+	<instruction name='vcvtsd2ss' form='xmm {k}, xmm, xmm {er}'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128" name="_mm_maskz_cvtsd_ss">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Convert</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128"/>
+	<parameter varname="b" type="__m128d"/>
+	<description>Convert the lower double-precision (64-bit) floating-point element in "b" to a single-precision (32-bit) floating-point element, store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from "a" to the upper elements of "dst". 
+	</description>
+	<operation>
+IF k[0]
+	dst[31:0] := Convert_FP64_To_FP32(b[63:0])
+ELSE
+	dst[31:0] := 0
+FI
+dst[127:32] := a[127:32]
+dst[MAX:128] := 0
+	</operation>
+	<instruction name='vcvtsd2ss' form='xmm {k}, xmm, xmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="unsigned int" name="_mm_cvt_roundsd_u32">
+	<type>Floating Point</type>
+	<type>Integer</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Convert</category>
+	<parameter varname="a" type="__m128d"/>
+	<parameter varname="rounding" type="int"/>
+	<description>Convert the lower double-precision (64-bit) floating-point element in "a" to an unsigned 32-bit integer, and store the result in "dst".
+	[round_note]
+	</description>
+	<operation>
+dst[31:0] := Convert_FP64_To_UnsignedInt32(a[63:0])
+	</operation>
+	<instruction name='vcvtsd2usi' form='r32, xmm {er}'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="unsigned __int64" name="_mm_cvt_roundsd_u64">
+	<type>Floating Point</type>
+	<type>Integer</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Convert</category>
+	<parameter varname="a" type="__m128d"/>
+	<parameter varname="rounding" type="int"/>
+	<description>Convert the lower double-precision (64-bit) floating-point element in "a" to an unsigned 64-bit integer, and store the result in "dst".
+	[round_note]
+	</description>
+	<operation>
+dst[63:0] := Convert_FP64_To_UnsignedInt64(a[63:0])
+	</operation>
+	<instruction name='vcvtsd2usi' form='r64, xmm {er}'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="unsigned int" name="_mm_cvtsd_u32">
+	<type>Floating Point</type>
+	<type>Integer</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Convert</category>
+	<parameter varname="a" type="__m128d"/>
+	<description>Convert the lower double-precision (64-bit) floating-point element in "a" to an unsigned 32-bit integer, and store the result in "dst".</description>
+	<operation>
+dst[31:0] := Convert_FP64_To_UnsignedInt32(a[63:0])
+	</operation>
+	<instruction name='vcvtsd2usi' form='r32, xmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="unsigned __int64" name="_mm_cvtsd_u64">
+	<type>Floating Point</type>
+	<type>Integer</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Convert</category>
+	<parameter varname="a" type="__m128d"/>
+	<description>Convert the lower double-precision (64-bit) floating-point element in "a" to an unsigned 64-bit integer, and store the result in "dst".</description>
+	<operation>
+dst[63:0] := Convert_FP64_To_UnsignedInt64(a[63:0])
+	</operation>
+	<instruction name='vcvtsd2usi' form='r64, xmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128d" name="_mm_cvt_roundi64_sd">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Convert</category>
+	<parameter varname="a" type="__m128d"/>
+	<parameter varname="b" type="__int64"/>
+	<parameter varname="rounding" type="int"/>
+	<description>Convert the 64-bit integer "b" to a double-precision (64-bit) floating-point element, store the result in the lower element of "dst", and copy the upper element from "a" to the upper element of "dst". 
+	[round_note]
+	</description>
+	<operation>
+dst[63:0] := Convert_Int64_To_FP64(b[63:0])
+dst[127:64] := a[127:64]
+dst[MAX:128] := 0
+	</operation>
+	<instruction name='vcvtsi2sd' form='xmm, xmm, r64 {er}'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128d" name="_mm_cvt_roundsi64_sd">
+	<type>Floating Point</type>
+	<type>Integer</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Convert</category>
+	<parameter varname="a" type="__m128d"/>
+	<parameter varname="b" type="__int64"/>
+	<parameter varname="rounding" type="int"/>
+	<description>Convert the 64-bit integer "b" to a double-precision (64-bit) floating-point element, store the result in the lower element of "dst", and copy the upper element from "a" to the upper element of "dst". 
+	[round_note]
+	</description>
+	<operation>
+dst[63:0] := Convert_Int64_To_FP64(b[63:0])
+dst[127:64] := a[127:64]
+dst[MAX:128] := 0
+	</operation>
+	<instruction name='vcvtsi2sd' form='xmm, xmm, r64 {er}'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128d" name="_mm_cvti32_sd">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Convert</category>
+	<parameter varname="a" type="__m128d"/>
+	<parameter varname="b" type="int"/>
+	<description>Convert the 32-bit integer "b" to a double-precision (64-bit) floating-point element, store the result in the lower element of "dst", and copy the upper element from "a" to the upper element of "dst". </description>
+	<operation>
+dst[63:0] := Convert_Int32_To_FP64(b[31:0])
+dst[127:64] := a[127:64]
+dst[MAX:128] := 0
+	</operation>
+	<instruction name='vcvtsi2sd' form='xmm, xmm, r32'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128d" name="_mm_cvti64_sd">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Convert</category>
+	<parameter varname="a" type="__m128d"/>
+	<parameter varname="b" type="__int64"/>
+	<description>Convert the 64-bit integer "b" to a double-precision (64-bit) floating-point element, store the result in the lower element of "dst", and copy the upper element from "a" to the upper element of "dst". </description>
+	<operation>
+dst[63:0] := Convert_Int64_To_FP64(b[63:0])
+dst[127:64] := a[127:64]
+dst[MAX:128] := 0
+	</operation>
+	<instruction name='vcvtsi2sd' form='xmm, xmm, r64'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128" name="_mm_cvt_roundi32_ss">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Convert</category>
+	<parameter varname="a" type="__m128"/>
+	<parameter varname="b" type="int"/>
+	<parameter varname="rounding" type="int"/>
+	<description>Convert the 32-bit integer "b" to a single-precision (32-bit) floating-point element, store the result in the lower element of "dst", and copy the upper 3 packed elements from "a" to the upper elements of "dst". 
+	[round_note]
+	</description>
+	<operation>
+dst[31:0] := Convert_Int32_To_FP32(b[31:0])
+dst[127:32] := a[127:32]
+dst[MAX:128] := 0
+	</operation>
+	<instruction name='vcvtsi2ss' form='xmm, xmm, r32 {er}'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128" name="_mm_cvt_roundi64_ss">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Convert</category>
+	<parameter varname="a" type="__m128"/>
+	<parameter varname="b" type="__int64"/>
+	<parameter varname="rounding" type="int"/>
+	<description>Convert the 64-bit integer "b" to a single-precision (32-bit) floating-point element, store the result in the lower element of "dst", and copy the upper 3 packed elements from "a" to the upper elements of "dst". 
+	[round_note]
+	</description>
+	<operation>
+dst[31:0] := Convert_Int64_To_FP32(b[63:0])
+dst[127:32] := a[127:32]
+dst[MAX:128] := 0
+	</operation>
+	<instruction name='vcvtsi2ss' form='xmm, xmm, r64 {er}'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128" name="_mm_cvt_roundsi32_ss">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Convert</category>
+	<parameter varname="a" type="__m128"/>
+	<parameter varname="b" type="int"/>
+	<parameter varname="rounding" type="int"/>
+	<description>Convert the 32-bit integer "b" to a single-precision (32-bit) floating-point element, store the result in the lower element of "dst", and copy the upper 3 packed elements from "a" to the upper elements of "dst". 
+	[round_note]
+	</description>
+	<operation>
+dst[31:0] := Convert_Int32_To_FP32(b[31:0])
+dst[127:32] := a[127:32]
+dst[MAX:128] := 0
+	</operation>
+	<instruction name='vcvtsi2ss' form='xmm, xmm, r32 {er}'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128" name="_mm_cvt_roundsi64_ss">
+	<type>Floating Point</type>
+	<type>Integer</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Convert</category>
+	<parameter varname="a" type="__m128"/>
+	<parameter varname="b" type="__int64"/>
+	<parameter varname="rounding" type="int"/>
+	<description>Convert the 64-bit integer "b" to a single-precision (32-bit) floating-point element, store the result in the lower element of "dst", and copy the upper 3 packed elements from "a" to the upper elements of "dst". 
+	[round_note]
+	</description>
+	<operation>
+dst[31:0] := Convert_Int64_To_FP32(b[63:0])
+dst[127:32] := a[127:32]
+dst[MAX:128] := 0
+	</operation>
+	<instruction name='vcvtsi2ss' form='xmm, xmm, r64 {er}'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128" name="_mm_cvti32_ss">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Convert</category>
+	<parameter varname="a" type="__m128"/>
+	<parameter varname="b" type="int"/>
+	<description>Convert the 32-bit integer "b" to a single-precision (32-bit) floating-point element, store the result in the lower element of "dst", and copy the upper 3 packed elements from "a" to the upper elements of "dst".</description>
+	<operation>
+dst[31:0] := Convert_Int32_To_FP32(b[31:0])
+dst[127:32] := a[127:32]
+dst[MAX:128] := 0
+	</operation>
+	<instruction name='vcvtsi2ss' form='xmm, xmm, r32'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128" name="_mm_cvti64_ss">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Convert</category>
+	<parameter varname="a" type="__m128"/>
+	<parameter varname="b" type="__int64"/>
+	<description>Convert the 64-bit integer "b" to a single-precision (32-bit) floating-point element, store the result in the lower element of "dst", and copy the upper 3 packed elements from "a" to the upper elements of "dst".</description>
+	<operation>
+dst[31:0] := Convert_Int64_To_FP32(b[63:0])
+dst[127:32] := a[127:32]
+dst[MAX:128] := 0
+	</operation>
+	<instruction name='vcvtsi2ss' form='xmm, xmm, r64'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128d" name="_mm_cvt_roundss_sd">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Convert</category>
+	<parameter varname="a" type="__m128d"/>
+	<parameter varname="b" type="__m128"/>
+	<parameter varname="rounding" type="int"/>
+	<description>Convert the lower single-precision (32-bit) floating-point element in "b" to a double-precision (64-bit) floating-point element, store the result in the lower element of "dst", and copy the upper element from "a" to the upper element of "dst". 
+	[round_note]
+	</description>
+	<operation>
+dst[63:0] := Convert_FP32_To_FP64(b[31:0])
+dst[127:64] := a[127:64]
+dst[MAX:128] := 0
+	</operation>
+	<instruction name='vcvtss2sd' form='xmm {k}, xmm, xmm {er}'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128d" name="_mm_mask_cvt_roundss_sd">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Convert</category>
+	<parameter varname="src" type="__m128d"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128d"/>
+	<parameter varname="b" type="__m128"/>
+	<parameter varname="rounding" type="int"/>
+	<description>Convert the lower single-precision (32-bit) floating-point element in "b" to a double-precision (64-bit) floating-point element, store the result in the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst".
+	[round_note]
+	</description>
+	<operation>
+IF k[0]
+	dst[63:0] := Convert_FP32_To_FP64(b[31:0])
+ELSE
+	dst[63:0] := src[63:0]
+FI
+dst[127:64] := a[127:64]
+dst[MAX:128] := 0
+	</operation>
+	<instruction name='vcvtss2sd' form='xmm {k}, xmm, xmm {er}'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128d" name="_mm_mask_cvtss_sd">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Convert</category>
+	<parameter varname="src" type="__m128d"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128d"/>
+	<parameter varname="b" type="__m128"/>
+	<description>Convert the lower single-precision (32-bit) floating-point element in "b" to a double-precision (64-bit) floating-point element, store the result in the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst".
+	</description>
+	<operation>
+IF k[0]
+	dst[63:0] := Convert_FP32_To_FP64(b[31:0])
+ELSE
+	dst[63:0] := src[63:0]
+FI
+dst[127:64] := a[127:64]
+dst[MAX:128] := 0
+	</operation>
+	<instruction name='vcvtss2sd' form='xmm {k}, xmm, xmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128d" name="_mm_maskz_cvt_roundss_sd">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Convert</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128d"/>
+	<parameter varname="b" type="__m128"/>
+	<parameter varname="rounding" type="int"/>
+	<description>Convert the lower single-precision (32-bit) floating-point element in "b" to a double-precision (64-bit) floating-point element, store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst". 
+	[round_note]
+	</description>
+	<operation>
+IF k[0]
+	dst[63:0] := Convert_FP32_To_FP64(b[31:0])
+ELSE
+	dst[63:0] := 0
+FI
+dst[127:64] := a[127:64]
+dst[MAX:128] := 0
+	</operation>
+	<instruction name='vcvtss2sd' form='xmm {k}, xmm, xmm {er}'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128d" name="_mm_maskz_cvtss_sd">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Convert</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128d"/>
+	<parameter varname="b" type="__m128"/>
+	<description>Convert the lower single-precision (32-bit) floating-point element in "b" to a double-precision (64-bit) floating-point element, store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst".</description>
+	<operation>
+IF k[0]
+	dst[63:0] := Convert_FP32_To_FP64(b[31:0])
+ELSE
+	dst[63:0] := 0
+FI
+dst[127:64] := a[127:64]
+dst[MAX:128] := 0
+	</operation>
+	<instruction name='vcvtss2sd' form='xmm {k}, xmm, xmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="int" name="_mm_cvt_roundss_i32">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Convert</category>
+	<parameter varname="a" type="__m128"/>
+	<parameter varname="rounding" type="int"/>
+	<description>Convert the lower single-precision (32-bit) floating-point element in "a" to a 32-bit integer, and store the result in "dst".
+	[round_note]
+	</description>
+	<operation>
+dst[31:0] := Convert_FP32_To_Int32(a[31:0])
+	</operation>
+	<instruction name='vcvtss2si' form='r32, xmm {er}'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__int64" name="_mm_cvt_roundss_i64">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Convert</category>
+	<parameter varname="a" type="__m128"/>
+	<parameter varname="rounding" type="int"/>
+	<description>Convert the lower single-precision (32-bit) floating-point element in "a" to a 64-bit integer, and store the result in "dst".
+	[round_note]
+	</description>
+	<operation>
+dst[63:0] := Convert_FP32_To_Int64(a[31:0])
+	</operation>
+	<instruction name='vcvtss2si' form='r64, xmm {er}'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="int" name="_mm_cvt_roundss_si32">
+	<type>Floating Point</type>
+	<type>Integer</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Convert</category>
+	<parameter varname="a" type="__m128"/>
+	<parameter varname="rounding" type="int"/>
+	<description>Convert the lower single-precision (32-bit) floating-point element in "a" to a 32-bit integer, and store the result in "dst".
+	[round_note]
+	</description>
+	<operation>
+dst[31:0] := Convert_FP32_To_Int32(a[31:0])
+	</operation>
+	<instruction name='vcvtss2si' form='r32, xmm {er}'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__int64" name="_mm_cvt_roundss_si64">
+	<type>Floating Point</type>
+	<type>Integer</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Convert</category>
+	<parameter varname="a" type="__m128"/>
+	<parameter varname="rounding" type="int"/>
+	<description>Convert the lower single-precision (32-bit) floating-point element in "a" to a 64-bit integer, and store the result in "dst".
+	[round_note]
+	</description>
+	<operation>
+dst[63:0] := Convert_FP32_To_Int64(a[31:0])
+	</operation>
+	<instruction name='vcvtss2si' form='r64, xmm {er}'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="int" name="_mm_cvtss_i32">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Convert</category>
+	<parameter varname="a" type="__m128"/>
+	<description>Convert the lower single-precision (32-bit) floating-point element in "a" to a 32-bit integer, and store the result in "dst".</description>
+	<operation>
+dst[31:0] := Convert_FP32_To_Int32(a[31:0])
+	</operation>
+	<instruction name='vcvtss2si' form='r32, xmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__int64" name="_mm_cvtss_i64">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Convert</category>
+	<parameter varname="a" type="__m128"/>
+	<description>Convert the lower single-precision (32-bit) floating-point element in "a" to a 64-bit integer, and store the result in "dst".</description>
+	<operation>
+dst[63:0] := Convert_FP32_To_Int64(a[31:0])
+	</operation>
+	<instruction name='vcvtss2si' form='r64, xmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="unsigned int" name="_mm_cvt_roundss_u32">
+	<type>Floating Point</type>
+	<type>Integer</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Convert</category>
+	<parameter varname="a" type="__m128"/>
+	<parameter varname="rounding" type="int"/>
+	<description>Convert the lower single-precision (32-bit) floating-point element in "a" to an unsigned 32-bit integer, and store the result in "dst".
+	[round_note]
+	</description>
+	<operation>
+dst[31:0] := Convert_FP32_To_UnsignedInt32(a[31:0])
+	</operation>
+	<instruction name='vcvtss2usi' form='r32, xmm {er}'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="unsigned __int64" name="_mm_cvt_roundss_u64">
+	<type>Floating Point</type>
+	<type>Integer</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Convert</category>
+	<parameter varname="a" type="__m128"/>
+	<parameter varname="rounding" type="int"/>
+	<description>Convert the lower single-precision (32-bit) floating-point element in "a" to an unsigned 64-bit integer, and store the result in "dst".
+	[round_note]
+	</description>
+	<operation>
+dst[63:0] := Convert_FP32_To_UnsignedInt64(a[31:0])
+	</operation>
+	<instruction name='vcvtss2usi' form='r64, xmm {er}'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="unsigned int" name="_mm_cvtss_u32">
+	<type>Floating Point</type>
+	<type>Integer</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Convert</category>
+	<parameter varname="a" type="__m128"/>
+	<description>Convert the lower single-precision (32-bit) floating-point element in "a" to an unsigned 32-bit integer, and store the result in "dst".</description>
+	<operation>
+dst[31:0] := Convert_FP32_To_UnsignedInt32(a[31:0])
+	</operation>
+	<instruction name='vcvtss2usi' form='r32, xmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="unsigned __int64" name="_mm_cvtss_u64">
+	<type>Floating Point</type>
+	<type>Integer</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Convert</category>
+	<parameter varname="a" type="__m128"/>
+	<description>Convert the lower single-precision (32-bit) floating-point element in "a" to an unsigned 64-bit integer, and store the result in "dst".</description>
+	<operation>
+dst[63:0] := Convert_FP32_To_UnsignedInt64(a[31:0])
+	</operation>
+	<instruction name='vcvtss2usi' form='r64, xmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256i" name="_mm512_cvtt_roundpd_epi32">
+	<type>Floating Point</type>
+	<type>Integer</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Convert</category>
+	<parameter varname="a" type="__m512d"/>
+	<parameter varname="sae" type="int"/>
+	<description>Convert packed double-precision (64-bit) floating-point elements in "a" to packed 32-bit integers with truncation, and store the results in "dst". 
+	Pass __MM_FROUND_NO_EXC to "sae" to suppress all exceptions.</description>
+	<operation>
+FOR j := 0 to 7
+	i := 32*j
+	k := 64*j
+	dst[i+31:i] := Convert_FP64_To_IntegerTruncate(a[k+63:k])
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name='vcvttpd2dq' form='ymm {k}, zmm {sae}'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256i" name="_mm512_cvttpd_epi32">
+	<type>Floating Point</type>
+	<type>Integer</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Convert</category>
+	<parameter varname="a" type="__m512d"/>
+	<description>Convert packed double-precision (64-bit) floating-point elements in "a" to packed 32-bit integers with truncation, and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 7
+	i := 32*j
+	k := 64*j
+	dst[i+31:i] := Convert_FP64_To_Int32_Truncate(a[k+63:k])
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name='vcvttpd2dq' form='ymm {k}, zmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256i" name="_mm512_mask_cvtt_roundpd_epi32">
+	<type>Floating Point</type>
+	<type>Integer</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Convert</category>
+	<parameter varname="src" type="__m256i"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m512d"/>
+	<parameter varname="sae" type="int"/>
+	<description>Convert packed double-precision (64-bit) floating-point elements in "a" to packed 32-bit integers with truncation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).  Pass __MM_FROUND_NO_EXC to "sae" to suppress all exceptions.</description>
+	<operation>
+FOR j := 0 to 7
+	i := 32*j
+	l := 64*j
+	IF k[j]
+		dst[i+31:i] := Convert_FP64_To_IntegerTruncate(a[l+63:l])
+	ELSE
+		dst[i+31:i] := src[i+31:i]
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name='vcvttpd2dq' form='ymm {k}, zmm {sae}'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256i" name="_mm512_mask_cvttpd_epi32">
+	<type>Floating Point</type>
+	<type>Integer</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Convert</category>
+	<parameter varname="src" type="__m256i"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m512d"/>
+	<description>Convert packed double-precision (64-bit) floating-point elements in "a" to packed 32-bit integers with truncation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 7
+	i := 32*j
+	l := 64*j
+	IF k[j]
+		dst[i+31:i] := Convert_FP64_To_Int32_Truncate(a[l+63:l])
+	ELSE
+		dst[i+31:i] := src[i+31:i]
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name='vcvttpd2dq' form='ymm {k}, zmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256i" name="_mm512_maskz_cvtt_roundpd_epi32">
+	<type>Floating Point</type>
+	<type>Integer</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Convert</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m512d"/>
+	<parameter varname="sae" type="int"/>
+	<description>Convert packed double-precision (64-bit) floating-point elements in "a" to packed 32-bit integers with truncation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). 
+	Pass __MM_FROUND_NO_EXC to "sae" to suppress all exceptions.</description>
+	<operation>
+FOR j := 0 to 7
+	i := 32*j
+	l := 64*j
+	IF k[j]
+		dst[i+31:i] := Convert_FP64_To_IntegerTruncate(a[l+63:l])
+	ELSE
+		dst[i+31:i] := 0
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name='vcvttpd2dq' form='ymm {k}, zmm {sae}'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256i" name="_mm512_maskz_cvttpd_epi32">
+	<type>Floating Point</type>
+	<type>Integer</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Convert</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m512d"/>
+	<description>Convert packed double-precision (64-bit) floating-point elements in "a" to packed 32-bit integers with truncation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 7
+	i := 32*j
+	l := 64*j
+	IF k[j]
+		dst[i+31:i] := Convert_FP64_To_Int32_Truncate(a[l+63:l])
+	ELSE
+		dst[i+31:i] := 0
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name='vcvttpd2dq' form='ymm {k}, zmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256i" name="_mm512_cvtt_roundpd_epu32">
+	<type>Floating Point</type>
+	<type>Integer</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Convert</category>
+	<parameter varname="a" type="__m512d"/>
+	<parameter varname="sae" type="int"/>
+	<description>Convert packed double-precision (64-bit) floating-point elements in "a" to packed unsigned 32-bit integers with truncation, and store the results in "dst". 
+	Pass __MM_FROUND_NO_EXC to "sae" to suppress all exceptions.</description>
+	<operation>
+FOR j := 0 to 7
+	i := 32*j
+	k := 64*j
+	dst[i+31:i] := Convert_FP64_To_UnsignedIntegerTruncate(a[k+63:k])
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name='vcvttpd2udq' form='ymm {k}, zmm {sae}'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256i" name="_mm512_cvttpd_epu32">
+	<type>Floating Point</type>
+	<type>Integer</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Convert</category>
+	<parameter varname="a" type="__m512d"/>
+	<description>Convert packed double-precision (64-bit) floating-point elements in "a" to packed unsigned 32-bit integers with truncation, and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 7
+	i := 32*j
+	k := 64*j
+	dst[i+31:i] := Convert_FP64_To_UnsignedInt32_Truncate(a[k+63:k])
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name='vcvttpd2udq' form='ymm {k}, zmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256i" name="_mm512_mask_cvtt_roundpd_epu32">
+	<type>Floating Point</type>
+	<type>Integer</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Convert</category>
+	<parameter varname="src" type="__m256i"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m512d"/>
+	<parameter varname="sae" type="int"/>
+	<description>Convert packed double-precision (64-bit) floating-point elements in "a" to packed unsigned 32-bit integers with truncation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).  
+	Pass __MM_FROUND_NO_EXC to "sae" to suppress all exceptions.</description>
+	<operation>
+FOR j := 0 to 7
+	i := 32*j
+	l := 64*j
+	IF k[j]
+		dst[i+31:i] := Convert_FP64_To_UnsignedIntegerTruncate(a[l+63:l])
+	ELSE
+		dst[i+31:i] := src[i+31:i]
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name='vcvttpd2udq' form='ymm {k}, zmm {sae}'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256i" name="_mm512_mask_cvttpd_epu32">
+	<type>Floating Point</type>
+	<type>Integer</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Convert</category>
+	<parameter varname="src" type="__m256i"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m512d"/>
+	<description>Convert packed double-precision (64-bit) floating-point elements in "a" to packed unsigned 32-bit integers with truncation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 7
+	i := 32*j
+	l := 64*j
+	IF k[j]
+		dst[i+31:i] := Convert_FP64_To_UnsignedInt32_Truncate(a[l+63:l])
+	ELSE
+		dst[i+31:i] := src[i+31:i]
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name='vcvttpd2udq' form='ymm {k}, zmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256i" name="_mm512_maskz_cvtt_roundpd_epu32">
+	<type>Floating Point</type>
+	<type>Integer</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Convert</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m512d"/>
+	<parameter varname="sae" type="int"/>
+	<description>Convert packed double-precision (64-bit) floating-point elements in "a" to packed unsigned 32-bit integers with truncation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). 
+	Pass __MM_FROUND_NO_EXC to "sae" to suppress all exceptions.</description>
+	<operation>
+FOR j := 0 to 7
+	i := 32*j
+	l := 64*j
+	IF k[j]
+		dst[i+31:i] := Convert_FP64_To_UnsignedIntegerTruncate(a[l+63:l])
+	ELSE
+		dst[i+31:i] := 0
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name='vcvttpd2udq' form='ymm {k}, zmm {sae}'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256i" name="_mm512_maskz_cvttpd_epu32">
+	<type>Floating Point</type>
+	<type>Integer</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Convert</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m512d"/>
+	<description>Convert packed double-precision (64-bit) floating-point elements in "a" to packed unsigned 32-bit integers with truncation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 7
+	i := 32*j
+	l := 64*j
+	IF k[j]
+		dst[i+31:i] := Convert_FP64_To_UnsignedInt32_Truncate(a[l+63:l])
+	ELSE
+		dst[i+31:i] := 0
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name='vcvttpd2udq' form='ymm {k}, zmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512i" name="_mm512_cvtt_roundps_epi32">
+	<type>Floating Point</type>
+	<type>Integer</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Convert</category>
+	<parameter varname="a" type="__m512"/>
+	<parameter varname="sae" type="int"/>
+	<description>Convert packed single-precision (32-bit) floating-point elements in "a" to packed 32-bit integers with truncation, and store the results in "dst". 
+	Pass __MM_FROUND_NO_EXC to "sae" to suppress all exceptions.</description>
+	<operation>
+FOR j := 0 to 15
+	i := 32*j
+	dst[i+31:i] := Convert_FP32_To_IntegerTruncate(a[i+31:i])
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vcvttps2dq' form='zmm {k}, zmm {sae}'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512i" name="_mm512_cvttps_epi32">
+	<type>Floating Point</type>
+	<type>Integer</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Convert</category>
+	<parameter varname="a" type="__m512"/>
+	<description>Convert packed single-precision (32-bit) floating-point elements in "a" to packed 32-bit integers with truncation, and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 15
+	i := 32*j
+	dst[i+31:i] := Convert_FP32_To_Int32_Truncate(a[i+31:i])
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vcvttps2dq' form='zmm {k}, zmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512i" name="_mm512_mask_cvtt_roundps_epi32">
+	<type>Floating Point</type>
+	<type>Integer</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Convert</category>
+	<parameter varname="src" type="__m512i"/>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="a" type="__m512"/>
+	<parameter varname="sae" type="int"/>
+	<description>Convert packed single-precision (32-bit) floating-point elements in "a" to packed 32-bit integers with truncation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). 
+	Pass __MM_FROUND_NO_EXC to "sae" to suppress all exceptions. </description>
+	<operation>
+FOR j := 0 to 15
+	i := 32*j
+	IF k[j]
+		dst[i+31:i] := Convert_FP32_To_IntegerTruncate(a[i+31:i])
+	ELSE
+		dst[i+31:i] := src[i+31:i]
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vcvttps2dq' form='zmm {k}, zmm {sae}'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512i" name="_mm512_mask_cvttps_epi32">
+	<type>Floating Point</type>
+	<type>Integer</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Convert</category>
+	<parameter varname="src" type="__m512i"/>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="a" type="__m512"/>
+	<description>Convert packed single-precision (32-bit) floating-point elements in "a" to packed 32-bit integers with truncation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 15
+	i := 32*j
+	IF k[j]
+		dst[i+31:i] := Convert_FP32_To_Int32_Truncate(a[i+31:i])
+	ELSE
+		dst[i+31:i] := src[i+31:i]
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vcvttps2dq' form='zmm {k}, zmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512i" name="_mm512_maskz_cvtt_roundps_epi32">
+	<type>Floating Point</type>
+	<type>Integer</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Convert</category>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="a" type="__m512"/>
+	<parameter varname="sae" type="int"/>
+	<description>Convert packed single-precision (32-bit) floating-point elements in "a" to packed 32-bit integers with truncation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). 
+	Pass __MM_FROUND_NO_EXC to "sae" to suppress all exceptions.</description>
+	<operation>
+FOR j := 0 to 15
+	i := 32*j
+	IF k[j]
+		dst[i+31:i] := Convert_FP32_To_IntegerTruncate(a[i+31:i])
+	ELSE
+		dst[i+31:i] := 0
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vcvttps2dq' form='zmm {k}, zmm {sae}'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512i" name="_mm512_maskz_cvttps_epi32">
+	<type>Floating Point</type>
+	<type>Integer</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Convert</category>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="a" type="__m512"/>
+	<description>Convert packed single-precision (32-bit) floating-point elements in "a" to packed 32-bit integers with truncation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 15
+	i := 32*j
+	IF k[j]
+		dst[i+31:i] := Convert_FP32_To_Int32_Truncate(a[i+31:i])
+	ELSE
+		dst[i+31:i] := 0
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vcvttps2dq' form='zmm {k}, zmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512i" name="_mm512_cvtt_roundps_epu32">
+	<type>Floating Point</type>
+	<type>Integer</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Convert</category>
+	<parameter varname="a" type="__m512"/>
+	<parameter varname="sae" type="int"/>
+	<description>Convert packed single-precision (32-bit) floating-point elements in "a" to packed unsigned 32-bit integers with truncation, and store the results in "dst". 
+	Pass __MM_FROUND_NO_EXC to "sae" to suppress all exceptions.</description>
+	<operation>
+FOR j := 0 to 15
+	i := 32*j
+	dst[i+31:i] := Convert_FP32_To_UnsignedIntegerTruncate(a[i+31:i])
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vcvttps2udq' form='zmm {k}, zmm {sae}'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512i" name="_mm512_cvttps_epu32">
+	<type>Floating Point</type>
+	<type>Integer</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Convert</category>
+	<parameter varname="a" type="__m512"/>
+	<description>Convert packed single-precision (32-bit) floating-point elements in "a" to packed unsigned 32-bit integers with truncation, and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 15
+	i := 32*j
+	dst[i+31:i] := Convert_FP32_To_UnsignedInt32_Truncate(a[i+31:i])
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vcvttps2udq' form='zmm {k}, zmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512i" name="_mm512_mask_cvtt_roundps_epu32">
+	<type>Floating Point</type>
+	<type>Integer</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Convert</category>
+	<parameter varname="src" type="__m512i"/>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="a" type="__m512"/>
+	<parameter varname="sae" type="int"/>
+	<description>Convert packed single-precision (32-bit) floating-point elements in "a" to packed unsigned 32-bit integers with truncation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).  
+	Pass __MM_FROUND_NO_EXC to "sae" to suppress all exceptions.</description>
+	<operation>
+FOR j := 0 to 15
+	i := 32*j
+	IF k[j]
+		dst[i+31:i] := Convert_FP32_To_UnsignedIntegerTruncate(a[i+31:i])
+	ELSE
+		dst[i+31:i] := src[i+31:i]
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vcvttps2udq' form='zmm {k}, zmm {sae}'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512i" name="_mm512_mask_cvttps_epu32">
+	<type>Floating Point</type>
+	<type>Integer</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Convert</category>
+	<parameter varname="src" type="__m512i"/>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="a" type="__m512"/>
+	<description>Convert packed single-precision (32-bit) floating-point elements in "a" to packed unsigned 32-bit integers with truncation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 15
+	i := 32*j
+	IF k[j]
+		dst[i+31:i] := Convert_FP32_To_UnsignedInt32_Truncate(a[i+31:i])
+	ELSE
+		dst[i+31:i] := src[i+31:i]
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vcvttps2udq' form='zmm {k}, zmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512i" name="_mm512_maskz_cvtt_roundps_epu32">
+	<type>Floating Point</type>
+	<type>Integer</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Convert</category>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="a" type="__m512"/>
+	<parameter varname="sae" type="int"/>
+	<description>Convert packed single-precision (32-bit) floating-point elements in "a" to packed unsigned 32-bit integers with truncation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). 
+	Pass __MM_FROUND_NO_EXC to "sae" to suppress all exceptions.</description>
+	<operation>
+FOR j := 0 to 15
+	i := 32*j
+	IF k[j]
+		dst[i+31:i] := Convert_FP32_To_UnsignedIntegerTruncate(a[i+31:i])
+	ELSE
+		dst[i+31:i] := 0
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vcvttps2udq' form='zmm {k}, zmm {sae}'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512i" name="_mm512_maskz_cvttps_epu32">
+	<type>Floating Point</type>
+	<type>Integer</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Convert</category>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="a" type="__m512"/>
+	<description>Convert packed single-precision (32-bit) floating-point elements in "a" to packed unsigned 32-bit integers with truncation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 15
+	i := 32*j
+	IF k[j]
+		dst[i+31:i] := Convert_FP32_To_UnsignedInt32_Truncate(a[i+31:i])
+	ELSE
+		dst[i+31:i] := 0
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vcvttps2udq' form='zmm {k}, zmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="int" name="_mm_cvtt_roundsd_i32">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Convert</category>
+	<parameter varname="a" type="__m128d"/>
+	<parameter varname="rounding" type="int"/>
+	<description>Convert the lower double-precision (64-bit) floating-point element in "a" to a 32-bit integer with truncation, and store the result in "dst".
+	[round_note]
+	</description>
+	<operation>
+dst[31:0] := Convert_FP64_To_Int32_Truncate(a[63:0])
+	</operation>
+	<instruction name='vcvttsd2si' form='r32, xmm {er}'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__int64" name="_mm_cvtt_roundsd_i64">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Convert</category>
+	<parameter varname="a" type="__m128d"/>
+	<parameter varname="rounding" type="int"/>
+	<description>Convert the lower double-precision (64-bit) floating-point element in "a" to a 64-bit integer with truncation, and store the result in "dst".
+	[round_note]
+	</description>
+	<operation>
+dst[63:0] := Convert_FP64_To_Int64_Truncate(a[63:0])
+	</operation>
+	<instruction name='vcvttsd2si' form='r64, xmm {er}'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="int" name="_mm_cvtt_roundsd_si32">
+	<type>Floating Point</type>
+	<type>Integer</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Convert</category>
+	<parameter varname="a" type="__m128d"/>
+	<parameter varname="rounding" type="int"/>
+	<description>Convert the lower double-precision (64-bit) floating-point element in "a" to a 32-bit integer with truncation, and store the result in "dst".
+	[round_note]
+	</description>
+	<operation>
+dst[31:0] := Convert_FP64_To_Int32_Truncate(a[63:0])
+	</operation>
+	<instruction name='vcvttsd2si' form='r32, xmm {er}'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__int64" name="_mm_cvtt_roundsd_si64">
+	<type>Floating Point</type>
+	<type>Integer</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Convert</category>
+	<parameter varname="a" type="__m128d"/>
+	<parameter varname="rounding" type="int"/>
+	<description>Convert the lower double-precision (64-bit) floating-point element in "a" to a 64-bit integer with truncation, and store the result in "dst".
+	[round_note]
+	</description>
+	<operation>
+dst[63:0] := Convert_FP64_To_Int64_Truncate(a[63:0])
+	</operation>
+	<instruction name='vcvttsd2si' form='r64, xmm {er}'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="int" name="_mm_cvttsd_i32">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Convert</category>
+	<parameter varname="a" type="__m128d"/>
+	<description>Convert the lower double-precision (64-bit) floating-point element in "a" to a 32-bit integer with truncation, and store the result in "dst".</description>
+	<operation>
+dst[31:0] := Convert_FP64_To_Int32_Truncate(a[63:0])
+	</operation>
+	<instruction name='vcvttsd2si' form='r32, xmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__int64" name="_mm_cvttsd_i64">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Convert</category>
+	<parameter varname="a" type="__m128d"/>
+	<description>Convert the lower double-precision (64-bit) floating-point element in "a" to a 64-bit integer with truncation, and store the result in "dst".</description>
+	<operation>
+dst[63:0] := Convert_FP64_To_Int64_Truncate(a[63:0])
+	</operation>
+	<instruction name='vcvttsd2si' form='r64, xmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="unsigned int" name="_mm_cvtt_roundsd_u32">
+	<type>Floating Point</type>
+	<type>Integer</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Convert</category>
+	<parameter varname="a" type="__m128d"/>
+	<parameter varname="rounding" type="int"/>
+	<description>Convert the lower double-precision (64-bit) floating-point element in "a" to an unsigned 32-bit integer with truncation, and store the result in "dst".
+	[round_note]
+	</description>
+	<operation>
+dst[31:0] := Convert_FP64_To_UnsignedInt32_Truncate(a[63:0])
+	</operation>
+	<instruction name='vcvttsd2usi' form='r32, xmm {er}'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="unsigned __int64" name="_mm_cvtt_roundsd_u64">
+	<type>Floating Point</type>
+	<type>Integer</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Convert</category>
+	<parameter varname="a" type="__m128d"/>
+	<parameter varname="rounding" type="int"/>
+	<description>Convert the lower double-precision (64-bit) floating-point element in "a" to an unsigned 64-bit integer with truncation, and store the result in "dst".
+	[round_note]
+	</description>
+	<operation>
+dst[63:0] := Convert_FP64_To_UnsignedInt64_Truncate(a[63:0])
+	</operation>
+	<instruction name='vcvttsd2usi' form='r64, xmm {er}'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="unsigned int" name="_mm_cvttsd_u32">
+	<type>Floating Point</type>
+	<type>Integer</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Convert</category>
+	<parameter varname="a" type="__m128d"/>
+	<description>Convert the lower double-precision (64-bit) floating-point element in "a" to an unsigned 32-bit integer with truncation, and store the result in "dst".</description>
+	<operation>
+dst[31:0] := Convert_FP64_To_UnsignedInt32_Truncate(a[63:0])
+	</operation>
+	<instruction name='vcvttsd2usi' form='r32, xmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="unsigned __int64" name="_mm_cvttsd_u64">
+	<type>Floating Point</type>
+	<type>Integer</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Convert</category>
+	<parameter varname="a" type="__m128d"/>
+	<description>Convert the lower double-precision (64-bit) floating-point element in "a" to an unsigned 64-bit integer with truncation, and store the result in "dst".</description>
+	<operation>
+dst[63:0] := Convert_FP64_To_UnsignedInt64_Truncate(a[63:0])
+	</operation>
+	<instruction name='vcvttsd2usi' form='r64, xmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="int" name="_mm_cvtt_roundss_i32">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Convert</category>
+	<parameter varname="a" type="__m128"/>
+	<parameter varname="rounding" type="int"/>
+	<description>Convert the lower single-precision (32-bit) floating-point element in "a" to a 32-bit integer with truncation, and store the result in "dst".
+	[round_note]
+	</description>
+	<operation>
+dst[31:0] := Convert_FP32_To_Int32_Truncate(a[31:0])
+	</operation>
+	<instruction name='vcvttss2si' form='r32, xmm {er}'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__int64" name="_mm_cvtt_roundss_i64">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Convert</category>
+	<parameter varname="a" type="__m128"/>
+	<parameter varname="rounding" type="int"/>
+	<description>Convert the lower single-precision (32-bit) floating-point element in "a" to a 64-bit integer with truncation, and store the result in "dst".
+	[round_note]
+	</description>
+	<operation>
+dst[63:0] := Convert_FP32_To_Int64_Truncate(a[31:0])
+	</operation>
+	<instruction name='vcvttss2si' form='r64, xmm {er}'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="int" name="_mm_cvtt_roundss_si32">
+	<type>Floating Point</type>
+	<type>Integer</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Convert</category>
+	<parameter varname="a" type="__m128"/>
+	<parameter varname="rounding" type="int"/>
+	<description>Convert the lower single-precision (32-bit) floating-point element in "a" to a 32-bit integer with truncation, and store the result in "dst".
+	[round_note]
+	</description>
+	<operation>
+dst[31:0] := Convert_FP32_To_Int32_Truncate(a[31:0])
+	</operation>
+	<instruction name='vcvttss2si' form='r32, xmm {er}'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__int64" name="_mm_cvtt_roundss_si64">
+	<type>Floating Point</type>
+	<type>Integer</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Convert</category>
+	<parameter varname="a" type="__m128"/>
+	<parameter varname="rounding" type="int"/>
+	<description>Convert the lower single-precision (32-bit) floating-point element in "a" to a 64-bit integer with truncation, and store the result in "dst".
+	[round_note]
+	</description>
+	<operation>
+dst[63:0] := Convert_FP32_To_Int64_Truncate(a[31:0])
+	</operation>
+	<instruction name='vcvttss2si' form='r64, xmm {er}'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="int" name="_mm_cvttss_i32">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Convert</category>
+	<parameter varname="a" type="__m128"/>
+	<description>Convert the lower single-precision (32-bit) floating-point element in "a" to a 32-bit integer with truncation, and store the result in "dst".</description>
+	<operation>
+dst[31:0] := Convert_FP32_To_Int32_Truncate(a[31:0])
+	</operation>
+	<instruction name='vcvttss2si' form='r32, xmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__int64" name="_mm_cvttss_i64">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Convert</category>
+	<parameter varname="a" type="__m128"/>
+	<description>Convert the lower single-precision (32-bit) floating-point element in "a" to a 64-bit integer with truncation, and store the result in "dst".</description>
+	<operation>
+dst[63:0] := Convert_FP32_To_Int64_Truncate(a[31:0])
+	</operation>
+	<instruction name='vcvttss2si' form='r64, xmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="unsigned int" name="_mm_cvtt_roundss_u32">
+	<type>Floating Point</type>
+	<type>Integer</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Convert</category>
+	<parameter varname="a" type="__m128"/>
+	<parameter varname="rounding" type="int"/>
+	<description>Convert the lower single-precision (32-bit) floating-point element in "a" to an unsigned 32-bit integer with truncation, and store the result in "dst".
+	[round_note]
+	</description>
+	<operation>
+dst[31:0] := Convert_FP32_To_UnsignedInt32_Truncate(a[31:0])
+	</operation>
+	<instruction name='vcvttss2usi' form='r32, xmm {er}'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="unsigned __int64" name="_mm_cvtt_roundss_u64">
+	<type>Floating Point</type>
+	<type>Integer</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Convert</category>
+	<parameter varname="a" type="__m128"/>
+	<parameter varname="rounding" type="int"/>
+	<description>Convert the lower single-precision (32-bit) floating-point element in "a" to an unsigned 64-bit integer with truncation, and store the result in "dst".
+	[round_note]
+	</description>
+	<operation>
+dst[63:0] := Convert_FP32_To_UnsignedInt64_Truncate(a[31:0])
+	</operation>
+	<instruction name='vcvttss2usi' form='r64, xmm {er}'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="unsigned int" name="_mm_cvttss_u32">
+	<type>Floating Point</type>
+	<type>Integer</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Convert</category>
+	<parameter varname="a" type="__m128"/>
+	<description>Convert the lower single-precision (32-bit) floating-point element in "a" to an unsigned 32-bit integer with truncation, and store the result in "dst".</description>
+	<operation>
+dst[31:0] := Convert_FP32_To_UnsignedInt32_Truncate(a[31:0])
+	</operation>
+	<instruction name='vcvttss2usi' form='r32, xmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="unsigned __int64" name="_mm_cvttss_u64">
+	<type>Floating Point</type>
+	<type>Integer</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Convert</category>
+	<parameter varname="a" type="__m128"/>
+	<description>Convert the lower single-precision (32-bit) floating-point element in "a" to an unsigned 64-bit integer with truncation, and store the result in "dst".</description>
+	<operation>
+dst[63:0] := Convert_FP32_To_UnsignedInt64_Truncate(a[31:0])
+	</operation>
+	<instruction name='vcvttss2usi' form='r64, xmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512d" name="_mm512_cvtepu32_pd">
+	<type>Floating Point</type>
+	<type>Integer</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Convert</category>
+	<parameter varname="a" type="__m256i"/>
+	<description>Convert packed unsigned 32-bit integers in "a" to packed double-precision (64-bit) floating-point elements, and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*64
+	l := j*32
+	dst[i+63:i] := ConvertUnsignedIntegerTo_FP64(a[l+31:l])
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vcvtudq2pd' form='zmm {k}, ymm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512d" name="_mm512_mask_cvtepu32_pd">
+	<type>Floating Point</type>
+	<type>Integer</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Convert</category>
+	<parameter varname="src" type="__m512d"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m256i"/>
+	<description>Convert packed unsigned 32-bit integers in "a" to packed double-precision (64-bit) floating-point elements, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). 
+	</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*64
+	l := j*32
+	IF k[j]
+		dst[i+63:i] := ConvertUnsignedIntegerTo_FP64(a[l+31:l])
+	ELSE
+		dst[i+63:i] := src[i+63:i]
+	FI	
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vcvtudq2pd' form='zmm {k}, ymm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512d" name="_mm512_maskz_cvtepu32_pd">
+	<type>Floating Point</type>
+	<type>Integer</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Convert</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m256i"/>
+	<description>Convert packed unsigned 32-bit integers in "a" to packed double-precision (64-bit) floating-point elements, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*64
+	l := j*32
+	IF k[j]
+		dst[i+63:i] := ConvertUnsignedIntegerTo_FP64(a[l+31:l])
+	ELSE
+		dst[i+63:i] := 0
+	FI	
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vcvtudq2pd' form='zmm {k}, ymm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512" name="_mm512_cvt_roundepu32_ps">
+	<type>Floating Point</type>
+	<type>Integer</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Convert</category>
+	<parameter varname="a" type="__m512i"/>
+	<parameter varname="rounding" type="int"/>
+	<description>Convert packed unsigned 32-bit integers in "a" to packed single-precision (32-bit) floating-point elements, and store the results in "dst".
+	[round_note]
+	</description>
+	<operation>
+FOR j := 0 to 15
+	i := 32*j
+	dst[i+31:i] := ConvertUnsignedInt32_To_FP32(a[i+31:i])
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vcvtudq2ps' form='zmm {k}, zmm {er}'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512" name="_mm512_cvtepu32_ps">
+	<type>Floating Point</type>
+	<type>Integer</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Convert</category>
+	<parameter varname="a" type="__m512i"/>
+	<description>Convert packed unsigned 32-bit integers in "a" to packed single-precision (32-bit) floating-point elements, and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 15
+	i := 32*j
+	dst[i+31:i] := ConvertUnsignedInt32_To_FP32(a[i+31:i])
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vcvtudq2ps' form='zmm {k}, zmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512" name="_mm512_mask_cvt_roundepu32_ps">
+	<type>Floating Point</type>
+	<type>Integer</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Convert</category>
+	<parameter varname="src" type="__m512"/>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="a" type="__m512i"/>
+	<parameter varname="rounding" type="int"/>
+	<description>Convert packed unsigned 32-bit integers in "a" to packed single-precision (32-bit) floating-point elements, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). 
+	[round_note]
+	</description>
+	<operation>
+FOR j := 0 to 15
+	i := j*32
+	IF k[j]
+		dst[i+31:i] := ConvertUnsignedInt32_To_FP32(a[i+31:i])
+	ELSE
+		dst[i+31:i] := src[i+31:i]
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vcvtudq2ps' form='zmm {k}, zmm {er}'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512" name="_mm512_mask_cvtepu32_ps">
+	<type>Floating Point</type>
+	<type>Integer</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Convert</category>
+	<parameter varname="src" type="__m512"/>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="a" type="__m512i"/>
+	<description>Convert packed unsigned 32-bit integers in "a" to packed single-precision (32-bit) floating-point elements, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). 
+	</description>
+	<operation>
+FOR j := 0 to 15
+	i := j*32
+	IF k[j]
+		dst[i+31:i] := ConvertUnsignedInt32_To_FP32(a[i+31:i])
+	ELSE
+		dst[i+31:i] := src[i+31:i]
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vcvtudq2ps' form='zmm {k}, zmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512" name="_mm512_maskz_cvt_roundepu32_ps">
+	<type>Floating Point</type>
+	<type>Integer</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Convert</category>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="a" type="__m512i"/>
+	<parameter varname="rounding" type="int"/>
+	<description>Convert packed unsigned 32-bit integers in "a" to packed single-precision (32-bit) floating-point elements, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).
+	[round_note]
+	</description>
+	<operation>
+FOR j := 0 to 15
+	i := 32*j
+	IF k[j]
+		dst[i+31:i] := ConvertUnsignedInt32_To_FP32(a[i+31:i])
+	ELSE
+		dst[i+31:i] := 0
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vcvtudq2ps' form='zmm {k}, zmm {er}'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512" name="_mm512_maskz_cvtepu32_ps">
+	<type>Floating Point</type>
+	<type>Integer</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Convert</category>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="a" type="__m512i"/>
+	<description>Convert packed unsigned 32-bit integers in "a" to packed single-precision (32-bit) floating-point elements, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).
+	</description>
+	<operation>
+FOR j := 0 to 15
+	i := 32*j
+	IF k[j]
+		dst[i+31:i] := ConvertUnsignedInt32_To_FP32(a[i+31:i])
+	ELSE
+		dst[i+31:i] := 0
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vcvtudq2ps' form='zmm {k}, zmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128d" name="_mm_cvt_roundu64_sd">
+	<type>Floating Point</type>
+	<type>Integer</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Convert</category>
+	<parameter varname="a" type="__m128d"/>
+	<parameter varname="b" type="unsigned __int64"/>
+	<parameter varname="rounding" type="int"/>
+	<description>Convert the unsigned 64-bit integer "b" to a double-precision (64-bit) floating-point element, store the result in the lower element of "dst", and copy the upper element from "a" to the upper element of "dst". 
+	[round_note]
+	</description>
+	<operation>
+dst[63:0] := Convert_UnsignedInt64_To_FP64(b[63:0])
+dst[127:64] := a[127:64]
+dst[MAX:128] := 0
+	</operation>
+	<instruction name='vcvtusi2sd' form='xmm, xmm, r64 {er}'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128d" name="_mm_cvtu32_sd">
+	<type>Floating Point</type>
+	<type>Integer</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Convert</category>
+	<parameter varname="a" type="__m128d"/>
+	<parameter varname="b" type="unsigned int"/>
+	<description>Convert the unsigned 32-bit integer "b" to a double-precision (64-bit) floating-point element, store the result in the lower element of "dst", and copy the upper element from "a" to the upper element of "dst".</description>
+	<operation>
+dst[63:0] := Convert_UnsignedInt32_To_FP64(b[31:0])
+dst[127:64] := a[127:64]
+dst[MAX:128] := 0
+	</operation>
+	<instruction name='vcvtusi2sd' form='xmm, xmm, r32'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128d" name="_mm_cvtu64_sd">
+	<type>Floating Point</type>
+	<type>Integer</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Convert</category>
+	<parameter varname="a" type="__m128d"/>
+	<parameter varname="b" type="unsigned __int64"/>
+	<description>Convert the unsigned 64-bit integer "b" to a double-precision (64-bit) floating-point element, store the result in the lower element of "dst", and copy the upper element from "a" to the upper element of "dst".</description>
+	<operation>
+dst[63:0] := Convert_UnsignedInt64_To_FP64(b[63:0])
+dst[127:64] := a[127:64]
+dst[MAX:128] := 0
+	</operation>
+	<instruction name='vcvtusi2sd' form='xmm, xmm, r64'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128" name="_mm_cvt_roundu32_ss">
+	<type>Floating Point</type>
+	<type>Integer</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Convert</category>
+	<parameter varname="a" type="__m128"/>
+	<parameter varname="b" type="unsigned int"/>
+	<parameter varname="rounding" type="int"/>
+	<description>Convert the unsigned 32-bit integer "b" to a single-precision (32-bit) floating-point element, store the result in the lower element of "dst", and copy the upper 3 packed elements from "a" to the upper elements of "dst". 
+	[round_note]
+	</description>
+	<operation>
+dst[31:0] := Convert_UnsignedInt32_To_FP32(b[31:0])
+dst[127:32] := a[127:32]
+dst[MAX:128] := 0
+	</operation>
+	<instruction name='vcvtusi2ss' form='xmm, xmm, r32 {er}'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128" name="_mm_cvt_roundu64_ss">
+	<type>Floating Point</type>
+	<type>Integer</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Convert</category>
+	<parameter varname="a" type="__m128"/>
+	<parameter varname="b" type="unsigned __int64"/>
+	<parameter varname="rounding" type="int"/>
+	<description>Convert the unsigned 64-bit integer "b" to a single-precision (32-bit) floating-point element, store the result in the lower element of "dst", and copy the upper 3 packed elements from "a" to the upper elements of "dst". 
+	[round_note]
+	</description>
+	<operation>
+dst[31:0] := Convert_UnsignedInt64_To_FP32(b[63:0])
+dst[127:32] := a[127:32]
+dst[MAX:128] := 0
+	</operation>
+	<instruction name='vcvtusi2ss' form='xmm, xmm, r64 {er}'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128" name="_mm_cvtu32_ss">
+	<type>Floating Point</type>
+	<type>Integer</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Convert</category>
+	<parameter varname="a" type="__m128"/>
+	<parameter varname="b" type="unsigned int"/>
+	<description>Convert the unsigned 32-bit integer "b" to a single-precision (32-bit) floating-point element, store the result in the lower element of "dst", and copy the upper 3 packed elements from "a" to the upper elements of "dst".</description>
+	<operation>
+dst[31:0] := Convert_UnsignedInt32_To_FP32(b[31:0])
+dst[127:32] := a[127:32]
+dst[MAX:128] := 0
+	</operation>
+	<instruction name='vcvtusi2ss' form='xmm, xmm, r32'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128" name="_mm_cvtu64_ss">
+	<type>Floating Point</type>
+	<type>Integer</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Convert</category>
+	<parameter varname="a" type="__m128"/>
+	<parameter varname="b" type="unsigned __int64"/>
+	<description>Convert the unsigned 64-bit integer "b" to a single-precision (32-bit) floating-point element, store the result in the lower element of "dst", and copy the upper 3 packed elements from "a" to the upper elements of "dst".</description>
+	<operation>
+dst[31:0] := Convert_UnsignedInt64_To_FP32(b[63:0])
+dst[127:32] := a[127:32]
+dst[MAX:128] := 0
+	</operation>
+	<instruction name='vcvtusi2ss' form='xmm, xmm, r64'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512d" name="_mm512_div_pd">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="a" type="__m512d"/>
+	<parameter varname="b" type="__m512d"/>
+	<description>Divide packed double-precision (64-bit) floating-point elements in "a" by packed elements in "b", and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 7
+	i := 64*j
+	dst[i+63:i] := a[i+63:i] / b[i+63:i]
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vdivpd' form='zmm {k}, zmm, zmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512d" name="_mm512_div_round_pd">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="a" type="__m512d"/>
+	<parameter varname="b" type="__m512d"/>
+	<parameter varname="rounding" type="int"/>
+	<description>Divide packed double-precision (64-bit) floating-point elements in "a" by packed elements in "b", and store the results in "dst".
+	[round_note]
+	</description>
+	<operation>
+FOR j := 0 to 7
+	i := 64*j
+	dst[i+63:i] := a[i+63:i] / b[i+63:i]
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vdivpd' form='zmm {k}, zmm, zmm {er}'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512d" name="_mm512_mask_div_pd">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="src" type="__m512d"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m512d"/>
+	<parameter varname="b" type="__m512d"/>
+	<description>Divide packed double-precision (64-bit) floating-point elements in "a" by packed elements in "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 7
+	i := 64*j
+	IF k[j]
+		dst[i+63:i] := a[i+63:i] / b[i+63:i]
+	ELSE
+		dst[i+63:i] := src[i+63:i]
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vdivpd' form='zmm {k}, zmm, zmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512d" name="_mm512_mask_div_round_pd">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="src" type="__m512d"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m512d"/>
+	<parameter varname="b" type="__m512d"/>
+	<parameter varname="rounding" type="int"/>
+	<description>
+	Divide packed double-precision (64-bit) floating-point elements in "a" by packed elements in "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).
+	[round_note]
+	</description>
+	<operation>
+FOR j := 0 to 7
+	i := 64*j
+	IF k[j]
+		dst[i+63:i] := a[i+63:i] / b[i+63:i]
+	ELSE
+		dst[i+63:i] := src[i+63:i]
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vdivpd' form='zmm {k}, zmm, zmm {er}'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512d" name="_mm512_maskz_div_pd">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m512d"/>
+	<parameter varname="b" type="__m512d"/>
+	<description>Divide packed double-precision (64-bit) floating-point elements in "a" by packed elements in "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 7
+	i := 64*j
+	IF k[j]
+		dst[i+63:i] := a[i+63:i] / b[i+63:i]
+	ELSE
+		dst[i+63:i] := 0
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vdivpd' form='zmm {k}, zmm, zmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512d" name="_mm512_maskz_div_round_pd">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m512d"/>
+	<parameter varname="b" type="__m512d"/>
+	<parameter varname="rounding" type="int"/>
+	<description>Divide packed double-precision (64-bit) floating-point elements in "a" by packed elements in "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).
+	[round_note]
+	</description>
+	<operation>
+FOR j := 0 to 7
+	i := 64*j
+	IF k[j]
+		dst[i+63:i] := a[i+63:i] / b[i+63:i]
+	ELSE
+		dst[i+63:i] := 0
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vdivpd' form='zmm {k}, zmm, zmm {er}'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512" name="_mm512_div_ps">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="a" type="__m512"/>
+	<parameter varname="b" type="__m512"/>
+	<description>Divide packed single-precision (32-bit) floating-point elements in "a" by packed elements in "b", and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 15
+	i := 32*j
+	dst[i+31:i] := a[i+31:i] / b[i+31:i]
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vdivps' form='zmm {k}, zmm, zmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512" name="_mm512_div_round_ps">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="a" type="__m512"/>
+	<parameter varname="b" type="__m512"/>
+	<parameter varname="rounding" type="int"/>
+	<description>Divide packed single-precision (32-bit) floating-point elements in "a" by packed elements in "b", and store the results in "dst".
+	[round_note]
+	</description>
+	<operation>
+FOR j := 0 to 15
+	i := 32*j
+	dst[i+31:i] := a[i+31:i] / b[i+31:i]
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vdivps' form='zmm {k}, zmm, zmm {er}'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512" name="_mm512_mask_div_ps">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="src" type="__m512"/>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="a" type="__m512"/>
+	<parameter varname="b" type="__m512"/>
+	<description>Divide packed single-precision (32-bit) floating-point elements in "a" by packed elements in "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 15
+	i := 32*j
+	IF k[j]
+		dst[i+31:i] := a[i+31:i] / b[i+31:i]
+	ELSE
+		dst[i+31:i] := src[i+31:i]
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vdivps' form='zmm {k}, zmm, zmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512" name="_mm512_mask_div_round_ps">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="src" type="__m512"/>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="a" type="__m512"/>
+	<parameter varname="b" type="__m512"/>
+	<parameter varname="rounding" type="int"/>
+	<description>
+	Divide packed single-precision (32-bit) floating-point elements in "a" by packed elements in "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).
+	[round_note]
+	</description>
+	<operation>
+FOR j := 0 to 15
+	i := 32*j
+	IF k[j]
+		dst[i+31:i] := a[i+31:i] / b[i+31:i]
+	ELSE
+		dst[i+31:i] := src[i+31:i]
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vdivps' form='zmm {k}, zmm, zmm {er}'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512" name="_mm512_maskz_div_ps">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="a" type="__m512"/>
+	<parameter varname="b" type="__m512"/>
+	<description>Divide packed single-precision (32-bit) floating-point elements in "a" by packed elements in "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 15
+	i := 32*j
+	IF k[j]
+		dst[i+31:i] := a[i+31:i] / b[i+31:i]
+	ELSE
+		dst[i+31:i] := 0
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vdivps' form='zmm {k}, zmm, zmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512" name="_mm512_maskz_div_round_ps">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="a" type="__m512"/>
+	<parameter varname="b" type="__m512"/>
+	<parameter varname="rounding" type="int"/>
+	<description>Divide packed single-precision (32-bit) floating-point elements in "a" by packed elements in "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).
+	[round_note]
+	</description>
+	<operation>
+FOR j := 0 to 15
+	i := 32*j
+	IF k[j]
+		dst[i+31:i] := a[i+31:i] / b[i+31:i]
+	ELSE
+		dst[i+31:i] := 0
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vdivps' form='zmm {k}, zmm, zmm {er}'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128d" name="_mm_div_round_sd">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="a" type="__m128d"/>
+	<parameter varname="b" type="__m128d"/>
+	<parameter varname="rounding" type="int"/>
+	<description>Divide the lower double-precision (64-bit) floating-point element in "a" by the lower double-precision (64-bit) floating-point element in "b", store the result in the lower element of "dst", and copy the upper element from "a" to the upper element of "dst".
+		[round_note]
+		</description>
+	<operation>
+dst[63:0] := a[63:0] / b[63:0]
+dst[127:64] := a[127:64]
+dst[MAX:128] := 0
+	</operation>
+	<instruction name='vdivsd' form='xmm {k}, xmm, xmm {er}'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128d" name="_mm_mask_div_round_sd">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="src" type="__m128d"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128d"/>
+	<parameter varname="b" type="__m128d"/>
+	<parameter varname="rounding" type="int"/>
+	<description>Divide the lower double-precision (64-bit) floating-point element in "a" by the lower double-precision (64-bit) floating-point element in "b", store the result in the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst". 
+		[round_note]
+		</description>
+	<operation>
+IF k[0]
+	dst[63:0] := a[63:0] / b[63:0]
+ELSE
+	dst[63:0] := src[63:0]
+FI
+dst[127:64] := a[127:64]
+dst[MAX:128] := 0
+	</operation>
+	<instruction name='vdivsd' form='xmm {k}, xmm, xmm {er}'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128d" name="_mm_mask_div_sd">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="src" type="__m128d"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128d"/>
+	<parameter varname="b" type="__m128d"/>
+	<description>Divide the lower double-precision (64-bit) floating-point element in "a" by the lower double-precision (64-bit) floating-point element in "b", store the result in the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst". </description>
+	<operation>
+IF k[0]
+	dst[63:0] := a[63:0] / b[63:0]
+ELSE
+	dst[63:0] := src[63:0]
+FI
+dst[127:64] := a[127:64]
+dst[MAX:128] := 0
+	</operation>
+	<instruction name='vdivsd' form='xmm {k}, xmm, xmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128d" name="_mm_maskz_div_round_sd">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128d"/>
+	<parameter varname="b" type="__m128d"/>
+	<parameter varname="rounding" type="int"/>
+	<description>Divide the lower double-precision (64-bit) floating-point element in "a" by the lower double-precision (64-bit) floating-point element in "b", store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst".
+		[round_note]
+		</description>
+	<operation>
+IF k[0]
+	dst[63:0] := a[63:0] / b[63:0]
+ELSE
+	dst[63:0] := 0
+FI
+dst[127:64] := a[127:64]
+dst[MAX:128] := 0
+	</operation>
+	<instruction name='vdivsd' form='xmm {k}, xmm, xmm {er}'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128d" name="_mm_maskz_div_sd">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128d"/>
+	<parameter varname="b" type="__m128d"/>
+	<description>Divide the lower double-precision (64-bit) floating-point element in "a" by the lower double-precision (64-bit) floating-point element in "b", store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst".</description>
+	<operation>
+IF k[0]
+	dst[63:0] := a[63:0] / b[63:0]
+ELSE
+	dst[63:0] := 0
+FI
+dst[127:64] := a[127:64]
+dst[MAX:128] := 0
+	</operation>
+	<instruction name='vdivsd' form='xmm {k}, xmm, xmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128" name="_mm_div_round_ss">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="a" type="__m128"/>
+	<parameter varname="b" type="__m128"/>
+	<parameter varname="rounding" type="int"/>
+	<description>Divide the lower single-precision (32-bit) floating-point element in "a" by the lower single-precision (32-bit) floating-point element in "b", store the result in the lower element of "dst", and copy the upper 3 packed elements from "a" to the upper elements of "dst".
+		[round_note]
+		</description>
+	<operation>
+dst[31:0] := a[31:0] / b[31:0]
+dst[127:32] := a[127:32]
+dst[MAX:128] := 0
+	</operation>
+	<instruction name='vdivss' form='xmm {k}, xmm, xmm {er}'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128" name="_mm_mask_div_round_ss">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="src" type="__m128"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128"/>
+	<parameter varname="b" type="__m128"/>
+	<parameter varname="rounding" type="int"/>
+	<description>Divide the lower single-precision (32-bit) floating-point element in "a" by the lower single-precision (32-bit) floating-point element in "b", store the result in the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper 3 packed elements from "a" to the upper elements of "dst". 
+		[round_note]
+		</description>
+	<operation>
+IF k[0]
+	dst[31:0] := a[31:0] / b[31:0]
+ELSE
+	dst[31:0] := src[31:0]
+FI
+dst[127:32] := a[127:32]
+dst[MAX:128] := 0
+	</operation>
+	<instruction name='vdivss' form='xmm {k}, xmm, xmm {er}'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128" name="_mm_mask_div_ss">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="src" type="__m128"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128"/>
+	<parameter varname="b" type="__m128"/>
+	<description>Divide the lower single-precision (32-bit) floating-point element in "a" by the lower single-precision (32-bit) floating-point element in "b", store the result in the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper 3 packed elements from "a" to the upper elements of "dst". </description>
+	<operation>
+IF k[0]
+	dst[31:0] := a[31:0] / b[31:0]
+ELSE
+	dst[31:0] := src[31:0]
+FI
+dst[127:32] := a[127:32]
+dst[MAX:128] := 0
+	</operation>
+	<instruction name='vdivss' form='xmm {k}, xmm, xmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128" name="_mm_maskz_div_round_ss">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128"/>
+	<parameter varname="b" type="__m128"/>
+	<parameter varname="rounding" type="int"/>
+	<description>Divide the lower single-precision (32-bit) floating-point element in "a" by the lower single-precision (32-bit) floating-point element in "b", store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from "a" to the upper elements of "dst".
+		[round_note]
+		</description>
+	<operation>
+IF k[0]
+	dst[31:0] := a[31:0] / b[31:0]
+ELSE
+	dst[31:0] := 0
+FI
+dst[127:32] := a[127:32]
+dst[MAX:128] := 0
+	</operation>
+	<instruction name='vdivss' form='xmm {k}, xmm, xmm {er}'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128" name="_mm_maskz_div_ss">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128"/>
+	<parameter varname="b" type="__m128"/>
+	<description>Divide the lower single-precision (32-bit) floating-point element in "a" by the lower single-precision (32-bit) floating-point element in "b", store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from "a" to the upper elements of "dst".</description>
+	<operation>
+IF k[0]
+	dst[31:0] := a[31:0] / b[31:0]
+ELSE
+	dst[31:0] := 0
+FI
+dst[127:32] := a[127:32]
+dst[MAX:128] := 0
+	</operation>
+	<instruction name='vdivss' form='xmm {k}, xmm, xmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512d" name="_mm512_mask_expand_pd">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Swizzle</category>
+	<parameter varname="src" type="__m512d"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m512d"/>
+	<description>Load contiguous active double-precision (64-bit) floating-point elements from "a" (those with their respective bit set in mask "k"), and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+	<operation>
+m := 0
+FOR j := 0 to 7
+	i := j*64
+	IF k[j]
+		dst[i+63:i] := a[m+63:m]
+		m := m + 64
+	ELSE
+		dst[i+63:i] := src[i+63:i]
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vexpandpd' form='zmm {k}, zmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512d" name="_mm512_mask_expandloadu_pd">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Load</category>
+	<category>Swizzle</category>
+	<parameter varname="src" type="__m512d"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="mem_addr" type="void const*"/>
+	<description>Load contiguous active double-precision (64-bit) floating-point elements from unaligned memory at "mem_addr" (those with their respective bit set in mask "k"), and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+	<operation>
+m := 0
+FOR j := 0 to 7
+	i := j*64
+	IF k[j]
+		dst[i+63:i] := MEM[mem_addr+m+63:mem_addr+m]
+		m := m + 64
+	ELSE
+		dst[i+63:i] := src[i+63:i]
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vexpandpd' form='zmm {k}, m512'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512d" name="_mm512_maskz_expand_pd">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Swizzle</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m512d"/>
+	<description>Load contiguous active double-precision (64-bit) floating-point elements from "a" (those with their respective bit set in mask "k"), and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+	<operation>
+m := 0
+FOR j := 0 to 7
+	i := j*64
+	IF k[j]
+		dst[i+63:i] := a[m+63:m]
+		m := m + 64
+	ELSE
+		dst[i+63:i] := 0
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vexpandpd' form='zmm {k}, zmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512d" name="_mm512_maskz_expandloadu_pd">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Load</category>
+	<category>Swizzle</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="mem_addr" type="void const*"/>
+	<description>Load contiguous active double-precision (64-bit) floating-point elements from unaligned memory at "mem_addr" (those with their respective bit set in mask "k"), and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+	<operation>
+m := 0
+FOR j := 0 to 7
+	i := j*64
+	IF k[j]
+		dst[i+63:i] := MEM[mem_addr+m+63:mem_addr+m]
+		m := m + 64
+	ELSE
+		dst[i+63:i] := 0
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vexpandpd' form='zmm {k}, m512'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512" name="_mm512_mask_expand_ps">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Swizzle</category>
+	<parameter varname="src" type="__m512"/>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="a" type="__m512"/>
+	<description>Load contiguous active single-precision (32-bit) floating-point elements from "a" (those with their respective bit set in mask "k"), and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+	<operation>
+m := 0
+FOR j := 0 to 15
+	i := j*32
+	IF k[j]
+		dst[i+31:i] := a[m+31:m]
+		m := m + 32
+	ELSE
+		dst[i+31:i] := src[i+31:i]
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vexpandps' form='zmm {k}, zmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512" name="_mm512_mask_expandloadu_ps">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Load</category>
+	<category>Swizzle</category>
+	<parameter varname="src" type="__m512"/>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="mem_addr" type="void const*"/>
+	<description>Load contiguous active single-precision (32-bit) floating-point elements from unaligned memory at "mem_addr" (those with their respective bit set in mask "k"), and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+	<operation>
+m := 0
+FOR j := 0 to 15
+	i := j*32
+	IF k[j]
+		dst[i+31:i] := MEM[mem_addr+m+31:mem_addr+m]
+		m := m + 32
+	ELSE
+		dst[i+31:i] := src[i+31:i]
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vexpandps' form='zmm {k}, m512'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512" name="_mm512_maskz_expand_ps">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Swizzle</category>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="a" type="__m512"/>
+	<description>Load contiguous active single-precision (32-bit) floating-point elements from "a" (those with their respective bit set in mask "k"), and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+	<operation>
+m := 0
+FOR j := 0 to 15
+	i := j*32
+	IF k[j]
+		dst[i+31:i] := a[m+31:m]
+		m := m + 32
+	ELSE
+		dst[i+31:i] := 0
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vexpandps' form='zmm {k}, zmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512" name="_mm512_maskz_expandloadu_ps">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Load</category>
+	<category>Swizzle</category>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="mem_addr" type="void const*"/>
+	<description>Load contiguous active single-precision (32-bit) floating-point elements from unaligned memory at "mem_addr" (those with their respective bit set in mask "k"), and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+	<operation>
+m := 0
+FOR j := 0 to 15
+	i := j*32
+	IF k[j]
+		dst[i+31:i] := MEM[mem_addr+m+31:mem_addr+m]
+		m := m + 32
+	ELSE
+		dst[i+31:i] := 0
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vexpandps' form='zmm {k}, m512'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128" name="_mm512_extractf32x4_ps">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Swizzle</category>
+	<parameter varname="a" type="__m512"/>
+	<parameter varname="imm8" type="int"/>
+	<description>Extract 128 bits (composed of 4 packed single-precision (32-bit) floating-point elements) from "a", selected with "imm8", and store the result in "dst".</description>
+	<operation>
+CASE imm8[7:0] of
+0: dst[127:0] := a[127:0]
+1: dst[127:0] := a[255:128]
+2: dst[127:0] := a[383:256]
+3: dst[127:0] := a[511:384]
+ESAC
+dst[MAX:128] := 0
+	</operation>
+	<instruction name='vextractf32x4' form='xmm {k}, zmm, imm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128" name="_mm512_mask_extractf32x4_ps">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Swizzle</category>
+	<parameter varname="src" type="__m128"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m512"/>
+	<parameter varname="imm8" type="int"/>
+	<description>Extract 128 bits (composed of 4 packed single-precision (32-bit) floating-point elements) from "a", selected with "imm8", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). </description>
+	<operation>
+CASE imm8[7:0] of
+0: tmp[127:0] := a[127:0]
+1: tmp[127:0] := a[255:128]
+2: tmp[127:0] := a[383:256]
+3: tmp[127:0] := a[511:384]
+ESAC
+FOR j := 0 to 3
+	i := j*32
+	IF k[j]
+		dst[i+31:i] := tmp[i+31:i]
+	ELSE
+		dst[i+31:i] := src[i+31:i]
+	FI
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name='vextractf32x4' form='xmm {k}, zmm, imm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128" name="_mm512_maskz_extractf32x4_ps">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Swizzle</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m512"/>
+	<parameter varname="imm8" type="int"/>
+	<description>Extract 128 bits (composed of 4 packed single-precision (32-bit) floating-point elements) from "a", selected with "imm8", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). </description>
+	<operation>
+CASE imm8[7:0] of
+0: tmp[127:0] := a[127:0]
+1: tmp[127:0] := a[255:128]
+2: tmp[127:0] := a[383:256]
+3: tmp[127:0] := a[511:384]
+ESAC
+FOR j := 0 to 3
+	i := j*32
+	IF k[j]
+		dst[i+31:i] := tmp[i+31:i]
+	ELSE
+		dst[i+31:i] := 0
+	FI
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name='vextractf32x4' form='xmm {k}, zmm, imm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256d" name="_mm512_extractf64x4_pd">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Swizzle</category>
+	<parameter varname="a" type="__m512d"/>
+	<parameter varname="imm8" type="int"/>
+	<description>Extract 256 bits (composed of 4 packed double-precision (64-bit) floating-point elements) from "a", selected with "imm8", and store the result in "dst".</description>
+	<operation>
+CASE imm8[7:0] of
+0: dst[255:0] := a[255:0]
+1: dst[255:0] := a[511:256]
+ESAC
+dst[MAX:256] := 0
+	</operation>
+	<instruction name='vextractf64x4' form='ymm {k}, zmm, imm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256d" name="_mm512_mask_extractf64x4_pd">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Swizzle</category>
+	<parameter varname="src" type="__m256d"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m512d"/>
+	<parameter varname="imm8" type="int"/>
+	<description>Extract 256 bits (composed of 4 packed double-precision (64-bit) floating-point elements) from "a", selected with "imm8", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). </description>
+	<operation>
+CASE imm8[7:0] of
+0: tmp[255:0] := a[255:0]
+1: tmp[255:0] := a[511:256]
+ESAC
+FOR j := 0 to 3
+	i := j*64
+	IF k[j]
+		dst[i+63:i] := tmp[i+63:i]
+	ELSE
+		dst[i+63:i] := src[i+63:i]
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name='vextractf64x4' form='ymm {k}, zmm, imm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256d" name="_mm512_maskz_extractf64x4_pd">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Swizzle</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m512d"/>
+	<parameter varname="imm8" type="int"/>
+	<description>Extract 256 bits (composed of 4 packed double-precision (64-bit) floating-point elements) from "a", selected with "imm8", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). </description>
+	<operation>
+CASE imm8[7:0] of
+0: tmp[255:0] := a[255:0]
+1: tmp[255:0] := a[511:256]
+ESAC
+FOR j := 0 to 3
+	i := j*64
+	IF k[j]
+		dst[i+63:i] := tmp[i+63:i]
+	ELSE
+		dst[i+63:i] := 0
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name='vextractf64x4' form='ymm {k}, zmm, imm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm512_extracti32x4_epi32">
+	<type>Integer</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Swizzle</category>
+	<parameter varname="a" type="__m512i"/>
+	<parameter varname="imm8" type="int"/>
+	<description>Extract 128 bits (composed of 4 packed 32-bit integers) from "a", selected with "imm8", and store the result in "dst".</description>
+	<operation>
+CASE imm8[7:0] of
+0: dst[127:0] := a[127:0]
+1: dst[127:0] := a[255:128]
+2: dst[127:0] := a[383:256]
+3: dst[127:0] := a[511:384]
+ESAC
+dst[MAX:128] := 0
+	</operation>
+	<instruction name='vextracti32x4' form='xmm {k}, zmm, imm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm512_mask_extracti32x4_epi32">
+	<type>Integer</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Swizzle</category>
+	<parameter varname="src" type="__m128i"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m512i"/>
+	<parameter varname="imm8" type="int"/>
+	<description>Extract 128 bits (composed of 4 packed 32-bit integers) from "a", selected with "imm8", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). </description>
+	<operation>
+CASE imm8[7:0] of
+0: tmp[127:0] := a[127:0]
+1: tmp[127:0] := a[255:128]
+2: tmp[127:0] := a[383:256]
+3: tmp[127:0] := a[511:384]
+ESAC
+FOR j := 0 to 3
+	i := j*32
+	IF k[j]
+		dst[i+31:i] := tmp[i+31:i]
+	ELSE
+		dst[i+31:i] := src[i+31:i]
+	FI
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name='vextracti32x4' form='xmm {k}, zmm, imm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm512_maskz_extracti32x4_epi32">
+	<type>Integer</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Swizzle</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m512i"/>
+	<parameter varname="imm8" type="int"/>
+	<description>Extract 128 bits (composed of 4 packed 32-bit integers) from "a", selected with "imm8", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). </description>
+	<operation>
+CASE imm8[7:0] of
+0: tmp[127:0] := a[127:0]
+1: tmp[127:0] := a[255:128]
+2: tmp[127:0] := a[383:256]
+3: tmp[127:0] := a[511:384]
+ESAC
+FOR j := 0 to 3
+	i := j*32
+	IF k[j]
+		dst[i+31:i] := tmp[i+31:i]
+	ELSE
+		dst[i+31:i] := 0
+	FI
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name='vextracti32x4' form='xmm {k}, zmm, imm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256i" name="_mm512_extracti64x4_epi64">
+	<type>Integer</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Swizzle</category>
+	<parameter varname="a" type="__m512i"/>
+	<parameter varname="imm8" type="int"/>
+	<description>Extract 256 bits (composed of 4 packed 64-bit integers) from "a", selected with "imm8", and store the result in "dst".</description>
+	<operation>
+CASE imm8[7:0] of
+0: dst[255:0] := a[255:0]
+1: dst[255:0] := a[511:256]
+ESAC
+dst[MAX:256] := 0
+	</operation>
+	<instruction name='vextracti64x4' form='ymm {k}, zmm, imm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256i" name="_mm512_mask_extracti64x4_epi64">
+	<type>Integer</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Swizzle</category>
+	<parameter varname="src" type="__m256i"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m512i"/>
+	<parameter varname="imm8" type="int"/>
+	<description>Extract 256 bits (composed of 4 packed 64-bit integers) from "a", selected with "imm8", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). </description>
+	<operation>
+CASE imm8[7:0] of
+0: tmp[255:0] := a[255:0]
+1: tmp[255:0] := a[511:256]
+ESAC
+FOR j := 0 to 3
+	i := j*64
+	IF k[j]
+		dst[i+63:i] := tmp[i+63:i]
+	ELSE
+		dst[i+63:i] := src[i+63:i]
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name='vextracti64x4' form='ymm {k}, zmm, imm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256i" name="_mm512_maskz_extracti64x4_epi64">
+	<type>Integer</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Swizzle</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m512i"/>
+	<parameter varname="imm8" type="int"/>
+	<description>Extract 256 bits (composed of 4 packed 64-bit integers) from "a", selected with "imm8", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). </description>
+	<operation>
+CASE imm8[7:0] of
+0: tmp[255:0] := a[255:0]
+1: tmp[255:0] := a[511:256]
+ESAC
+FOR j := 0 to 3
+	i := j*64
+	IF k[j]
+		dst[i+63:i] := tmp[i+63:i]
+	ELSE
+		dst[i+63:i] := 0
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name='vextracti64x4' form='ymm {k}, zmm, imm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512d" name="_mm512_fixupimm_pd">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="a" type="__m512d"/>
+	<parameter varname="b" type="__m512d"/>
+	<parameter varname="c" type="__m512i"/>
+	<parameter varname="imm8" type="int"/>
+	<description>Fix up packed double-precision (64-bit) floating-point elements in "a" and "b" using packed 64-bit integers in "c", and store the results in "dst". "imm8" is used to set the required flags reporting.</description>
+	<operation>
+enum TOKEN_TYPE {
+	QNAN_TOKEN := 0, 
+	SNAN_TOKEN := 1, 
+	ZERO_VALUE_TOKEN := 2, 
+	ONE_VALUE_TOKEN := 3, 
+	NEG_INF_TOKEN := 4, 
+	POS_INF_TOKEN := 5, 
+	NEG_VALUE_TOKEN := 6, 
+	POS_VALUE_TOKEN := 7
+}
+FIXUPIMMPD(src1[63:0], src2[63:0], src3[63:0], imm8[7:0]){
+	tsrc[63:0] := ((src2[62:52] == 0) AND (MXCSR.DAZ == 1)) ? 0.0 : src2[63:0]
+	CASE(tsrc[63:0] of TOKEN_TYPE)
+	QNAN_TOKEN:j := 0
+	SNAN_TOKEN:j := 1
+	ZERO_VALUE_TOKEN: j := 2
+	ONE_VALUE_TOKEN: j := 3
+	NEG_INF_TOKEN: j := 4
+	POS_INF_TOKEN: j := 5
+	NEG_VALUE_TOKEN: j := 6
+	POS_VALUE_TOKEN: j := 7
+	ESAC
+	
+	token_response[3:0] := src3[3+4*j:4*j]
+	
+	CASE(token_response[3:0]) of
+	0 : dest[63:0] := src1[63:0]
+	1 : dest[63:0] := tsrc[63:0]
+	2 : dest[63:0] := QNaN(tsrc[63:0])
+	3 : dest[63:0] := QNAN_Indefinite
+	4 : dest[63:0] := -INF
+	5 : dest[63:0] := +INF
+	6 : dest[63:0] := tsrc.sign? -INF : +INF
+	7 : dest[63:0] := -0
+	8 : dest[63:0] := +0
+	9 : dest[63:0] := -1
+	10: dest[63:0] := +1
+	11: dest[63:0] := 1/2
+	12: dest[63:0] := 90.0
+	13: dest[63:0] := PI/2
+	14: dest[63:0] := MAX_FLOAT
+	15: dest[63:0] := -MAX_FLOAT
+	ESAC
+	
+	CASE(tsrc[63:0] of TOKEN_TYPE)
+	ZERO_VALUE_TOKEN: if imm8[0] then set #ZE
+	ZERO_VALUE_TOKEN: if imm8[1] then set #IE
+	ONE_VALUE_TOKEN: if imm8[2] then set #ZE
+	ONE_VALUE_TOKEN: if imm8[3] then set #IE
+	SNAN_TOKEN: if imm8[4] then set #IE
+	NEG_INF_TOKEN: if imm8[5] then set #IE
+	NEG_VALUE_TOKEN: if imm8[6] then set #IE
+	POS_INF_TOKEN: if imm8[7] then set #IE
+	ESAC
+	RETURN dest[63:0]
+}
+
+FOR j := 0 to 7
+	i := j*64
+	dst[i+63:i] := FIXUPIMMPD(a[i+63:i], b[i+63:i], c[i+63:i], imm8[7:0])
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vfixupimmpd' form='zmm {k}, zmm, zmm, imm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512d" name="_mm512_fixupimm_round_pd">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="a" type="__m512d"/>
+	<parameter varname="b" type="__m512d"/>
+	<parameter varname="c" type="__m512i"/>
+	<parameter varname="imm8" type="int"/>
+	<parameter varname="rounding" type="int"/>
+	<description>Fix up packed double-precision (64-bit) floating-point elements in "a" and "b" using packed 64-bit integers in "c", and store the results in "dst". "imm8" is used to set the required flags reporting.
+	[round_note]
+	</description>
+	<operation>
+enum TOKEN_TYPE {
+	QNAN_TOKEN := 0, 
+	SNAN_TOKEN := 1, 
+	ZERO_VALUE_TOKEN := 2, 
+	ONE_VALUE_TOKEN := 3, 
+	NEG_INF_TOKEN := 4, 
+	POS_INF_TOKEN := 5, 
+	NEG_VALUE_TOKEN := 6, 
+	POS_VALUE_TOKEN := 7
+}
+FIXUPIMMPD(src1[63:0], src2[63:0], src3[63:0], imm8[7:0]){
+	tsrc[63:0] := ((src2[62:52] == 0) AND (MXCSR.DAZ == 1)) ? 0.0 : src2[63:0]
+	CASE(tsrc[63:0] of TOKEN_TYPE)
+	QNAN_TOKEN:j := 0
+	SNAN_TOKEN:j := 1
+	ZERO_VALUE_TOKEN: j := 2
+	ONE_VALUE_TOKEN: j := 3
+	NEG_INF_TOKEN: j := 4
+	POS_INF_TOKEN: j := 5
+	NEG_VALUE_TOKEN: j := 6
+	POS_VALUE_TOKEN: j := 7
+	ESAC
+	
+	token_response[3:0] := src3[3+4*j:4*j]
+	
+	CASE(token_response[3:0]) of
+	0 : dest[63:0] := src1[63:0]
+	1 : dest[63:0] := tsrc[63:0]
+	2 : dest[63:0] := QNaN(tsrc[63:0])
+	3 : dest[63:0] := QNAN_Indefinite
+	4 : dest[63:0] := -INF
+	5 : dest[63:0] := +INF
+	6 : dest[63:0] := tsrc.sign? -INF : +INF
+	7 : dest[63:0] := -0
+	8 : dest[63:0] := +0
+	9 : dest[63:0] := -1
+	10: dest[63:0] := +1
+	11: dest[63:0] := 1/2
+	12: dest[63:0] := 90.0
+	13: dest[63:0] := PI/2
+	14: dest[63:0] := MAX_FLOAT
+	15: dest[63:0] := -MAX_FLOAT
+	ESAC
+	
+	CASE(tsrc[63:0] of TOKEN_TYPE)
+	ZERO_VALUE_TOKEN: if imm8[0] then set #ZE
+	ZERO_VALUE_TOKEN: if imm8[1] then set #IE
+	ONE_VALUE_TOKEN: if imm8[2] then set #ZE
+	ONE_VALUE_TOKEN: if imm8[3] then set #IE
+	SNAN_TOKEN: if imm8[4] then set #IE
+	NEG_INF_TOKEN: if imm8[5] then set #IE
+	NEG_VALUE_TOKEN: if imm8[6] then set #IE
+	POS_INF_TOKEN: if imm8[7] then set #IE
+	ESAC
+	RETURN dest[63:0]
+}
+
+FOR j := 0 to 7
+	i := j*64
+	dst[i+63:i] := FIXUPIMMPD(a[i+63:i], b[i+63:i], c[i+63:i], imm8[7:0])
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vfixupimmpd' form='zmm {k}, zmm, zmm, imm {er}'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512d" name="_mm512_mask_fixupimm_pd">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="a" type="__m512d"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="b" type="__m512d"/>
+	<parameter varname="c" type="__m512i"/>
+	<parameter varname="imm8" type="int"/>
+	<description>Fix up packed double-precision (64-bit) floating-point elements in "a" and "b" using packed 64-bit integers in "c", and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). "imm8" is used to set the required flags reporting.	</description>
+	<operation>
+enum TOKEN_TYPE {
+	QNAN_TOKEN := 0, 
+	SNAN_TOKEN := 1, 
+	ZERO_VALUE_TOKEN := 2, 
+	ONE_VALUE_TOKEN := 3, 
+	NEG_INF_TOKEN := 4, 
+	POS_INF_TOKEN := 5, 
+	NEG_VALUE_TOKEN := 6, 
+	POS_VALUE_TOKEN := 7
+}
+FIXUPIMMPD(src1[63:0], src2[63:0], src3[63:0], imm8[7:0]){
+	tsrc[63:0] := ((src2[62:52] == 0) AND (MXCSR.DAZ == 1)) ? 0.0 : src2[63:0]
+	CASE(tsrc[63:0] of TOKEN_TYPE)
+	QNAN_TOKEN:j := 0
+	SNAN_TOKEN:j := 1
+	ZERO_VALUE_TOKEN: j := 2
+	ONE_VALUE_TOKEN: j := 3
+	NEG_INF_TOKEN: j := 4
+	POS_INF_TOKEN: j := 5
+	NEG_VALUE_TOKEN: j := 6
+	POS_VALUE_TOKEN: j := 7
+	ESAC
+	
+	token_response[3:0] := src3[3+4*j:4*j]
+	
+	CASE(token_response[3:0]) of
+	0 : dest[63:0] := src1[63:0]
+	1 : dest[63:0] := tsrc[63:0]
+	2 : dest[63:0] := QNaN(tsrc[63:0])
+	3 : dest[63:0] := QNAN_Indefinite
+	4 : dest[63:0] := -INF
+	5 : dest[63:0] := +INF
+	6 : dest[63:0] := tsrc.sign? -INF : +INF
+	7 : dest[63:0] := -0
+	8 : dest[63:0] := +0
+	9 : dest[63:0] := -1
+	10: dest[63:0] := +1
+	11: dest[63:0] := 1/2
+	12: dest[63:0] := 90.0
+	13: dest[63:0] := PI/2
+	14: dest[63:0] := MAX_FLOAT
+	15: dest[63:0] := -MAX_FLOAT
+	ESAC
+	
+	CASE(tsrc[63:0] of TOKEN_TYPE)
+	ZERO_VALUE_TOKEN: if imm8[0] then set #ZE
+	ZERO_VALUE_TOKEN: if imm8[1] then set #IE
+	ONE_VALUE_TOKEN: if imm8[2] then set #ZE
+	ONE_VALUE_TOKEN: if imm8[3] then set #IE
+	SNAN_TOKEN: if imm8[4] then set #IE
+	NEG_INF_TOKEN: if imm8[5] then set #IE
+	NEG_VALUE_TOKEN: if imm8[6] then set #IE
+	POS_INF_TOKEN: if imm8[7] then set #IE
+	ESAC
+	RETURN dest[63:0]
+}
+
+FOR j := 0 to 7
+	i := j*64
+	IF k[j]
+		dst[i+63:i] := FIXUPIMMPD(a[i+63:i], b[i+63:i], c[i+63:i], imm8[7:0])
+	ELSE
+		dst[i+63:i] := a[i+63:i]
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vfixupimmpd' form='zmm {k}, zmm, zmm, imm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512d" name="_mm512_mask_fixupimm_round_pd">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="a" type="__m512d"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="b" type="__m512d"/>
+	<parameter varname="c" type="__m512i"/>
+	<parameter varname="imm8" type="int"/>
+	<parameter varname="rounding" type="int"/>
+	<description>Fix up packed double-precision (64-bit) floating-point elements in "a" and "b" using packed 64-bit integers in "c", and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). "imm8" is used to set the required flags reporting.
+	[round_note]
+	</description>
+	<operation>
+enum TOKEN_TYPE {
+	QNAN_TOKEN := 0, 
+	SNAN_TOKEN := 1, 
+	ZERO_VALUE_TOKEN := 2, 
+	ONE_VALUE_TOKEN := 3, 
+	NEG_INF_TOKEN := 4, 
+	POS_INF_TOKEN := 5, 
+	NEG_VALUE_TOKEN := 6, 
+	POS_VALUE_TOKEN := 7
+}
+FIXUPIMMPD(src1[63:0], src2[63:0], src3[63:0], imm8[7:0]){
+	tsrc[63:0] := ((src2[62:52] == 0) AND (MXCSR.DAZ == 1)) ? 0.0 : src2[63:0]
+	CASE(tsrc[63:0] of TOKEN_TYPE)
+	QNAN_TOKEN:j := 0
+	SNAN_TOKEN:j := 1
+	ZERO_VALUE_TOKEN: j := 2
+	ONE_VALUE_TOKEN: j := 3
+	NEG_INF_TOKEN: j := 4
+	POS_INF_TOKEN: j := 5
+	NEG_VALUE_TOKEN: j := 6
+	POS_VALUE_TOKEN: j := 7
+	ESAC
+	
+	token_response[3:0] := src3[3+4*j:4*j]
+	
+	CASE(token_response[3:0]) of
+	0 : dest[63:0] := src1[63:0]
+	1 : dest[63:0] := tsrc[63:0]
+	2 : dest[63:0] := QNaN(tsrc[63:0])
+	3 : dest[63:0] := QNAN_Indefinite
+	4 : dest[63:0] := -INF
+	5 : dest[63:0] := +INF
+	6 : dest[63:0] := tsrc.sign? -INF : +INF
+	7 : dest[63:0] := -0
+	8 : dest[63:0] := +0
+	9 : dest[63:0] := -1
+	10: dest[63:0] := +1
+	11: dest[63:0] := 1/2
+	12: dest[63:0] := 90.0
+	13: dest[63:0] := PI/2
+	14: dest[63:0] := MAX_FLOAT
+	15: dest[63:0] := -MAX_FLOAT
+	ESAC
+	
+	CASE(tsrc[63:0] of TOKEN_TYPE)
+	ZERO_VALUE_TOKEN: if imm8[0] then set #ZE
+	ZERO_VALUE_TOKEN: if imm8[1] then set #IE
+	ONE_VALUE_TOKEN: if imm8[2] then set #ZE
+	ONE_VALUE_TOKEN: if imm8[3] then set #IE
+	SNAN_TOKEN: if imm8[4] then set #IE
+	NEG_INF_TOKEN: if imm8[5] then set #IE
+	NEG_VALUE_TOKEN: if imm8[6] then set #IE
+	POS_INF_TOKEN: if imm8[7] then set #IE
+	ESAC
+	RETURN dest[63:0]
+}
+
+FOR j := 0 to 7
+	i := j*64
+	IF k[j]
+		dst[i+63:i] := FIXUPIMMPD(a[i+63:i], b[i+63:i], c[i+63:i], imm8[7:0])
+	ELSE
+		dst[i+63:i] := a[i+63:i]
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vfixupimmpd' form='zmm {k}, zmm, zmm, imm {er}'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512d" name="_mm512_maskz_fixupimm_pd">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m512d"/>
+	<parameter varname="b" type="__m512d"/>
+	<parameter varname="c" type="__m512i"/>
+	<parameter varname="imm8" type="int"/>
+	<description>Fix up packed double-precision (64-bit) floating-point elements in "a" and "b" using packed 64-bit integers in "c", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). "imm8" is used to set the required flags reporting.</description>
+	<operation>
+enum TOKEN_TYPE {
+	QNAN_TOKEN := 0, 
+	SNAN_TOKEN := 1, 
+	ZERO_VALUE_TOKEN := 2, 
+	ONE_VALUE_TOKEN := 3, 
+	NEG_INF_TOKEN := 4, 
+	POS_INF_TOKEN := 5, 
+	NEG_VALUE_TOKEN := 6, 
+	POS_VALUE_TOKEN := 7
+}
+FIXUPIMMPD(src1[63:0], src2[63:0], src3[63:0], imm8[7:0]){
+	tsrc[63:0] := ((src2[62:52] == 0) AND (MXCSR.DAZ == 1)) ? 0.0 : src2[63:0]
+	CASE(tsrc[63:0] of TOKEN_TYPE)
+	QNAN_TOKEN:j := 0
+	SNAN_TOKEN:j := 1
+	ZERO_VALUE_TOKEN: j := 2
+	ONE_VALUE_TOKEN: j := 3
+	NEG_INF_TOKEN: j := 4
+	POS_INF_TOKEN: j := 5
+	NEG_VALUE_TOKEN: j := 6
+	POS_VALUE_TOKEN: j := 7
+	ESAC
+	
+	token_response[3:0] := src3[3+4*j:4*j]
+	
+	CASE(token_response[3:0]) of
+	0 : dest[63:0] := src1[63:0]
+	1 : dest[63:0] := tsrc[63:0]
+	2 : dest[63:0] := QNaN(tsrc[63:0])
+	3 : dest[63:0] := QNAN_Indefinite
+	4 : dest[63:0] := -INF
+	5 : dest[63:0] := +INF
+	6 : dest[63:0] := tsrc.sign? -INF : +INF
+	7 : dest[63:0] := -0
+	8 : dest[63:0] := +0
+	9 : dest[63:0] := -1
+	10: dest[63:0] := +1
+	11: dest[63:0] := 1/2
+	12: dest[63:0] := 90.0
+	13: dest[63:0] := PI/2
+	14: dest[63:0] := MAX_FLOAT
+	15: dest[63:0] := -MAX_FLOAT
+	ESAC
+	
+	CASE(tsrc[63:0] of TOKEN_TYPE)
+	ZERO_VALUE_TOKEN: if imm8[0] then set #ZE
+	ZERO_VALUE_TOKEN: if imm8[1] then set #IE
+	ONE_VALUE_TOKEN: if imm8[2] then set #ZE
+	ONE_VALUE_TOKEN: if imm8[3] then set #IE
+	SNAN_TOKEN: if imm8[4] then set #IE
+	NEG_INF_TOKEN: if imm8[5] then set #IE
+	NEG_VALUE_TOKEN: if imm8[6] then set #IE
+	POS_INF_TOKEN: if imm8[7] then set #IE
+	ESAC
+	RETURN dest[63:0]
+}
+
+FOR j := 0 to 7
+	i := j*64
+	IF k[j]
+		dst[i+63:i] := FIXUPIMMPD(a[i+63:i], b[i+63:i], c[i+63:i], imm8[7:0])
+	ELSE
+		dst[i+63:i] := 0
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vfixupimmpd' form='zmm {k}, zmm, zmm, imm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512d" name="_mm512_maskz_fixupimm_round_pd">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m512d"/>
+	<parameter varname="b" type="__m512d"/>
+	<parameter varname="c" type="__m512i"/>
+	<parameter varname="imm8" type="int"/>
+	<parameter varname="rounding" type="int"/>
+	<description>Fix up packed double-precision (64-bit) floating-point elements in "a" and "b" using packed 64-bit integers in "c", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). "imm8" is used to set the required flags reporting.
+	[round_note]
+	</description>
+	<operation>
+enum TOKEN_TYPE {
+	QNAN_TOKEN := 0, 
+	SNAN_TOKEN := 1, 
+	ZERO_VALUE_TOKEN := 2, 
+	ONE_VALUE_TOKEN := 3, 
+	NEG_INF_TOKEN := 4, 
+	POS_INF_TOKEN := 5, 
+	NEG_VALUE_TOKEN := 6, 
+	POS_VALUE_TOKEN := 7
+}
+FIXUPIMMPD(src1[63:0], src2[63:0], src3[63:0], imm8[7:0]){
+	tsrc[63:0] := ((src2[62:52] == 0) AND (MXCSR.DAZ == 1)) ? 0.0 : src2[63:0]
+	CASE(tsrc[63:0] of TOKEN_TYPE)
+	QNAN_TOKEN:j := 0
+	SNAN_TOKEN:j := 1
+	ZERO_VALUE_TOKEN: j := 2
+	ONE_VALUE_TOKEN: j := 3
+	NEG_INF_TOKEN: j := 4
+	POS_INF_TOKEN: j := 5
+	NEG_VALUE_TOKEN: j := 6
+	POS_VALUE_TOKEN: j := 7
+	ESAC
+	
+	token_response[3:0] := src3[3+4*j:4*j]
+	
+	CASE(token_response[3:0]) of
+	0 : dest[63:0] := src1[63:0]
+	1 : dest[63:0] := tsrc[63:0]
+	2 : dest[63:0] := QNaN(tsrc[63:0])
+	3 : dest[63:0] := QNAN_Indefinite
+	4 : dest[63:0] := -INF
+	5 : dest[63:0] := +INF
+	6 : dest[63:0] := tsrc.sign? -INF : +INF
+	7 : dest[63:0] := -0
+	8 : dest[63:0] := +0
+	9 : dest[63:0] := -1
+	10: dest[63:0] := +1
+	11: dest[63:0] := 1/2
+	12: dest[63:0] := 90.0
+	13: dest[63:0] := PI/2
+	14: dest[63:0] := MAX_FLOAT
+	15: dest[63:0] := -MAX_FLOAT
+	ESAC
+	
+	CASE(tsrc[63:0] of TOKEN_TYPE)
+	ZERO_VALUE_TOKEN: if imm8[0] then set #ZE
+	ZERO_VALUE_TOKEN: if imm8[1] then set #IE
+	ONE_VALUE_TOKEN: if imm8[2] then set #ZE
+	ONE_VALUE_TOKEN: if imm8[3] then set #IE
+	SNAN_TOKEN: if imm8[4] then set #IE
+	NEG_INF_TOKEN: if imm8[5] then set #IE
+	NEG_VALUE_TOKEN: if imm8[6] then set #IE
+	POS_INF_TOKEN: if imm8[7] then set #IE
+	ESAC
+	RETURN dest[63:0]
+}
+
+FOR j := 0 to 7
+	i := j*64
+	IF k[j]
+		dst[i+63:i] := FIXUPIMMPD(a[i+63:i], b[i+63:i], c[i+63:i], imm8[7:0])
+	ELSE
+		dst[i+63:i] := 0
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vfixupimmpd' form='zmm {k}, zmm, zmm, imm {er}'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512" name="_mm512_fixupimm_ps">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="a" type="__m512"/>
+	<parameter varname="b" type="__m512"/>
+	<parameter varname="c" type="__m512i"/>
+	<parameter varname="imm8" type="int"/>
+	<description>Fix up packed single-precision (32-bit) floating-point elements in "a" and "b" using packed 32-bit integers in "c", and store the results in "dst". "imm8" is used to set the required flags reporting.</description>
+	<operation>
+enum TOKEN_TYPE {
+	QNAN_TOKEN := 0, 
+	SNAN_TOKEN := 1, 
+	ZERO_VALUE_TOKEN := 2, 
+	ONE_VALUE_TOKEN := 3, 
+	NEG_INF_TOKEN := 4, 
+	POS_INF_TOKEN := 5, 
+	NEG_VALUE_TOKEN := 6, 
+	POS_VALUE_TOKEN := 7
+}
+FIXUPIMMPD(src1[31:0], src2[31:0], src3[31:0], imm8[7:0]){
+	tsrc[31:0] := ((src2[30:23] == 0) AND (MXCSR.DAZ == 1)) ? 0.0 : src2[31:0]
+	CASE(tsrc[31:0] of TOKEN_TYPE)
+	QNAN_TOKEN:j := 0
+	SNAN_TOKEN:j := 1
+	ZERO_VALUE_TOKEN: j := 2
+	ONE_VALUE_TOKEN: j := 3
+	NEG_INF_TOKEN: j := 4
+	POS_INF_TOKEN: j := 5
+	NEG_VALUE_TOKEN: j := 6
+	POS_VALUE_TOKEN: j := 7
+	ESAC
+	
+	token_response[3:0] := src3[3+4*j:4*j]
+	
+	CASE(token_response[3:0]) of
+	0 : dest[31:0] := src1[31:0]
+	1 : dest[31:0] := tsrc[31:0]
+	2 : dest[31:0] := QNaN(tsrc[31:0])
+	3 : dest[31:0] := QNAN_Indefinite
+	4 : dest[31:0] := -INF
+	5 : dest[31:0] := +INF
+	6 : dest[31:0] := tsrc.sign? -INF : +INF
+	7 : dest[31:0] := -0
+	8 : dest[31:0] := +0
+	9 : dest[31:0] := -1
+	10: dest[31:0] := +1
+	11: dest[31:0] := 1/2
+	12: dest[31:0] := 90.0
+	13: dest[31:0] := PI/2
+	14: dest[31:0] := MAX_FLOAT
+	15: dest[31:0] := -MAX_FLOAT
+	ESAC
+	
+	CASE(tsrc[31:0] of TOKEN_TYPE)
+	ZERO_VALUE_TOKEN: if imm8[0] then set #ZE
+	ZERO_VALUE_TOKEN: if imm8[1] then set #IE
+	ONE_VALUE_TOKEN: if imm8[2] then set #ZE
+	ONE_VALUE_TOKEN: if imm8[3] then set #IE
+	SNAN_TOKEN: if imm8[4] then set #IE
+	NEG_INF_TOKEN: if imm8[5] then set #IE
+	NEG_VALUE_TOKEN: if imm8[6] then set #IE
+	POS_INF_TOKEN: if imm8[7] then set #IE
+	ESAC
+	RETURN dest[31:0]
+}
+
+FOR j := 0 to 15
+	i := j*32
+	dst[i+31:i] := FIXUPIMMPD(a[i+31:i], b[i+31:i], c[i+31:i], imm8[7:0])
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vfixupimmps' form='zmm {k}, zmm, zmm, imm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512" name="_mm512_fixupimm_round_ps">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="a" type="__m512"/>
+	<parameter varname="b" type="__m512"/>
+	<parameter varname="c" type="__m512i"/>
+	<parameter varname="imm8" type="int"/>
+	<parameter varname="rounding" type="int"/>
+	<description>Fix up packed single-precision (32-bit) floating-point elements in "a" and "b" using packed 32-bit integers in "c", and store the results in "dst". "imm8" is used to set the required flags reporting.
+	[round_note]
+	</description>
+	<operation>
+enum TOKEN_TYPE {
+	QNAN_TOKEN := 0, 
+	SNAN_TOKEN := 1, 
+	ZERO_VALUE_TOKEN := 2, 
+	ONE_VALUE_TOKEN := 3, 
+	NEG_INF_TOKEN := 4, 
+	POS_INF_TOKEN := 5, 
+	NEG_VALUE_TOKEN := 6, 
+	POS_VALUE_TOKEN := 7
+}
+FIXUPIMMPD(src1[31:0], src2[31:0], src3[31:0], imm8[7:0]){
+	tsrc[31:0] := ((src2[30:23] == 0) AND (MXCSR.DAZ == 1)) ? 0.0 : src2[31:0]
+	CASE(tsrc[31:0] of TOKEN_TYPE)
+	QNAN_TOKEN:j := 0
+	SNAN_TOKEN:j := 1
+	ZERO_VALUE_TOKEN: j := 2
+	ONE_VALUE_TOKEN: j := 3
+	NEG_INF_TOKEN: j := 4
+	POS_INF_TOKEN: j := 5
+	NEG_VALUE_TOKEN: j := 6
+	POS_VALUE_TOKEN: j := 7
+	ESAC
+	
+	token_response[3:0] := src3[3+4*j:4*j]
+	
+	CASE(token_response[3:0]) of
+	0 : dest[31:0] := src1[31:0]
+	1 : dest[31:0] := tsrc[31:0]
+	2 : dest[31:0] := QNaN(tsrc[31:0])
+	3 : dest[31:0] := QNAN_Indefinite
+	4 : dest[31:0] := -INF
+	5 : dest[31:0] := +INF
+	6 : dest[31:0] := tsrc.sign? -INF : +INF
+	7 : dest[31:0] := -0
+	8 : dest[31:0] := +0
+	9 : dest[31:0] := -1
+	10: dest[31:0] := +1
+	11: dest[31:0] := 1/2
+	12: dest[31:0] := 90.0
+	13: dest[31:0] := PI/2
+	14: dest[31:0] := MAX_FLOAT
+	15: dest[31:0] := -MAX_FLOAT
+	ESAC
+	
+	CASE(tsrc[31:0] of TOKEN_TYPE)
+	ZERO_VALUE_TOKEN: if imm8[0] then set #ZE
+	ZERO_VALUE_TOKEN: if imm8[1] then set #IE
+	ONE_VALUE_TOKEN: if imm8[2] then set #ZE
+	ONE_VALUE_TOKEN: if imm8[3] then set #IE
+	SNAN_TOKEN: if imm8[4] then set #IE
+	NEG_INF_TOKEN: if imm8[5] then set #IE
+	NEG_VALUE_TOKEN: if imm8[6] then set #IE
+	POS_INF_TOKEN: if imm8[7] then set #IE
+	ESAC
+	RETURN dest[31:0]
+}
+
+FOR j := 0 to 15
+	i := j*32
+	dst[i+31:i] := FIXUPIMMPD(a[i+31:i], b[i+31:i], c[i+31:i], imm8[7:0])
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vfixupimmps' form='zmm {k}, zmm, zmm, imm {er}'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512" name="_mm512_mask_fixupimm_ps">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="a" type="__m512"/>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="b" type="__m512"/>
+	<parameter varname="c" type="__m512i"/>
+	<parameter varname="imm8" type="int"/>
+	<description>Fix up packed single-precision (32-bit) floating-point elements in "a" and "b" using packed 32-bit integers in "c", and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). "imm8" is used to set the required flags reporting.</description>
+	<operation>
+enum TOKEN_TYPE {
+	QNAN_TOKEN := 0, 
+	SNAN_TOKEN := 1, 
+	ZERO_VALUE_TOKEN := 2, 
+	ONE_VALUE_TOKEN := 3, 
+	NEG_INF_TOKEN := 4, 
+	POS_INF_TOKEN := 5, 
+	NEG_VALUE_TOKEN := 6, 
+	POS_VALUE_TOKEN := 7
+}
+FIXUPIMMPD(src1[31:0], src2[31:0], src3[31:0], imm8[7:0]){
+	tsrc[31:0] := ((src2[30:23] == 0) AND (MXCSR.DAZ == 1)) ? 0.0 : src2[31:0]
+	CASE(tsrc[31:0] of TOKEN_TYPE)
+	QNAN_TOKEN:j := 0
+	SNAN_TOKEN:j := 1
+	ZERO_VALUE_TOKEN: j := 2
+	ONE_VALUE_TOKEN: j := 3
+	NEG_INF_TOKEN: j := 4
+	POS_INF_TOKEN: j := 5
+	NEG_VALUE_TOKEN: j := 6
+	POS_VALUE_TOKEN: j := 7
+	ESAC
+	
+	token_response[3:0] := src3[3+4*j:4*j]
+	
+	CASE(token_response[3:0]) of
+	0 : dest[31:0] := src1[31:0]
+	1 : dest[31:0] := tsrc[31:0]
+	2 : dest[31:0] := QNaN(tsrc[31:0])
+	3 : dest[31:0] := QNAN_Indefinite
+	4 : dest[31:0] := -INF
+	5 : dest[31:0] := +INF
+	6 : dest[31:0] := tsrc.sign? -INF : +INF
+	7 : dest[31:0] := -0
+	8 : dest[31:0] := +0
+	9 : dest[31:0] := -1
+	10: dest[31:0] := +1
+	11: dest[31:0] := 1/2
+	12: dest[31:0] := 90.0
+	13: dest[31:0] := PI/2
+	14: dest[31:0] := MAX_FLOAT
+	15: dest[31:0] := -MAX_FLOAT
+	ESAC
+	
+	CASE(tsrc[31:0] of TOKEN_TYPE)
+	ZERO_VALUE_TOKEN: if imm8[0] then set #ZE
+	ZERO_VALUE_TOKEN: if imm8[1] then set #IE
+	ONE_VALUE_TOKEN: if imm8[2] then set #ZE
+	ONE_VALUE_TOKEN: if imm8[3] then set #IE
+	SNAN_TOKEN: if imm8[4] then set #IE
+	NEG_INF_TOKEN: if imm8[5] then set #IE
+	NEG_VALUE_TOKEN: if imm8[6] then set #IE
+	POS_INF_TOKEN: if imm8[7] then set #IE
+	ESAC
+	RETURN dest[31:0]
+}
+
+FOR j := 0 to 15
+	i := j*32
+	IF k[j]
+		dst[i+31:i] := FIXUPIMMPD(a[i+31:i], b[i+31:i], c[i+31:i], imm8[7:0])
+	ELSE
+		dst[i+31:i] := a[i+31:i]
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vfixupimmps' form='zmm {k}, zmm, zmm, imm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512" name="_mm512_mask_fixupimm_round_ps">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="a" type="__m512"/>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="b" type="__m512"/>
+	<parameter varname="c" type="__m512i"/>
+	<parameter varname="imm8" type="int"/>
+	<parameter varname="rounding" type="int"/>
+	<description>Fix up packed single-precision (32-bit) floating-point elements in "a" and "b" using packed 32-bit integers in "c", and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). "imm8" is used to set the required flags reporting.
+	[round_note]
+	</description>
+	<operation>
+enum TOKEN_TYPE {
+	QNAN_TOKEN := 0, 
+	SNAN_TOKEN := 1, 
+	ZERO_VALUE_TOKEN := 2, 
+	ONE_VALUE_TOKEN := 3, 
+	NEG_INF_TOKEN := 4, 
+	POS_INF_TOKEN := 5, 
+	NEG_VALUE_TOKEN := 6, 
+	POS_VALUE_TOKEN := 7
+}
+FIXUPIMMPD(src1[31:0], src2[31:0], src3[31:0], imm8[7:0]){
+	tsrc[31:0] := ((src2[30:23] == 0) AND (MXCSR.DAZ == 1)) ? 0.0 : src2[31:0]
+	CASE(tsrc[31:0] of TOKEN_TYPE)
+	QNAN_TOKEN:j := 0
+	SNAN_TOKEN:j := 1
+	ZERO_VALUE_TOKEN: j := 2
+	ONE_VALUE_TOKEN: j := 3
+	NEG_INF_TOKEN: j := 4
+	POS_INF_TOKEN: j := 5
+	NEG_VALUE_TOKEN: j := 6
+	POS_VALUE_TOKEN: j := 7
+	ESAC
+	
+	token_response[3:0] := src3[3+4*j:4*j]
+	
+	CASE(token_response[3:0]) of
+	0 : dest[31:0] := src1[31:0]
+	1 : dest[31:0] := tsrc[31:0]
+	2 : dest[31:0] := QNaN(tsrc[31:0])
+	3 : dest[31:0] := QNAN_Indefinite
+	4 : dest[31:0] := -INF
+	5 : dest[31:0] := +INF
+	6 : dest[31:0] := tsrc.sign? -INF : +INF
+	7 : dest[31:0] := -0
+	8 : dest[31:0] := +0
+	9 : dest[31:0] := -1
+	10: dest[31:0] := +1
+	11: dest[31:0] := 1/2
+	12: dest[31:0] := 90.0
+	13: dest[31:0] := PI/2
+	14: dest[31:0] := MAX_FLOAT
+	15: dest[31:0] := -MAX_FLOAT
+	ESAC
+	
+	CASE(tsrc[31:0] of TOKEN_TYPE)
+	ZERO_VALUE_TOKEN: if imm8[0] then set #ZE
+	ZERO_VALUE_TOKEN: if imm8[1] then set #IE
+	ONE_VALUE_TOKEN: if imm8[2] then set #ZE
+	ONE_VALUE_TOKEN: if imm8[3] then set #IE
+	SNAN_TOKEN: if imm8[4] then set #IE
+	NEG_INF_TOKEN: if imm8[5] then set #IE
+	NEG_VALUE_TOKEN: if imm8[6] then set #IE
+	POS_INF_TOKEN: if imm8[7] then set #IE
+	ESAC
+	RETURN dest[31:0]
+}
+
+FOR j := 0 to 15
+	i := j*32
+	IF k[j]
+		dst[i+31:i] := FIXUPIMMPD(a[i+31:i], b[i+31:i], c[i+31:i], imm8[7:0])
+	ELSE
+		dst[i+31:i] := a[i+31:i]
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vfixupimmps' form='zmm {k}, zmm, zmm, imm {er}'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512" name="_mm512_maskz_fixupimm_ps">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="a" type="__m512"/>
+	<parameter varname="b" type="__m512"/>
+	<parameter varname="c" type="__m512i"/>
+	<parameter varname="imm8" type="int"/>
+	<description>Fix up packed single-precision (32-bit) floating-point elements in "a" and "b" using packed 32-bit integers in "c", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). "imm8" is used to set the required flags reporting.</description>
+	<operation>
+enum TOKEN_TYPE {
+	QNAN_TOKEN := 0, 
+	SNAN_TOKEN := 1, 
+	ZERO_VALUE_TOKEN := 2, 
+	ONE_VALUE_TOKEN := 3, 
+	NEG_INF_TOKEN := 4, 
+	POS_INF_TOKEN := 5, 
+	NEG_VALUE_TOKEN := 6, 
+	POS_VALUE_TOKEN := 7
+}
+FIXUPIMMPD(src1[31:0], src2[31:0], src3[31:0], imm8[7:0]){
+	tsrc[31:0] := ((src2[30:23] == 0) AND (MXCSR.DAZ == 1)) ? 0.0 : src2[31:0]
+	CASE(tsrc[31:0] of TOKEN_TYPE)
+	QNAN_TOKEN:j := 0
+	SNAN_TOKEN:j := 1
+	ZERO_VALUE_TOKEN: j := 2
+	ONE_VALUE_TOKEN: j := 3
+	NEG_INF_TOKEN: j := 4
+	POS_INF_TOKEN: j := 5
+	NEG_VALUE_TOKEN: j := 6
+	POS_VALUE_TOKEN: j := 7
+	ESAC
+	
+	token_response[3:0] := src3[3+4*j:4*j]
+	
+	CASE(token_response[3:0]) of
+	0 : dest[31:0] := src1[31:0]
+	1 : dest[31:0] := tsrc[31:0]
+	2 : dest[31:0] := QNaN(tsrc[31:0])
+	3 : dest[31:0] := QNAN_Indefinite
+	4 : dest[31:0] := -INF
+	5 : dest[31:0] := +INF
+	6 : dest[31:0] := tsrc.sign? -INF : +INF
+	7 : dest[31:0] := -0
+	8 : dest[31:0] := +0
+	9 : dest[31:0] := -1
+	10: dest[31:0] := +1
+	11: dest[31:0] := 1/2
+	12: dest[31:0] := 90.0
+	13: dest[31:0] := PI/2
+	14: dest[31:0] := MAX_FLOAT
+	15: dest[31:0] := -MAX_FLOAT
+	ESAC
+	
+	CASE(tsrc[31:0] of TOKEN_TYPE)
+	ZERO_VALUE_TOKEN: if imm8[0] then set #ZE
+	ZERO_VALUE_TOKEN: if imm8[1] then set #IE
+	ONE_VALUE_TOKEN: if imm8[2] then set #ZE
+	ONE_VALUE_TOKEN: if imm8[3] then set #IE
+	SNAN_TOKEN: if imm8[4] then set #IE
+	NEG_INF_TOKEN: if imm8[5] then set #IE
+	NEG_VALUE_TOKEN: if imm8[6] then set #IE
+	POS_INF_TOKEN: if imm8[7] then set #IE
+	ESAC
+	RETURN dest[31:0]
+}
+
+FOR j := 0 to 15
+	i := j*32
+	IF k[j]
+		dst[i+31:i] := FIXUPIMMPD(a[i+31:i], b[i+31:i], c[i+31:i], imm8[7:0])
+	ELSE
+		dst[i+31:i] := 0
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vfixupimmps' form='zmm {k}, zmm, zmm, imm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512" name="_mm512_maskz_fixupimm_round_ps">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="a" type="__m512"/>
+	<parameter varname="b" type="__m512"/>
+	<parameter varname="c" type="__m512i"/>
+	<parameter varname="imm8" type="int"/>
+	<parameter varname="rounding" type="int"/>
+	<description>Fix up packed single-precision (32-bit) floating-point elements in "a" and "b" using packed 32-bit integers in "c", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). "imm8" is used to set the required flags reporting.
+	[round_note]
+	</description>
+	<operation>
+enum TOKEN_TYPE {
+	QNAN_TOKEN := 0, 
+	SNAN_TOKEN := 1, 
+	ZERO_VALUE_TOKEN := 2, 
+	ONE_VALUE_TOKEN := 3, 
+	NEG_INF_TOKEN := 4, 
+	POS_INF_TOKEN := 5, 
+	NEG_VALUE_TOKEN := 6, 
+	POS_VALUE_TOKEN := 7
+}
+FIXUPIMMPD(src1[31:0], src2[31:0], src3[31:0], imm8[7:0]){
+	tsrc[31:0] := ((src2[30:23] == 0) AND (MXCSR.DAZ == 1)) ? 0.0 : src2[31:0]
+	CASE(tsrc[31:0] of TOKEN_TYPE)
+	QNAN_TOKEN:j := 0
+	SNAN_TOKEN:j := 1
+	ZERO_VALUE_TOKEN: j := 2
+	ONE_VALUE_TOKEN: j := 3
+	NEG_INF_TOKEN: j := 4
+	POS_INF_TOKEN: j := 5
+	NEG_VALUE_TOKEN: j := 6
+	POS_VALUE_TOKEN: j := 7
+	ESAC
+	
+	token_response[3:0] := src3[3+4*j:4*j]
+	
+	CASE(token_response[3:0]) of
+	0 : dest[31:0] := src1[31:0]
+	1 : dest[31:0] := tsrc[31:0]
+	2 : dest[31:0] := QNaN(tsrc[31:0])
+	3 : dest[31:0] := QNAN_Indefinite
+	4 : dest[31:0] := -INF
+	5 : dest[31:0] := +INF
+	6 : dest[31:0] := tsrc.sign? -INF : +INF
+	7 : dest[31:0] := -0
+	8 : dest[31:0] := +0
+	9 : dest[31:0] := -1
+	10: dest[31:0] := +1
+	11: dest[31:0] := 1/2
+	12: dest[31:0] := 90.0
+	13: dest[31:0] := PI/2
+	14: dest[31:0] := MAX_FLOAT
+	15: dest[31:0] := -MAX_FLOAT
+	ESAC
+	
+	CASE(tsrc[31:0] of TOKEN_TYPE)
+	ZERO_VALUE_TOKEN: if imm8[0] then set #ZE
+	ZERO_VALUE_TOKEN: if imm8[1] then set #IE
+	ONE_VALUE_TOKEN: if imm8[2] then set #ZE
+	ONE_VALUE_TOKEN: if imm8[3] then set #IE
+	SNAN_TOKEN: if imm8[4] then set #IE
+	NEG_INF_TOKEN: if imm8[5] then set #IE
+	NEG_VALUE_TOKEN: if imm8[6] then set #IE
+	POS_INF_TOKEN: if imm8[7] then set #IE
+	ESAC
+	RETURN dest[31:0]
+}
+
+FOR j := 0 to 15
+	i := j*32
+	IF k[j]
+		dst[i+31:i] := FIXUPIMMPD(a[i+31:i], b[i+31:i], c[i+31:i], imm8[7:0])
+	ELSE
+		dst[i+31:i] := 0
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vfixupimmps' form='zmm {k}, zmm, zmm, imm {er}'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128d" name="_mm_fixupimm_round_sd">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="a" type="__m128d"/>
+	<parameter varname="b" type="__m128d"/>
+	<parameter varname="c" type="__m128i"/>
+	<parameter varname="imm8" type="int"/>
+	<parameter varname="rounding" type="int"/>
+	<description>Fix up the lower double-precision (64-bit) floating-point elements in "a" and "b" using the lower 64-bit integer in "c", store the result in the lower element of "dst", and copy the upper element from "a" to the upper element of "dst". "imm8" is used to set the required flags reporting.
+	[round_note]
+	</description>
+	<operation>
+enum TOKEN_TYPE {
+	QNAN_TOKEN := 0, 
+	SNAN_TOKEN := 1, 
+	ZERO_VALUE_TOKEN := 2, 
+	ONE_VALUE_TOKEN := 3, 
+	NEG_INF_TOKEN := 4, 
+	POS_INF_TOKEN := 5, 
+	NEG_VALUE_TOKEN := 6, 
+	POS_VALUE_TOKEN := 7
+}
+FIXUPIMMPD(src1[63:0], src2[63:0], src3[63:0], imm8[7:0]){
+	tsrc[63:0] := ((src2[62:52] == 0) AND (MXCSR.DAZ == 1)) ? 0.0 : src2[63:0]
+	CASE(tsrc[63:0] of TOKEN_TYPE)
+	QNAN_TOKEN:j := 0
+	SNAN_TOKEN:j := 1
+	ZERO_VALUE_TOKEN: j := 2
+	ONE_VALUE_TOKEN: j := 3
+	NEG_INF_TOKEN: j := 4
+	POS_INF_TOKEN: j := 5
+	NEG_VALUE_TOKEN: j := 6
+	POS_VALUE_TOKEN: j := 7
+	ESAC
+	
+	token_response[3:0] := src3[3+4*j:4*j]
+	
+	CASE(token_response[3:0]) of
+	0 : dest[63:0] := src1[63:0]
+	1 : dest[63:0] := tsrc[63:0]
+	2 : dest[63:0] := QNaN(tsrc[63:0])
+	3 : dest[63:0] := QNAN_Indefinite
+	4 : dest[63:0] := -INF
+	5 : dest[63:0] := +INF
+	6 : dest[63:0] := tsrc.sign? -INF : +INF
+	7 : dest[63:0] := -0
+	8 : dest[63:0] := +0
+	9 : dest[63:0] := -1
+	10: dest[63:0] := +1
+	11: dest[63:0] := 1/2
+	12: dest[63:0] := 90.0
+	13: dest[63:0] := PI/2
+	14: dest[63:0] := MAX_FLOAT
+	15: dest[63:0] := -MAX_FLOAT
+	ESAC
+	
+	CASE(tsrc[63:0] of TOKEN_TYPE)
+	ZERO_VALUE_TOKEN: if imm8[0] then set #ZE
+	ZERO_VALUE_TOKEN: if imm8[1] then set #IE
+	ONE_VALUE_TOKEN: if imm8[2] then set #ZE
+	ONE_VALUE_TOKEN: if imm8[3] then set #IE
+	SNAN_TOKEN: if imm8[4] then set #IE
+	NEG_INF_TOKEN: if imm8[5] then set #IE
+	NEG_VALUE_TOKEN: if imm8[6] then set #IE
+	POS_INF_TOKEN: if imm8[7] then set #IE
+	ESAC
+	RETURN dest[63:0]
+}
+
+dst[63:0] := FIXUPIMMPD(a[63:0], b[63:0], c[63:0], imm8[7:0])
+dst[127:64] := a[127:64]
+dst[MAX:128] := 0
+	</operation>
+	<instruction name='vfixupimmsd' form='xmm {k}, xmm, xmm, imm {er}'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128d" name="_mm_fixupimm_sd">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="a" type="__m128d"/>
+	<parameter varname="b" type="__m128d"/>
+	<parameter varname="c" type="__m128i"/>
+	<parameter varname="imm8" type="int"/>
+	<description>Fix up the lower double-precision (64-bit) floating-point elements in "a" and "b" using the lower 64-bit integer in "c", store the result in the lower element of "dst", and copy the upper element from "a" to the upper element of "dst". "imm8" is used to set the required flags reporting.</description>
+	<operation>
+enum TOKEN_TYPE {
+	QNAN_TOKEN := 0, 
+	SNAN_TOKEN := 1, 
+	ZERO_VALUE_TOKEN := 2, 
+	ONE_VALUE_TOKEN := 3, 
+	NEG_INF_TOKEN := 4, 
+	POS_INF_TOKEN := 5, 
+	NEG_VALUE_TOKEN := 6, 
+	POS_VALUE_TOKEN := 7
+}
+FIXUPIMMPD(src1[63:0], src2[63:0], src3[63:0], imm8[7:0]){
+	tsrc[63:0] := ((src2[62:52] == 0) AND (MXCSR.DAZ == 1)) ? 0.0 : src2[63:0]
+	CASE(tsrc[63:0] of TOKEN_TYPE)
+	QNAN_TOKEN:j := 0
+	SNAN_TOKEN:j := 1
+	ZERO_VALUE_TOKEN: j := 2
+	ONE_VALUE_TOKEN: j := 3
+	NEG_INF_TOKEN: j := 4
+	POS_INF_TOKEN: j := 5
+	NEG_VALUE_TOKEN: j := 6
+	POS_VALUE_TOKEN: j := 7
+	ESAC
+	
+	token_response[3:0] := src3[3+4*j:4*j]
+	
+	CASE(token_response[3:0]) of
+	0 : dest[63:0] := src1[63:0]
+	1 : dest[63:0] := tsrc[63:0]
+	2 : dest[63:0] := QNaN(tsrc[63:0])
+	3 : dest[63:0] := QNAN_Indefinite
+	4 : dest[63:0] := -INF
+	5 : dest[63:0] := +INF
+	6 : dest[63:0] := tsrc.sign? -INF : +INF
+	7 : dest[63:0] := -0
+	8 : dest[63:0] := +0
+	9 : dest[63:0] := -1
+	10: dest[63:0] := +1
+	11: dest[63:0] := 1/2
+	12: dest[63:0] := 90.0
+	13: dest[63:0] := PI/2
+	14: dest[63:0] := MAX_FLOAT
+	15: dest[63:0] := -MAX_FLOAT
+	ESAC
+	
+	CASE(tsrc[63:0] of TOKEN_TYPE)
+	ZERO_VALUE_TOKEN: if imm8[0] then set #ZE
+	ZERO_VALUE_TOKEN: if imm8[1] then set #IE
+	ONE_VALUE_TOKEN: if imm8[2] then set #ZE
+	ONE_VALUE_TOKEN: if imm8[3] then set #IE
+	SNAN_TOKEN: if imm8[4] then set #IE
+	NEG_INF_TOKEN: if imm8[5] then set #IE
+	NEG_VALUE_TOKEN: if imm8[6] then set #IE
+	POS_INF_TOKEN: if imm8[7] then set #IE
+	ESAC
+	RETURN dest[63:0]
+}
+
+dst[63:0] := FIXUPIMMPD(a[63:0], b[63:0], c[63:0], imm8[7:0])
+dst[127:64] := a[127:64]
+dst[MAX:128] := 0
+	</operation>
+	<instruction name='vfixupimmsd' form='xmm {k}, xmm, xmm, imm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128d" name="_mm_mask_fixupimm_round_sd">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="a" type="__m128d"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="b" type="__m128d"/>
+	<parameter varname="c" type="__m128i"/>
+	<parameter varname="imm8" type="int"/>
+	<parameter varname="rounding" type="int"/>
+	<description>Fix up the lower double-precision (64-bit) floating-point elements in "a" and "b" using the lower 64-bit integer in "c", store the result in the lower element of "dst" using writemask "k" (the element is copied from "a" when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst". "imm8" is used to set the required flags reporting.
+	[round_note]
+	</description>
+	<operation>
+enum TOKEN_TYPE {
+	QNAN_TOKEN := 0, 
+	SNAN_TOKEN := 1, 
+	ZERO_VALUE_TOKEN := 2, 
+	ONE_VALUE_TOKEN := 3, 
+	NEG_INF_TOKEN := 4, 
+	POS_INF_TOKEN := 5, 
+	NEG_VALUE_TOKEN := 6, 
+	POS_VALUE_TOKEN := 7
+}
+FIXUPIMMPD(src1[63:0], src2[63:0], src3[63:0], imm8[7:0]){
+	tsrc[63:0] := ((src2[62:52] == 0) AND (MXCSR.DAZ == 1)) ? 0.0 : src2[63:0]
+	CASE(tsrc[63:0] of TOKEN_TYPE)
+	QNAN_TOKEN:j := 0
+	SNAN_TOKEN:j := 1
+	ZERO_VALUE_TOKEN: j := 2
+	ONE_VALUE_TOKEN: j := 3
+	NEG_INF_TOKEN: j := 4
+	POS_INF_TOKEN: j := 5
+	NEG_VALUE_TOKEN: j := 6
+	POS_VALUE_TOKEN: j := 7
+	ESAC
+	
+	token_response[3:0] := src3[3+4*j:4*j]
+	
+	CASE(token_response[3:0]) of
+	0 : dest[63:0] := src1[63:0]
+	1 : dest[63:0] := tsrc[63:0]
+	2 : dest[63:0] := QNaN(tsrc[63:0])
+	3 : dest[63:0] := QNAN_Indefinite
+	4 : dest[63:0] := -INF
+	5 : dest[63:0] := +INF
+	6 : dest[63:0] := tsrc.sign? -INF : +INF
+	7 : dest[63:0] := -0
+	8 : dest[63:0] := +0
+	9 : dest[63:0] := -1
+	10: dest[63:0] := +1
+	11: dest[63:0] := 1/2
+	12: dest[63:0] := 90.0
+	13: dest[63:0] := PI/2
+	14: dest[63:0] := MAX_FLOAT
+	15: dest[63:0] := -MAX_FLOAT
+	ESAC
+	
+	CASE(tsrc[63:0] of TOKEN_TYPE)
+	ZERO_VALUE_TOKEN: if imm8[0] then set #ZE
+	ZERO_VALUE_TOKEN: if imm8[1] then set #IE
+	ONE_VALUE_TOKEN: if imm8[2] then set #ZE
+	ONE_VALUE_TOKEN: if imm8[3] then set #IE
+	SNAN_TOKEN: if imm8[4] then set #IE
+	NEG_INF_TOKEN: if imm8[5] then set #IE
+	NEG_VALUE_TOKEN: if imm8[6] then set #IE
+	POS_INF_TOKEN: if imm8[7] then set #IE
+	ESAC
+	RETURN dest[63:0]
+}
+
+IF k[0]
+	dst[63:0] := FIXUPIMMPD(a[63:0], b[63:0], c[63:0], imm8[7:0])
+ELSE
+	dst[63:0] := a[63:0]
+FI
+dst[127:64] := a[127:64]
+dst[MAX:128] := 0
+	</operation>
+	<instruction name='vfixupimmsd' form='xmm {k}, xmm, xmm, imm {er}'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128d" name="_mm_mask_fixupimm_sd">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="a" type="__m128d"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="b" type="__m128d"/>
+	<parameter varname="c" type="__m128i"/>
+	<parameter varname="imm8" type="int"/>
+	<description>Fix up the lower double-precision (64-bit) floating-point elements in "a" and "b" using the lower 64-bit integer in "c", store the result in the lower element of "dst" using writemask "k" (the element is copied from "a" when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst". "imm8" is used to set the required flags reporting.</description>
+	<operation>
+enum TOKEN_TYPE {
+	QNAN_TOKEN := 0, 
+	SNAN_TOKEN := 1, 
+	ZERO_VALUE_TOKEN := 2, 
+	ONE_VALUE_TOKEN := 3, 
+	NEG_INF_TOKEN := 4, 
+	POS_INF_TOKEN := 5, 
+	NEG_VALUE_TOKEN := 6, 
+	POS_VALUE_TOKEN := 7
+}
+FIXUPIMMPD(src1[63:0], src2[63:0], src3[63:0], imm8[7:0]){
+	tsrc[63:0] := ((src2[62:52] == 0) AND (MXCSR.DAZ == 1)) ? 0.0 : src2[63:0]
+	CASE(tsrc[63:0] of TOKEN_TYPE)
+	QNAN_TOKEN:j := 0
+	SNAN_TOKEN:j := 1
+	ZERO_VALUE_TOKEN: j := 2
+	ONE_VALUE_TOKEN: j := 3
+	NEG_INF_TOKEN: j := 4
+	POS_INF_TOKEN: j := 5
+	NEG_VALUE_TOKEN: j := 6
+	POS_VALUE_TOKEN: j := 7
+	ESAC
+	
+	token_response[3:0] := src3[3+4*j:4*j]
+	
+	CASE(token_response[3:0]) of
+	0 : dest[63:0] := src1[63:0]
+	1 : dest[63:0] := tsrc[63:0]
+	2 : dest[63:0] := QNaN(tsrc[63:0])
+	3 : dest[63:0] := QNAN_Indefinite
+	4 : dest[63:0] := -INF
+	5 : dest[63:0] := +INF
+	6 : dest[63:0] := tsrc.sign? -INF : +INF
+	7 : dest[63:0] := -0
+	8 : dest[63:0] := +0
+	9 : dest[63:0] := -1
+	10: dest[63:0] := +1
+	11: dest[63:0] := 1/2
+	12: dest[63:0] := 90.0
+	13: dest[63:0] := PI/2
+	14: dest[63:0] := MAX_FLOAT
+	15: dest[63:0] := -MAX_FLOAT
+	ESAC
+	
+	CASE(tsrc[63:0] of TOKEN_TYPE)
+	ZERO_VALUE_TOKEN: if imm8[0] then set #ZE
+	ZERO_VALUE_TOKEN: if imm8[1] then set #IE
+	ONE_VALUE_TOKEN: if imm8[2] then set #ZE
+	ONE_VALUE_TOKEN: if imm8[3] then set #IE
+	SNAN_TOKEN: if imm8[4] then set #IE
+	NEG_INF_TOKEN: if imm8[5] then set #IE
+	NEG_VALUE_TOKEN: if imm8[6] then set #IE
+	POS_INF_TOKEN: if imm8[7] then set #IE
+	ESAC
+	RETURN dest[63:0]
+}
+
+IF k[0]
+	dst[63:0] := FIXUPIMMPD(a[63:0], b[63:0], c[63:0], imm8[7:0])
+ELSE
+	dst[63:0] := a[63:0]
+FI
+dst[127:64] := a[127:64]
+dst[MAX:128] := 0
+	</operation>
+	<instruction name='vfixupimmsd' form='xmm {k}, xmm, xmm, imm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128d" name="_mm_maskz_fixupimm_round_sd">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128d"/>
+	<parameter varname="b" type="__m128d"/>
+	<parameter varname="c" type="__m128i"/>
+	<parameter varname="imm8" type="int"/>
+	<parameter varname="rounding" type="int"/>
+	<description>Fix up the lower double-precision (64-bit) floating-point elements in "a" and "b" using the lower 64-bit integer in "c", store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst". "imm8" is used to set the required flags reporting.
+	[round_note]
+	</description>
+	<operation>
+enum TOKEN_TYPE {
+	QNAN_TOKEN := 0, 
+	SNAN_TOKEN := 1, 
+	ZERO_VALUE_TOKEN := 2, 
+	ONE_VALUE_TOKEN := 3, 
+	NEG_INF_TOKEN := 4, 
+	POS_INF_TOKEN := 5, 
+	NEG_VALUE_TOKEN := 6, 
+	POS_VALUE_TOKEN := 7
+}
+FIXUPIMMPD(src1[63:0], src2[63:0], src3[63:0], imm8[7:0]){
+	tsrc[63:0] := ((src2[62:52] == 0) AND (MXCSR.DAZ == 1)) ? 0.0 : src2[63:0]
+	CASE(tsrc[63:0] of TOKEN_TYPE)
+	QNAN_TOKEN:j := 0
+	SNAN_TOKEN:j := 1
+	ZERO_VALUE_TOKEN: j := 2
+	ONE_VALUE_TOKEN: j := 3
+	NEG_INF_TOKEN: j := 4
+	POS_INF_TOKEN: j := 5
+	NEG_VALUE_TOKEN: j := 6
+	POS_VALUE_TOKEN: j := 7
+	ESAC
+	
+	token_response[3:0] := src3[3+4*j:4*j]
+	
+	CASE(token_response[3:0]) of
+	0 : dest[63:0] := src1[63:0]
+	1 : dest[63:0] := tsrc[63:0]
+	2 : dest[63:0] := QNaN(tsrc[63:0])
+	3 : dest[63:0] := QNAN_Indefinite
+	4 : dest[63:0] := -INF
+	5 : dest[63:0] := +INF
+	6 : dest[63:0] := tsrc.sign? -INF : +INF
+	7 : dest[63:0] := -0
+	8 : dest[63:0] := +0
+	9 : dest[63:0] := -1
+	10: dest[63:0] := +1
+	11: dest[63:0] := 1/2
+	12: dest[63:0] := 90.0
+	13: dest[63:0] := PI/2
+	14: dest[63:0] := MAX_FLOAT
+	15: dest[63:0] := -MAX_FLOAT
+	ESAC
+	
+	CASE(tsrc[63:0] of TOKEN_TYPE)
+	ZERO_VALUE_TOKEN: if imm8[0] then set #ZE
+	ZERO_VALUE_TOKEN: if imm8[1] then set #IE
+	ONE_VALUE_TOKEN: if imm8[2] then set #ZE
+	ONE_VALUE_TOKEN: if imm8[3] then set #IE
+	SNAN_TOKEN: if imm8[4] then set #IE
+	NEG_INF_TOKEN: if imm8[5] then set #IE
+	NEG_VALUE_TOKEN: if imm8[6] then set #IE
+	POS_INF_TOKEN: if imm8[7] then set #IE
+	ESAC
+	RETURN dest[63:0]
+}
+
+IF k[0]
+	dst[63:0] := FIXUPIMMPD(a[63:0], b[63:0], c[63:0], imm8[7:0])
+ELSE
+	dst[63:0] := 0
+FI
+dst[127:64] := a[127:64]
+dst[MAX:128] := 0
+	</operation>
+	<instruction name='vfixupimmsd' form='xmm {k}, xmm, xmm, imm {er}'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128d" name="_mm_maskz_fixupimm_sd">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128d"/>
+	<parameter varname="b" type="__m128d"/>
+	<parameter varname="c" type="__m128i"/>
+	<parameter varname="imm8" type="int"/>
+	<description>Fix up the lower double-precision (64-bit) floating-point elements in "a" and "b" using the lower 64-bit integer in "c", store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst". "imm8" is used to set the required flags reporting.</description>
+	<operation>
+enum TOKEN_TYPE {
+	QNAN_TOKEN := 0, 
+	SNAN_TOKEN := 1, 
+	ZERO_VALUE_TOKEN := 2, 
+	ONE_VALUE_TOKEN := 3, 
+	NEG_INF_TOKEN := 4, 
+	POS_INF_TOKEN := 5, 
+	NEG_VALUE_TOKEN := 6, 
+	POS_VALUE_TOKEN := 7
+}
+FIXUPIMMPD(src1[63:0], src2[63:0], src3[63:0], imm8[7:0]){
+	tsrc[63:0] := ((src2[62:52] == 0) AND (MXCSR.DAZ == 1)) ? 0.0 : src2[63:0]
+	CASE(tsrc[63:0] of TOKEN_TYPE)
+	QNAN_TOKEN:j := 0
+	SNAN_TOKEN:j := 1
+	ZERO_VALUE_TOKEN: j := 2
+	ONE_VALUE_TOKEN: j := 3
+	NEG_INF_TOKEN: j := 4
+	POS_INF_TOKEN: j := 5
+	NEG_VALUE_TOKEN: j := 6
+	POS_VALUE_TOKEN: j := 7
+	ESAC
+	
+	token_response[3:0] := src3[3+4*j:4*j]
+	
+	CASE(token_response[3:0]) of
+	0 : dest[63:0] := src1[63:0]
+	1 : dest[63:0] := tsrc[63:0]
+	2 : dest[63:0] := QNaN(tsrc[63:0])
+	3 : dest[63:0] := QNAN_Indefinite
+	4 : dest[63:0] := -INF
+	5 : dest[63:0] := +INF
+	6 : dest[63:0] := tsrc.sign? -INF : +INF
+	7 : dest[63:0] := -0
+	8 : dest[63:0] := +0
+	9 : dest[63:0] := -1
+	10: dest[63:0] := +1
+	11: dest[63:0] := 1/2
+	12: dest[63:0] := 90.0
+	13: dest[63:0] := PI/2
+	14: dest[63:0] := MAX_FLOAT
+	15: dest[63:0] := -MAX_FLOAT
+	ESAC
+	
+	CASE(tsrc[63:0] of TOKEN_TYPE)
+	ZERO_VALUE_TOKEN: if imm8[0] then set #ZE
+	ZERO_VALUE_TOKEN: if imm8[1] then set #IE
+	ONE_VALUE_TOKEN: if imm8[2] then set #ZE
+	ONE_VALUE_TOKEN: if imm8[3] then set #IE
+	SNAN_TOKEN: if imm8[4] then set #IE
+	NEG_INF_TOKEN: if imm8[5] then set #IE
+	NEG_VALUE_TOKEN: if imm8[6] then set #IE
+	POS_INF_TOKEN: if imm8[7] then set #IE
+	ESAC
+	RETURN dest[63:0]
+}
+
+IF k[0]
+	dst[63:0] := FIXUPIMMPD(a[63:0], b[63:0], c[63:0], imm8[7:0])
+ELSE
+	dst[63:0] := 0
+FI
+dst[127:64] := a[127:64]
+dst[MAX:128] := 0
+	</operation>
+	<instruction name='vfixupimmsd' form='xmm {k}, xmm, xmm, imm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128" name="_mm_fixupimm_round_ss">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="a" type="__m128"/>
+	<parameter varname="b" type="__m128"/>
+	<parameter varname="c" type="__m128i"/>
+	<parameter varname="imm8" type="int"/>
+	<parameter varname="rounding" type="int"/>
+	<description>Fix up the lower single-precision (32-bit) floating-point elements in "a" and "b" using the lower 32-bit integer in "c", store the result in the lower element of "dst", and copy the upper 3 packed elements from "a" to the upper elements of "dst". "imm8" is used to set the required flags reporting.
+	[round_note]
+	</description>
+	<operation>
+enum TOKEN_TYPE {
+	QNAN_TOKEN := 0, 
+	SNAN_TOKEN := 1, 
+	ZERO_VALUE_TOKEN := 2, 
+	ONE_VALUE_TOKEN := 3, 
+	NEG_INF_TOKEN := 4, 
+	POS_INF_TOKEN := 5, 
+	NEG_VALUE_TOKEN := 6, 
+	POS_VALUE_TOKEN := 7
+}
+FIXUPIMMPD(src1[31:0], src2[31:0], src3[31:0], imm8[7:0]){
+	tsrc[31:0] := ((src2[30:23] == 0) AND (MXCSR.DAZ == 1)) ? 0.0 : src2[31:0]
+	CASE(tsrc[31:0] of TOKEN_TYPE)
+	QNAN_TOKEN:j := 0
+	SNAN_TOKEN:j := 1
+	ZERO_VALUE_TOKEN: j := 2
+	ONE_VALUE_TOKEN: j := 3
+	NEG_INF_TOKEN: j := 4
+	POS_INF_TOKEN: j := 5
+	NEG_VALUE_TOKEN: j := 6
+	POS_VALUE_TOKEN: j := 7
+	ESAC
+	
+	token_response[3:0] := src3[3+4*j:4*j]
+	
+	CASE(token_response[3:0]) of
+	0 : dest[31:0] := src1[31:0]
+	1 : dest[31:0] := tsrc[31:0]
+	2 : dest[31:0] := QNaN(tsrc[31:0])
+	3 : dest[31:0] := QNAN_Indefinite
+	4 : dest[31:0] := -INF
+	5 : dest[31:0] := +INF
+	6 : dest[31:0] := tsrc.sign? -INF : +INF
+	7 : dest[31:0] := -0
+	8 : dest[31:0] := +0
+	9 : dest[31:0] := -1
+	10: dest[31:0] := +1
+	11: dest[31:0] := 1/2
+	12: dest[31:0] := 90.0
+	13: dest[31:0] := PI/2
+	14: dest[31:0] := MAX_FLOAT
+	15: dest[31:0] := -MAX_FLOAT
+	ESAC
+	
+	CASE(tsrc[31:0] of TOKEN_TYPE)
+	ZERO_VALUE_TOKEN: if imm8[0] then set #ZE
+	ZERO_VALUE_TOKEN: if imm8[1] then set #IE
+	ONE_VALUE_TOKEN: if imm8[2] then set #ZE
+	ONE_VALUE_TOKEN: if imm8[3] then set #IE
+	SNAN_TOKEN: if imm8[4] then set #IE
+	NEG_INF_TOKEN: if imm8[5] then set #IE
+	NEG_VALUE_TOKEN: if imm8[6] then set #IE
+	POS_INF_TOKEN: if imm8[7] then set #IE
+	ESAC
+	RETURN dest[31:0]
+}
+
+dst[31:0] := FIXUPIMMPD(a[31:0], b[31:0], c[31:0], imm8[7:0])
+dst[127:32] := a[127:32]
+dst[MAX:128] := 0
+	</operation>
+	<instruction name='vfixupimmss' form='xmm {k}, xmm, xmm, imm {er}'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128" name="_mm_fixupimm_ss">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="a" type="__m128"/>
+	<parameter varname="b" type="__m128"/>
+	<parameter varname="c" type="__m128i"/>
+	<parameter varname="imm8" type="int"/>
+	<description>Fix up the lower single-precision (32-bit) floating-point elements in "a" and "b" using the lower 32-bit integer in "c", store the result in the lower element of "dst", and copy the upper 3 packed elements from "a" to the upper elements of "dst". "imm8" is used to set the required flags reporting.</description>
+	<operation>
+enum TOKEN_TYPE {
+	QNAN_TOKEN := 0, 
+	SNAN_TOKEN := 1, 
+	ZERO_VALUE_TOKEN := 2, 
+	ONE_VALUE_TOKEN := 3, 
+	NEG_INF_TOKEN := 4, 
+	POS_INF_TOKEN := 5, 
+	NEG_VALUE_TOKEN := 6, 
+	POS_VALUE_TOKEN := 7
+}
+FIXUPIMMPD(src1[31:0], src2[31:0], src3[31:0], imm8[7:0]){
+	tsrc[31:0] := ((src2[30:23] == 0) AND (MXCSR.DAZ == 1)) ? 0.0 : src2[31:0]
+	CASE(tsrc[31:0] of TOKEN_TYPE)
+	QNAN_TOKEN:j := 0
+	SNAN_TOKEN:j := 1
+	ZERO_VALUE_TOKEN: j := 2
+	ONE_VALUE_TOKEN: j := 3
+	NEG_INF_TOKEN: j := 4
+	POS_INF_TOKEN: j := 5
+	NEG_VALUE_TOKEN: j := 6
+	POS_VALUE_TOKEN: j := 7
+	ESAC
+	
+	token_response[3:0] := src3[3+4*j:4*j]
+	
+	CASE(token_response[3:0]) of
+	0 : dest[31:0] := src1[31:0]
+	1 : dest[31:0] := tsrc[31:0]
+	2 : dest[31:0] := QNaN(tsrc[31:0])
+	3 : dest[31:0] := QNAN_Indefinite
+	4 : dest[31:0] := -INF
+	5 : dest[31:0] := +INF
+	6 : dest[31:0] := tsrc.sign? -INF : +INF
+	7 : dest[31:0] := -0
+	8 : dest[31:0] := +0
+	9 : dest[31:0] := -1
+	10: dest[31:0] := +1
+	11: dest[31:0] := 1/2
+	12: dest[31:0] := 90.0
+	13: dest[31:0] := PI/2
+	14: dest[31:0] := MAX_FLOAT
+	15: dest[31:0] := -MAX_FLOAT
+	ESAC
+	
+	CASE(tsrc[31:0] of TOKEN_TYPE)
+	ZERO_VALUE_TOKEN: if imm8[0] then set #ZE
+	ZERO_VALUE_TOKEN: if imm8[1] then set #IE
+	ONE_VALUE_TOKEN: if imm8[2] then set #ZE
+	ONE_VALUE_TOKEN: if imm8[3] then set #IE
+	SNAN_TOKEN: if imm8[4] then set #IE
+	NEG_INF_TOKEN: if imm8[5] then set #IE
+	NEG_VALUE_TOKEN: if imm8[6] then set #IE
+	POS_INF_TOKEN: if imm8[7] then set #IE
+	ESAC
+	RETURN dest[31:0]
+}
+
+dst[31:0] := FIXUPIMMPD(a[31:0], b[31:0], c[31:0], imm8[7:0])
+dst[127:32] := a[127:32]
+dst[MAX:128] := 0
+	</operation>
+	<instruction name='vfixupimmss' form='xmm {k}, xmm, xmm, imm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128" name="_mm_mask_fixupimm_round_ss">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="a" type="__m128"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="b" type="__m128"/>
+	<parameter varname="c" type="__m128i"/>
+	<parameter varname="imm8" type="int"/>
+	<parameter varname="rounding" type="int"/>
+	<description>Fix up the lower single-precision (32-bit) floating-point elements in "a" and "b" using the lower 32-bit integer in "c", store the result in the lower element of "dst" using writemask "k" (the element is copied from "a" when mask bit 0 is not set), and copy the upper 3 packed elements from "a" to the upper elements of "dst". "imm8" is used to set the required flags reporting.
+	[round_note]
+	</description>
+	<operation>
+enum TOKEN_TYPE {
+	QNAN_TOKEN := 0, 
+	SNAN_TOKEN := 1, 
+	ZERO_VALUE_TOKEN := 2, 
+	ONE_VALUE_TOKEN := 3, 
+	NEG_INF_TOKEN := 4, 
+	POS_INF_TOKEN := 5, 
+	NEG_VALUE_TOKEN := 6, 
+	POS_VALUE_TOKEN := 7
+}
+FIXUPIMMPD(src1[31:0], src2[31:0], src3[31:0], imm8[7:0]){
+	tsrc[31:0] := ((src2[30:23] == 0) AND (MXCSR.DAZ == 1)) ? 0.0 : src2[31:0]
+	CASE(tsrc[31:0] of TOKEN_TYPE)
+	QNAN_TOKEN:j := 0
+	SNAN_TOKEN:j := 1
+	ZERO_VALUE_TOKEN: j := 2
+	ONE_VALUE_TOKEN: j := 3
+	NEG_INF_TOKEN: j := 4
+	POS_INF_TOKEN: j := 5
+	NEG_VALUE_TOKEN: j := 6
+	POS_VALUE_TOKEN: j := 7
+	ESAC
+	
+	token_response[3:0] := src3[3+4*j:4*j]
+	
+	CASE(token_response[3:0]) of
+	0 : dest[31:0] := src1[31:0]
+	1 : dest[31:0] := tsrc[31:0]
+	2 : dest[31:0] := QNaN(tsrc[31:0])
+	3 : dest[31:0] := QNAN_Indefinite
+	4 : dest[31:0] := -INF
+	5 : dest[31:0] := +INF
+	6 : dest[31:0] := tsrc.sign? -INF : +INF
+	7 : dest[31:0] := -0
+	8 : dest[31:0] := +0
+	9 : dest[31:0] := -1
+	10: dest[31:0] := +1
+	11: dest[31:0] := 1/2
+	12: dest[31:0] := 90.0
+	13: dest[31:0] := PI/2
+	14: dest[31:0] := MAX_FLOAT
+	15: dest[31:0] := -MAX_FLOAT
+	ESAC
+	
+	CASE(tsrc[31:0] of TOKEN_TYPE)
+	ZERO_VALUE_TOKEN: if imm8[0] then set #ZE
+	ZERO_VALUE_TOKEN: if imm8[1] then set #IE
+	ONE_VALUE_TOKEN: if imm8[2] then set #ZE
+	ONE_VALUE_TOKEN: if imm8[3] then set #IE
+	SNAN_TOKEN: if imm8[4] then set #IE
+	NEG_INF_TOKEN: if imm8[5] then set #IE
+	NEG_VALUE_TOKEN: if imm8[6] then set #IE
+	POS_INF_TOKEN: if imm8[7] then set #IE
+	ESAC
+	RETURN dest[31:0]
+}
+
+IF k[0]
+	dst[31:0] := FIXUPIMMPD(a[31:0], b[31:0], c[31:0], imm8[7:0])
+ELSE
+	dst[31:0] := a[31:0]
+FI
+dst[127:32] := a[127:32]
+dst[MAX:128] := 0
+	</operation>
+	<instruction name='vfixupimmss' form='xmm {k}, xmm, xmm, imm {er}'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128" name="_mm_mask_fixupimm_ss">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="a" type="__m128"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="b" type="__m128"/>
+	<parameter varname="c" type="__m128i"/>
+	<parameter varname="imm8" type="int"/>
+	<description>Fix up the lower single-precision (32-bit) floating-point elements in "a" and "b" using the lower 32-bit integer in "c", store the result in the lower element of "dst" using writemask "k" (the element is copied from "a" when mask bit 0 is not set), and copy the upper 3 packed elements from "a" to the upper elements of "dst". "imm8" is used to set the required flags reporting.</description>
+	<operation>
+enum TOKEN_TYPE {
+	QNAN_TOKEN := 0, 
+	SNAN_TOKEN := 1, 
+	ZERO_VALUE_TOKEN := 2, 
+	ONE_VALUE_TOKEN := 3, 
+	NEG_INF_TOKEN := 4, 
+	POS_INF_TOKEN := 5, 
+	NEG_VALUE_TOKEN := 6, 
+	POS_VALUE_TOKEN := 7
+}
+FIXUPIMMPD(src1[31:0], src2[31:0], src3[31:0], imm8[7:0]){
+	tsrc[31:0] := ((src2[30:23] == 0) AND (MXCSR.DAZ == 1)) ? 0.0 : src2[31:0]
+	CASE(tsrc[31:0] of TOKEN_TYPE)
+	QNAN_TOKEN:j := 0
+	SNAN_TOKEN:j := 1
+	ZERO_VALUE_TOKEN: j := 2
+	ONE_VALUE_TOKEN: j := 3
+	NEG_INF_TOKEN: j := 4
+	POS_INF_TOKEN: j := 5
+	NEG_VALUE_TOKEN: j := 6
+	POS_VALUE_TOKEN: j := 7
+	ESAC
+	
+	token_response[3:0] := src3[3+4*j:4*j]
+	
+	CASE(token_response[3:0]) of
+	0 : dest[31:0] := src1[31:0]
+	1 : dest[31:0] := tsrc[31:0]
+	2 : dest[31:0] := QNaN(tsrc[31:0])
+	3 : dest[31:0] := QNAN_Indefinite
+	4 : dest[31:0] := -INF
+	5 : dest[31:0] := +INF
+	6 : dest[31:0] := tsrc.sign? -INF : +INF
+	7 : dest[31:0] := -0
+	8 : dest[31:0] := +0
+	9 : dest[31:0] := -1
+	10: dest[31:0] := +1
+	11: dest[31:0] := 1/2
+	12: dest[31:0] := 90.0
+	13: dest[31:0] := PI/2
+	14: dest[31:0] := MAX_FLOAT
+	15: dest[31:0] := -MAX_FLOAT
+	ESAC
+	
+	CASE(tsrc[31:0] of TOKEN_TYPE)
+	ZERO_VALUE_TOKEN: if imm8[0] then set #ZE
+	ZERO_VALUE_TOKEN: if imm8[1] then set #IE
+	ONE_VALUE_TOKEN: if imm8[2] then set #ZE
+	ONE_VALUE_TOKEN: if imm8[3] then set #IE
+	SNAN_TOKEN: if imm8[4] then set #IE
+	NEG_INF_TOKEN: if imm8[5] then set #IE
+	NEG_VALUE_TOKEN: if imm8[6] then set #IE
+	POS_INF_TOKEN: if imm8[7] then set #IE
+	ESAC
+	RETURN dest[31:0]
+}
+
+IF k[0]
+	dst[31:0] := FIXUPIMMPD(a[31:0], b[31:0], c[31:0], imm8[7:0])
+ELSE
+	dst[31:0] := a[31:0]
+FI
+dst[127:32] := a[127:32]
+dst[MAX:128] := 0
+	</operation>
+	<instruction name='vfixupimmss' form='xmm {k}, xmm, xmm, imm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128" name="_mm_maskz_fixupimm_round_ss">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128"/>
+	<parameter varname="b" type="__m128"/>
+	<parameter varname="c" type="__m128i"/>
+	<parameter varname="imm8" type="int"/>
+	<parameter varname="rounding" type="int"/>
+	<description>Fix up the lower single-precision (32-bit) floating-point elements in "a" and "b" using the lower 32-bit integer in "c", store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from "a" to the upper elements of "dst". "imm8" is used to set the required flags reporting.
+	[round_note]
+	</description>
+	<operation>
+enum TOKEN_TYPE {
+	QNAN_TOKEN := 0, 
+	SNAN_TOKEN := 1, 
+	ZERO_VALUE_TOKEN := 2, 
+	ONE_VALUE_TOKEN := 3, 
+	NEG_INF_TOKEN := 4, 
+	POS_INF_TOKEN := 5, 
+	NEG_VALUE_TOKEN := 6, 
+	POS_VALUE_TOKEN := 7
+}
+FIXUPIMMPD(src1[31:0], src2[31:0], src3[31:0], imm8[7:0]){
+	tsrc[31:0] := ((src2[30:23] == 0) AND (MXCSR.DAZ == 1)) ? 0.0 : src2[31:0]
+	CASE(tsrc[31:0] of TOKEN_TYPE)
+	QNAN_TOKEN:j := 0
+	SNAN_TOKEN:j := 1
+	ZERO_VALUE_TOKEN: j := 2
+	ONE_VALUE_TOKEN: j := 3
+	NEG_INF_TOKEN: j := 4
+	POS_INF_TOKEN: j := 5
+	NEG_VALUE_TOKEN: j := 6
+	POS_VALUE_TOKEN: j := 7
+	ESAC
+	
+	token_response[3:0] := src3[3+4*j:4*j]
+	
+	CASE(token_response[3:0]) of
+	0 : dest[31:0] := src1[31:0]
+	1 : dest[31:0] := tsrc[31:0]
+	2 : dest[31:0] := QNaN(tsrc[31:0])
+	3 : dest[31:0] := QNAN_Indefinite
+	4 : dest[31:0] := -INF
+	5 : dest[31:0] := +INF
+	6 : dest[31:0] := tsrc.sign? -INF : +INF
+	7 : dest[31:0] := -0
+	8 : dest[31:0] := +0
+	9 : dest[31:0] := -1
+	10: dest[31:0] := +1
+	11: dest[31:0] := 1&#x2044;2
+	12: dest[31:0] := 90.0
+	13: dest[31:0] := PI/2
+	14: dest[31:0] := MAX_FLOAT
+	15: dest[31:0] := -MAX_FLOAT
+	ESAC
+	
+	CASE(tsrc[31:0] of TOKEN_TYPE)
+	ZERO_VALUE_TOKEN: if imm8[0] then set #ZE
+	ZERO_VALUE_TOKEN: if imm8[1] then set #IE
+	ONE_VALUE_TOKEN: if imm8[2] then set #ZE
+	ONE_VALUE_TOKEN: if imm8[3] then set #IE
+	SNAN_TOKEN: if imm8[4] then set #IE
+	NEG_INF_TOKEN: if imm8[5] then set #IE
+	NEG_VALUE_TOKEN: if imm8[6] then set #IE
+	POS_INF_TOKEN: if imm8[7] then set #IE
+	ESAC
+	RETURN dest[31:0]
+}
+
+IF k[0]
+	dst[31:0] := FIXUPIMMPD(a[31:0], b[31:0], c[31:0], imm8[7:0])
+ELSE
+	dst[31:0] := 0
+FI
+dst[127:32] := a[127:32]
+dst[MAX:128] := 0
+	</operation>
+	<instruction name='vfixupimmss' form='xmm {k}, xmm, xmm, imm {er}'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128" name="_mm_maskz_fixupimm_ss">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128"/>
+	<parameter varname="b" type="__m128"/>
+	<parameter varname="c" type="__m128i"/>
+	<parameter varname="imm8" type="int"/>
+	<description>Fix up the lower single-precision (32-bit) floating-point elements in "a" and "b" using the lower 32-bit integer in "c", store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from "a" to the upper elements of "dst". "imm8" is used to set the required flags reporting.</description>
+	<operation>
+enum TOKEN_TYPE {
+	QNAN_TOKEN := 0, 
+	SNAN_TOKEN := 1, 
+	ZERO_VALUE_TOKEN := 2, 
+	ONE_VALUE_TOKEN := 3, 
+	NEG_INF_TOKEN := 4, 
+	POS_INF_TOKEN := 5, 
+	NEG_VALUE_TOKEN := 6, 
+	POS_VALUE_TOKEN := 7
+}
+FIXUPIMMPD(src1[31:0], src2[31:0], src3[31:0], imm8[7:0]){
+	tsrc[31:0] := ((src2[30:23] == 0) AND (MXCSR.DAZ == 1)) ? 0.0 : src2[31:0]
+	CASE(tsrc[31:0] of TOKEN_TYPE)
+	QNAN_TOKEN:j := 0
+	SNAN_TOKEN:j := 1
+	ZERO_VALUE_TOKEN: j := 2
+	ONE_VALUE_TOKEN: j := 3
+	NEG_INF_TOKEN: j := 4
+	POS_INF_TOKEN: j := 5
+	NEG_VALUE_TOKEN: j := 6
+	POS_VALUE_TOKEN: j := 7
+	ESAC
+	
+	token_response[3:0] := src3[3+4*j:4*j]
+	
+	CASE(token_response[3:0]) of
+	0 : dest[31:0] := src1[31:0]
+	1 : dest[31:0] := tsrc[31:0]
+	2 : dest[31:0] := QNaN(tsrc[31:0])
+	3 : dest[31:0] := QNAN_Indefinite
+	4 : dest[31:0] := -INF
+	5 : dest[31:0] := +INF
+	6 : dest[31:0] := tsrc.sign? -INF : +INF
+	7 : dest[31:0] := -0
+	8 : dest[31:0] := +0
+	9 : dest[31:0] := -1
+	10: dest[31:0] := +1
+	11: dest[31:0] := 1&#x2044;2
+	12: dest[31:0] := 90.0
+	13: dest[31:0] := PI/2
+	14: dest[31:0] := MAX_FLOAT
+	15: dest[31:0] := -MAX_FLOAT
+	ESAC
+	
+	CASE(tsrc[31:0] of TOKEN_TYPE)
+	ZERO_VALUE_TOKEN: if imm8[0] then set #ZE
+	ZERO_VALUE_TOKEN: if imm8[1] then set #IE
+	ONE_VALUE_TOKEN: if imm8[2] then set #ZE
+	ONE_VALUE_TOKEN: if imm8[3] then set #IE
+	SNAN_TOKEN: if imm8[4] then set #IE
+	NEG_INF_TOKEN: if imm8[5] then set #IE
+	NEG_VALUE_TOKEN: if imm8[6] then set #IE
+	POS_INF_TOKEN: if imm8[7] then set #IE
+	ESAC
+	RETURN dest[31:0]
+}
+
+IF k[0]
+	dst[31:0] := FIXUPIMMPD(a[31:0], b[31:0], c[31:0], imm8[7:0])
+ELSE
+	dst[31:0] := 0
+FI
+dst[127:32] := a[127:32]
+dst[MAX:128] := 0
+	</operation>
+	<instruction name='vfixupimmss' form='xmm {k}, xmm, xmm, imm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" rettype="__m512d" name="_mm512_fmadd_pd">
+	<type>Floating Point</type>
+	<CPUID>AVX512F/KNCNI</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="a" type="__m512d"/>
+	<parameter varname="b" type="__m512d"/>
+	<parameter varname="c" type="__m512d"/>
+	<description>Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", add the intermediate result to packed elements in "c", and store the results in "dst". 
+	</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*64
+	dst[i+63:i] := (a[i+63:i] * b[i+63:i]) + c[i+63:i]
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vfmadd132pd' form='zmm {k}, zmm, zmm'/>
+	<instruction name='vfmadd213pd' form='zmm {k}, zmm, zmm'/>
+	<instruction name='vfmadd231pd' form='zmm {k}, zmm, zmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" rettype="__m512d" name="_mm512_fmadd_round_pd">
+	<type>Floating Point</type>
+	<CPUID>AVX512F/KNCNI</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="a" type="__m512d"/>
+	<parameter varname="b" type="__m512d"/>
+	<parameter varname="c" type="__m512d"/>
+	<parameter varname="rounding" type="int"/>
+	<description>Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", add the intermediate result to packed elements in "c", and store the results in "dst". 
+	[round_note]
+	</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*64
+	dst[i+63:i] := (a[i+63:i] * b[i+63:i]) + c[i+63:i]
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vfmadd132pd' form='zmm {k}, zmm, zmm {er}'/>
+	<instruction name='vfmadd213pd' form='zmm {k}, zmm, zmm {er}'/>
+	<instruction name='vfmadd231pd' form='zmm {k}, zmm, zmm {er}'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" rettype="__m512d" name="_mm512_mask3_fmadd_pd">
+	<type>Floating Point</type>
+	<CPUID>AVX512F/KNCNI</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="a" type="__m512d"/>
+	<parameter varname="b" type="__m512d"/>
+	<parameter varname="c" type="__m512d"/>
+	<parameter varname="k" type="__mmask8"/>
+	<description>Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", add the intermediate result to packed elements in "c", and store the results in "dst" using writemask "k" (elements are copied from "c" when the corresponding mask bit is not set). 
+	</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*64
+	IF k[j]
+		dst[i+63:i] := (a[i+63:i] * b[i+63:i]) + c[i+63:i]
+	ELSE
+		dst[i+63:i] := c[i+63:i]
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vfmadd132pd' form='zmm {k}, zmm, zmm'/>
+	<instruction name='vfmadd213pd' form='zmm {k}, zmm, zmm'/>
+	<instruction name='vfmadd231pd' form='zmm {k}, zmm, zmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" rettype="__m512d" name="_mm512_mask3_fmadd_round_pd">
+	<type>Floating Point</type>
+	<CPUID>AVX512F/KNCNI</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="a" type="__m512d"/>
+	<parameter varname="b" type="__m512d"/>
+	<parameter varname="c" type="__m512d"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="rounding" type="int"/>
+	<description>Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", add the intermediate result to packed elements in "c", and store the results in "dst" using writemask "k" (elements are copied from "c" when the corresponding mask bit is not set). 
+	[round_note]
+	</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*64
+	IF k[j]
+		dst[i+63:i] := (a[i+63:i] * b[i+63:i]) + c[i+63:i]
+	ELSE 
+		dst[i+63:i] := c[i+63:i]
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vfmadd132pd' form='zmm {k}, zmm, zmm {er}'/>
+	<instruction name='vfmadd213pd' form='zmm {k}, zmm, zmm {er}'/>
+	<instruction name='vfmadd231pd' form='zmm {k}, zmm, zmm {er}'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" rettype="__m512d" name="_mm512_mask_fmadd_pd">
+	<type>Floating Point</type>
+	<CPUID>AVX512F/KNCNI</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="a" type="__m512d"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="b" type="__m512d"/>
+	<parameter varname="c" type="__m512d"/>
+	<description>Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", add the intermediate result to packed elements in "c", and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). 
+	</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*64
+	IF k[j]
+		dst[i+63:i] := (a[i+63:i] * b[i+63:i]) + c[i+63:i]
+	ELSE
+		dst[i+63:i] := a[i+63:i]
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vfmadd132pd' form='zmm {k}, zmm, zmm'/>
+	<instruction name='vfmadd213pd' form='zmm {k}, zmm, zmm'/>
+	<instruction name='vfmadd231pd' form='zmm {k}, zmm, zmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" rettype="__m512d" name="_mm512_mask_fmadd_round_pd">
+	<type>Floating Point</type>
+	<CPUID>AVX512F/KNCNI</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="a" type="__m512d"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="b" type="__m512d"/>
+	<parameter varname="c" type="__m512d"/>
+	<parameter varname="rounding" type="int"/>
+	<description>Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", add the intermediate result to packed elements in "c", and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set).
+	[round_note]</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*64
+	IF k[j]
+		dst[i+63:i] := (a[i+63:i] * b[i+63:i]) + c[i+63:i]
+	ELSE
+		dst[i+63:i] := a[i+63:i]
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vfmadd132pd' form='zmm {k}, zmm, zmm {er}'/>
+	<instruction name='vfmadd213pd' form='zmm {k}, zmm, zmm {er}'/>
+	<instruction name='vfmadd231pd' form='zmm {k}, zmm, zmm {er}'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512d" name="_mm512_maskz_fmadd_pd">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m512d"/>
+	<parameter varname="b" type="__m512d"/>
+	<parameter varname="c" type="__m512d"/>
+	<description>Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", add the intermediate result to packed elements in "c", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). 
+	</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*64
+	IF k[j]
+		dst[i+63:i] := (a[i+63:i] * b[i+63:i]) + c[i+63:i]
+	ELSE
+		dst[i+63:i] := 0
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vfmadd132pd' form='zmm {k}, zmm, zmm'/>
+	<instruction name='vfmadd213pd' form='zmm {k}, zmm, zmm'/>
+	<instruction name='vfmadd231pd' form='zmm {k}, zmm, zmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512d" name="_mm512_maskz_fmadd_round_pd">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m512d"/>
+	<parameter varname="b" type="__m512d"/>
+	<parameter varname="c" type="__m512d"/>
+	<parameter varname="rounding" type="const int"/>
+	<description>Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", add the intermediate result to packed elements in "c", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). 
+	[round_note]</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*64
+	IF k[j]
+		dst[i+63:i] := (a[i+63:i] * b[i+63:i]) + c[i+63:i]
+	ELSE
+		dst[i+63:i] := 0
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vfmadd132pd' form='zmm {k}, zmm, zmm {er}'/>
+	<instruction name='vfmadd213pd' form='zmm {k}, zmm, zmm {er}'/>
+	<instruction name='vfmadd231pd' form='zmm {k}, zmm, zmm {er}'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" rettype="__m512" name="_mm512_fmadd_ps">
+	<type>Floating Point</type>
+	<CPUID>AVX512F/KNCNI</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="a" type="__m512"/>
+	<parameter varname="b" type="__m512"/>
+	<parameter varname="c" type="__m512"/>
+	<description>Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", add the intermediate result to packed elements in "c", and store the results in "dst". 
+	</description>
+	<operation>
+FOR j := 0 to 15
+	i := j*32
+	dst[i+31:i] := (a[i+31:i] * b[i+31:i]) + c[i+31:i]
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vfmadd132ps' form='zmm {k}, zmm, zmm'/>
+	<instruction name='vfmadd213ps' form='zmm {k}, zmm, zmm'/>
+	<instruction name='vfmadd231ps' form='zmm {k}, zmm, zmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" rettype="__m512" name="_mm512_fmadd_round_ps">
+	<type>Floating Point</type>
+	<CPUID>AVX512F/KNCNI</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="a" type="__m512"/>
+	<parameter varname="b" type="__m512"/>
+	<parameter varname="c" type="__m512"/>
+	<parameter varname="rounding" type="int"/>
+	<description>Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", add the intermediate result to packed elements in "c", and store the results in "dst". 
+	[round_note]</description>
+	<operation>
+FOR j := 0 to 15
+	i := j*32
+	dst[i+31:i] := (a[i+31:i] * b[i+31:i]) + c[i+31:i]
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vfmadd132ps' form='zmm {k}, zmm, zmm {er}'/>
+	<instruction name='vfmadd213ps' form='zmm {k}, zmm, zmm {er}'/>
+	<instruction name='vfmadd231ps' form='zmm {k}, zmm, zmm {er}'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" rettype="__m512" name="_mm512_mask3_fmadd_ps">
+	<type>Floating Point</type>
+	<CPUID>AVX512F/KNCNI</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="a" type="__m512"/>
+	<parameter varname="b" type="__m512"/>
+	<parameter varname="c" type="__m512"/>
+	<parameter varname="k" type="__mmask16"/>
+	<description>Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", add the intermediate result to packed elements in "c", and store the results in "dst" using writemask "k" (elements are copied from "c" when the corresponding mask bit is not set). 
+	</description>
+	<operation>
+FOR j := 0 to 15
+	i := j*32
+	IF k[j]
+		dst[i+31:i] := (a[i+31:i] * b[i+31:i]) + c[i+31:i]
+	ELSE
+		dst[i+31:i] := c[i+31:i]
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vfmadd132ps' form='zmm {k}, zmm, zmm'/>
+	<instruction name='vfmadd213ps' form='zmm {k}, zmm, zmm'/>
+	<instruction name='vfmadd231ps' form='zmm {k}, zmm, zmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" rettype="__m512" name="_mm512_mask3_fmadd_round_ps">
+	<type>Floating Point</type>
+	<CPUID>AVX512F/KNCNI</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="a" type="__m512"/>
+	<parameter varname="b" type="__m512"/>
+	<parameter varname="c" type="__m512"/>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="rounding" type="int"/>
+	<description>Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", add the intermediate result to packed elements in "c", and store the results in "dst" using writemask "k" (elements are copied from "c" when the corresponding mask bit is not set). 
+	[round_note]</description>
+	<operation>
+FOR j := 0 to 15
+	i := j*32
+	IF k[j]
+		dst[i+31:i] := (a[i+31:i] * b[i+31:i]) + c[i+31:i]
+	ELSE
+		dst[i+31:i] := c[i+31:i]
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vfmadd132ps' form='zmm {k}, zmm, zmm {er}'/>
+	<instruction name='vfmadd213ps' form='zmm {k}, zmm, zmm {er}'/>
+	<instruction name='vfmadd231ps' form='zmm {k}, zmm, zmm {er}'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" rettype="__m512" name="_mm512_mask_fmadd_ps">
+	<type>Floating Point</type>
+	<CPUID>AVX512F/KNCNI</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="a" type="__m512"/>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="b" type="__m512"/>
+	<parameter varname="c" type="__m512"/>
+	<description>Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", add the intermediate result to packed elements in "c", and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). 
+	</description>
+	<operation>
+FOR j := 0 to 15
+	i := j*32
+	IF k[j]
+		dst[i+31:i] := (a[i+31:i] * b[i+31:i]) + c[i+31:i]
+	ELSE
+		dst[i+31:i] := a[i+31:i]
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vfmadd132ps' form='zmm {k}, zmm, zmm'/>
+	<instruction name='vfmadd213ps' form='zmm {k}, zmm, zmm'/>
+	<instruction name='vfmadd231ps' form='zmm {k}, zmm, zmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" rettype="__m512" name="_mm512_mask_fmadd_round_ps">
+	<type>Floating Point</type>
+	<CPUID>AVX512F/KNCNI</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="a" type="__m512"/>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="b" type="__m512"/>
+	<parameter varname="c" type="__m512"/>
+	<parameter varname="rounding" type="int"/>
+	<description>Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", add the intermediate result to packed elements in "c", and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). 
+	[round_note]</description>
+	<operation>
+FOR j := 0 to 15
+	i := j*32
+	IF k[j]
+		dst[i+31:i] := (a[i+31:i] * b[i+31:i]) + c[i+31:i]
+	ELSE
+		dst[i+31:i] := a[i+31:i]
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vfmadd132ps' form='zmm {k}, zmm, zmm {er}'/>
+	<instruction name='vfmadd213ps' form='zmm {k}, zmm, zmm {er}'/>
+	<instruction name='vfmadd231ps' form='zmm {k}, zmm, zmm {er}'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512" name="_mm512_maskz_fmadd_ps">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="a" type="__m512"/>
+	<parameter varname="b" type="__m512"/>
+	<parameter varname="c" type="__m512"/>
+	<description>Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", add the intermediate result to packed elements in "c", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). 
+	</description>
+	<operation>
+FOR j := 0 to 15
+	i := j*32
+	IF k[j]
+		dst[i+31:i] := (a[i+31:i] * b[i+31:i]) + c[i+31:i]
+	ELSE
+		dst[i+31:i] := 0
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vfmadd132ps' form='zmm {k}, zmm, zmm'/>
+	<instruction name='vfmadd213ps' form='zmm {k}, zmm, zmm'/>
+	<instruction name='vfmadd231ps' form='zmm {k}, zmm, zmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512" name="_mm512_maskz_fmadd_round_ps">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="a" type="__m512"/>
+	<parameter varname="b" type="__m512"/>
+	<parameter varname="c" type="__m512"/>
+	<parameter varname="rounding" type="const int"/>
+	<description>Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", add the intermediate result to packed elements in "c", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). 
+	[round_note]</description>
+	<operation>
+FOR j := 0 to 15
+	i := j*32
+	IF k[j]
+		dst[i+31:i] := (a[i+31:i] * b[i+31:i]) + c[i+31:i]
+	ELSE
+		dst[i+31:i] := 0
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vfmadd132ps' form='zmm {k}, zmm, zmm {er}'/>
+	<instruction name='vfmadd213ps' form='zmm {k}, zmm, zmm {er}'/>
+	<instruction name='vfmadd231ps' form='zmm {k}, zmm, zmm {er}'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128d" name="_mm_fmadd_round_sd">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="a" type="__m128d"/>
+	<parameter varname="b" type="__m128d"/>
+	<parameter varname="c" type="__m128d"/>
+	<parameter varname="rounding" type="int"/>
+	<description>Multiply the lower double-precision (64-bit) floating-point elements in "a" and "b", and add the intermediate result to the lower element in "c". Store the result in the lower element of "dst", and copy the upper element from "a" to the upper element of "dst".
+	[round_note]</description>
+	<operation>
+dst[63:0] := (a[63:0] * b[63:0]) + c[63:0]
+dst[127:64] := a[127:64]
+dst[MAX:128] := 0
+	</operation>
+	<instruction name='vfmadd132sd' form='xmm {k}, xmm, xmm {er}'/>
+	<instruction name='vfmadd213sd' form='xmm {k}, xmm, xmm {er}'/>
+	<instruction name='vfmadd231sd' form='xmm {k}, xmm, xmm {er}'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128d" name="_mm_mask3_fmadd_round_sd">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="a" type="__m128d"/>
+	<parameter varname="b" type="__m128d"/>
+	<parameter varname="c" type="__m128d"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="rounding" type="int"/>
+	<description>Multiply the lower double-precision (64-bit) floating-point elements in "a" and "b", and add the intermediate result to the lower element in "c". Store the result in the lower element of "dst" using writemask "k" (the element is copied from "c" when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst".
+	[round_note]</description>
+	<operation>
+IF k[0]
+	dst[63:0] := (a[63:0] * b[63:0]) + c[63:0]
+ELSE
+	dst[63:0] := c[63:0]
+FI
+dst[127:64] := a[127:64]
+dst[MAX:128] := 0
+	</operation>
+	<instruction name='vfmadd132sd' form='xmm {k}, xmm, xmm {er}'/>
+	<instruction name='vfmadd213sd' form='xmm {k}, xmm, xmm {er}'/>
+	<instruction name='vfmadd231sd' form='xmm {k}, xmm, xmm {er}'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128d" name="_mm_mask3_fmadd_sd">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="a" type="__m128d"/>
+	<parameter varname="b" type="__m128d"/>
+	<parameter varname="c" type="__m128d"/>
+	<parameter varname="k" type="__mmask8"/>
+	<description>Multiply the lower double-precision (64-bit) floating-point elements in "a" and "b", and add the intermediate result to the lower element in "c". Store the result in the lower element of "dst" using writemask "k" (the element is copied from "c" when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst".
+	</description>
+	<operation>
+IF k[0]
+	dst[63:0] := (a[63:0] * b[63:0]) + c[63:0]
+ELSE
+	dst[63:0] := c[63:0]
+FI
+dst[127:64] := a[127:64]
+dst[MAX:128] := 0
+	</operation>
+	<instruction name='vfmadd132sd' form='xmm {k}, xmm, xmm'/>
+	<instruction name='vfmadd213sd' form='xmm {k}, xmm, xmm'/>
+	<instruction name='vfmadd231sd' form='xmm {k}, xmm, xmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128d" name="_mm_mask_fmadd_round_sd">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="a" type="__m128d"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="b" type="__m128d"/>
+	<parameter varname="c" type="__m128d"/>
+	<parameter varname="rounding" type="int"/>
+	<description>Multiply the lower double-precision (64-bit) floating-point elements in "a" and "b", and add the intermediate result to the lower element in "c". Store the result in the lower element of "dst" using writemask "k" (the element is copied from "a" when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst".
+	[round_note]</description>
+	<operation>
+IF k[0]
+	dst[63:0] := (a[63:0] * b[63:0]) + c[63:0]
+ELSE
+	dst[63:0] := a[63:0]
+FI
+dst[127:64] := a[127:64]
+dst[MAX:128] := 0
+	</operation>
+	<instruction name='vfmadd132sd' form='xmm {k}, xmm, xmm {er}'/>
+	<instruction name='vfmadd213sd' form='xmm {k}, xmm, xmm {er}'/>
+	<instruction name='vfmadd231sd' form='xmm {k}, xmm, xmm {er}'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128d" name="_mm_mask_fmadd_sd">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="a" type="__m128d"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="b" type="__m128d"/>
+	<parameter varname="c" type="__m128d"/>
+	<description>Multiply the lower double-precision (64-bit) floating-point elements in "a" and "b", and add the intermediate result to the lower element in "c". Store the result in the lower element of "dst" using writemask "k" (the element is copied from "a" when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst".
+	</description>
+	<operation>
+IF k[0]
+	dst[63:0] := (a[63:0] * b[63:0]) + c[63:0]
+ELSE
+	dst[63:0] := a[63:0]
+FI
+dst[127:64] := a[127:64]
+dst[MAX:128] := 0
+	</operation>
+	<instruction name='vfmadd132sd' form='xmm {k}, xmm, xmm'/>
+	<instruction name='vfmadd213sd' form='xmm {k}, xmm, xmm'/>
+	<instruction name='vfmadd231sd' form='xmm {k}, xmm, xmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128d" name="_mm_maskz_fmadd_round_sd">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128d"/>
+	<parameter varname="b" type="__m128d"/>
+	<parameter varname="c" type="__m128d"/>
+	<parameter varname="rounding" type="int"/>
+	<description>Multiply the lower double-precision (64-bit) floating-point elements in "a" and "b", and add the intermediate result to the lower element in "c". Store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst".
+	[round_note]</description>
+	<operation>
+IF k[0]
+	dst[63:0] := (a[63:0] * b[63:0]) + c[63:0]
+ELSE
+	dst[63:0] := 0
+FI
+dst[127:64] := a[127:64]
+dst[MAX:128] := 0
+	</operation>
+	<instruction name='vfmadd132sd' form='xmm {k}, xmm, xmm {er}'/>
+	<instruction name='vfmadd213sd' form='xmm {k}, xmm, xmm {er}'/>
+	<instruction name='vfmadd231sd' form='xmm {k}, xmm, xmm {er}'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128d" name="_mm_maskz_fmadd_sd">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128d"/>
+	<parameter varname="b" type="__m128d"/>
+	<parameter varname="c" type="__m128d"/>
+	<description>Multiply the lower double-precision (64-bit) floating-point elements in "a" and "b", and add the intermediate result to the lower element in "c". Store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst".
+	</description>
+	<operation>
+IF k[0]
+	dst[63:0] := (a[63:0] * b[63:0]) + c[63:0]
+ELSE
+	dst[63:0] := 0
+FI
+dst[127:64] := a[127:64]
+dst[MAX:128] := 0
+	</operation>
+	<instruction name='vfmadd132sd' form='xmm {k}, xmm, xmm'/>
+	<instruction name='vfmadd213sd' form='xmm {k}, xmm, xmm'/>
+	<instruction name='vfmadd231sd' form='xmm {k}, xmm, xmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128" name="_mm_mask3_fmadd_round_ss">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="a" type="__m128"/>
+	<parameter varname="b" type="__m128"/>
+	<parameter varname="c" type="__m128"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="rounding" type="int"/>
+	<description>Multiply the lower single-precision (32-bit) floating-point elements in "a" and "b", and add the intermediate result to the lower element in "c". Store the result in the lower element of "dst" using writemask "k" (the element is copied from "c" when mask bit 0 is not set), and copy the upper 3 packed elements from "a" to the upper elements of "dst".
+	[round_note]</description>
+	<operation>
+IF k[0]
+	dst[31:0] := (a[31:0] * b[31:0]) + c[31:0]
+ELSE
+	dst[31:0] := c[31:0]
+FI
+dst[127:32] := a[127:32]
+dst[MAX:128] := 0
+	</operation>
+	<instruction name='vfmadd132ss' form='xmm {k}, xmm, xmm {er}'/>
+	<instruction name='vfmadd213ss' form='xmm {k}, xmm, xmm {er}'/>
+	<instruction name='vfmadd231ss' form='xmm {k}, xmm, xmm {er}'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128" name="_mm_mask3_fmadd_ss">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="a" type="__m128"/>
+	<parameter varname="b" type="__m128"/>
+	<parameter varname="c" type="__m128"/>
+	<parameter varname="k" type="__mmask8"/>
+	<description>Multiply the lower single-precision (32-bit) floating-point elements in "a" and "b", and add the intermediate result to the lower element in "c". Store the result in the lower element of "dst" using writemask "k" (the element is copied from "c" when mask bit 0 is not set), and copy the upper 3 packed elements from "a" to the upper elements of "dst".
+	</description>
+	<operation>
+IF k[0]
+	dst[31:0] := (a[31:0] * b[31:0]) + c[31:0]
+ELSE
+	dst[31:0] := c[31:0]
+FI
+dst[127:32] := a[127:32]
+dst[MAX:128] := 0
+	</operation>
+	<instruction name='vfmadd132ss' form='xmm {k}, xmm, xmm'/>
+	<instruction name='vfmadd213ss' form='xmm {k}, xmm, xmm'/>
+	<instruction name='vfmadd231ss' form='xmm {k}, xmm, xmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128" name="_mm_fmadd_round_ss">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="a" type="__m128"/>
+	<parameter varname="b" type="__m128"/>
+	<parameter varname="c" type="__m128"/>
+	<parameter varname="rounding" type="int"/>
+	<description>Multiply the lower single-precision (32-bit) floating-point elements in "a" and "b", and add the intermediate result to the lower element in "c". Store the result in the lower element of "dst", and copy the upper 3 packed elements from "a" to the upper elements of "dst".
+	[round_note]</description>
+	<operation>
+dst[31:0] := (a[31:0] * b[31:0]) + c[31:0]
+dst[127:32] := a[127:32]
+dst[MAX:128] := 0
+	</operation>
+	<instruction name='vfmadd132ss' form='xmm {k}, xmm, xmm {er}'/>
+	<instruction name='vfmadd213ss' form='xmm {k}, xmm, xmm {er}'/>
+	<instruction name='vfmadd231ss' form='xmm {k}, xmm, xmm {er}'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128" name="_mm_mask_fmadd_round_ss">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="a" type="__m128"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="b" type="__m128"/>
+	<parameter varname="c" type="__m128"/>
+	<parameter varname="rounding" type="int"/>
+	<description>Multiply the lower single-precision (32-bit) floating-point elements in "a" and "b", and add the intermediate result to the lower element in "c". Store the result in the lower element of "dst" using writemask "k" (the element is copied from "a" when mask bit 0 is not set), and copy the upper 3 packed elements from "a" to the upper elements of "dst".
+	[round_note]</description>
+	<operation>
+IF k[0]
+	dst[31:0] := (a[31:0] * b[31:0]) + c[31:0]
+ELSE
+	dst[31:0] := a[31:0]
+FI
+dst[127:32] := a[127:32]
+dst[MAX:128] := 0
+	</operation>
+	<instruction name='vfmadd132ss' form='xmm {k}, xmm, xmm {er}'/>
+	<instruction name='vfmadd213ss' form='xmm {k}, xmm, xmm {er}'/>
+	<instruction name='vfmadd231ss' form='xmm {k}, xmm, xmm {er}'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128" name="_mm_mask_fmadd_ss">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="a" type="__m128"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="b" type="__m128"/>
+	<parameter varname="c" type="__m128"/>
+	<description>Multiply the lower single-precision (32-bit) floating-point elements in "a" and "b", and add the intermediate result to the lower element in "c". Store the result in the lower element of "dst" using writemask "k" (the element is copied from "a" when mask bit 0 is not set), and copy the upper 3 packed elements from "a" to the upper elements of "dst".
+	</description>
+	<operation>
+IF k[0]
+	dst[31:0] := (a[31:0] * b[31:0]) + c[31:0]
+ELSE
+	dst[31:0] := a[31:0]
+FI
+dst[127:32] := a[127:32]
+dst[MAX:128] := 0
+	</operation>
+	<instruction name='vfmadd132ss' form='xmm {k}, xmm, xmm'/>
+	<instruction name='vfmadd213ss' form='xmm {k}, xmm, xmm'/>
+	<instruction name='vfmadd231ss' form='xmm {k}, xmm, xmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128" name="_mm_maskz_fmadd_round_ss">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128"/>
+	<parameter varname="b" type="__m128"/>
+	<parameter varname="c" type="__m128"/>
+	<parameter varname="rounding" type="int"/>
+	<description>Multiply the lower single-precision (32-bit) floating-point elements in "a" and "b", and add the intermediate result to the lower element in "c". Store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from "a" to the upper elements of "dst".
+	[round_note]</description>
+	<operation>
+IF k[0]
+	dst[31:0] := (a[31:0] * b[31:0]) + c[31:0]
+ELSE
+	dst[31:0] := 0
+FI
+dst[127:32] := a[127:32]
+dst[MAX:128] := 0
+	</operation>
+	<instruction name='vfmadd132ss' form='xmm {k}, xmm, xmm {er}'/>
+	<instruction name='vfmadd213ss' form='xmm {k}, xmm, xmm {er}'/>
+	<instruction name='vfmadd231ss' form='xmm {k}, xmm, xmm {er}'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128" name="_mm_maskz_fmadd_ss">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128"/>
+	<parameter varname="b" type="__m128"/>
+	<parameter varname="c" type="__m128"/>
+	<description>Multiply the lower single-precision (32-bit) floating-point elements in "a" and "b", and add the intermediate result to the lower element in "c". Store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from "a" to the upper elements of "dst".
+	</description>
+	<operation>
+IF k[0]
+	dst[31:0] := (a[31:0] * b[31:0]) + c[31:0]
+ELSE
+	dst[31:0] := 0
+FI
+dst[127:32] := a[127:32]
+dst[MAX:128] := 0
+	</operation>
+	<instruction name='vfmadd132ss' form='xmm {k}, xmm, xmm'/>
+	<instruction name='vfmadd213ss' form='xmm {k}, xmm, xmm'/>
+	<instruction name='vfmadd231ss' form='xmm {k}, xmm, xmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512d" name="_mm512_fmaddsub_pd">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="a" type="__m512d"/>
+	<parameter varname="b" type="__m512d"/>
+	<parameter varname="c" type="__m512d"/>
+	<description>Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", alternatively add and subtract packed elements in "c" to/from the intermediate result, and store the results in "dst". 
+	</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*64
+	IF (j is even) 
+		dst[i+63:i] := (a[i+63:i] * b[i+63:i]) - c[i+63:i]
+	ELSE
+		dst[i+63:i] := (a[i+63:i] * b[i+63:i]) + c[i+63:i]
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vfmaddsub132pd' form='zmm {k}, zmm, zmm'/>
+	<instruction name='vfmaddsub213pd' form='zmm {k}, zmm, zmm'/>
+	<instruction name='vfmaddsub231pd' form='zmm {k}, zmm, zmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512d" name="_mm512_fmaddsub_round_pd">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="a" type="__m512d"/>
+	<parameter varname="b" type="__m512d"/>
+	<parameter varname="c" type="__m512d"/>
+	<parameter varname="rounding" type="const int"/>
+	<description>Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", alternatively add and subtract packed elements in "c" to/from the intermediate result, and store the results in "dst". 
+	[round_note]</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*64
+	IF (j is even) 
+		dst[i+63:i] := (a[i+63:i] * b[i+63:i]) - c[i+63:i]
+	ELSE
+		dst[i+63:i] := (a[i+63:i] * b[i+63:i]) + c[i+63:i]
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vfmaddsub132pd' form='zmm {k}, zmm, zmm {er}'/>
+	<instruction name='vfmaddsub213pd' form='zmm {k}, zmm, zmm {er}'/>
+	<instruction name='vfmaddsub231pd' form='zmm {k}, zmm, zmm {er}'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512d" name="_mm512_mask3_fmaddsub_pd">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="a" type="__m512d"/>
+	<parameter varname="b" type="__m512d"/>
+	<parameter varname="c" type="__m512d"/>
+	<parameter varname="k" type="__mmask8"/>
+	<description>Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", alternatively add and subtract packed elements in "c" to/from the intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "c" when the corresponding mask bit is not set).  </description>
+	<operation>
+FOR j := 0 to 7
+	i := j*64
+	IF k[j]
+		IF (j is even) 
+			dst[i+63:i] := (a[i+63:i] * b[i+63:i]) - c[i+63:i]
+		ELSE
+			dst[i+63:i] := (a[i+63:i] * b[i+63:i]) + c[i+63:i]
+		FI
+	ELSE
+		dst[i+63:i] := c[i+63:i]
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vfmaddsub132pd' form='zmm {k}, zmm, zmm'/>
+	<instruction name='vfmaddsub213pd' form='zmm {k}, zmm, zmm'/>
+	<instruction name='vfmaddsub231pd' form='zmm {k}, zmm, zmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512d" name="_mm512_mask3_fmaddsub_round_pd">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="a" type="__m512d"/>
+	<parameter varname="b" type="__m512d"/>
+	<parameter varname="c" type="__m512d"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="rounding" type="const int"/>
+	<description>Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", alternatively add and subtract packed elements in "c" to/from the intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "c" when the corresponding mask bit is not set).  [round_note]</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*64
+	IF k[j]
+		IF (j is even) 
+			dst[i+63:i] := (a[i+63:i] * b[i+63:i]) - c[i+63:i]
+		ELSE
+			dst[i+63:i] := (a[i+63:i] * b[i+63:i]) + c[i+63:i]
+		FI
+	ELSE 
+		dst[i+63:i] := c[i+63:i]
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vfmaddsub132pd' form='zmm {k}, zmm, zmm {er}'/>
+	<instruction name='vfmaddsub213pd' form='zmm {k}, zmm, zmm {er}'/>
+	<instruction name='vfmaddsub231pd' form='zmm {k}, zmm, zmm {er}'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512d" name="_mm512_mask_fmaddsub_pd">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="a" type="__m512d"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="b" type="__m512d"/>
+	<parameter varname="c" type="__m512d"/>
+	<description>Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", alternatively add and subtract packed elements in "c" to/from the intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 7
+	i := j*64
+	IF k[j]
+		IF (j is even) 
+			dst[i+63:i] := (a[i+63:i] * b[i+63:i]) - c[i+63:i]
+		ELSE
+			dst[i+63:i] := (a[i+63:i] * b[i+63:i]) + c[i+63:i]
+		FI
+	ELSE
+		dst[i+63:i] := a[i+63:i]
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vfmaddsub132pd' form='zmm {k}, zmm, zmm'/>
+	<instruction name='vfmaddsub213pd' form='zmm {k}, zmm, zmm'/>
+	<instruction name='vfmaddsub231pd' form='zmm {k}, zmm, zmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512d" name="_mm512_mask_fmaddsub_round_pd">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="a" type="__m512d"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="b" type="__m512d"/>
+	<parameter varname="c" type="__m512d"/>
+	<parameter varname="rounding" type="const int"/>
+	<description>Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", alternatively add and subtract packed elements in "c" to/from the intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). [round_note]</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*64
+	IF k[j]
+		IF (j is even) 
+			dst[i+63:i] := (a[i+63:i] * b[i+63:i]) - c[i+63:i]
+		ELSE
+			dst[i+63:i] := (a[i+63:i] * b[i+63:i]) + c[i+63:i]
+		FI
+	ELSE
+		dst[i+63:i] := a[i+63:i]
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vfmaddsub132pd' form='zmm {k}, zmm, zmm {er}'/>
+	<instruction name='vfmaddsub213pd' form='zmm {k}, zmm, zmm {er}'/>
+	<instruction name='vfmaddsub231pd' form='zmm {k}, zmm, zmm {er}'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512d" name="_mm512_maskz_fmaddsub_pd">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m512d"/>
+	<parameter varname="b" type="__m512d"/>
+	<parameter varname="c" type="__m512d"/>
+	<description>Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", alternatively add and subtract packed elements in "c" to/from the intermediate result, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). 
+	</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*64
+	IF k[j]
+		IF (j is even) 
+			dst[i+63:i] := (a[i+63:i] * b[i+63:i]) - c[i+63:i]
+		ELSE
+			dst[i+63:i] := (a[i+63:i] * b[i+63:i]) + c[i+63:i]
+		FI
+	ELSE
+		dst[i+63:i] := 0
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vfmaddsub132pd' form='zmm {k}, zmm, zmm'/>
+	<instruction name='vfmaddsub213pd' form='zmm {k}, zmm, zmm'/>
+	<instruction name='vfmaddsub231pd' form='zmm {k}, zmm, zmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512d" name="_mm512_maskz_fmaddsub_round_pd">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m512d"/>
+	<parameter varname="b" type="__m512d"/>
+	<parameter varname="c" type="__m512d"/>
+	<parameter varname="rounding" type="const int"/>
+	<description>Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", alternatively add and subtract packed elements in "c" to/from the intermediate result, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). 
+	[round_note]</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*64
+	IF k[j]
+		IF (j is even) 
+			dst[i+63:i] := (a[i+63:i] * b[i+63:i]) - c[i+63:i]
+		ELSE
+			dst[i+63:i] := (a[i+63:i] * b[i+63:i]) + c[i+63:i]
+		FI
+	ELSE
+		dst[i+63:i] := 0
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vfmaddsub132pd' form='zmm {k}, zmm, zmm {er}'/>
+	<instruction name='vfmaddsub213pd' form='zmm {k}, zmm, zmm {er}'/>
+	<instruction name='vfmaddsub231pd' form='zmm {k}, zmm, zmm {er}'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512" name="_mm512_fmaddsub_ps">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="a" type="__m512"/>
+	<parameter varname="b" type="__m512"/>
+	<parameter varname="c" type="__m512"/>
+	<description>Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", alternatively add and subtract packed elements in "c" to/from the intermediate result, and store the results in "dst". 
+	</description>
+	<operation>
+FOR j := 0 to 15
+	i := j*32
+	IF (j is even) 
+		dst[i+31:i] := (a[i+31:i] * b[i+31:i]) - c[i+31:i]
+	ELSE
+		dst[i+31:i] := (a[i+31:i] * b[i+31:i]) + c[i+31:i]
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vfmaddsub132ps' form='zmm {k}, zmm, zmm'/>
+	<instruction name='vfmaddsub213ps' form='zmm {k}, zmm, zmm'/>
+	<instruction name='vfmaddsub231ps' form='zmm {k}, zmm, zmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512" name="_mm512_fmaddsub_round_ps">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="a" type="__m512"/>
+	<parameter varname="b" type="__m512"/>
+	<parameter varname="c" type="__m512"/>
+	<parameter varname="rounding" type="const int"/>
+	<description>Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", alternatively add and subtract packed elements in "c" to/from the intermediate result, and store the results in "dst". 
+	[round_note]</description>
+	<operation>
+FOR j := 0 to 15
+	i := j*32
+	IF (j is even) 
+		dst[i+31:i] := (a[i+31:i] * b[i+31:i]) - c[i+31:i]
+	ELSE
+		dst[i+31:i] := (a[i+31:i] * b[i+31:i]) + c[i+31:i]
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vfmaddsub132ps' form='zmm {k}, zmm, zmm {er}'/>
+	<instruction name='vfmaddsub213ps' form='zmm {k}, zmm, zmm {er}'/>
+	<instruction name='vfmaddsub231ps' form='zmm {k}, zmm, zmm {er}'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512" name="_mm512_mask3_fmaddsub_ps">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="a" type="__m512"/>
+	<parameter varname="b" type="__m512"/>
+	<parameter varname="c" type="__m512"/>
+	<parameter varname="k" type="__mmask16"/>
+	<description>Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", alternatively add and subtract packed elements in "c" to/from the intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "c" when the corresponding mask bit is not set).  </description>
+	<operation>
+FOR j := 0 to 15
+	i := j*32
+	IF k[j]
+		IF (j is even) 
+			dst[i+31:i] := (a[i+31:i] * b[i+31:i]) - c[i+31:i]
+		ELSE
+			dst[i+31:i] := (a[i+31:i] * b[i+31:i]) + c[i+31:i]
+		FI
+	ELSE
+		dst[i+31:i] := c[i+31:i]
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vfmaddsub132ps' form='zmm {k}, zmm, zmm'/>
+	<instruction name='vfmaddsub213ps' form='zmm {k}, zmm, zmm'/>
+	<instruction name='vfmaddsub231ps' form='zmm {k}, zmm, zmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512" name="_mm512_mask3_fmaddsub_round_ps">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="a" type="__m512"/>
+	<parameter varname="b" type="__m512"/>
+	<parameter varname="c" type="__m512"/>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="rounding" type="const int"/>
+	<description>Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", alternatively add and subtract packed elements in "c" to/from the intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "c" when the corresponding mask bit is not set).  [round_note]</description>
+	<operation>
+FOR j := 0 to 15
+	i := j*32
+	IF k[j]
+		IF (j is even) 
+			dst[i+31:i] := (a[i+31:i] * b[i+31:i]) - c[i+31:i]
+		ELSE
+			dst[i+31:i] := (a[i+31:i] * b[i+31:i]) + c[i+31:i]
+		FI
+	ELSE
+		dst[i+31:i] := c[i+31:i]
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vfmaddsub132ps' form='zmm {k}, zmm, zmm {er}'/>
+	<instruction name='vfmaddsub213ps' form='zmm {k}, zmm, zmm {er}'/>
+	<instruction name='vfmaddsub231ps' form='zmm {k}, zmm, zmm {er}'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512" name="_mm512_mask_fmaddsub_ps">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="a" type="__m512"/>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="b" type="__m512"/>
+	<parameter varname="c" type="__m512"/>
+	<description>Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", alternatively add and subtract packed elements in "c" to/from the intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 15
+	i := j*32
+	IF k[j]
+		IF (j is even) 
+			dst[i+31:i] := (a[i+31:i] * b[i+31:i]) - c[i+31:i]
+		ELSE
+			dst[i+31:i] := (a[i+31:i] * b[i+31:i]) + c[i+31:i]
+		FI
+	ELSE
+		dst[i+31:i] := a[i+31:i]
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vfmaddsub132ps' form='zmm {k}, zmm, zmm'/>
+	<instruction name='vfmaddsub213ps' form='zmm {k}, zmm, zmm'/>
+	<instruction name='vfmaddsub231ps' form='zmm {k}, zmm, zmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512" name="_mm512_mask_fmaddsub_round_ps">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="a" type="__m512"/>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="b" type="__m512"/>
+	<parameter varname="c" type="__m512"/>
+	<parameter varname="rounding" type="const int"/>
+	<description>Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", alternatively add and subtract packed elements in "c" to/from the intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). [round_note]</description>
+	<operation>
+FOR j := 0 to 15
+	i := j*32
+	IF k[j]
+		IF (j is even) 
+			dst[i+31:i] := (a[i+31:i] * b[i+31:i]) - c[i+31:i]
+		ELSE
+			dst[i+31:i] := (a[i+31:i] * b[i+31:i]) + c[i+31:i]
+		FI
+	ELSE
+		dst[i+31:i] := a[i+31:i]
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vfmaddsub132ps' form='zmm {k}, zmm, zmm {er}'/>
+	<instruction name='vfmaddsub213ps' form='zmm {k}, zmm, zmm {er}'/>
+	<instruction name='vfmaddsub231ps' form='zmm {k}, zmm, zmm {er}'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512" name="_mm512_maskz_fmaddsub_ps">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="a" type="__m512"/>
+	<parameter varname="b" type="__m512"/>
+	<parameter varname="c" type="__m512"/>
+	<description>Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", alternatively add and subtract packed elements in "c" to/from the intermediate result, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). 
+	</description>
+	<operation>
+FOR j := 0 to 15
+	i := j*32
+	IF k[j]
+		IF (j is even) 
+			dst[i+31:i] := (a[i+31:i] * b[i+31:i]) - c[i+31:i]
+		ELSE
+			dst[i+31:i] := (a[i+31:i] * b[i+31:i]) + c[i+31:i]
+		FI
+	ELSE
+		dst[i+31:i] := 0
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vfmaddsub132ps' form='zmm {k}, zmm, zmm'/>
+	<instruction name='vfmaddsub213ps' form='zmm {k}, zmm, zmm'/>
+	<instruction name='vfmaddsub231ps' form='zmm {k}, zmm, zmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512" name="_mm512_maskz_fmaddsub_round_ps">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="a" type="__m512"/>
+	<parameter varname="b" type="__m512"/>
+	<parameter varname="c" type="__m512"/>
+	<parameter varname="rounding" type="const int"/>
+	<description>Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", alternatively add and subtract packed elements in "c" to/from the intermediate result, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). 
+	[round_note]</description>
+	<operation>
+FOR j := 0 to 15
+	i := j*32
+	IF k[j]
+		IF (j is even) 
+			dst[i+31:i] := (a[i+31:i] * b[i+31:i]) - c[i+31:i]
+		ELSE
+			dst[i+31:i] := (a[i+31:i] * b[i+31:i]) + c[i+31:i]
+		FI
+	ELSE
+		dst[i+31:i] := 0
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vfmaddsub132ps' form='zmm {k}, zmm, zmm {er}'/>
+	<instruction name='vfmaddsub213ps' form='zmm {k}, zmm, zmm {er}'/>
+	<instruction name='vfmaddsub231ps' form='zmm {k}, zmm, zmm {er}'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" rettype="__m512d" name="_mm512_fmsub_pd">
+	<type>Floating Point</type>
+	<CPUID>AVX512F/KNCNI</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="a" type="__m512d"/>
+	<parameter varname="b" type="__m512d"/>
+	<parameter varname="c" type="__m512d"/>
+	<description>Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the intermediate result, and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*64
+	dst[i+63:i] := (a[i+63:i] * b[i+63:i]) - c[i+63:i]
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vfmsub132pd' form='zmm {k}, zmm, zmm'/>
+	<instruction name='vfmsub213pd' form='zmm {k}, zmm, zmm'/>
+	<instruction name='vfmsub231pd' form='zmm {k}, zmm, zmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" rettype="__m512d" name="_mm512_fmsub_round_pd">
+	<type>Floating Point</type>
+	<CPUID>AVX512F/KNCNI</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="a" type="__m512d"/>
+	<parameter varname="b" type="__m512d"/>
+	<parameter varname="c" type="__m512d"/>
+	<parameter varname="rounding" type="int"/>
+	<description>Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the intermediate result, and store the results in "dst". 
+	[round_note]</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*64
+	dst[i+63:i] := (a[i+63:i] * b[i+63:i]) - c[i+63:i]
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vfmsub132pd' form='zmm {k}, zmm, zmm {er}'/>
+	<instruction name='vfmsub213pd' form='zmm {k}, zmm, zmm {er}'/>
+	<instruction name='vfmsub231pd' form='zmm {k}, zmm, zmm {er}'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" rettype="__m512d" name="_mm512_mask3_fmsub_pd">
+	<type>Floating Point</type>
+	<CPUID>AVX512F/KNCNI</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="a" type="__m512d"/>
+	<parameter varname="b" type="__m512d"/>
+	<parameter varname="c" type="__m512d"/>
+	<parameter varname="k" type="__mmask8"/>
+	<description>Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "c" when the corresponding mask bit is not set).  </description>
+	<operation>
+FOR j := 0 to 7
+	i := j*64
+	IF k[j]
+		dst[i+63:i] := (a[i+63:i] * b[i+63:i]) - c[i+63:i]
+	ELSE
+		dst[i+63:i] := c[i+63:i]
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vfmsub132pd' form='zmm {k}, zmm, zmm'/>
+	<instruction name='vfmsub213pd' form='zmm {k}, zmm, zmm'/>
+	<instruction name='vfmsub231pd' form='zmm {k}, zmm, zmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" rettype="__m512d" name="_mm512_mask3_fmsub_round_pd">
+	<type>Floating Point</type>
+	<CPUID>AVX512F/KNCNI</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="a" type="__m512d"/>
+	<parameter varname="b" type="__m512d"/>
+	<parameter varname="c" type="__m512d"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="rounding" type="int"/>
+	<description>Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "c" when the corresponding mask bit is not set).  [round_note]</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*64
+	IF k[j]
+		dst[i+63:i] := (a[i+63:i] * b[i+63:i]) - c[i+63:i]
+	ELSE
+		dst[i+63:i] := c[i+63:i]
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vfmsub132pd' form='zmm {k}, zmm, zmm {er}'/>
+	<instruction name='vfmsub213pd' form='zmm {k}, zmm, zmm {er}'/>
+	<instruction name='vfmsub231pd' form='zmm {k}, zmm, zmm {er}'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" rettype="__m512d" name="_mm512_mask_fmsub_pd">
+	<type>Floating Point</type>
+	<CPUID>AVX512F/KNCNI</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="a" type="__m512d"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="b" type="__m512d"/>
+	<parameter varname="c" type="__m512d"/>
+	<description>Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 7
+	i := j*64
+	IF k[j]
+		dst[i+63:i] := (a[i+63:i] * b[i+63:i]) - c[i+63:i]
+	ELSE
+		dst[i+63:i] := a[i+63:i]
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vfmsub132pd' form='zmm {k}, zmm, zmm'/>
+	<instruction name='vfmsub213pd' form='zmm {k}, zmm, zmm'/>
+	<instruction name='vfmsub231pd' form='zmm {k}, zmm, zmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" rettype="__m512d" name="_mm512_mask_fmsub_round_pd">
+	<type>Floating Point</type>
+	<CPUID>AVX512F/KNCNI</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="a" type="__m512d"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="b" type="__m512d"/>
+	<parameter varname="c" type="__m512d"/>
+	<parameter varname="rounding" type="int"/>
+	<description>Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). [round_note]</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*64
+	IF k[j]
+		dst[i+63:i] := (a[i+63:i] * b[i+63:i]) - c[i+63:i]
+	ELSE
+		dst[i+63:i] := a[i+63:i]
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vfmsub132pd' form='zmm {k}, zmm, zmm {er}'/>
+	<instruction name='vfmsub213pd' form='zmm {k}, zmm, zmm {er}'/>
+	<instruction name='vfmsub231pd' form='zmm {k}, zmm, zmm {er}'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512d" name="_mm512_maskz_fmsub_pd">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m512d"/>
+	<parameter varname="b" type="__m512d"/>
+	<parameter varname="c" type="__m512d"/>
+	<description>Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the intermediate result, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). 
+	</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*64
+	IF k[j]
+		dst[i+63:i] := (a[i+63:i] * b[i+63:i]) - c[i+63:i]
+	ELSE
+		dst[i+63:i] := 0
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vfmsub132pd' form='zmm {k}, zmm, zmm'/>
+	<instruction name='vfmsub213pd' form='zmm {k}, zmm, zmm'/>
+	<instruction name='vfmsub231pd' form='zmm {k}, zmm, zmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512d" name="_mm512_maskz_fmsub_round_pd">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m512d"/>
+	<parameter varname="b" type="__m512d"/>
+	<parameter varname="c" type="__m512d"/>
+	<parameter varname="rounding" type="const int"/>
+	<description>Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the intermediate result, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). 
+	[round_note]</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*64
+	IF k[j]
+		dst[i+63:i] := (a[i+63:i] * b[i+63:i]) - c[i+63:i]
+	ELSE
+		dst[i+63:i] := 0
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vfmsub132pd' form='zmm {k}, zmm, zmm {er}'/>
+	<instruction name='vfmsub213pd' form='zmm {k}, zmm, zmm {er}'/>
+	<instruction name='vfmsub231pd' form='zmm {k}, zmm, zmm {er}'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" rettype="__m512" name="_mm512_fmsub_ps">
+	<type>Floating Point</type>
+	<CPUID>AVX512F/KNCNI</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="a" type="__m512"/>
+	<parameter varname="b" type="__m512"/>
+	<parameter varname="c" type="__m512"/>
+	<description>Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the intermediate result, and store the results in "dst". 
+	</description>
+	<operation>
+FOR j := 0 to 15
+	i := j*32
+	dst[i+31:i] := (a[i+31:i] * b[i+31:i]) - c[i+31:i]
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vfmsub132ps' form='zmm {k}, zmm, zmm'/>
+	<instruction name='vfmsub213ps' form='zmm {k}, zmm, zmm'/>
+	<instruction name='vfmsub231ps' form='zmm {k}, zmm, zmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" rettype="__m512" name="_mm512_fmsub_round_ps">
+	<type>Floating Point</type>
+	<CPUID>AVX512F/KNCNI</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="a" type="__m512"/>
+	<parameter varname="b" type="__m512"/>
+	<parameter varname="c" type="__m512"/>
+	<parameter varname="rounding" type="int"/>
+	<description>Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the intermediate result, and store the results in "dst". 
+	[round_note]</description>
+	<operation>
+FOR j := 0 to 15
+	i := j*32
+	dst[i+31:i] := (a[i+31:i] * b[i+31:i]) - c[i+31:i]
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vfmsub132ps' form='zmm {k}, zmm, zmm {er}'/>
+	<instruction name='vfmsub213ps' form='zmm {k}, zmm, zmm {er}'/>
+	<instruction name='vfmsub231ps' form='zmm {k}, zmm, zmm {er}'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" rettype="__m512" name="_mm512_mask3_fmsub_ps">
+	<type>Floating Point</type>
+	<CPUID>AVX512F/KNCNI</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="a" type="__m512"/>
+	<parameter varname="b" type="__m512"/>
+	<parameter varname="c" type="__m512"/>
+	<parameter varname="k" type="__mmask16"/>
+	<description>Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "c" when the corresponding mask bit is not set).  </description>
+	<operation>
+FOR j := 0 to 15
+	i := j*32
+	IF k[j]
+		dst[i+31:i] := (a[i+31:i] * b[i+31:i]) - c[i+31:i]
+	ELSE
+		dst[i+31:i] := c[i+31:i]
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vfmsub132ps' form='zmm {k}, zmm, zmm'/>
+	<instruction name='vfmsub213ps' form='zmm {k}, zmm, zmm'/>
+	<instruction name='vfmsub231ps' form='zmm {k}, zmm, zmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" rettype="__m512" name="_mm512_mask3_fmsub_round_ps">
+	<type>Floating Point</type>
+	<CPUID>AVX512F/KNCNI</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="a" type="__m512"/>
+	<parameter varname="b" type="__m512"/>
+	<parameter varname="c" type="__m512"/>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="rounding" type="int"/>
+	<description>Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "c" when the corresponding mask bit is not set).  [round_note]</description>
+	<operation>
+FOR j := 0 to 15
+	i := j*32
+	IF k[j]
+		dst[i+31:i] := (a[i+31:i] * b[i+31:i]) - c[i+31:i]
+	ELSE
+		dst[i+31:i] := c[i+31:i]
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vfmsub132ps' form='zmm {k}, zmm, zmm {er}'/>
+	<instruction name='vfmsub213ps' form='zmm {k}, zmm, zmm {er}'/>
+	<instruction name='vfmsub231ps' form='zmm {k}, zmm, zmm {er}'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" rettype="__m512" name="_mm512_mask_fmsub_ps">
+	<type>Floating Point</type>
+	<CPUID>AVX512F/KNCNI</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="a" type="__m512"/>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="b" type="__m512"/>
+	<parameter varname="c" type="__m512"/>
+	<description>Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 15
+	i := j*32
+	IF k[j]
+		dst[i+31:i] := (a[i+31:i] * b[i+31:i]) - c[i+31:i]
+	ELSE
+		dst[i+31:i] := a[i+31:i]
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vfmsub132ps' form='zmm {k}, zmm, zmm'/>
+	<instruction name='vfmsub213ps' form='zmm {k}, zmm, zmm'/>
+	<instruction name='vfmsub231ps' form='zmm {k}, zmm, zmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" rettype="__m512" name="_mm512_mask_fmsub_round_ps">
+	<type>Floating Point</type>
+	<CPUID>AVX512F/KNCNI</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="a" type="__m512"/>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="b" type="__m512"/>
+	<parameter varname="c" type="__m512"/>
+	<parameter varname="rounding" type="int"/>
+	<description>Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). [round_note]</description>
+	<operation>
+FOR j := 0 to 15
+	i := j*32
+	IF k[j]
+		dst[i+31:i] := (a[i+31:i] * b[i+31:i]) - c[i+31:i]
+	ELSE
+		dst[i+31:i] := a[i+31:i]
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vfmsub132ps' form='zmm {k}, zmm, zmm {er}'/>
+	<instruction name='vfmsub213ps' form='zmm {k}, zmm, zmm {er}'/>
+	<instruction name='vfmsub231ps' form='zmm {k}, zmm, zmm {er}'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512" name="_mm512_maskz_fmsub_ps">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="a" type="__m512"/>
+	<parameter varname="b" type="__m512"/>
+	<parameter varname="c" type="__m512"/>
+	<description>Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the intermediate result, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). 
+	</description>
+	<operation>
+FOR j := 0 to 15
+	i := j*32
+	IF k[j]
+		dst[i+31:i] := (a[i+31:i] * b[i+31:i]) - c[i+31:i]
+	ELSE
+		dst[i+31:i] := 0
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vfmsub132ps' form='zmm {k}, zmm, zmm'/>
+	<instruction name='vfmsub213ps' form='zmm {k}, zmm, zmm'/>
+	<instruction name='vfmsub231ps' form='zmm {k}, zmm, zmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512" name="_mm512_maskz_fmsub_round_ps">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="a" type="__m512"/>
+	<parameter varname="b" type="__m512"/>
+	<parameter varname="c" type="__m512"/>
+	<parameter varname="rounding" type="const int"/>
+	<description>Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the intermediate result, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). 
+	[round_note]</description>
+	<operation>
+FOR j := 0 to 15
+	i := j*32
+	IF k[j]
+		dst[i+31:i] := (a[i+31:i] * b[i+31:i]) - c[i+31:i]
+	ELSE
+		dst[i+31:i] := 0
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vfmsub132ps' form='zmm {k}, zmm, zmm {er}'/>
+	<instruction name='vfmsub213ps' form='zmm {k}, zmm, zmm {er}'/>
+	<instruction name='vfmsub231ps' form='zmm {k}, zmm, zmm {er}'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128d" name="_mm_fmsub_round_sd">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="a" type="__m128d"/>
+	<parameter varname="b" type="__m128d"/>
+	<parameter varname="c" type="__m128d"/>
+	<parameter varname="rounding" type="int"/>
+	<description>Multiply the lower double-precision (64-bit) floating-point elements in "a" and "b", and subtract the lower element in "c" from the intermediate result. Store the result in the lower element of "dst", and copy the upper element from "a" to the upper element of "dst".
+	[round_note]</description>
+	<operation>
+dst[63:0] := (a[63:0] * b[63:0]) - c[63:0]
+dst[127:64] := a[127:64]
+dst[MAX:128] := 0
+	</operation>
+	<instruction name='vfmsub132sd' form='xmm {k}, xmm, xmm {er}'/>
+	<instruction name='vfmsub213sd' form='xmm {k}, xmm, xmm {er}'/>
+	<instruction name='vfmsub231sd' form='xmm {k}, xmm, xmm {er}'/>
+	<header>immintrin.h</header>
+</intrinsic>
+
+<intrinsic tech="AVX-512" rettype="__m128d" name="_mm_mask3_fmsub_round_sd">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="a" type="__m128d"/>
+	<parameter varname="b" type="__m128d"/>
+	<parameter varname="c" type="__m128d"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="rounding" type="int"/>
+	<description>Multiply the lower double-precision (64-bit) floating-point elements in "a" and "b", and subtract the lower element in "c" from the intermediate result. Store the result in the lower element of "dst" using writemask "k" (the element is copied from "c" when mask bit 0 is not set), and copy the upper element from "c" to the upper element of "dst".
+	[round_note]</description>
+	<operation>
+IF k[0]
+	dst[63:0] := (a[63:0] * b[63:0]) - c[63:0]
+ELSE
+	dst[63:0] := c[63:0]
+FI
+dst[127:64] := c[127:64]
+dst[MAX:128] := 0
+	</operation>
+	<instruction name='vfmsub132sd' form='xmm {k}, xmm, xmm {er}'/>
+	<instruction name='vfmsub213sd' form='xmm {k}, xmm, xmm {er}'/>
+	<instruction name='vfmsub231sd' form='xmm {k}, xmm, xmm {er}'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128d" name="_mm_mask3_fmsub_sd">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="a" type="__m128d"/>
+	<parameter varname="b" type="__m128d"/>
+	<parameter varname="c" type="__m128d"/>
+	<parameter varname="k" type="__mmask8"/>
+	<description>Multiply the lower double-precision (64-bit) floating-point elements in "a" and "b", and subtract the lower element in "c" from the intermediate result. Store the result in the lower element of "dst" using writemask "k" (the element is copied from "c" when mask bit 0 is not set), and copy the upper element from "c" to the upper element of "dst".</description>
+	<operation>
+IF k[0]
+	dst[63:0] := (a[63:0] * b[63:0]) - c[63:0]
+ELSE
+	dst[63:0] := c[63:0]
+FI
+dst[127:64] := c[127:64]
+dst[MAX:128] := 0
+	</operation>
+	<instruction name='vfmsub132sd' form='xmm {k}, xmm, xmm'/>
+	<instruction name='vfmsub213sd' form='xmm {k}, xmm, xmm'/>
+	<instruction name='vfmsub231sd' form='xmm {k}, xmm, xmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128d" name="_mm_mask_fmsub_round_sd">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="a" type="__m128d"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="b" type="__m128d"/>
+	<parameter varname="c" type="__m128d"/>
+	<parameter varname="rounding" type="int"/>
+	<description>Multiply the lower double-precision (64-bit) floating-point elements in "a" and "b", and subtract the lower element in "c" from the intermediate result. Store the result in the lower element of "dst" using writemask "k" (the element is copied from "a" when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst".
+	[round_note]</description>
+	<operation>
+IF k[0]
+	dst[63:0] := (a[63:0] * b[63:0]) - c[63:0]
+ELSE
+	dst[63:0] := a[63:0]
+FI
+dst[127:64] := a[127:64]
+dst[MAX:128] := 0
+	</operation>
+	<instruction name='vfmsub132sd' form='xmm {k}, xmm, xmm {er}'/>
+	<instruction name='vfmsub213sd' form='xmm {k}, xmm, xmm {er}'/>
+	<instruction name='vfmsub231sd' form='xmm {k}, xmm, xmm {er}'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128d" name="_mm_mask_fmsub_sd">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="a" type="__m128d"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="b" type="__m128d"/>
+	<parameter varname="c" type="__m128d"/>
+	<description>Multiply the lower double-precision (64-bit) floating-point elements in "a" and "b", and subtract the lower element in "c" from the intermediate result. Store the result in the lower element of "dst" using writemask "k" (the element is copied from "a" when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst".</description>
+	<operation>
+IF k[0]
+	dst[63:0] := (a[63:0] * b[63:0]) - c[63:0]
+ELSE
+	dst[63:0] := a[63:0]
+FI
+dst[127:64] := a[127:64]
+dst[MAX:128] := 0
+	</operation>
+	<instruction name='vfmsub132sd' form='xmm {k}, xmm, xmm'/>
+	<instruction name='vfmsub213sd' form='xmm {k}, xmm, xmm'/>
+	<instruction name='vfmsub231sd' form='xmm {k}, xmm, xmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128d" name="_mm_maskz_fmsub_round_sd">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128d"/>
+	<parameter varname="b" type="__m128d"/>
+	<parameter varname="c" type="__m128d"/>
+	<parameter varname="rounding" type="int"/>
+	<description>Multiply the lower double-precision (64-bit) floating-point elements in "a" and "b", and subtract the lower element in "c" from the intermediate result. Store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst". 
+	[round_note]</description>
+	<operation>
+IF k[0]
+	dst[63:0] := (a[63:0] * b[63:0]) - c[63:0]
+ELSE
+	dst[63:0] := 0
+FI
+dst[127:64] := a[127:64]
+dst[MAX:128] := 0
+	</operation>
+	<instruction name='vfmsub132sd' form='xmm {k}, xmm, xmm {er}'/>
+	<instruction name='vfmsub213sd' form='xmm {k}, xmm, xmm {er}'/>
+	<instruction name='vfmsub231sd' form='xmm {k}, xmm, xmm {er}'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128d" name="_mm_maskz_fmsub_sd">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128d"/>
+	<parameter varname="b" type="__m128d"/>
+	<parameter varname="c" type="__m128d"/>
+	<description>Multiply the lower double-precision (64-bit) floating-point elements in "a" and "b", and subtract the lower element in "c" from the intermediate result. Store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst".</description>
+	<operation>
+IF k[0]
+	dst[63:0] := (a[63:0] * b[63:0]) - c[63:0]
+ELSE
+	dst[63:0] := 0
+FI
+dst[127:64] := a[127:64]
+dst[MAX:128] := 0
+	</operation>
+	<instruction name='vfmsub132sd' form='xmm {k}, xmm, xmm'/>
+	<instruction name='vfmsub213sd' form='xmm {k}, xmm, xmm'/>
+	<instruction name='vfmsub231sd' form='xmm {k}, xmm, xmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128" name="_mm_fmsub_round_ss">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="a" type="__m128"/>
+	<parameter varname="b" type="__m128"/>
+	<parameter varname="c" type="__m128"/>
+	<parameter varname="rounding" type="int"/>
+	<description>Multiply the lower single-precision (32-bit) floating-point elements in "a" and "b", and subtract the lower element in "c" from the intermediate result. Store the result in the lower element of "dst", and copy the upper 3 packed elements from "a" to the upper elements of "dst".
+	[round_note]</description>
+	<operation>
+dst[31:0] := (a[31:0] * b[31:0]) - c[31:0]
+dst[127:32] := a[127:32]
+dst[MAX:128] := 0
+	</operation>
+	<instruction name='vfmsub132ss' form='xmm {k}, xmm, xmm {er}'/>
+	<instruction name='vfmsub213ss' form='xmm {k}, xmm, xmm {er}'/>
+	<instruction name='vfmsub231ss' form='xmm {k}, xmm, xmm {er}'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128" name="_mm_mask3_fmsub_round_ss">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="a" type="__m128"/>
+	<parameter varname="b" type="__m128"/>
+	<parameter varname="c" type="__m128"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="rounding" type="int"/>
+	<description>Multiply the lower single-precision (32-bit) floating-point elements in "a" and "b", and subtract the lower element in "c" from the intermediate result. Store the result in the lower element of "dst" using writemask "k" (the element is copied from "c" when mask bit 0 is not set), and copy the upper 3 packed elements from "c" to the upper elements of "dst". 
+	[round_note]</description>
+	<operation>
+IF k[0]
+	dst[31:0] := (a[31:0] * b[31:0]) - c[31:0]
+ELSE
+	dst[31:0] := c[31:0]
+FI
+dst[127:32] := c[127:32]
+dst[MAX:128] := 0
+	</operation>
+	<instruction name='vfmsub132ss' form='xmm {k}, xmm, xmm {er}'/>
+	<instruction name='vfmsub213ss' form='xmm {k}, xmm, xmm {er}'/>
+	<instruction name='vfmsub231ss' form='xmm {k}, xmm, xmm {er}'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128" name="_mm_mask3_fmsub_ss">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="a" type="__m128"/>
+	<parameter varname="b" type="__m128"/>
+	<parameter varname="c" type="__m128"/>
+	<parameter varname="k" type="__mmask8"/>
+	<description>Multiply the lower single-precision (32-bit) floating-point elements in "a" and "b", and subtract the lower element in "c" from the intermediate result. Store the result in the lower element of "dst" using writemask "k" (the element is copied from "c" when mask bit 0 is not set), and copy the upper 3 packed elements from "c" to the upper elements of "dst".
+	</description>
+	<operation>
+IF k[0]
+	dst[31:0] := (a[31:0] * b[31:0]) - c[31:0]
+ELSE
+	dst[31:0] := c[31:0]
+FI
+dst[127:32] := c[127:32]
+dst[MAX:128] := 0
+	</operation>
+	<instruction name='vfmsub132ss' form='xmm {k}, xmm, xmm'/>
+	<instruction name='vfmsub213ss' form='xmm {k}, xmm, xmm'/>
+	<instruction name='vfmsub231ss' form='xmm {k}, xmm, xmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128" name="_mm_mask_fmsub_round_ss">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="a" type="__m128"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="b" type="__m128"/>
+	<parameter varname="c" type="__m128"/>
+	<parameter varname="rounding" type="int"/>
+	<description>Multiply the lower single-precision (32-bit) floating-point elements in "a" and "b", and subtract the lower element in "c" from the intermediate result. Store the result in the lower element of "dst" using writemask "k" (the element is copied from "a" when mask bit 0 is not set), and copy the upper 3 packed elements from "a" to the upper elements of "dst". 
+	[round_note]</description>
+	<operation>
+IF k[0]
+	dst[31:0] := (a[31:0] * b[31:0]) - c[31:0]
+ELSE
+	dst[31:0] := a[31:0]
+FI
+dst[127:32] := a[127:32]
+dst[MAX:128] := 0
+	</operation>
+	<instruction name='vfmsub132ss' form='xmm {k}, xmm, xmm {er}'/>
+	<instruction name='vfmsub213ss' form='xmm {k}, xmm, xmm {er}'/>
+	<instruction name='vfmsub231ss' form='xmm {k}, xmm, xmm {er}'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128" name="_mm_mask_fmsub_ss">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="a" type="__m128"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="b" type="__m128"/>
+	<parameter varname="c" type="__m128"/>
+	<description>Multiply the lower single-precision (32-bit) floating-point elements in "a" and "b", and subtract the lower element in "c" from the intermediate result. Store the result in the lower element of "dst" using writemask "k" (the element is copied from "a" when mask bit 0 is not set), and copy the upper 3 packed elements from "a" to the upper elements of "dst".</description>
+	<operation>
+IF k[0]
+	dst[31:0] := (a[31:0] * b[31:0]) - c[31:0]
+ELSE
+	dst[31:0] := a[31:0]
+FI
+dst[127:32] := a[127:32]
+dst[MAX:128] := 0
+	</operation>
+	<instruction name='vfmsub132ss' form='xmm {k}, xmm, xmm'/>
+	<instruction name='vfmsub213ss' form='xmm {k}, xmm, xmm'/>
+	<instruction name='vfmsub231ss' form='xmm {k}, xmm, xmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128" name="_mm_maskz_fmsub_round_ss">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128"/>
+	<parameter varname="b" type="__m128"/>
+	<parameter varname="c" type="__m128"/>
+	<parameter varname="rounding" type="int"/>
+	<description>Multiply the lower single-precision (32-bit) floating-point elements in "a" and "b", and subtract the lower element in "c" from the intermediate result. Store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from "a" to the upper elements of "dst". 
+	[round_note]</description>
+	<operation>
+IF k[0]
+	dst[31:0] := (a[31:0] * b[31:0]) - c[31:0]
+ELSE
+	dst[31:0] := 0
+FI
+dst[127:32] := a[127:32]
+dst[MAX:128] := 0
+	</operation>
+	<instruction name='vfmsub132ss' form='xmm {k}, xmm, xmm {er}'/>
+	<instruction name='vfmsub213ss' form='xmm {k}, xmm, xmm {er}'/>
+	<instruction name='vfmsub231ss' form='xmm {k}, xmm, xmm {er}'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128" name="_mm_maskz_fmsub_ss">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128"/>
+	<parameter varname="b" type="__m128"/>
+	<parameter varname="c" type="__m128"/>
+	<description>Multiply the lower single-precision (32-bit) floating-point elements in "a" and "b", and subtract the lower element in "c" from the intermediate result. Store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from "a" to the upper elements of "dst".</description>
+	<operation>
+IF k[0]
+	dst[31:0] := (a[31:0] * b[31:0]) - c[31:0]
+ELSE
+	dst[31:0] := 0
+FI
+dst[127:32] := a[127:32]
+dst[MAX:128] := 0
+	</operation>
+	<instruction name='vfmsub132ss' form='xmm {k}, xmm, xmm'/>
+	<instruction name='vfmsub213ss' form='xmm {k}, xmm, xmm'/>
+	<instruction name='vfmsub231ss' form='xmm {k}, xmm, xmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512d" name="_mm512_fmsubadd_pd">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="a" type="__m512d"/>
+	<parameter varname="b" type="__m512d"/>
+	<parameter varname="c" type="__m512d"/>
+	<description>Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", alternately subtract and add packed elements in "c" from/to the intermediate result, and store the results in "dst". 
+	</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*64
+	IF (j is even) 
+		dst[i+63:i] := (a[i+63:i] * b[i+63:i]) + c[i+63:i]
+	ELSE
+		dst[i+63:i] := (a[i+63:i] * b[i+63:i]) - c[i+63:i]
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vfmsubadd132pd' form='zmm {k}, zmm, zmm'/>
+	<instruction name='vfmsubadd213pd' form='zmm {k}, zmm, zmm'/>
+	<instruction name='vfmsubadd231pd' form='zmm {k}, zmm, zmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512d" name="_mm512_fmsubadd_round_pd">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="a" type="__m512d"/>
+	<parameter varname="b" type="__m512d"/>
+	<parameter varname="c" type="__m512d"/>
+	<parameter varname="rounding" type="const int"/>
+	<description>Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", alternately subtract and add packed elements in "c" from/to the intermediate result, and store the results in "dst". 
+	[round_note]</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*64
+	IF (j is even) 
+		dst[i+63:i] := (a[i+63:i] * b[i+63:i]) + c[i+63:i]
+	ELSE
+		dst[i+63:i] := (a[i+63:i] * b[i+63:i]) - c[i+63:i]
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vfmsubadd132pd' form='zmm {k}, zmm, zmm {er}'/>
+	<instruction name='vfmsubadd213pd' form='zmm {k}, zmm, zmm {er}'/>
+	<instruction name='vfmsubadd231pd' form='zmm {k}, zmm, zmm {er}'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512d" name="_mm512_mask3_fmsubadd_pd">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="a" type="__m512d"/>
+	<parameter varname="b" type="__m512d"/>
+	<parameter varname="c" type="__m512d"/>
+	<parameter varname="k" type="__mmask8"/>
+	<description>Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", alternately subtract and add packed elements in "c" from/to the intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "c" when the corresponding mask bit is not set).  </description>
+	<operation>
+FOR j := 0 to 7
+	i := j*64
+	IF k[j]
+		IF (j is even) 
+			dst[i+63:i] := (a[i+63:i] * b[i+63:i]) + c[i+63:i]
+		ELSE
+			dst[i+63:i] := (a[i+63:i] * b[i+63:i]) - c[i+63:i]
+		FI
+	ELSE
+		dst[i+63:i] := c[i+63:i]
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vfmsubadd132pd' form='zmm {k}, zmm, zmm'/>
+	<instruction name='vfmsubadd213pd' form='zmm {k}, zmm, zmm'/>
+	<instruction name='vfmsubadd231pd' form='zmm {k}, zmm, zmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512d" name="_mm512_mask3_fmsubadd_round_pd">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="a" type="__m512d"/>
+	<parameter varname="b" type="__m512d"/>
+	<parameter varname="c" type="__m512d"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="rounding" type="const int"/>
+	<description>Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", alternately subtract and add packed elements in "c" from/to the intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "c" when the corresponding mask bit is not set).  [round_note]</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*64
+	IF k[j]
+		IF (j is even) 
+			dst[i+63:i] := (a[i+63:i] * b[i+63:i]) + c[i+63:i]
+		ELSE
+			dst[i+63:i] := (a[i+63:i] * b[i+63:i]) - c[i+63:i]
+		FI
+	ELSE
+		dst[i+63:i] := c[i+63:i]
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vfmsubadd132pd' form='zmm {k}, zmm, zmm {er}'/>
+	<instruction name='vfmsubadd213pd' form='zmm {k}, zmm, zmm {er}'/>
+	<instruction name='vfmsubadd231pd' form='zmm {k}, zmm, zmm {er}'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512d" name="_mm512_mask_fmsubadd_pd">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="a" type="__m512d"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="b" type="__m512d"/>
+	<parameter varname="c" type="__m512d"/>
+	<description>Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", alternately subtract and add packed elements in "c" from/to the intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 7
+	i := j*64
+	IF k[j]
+		IF (j is even) 
+			dst[i+63:i] := (a[i+63:i] * b[i+63:i]) + c[i+63:i]
+		ELSE
+			dst[i+63:i] := (a[i+63:i] * b[i+63:i]) - c[i+63:i]
+		FI
+	ELSE
+		dst[i+63:i] := a[i+63:i]
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vfmsubadd132pd' form='zmm {k}, zmm, zmm'/>
+	<instruction name='vfmsubadd213pd' form='zmm {k}, zmm, zmm'/>
+	<instruction name='vfmsubadd231pd' form='zmm {k}, zmm, zmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512d" name="_mm512_mask_fmsubadd_round_pd">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="a" type="__m512d"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="b" type="__m512d"/>
+	<parameter varname="c" type="__m512d"/>
+	<parameter varname="rounding" type="const int"/>
+	<description>Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", alternately subtract and add packed elements in "c" from/to the intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). [round_note]</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*64
+	IF k[j]
+		IF (j is even) 
+			dst[i+63:i] := (a[i+63:i] * b[i+63:i]) + c[i+63:i]
+		ELSE
+			dst[i+63:i] := (a[i+63:i] * b[i+63:i]) - c[i+63:i]
+		FI
+	ELSE
+		dst[i+63:i] := a[i+63:i]
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vfmsubadd132pd' form='zmm {k}, zmm, zmm {er}'/>
+	<instruction name='vfmsubadd213pd' form='zmm {k}, zmm, zmm {er}'/>
+	<instruction name='vfmsubadd231pd' form='zmm {k}, zmm, zmm {er}'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512d" name="_mm512_maskz_fmsubadd_pd">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m512d"/>
+	<parameter varname="b" type="__m512d"/>
+	<parameter varname="c" type="__m512d"/>
+	<description>Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", alternately subtract and add packed elements in "c" from/to the intermediate result, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). 
+	</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*64
+	IF k[j]
+		IF (j is even) 
+			dst[i+63:i] := (a[i+63:i] * b[i+63:i]) + c[i+63:i]
+		ELSE
+			dst[i+63:i] := (a[i+63:i] * b[i+63:i]) - c[i+63:i]
+		FI
+	ELSE
+		dst[i+63:i] := 0
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vfmsubadd132pd' form='zmm {k}, zmm, zmm'/>
+	<instruction name='vfmsubadd213pd' form='zmm {k}, zmm, zmm'/>
+	<instruction name='vfmsubadd231pd' form='zmm {k}, zmm, zmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512d" name="_mm512_maskz_fmsubadd_round_pd">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m512d"/>
+	<parameter varname="b" type="__m512d"/>
+	<parameter varname="c" type="__m512d"/>
+	<parameter varname="rounding" type="const int"/>
+	<description>Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", alternatively subtract and add packed elements in "c" from/to the intermediate result, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). 
+	[round_note]</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*64
+	IF k[j]
+		IF (j is even) 
+			dst[i+63:i] := (a[i+63:i] * b[i+63:i]) + c[i+63:i]
+		ELSE
+			dst[i+63:i] := (a[i+63:i] * b[i+63:i]) - c[i+63:i]
+		FI
+	ELSE
+		dst[i+63:i] := 0
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vfmsubadd132pd' form='zmm {k}, zmm, zmm {er}'/>
+	<instruction name='vfmsubadd213pd' form='zmm {k}, zmm, zmm {er}'/>
+	<instruction name='vfmsubadd231pd' form='zmm {k}, zmm, zmm {er}'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512" name="_mm512_fmsubadd_ps">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="a" type="__m512"/>
+	<parameter varname="b" type="__m512"/>
+	<parameter varname="c" type="__m512"/>
+	<description>Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", alternatively subtract and add packed elements in "c" from/to the intermediate result, and store the results in "dst". 
+	</description>
+	<operation>
+FOR j := 0 to 15
+	i := j*32
+	IF (j is even) 
+		dst[i+31:i] := (a[i+31:i] * b[i+31:i]) + c[i+31:i]
+	ELSE
+		dst[i+31:i] := (a[i+31:i] * b[i+31:i]) - c[i+31:i]
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vfmsubadd132ps' form='zmm {k}, zmm, zmm'/>
+	<instruction name='vfmsubadd213ps' form='zmm {k}, zmm, zmm'/>
+	<instruction name='vfmsubadd231ps' form='zmm {k}, zmm, zmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512" name="_mm512_fmsubadd_round_ps">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="a" type="__m512"/>
+	<parameter varname="b" type="__m512"/>
+	<parameter varname="c" type="__m512"/>
+	<parameter varname="rounding" type="const int"/>
+	<description>Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", alternatively subtract and add packed elements in "c" from/to the intermediate result, and store the results in "dst". 
+	[round_note]</description>
+	<operation>
+FOR j := 0 to 15
+	i := j*32
+	IF (j is even) 
+		dst[i+31:i] := (a[i+31:i] * b[i+31:i]) + c[i+31:i]
+	ELSE
+		dst[i+31:i] := (a[i+31:i] * b[i+31:i]) - c[i+31:i]
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vfmsubadd132ps' form='zmm {k}, zmm, zmm {er}'/>
+	<instruction name='vfmsubadd213ps' form='zmm {k}, zmm, zmm {er}'/>
+	<instruction name='vfmsubadd231ps' form='zmm {k}, zmm, zmm {er}'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512" name="_mm512_mask3_fmsubadd_ps">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="a" type="__m512"/>
+	<parameter varname="b" type="__m512"/>
+	<parameter varname="c" type="__m512"/>
+	<parameter varname="k" type="__mmask16"/>
+	<description>Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", alternatively subtract and add packed elements in "c" from/to the intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "c" when the corresponding mask bit is not set).  </description>
+	<operation>
+FOR j := 0 to 15
+	i := j*32
+	IF k[j]
+		IF (j is even) 
+			dst[i+31:i] := (a[i+31:i] * b[i+31:i]) + c[i+31:i]
+		ELSE
+			dst[i+31:i] := (a[i+31:i] * b[i+31:i]) - c[i+31:i]
+		FI
+	ELSE
+		dst[i+31:i] := c[i+31:i]
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vfmsubadd132ps' form='zmm {k}, zmm, zmm'/>
+	<instruction name='vfmsubadd213ps' form='zmm {k}, zmm, zmm'/>
+	<instruction name='vfmsubadd231ps' form='zmm {k}, zmm, zmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512" name="_mm512_mask3_fmsubadd_round_ps">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="a" type="__m512"/>
+	<parameter varname="b" type="__m512"/>
+	<parameter varname="c" type="__m512"/>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="rounding" type="const int"/>
+	<description>Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", alternatively subtract and add packed elements in "c" from/to the intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "c" when the corresponding mask bit is not set).  [round_note]</description>
+	<operation>
+FOR j := 0 to 15
+	i := j*32
+	IF k[j]
+		IF (j is even) 
+			dst[i+31:i] := (a[i+31:i] * b[i+31:i]) + c[i+31:i]
+		ELSE
+			dst[i+31:i] := (a[i+31:i] * b[i+31:i]) - c[i+31:i]
+		FI
+	ELSE
+		dst[i+31:i] := c[i+31:i]
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vfmsubadd132ps' form='zmm {k}, zmm, zmm {er}'/>
+	<instruction name='vfmsubadd213ps' form='zmm {k}, zmm, zmm {er}'/>
+	<instruction name='vfmsubadd231ps' form='zmm {k}, zmm, zmm {er}'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512" name="_mm512_mask_fmsubadd_ps">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="a" type="__m512"/>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="b" type="__m512"/>
+	<parameter varname="c" type="__m512"/>
+	<description>Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", alternatively subtract and add packed elements in "c" from/to the intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 15
+	i := j*32
+	IF k[j]
+		IF (j is even) 
+			dst[i+31:i] := (a[i+31:i] * b[i+31:i]) + c[i+31:i]
+		ELSE
+			dst[i+31:i] := (a[i+31:i] * b[i+31:i]) - c[i+31:i]
+		FI
+	ELSE
+		dst[i+31:i] := a[i+31:i]
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vfmsubadd132ps' form='zmm {k}, zmm, zmm'/>
+	<instruction name='vfmsubadd213ps' form='zmm {k}, zmm, zmm'/>
+	<instruction name='vfmsubadd231ps' form='zmm {k}, zmm, zmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512" name="_mm512_mask_fmsubadd_round_ps">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="a" type="__m512"/>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="b" type="__m512"/>
+	<parameter varname="c" type="__m512"/>
+	<parameter varname="rounding" type="const int"/>
+	<description>Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", alternatively subtract and add packed elements in "c" from/to the intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set).
+	[round_note]</description>
+	<operation>
+FOR j := 0 to 15
+	i := j*32
+	IF k[j]
+		IF (j is even) 
+			dst[i+31:i] := (a[i+31:i] * b[i+31:i]) + c[i+31:i]
+		ELSE
+			dst[i+31:i] := (a[i+31:i] * b[i+31:i]) - c[i+31:i]
+		FI
+	ELSE
+		dst[i+31:i] := a[i+31:i]
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vfmsubadd132ps' form='zmm {k}, zmm, zmm {er}'/>
+	<instruction name='vfmsubadd213ps' form='zmm {k}, zmm, zmm {er}'/>
+	<instruction name='vfmsubadd231ps' form='zmm {k}, zmm, zmm {er}'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512" name="_mm512_maskz_fmsubadd_ps">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="a" type="__m512"/>
+	<parameter varname="b" type="__m512"/>
+	<parameter varname="c" type="__m512"/>
+	<description>Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", alternatively subtract and add packed elements in "c" from/to the intermediate result, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). 
+	</description>
+	<operation>
+FOR j := 0 to 15
+	i := j*32
+	IF k[j]
+		IF (j is even) 
+			dst[i+31:i] := (a[i+31:i] * b[i+31:i]) + c[i+31:i]
+		ELSE
+			dst[i+31:i] := (a[i+31:i] * b[i+31:i]) - c[i+31:i]
+		FI
+	ELSE
+		dst[i+31:i] := 0
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vfmsubadd132ps' form='zmm {k}, zmm, zmm'/>
+	<instruction name='vfmsubadd213ps' form='zmm {k}, zmm, zmm'/>
+	<instruction name='vfmsubadd231ps' form='zmm {k}, zmm, zmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512" name="_mm512_maskz_fmsubadd_round_ps">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="a" type="__m512"/>
+	<parameter varname="b" type="__m512"/>
+	<parameter varname="c" type="__m512"/>
+	<parameter varname="rounding" type="const int"/>
+	<description>Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", alternatively subtract and add packed elements in "c" from/to the intermediate result, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). 
+	[round_note]</description>
+	<operation>
+FOR j := 0 to 15
+	i := j*32
+	IF k[j]
+		IF (j is even) 
+			dst[i+31:i] := (a[i+31:i] * b[i+31:i]) + c[i+31:i]
+		ELSE
+			dst[i+31:i] := (a[i+31:i] * b[i+31:i]) - c[i+31:i]
+		FI
+	ELSE
+		dst[i+31:i] := 0
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vfmsubadd132ps' form='zmm {k}, zmm, zmm {er}'/>
+	<instruction name='vfmsubadd213ps' form='zmm {k}, zmm, zmm {er}'/>
+	<instruction name='vfmsubadd231ps' form='zmm {k}, zmm, zmm {er}'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" rettype="__m512d" name="_mm512_fnmadd_pd">
+	<type>Floating Point</type>
+	<CPUID>AVX512F/KNCNI</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="a" type="__m512d"/>
+	<parameter varname="b" type="__m512d"/>
+	<parameter varname="c" type="__m512d"/>
+	<description>Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", add the negated intermediate result to packed elements in "c", and store the results in "dst". 
+	</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*64
+	dst[i+63:i] := -(a[i+63:i] * b[i+63:i]) + c[i+63:i]
+ENDFOR	
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vfnmadd132pd' form='zmm {k}, zmm, zmm'/>
+	<instruction name='vfnmadd213pd' form='zmm {k}, zmm, zmm'/>
+	<instruction name='vfnmadd231pd' form='zmm {k}, zmm, zmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" rettype="__m512d" name="_mm512_fnmadd_round_pd">
+	<type>Floating Point</type>
+	<CPUID>AVX512F/KNCNI</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="a" type="__m512d"/>
+	<parameter varname="b" type="__m512d"/>
+	<parameter varname="c" type="__m512d"/>
+	<parameter varname="rounding" type="int"/>
+	<description>Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", add the negated intermediate result to packed elements in "c", and store the results in "dst".
+	 [round_note]</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*64
+	dst[i+63:i] := -(a[i+63:i] * b[i+63:i]) + c[i+63:i]
+ENDFOR	
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vfnmadd132pd' form='zmm {k}, zmm, zmm {er}'/>
+	<instruction name='vfnmadd213pd' form='zmm {k}, zmm, zmm {er}'/>
+	<instruction name='vfnmadd231pd' form='zmm {k}, zmm, zmm {er}'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" rettype="__m512d" name="_mm512_mask3_fnmadd_pd">
+	<type>Floating Point</type>
+	<CPUID>AVX512F/KNCNI</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="a" type="__m512d"/>
+	<parameter varname="b" type="__m512d"/>
+	<parameter varname="c" type="__m512d"/>
+	<parameter varname="k" type="__mmask8"/>
+	<description>Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", add the negated intermediate result to packed elements in "c", and store the results in "dst" using writemask "k" (elements are copied from "c" when the corresponding mask bit is not set).  </description>
+	<operation>
+FOR j := 0 to 7
+	i := j*64
+	IF k[j]
+		dst[i+63:i] := -(a[i+63:i] * b[i+63:i]) + c[i+63:i]
+	ELSE
+		dst[i+63:i] := c[i+63:i]
+	FI
+ENDFOR	
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vfnmadd132pd' form='zmm {k}, zmm, zmm'/>
+	<instruction name='vfnmadd213pd' form='zmm {k}, zmm, zmm'/>
+	<instruction name='vfnmadd231pd' form='zmm {k}, zmm, zmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" rettype="__m512d" name="_mm512_mask3_fnmadd_round_pd">
+	<type>Floating Point</type>
+	<CPUID>AVX512F/KNCNI</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="a" type="__m512d"/>
+	<parameter varname="b" type="__m512d"/>
+	<parameter varname="c" type="__m512d"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="rounding" type="int"/>
+	<description>Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", add the negated intermediate result to packed elements in "c", and store the results in "dst" using writemask "k" (elements are copied from "c" when the corresponding mask bit is not set).  [round_note]</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*64
+	IF k[j]
+		dst[i+63:i] := -(a[i+63:i] * b[i+63:i]) + c[i+63:i]
+	ELSE
+		dst[i+63:i] := c[i+63:i]
+	FI
+ENDFOR	
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vfnmadd132pd' form='zmm {k}, zmm, zmm {er}'/>
+	<instruction name='vfnmadd213pd' form='zmm {k}, zmm, zmm {er}'/>
+	<instruction name='vfnmadd231pd' form='zmm {k}, zmm, zmm {er}'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" rettype="__m512d" name="_mm512_mask_fnmadd_pd">
+	<type>Floating Point</type>
+	<CPUID>AVX512F/KNCNI</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="a" type="__m512d"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="b" type="__m512d"/>
+	<parameter varname="c" type="__m512d"/>
+	<description>Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", add the negated intermediate result to packed elements in "c", and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set).  </description>
+	<operation>
+FOR j := 0 to 7
+	i := j*64
+	IF k[j]
+		dst[i+63:i] := -(a[i+63:i] * b[i+63:i]) + c[i+63:i]
+	ELSE
+		dst[i+63:i] := a[i+63:i]
+	FI
+ENDFOR	
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vfnmadd132pd' form='zmm {k}, zmm, zmm'/>
+	<instruction name='vfnmadd213pd' form='zmm {k}, zmm, zmm'/>
+	<instruction name='vfnmadd231pd' form='zmm {k}, zmm, zmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" rettype="__m512d" name="_mm512_mask_fnmadd_round_pd">
+	<type>Floating Point</type>
+	<CPUID>AVX512F/KNCNI</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="a" type="__m512d"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="b" type="__m512d"/>
+	<parameter varname="c" type="__m512d"/>
+	<parameter varname="rounding" type="int"/>
+	<description>Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", add the negated intermediate result to packed elements in "c", and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). [round_note]</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*64
+	IF k[j]
+		dst[i+63:i] := -(a[i+63:i] * b[i+63:i]) + c[i+63:i]
+	ELSE
+		dst[i+63:i] := a[i+63:i]
+	FI
+ENDFOR	
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vfnmadd132pd' form='zmm {k}, zmm, zmm {er}'/>
+	<instruction name='vfnmadd213pd' form='zmm {k}, zmm, zmm {er}'/>
+	<instruction name='vfnmadd231pd' form='zmm {k}, zmm, zmm {er}'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512d" name="_mm512_maskz_fnmadd_pd">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m512d"/>
+	<parameter varname="b" type="__m512d"/>
+	<parameter varname="c" type="__m512d"/>
+	<description>Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", add the negated intermediate result to packed elements in "c", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 7
+	i := j*64
+	IF k[j]
+		dst[i+63:i] := -(a[i+63:i] * b[i+63:i]) + c[i+63:i]
+	ELSE
+		dst[i+63:i] := 0
+	FI
+ENDFOR	
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vfnmadd132pd' form='zmm {k}, zmm, zmm'/>
+	<instruction name='vfnmadd213pd' form='zmm {k}, zmm, zmm'/>
+	<instruction name='vfnmadd231pd' form='zmm {k}, zmm, zmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512d" name="_mm512_maskz_fnmadd_round_pd">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m512d"/>
+	<parameter varname="b" type="__m512d"/>
+	<parameter varname="c" type="__m512d"/>
+	<parameter varname="rounding" type="const int"/>
+	<description>Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", add the negated intermediate result to packed elements in "c", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). [round_note]</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*64
+	IF k[j]
+		dst[i+63:i] := -(a[i+63:i] * b[i+63:i]) + c[i+63:i]
+	ELSE
+		dst[i+63:i] := 0
+	FI
+ENDFOR	
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vfnmadd132pd' form='zmm {k}, zmm, zmm {er}'/>
+	<instruction name='vfnmadd213pd' form='zmm {k}, zmm, zmm {er}'/>
+	<instruction name='vfnmadd231pd' form='zmm {k}, zmm, zmm {er}'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" rettype="__m512" name="_mm512_fnmadd_ps">
+	<type>Floating Point</type>
+	<CPUID>AVX512F/KNCNI</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="a" type="__m512"/>
+	<parameter varname="b" type="__m512"/>
+	<parameter varname="c" type="__m512"/>
+	<description>Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", add the negated intermediate result to packed elements in "c", and store the results in "dst". 
+	</description>
+	<operation>
+FOR j := 0 to 15
+	i := j*32
+	dst[i+31:i] := -(a[i+31:i] * b[i+31:i]) + c[i+31:i]
+ENDFOR	
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vfnmadd132ps' form='zmm {k}, zmm, zmm'/>
+	<instruction name='vfnmadd213ps' form='zmm {k}, zmm, zmm'/>
+	<instruction name='vfnmadd231ps' form='zmm {k}, zmm, zmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" rettype="__m512" name="_mm512_fnmadd_round_ps">
+	<type>Floating Point</type>
+	<CPUID>AVX512F/KNCNI</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="a" type="__m512"/>
+	<parameter varname="b" type="__m512"/>
+	<parameter varname="c" type="__m512"/>
+	<parameter varname="rounding" type="int"/>
+	<description>Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", add the negated intermediate result to packed elements in "c", and store the results in "dst".  
+	[round_note]</description>
+	<operation>
+FOR j := 0 to 15
+	i := j*32
+	dst[i+31:i] := -(a[i+31:i] * b[i+31:i]) + c[i+31:i]
+ENDFOR	
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vfnmadd132ps' form='zmm {k}, zmm, zmm {er}'/>
+	<instruction name='vfnmadd213ps' form='zmm {k}, zmm, zmm {er}'/>
+	<instruction name='vfnmadd231ps' form='zmm {k}, zmm, zmm {er}'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" rettype="__m512" name="_mm512_mask3_fnmadd_ps">
+	<type>Floating Point</type>
+	<CPUID>AVX512F/KNCNI</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="a" type="__m512"/>
+	<parameter varname="b" type="__m512"/>
+	<parameter varname="c" type="__m512"/>
+	<parameter varname="k" type="__mmask16"/>
+	<description>Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", add the negated intermediate result to packed elements in "c", and store the results in "dst" using writemask "k" (elements are copied from "c" when the corresponding mask bit is not set).  </description>
+	<operation>
+FOR j := 0 to 15
+	i := j*32
+	IF k[j]
+		dst[i+31:i] := -(a[i+31:i] * b[i+31:i]) + c[i+31:i]
+	ELSE
+		dst[i+31:i] := c[i+31:i]
+	FI
+ENDFOR	
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vfnmadd132ps' form='zmm {k}, zmm, zmm'/>
+	<instruction name='vfnmadd213ps' form='zmm {k}, zmm, zmm'/>
+	<instruction name='vfnmadd231ps' form='zmm {k}, zmm, zmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" rettype="__m512" name="_mm512_mask3_fnmadd_round_ps">
+	<type>Floating Point</type>
+	<CPUID>AVX512F/KNCNI</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="a" type="__m512"/>
+	<parameter varname="b" type="__m512"/>
+	<parameter varname="c" type="__m512"/>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="rounding" type="int"/>
+	<description>Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", add the negated intermediate result to packed elements in "c", and store the results in "dst" using writemask "k" (elements are copied from "c" when the corresponding mask bit is not set).  [round_note]</description>
+	<operation>
+FOR j := 0 to 15
+	i := j*32
+	IF k[j]
+		dst[i+31:i] := -(a[i+31:i] * b[i+31:i]) + c[i+31:i]
+	ELSE
+		dst[i+31:i] := c[i+31:i]
+	FI
+ENDFOR	
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vfnmadd132ps' form='zmm {k}, zmm, zmm {er}'/>
+	<instruction name='vfnmadd213ps' form='zmm {k}, zmm, zmm {er}'/>
+	<instruction name='vfnmadd231ps' form='zmm {k}, zmm, zmm {er}'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" rettype="__m512" name="_mm512_mask_fnmadd_ps">
+	<type>Floating Point</type>
+	<CPUID>AVX512F/KNCNI</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="a" type="__m512"/>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="b" type="__m512"/>
+	<parameter varname="c" type="__m512"/>
+	<description>Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", add the negated intermediate result to packed elements in "c", and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set).  </description>
+	<operation>
+FOR j := 0 to 15
+	i := j*32
+	IF k[j]
+		dst[i+31:i] := -(a[i+31:i] * b[i+31:i]) + c[i+31:i]
+	ELSE
+		dst[i+31:i] := a[i+31:i]
+	FI
+ENDFOR	
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vfnmadd132ps' form='zmm {k}, zmm, zmm'/>
+	<instruction name='vfnmadd213ps' form='zmm {k}, zmm, zmm'/>
+	<instruction name='vfnmadd231ps' form='zmm {k}, zmm, zmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" rettype="__m512" name="_mm512_mask_fnmadd_round_ps">
+	<type>Floating Point</type>
+	<CPUID>AVX512F/KNCNI</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="a" type="__m512"/>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="b" type="__m512"/>
+	<parameter varname="c" type="__m512"/>
+	<parameter varname="rounding" type="int"/>
+	<description>Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", add the negated intermediate result to packed elements in "c", and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). [round_note]</description>
+	<operation>
+FOR j := 0 to 15
+	i := j*32
+	IF k[j]
+		dst[i+31:i] := -(a[i+31:i] * b[i+31:i]) + c[i+31:i]
+	ELSE
+		dst[i+31:i] := a[i+31:i]
+	FI
+ENDFOR	
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vfnmadd132ps' form='zmm {k}, zmm, zmm {er}'/>
+	<instruction name='vfnmadd213ps' form='zmm {k}, zmm, zmm {er}'/>
+	<instruction name='vfnmadd231ps' form='zmm {k}, zmm, zmm {er}'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512" name="_mm512_maskz_fnmadd_ps">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="a" type="__m512"/>
+	<parameter varname="b" type="__m512"/>
+	<parameter varname="c" type="__m512"/>
+	<description>Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", add the negated intermediate result to packed elements in "c", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 15
+	i := j*32
+	IF k[j]
+		dst[i+31:i] := -(a[i+31:i] * b[i+31:i]) + c[i+31:i]
+	ELSE
+		dst[i+31:i] := 0
+	FI
+ENDFOR	
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vfnmadd132ps' form='zmm {k}, zmm, zmm'/>
+	<instruction name='vfnmadd213ps' form='zmm {k}, zmm, zmm'/>
+	<instruction name='vfnmadd231ps' form='zmm {k}, zmm, zmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512" name="_mm512_maskz_fnmadd_round_ps">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="a" type="__m512"/>
+	<parameter varname="b" type="__m512"/>
+	<parameter varname="c" type="__m512"/>
+	<parameter varname="rounding" type="const int"/>
+	<description>Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", add the negated intermediate result to packed elements in "c", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). [round_note]</description>
+	<operation>
+FOR j := 0 to 15
+	i := j*32
+	IF k[j]
+		dst[i+31:i] := -(a[i+31:i] * b[i+31:i]) + c[i+31:i]
+	ELSE
+		dst[i+31:i] := 0
+	FI
+ENDFOR	
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vfnmadd132ps' form='zmm {k}, zmm, zmm {er}'/>
+	<instruction name='vfnmadd213ps' form='zmm {k}, zmm, zmm {er}'/>
+	<instruction name='vfnmadd231ps' form='zmm {k}, zmm, zmm {er}'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128d" name="_mm_fnmadd_round_sd">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="a" type="__m128d"/>
+	<parameter varname="b" type="__m128d"/>
+	<parameter varname="c" type="__m128d"/>
+	<parameter varname="rounding" type="int"/>
+	<description>Multiply the lower double-precision (64-bit) floating-point elements in "a" and "b", and add the negated intermediate result to the lower element in "c". Store the result in the lower element of "dst", and copy the upper element from "a" to the upper element of "dst".
+	[round_note]</description>
+	<operation>
+dst[63:0] := -(a[63:0] * b[63:0]) + c[63:0]
+dst[127:64] := a[127:64]
+dst[MAX:128] := 0
+	</operation>
+	<instruction name='vfnmadd132sd' form='xmm {k}, xmm, xmm {er}'/>
+	<instruction name='vfnmadd213sd' form='xmm {k}, xmm, xmm {er}'/>
+	<instruction name='vfnmadd231sd' form='xmm {k}, xmm, xmm {er}'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128d" name="_mm_mask3_fnmadd_round_sd">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="a" type="__m128d"/>
+	<parameter varname="b" type="__m128d"/>
+	<parameter varname="c" type="__m128d"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="rounding" type="int"/>
+	<description>Multiply the lower double-precision (64-bit) floating-point elements in "a" and "b", and add the negated intermediate result to the lower element in "c". Store the result in the lower element of "dst" using writemask "k" (the element is copied from "c" when mask bit 0 is not set), and copy the upper element from "c" to the upper element of "dst".
+	[round_note]</description>
+	<operation>
+IF k[0]
+	dst[63:0] := -(a[63:0] * b[63:0]) + c[63:0]
+ELSE
+	dst[63:0] := c[63:0]
+FI
+dst[127:64] := c[127:64]
+dst[MAX:128] := 0
+	</operation>
+	<instruction name='vfnmadd132sd' form='xmm {k}, xmm, xmm {er}'/>
+	<instruction name='vfnmadd213sd' form='xmm {k}, xmm, xmm {er}'/>
+	<instruction name='vfnmadd231sd' form='xmm {k}, xmm, xmm {er}'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128d" name="_mm_mask3_fnmadd_sd">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="a" type="__m128d"/>
+	<parameter varname="b" type="__m128d"/>
+	<parameter varname="c" type="__m128d"/>
+	<parameter varname="k" type="__mmask8"/>
+	<description>Multiply the lower double-precision (64-bit) floating-point elements in "a" and "b", and add the negated intermediate result to the lower element in "c". Store the result in the lower element of "dst" using writemask "k" (the element is copied from "c" when mask bit 0 is not set), and copy the upper element from "c" to the upper element of "dst".</description>
+	<operation>
+IF k[0]
+	dst[63:0] := -(a[63:0] * b[63:0]) + c[63:0]
+ELSE
+	dst[63:0] := c[63:0]
+FI
+dst[127:64] := c[127:64]
+dst[MAX:128] := 0
+	</operation>
+	<instruction name='vfnmadd132sd' form='xmm {k}, xmm, xmm'/>
+	<instruction name='vfnmadd213sd' form='xmm {k}, xmm, xmm'/>
+	<instruction name='vfnmadd231sd' form='xmm {k}, xmm, xmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128d" name="_mm_mask_fnmadd_round_sd">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="a" type="__m128d"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="b" type="__m128d"/>
+	<parameter varname="c" type="__m128d"/>
+	<parameter varname="rounding" type="int"/>
+	<description>Multiply the lower double-precision (64-bit) floating-point elements in "a" and "b", and add the negated intermediate result to the lower element in "c". Store the result in the lower element of "dst" using writemask "k" (the element is copied from "a" when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst".
+	[round_note]</description>
+	<operation>
+IF k[0]
+	dst[63:0] := -(a[63:0] * b[63:0]) + c[63:0]
+ELSE
+	dst[63:0] := a[63:0]
+FI
+dst[127:64] := a[127:64]
+dst[MAX:128] := 0
+	</operation>
+	<instruction name='vfnmadd132sd' form='xmm {k}, xmm, xmm {er}'/>
+	<instruction name='vfnmadd213sd' form='xmm {k}, xmm, xmm {er}'/>
+	<instruction name='vfnmadd231sd' form='xmm {k}, xmm, xmm {er}'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128d" name="_mm_mask_fnmadd_sd">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="a" type="__m128d"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="b" type="__m128d"/>
+	<parameter varname="c" type="__m128d"/>
+	<description>Multiply the lower double-precision (64-bit) floating-point elements in "a" and "b", and add the negated intermediate result to the lower element in "c". Store the result in the lower element of "dst" using writemask "k" (the element is copied from "a" when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst".</description>
+	<operation>
+IF k[0]
+	dst[63:0] := -(a[63:0] * b[63:0]) + c[63:0]
+ELSE
+	dst[63:0] := a[63:0]
+FI
+dst[127:64] := a[127:64]
+dst[MAX:128] := 0
+	</operation>
+	<instruction name='vfnmadd132sd' form='xmm {k}, xmm, xmm'/>
+	<instruction name='vfnmadd213sd' form='xmm {k}, xmm, xmm'/>
+	<instruction name='vfnmadd231sd' form='xmm {k}, xmm, xmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128d" name="_mm_maskz_fnmadd_round_sd">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128d"/>
+	<parameter varname="b" type="__m128d"/>
+	<parameter varname="c" type="__m128d"/>
+	<parameter varname="rounding" type="int"/>
+	<description>Multiply the lower double-precision (64-bit) floating-point elements in "a" and "b", and add the negated intermediate result to the lower element in "c". Store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst".
+	[round_note]</description>
+	<operation>
+IF k[0]
+	dst[63:0] := -(a[63:0] * b[63:0]) + c[63:0]
+ELSE
+	dst[63:0] := 0
+FI
+dst[127:64] := a[127:64]
+dst[MAX:128] := 0
+	</operation>
+	<instruction name='vfnmadd132sd' form='xmm {k}, xmm, xmm {er}'/>
+	<instruction name='vfnmadd213sd' form='xmm {k}, xmm, xmm {er}'/>
+	<instruction name='vfnmadd231sd' form='xmm {k}, xmm, xmm {er}'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128d" name="_mm_maskz_fnmadd_sd">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128d"/>
+	<parameter varname="b" type="__m128d"/>
+	<parameter varname="c" type="__m128d"/>
+	<description>Multiply the lower double-precision (64-bit) floating-point elements in "a" and "b", and add the negated intermediate result to the lower element in "c". Store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst".</description>
+	<operation>
+IF k[0]
+	dst[63:0] := -(a[63:0] * b[63:0]) + c[63:0]
+ELSE
+	dst[63:0] := 0
+FI
+dst[127:64] := a[127:64]
+dst[MAX:128] := 0
+	</operation>
+	<instruction name='vfnmadd213sd' form='xmm {k}, xmm, xmm'/>
+	<instruction name='vfnmadd231sd' form='xmm {k}, xmm, xmm'/>
+	<instruction name='vfnmadd132sd' form='xmm {k}, xmm, xmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128" name="_mm_fnmadd_round_ss">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="a" type="__m128"/>
+	<parameter varname="b" type="__m128"/>
+	<parameter varname="c" type="__m128"/>
+	<parameter varname="rounding" type="int"/>
+	<description>Multiply the lower single-precision (32-bit) floating-point elements in "a" and "b", and add the negated intermediate result to the lower element in "c". Store the result in the lower element of "dst", and copy the upper 3 packed elements from "a" to the upper elements of "dst".
+	[round_note]</description>
+	<operation>
+dst[31:0] := -(a[31:0] * b[31:0]) + c[31:0]
+dst[127:32] := a[127:32]
+dst[MAX:128] := 0
+	</operation>
+	<instruction name='vfnmadd132ss' form='xmm {k}, xmm, xmm {er}'/>
+	<instruction name='vfnmadd213ss' form='xmm {k}, xmm, xmm {er}'/>
+	<instruction name='vfnmadd231ss' form='xmm {k}, xmm, xmm {er}'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128" name="_mm_mask3_fnmadd_round_ss">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="a" type="__m128"/>
+	<parameter varname="b" type="__m128"/>
+	<parameter varname="c" type="__m128"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="rounding" type="int"/>
+	<description>Multiply the lower single-precision (32-bit) floating-point elements in "a" and "b", and add the negated intermediate result to the lower element in "c". Store the result in the lower element of "dst" using writemask "k" (the element is copied from "c" when mask bit 0 is not set), and copy the upper 3 packed elements from "a" to the upper elements of "dst".
+	[round_note]</description>
+	<operation>
+IF k[0]
+	dst[31:0] := -(a[31:0] * b[31:0]) + c[31:0]
+ELSE
+	dst[31:0] := c[31:0]
+FI
+dst[127:32] := a[127:32]
+dst[MAX:128] := 0
+	</operation>
+	<instruction name='vfnmadd132ss' form='xmm {k}, xmm, xmm {er}'/>
+	<instruction name='vfnmadd213ss' form='xmm {k}, xmm, xmm {er}'/>
+	<instruction name='vfnmadd231ss' form='xmm {k}, xmm, xmm {er}'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128" name="_mm_mask3_fnmadd_ss">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="a" type="__m128"/>
+	<parameter varname="b" type="__m128"/>
+	<parameter varname="c" type="__m128"/>
+	<parameter varname="k" type="__mmask8"/>
+	<description>Multiply the lower single-precision (32-bit) floating-point elements in "a" and "b", and add the negated intermediate result to the lower element in "c". Store the result in the lower element of "dst" using writemask "k" (the element is copied from "c" when mask bit 0 is not set), and copy the upper 3 packed elements from "a" to the upper elements of "dst".</description>
+	<operation>
+IF k[0]
+	dst[31:0] := -(a[31:0] * b[31:0]) + c[31:0]
+ELSE
+	dst[31:0] := c[31:0]
+FI
+dst[127:32] := a[127:32]
+dst[MAX:128] := 0
+	</operation>
+	<instruction name='vfnmadd132ss' form='xmm {k}, xmm, xmm'/>
+	<instruction name='vfnmadd213ss' form='xmm {k}, xmm, xmm'/>
+	<instruction name='vfnmadd231ss' form='xmm {k}, xmm, xmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128" name="_mm_mask_fnmadd_round_ss">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="a" type="__m128"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="b" type="__m128"/>
+	<parameter varname="c" type="__m128"/>
+	<parameter varname="rounding" type="int"/>
+	<description>Multiply the lower single-precision (32-bit) floating-point elements in "a" and "b", and add the negated intermediate result to the lower element in "c". Store the result in the lower element of "dst" using writemask "k" (the element is copied from "a" when mask bit 0 is not set), and copy the upper 3 packed elements from "a" to the upper elements of "dst". 
+	[round_note]</description>
+	<operation>
+IF k[0]
+	dst[31:0] := -(a[31:0] * b[31:0]) + c[31:0]
+ELSE
+	dst[31:0] := a[31:0]
+FI
+dst[127:32] := a[127:32]
+dst[MAX:128] := 0
+	</operation>
+	<instruction name='vfnmadd132ss' form='xmm {k}, xmm, xmm {er}'/>
+	<instruction name='vfnmadd213ss' form='xmm {k}, xmm, xmm {er}'/>
+	<instruction name='vfnmadd231ss' form='xmm {k}, xmm, xmm {er}'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128" name="_mm_mask_fnmadd_ss">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="a" type="__m128"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="b" type="__m128"/>
+	<parameter varname="c" type="__m128"/>
+	<description>Multiply the lower single-precision (32-bit) floating-point elements in "a" and "b", and add the negated intermediate result to the lower element in "c". Store the result in the lower element of "dst" using writemask "k" (the element is copied from "a" when mask bit 0 is not set), and copy the upper 3 packed elements from "a" to the upper elements of "dst".</description>
+	<operation>
+IF k[0]
+	dst[31:0] := -(a[31:0] * b[31:0]) + c[31:0]
+ELSE
+	dst[31:0] := a[31:0]
+FI
+dst[127:32] := a[127:32]
+dst[MAX:128] := 0
+	</operation>
+	<instruction name='vfnmadd132ss' form='xmm {k}, xmm, xmm'/>
+	<instruction name='vfnmadd213ss' form='xmm {k}, xmm, xmm'/>
+	<instruction name='vfnmadd231ss' form='xmm {k}, xmm, xmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128" name="_mm_maskz_fnmadd_round_ss">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128"/>
+	<parameter varname="b" type="__m128"/>
+	<parameter varname="c" type="__m128"/>
+	<parameter varname="rounding" type="int"/>
+	<description>Multiply the lower single-precision (32-bit) floating-point elements in "a" and "b", and add the negated intermediate result to the lower element in "c". Store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from "a" to the upper elements of "dst". 
+	[round_note]</description>
+	<operation>
+IF k[0]
+	dst[31:0] := -(a[31:0] * b[31:0]) + c[31:0]
+ELSE
+	dst[31:0] := 0
+FI
+dst[127:32] := a[127:32]
+dst[MAX:128] := 0
+	</operation>
+	<instruction name='vfnmadd132ss' form='xmm {k}, xmm, xmm {er}'/>
+	<instruction name='vfnmadd213ss' form='xmm {k}, xmm, xmm {er}'/>
+	<instruction name='vfnmadd231ss' form='xmm {k}, xmm, xmm {er}'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128" name="_mm_maskz_fnmadd_ss">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128"/>
+	<parameter varname="b" type="__m128"/>
+	<parameter varname="c" type="__m128"/>
+	<description>Multiply the lower single-precision (32-bit) floating-point elements in "a" and "b", and add the negated intermediate result to the lower element in "c". Store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from "a" to the upper elements of "dst".</description>
+	<operation>
+IF k[0]
+	dst[31:0] := -(a[31:0] * b[31:0]) + c[31:0]
+ELSE
+	dst[31:0] := 0
+FI
+dst[127:32] := a[127:32]
+dst[MAX:128] := 0
+	</operation>
+	<instruction name='vfnmadd132ss' form='xmm {k}, xmm, xmm'/>
+	<instruction name='vfnmadd213ss' form='xmm {k}, xmm, xmm'/>
+	<instruction name='vfnmadd231ss' form='xmm {k}, xmm, xmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" rettype="__m512d" name="_mm512_fnmsub_pd">
+	<type>Floating Point</type>
+	<CPUID>AVX512F/KNCNI</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="a" type="__m512d"/>
+	<parameter varname="b" type="__m512d"/>
+	<parameter varname="c" type="__m512d"/>
+	<description>Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the negated intermediate result, and store the results in "dst". </description>
+	<operation>
+FOR j := 0 to 7
+	i := j*64
+	dst[i+63:i] := -(a[i+63:i] * b[i+63:i]) - c[i+63:i]
+ENDFOR	
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vfnmsub132pd' form='zmm {k}, zmm, zmm'/>
+	<instruction name='vfnmsub213pd' form='zmm {k}, zmm, zmm'/>
+	<instruction name='vfnmsub231pd' form='zmm {k}, zmm, zmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" rettype="__m512d" name="_mm512_fnmsub_round_pd">
+	<type>Floating Point</type>
+	<CPUID>AVX512F/KNCNI</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="a" type="__m512d"/>
+	<parameter varname="b" type="__m512d"/>
+	<parameter varname="c" type="__m512d"/>
+	<parameter varname="rounding" type="int"/>
+	<description>Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the negated intermediate result, and store the results in "dst".  
+	[round_note]</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*64
+	dst[i+63:i] := -(a[i+63:i] * b[i+63:i]) - c[i+63:i]
+ENDFOR	
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vfnmsub132pd' form='zmm {k}, zmm, zmm {er}'/>
+	<instruction name='vfnmsub213pd' form='zmm {k}, zmm, zmm {er}'/>
+	<instruction name='vfnmsub231pd' form='zmm {k}, zmm, zmm {er}'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" rettype="__m512d" name="_mm512_mask3_fnmsub_pd">
+	<type>Floating Point</type>
+	<CPUID>AVX512F/KNCNI</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="a" type="__m512d"/>
+	<parameter varname="b" type="__m512d"/>
+	<parameter varname="c" type="__m512d"/>
+	<parameter varname="k" type="__mmask8"/>
+	<description>Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the negated intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "c" when the corresponding mask bit is not set).  </description>
+	<operation>
+FOR j := 0 to 7
+	i := j*64
+	IF k[j]
+		dst[i+63:i] := -(a[i+63:i] * b[i+63:i]) - c[i+63:i]
+	ELSE
+		dst[i+63:i] := c[i+63:i]
+	FI
+ENDFOR	
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vfnmsub132pd' form='zmm {k}, zmm, zmm'/>
+	<instruction name='vfnmsub213pd' form='zmm {k}, zmm, zmm'/>
+	<instruction name='vfnmsub231pd' form='zmm {k}, zmm, zmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" rettype="__m512d" name="_mm512_mask3_fnmsub_round_pd">
+	<type>Floating Point</type>
+	<CPUID>AVX512F/KNCNI</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="a" type="__m512d"/>
+	<parameter varname="b" type="__m512d"/>
+	<parameter varname="c" type="__m512d"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="rounding" type="int"/>
+	<description>Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the negated intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "c" when the corresponding mask bit is not set). [round_note]</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*64
+	IF k[j]
+		dst[i+63:i] := -(a[i+63:i] * b[i+63:i]) - c[i+63:i]
+	ELSE
+		dst[i+63:i] := c[i+63:i]
+	FI
+ENDFOR	
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vfnmsub132pd' form='zmm {k}, zmm, zmm {er}'/>
+	<instruction name='vfnmsub213pd' form='zmm {k}, zmm, zmm {er}'/>
+	<instruction name='vfnmsub231pd' form='zmm {k}, zmm, zmm {er}'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" rettype="__m512d" name="_mm512_mask_fnmsub_pd">
+	<type>Floating Point</type>
+	<CPUID>AVX512F/KNCNI</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="a" type="__m512d"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="b" type="__m512d"/>
+	<parameter varname="c" type="__m512d"/>
+	<description>Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the negated intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set).  </description>
+	<operation>
+FOR j := 0 to 7
+	i := j*64
+	IF k[j]
+		dst[i+63:i] := -(a[i+63:i] * b[i+63:i]) - c[i+63:i]
+	ELSE
+		dst[i+63:i] := a[i+63:i]
+	FI
+ENDFOR	
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vfnmsub132pd' form='zmm {k}, zmm, zmm'/>
+	<instruction name='vfnmsub213pd' form='zmm {k}, zmm, zmm'/>
+	<instruction name='vfnmsub231pd' form='zmm {k}, zmm, zmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" rettype="__m512d" name="_mm512_mask_fnmsub_round_pd">
+	<type>Floating Point</type>
+	<CPUID>AVX512F/KNCNI</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="a" type="__m512d"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="b" type="__m512d"/>
+	<parameter varname="c" type="__m512d"/>
+	<parameter varname="rounding" type="int"/>
+	<description>Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the negated intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set).
+	[round_note]</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*64
+	IF k[j]
+		dst[i+63:i] := -(a[i+63:i] * b[i+63:i]) - c[i+63:i]
+	ELSE
+		dst[i+63:i] := a[i+63:i]
+	FI
+ENDFOR	
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vfnmsub132pd' form='zmm {k}, zmm, zmm {er}'/>
+	<instruction name='vfnmsub213pd' form='zmm {k}, zmm, zmm {er}'/>
+	<instruction name='vfnmsub231pd' form='zmm {k}, zmm, zmm {er}'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512d" name="_mm512_maskz_fnmsub_pd">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m512d"/>
+	<parameter varname="b" type="__m512d"/>
+	<parameter varname="c" type="__m512d"/>
+	<description>Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the negated intermediate result, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 7
+	i := j*64
+	IF k[j]
+		dst[i+63:i] := -(a[i+63:i] * b[i+63:i]) - c[i+63:i]
+	ELSE
+		dst[i+63:i] := 0
+	FI
+ENDFOR	
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vfnmsub132pd' form='zmm {k}, zmm, zmm'/>
+	<instruction name='vfnmsub213pd' form='zmm {k}, zmm, zmm'/>
+	<instruction name='vfnmsub231pd' form='zmm {k}, zmm, zmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512d" name="_mm512_maskz_fnmsub_round_pd">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m512d"/>
+	<parameter varname="b" type="__m512d"/>
+	<parameter varname="c" type="__m512d"/>
+	<parameter varname="rounding" type="const int"/>
+	<description>Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the negated intermediate result, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). [round_note]</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*64
+	IF k[j]
+		dst[i+63:i] := -(a[i+63:i] * b[i+63:i]) - c[i+63:i]
+	ELSE
+		dst[i+63:i] := 0
+	FI
+ENDFOR	
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vfnmsub132pd' form='zmm {k}, zmm, zmm {er}'/>
+	<instruction name='vfnmsub213pd' form='zmm {k}, zmm, zmm {er}'/>
+	<instruction name='vfnmsub231pd' form='zmm {k}, zmm, zmm {er}'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" rettype="__m512" name="_mm512_fnmsub_ps">
+	<type>Floating Point</type>
+	<CPUID>AVX512F/KNCNI</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="a" type="__m512"/>
+	<parameter varname="b" type="__m512"/>
+	<parameter varname="c" type="__m512"/>
+	<description>Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the negated intermediate result, and store the results in "dst". </description>
+	<operation>
+FOR j := 0 to 15
+	i := j*32
+	dst[i+31:i] := -(a[i+31:i] * b[i+31:i]) - c[i+31:i]
+ENDFOR	
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vfnmsub132ps' form='zmm {k}, zmm, zmm'/>
+	<instruction name='vfnmsub213ps' form='zmm {k}, zmm, zmm'/>
+	<instruction name='vfnmsub231ps' form='zmm {k}, zmm, zmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" rettype="__m512" name="_mm512_fnmsub_round_ps">
+	<type>Floating Point</type>
+	<CPUID>AVX512F/KNCNI</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="a" type="__m512"/>
+	<parameter varname="b" type="__m512"/>
+	<parameter varname="c" type="__m512"/>
+	<parameter varname="rounding" type="int"/>
+	<description>Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the negated intermediate result, and store the results in "dst". 
+	[round_note]</description>
+	<operation>
+FOR j := 0 to 15
+	i := j*32
+	dst[i+31:i] := -(a[i+31:i] * b[i+31:i]) - c[i+31:i]
+ENDFOR	
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vfnmsub132ps' form='zmm {k}, zmm, zmm {er}'/>
+	<instruction name='vfnmsub213ps' form='zmm {k}, zmm, zmm {er}'/>
+	<instruction name='vfnmsub231ps' form='zmm {k}, zmm, zmm {er}'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" rettype="__m512" name="_mm512_mask3_fnmsub_ps">
+	<type>Floating Point</type>
+	<CPUID>AVX512F/KNCNI</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="a" type="__m512"/>
+	<parameter varname="b" type="__m512"/>
+	<parameter varname="c" type="__m512"/>
+	<parameter varname="k" type="__mmask16"/>
+	<description>Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the negated intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "c" when the corresponding mask bit is not set).  </description>
+	<operation>
+FOR j := 0 to 15
+	i := j*32
+	IF k[j]
+		dst[i+31:i] := -(a[i+31:i] * b[i+31:i]) - c[i+31:i]
+	ELSE
+		dst[i+31:i] := c[i+31:i]
+	FI
+ENDFOR	
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vfnmsub132ps' form='zmm {k}, zmm, zmm'/>
+	<instruction name='vfnmsub213ps' form='zmm {k}, zmm, zmm'/>
+	<instruction name='vfnmsub231ps' form='zmm {k}, zmm, zmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" rettype="__m512" name="_mm512_mask3_fnmsub_round_ps">
+	<type>Floating Point</type>
+	<CPUID>AVX512F/KNCNI</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="a" type="__m512"/>
+	<parameter varname="b" type="__m512"/>
+	<parameter varname="c" type="__m512"/>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="rounding" type="int"/>
+	<description>Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the negated intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "c" when the corresponding mask bit is not set).  [round_note]</description>
+	<operation>
+FOR j := 0 to 15
+	i := j*32
+	IF k[j]
+		dst[i+31:i] := -(a[i+31:i] * b[i+31:i]) - c[i+31:i]
+	ELSE
+		dst[i+31:i] := c[i+31:i]
+	FI
+ENDFOR	
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vfnmsub132ps' form='zmm {k}, zmm, zmm {er}'/>
+	<instruction name='vfnmsub213ps' form='zmm {k}, zmm, zmm {er}'/>
+	<instruction name='vfnmsub231ps' form='zmm {k}, zmm, zmm {er}'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" rettype="__m512" name="_mm512_mask_fnmsub_ps">
+	<type>Floating Point</type>
+	<CPUID>AVX512F/KNCNI</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="a" type="__m512"/>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="b" type="__m512"/>
+	<parameter varname="c" type="__m512"/>
+	<description>Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the negated intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set).  </description>
+	<operation>
+FOR j := 0 to 15
+	i := j*32
+	IF k[j]
+		dst[i+31:i] := -(a[i+31:i] * b[i+31:i]) - c[i+31:i]
+	ELSE
+		dst[i+31:i] := a[i+31:i]
+	FI
+ENDFOR	
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vfnmsub132ps' form='zmm {k}, zmm, zmm'/>
+	<instruction name='vfnmsub213ps' form='zmm {k}, zmm, zmm'/>
+	<instruction name='vfnmsub231ps' form='zmm {k}, zmm, zmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" rettype="__m512" name="_mm512_mask_fnmsub_round_ps">
+	<type>Floating Point</type>
+	<CPUID>AVX512F/KNCNI</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="a" type="__m512"/>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="b" type="__m512"/>
+	<parameter varname="c" type="__m512"/>
+	<parameter varname="rounding" type="int"/>
+	<description>Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the negated intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). 
+	[round_note]
+	</description>
+	<operation>
+FOR j := 0 to 15
+	i := j*32
+	IF k[j]
+		dst[i+31:i] := -(a[i+31:i] * b[i+31:i]) - c[i+31:i]
+	ELSE
+		dst[i+31:i] := a[i+31:i]
+	FI
+ENDFOR	
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vfnmsub132ps' form='zmm {k}, zmm, zmm {er}'/>
+	<instruction name='vfnmsub213ps' form='zmm {k}, zmm, zmm {er}'/>
+	<instruction name='vfnmsub231ps' form='zmm {k}, zmm, zmm {er}'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512" name="_mm512_maskz_fnmsub_ps">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="a" type="__m512"/>
+	<parameter varname="b" type="__m512"/>
+	<parameter varname="c" type="__m512"/>
+	<description>Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the negated intermediate result, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 15
+	i := j*32
+	IF k[j]
+		dst[i+31:i] := -(a[i+31:i] * b[i+31:i]) - c[i+31:i]
+	ELSE
+		dst[i+31:i] := 0
+	FI
+ENDFOR	
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vfnmsub132ps' form='zmm {k}, zmm, zmm'/>
+	<instruction name='vfnmsub213ps' form='zmm {k}, zmm, zmm'/>
+	<instruction name='vfnmsub231ps' form='zmm {k}, zmm, zmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512" name="_mm512_maskz_fnmsub_round_ps">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="a" type="__m512"/>
+	<parameter varname="b" type="__m512"/>
+	<parameter varname="c" type="__m512"/>
+	<parameter varname="rounding" type="const int"/>
+	<description>Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the negated intermediate result, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). 
+	[round_note]
+	</description>
+	<operation>
+FOR j := 0 to 15
+	i := j*32
+	IF k[j]
+		dst[i+31:i] := -(a[i+31:i] * b[i+31:i]) - c[i+31:i]
+	ELSE
+		dst[i+31:i] := 0
+	FI
+ENDFOR	
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vfnmsub132ps' form='zmm {k}, zmm, zmm {er}'/>
+	<instruction name='vfnmsub213ps' form='zmm {k}, zmm, zmm {er}'/>
+	<instruction name='vfnmsub231ps' form='zmm {k}, zmm, zmm {er}'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128d" name="_mm_fnmsub_round_sd">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="a" type="__m128d"/>
+	<parameter varname="b" type="__m128d"/>
+	<parameter varname="c" type="__m128d"/>
+	<parameter varname="rounding" type="int"/>
+	<description>Multiply the lower double-precision (64-bit) floating-point elements in "a" and "b", and subtract the lower element in "c" from the negated intermediate result. Store the result in the lower element of "dst", and copy the upper element from "a" to the upper element of "dst".
+	[round_note]</description>
+	<operation>
+dst[63:0] := -(a[63:0] * b[63:0]) - c[63:0]
+dst[127:64] := a[127:64]
+dst[MAX:128] := 0
+	</operation>
+	<instruction name='vfnmsub132sd' form='xmm {k}, xmm, xmm {er}'/>
+	<instruction name='vfnmsub213sd' form='xmm {k}, xmm, xmm {er}'/>
+	<instruction name='vfnmsub231sd' form='xmm {k}, xmm, xmm {er}'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128d" name="_mm_mask3_fnmsub_round_sd">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="a" type="__m128d"/>
+	<parameter varname="b" type="__m128d"/>
+	<parameter varname="c" type="__m128d"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="rounding" type="int"/>
+	<description>Multiply the lower double-precision (64-bit) floating-point elements in "a" and "b", and subtract the lower element in "c" from the negated intermediate result. Store the result in the lower element of "dst" using writemask "k" (the element is copied from "c" when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst".
+	[round_note]</description>
+	<operation>
+IF k[0]
+	dst[63:0] := -(a[63:0] * b[63:0]) - c[63:0]
+ELSE
+	dst[63:0] := c[63:0]
+FI
+dst[127:64] := a[127:64]
+dst[MAX:128] := 0
+	</operation>
+	<instruction name='vfnmsub132sd' form='xmm {k}, xmm, xmm {er}'/>
+	<instruction name='vfnmsub213sd' form='xmm {k}, xmm, xmm {er}'/>
+	<instruction name='vfnmsub231sd' form='xmm {k}, xmm, xmm {er}'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128d" name="_mm_mask3_fnmsub_sd">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="a" type="__m128d"/>
+	<parameter varname="b" type="__m128d"/>
+	<parameter varname="c" type="__m128d"/>
+	<parameter varname="k" type="__mmask8"/>
+	<description>Multiply the lower double-precision (64-bit) floating-point elements in "a" and "b", and subtract the lower element in "c" from the negated intermediate result. Store the result in the lower element of "dst" using writemask "k" (the element is copied from "c" when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst".</description>
+	<operation>
+IF k[0]
+	dst[63:0] := -(a[63:0] * b[63:0]) - c[63:0]
+ELSE
+	dst[63:0] := c[63:0]
+FI
+dst[127:64] := a[127:64]
+dst[MAX:128] := 0
+	</operation>
+	<instruction name='vfnmsub132sd' form='xmm {k}, xmm, xmm'/>
+	<instruction name='vfnmsub213sd' form='xmm {k}, xmm, xmm'/>
+	<instruction name='vfnmsub231sd' form='xmm {k}, xmm, xmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128d" name="_mm_mask_fnmsub_round_sd">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="a" type="__m128d"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="b" type="__m128d"/>
+	<parameter varname="c" type="__m128d"/>
+	<parameter varname="rounding" type="int"/>
+	<description>Multiply the lower double-precision (64-bit) floating-point elements in "a" and "b", and subtract the lower element in "c" from the negated intermediate result. Store the result in the lower element of "dst" using writemask "k" (the element is copied from "a" when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst".
+	[round_note]</description>
+	<operation>
+IF k[0]
+	dst[63:0] := -(a[63:0] * b[63:0]) - c[63:0]
+ELSE
+	dst[63:0] := a[63:0]
+FI
+dst[127:64] := a[127:64]
+dst[MAX:128] := 0
+	</operation>
+	<instruction name='vfnmsub132sd' form='xmm {k}, xmm, xmm {er}'/>
+	<instruction name='vfnmsub213sd' form='xmm {k}, xmm, xmm {er}'/>
+	<instruction name='vfnmsub231sd' form='xmm {k}, xmm, xmm {er}'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128d" name="_mm_mask_fnmsub_sd">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="a" type="__m128d"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="b" type="__m128d"/>
+	<parameter varname="c" type="__m128d"/>
+	<description>Multiply the lower double-precision (64-bit) floating-point elements in "a" and "b", and subtract the lower element in "c" from the negated intermediate result. Store the result in the lower element of "dst" using writemask "k" (the element is copied from "a" when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst".</description>
+	<operation>
+IF k[0]
+	dst[63:0] := -(a[63:0] * b[63:0]) - c[63:0]
+ELSE
+	dst[63:0] := a[63:0]
+FI
+dst[127:64] := a[127:64]
+dst[MAX:128] := 0
+	</operation>
+	<instruction name='vfnmsub132sd' form='xmm {k}, xmm, xmm'/>
+	<instruction name='vfnmsub213sd' form='xmm {k}, xmm, xmm'/>
+	<instruction name='vfnmsub231sd' form='xmm {k}, xmm, xmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128d" name="_mm_maskz_fnmsub_round_sd">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128d"/>
+	<parameter varname="b" type="__m128d"/>
+	<parameter varname="c" type="__m128d"/>
+	<parameter varname="rounding" type="int"/>
+	<description>Multiply the lower double-precision (64-bit) floating-point elements in "a" and "b", and subtract the lower element in "c" from the negated intermediate result. Store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst".
+	[round_note]</description>
+	<operation>
+IF k[0]
+	dst[63:0] := -(a[63:0] * b[63:0]) - c[63:0]
+ELSE
+	dst[63:0] := 0
+FI
+dst[127:64] := a[127:64]
+dst[MAX:128] := 0
+	</operation>
+	<instruction name='vfnmsub132sd' form='xmm {k}, xmm, xmm {er}'/>
+	<instruction name='vfnmsub213sd' form='xmm {k}, xmm, xmm {er}'/>
+	<instruction name='vfnmsub231sd' form='xmm {k}, xmm, xmm {er}'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128d" name="_mm_maskz_fnmsub_sd">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128d"/>
+	<parameter varname="b" type="__m128d"/>
+	<parameter varname="c" type="__m128d"/>
+	<description>Multiply the lower double-precision (64-bit) floating-point elements in "a" and "b", and subtract the lower element in "c" from the negated intermediate result. Store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst".</description>
+	<operation>
+IF k[0]
+	dst[63:0] := -(a[63:0] * b[63:0]) - c[63:0]
+ELSE
+	dst[63:0] := 0
+FI
+dst[127:64] := a[127:64]
+dst[MAX:128] := 0
+	</operation>
+	<instruction name='vfnmsub132sd' form='xmm {k}, xmm, xmm'/>
+	<instruction name='vfnmsub213sd' form='xmm {k}, xmm, xmm'/>
+	<instruction name='vfnmsub231sd' form='xmm {k}, xmm, xmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128" name="_mm_fnmsub_round_ss">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="a" type="__m128"/>
+	<parameter varname="b" type="__m128"/>
+	<parameter varname="c" type="__m128"/>
+	<parameter varname="rounding" type="int"/>
+	<description>Multiply the lower single-precision (32-bit) floating-point elements in "a" and "b", subtract the lower element in "c" from the negated intermediate result, store the result in the lower element of "dst", and copy the upper 3 packed elements from "a" to the upper elements of "dst".
+	[round_note]</description>
+	<operation>
+dst[31:0] := -(a[31:0] * b[31:0]) - c[31:0]
+dst[127:32] := a[127:32]
+dst[MAX:128] := 0
+	</operation>
+	<instruction name='vfnmsub132ss' form='xmm {k}, xmm, xmm {er}'/>
+	<instruction name='vfnmsub213ss' form='xmm {k}, xmm, xmm {er}'/>
+	<instruction name='vfnmsub231ss' form='xmm {k}, xmm, xmm {er}'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128" name="_mm_mask3_fnmsub_round_ss">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="a" type="__m128"/>
+	<parameter varname="b" type="__m128"/>
+	<parameter varname="c" type="__m128"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="rounding" type="int"/>
+	<description>Multiply the lower single-precision (32-bit) floating-point elements in "a" and "b", and subtract the lower element in "c" from the negated intermediate result. Store the result in the lower element of "dst" using writemask "k" (the element is copied from "c" when mask bit 0 is not set), and copy the upper 3 packed elements from "a" to the upper elements of "dst".
+	[round_note]</description>
+	<operation>
+IF k[0]
+	dst[31:0] := -(a[31:0] * b[31:0]) - c[31:0]
+ELSE
+	dst[31:0] := c[31:0]
+FI
+dst[127:32] := a[127:32]
+dst[MAX:128] := 0
+	</operation>
+	<instruction name='vfnmsub132ss' form='xmm {k}, xmm, xmm {er}'/>
+	<instruction name='vfnmsub213ss' form='xmm {k}, xmm, xmm {er}'/>
+	<instruction name='vfnmsub231ss' form='xmm {k}, xmm, xmm {er}'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128" name="_mm_mask3_fnmsub_ss">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="a" type="__m128"/>
+	<parameter varname="b" type="__m128"/>
+	<parameter varname="c" type="__m128"/>
+	<parameter varname="k" type="__mmask8"/>
+	<description>Multiply the lower single-precision (32-bit) floating-point elements in "a" and "b", and subtract the lower element in "c" from the negated intermediate result. Store the result in the lower element of "dst" using writemask "k" (the element is copied from "c" when mask bit 0 is not set), and copy the upper 3 packed elements from "a" to the upper elements of "dst".</description>
+	<operation>
+IF k[0]
+	dst[31:0] := -(a[31:0] * b[31:0]) - c[31:0]
+ELSE
+	dst[31:0] := c[31:0]
+FI
+dst[127:32] := a[127:32]
+dst[MAX:128] := 0
+	</operation>
+	<instruction name='vfnmsub132ss' form='xmm {k}, xmm, xmm'/>
+	<instruction name='vfnmsub213ss' form='xmm {k}, xmm, xmm'/>
+	<instruction name='vfnmsub231ss' form='xmm {k}, xmm, xmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128" name="_mm_mask_fnmsub_round_ss">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="a" type="__m128"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="b" type="__m128"/>
+	<parameter varname="c" type="__m128"/>
+	<parameter varname="rounding" type="int"/>
+	<description>Multiply the lower single-precision (32-bit) floating-point elements in "a" and "b", and subtract the lower element in "c" from the negated intermediate result. Store the result in the lower element of "dst" using writemask "k" (the element is copied from "a" when mask bit 0 is not set), and copy the upper 3 packed elements from "a" to the upper elements of "dst".
+	[round_note]</description>
+	<operation>
+IF k[0]
+	dst[31:0] := -(a[31:0] * b[31:0]) - c[31:0]
+ELSE
+	dst[31:0] := a[31:0]
+FI
+dst[127:32] := a[127:32]
+dst[MAX:128] := 0
+	</operation>
+	<instruction name='vfnmsub132ss' form='xmm {k}, xmm, xmm {er}'/>
+	<instruction name='vfnmsub213ss' form='xmm {k}, xmm, xmm {er}'/>
+	<instruction name='vfnmsub231ss' form='xmm {k}, xmm, xmm {er}'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128" name="_mm_mask_fnmsub_ss">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="a" type="__m128"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="b" type="__m128"/>
+	<parameter varname="c" type="__m128"/>
+	<description>Multiply the lower single-precision (32-bit) floating-point elements in "a" and "b", and subtract the lower element in "c" from the negated intermediate result. Store the result in the lower element of "dst" using writemask "k" (the element is copied from "a" when mask bit 0 is not set), and copy the upper 3 packed elements from "a" to the upper elements of "dst".</description>
+	<operation>
+IF k[0]
+	dst[31:0] := -(a[31:0] * b[31:0]) - c[31:0]
+ELSE
+	dst[31:0] := a[31:0]
+FI
+dst[127:32] := a[127:32]
+dst[MAX:128] := 0
+	</operation>
+	<instruction name='vfnmsub132ss' form='xmm {k}, xmm, xmm'/>
+	<instruction name='vfnmsub213ss' form='xmm {k}, xmm, xmm'/>
+	<instruction name='vfnmsub231ss' form='xmm {k}, xmm, xmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128" name="_mm_maskz_fnmsub_round_ss">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128"/>
+	<parameter varname="b" type="__m128"/>
+	<parameter varname="c" type="__m128"/>
+	<parameter varname="rounding" type="int"/>
+	<description>Multiply the lower single-precision (32-bit) floating-point elements in "a" and "b", and subtract the lower element in "c" from the negated intermediate result. Store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from "a" to the upper elements of "dst".
+	[round_note]</description>
+	<operation>
+IF k[0]
+	dst[31:0] := -(a[31:0] * b[31:0]) - c[31:0]
+ELSE
+	dst[31:0] := 0
+FI
+dst[127:32] := a[127:32]
+dst[MAX:128] := 0
+	</operation>
+	<instruction name='vfnmsub132ss' form='xmm {k}, xmm, xmm {er}'/>
+	<instruction name='vfnmsub213ss' form='xmm {k}, xmm, xmm {er}'/>
+	<instruction name='vfnmsub231ss' form='xmm {k}, xmm, xmm {er}'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128" name="_mm_maskz_fnmsub_ss">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128"/>
+	<parameter varname="b" type="__m128"/>
+	<parameter varname="c" type="__m128"/>
+	<description>Multiply the lower single-precision (32-bit) floating-point elements in "a" and "b", and subtract the lower element in "c" from the negated intermediate result. Store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from "a" to the upper elements of "dst".</description>
+	<operation>
+IF k[0]
+	dst[31:0] := -(a[31:0] * b[31:0]) - c[31:0]
+ELSE
+	dst[31:0] := 0
+FI
+dst[127:32] := a[127:32]
+dst[MAX:128] := 0
+	</operation>
+	<instruction name='vfnmsub132ss' form='xmm {k}, xmm, xmm'/>
+	<instruction name='vfnmsub213ss' form='xmm {k}, xmm, xmm'/>
+	<instruction name='vfnmsub231ss' form='xmm {k}, xmm, xmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512d" name="_mm512_i32gather_pd">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Load</category>
+	<parameter varname="vindex" type="__m256i"/>
+	<parameter varname="base_addr" type="void const*"/>
+	<parameter varname="scale" type="int"/>
+	<description>
+	Gather double-precision (64-bit) floating-point elements from memory using 32-bit indices. 64-bit elements are loaded from addresses starting at "base_addr" and offset by each 32-bit element in "vindex" (each index is scaled by the factor in "scale"). Gathered elements are merged into "dst". "scale" should be 1, 2, 4 or 8.
+	</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*64
+	m := j*32
+	dst[i+63:i] := MEM[base_addr + SignExtend(vindex[m+31:m])*scale]
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vgatherdpd' form='zmm {k}, vm32y'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512d" name="_mm512_mask_i32gather_pd">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Load</category>
+	<parameter varname="src" type="__m512d"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="vindex" type="__m256i"/>
+	<parameter varname="base_addr" type="void const*"/>
+	<parameter varname="scale" type="int"/>
+	<description>
+	Gather double-precision (64-bit) floating-point elements from memory using 32-bit indices. 64-bit elements are loaded from addresses starting at "base_addr" and offset by each 32-bit element in "vindex" (each index is scaled by the factor in "scale"). Gathered elements are merged into "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). "scale" should be 1, 2, 4 or 8.
+	</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*64
+	m := j*32
+	IF k[j]
+		dst[i+63:i] := MEM[base_addr + SignExtend(vindex[m+31:m])*scale]
+		k[j] := 0
+	ELSE
+		dst[i+63:i] := src[i+63:i]
+	FI
+ENDFOR
+k[MAX:8] := 0
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vgatherdpd' form='zmm {k}, vm32y'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" rettype="__m512" name="_mm512_i32gather_ps">
+	<type>Floating Point</type>
+	<CPUID>AVX512F/KNCNI</CPUID>
+	<category>Load</category>
+	<parameter varname="vindex" type="__m512i"/>
+	<parameter varname="base_addr" type="void const*"/>
+	<parameter varname="scale" type="int"/>
+	<description>Gather single-precision (32-bit) floating-point elements from memory using 32-bit indices. 32-bit elements are loaded from addresses starting at "base_addr" and offset by each 32-bit element in "vindex" (each index is scaled by the factor in "scale"). Gathered elements are merged into "dst". "scale" should be 1, 2, 4 or 8.</description>
+	<operation>
+FOR j := 0 to 15
+	i := j*32
+	dst[i+31:i] := MEM[base_addr + SignExtend(vindex[i+31:i])*scale]
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vgatherdps' form='zmm {k}, vm32z'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" rettype="__m512" name="_mm512_mask_i32gather_ps">
+	<type>Floating Point</type>
+	<CPUID>AVX512F/KNCNI</CPUID>
+	<category>Load</category>
+	<parameter varname="src" type="__m512"/>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="vindex" type="__m512i"/>
+	<parameter varname="base_addr" type="void const*"/>
+	<parameter varname="scale" type="int"/>
+	<description>
+	Gather single-precision (32-bit) floating-point elements from memory using 32-bit indices. 32-bit elements are loaded from addresses starting at "base_addr" and offset by each 32-bit element in "vindex" (each index is scaled by the factor in "scale"). Gathered elements are merged into "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). "scale" should be 1, 2, 4 or 8.
+	</description>
+	<operation>
+FOR j := 0 to 15
+	i := j*32
+	IF k[j]
+		dst[i+31:i] := MEM[base_addr + SignExtend(vindex[i+31:i])*scale]
+		k[j] := 0
+	ELSE
+		dst[i+31:i] := src[i+31:i]
+	FI
+ENDFOR
+k[MAX:16] := 0
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vgatherdps' form='zmm {k}, vm32z'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512d" name="_mm512_i64gather_pd">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Load</category>
+	<parameter varname="vindex" type="__m512i"/>
+	<parameter varname="base_addr" type="void const*"/>
+	<parameter varname="scale" type="int"/>
+	<description>
+	Gather double-precision (64-bit) floating-point elements from memory using 64-bit indices. 64-bit elements are loaded from addresses starting at "base_addr" and offset by each 64-bit element in "vindex" (each index is scaled by the factor in "scale"). Gathered elements are merged into "dst". "scale" should be 1, 2, 4 or 8.
+	</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*64
+	dst[i+63:i] := MEM[base_addr + SignExtend(vindex[i+63:i])*scale]
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vgatherqpd' form='zmm {k}, vm64z'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512d" name="_mm512_mask_i64gather_pd">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Load</category>
+	<parameter varname="src" type="__m512d"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="vindex" type="__m512i"/>
+	<parameter varname="base_addr" type="void const*"/>
+	<parameter varname="scale" type="int"/>
+	<description>
+	Gather double-precision (64-bit) floating-point elements from memory using 64-bit indices. 64-bit elements are loaded from addresses starting at "base_addr" and offset by each 64-bit element in "vindex" (each index is scaled by the factor in "scale"). Gathered elements are merged into "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). "scale" should be 1, 2, 4 or 8.
+	</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*64
+	IF k[j]
+		dst[i+63:i] := MEM[base_addr + SignExtend(vindex[i+63:i])*scale]
+		k[j] := 0
+	ELSE
+		dst[i+63:i] := src[i+63:i]
+	FI
+ENDFOR
+k[MAX:8] := 0
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vgatherqpd' form='zmm {k}, vm64z'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256" name="_mm512_i64gather_ps">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+		<category>Load</category>
+	<parameter varname="vindex" type="__m512i"/>
+	<parameter varname="base_addr" type="void const*"/>
+	<parameter varname="scale" type="int"/>
+	<description>
+	Gather single-precision (32-bit) floating-point elements from memory using 64-bit indices. 32-bit elements are loaded from addresses starting at "base_addr" and offset by each 64-bit element in "vindex" (each index is scaled by the factor in "scale"). Gathered elements are merged into "dst". "scale" should be 1, 2, 4 or 8.
+	</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*32
+	m := j*64
+	dst[i+31:i] := MEM[base_addr + SignExtend(vindex[m+63:m])*scale]
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name='vgatherqps' form='ymm {k}, vm64z'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256" name="_mm512_mask_i64gather_ps">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Load</category>
+	<parameter varname="src" type="__m256"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="vindex" type="__m512i"/>
+	<parameter varname="base_addr" type="void const*"/>
+	<parameter varname="scale" type="int"/>
+	<description>
+	Gather single-precision (32-bit) floating-point elements from memory using 64-bit indices. 32-bit elements are loaded from addresses starting at "base_addr" and offset by each 64-bit element in "vindex" (each index is scaled by the factor in "scale"). Gathered elements are merged into "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). "scale" should be 1, 2, 4 or 8.
+	</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*32
+	m := j*64
+	IF k[j]
+		dst[i+31:i] := MEM[base_addr + SignExtend(vindex[m+63:m])*scale]
+		k[j] := 0
+	ELSE
+		dst[i+31:i] := src[i+31:i]
+	FI
+ENDFOR
+k[MAX:8] := 0
+dst[MAX:256] := 0
+	</operation>
+	<instruction name='vgatherqps' form='ymm {k}, vm64z'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" rettype="__m512d" name="_mm512_getexp_pd">
+	<type>Floating Point</type>
+	<CPUID>AVX512F/KNCNI</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="a" type="__m512d"/>
+	<description>Convert the exponent of each packed double-precision (64-bit) floating-point element in "a" to a double-precision (64-bit) floating-point number representing the integer exponent, and store the results in "dst". This intrinsic essentially calculates "floor(log2(x))" for each element.</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*64
+	dst[i+63:i] := ConvertExpFP64(a[i+63:i])
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vgetexppd' form='zmm {k}, zmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" rettype="__m512d" name="_mm512_getexp_round_pd">
+	<type>Floating Point</type>
+	<CPUID>AVX512F/KNCNI</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="a" type="__m512d"/>
+	<parameter varname="rounding" type="int"/>
+	<description>Convert the exponent of each packed double-precision (64-bit) floating-point element in "a" to a double-precision (64-bit) floating-point number representing the integer exponent, and store the results in "dst". This intrinsic essentially calculates "floor(log2(x))" for each element.
+	[round_note]
+	</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*64
+	dst[i+63:i] := ConvertExpFP64(a[i+63:i])
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vgetexppd' form='zmm {k}, zmm {er}'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" rettype="__m512d" name="_mm512_mask_getexp_pd">
+	<type>Floating Point</type>
+	<CPUID>AVX512F/KNCNI</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="src" type="__m512d"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m512d"/>
+	<description>Convert the exponent of each packed double-precision (64-bit) floating-point element in "a" to a double-precision (64-bit) floating-point number representing the integer exponent, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). This intrinsic essentially calculates "floor(log2(x))" for each element.</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*64
+	IF k[j]
+		dst[i+63:i] := ConvertExpFP64(a[i+63:i])
+	ELSE
+		dst[i+63:i] := src[i+63:i]
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vgetexppd' form='zmm {k}, zmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" rettype="__m512d" name="_mm512_mask_getexp_round_pd">
+	<type>Floating Point</type>
+	<CPUID>AVX512F/KNCNI</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="src" type="__m512d"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m512d"/>
+	<parameter varname="rounding" type="int"/>
+	<description>Convert the exponent of each packed double-precision (64-bit) floating-point element in "a" to a double-precision (64-bit) floating-point number representing the integer exponent, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). This intrinsic essentially calculates "floor(log2(x))" for each element.
+	[round_note]
+	</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*64
+	IF k[j]
+		dst[i+63:i] := ConvertExpFP64(a[i+63:i])
+	ELSE
+		dst[i+63:i] := src[i+63:i]
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vgetexppd' form='zmm {k}, zmm {er}'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512d" name="_mm512_maskz_getexp_pd">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m512d"/>
+	<description>Convert the exponent of each packed double-precision (64-bit) floating-point element in "a" to a double-precision (64-bit) floating-point number representing the integer exponent, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). This intrinsic essentially calculates "floor(log2(x))" for each element.</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*64
+	IF k[j]
+		dst[i+63:i] := ConvertExpFP64(a[i+63:i])
+	ELSE
+		dst[i+63:i] := 0
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vgetexppd' form='zmm {k}, zmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512d" name="_mm512_maskz_getexp_round_pd">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m512d"/>
+	<parameter varname="rounding" type="int"/>
+	<description>Convert the exponent of each packed double-precision (64-bit) floating-point element in "a" to a double-precision (64-bit) floating-point number representing the integer exponent, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). This intrinsic essentially calculates "floor(log2(x))" for each element.
+	[round_note]
+	</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*64
+	IF k[j]
+		dst[i+63:i] := ConvertExpFP64(a[i+63:i])
+	ELSE
+		dst[i+63:i] := 0
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vgetexppd' form='zmm {k}, zmm {er}'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" rettype="__m512" name="_mm512_getexp_ps">
+	<type>Floating Point</type>
+	<CPUID>AVX512F/KNCNI</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="a" type="__m512"/>
+	<description>Convert the exponent of each packed single-precision (32-bit) floating-point element in "a" to a single-precision (32-bit) floating-point number representing the integer exponent, and store the results in "dst". This intrinsic essentially calculates "floor(log2(x))" for each element.</description>
+	<operation>
+FOR j := 0 to 15
+	i := j*32
+	dst[i+31:i] := ConvertExpFP32(a[i+31:i])
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vgetexpps' form='zmm {k}, zmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" rettype="__m512" name="_mm512_getexp_round_ps">
+	<type>Floating Point</type>
+	<CPUID>AVX512F/KNCNI</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="a" type="__m512"/>
+	<parameter varname="rounding" type="int"/>
+	<description>Convert the exponent of each packed single-precision (32-bit) floating-point element in "a" to a single-precision (32-bit) floating-point number representing the integer exponent, and store the results in "dst". This intrinsic essentially calculates "floor(log2(x))" for each element.
+	[round_note]
+	</description>
+	<operation>
+FOR j := 0 to 15
+	i := j*32
+	dst[i+31:i] := ConvertExpFP32(a[i+31:i])
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vgetexpps' form='zmm {k}, zmm {er}'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" rettype="__m512" name="_mm512_mask_getexp_ps">
+	<type>Floating Point</type>
+	<CPUID>AVX512F/KNCNI</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="src" type="__m512"/>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="a" type="__m512"/>
+	<description>Convert the exponent of each packed single-precision (32-bit) floating-point element in "a" to a single-precision (32-bit) floating-point number representing the integer exponent, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). This intrinsic essentially calculates "floor(log2(x))" for each element.</description>
+	<operation>
+FOR j := 0 to 15
+	i := j*32
+	IF k[j]
+		dst[i+31:i] := ConvertExpFP32(a[i+31:i])
+	ELSE
+		dst[i+31:i] := src[i+31:i]
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vgetexpps' form='zmm {k}, zmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" rettype="__m512" name="_mm512_mask_getexp_round_ps">
+	<type>Floating Point</type>
+	<CPUID>AVX512F/KNCNI</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="src" type="__m512"/>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="a" type="__m512"/>
+	<parameter varname="rounding" type="int"/>
+	<description>Convert the exponent of each packed single-precision (32-bit) floating-point element in "a" to a single-precision (32-bit) floating-point number representing the integer exponent, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). This intrinsic essentially calculates "floor(log2(x))" for each element.
+	[round_note]
+	</description>
+	<operation>
+FOR j := 0 to 15
+	i := j*32
+	IF k[j]
+		dst[i+31:i] := ConvertExpFP32(a[i+31:i])
+	ELSE
+		dst[i+31:i] := src[i+31:i]
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vgetexpps' form='zmm {k}, zmm {er}'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512" name="_mm512_maskz_getexp_ps">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="a" type="__m512"/>
+	<description>Convert the exponent of each packed single-precision (32-bit) floating-point element in "a" to a single-precision (32-bit) floating-point number representing the integer exponent, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). This intrinsic essentially calculates "floor(log2(x))" for each element.</description>
+	<operation>
+FOR j := 0 to 15
+	i := j*32
+	IF k[j]
+		dst[i+31:i] := ConvertExpFP32(a[i+31:i])
+	ELSE
+		dst[i+31:i] := 0
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vgetexpps' form='zmm {k}, zmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512" name="_mm512_maskz_getexp_round_ps">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="a" type="__m512"/>
+	<parameter varname="rounding" type="int"/>
+	<description>Convert the exponent of each packed single-precision (32-bit) floating-point element in "a" to a single-precision (32-bit) floating-point number representing the integer exponent, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). This intrinsic essentially calculates "floor(log2(x))" for each element.
+	[round_note]
+	</description>
+	<operation>
+FOR j := 0 to 15
+	i := j*32
+	IF k[j]
+		dst[i+31:i] := ConvertExpFP32(a[i+31:i])
+	ELSE
+		dst[i+31:i] := 0
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vgetexpps' form='zmm {k}, zmm {er}'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128d" name="_mm_getexp_round_sd">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="a" type="__m128d"/>
+	<parameter varname="b" type="__m128d"/>
+	<parameter varname="rounding" type="int"/>
+	<description>Convert the exponent of the lower double-precision (64-bit) floating-point element in "b" to a double-precision (64-bit) floating-point number representing the integer exponent, store the result in the lower element of "dst", and copy the upper element from "a" to the upper element of "dst". This intrinsic essentially calculates "floor(log2(x))" for the lower element.
+	[round_note]
+	</description>
+	<operation>
+dst[63:0] := ConvertExpFP64(b[63:0])
+dst[127:64] := a[127:64]
+dst[MAX:128] := 0
+	</operation>
+	<instruction name='vgetexpsd' form='xmm {k}, xmm, xmm {er}'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128d" name="_mm_getexp_sd">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="a" type="__m128d"/>
+	<parameter varname="b" type="__m128d"/>
+	<description>Convert the exponent of the lower double-precision (64-bit) floating-point element in "b" to a double-precision (64-bit) floating-point number representing the integer exponent, store the result in the lower element of "dst", and copy the upper element from "a" to the upper element of "dst". This intrinsic essentially calculates "floor(log2(x))" for the lower element.</description>
+	<operation>
+dst[63:0] := ConvertExpFP64(b[63:0])
+dst[127:64] := a[127:64]
+dst[MAX:128] := 0
+	</operation>
+	<instruction name='vgetexpsd' form='xmm {k}, xmm, xmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128d" name="_mm_mask_getexp_round_sd">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="src" type="__m128d"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128d"/>
+	<parameter varname="b" type="__m128d"/>
+	<parameter varname="rounding" type="int"/>
+	<description>Convert the exponent of the lower double-precision (64-bit) floating-point element in "b" to a double-precision (64-bit) floating-point number representing the integer exponent, store the result in the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst". This intrinsic essentially calculates "floor(log2(x))" for the lower element.
+	[round_note]
+	</description>
+	<operation>
+IF k[0]
+	dst[63:0] := ConvertExpFP64(b[63:0])
+ELSE
+	dst[63:0] := src[63:0]
+FI
+dst[127:64] := a[127:64]
+dst[MAX:128] := 0
+	</operation>
+	<instruction name='vgetexpsd' form='xmm {k}, xmm, xmm {er}'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128d" name="_mm_mask_getexp_sd">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="src" type="__m128d"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128d"/>
+	<parameter varname="b" type="__m128d"/>
+	<description>Convert the exponent of the lower double-precision (64-bit) floating-point element in "b" to a double-precision (64-bit) floating-point number representing the integer exponent, store the result in the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst". This intrinsic essentially calculates "floor(log2(x))" for the lower element.</description>
+	<operation>
+IF k[0]
+	dst[63:0] := ConvertExpFP64(b[63:0])
+ELSE
+	dst[63:0] := src[63:0]
+FI
+dst[127:64] := a[127:64]
+dst[MAX:128] := 0
+	</operation>
+	<instruction name='vgetexpsd' form='xmm {k}, xmm, xmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128d" name="_mm_maskz_getexp_round_sd">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128d"/>
+	<parameter varname="b" type="__m128d"/>
+	<parameter varname="rounding" type="int"/>
+	<description>Convert the exponent of the lower double-precision (64-bit) floating-point element in "b" to a double-precision (64-bit) floating-point number representing the integer exponent, store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst". This intrinsic essentially calculates "floor(log2(x))" for the lower element.
+	[round_note]
+	</description>
+	<operation>
+IF k[0]
+	dst[63:0] := ConvertExpFP64(b[63:0])
+ELSE
+	dst[63:0] := 0
+FI
+dst[127:64] := a[127:64]
+dst[MAX:128] := 0
+	</operation>
+	<instruction name='vgetexpsd' form='xmm {k}, xmm, xmm {er}'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128d" name="_mm_maskz_getexp_sd">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128d"/>
+	<parameter varname="b" type="__m128d"/>
+	<description>Convert the exponent of the lower double-precision (64-bit) floating-point element in "b" to a double-precision (64-bit) floating-point number representing the integer exponent, store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst". This intrinsic essentially calculates "floor(log2(x))" for the lower element.</description>
+	<operation>
+IF k[0]
+	dst[63:0] := ConvertExpFP64(b[63:0])
+ELSE
+	dst[63:0] := 0
+FI
+dst[127:64] := a[127:64]
+dst[MAX:128] := 0
+	</operation>
+	<instruction name='vgetexpsd' form='xmm {k}, xmm, xmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128" name="_mm_getexp_round_ss">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="a" type="__m128"/>
+	<parameter varname="b" type="__m128"/>
+	<parameter varname="rounding" type="int"/>
+	<description>Convert the exponent of the lower single-precision (32-bit) floating-point element in "b" to a single-precision (32-bit) floating-point number representing the integer exponent, store the result in the lower element of "dst", and copy the upper 3 packed elements from "a" to the upper elements of "dst". This intrinsic essentially calculates "floor(log2(x))" for the lower element.
+	[round_note]
+	</description>
+	<operation>
+dst[31:0] := ConvertExpFP32(b[31:0])
+dst[127:32] := a[127:32]
+dst[MAX:128] := 0
+	</operation>
+	<instruction name='vgetexpss' form='xmm {k}, xmm, xmm {er}'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128" name="_mm_getexp_ss">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="a" type="__m128"/>
+	<parameter varname="b" type="__m128"/>
+	<description>Convert the exponent of the lower single-precision (32-bit) floating-point element in "b" to a single-precision (32-bit) floating-point number representing the integer exponent, store the result in the lower element of "dst", and copy the upper 3 packed elements from "a" to the upper elements of "dst". This intrinsic essentially calculates "floor(log2(x))" for the lower element.</description>
+	<operation>
+dst[31:0] := ConvertExpFP32(b[31:0])
+dst[127:32] := a[127:32]
+dst[MAX:128] := 0
+	</operation>
+	<instruction name='vgetexpss' form='xmm {k}, xmm, xmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128" name="_mm_mask_getexp_round_ss">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="src" type="__m128"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128"/>
+	<parameter varname="b" type="__m128"/>
+	<parameter varname="rounding" type="int"/>
+	<description>Convert the exponent of the lower single-precision (32-bit) floating-point element in "b" to a single-precision (32-bit) floating-point number representing the integer exponent, store the result in the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper 3 packed elements from "a" to the upper elements of "dst". This intrinsic essentially calculates "floor(log2(x))" for the lower element.
+	[round_note]
+	</description>
+	<operation>
+IF k[0]
+	dst[31:0] := ConvertExpFP32(b[31:0])
+ELSE
+	dst[31:0] := src[31:0]
+FI
+dst[127:32] := a[127:32]
+dst[MAX:128] := 0
+	</operation>
+	<instruction name='vgetexpss' form='xmm {k}, xmm, xmm {er}'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128" name="_mm_mask_getexp_ss">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="src" type="__m128"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128"/>
+	<parameter varname="b" type="__m128"/>
+	<description>Convert the exponent of the lower single-precision (32-bit) floating-point element in "b" to a single-precision (32-bit) floating-point number representing the integer exponent, store the result in the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper 3 packed elements from "a" to the upper elements of "dst". This intrinsic essentially calculates "floor(log2(x))" for the lower element.</description>
+	<operation>
+IF k[0]
+	dst[31:0] := ConvertExpFP32(b[31:0])
+ELSE
+	dst[31:0] := src[31:0]
+FI
+dst[127:32] := a[127:32]
+dst[MAX:128] := 0
+	</operation>
+	<instruction name='vgetexpss' form='xmm {k}, xmm, xmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128" name="_mm_maskz_getexp_round_ss">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128"/>
+	<parameter varname="b" type="__m128"/>
+	<parameter varname="rounding" type="int"/>
+	<description>Convert the exponent of the lower single-precision (32-bit) floating-point element in "b" to a single-precision (32-bit) floating-point number representing the integer exponent, store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from "a" to the upper elements of "dst". This intrinsic essentially calculates "floor(log2(x))" for the lower element.
+	[round_note]
+	</description>
+	<operation>
+IF k[0]
+	dst[31:0] := ConvertExpFP32(b[31:0])
+ELSE
+	dst[31:0] := 0
+FI
+dst[127:32] := a[127:32]
+dst[MAX:128] := 0
+	</operation>
+	<instruction name='vgetexpss' form='xmm {k}, xmm, xmm {er}'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128" name="_mm_maskz_getexp_ss">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128"/>
+	<parameter varname="b" type="__m128"/>
+	<description>Convert the exponent of the lower single-precision (32-bit) floating-point element in "b" to a single-precision (32-bit) floating-point number representing the integer exponent, store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from "a" to the upper elements of "dst". This intrinsic essentially calculates "floor(log2(x))" for the lower element.</description>
+	<operation>
+IF k[0]
+	dst[31:0] := ConvertExpFP32(b[31:0])
+ELSE
+	dst[31:0] := 0
+FI
+dst[127:32] := a[127:32]
+dst[MAX:128] := 0
+	</operation>
+	<instruction name='vgetexpss' form='xmm {k}, xmm, xmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" rettype="__m512d" name="_mm512_getmant_pd">
+	<type>Floating Point</type>
+	<CPUID>AVX512F/KNCNI</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="a" type="__m512d"/>
+	<parameter varname="interv" type="_MM_MANTISSA_NORM_ENUM"/>
+	<parameter varname="sc" type="_MM_MANTISSA_SIGN_ENUM"/>
+	<description>Normalize the mantissas of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst". This intrinsic essentially calculates "&#xB1;(2^k)*|x.significand|", where "k" depends on the interval range defined by "interv" and the sign depends on "sc" and the source sign.
+	[getmant_note]</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*64
+	dst[i+63:i] := GetNormalizedMantissa(a[i+63:i], sc, interv)
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vgetmantpd' form='zmm {k}, zmm, imm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" rettype="__m512d" name="_mm512_getmant_round_pd">
+	<type>Floating Point</type>
+	<CPUID>AVX512F/KNCNI</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="a" type="__m512d"/>
+	<parameter varname="interv" type="_MM_MANTISSA_NORM_ENUM"/>
+	<parameter varname="sc" type="_MM_MANTISSA_SIGN_ENUM"/>
+	<parameter varname="rounding" type="int"/>
+	<description>Normalize the mantissas of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst". This intrinsic essentially calculates "&#xB1;(2^k)*|x.significand|", where "k" depends on the interval range defined by "interv" and the sign depends on "sc" and the source sign.
+	[getmant_note][round_note]</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*64
+	dst[i+63:i] := GetNormalizedMantissa(a[i+63:i], sc, interv)
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vgetmantpd' form='zmm {k}, zmm, imm {er}'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" rettype="__m512d" name="_mm512_mask_getmant_pd">
+	<type>Floating Point</type>
+	<CPUID>AVX512F/KNCNI</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="src" type="__m512d"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m512d"/>
+	<parameter varname="interv" type="_MM_MANTISSA_NORM_ENUM"/>
+	<parameter varname="sc" type="_MM_MANTISSA_SIGN_ENUM"/>
+	<description>Normalize the mantissas of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). This intrinsic essentially calculates "&#xB1;(2^k)*|x.significand|", where "k" depends on the interval range defined by "interv" and the sign depends on "sc" and the source sign.
+	[getmant_note]</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*64
+	IF k[j]
+		dst[i+63:i] := GetNormalizedMantissa(a[i+63:i], sc, interv)
+	ELSE
+		dst[i+63:i] := src[i+63:i]
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vgetmantpd' form='zmm {k}, zmm, imm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" rettype="__m512d" name="_mm512_mask_getmant_round_pd">
+	<type>Floating Point</type>
+	<CPUID>AVX512F/KNCNI</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="src" type="__m512d"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m512d"/>
+	<parameter varname="interv" type="_MM_MANTISSA_NORM_ENUM"/>
+	<parameter varname="sc" type="_MM_MANTISSA_SIGN_ENUM"/>
+	<parameter varname="rounding" type="int"/>
+	<description>Normalize the mantissas of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). This intrinsic essentially calculates "&#xB1;(2^k)*|x.significand|", where "k" depends on the interval range defined by "interv" and the sign depends on "sc" and the source sign.
+	[getmant_note][round_note]</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*64
+	IF k[j]
+		dst[i+63:i] := GetNormalizedMantissa(a[i+63:i], sc, interv)
+	ELSE
+		dst[i+63:i] := src[i+63:i]
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vgetmantpd' form='zmm {k}, zmm, imm {er}'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512d" name="_mm512_maskz_getmant_pd">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m512d"/>
+	<parameter varname="interv" type="_MM_MANTISSA_NORM_ENUM"/>
+	<parameter varname="sc" type="_MM_MANTISSA_SIGN_ENUM"/>
+	<description>Normalize the mantissas of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). This intrinsic essentially calculates "&#xB1;(2^k)*|x.significand|", where "k" depends on the interval range defined by "interv" and the sign depends on "sc" and the source sign.
+	[getmant_note]</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*64
+	IF k[j]
+		dst[i+63:i] := GetNormalizedMantissa(a[i+63:i], sc, interv)
+	ELSE
+		dst[i+63:i] := 0
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vgetmantpd' form='zmm {k}, zmm, imm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512d" name="_mm512_maskz_getmant_round_pd">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m512d"/>
+	<parameter varname="interv" type="_MM_MANTISSA_NORM_ENUM"/>
+	<parameter varname="sc" type="_MM_MANTISSA_SIGN_ENUM"/>
+	<parameter varname="rounding" type="int"/>
+	<description>Normalize the mantissas of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). This intrinsic essentially calculates "&#xB1;(2^k)*|x.significand|", where "k" depends on the interval range defined by "interv" and the sign depends on "sc" and the source sign.
+	[getmant_note][round_note]</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*64
+	IF k[j]
+		dst[i+63:i] := GetNormalizedMantissa(a[i+63:i], sc, interv)
+	ELSE
+		dst[i+63:i] := 0
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vgetmantpd' form='zmm {k}, zmm, imm {er}'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" rettype="__m512" name="_mm512_getmant_ps">
+	<type>Floating Point</type>
+	<CPUID>AVX512F/KNCNI</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="a" type="__m512"/>
+	<parameter varname="interv" type="_MM_MANTISSA_NORM_ENUM"/>
+	<parameter varname="sc" type="_MM_MANTISSA_SIGN_ENUM"/>
+	<description>Normalize the mantissas of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst". This intrinsic essentially calculates "&#xB1;(2^k)*|x.significand|", where "k" depends on the interval range defined by "interv" and the sign depends on "sc" and the source sign.
+	[getmant_note]</description>
+	<operation>
+FOR j := 0 to 15
+	i := j*32
+	dst[i+31:i] := GetNormalizedMantissa(a[i+31:i], sc, interv)
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vgetmantps' form='zmm {k}, zmm, imm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" rettype="__m512" name="_mm512_getmant_round_ps">
+	<type>Floating Point</type>
+	<CPUID>AVX512F/KNCNI</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="a" type="__m512"/>
+	<parameter varname="interv" type="_MM_MANTISSA_NORM_ENUM"/>
+	<parameter varname="sc" type="_MM_MANTISSA_SIGN_ENUM"/>
+	<parameter varname="rounding" type="int"/>
+	<description>Normalize the mantissas of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst". This intrinsic essentially calculates "&#xB1;(2^k)*|x.significand|", where "k" depends on the interval range defined by "interv" and the sign depends on "sc" and the source sign.
+	[getmant_note][round_note]</description>
+	<operation>
+FOR j := 0 to 15
+	i := j*32
+	dst[i+31:i] := GetNormalizedMantissa(a[i+31:i], sc, interv)
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vgetmantps' form='zmm {k}, zmm, imm {er}'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" rettype="__m512" name="_mm512_mask_getmant_ps">
+	<type>Floating Point</type>
+	<CPUID>AVX512F/KNCNI</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="src" type="__m512"/>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="a" type="__m512"/>
+	<parameter varname="interv" type="_MM_MANTISSA_NORM_ENUM"/>
+	<parameter varname="sc" type="_MM_MANTISSA_SIGN_ENUM"/>
+	<description>Normalize the mantissas of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). This intrinsic essentially calculates "&#xB1;(2^k)*|x.significand|", where "k" depends on the interval range defined by "interv" and the sign depends on "sc" and the source sign.
+	[getmant_note]</description>
+	<operation>
+FOR j := 0 to 15
+	i := j*32
+	IF k[j]
+		dst[i+31:i] := GetNormalizedMantissa(a[i+31:i], sc, interv)
+	ELSE
+		dst[i+31:i] := src[i+31:i]
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vgetmantps' form='zmm {k}, zmm, imm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" rettype="__m512" name="_mm512_mask_getmant_round_ps">
+	<type>Floating Point</type>
+	<CPUID>AVX512F/KNCNI</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="src" type="__m512"/>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="a" type="__m512"/>
+	<parameter varname="interv" type="_MM_MANTISSA_NORM_ENUM"/>
+	<parameter varname="sc" type="_MM_MANTISSA_SIGN_ENUM"/>
+	<parameter varname="rounding" type="int"/>
+	<description>Normalize the mantissas of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). This intrinsic essentially calculates "&#xB1;(2^k)*|x.significand|", where "k" depends on the interval range defined by "interv" and the sign depends on "sc" and the source sign.
+	[getmant_note][round_note]</description>
+	<operation>
+FOR j := 0 to 15
+	i := j*32
+	IF k[j]
+		dst[i+31:i] := GetNormalizedMantissa(a[i+31:i], sc, interv)
+	ELSE
+		dst[i+31:i] := src[i+31:i]
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vgetmantps' form='zmm {k}, zmm, imm {er}'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512" name="_mm512_maskz_getmant_ps">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="a" type="__m512"/>
+	<parameter varname="interv" type="_MM_MANTISSA_NORM_ENUM"/>
+	<parameter varname="sc" type="_MM_MANTISSA_SIGN_ENUM"/>
+	<description>Normalize the mantissas of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). This intrinsic essentially calculates "&#xB1;(2^k)*|x.significand|", where "k" depends on the interval range defined by "interv" and the sign depends on "sc" and the source sign.
+	[getmant_note]</description>
+	<operation>
+FOR j := 0 to 15
+	i := j*32
+	IF k[j]
+		dst[i+31:i] := GetNormalizedMantissa(a[i+31:i], sc, interv)
+	ELSE
+		dst[i+31:i] := 0
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vgetmantps' form='zmm {k}, zmm, imm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512" name="_mm512_maskz_getmant_round_ps">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="a" type="__m512"/>
+	<parameter varname="interv" type="_MM_MANTISSA_NORM_ENUM"/>
+	<parameter varname="sc" type="_MM_MANTISSA_SIGN_ENUM"/>
+	<parameter varname="rounding" type="int"/>
+	<description>Normalize the mantissas of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). This intrinsic essentially calculates "&#xB1;(2^k)*|x.significand|", where "k" depends on the interval range defined by "interv" and the sign depends on "sc" and the source sign.
+	[getmant_note][round_note]</description>
+	<operation>
+FOR j := 0 to 15
+	i := j*32
+	IF k[j]
+		dst[i+31:i] := GetNormalizedMantissa(a[i+31:i], sc, interv)
+	ELSE
+		dst[i+31:i] := 0
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vgetmantps' form='zmm {k}, zmm, imm {er}'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128d" name="_mm_getmant_round_sd">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="a" type="__m128d"/>
+	<parameter varname="b" type="__m128d"/>
+	<parameter varname="interv" type="_MM_MANTISSA_NORM_ENUM"/>
+	<parameter varname="sc" type="_MM_MANTISSA_SIGN_ENUM"/>
+	<parameter varname="rounding" type="int"/>
+	<description>Normalize the mantissa of the lower double-precision (64-bit) floating-point element in "b", store the result in the lower element of "dst", and copy the upper element from "a" to the upper element of "dst". This intrinsic essentially calculates "&#xB1;(2^k)*|x.significand|", where "k" depends on the interval range defined by "interv" and the sign depends on "sc" and the source sign.
+	[getmant_note][round_note]</description>
+	<operation>
+dst[63:0] := GetNormalizedMantissa(b[63:0], sc, interv)
+dst[127:64] := a[127:64]
+dst[MAX:128] := 0
+	</operation>
+	<instruction name='vgetmantsd' form='xmm {k}, xmm, xmm, imm {er}'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128d" name="_mm_getmant_sd">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="a" type="__m128d"/>
+	<parameter varname="b" type="__m128d"/>
+	<parameter varname="interv" type="_MM_MANTISSA_NORM_ENUM"/>
+	<parameter varname="sc" type="_MM_MANTISSA_SIGN_ENUM"/>
+	<description>Normalize the mantissa of the lower double-precision (64-bit) floating-point element in "b", store the result in the lower element of "dst", and copy the upper element from "a" to the upper element of "dst". This intrinsic essentially calculates "&#xB1;(2^k)*|x.significand|", where "k" depends on the interval range defined by "interv" and the sign depends on "sc" and the source sign.
+	[getmant_note]</description>
+	<operation>
+dst[63:0] := GetNormalizedMantissa(b[63:0], sc, interv)
+dst[127:64] := a[127:64]
+dst[MAX:128] := 0
+	</operation>
+	<instruction name='vgetmantsd' form='xmm {k}, xmm, xmm, imm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128d" name="_mm_mask_getmant_round_sd">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="src" type="__m128d"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128d"/>
+	<parameter varname="b" type="__m128d"/>
+	<parameter varname="interv" type="_MM_MANTISSA_NORM_ENUM"/>
+	<parameter varname="sc" type="_MM_MANTISSA_SIGN_ENUM"/>
+	<parameter varname="rounding" type="int"/>
+	<description>Normalize the mantissa of the lower double-precision (64-bit) floating-point element in "b", store the result in the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst". This intrinsic essentially calculates "&#xB1;(2^k)*|x.significand|", where "k" depends on the interval range defined by "interv" and the sign depends on "sc" and the source sign.
+	[getmant_note][round_note]</description>
+	<operation>
+IF k[0]
+	dst[63:0] := GetNormalizedMantissa(b[63:0], sc, interv)
+ELSE
+	dst[63:0] := src[63:0]
+FI
+dst[127:64] := a[127:64]
+dst[MAX:128] := 0
+	</operation>
+	<instruction name='vgetmantsd' form='xmm {k}, xmm, xmm, imm {er}'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128d" name="_mm_mask_getmant_sd">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="src" type="__m128d"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128d"/>
+	<parameter varname="b" type="__m128d"/>
+	<parameter varname="interv" type="_MM_MANTISSA_NORM_ENUM"/>
+	<parameter varname="sc" type="_MM_MANTISSA_SIGN_ENUM"/>
+	<description>Normalize the mantissa of the lower double-precision (64-bit) floating-point element in "b", store the result in the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst". This intrinsic essentially calculates "&#xB1;(2^k)*|x.significand|", where "k" depends on the interval range defined by "interv" and the sign depends on "sc" and the source sign.
+	[getmant_note]</description>
+	<operation>
+IF k[0]
+	dst[63:0] := GetNormalizedMantissa(b[63:0], sc, interv)
+ELSE
+	dst[63:0] := src[63:0]
+FI
+dst[127:64] := a[127:64]
+dst[MAX:128] := 0
+	</operation>
+	<instruction name='vgetmantsd' form='xmm {k}, xmm, xmm, imm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128d" name="_mm_maskz_getmant_round_sd">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128d"/>
+	<parameter varname="b" type="__m128d"/>
+	<parameter varname="interv" type="_MM_MANTISSA_NORM_ENUM"/>
+	<parameter varname="sc" type="_MM_MANTISSA_SIGN_ENUM"/>
+	<parameter varname="rounding" type="int"/>
+	<description>Normalize the mantissa of the lower double-precision (64-bit) floating-point element in "b", store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst". This intrinsic essentially calculates "&#xB1;(2^k)*|x.significand|", where "k" depends on the interval range defined by "interv" and the sign depends on "sc" and the source sign.
+	[getmant_note][round_note]</description>
+	<operation>
+IF k[0]
+	dst[63:0] := GetNormalizedMantissa(b[63:0], sc, interv)
+ELSE
+	dst[63:0] := 0
+FI
+dst[127:64] := a[127:64]
+dst[MAX:128] := 0
+	</operation>
+	<instruction name='vgetmantsd' form='xmm {k}, xmm, xmm, imm {er}'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128d" name="_mm_maskz_getmant_sd">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128d"/>
+	<parameter varname="b" type="__m128d"/>
+	<parameter varname="interv" type="_MM_MANTISSA_NORM_ENUM"/>
+	<parameter varname="sc" type="_MM_MANTISSA_SIGN_ENUM"/>
+	<description>Normalize the mantissa of the lower double-precision (64-bit) floating-point element in "b", store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst". This intrinsic essentially calculates "&#xB1;(2^k)*|x.significand|", where "k" depends on the interval range defined by "interv" and the sign depends on "sc" and the source sign.
+	[getmant_note]</description>
+	<operation>
+IF k[0]
+	dst[63:0] := GetNormalizedMantissa(b[63:0], sc, interv)
+ELSE
+	dst[63:0] := 0
+FI
+dst[127:64] := a[127:64]
+dst[MAX:128] := 0
+	</operation>
+	<instruction name='vgetmantsd' form='xmm {k}, xmm, xmm, imm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128" name="_mm_getmant_round_ss">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="a" type="__m128"/>
+	<parameter varname="b" type="__m128"/>
+	<parameter varname="interv" type="_MM_MANTISSA_NORM_ENUM"/>
+	<parameter varname="sc" type="_MM_MANTISSA_SIGN_ENUM"/>
+	<parameter varname="rounding" type="int"/>
+	<description>Normalize the mantissas of the lower single-precision (32-bit) floating-point element in "a", store the result in the lower element of "dst", and copy the upper 3 packed elements from "b" to the upper elements of "dst". This intrinsic essentially calculates "&#xB1;(2^k)*|x.significand|", where "k" depends on the interval range defined by "interv" and the sign depends on "sc" and the source sign.
+	[getmant_note][round_note]</description>
+	<operation>
+dst[31:0] := GetNormalizedMantissa(a[31:0], sc, interv)
+dst[127:32] := b[127:32]
+dst[MAX:128] := 0
+	</operation>
+	<instruction name='vgetmantss' form='xmm {k}, xmm, xmm, imm {er}'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128" name="_mm_getmant_ss">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="a" type="__m128"/>
+	<parameter varname="b" type="__m128"/>
+	<parameter varname="interv" type="_MM_MANTISSA_NORM_ENUM"/>
+	<parameter varname="sc" type="_MM_MANTISSA_SIGN_ENUM"/>
+	<description>Normalize the mantissas of the lower single-precision (32-bit) floating-point element in "a", store the result in the lower element of "dst", and copy the upper 3 packed elements from "b" to the upper elements of "dst". This intrinsic essentially calculates "&#xB1;(2^k)*|x.significand|", where "k" depends on the interval range defined by "interv" and the sign depends on "sc" and the source sign.
+	[getmant_note]</description>
+	<operation>
+dst[31:0] := GetNormalizedMantissa(a[31:0], sc, interv)
+dst[127:32] := b[127:32]
+dst[MAX:128] := 0
+	</operation>
+	<instruction name='vgetmantss' form='xmm {k}, xmm, xmm, imm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128" name="_mm_mask_getmant_round_ss">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="src" type="__m128"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128"/>
+	<parameter varname="b" type="__m128"/>
+	<parameter varname="interv" type="_MM_MANTISSA_NORM_ENUM"/>
+	<parameter varname="sc" type="_MM_MANTISSA_SIGN_ENUM"/>
+	<parameter varname="rounding" type="int"/>
+	<description>Normalize the mantissas of the lower single-precision (32-bit) floating-point element in "a", store the result in the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper 3 packed elements from "b" to the upper elements of "dst". This intrinsic essentially calculates "&#xB1;(2^k)*|x.significand|", where "k" depends on the interval range defined by "interv" and the sign depends on "sc" and the source sign.
+	[getmant_note][round_note]</description>
+	<operation>
+IF k[0]
+	dst[31:0] := GetNormalizedMantissa(a[31:0], sc, interv)
+ELSE
+	dst[31:0] := src[31:0]
+FI
+dst[127:32] := b[127:32]
+dst[MAX:128] := 0
+	</operation>
+	<instruction name='vgetmantss' form='xmm {k}, xmm, xmm, imm {er}'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128" name="_mm_mask_getmant_ss">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="src" type="__m128"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128"/>
+	<parameter varname="b" type="__m128"/>
+	<parameter varname="interv" type="_MM_MANTISSA_NORM_ENUM"/>
+	<parameter varname="sc" type="_MM_MANTISSA_SIGN_ENUM"/>
+	<description>Normalize the mantissas of the lower single-precision (32-bit) floating-point element in "a", store the result in the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper 3 packed elements from "b" to the upper elements of "dst". This intrinsic essentially calculates "&#xB1;(2^k)*|x.significand|", where "k" depends on the interval range defined by "interv" and the sign depends on "sc" and the source sign.
+	[getmant_note]</description>
+	<operation>
+IF k[0]
+	dst[31:0] := GetNormalizedMantissa(a[31:0], sc, interv)
+ELSE
+	dst[31:0] := src[31:0]
+FI
+dst[127:32] := b[127:32]
+dst[MAX:128] := 0
+	</operation>
+	<instruction name='vgetmantss' form='xmm {k}, xmm, xmm, imm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128" name="_mm_maskz_getmant_round_ss">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128"/>
+	<parameter varname="b" type="__m128"/>
+	<parameter varname="interv" type="_MM_MANTISSA_NORM_ENUM"/>
+	<parameter varname="sc" type="_MM_MANTISSA_SIGN_ENUM"/>
+	<parameter varname="rounding" type="int"/>
+	<description>Normalize the mantissas of the lower single-precision (32-bit) floating-point element in "a", store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from "b" to the upper elements of "dst". This intrinsic essentially calculates "&#xB1;(2^k)*|x.significand|", where "k" depends on the interval range defined by "interv" and the sign depends on "sc" and the source sign.
+	[getmant_note][round_note]</description>
+	<operation>
+IF k[0]
+	dst[31:0] := GetNormalizedMantissa(a[31:0], sc, interv)
+ELSE
+	dst[31:0] := 0
+FI
+dst[127:32] := b[127:32]
+dst[MAX:128] := 0
+	</operation>
+	<instruction name='vgetmantss' form='xmm {k}, xmm, xmm, imm {er}'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128" name="_mm_maskz_getmant_ss">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128"/>
+	<parameter varname="b" type="__m128"/>
+	<parameter varname="interv" type="_MM_MANTISSA_NORM_ENUM"/>
+	<parameter varname="sc" type="_MM_MANTISSA_SIGN_ENUM"/>
+	<description>Normalize the mantissas of the lower single-precision (32-bit) floating-point element in "a", store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from "b" to the upper elements of "dst". This intrinsic essentially calculates "&#xB1;(2^k)*|x.significand|", where "k" depends on the interval range defined by "interv" and the sign depends on "sc" and the source sign.
+	[getmant_note]</description>
+	<operation>
+IF k[0]
+	dst[31:0] := GetNormalizedMantissa(a[31:0], sc, interv)
+ELSE
+	dst[31:0] := 0
+FI
+dst[127:32] := b[127:32]
+dst[MAX:128] := 0
+	</operation>
+	<instruction name='vgetmantss' form='xmm {k}, xmm, xmm, imm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512" name="_mm512_insertf32x4">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Swizzle</category>
+	<parameter varname="a" type="__m512"/>
+	<parameter varname="b" type="__m128"/>
+	<parameter varname="imm8" type="int"/>
+	<description>Copy "a" to "dst", then insert 128 bits (composed of 4 packed single-precision (32-bit) floating-point elements) from "b" into "dst" at the location specified by "imm8".</description>
+	<operation>
+dst[511:0] := a[511:0]
+CASE (imm8[1:0]) of
+0: dst[127:0] := b[127:0]
+1: dst[255:128] := b[127:0]
+2: dst[383:256] := b[127:0]
+3: dst[511:384] := b[127:0]
+ESAC
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vinsertf32x4' form='zmm {k}, zmm, xmm, imm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512" name="_mm512_mask_insertf32x4">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Swizzle</category>
+	<parameter varname="src" type="__m512"/>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="a" type="__m512"/>
+	<parameter varname="b" type="__m128"/>
+	<parameter varname="imm8" type="int"/>
+	<description>Copy "a" to "tmp", then insert 128 bits (composed of 4 packed single-precision (32-bit) floating-point elements) from "b" into "tmp" at the location specified by "imm8".  Store "tmp" to "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). 
+	</description>
+	<operation>
+tmp[511:0] := a[511:0]
+CASE (imm8[1:0]) of
+0: tmp[127:0] := b[127:0]
+1: tmp[255:128] := b[127:0]
+2: tmp[383:256] := b[127:0]
+3: tmp[511:384] := b[127:0]
+ESAC
+FOR j := 0 to 15
+	i := j*32
+	IF k[j]
+		dst[i+31:i] := tmp[i+31:i]
+	ELSE
+		dst[i+31:i] := src[i+31:i]
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vinsertf32x4' form='zmm {k}, zmm, xmm, imm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512" name="_mm512_maskz_insertf32x4">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Swizzle</category>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="a" type="__m512"/>
+	<parameter varname="b" type="__m128"/>
+	<parameter varname="imm8" type="int"/>
+	<description>Copy "a" to "tmp", then insert 128 bits (composed of 4 packed single-precision (32-bit) floating-point elements) from "b" into "tmp" at the location specified by "imm8".  Store "tmp" to "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). 
+	</description>
+	<operation>
+tmp[511:0] := a[511:0]
+CASE (imm8[1:0]) of
+0: tmp[127:0] := b[127:0]
+1: tmp[255:128] := b[127:0]
+2: tmp[383:256] := b[127:0]
+3: tmp[511:384] := b[127:0]
+ESAC
+FOR j := 0 to 15
+	i := j*32
+	IF k[j]
+		dst[i+31:i] := tmp[i+31:i]
+	ELSE
+		dst[i+31:i] := 0
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vinsertf32x4' form='zmm {k}, zmm, xmm, imm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512d" name="_mm512_insertf64x4">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Swizzle</category>
+	<parameter varname="a" type="__m512d"/>
+	<parameter varname="b" type="__m256d"/>
+	<parameter varname="imm8" type="int"/>
+	<description>Copy "a" to "dst", then insert 256 bits (composed of 4 packed double-precision (64-bit) floating-point elements) from "b" into "dst" at the location specified by "imm8".</description>
+	<operation>
+dst[511:0] := a[511:0]
+CASE (imm8[0]) of
+0: dst[255:0] := b[255:0]
+1: dst[511:256] := b[255:0]
+ESAC
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vinsertf64x4' form='zmm {k}, zmm, ymm, imm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512d" name="_mm512_mask_insertf64x4">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Swizzle</category>
+	<parameter varname="src" type="__m512d"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m512d"/>
+	<parameter varname="b" type="__m256d"/>
+	<parameter varname="imm8" type="int"/>
+	<description>Copy "a" to "tmp", then insert 256 bits (composed of 4 packed double-precision (64-bit) floating-point elements) from "b" into "tmp" at the location specified by "imm8".  Store "tmp" to "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). 
+	</description>
+	<operation>
+tmp[511:0] := a[511:0]
+CASE (imm8[0]) of
+0: tmp[255:0] := b[255:0]
+1: tmp[511:256] := b[255:0]
+ESAC
+FOR j := 0 to 7
+	i := j*64
+	IF k[j]
+		dst[i+63:i] := tmp[i+63:i]
+	ELSE
+		dst[i+63:i] := src[i+63:i]
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vinsertf64x4' form='zmm {k}, zmm, ymm, imm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512d" name="_mm512_maskz_insertf64x4">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Swizzle</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m512d"/>
+	<parameter varname="b" type="__m256d"/>
+	<parameter varname="imm8" type="int"/>
+	<description>Copy "a" to "tmp", then insert 256 bits (composed of 4 packed double-precision (64-bit) floating-point elements) from "b" into "tmp" at the location specified by "imm8".  Store "tmp" to "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). 
+	</description>
+	<operation>
+tmp[511:0] := a[511:0]
+CASE (imm8[0]) of
+0: tmp[255:0] := b[255:0]
+1: tmp[511:256] := b[255:0]
+ESAC
+FOR j := 0 to 7
+	i := j*64
+	IF k[j]
+		dst[i+63:i] := tmp[i+63:i]
+	ELSE
+		dst[i+63:i] := 0
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vinsertf64x4' form='zmm {k}, zmm, ymm, imm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512i" name="_mm512_inserti32x4">
+	<type>Integer</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Swizzle</category>
+	<parameter varname="a" type="__m512i"/>
+	<parameter varname="b" type="__m128i"/>
+	<parameter varname="imm8" type="int"/>
+	<description>Copy "a" to "dst", then insert 128 bits (composed of 4 packed 32-bit integers) from "b" into "dst" at the location specified by "imm8".</description>
+	<operation>
+dst[511:0] := a[511:0]
+CASE (imm8[1:0]) of
+0: dst[127:0] := b[127:0]
+1: dst[255:128] := b[127:0]
+2: dst[383:256] := b[127:0]
+3: dst[511:384] := b[127:0]
+ESAC
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vinserti32x4' form='zmm {k}, zmm, xmm, imm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512i" name="_mm512_mask_inserti32x4">
+	<type>Integer</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Swizzle</category>
+	<parameter varname="src" type="__m512i"/>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="a" type="__m512i"/>
+	<parameter varname="b" type="__m128i"/>
+	<parameter varname="imm8" type="int"/>
+	<description>Copy "a" to "tmp", then insert 128 bits (composed of 4 packed 32-bit integers) from "b" into "tmp" at the location specified by "imm8".  Store "tmp" to "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). 
+	</description>
+	<operation>
+tmp[511:0] := a[511:0]
+CASE (imm8[1:0]) of
+0: tmp[127:0] := b[127:0]
+1: tmp[255:128] := b[127:0]
+2: tmp[383:256] := b[127:0]
+3: tmp[511:384] := b[127:0]
+ESAC
+FOR j := 0 to 15
+	i := j*32
+	IF k[j]
+		dst[i+31:i] := tmp[i+31:i]
+	ELSE
+		dst[i+31:i] := src[i+31:i]
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vinserti32x4' form='zmm {k}, zmm, xmm, imm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512i" name="_mm512_maskz_inserti32x4">
+	<type>Integer</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Swizzle</category>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="a" type="__m512i"/>
+	<parameter varname="b" type="__m128i"/>
+	<parameter varname="imm8" type="int"/>
+	<description>Copy "a" to "tmp", then insert 128 bits (composed of 4 packed 32-bit integers) from "b" into "tmp" at the location specified by "imm8".  Store "tmp" to "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). 
+	</description>
+	<operation>
+tmp[511:0] := a[511:0]
+CASE (imm8[1:0]) of
+0: tmp[127:0] := b[127:0]
+1: tmp[255:128] := b[127:0]
+2: tmp[383:256] := b[127:0]
+3: tmp[511:384] := b[127:0]
+ESAC
+FOR j := 0 to 15
+	i := j*32
+	IF k[j]
+		dst[i+31:i] := tmp[i+31:i]
+	ELSE
+		dst[i+31:i] := 0
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vinserti32x4' form='zmm {k}, zmm, xmm, imm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512i" name="_mm512_inserti64x4">
+	<type>Integer</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Swizzle</category>
+	<parameter varname="a" type="__m512i"/>
+	<parameter varname="b" type="__m256i"/>
+	<parameter varname="imm8" type="int"/>
+	<description>Copy "a" to "dst", then insert 256 bits (composed of 4 packed 64-bit integers) from "b" into "dst" at the location specified by "imm8".</description>
+	<operation>
+dst[511:0] := a[511:0]
+CASE (imm8[0]) of
+0: dst[255:0] := b[255:0]
+1: dst[511:256] := b[255:0]
+ESAC
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vinserti64x4' form='zmm {k}, zmm, ymm, imm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512i" name="_mm512_mask_inserti64x4">
+	<type>Integer</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Swizzle</category>
+	<parameter varname="src" type="__m512i"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m512i"/>
+	<parameter varname="b" type="__m256i"/>
+	<parameter varname="imm8" type="int"/>
+	<description>Copy "a" to "tmp", then insert 256 bits (composed of 4 packed 64-bit integers) from "b" into "tmp" at the location specified by "imm8".  Store "tmp" to "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). 
+	</description>
+	<operation>
+tmp[511:0] := a[511:0]
+CASE (imm8[0]) of
+0: tmp[255:0] := b[255:0]
+1: tmp[511:256] := b[255:0]
+ESAC
+FOR j := 0 to 7
+	i := j*64
+	IF k[j]
+		dst[i+63:i] := tmp[i+63:i]
+	ELSE
+		dst[i+63:i] := src[i+63:i]
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vinserti64x4' form='zmm {k}, zmm, ymm, imm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512i" name="_mm512_maskz_inserti64x4">
+	<type>Integer</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Swizzle</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m512i"/>
+	<parameter varname="b" type="__m256i"/>
+	<parameter varname="imm8" type="int"/>
+	<description>Copy "a" to "tmp", then insert 256 bits (composed of 4 packed 64-bit integers) from "b" into "tmp" at the location specified by "imm8".  Store "tmp" to "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). 
+	</description>
+	<operation>
+tmp[511:0] := a[511:0]
+CASE (imm8[0]) of
+0: tmp[255:0] := b[255:0]
+1: tmp[511:256] := b[255:0]
+ESAC
+FOR j := 0 to 7
+	i := j*64
+	IF k[j]
+		dst[i+63:i] := tmp[i+63:i]
+	ELSE
+		dst[i+63:i] := 0
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vinserti64x4' form='zmm {k}, zmm, ymm, imm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512d" name="_mm512_mask_max_pd">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Special Math Functions</category>
+	<parameter varname="src" type="__m512d"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m512d"/>
+	<parameter varname="b" type="__m512d"/>
+	<description>Compare packed double-precision (64-bit) floating-point elements in "a" and "b", and store packed maximum values in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 7
+	i := j*64
+	IF k[j]
+		dst[i+63:i] := MAX(a[i+63:i], b[i+63:i])
+	ELSE
+		dst[i+63:i] := src[i+63:i]
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vmaxpd' form='zmm {k}, zmm, zmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512d" name="_mm512_mask_max_round_pd">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Special Math Functions</category>
+	<parameter varname="src" type="__m512d"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m512d"/>
+	<parameter varname="b" type="__m512d"/>
+	<parameter varname="sae" type="int"/>
+	<description>Compare packed double-precision (64-bit) floating-point elements in "a" and "b", and store packed maximum values in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).  
+	Pass _MM_FROUND_NO_EXC to "sae" to suppress all exceptions.</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*64
+	IF k[j]
+		dst[i+63:i] := MAX(a[i+63:i], b[i+63:i])
+	ELSE
+		dst[i+63:i] := src[i+63:i]
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vmaxpd' form='zmm {k}, zmm, zmm {sae}'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512d" name="_mm512_maskz_max_pd">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Special Math Functions</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m512d"/>
+	<parameter varname="b" type="__m512d"/>
+	<description>Compare packed double-precision (64-bit) floating-point elements in "a" and "b", and store packed maximum values in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*64
+	IF k[j]
+		dst[i+63:i] := MAX(a[i+63:i], b[i+63:i])
+	ELSE
+		dst[i+63:i] := 0
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vmaxpd' form='zmm {k}, zmm, zmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512d" name="_mm512_maskz_max_round_pd">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Special Math Functions</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m512d"/>
+	<parameter varname="b" type="__m512d"/>
+	<parameter varname="sae" type="int"/>
+	<description>Compare packed double-precision (64-bit) floating-point elements in "a" and "b", and store packed maximum values in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). 
+	Pass _MM_FROUND_NO_EXC to "sae" to suppress all exceptions.</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*64
+	IF k[j]
+		dst[i+63:i] := MAX(a[i+63:i], b[i+63:i])
+	ELSE
+		dst[i+63:i] := 0
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vmaxpd' form='zmm {k}, zmm, zmm {sae}'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512d" name="_mm512_max_pd">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Special Math Functions</category>
+	<parameter varname="a" type="__m512d"/>
+	<parameter varname="b" type="__m512d"/>
+	<description>Compare packed double-precision (64-bit) floating-point elements in "a" and "b", and store packed maximum values in "dst".</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*64
+	dst[i+63:i] := MAX(a[i+63:i], b[i+63:i])
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vmaxpd' form='zmm {k}, zmm, zmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512d" name="_mm512_max_round_pd">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Special Math Functions</category>
+	<parameter varname="a" type="__m512d"/>
+	<parameter varname="b" type="__m512d"/>
+	<parameter varname="sae" type="int"/>
+	<description>Compare packed double-precision (64-bit) floating-point elements in "a" and "b", and store packed maximum values in "dst". 
+	Pass _MM_FROUND_NO_EXC to "sae" to suppress all exceptions.</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*64
+	dst[i+63:i] := MAX(a[i+63:i], b[i+63:i])
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vmaxpd' form='zmm {k}, zmm, zmm {sae}'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512" name="_mm512_mask_max_ps">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Special Math Functions</category>
+	<parameter varname="src" type="__m512"/>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="a" type="__m512"/>
+	<parameter varname="b" type="__m512"/>
+	<description>Compare packed single-precision (32-bit) floating-point elements in "a" and "b", and store packed maximum values in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 15
+	i := j*32
+	IF k[j]
+		dst[i+31:i] := MAX(a[i+31:i], b[i+31:i])
+	ELSE
+		dst[i+31:i] := src[i+31:i]
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vmaxps' form='zmm {k}, zmm, zmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512" name="_mm512_mask_max_round_ps">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Special Math Functions</category>
+	<parameter varname="src" type="__m512"/>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="a" type="__m512"/>
+	<parameter varname="b" type="__m512"/>
+	<parameter varname="sae" type="int"/>
+	<description>Compare packed single-precision (32-bit) floating-point elements in "a" and "b", and store packed maximum values in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).  
+	Pass _MM_FROUND_NO_EXC to "sae" to suppress all exceptions.</description>
+	<operation>
+FOR j := 0 to 15
+	i := j*32
+	IF k[j]
+		dst[i+31:i] := MAX(a[i+31:i], b[i+31:i])
+	ELSE
+		dst[i+31:i] := src[i+31:i]
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vmaxps' form='zmm {k}, zmm, zmm {sae}'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512" name="_mm512_maskz_max_ps">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Special Math Functions</category>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="a" type="__m512"/>
+	<parameter varname="b" type="__m512"/>
+	<description>Compare packed single-precision (32-bit) floating-point elements in "a" and "b", and store packed maximum values in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 15
+	i := j*32
+	IF k[j]
+		dst[i+31:i] := MAX(a[i+31:i], b[i+31:i])
+	ELSE
+		dst[i+31:i] := 0
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vmaxps' form='zmm {k}, zmm, zmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512" name="_mm512_maskz_max_round_ps">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Special Math Functions</category>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="a" type="__m512"/>
+	<parameter varname="b" type="__m512"/>
+	<parameter varname="sae" type="int"/>
+	<description>Compare packed single-precision (32-bit) floating-point elements in "a" and "b", and store packed maximum values in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). 
+	Pass _MM_FROUND_NO_EXC to "sae" to suppress all exceptions.</description>
+	<operation>
+FOR j := 0 to 15
+	i := j*32
+	IF k[j]
+		dst[i+31:i] := MAX(a[i+31:i], b[i+31:i])
+	ELSE
+		dst[i+31:i] := 0
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vmaxps' form='zmm {k}, zmm, zmm {sae}'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512" name="_mm512_max_ps">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Special Math Functions</category>
+	<parameter varname="a" type="__m512"/>
+	<parameter varname="b" type="__m512"/>
+	<description>Compare packed single-precision (32-bit) floating-point elements in "a" and "b", and store packed maximum values in "dst".</description>
+	<operation>
+FOR j := 0 to 15
+	i := j*32
+	dst[i+31:i] := MAX(a[i+31:i], b[i+31:i])
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vmaxps' form='zmm {k}, zmm, zmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512" name="_mm512_max_round_ps">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Special Math Functions</category>
+	<parameter varname="a" type="__m512"/>
+	<parameter varname="b" type="__m512"/>
+	<parameter varname="sae" type="int"/>
+	<description>Compare packed single-precision (32-bit) floating-point elements in "a" and "b", and store packed maximum values in "dst". 
+	Pass _MM_FROUND_NO_EXC to "sae" to suppress all exceptions.</description>
+	<operation>
+FOR j := 0 to 15
+	i := j*32
+	dst[i+31:i] := MAX(a[i+31:i], b[i+31:i])
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vmaxps' form='zmm {k}, zmm, zmm {sae}'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128d" name="_mm_mask_max_round_sd">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Special Math Functions</category>
+	<parameter varname="src" type="__m128d"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128d"/>
+	<parameter varname="b" type="__m128d"/>
+	<parameter varname="sae" type="int"/>
+	<description>Compare the lower double-precision (64-bit) floating-point elements in "a" and "b", store the maximum value in the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst".
+	Pass _MM_FROUND_NO_EXC to "sae" to suppress all exceptions.</description>
+	<operation>
+IF k[0]
+	dst[63:0] := MAX(a[63:0], b[63:0])
+ELSE
+	dst[63:0] := src[63:0]
+FI
+dst[127:64] := a[127:64]
+dst[MAX:128] := 0
+	</operation>
+	<instruction name='vmaxsd' form='xmm {k}, xmm, xmm {sae}'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128d" name="_mm_mask_max_sd">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Special Math Functions</category>
+	<parameter varname="src" type="__m128d"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128d"/>
+	<parameter varname="b" type="__m128d"/>
+	<description>Compare the lower double-precision (64-bit) floating-point elements in "a" and "b", store the maximum value in the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst".</description>
+	<operation>
+IF k[0]
+	dst[63:0] := MAX(a[63:0], b[63:0])
+ELSE
+	dst[63:0] := src[63:0]
+FI
+dst[127:64] := a[127:64]
+dst[MAX:128] := 0
+	</operation>
+	<instruction name='vmaxsd' form='xmm {k}, xmm, xmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128d" name="_mm_maskz_max_round_sd">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Special Math Functions</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128d"/>
+	<parameter varname="b" type="__m128d"/>
+	<parameter varname="sae" type="int"/>
+	<description>Compare the lower double-precision (64-bit) floating-point elements in "a" and "b", store the maximum value in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst".
+	Pass _MM_FROUND_NO_EXC to "sae" to suppress all exceptions.</description>
+	<operation>
+IF k[0]
+	dst[63:0] := MAX(a[63:0], b[63:0])
+ELSE
+	dst[63:0] := 0
+FI
+dst[127:64] := a[127:64]
+dst[MAX:128] := 0
+	</operation>
+	<instruction name='vmaxsd' form='xmm {k}, xmm, xmm {sae}'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128d" name="_mm_maskz_max_sd">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Special Math Functions</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128d"/>
+	<parameter varname="b" type="__m128d"/>
+	<description>Compare the lower double-precision (64-bit) floating-point elements in "a" and "b", store the maximum value in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst".</description>
+	<operation>
+IF k[0]
+	dst[63:0] := MAX(a[63:0], b[63:0])
+ELSE
+	dst[63:0] := 0
+FI
+dst[127:64] := a[127:64]
+dst[MAX:128] := 0
+	</operation>
+	<instruction name='vmaxsd' form='xmm {k}, xmm, xmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128d" name="_mm_max_round_sd">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Special Math Functions</category>
+	<parameter varname="a" type="__m128d"/>
+	<parameter varname="b" type="__m128d"/>
+	<parameter varname="sae" type="int"/>
+	<description>Compare the lower double-precision (64-bit) floating-point elements in "a" and "b", store the maximum value in the lower element of "dst", and copy the upper element from "a" to the upper element of "dst".
+	Pass __MM_FROUND_NO_EXC to "sae" to suppress all exceptions.</description>
+	<operation>
+dst[63:0] := MAX(a[63:0], b[63:0])
+dst[127:64] := a[127:64]
+dst[MAX:128] := 0
+	</operation>
+	<instruction name='vmaxsd' form='xmm {k}, xmm, xmm {sae}'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128" name="_mm_mask_max_round_ss">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Special Math Functions</category>
+	<parameter varname="src" type="__m128"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128"/>
+	<parameter varname="b" type="__m128"/>
+	<parameter varname="sae" type="int"/>
+	<description>Compare the lower single-precision (32-bit) floating-point elements in "a" and "b", store the maximum value in the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst".
+	Pass __MM_FROUND_NO_EXC to "sae" to suppress all exceptions.</description>
+	<operation>
+IF k[0]
+	dst[31:0] := MAX(a[31:0], b[31:0])
+ELSE
+	dst[31:0] := src[31:0]
+FI
+dst[127:32] := a[127:32]
+dst[MAX:128] := 0
+	</operation>
+	<instruction name='vmaxss' form='xmm {k}, xmm, xmm {sae}'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128" name="_mm_mask_max_ss">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Special Math Functions</category>
+	<parameter varname="src" type="__m128"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128"/>
+	<parameter varname="b" type="__m128"/>
+	<description>Compare the lower single-precision (32-bit) floating-point elements in "a" and "b", store the maximum value in the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst".</description>
+	<operation>
+IF k[0]
+	dst[31:0] := MAX(a[31:0], b[31:0])
+ELSE
+	dst[31:0] := src[31:0]
+FI
+dst[127:32] := a[127:32]
+dst[MAX:128] := 0
+	</operation>
+	<instruction name='vmaxss' form='xmm {k}, xmm, xmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128" name="_mm_maskz_max_round_ss">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Special Math Functions</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128"/>
+	<parameter varname="b" type="__m128"/>
+	<parameter varname="sae" type="int"/>
+	<description>Compare the lower single-precision (32-bit) floating-point elements in "a" and "b", store the maximum value in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst".
+	Pass __MM_FROUND_NO_EXC to "sae" to suppress all exceptions.</description>
+	<operation>
+IF k[0]
+	dst[31:0] := MAX(a[31:0], b[31:0])
+ELSE
+	dst[31:0] := 0
+FI
+dst[127:32] := a[127:32]
+dst[MAX:128] := 0
+	</operation>
+	<instruction name='vmaxss' form='xmm {k}, xmm, xmm {sae}'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128" name="_mm_maskz_max_ss">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Special Math Functions</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128"/>
+	<parameter varname="b" type="__m128"/>
+	<description>Compare the lower single-precision (32-bit) floating-point elements in "a" and "b", store the maximum value in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst".</description>
+	<operation>
+IF k[0]
+	dst[31:0] := MAX(a[31:0], b[31:0])
+ELSE
+	dst[31:0] := 0
+FI
+dst[127:32] := a[127:32]
+dst[MAX:128] := 0
+	</operation>
+	<instruction name='vmaxss' form='xmm {k}, xmm, xmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128" name="_mm_max_round_ss">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Special Math Functions</category>
+	<parameter varname="a" type="__m128"/>
+	<parameter varname="b" type="__m128"/>
+	<parameter varname="sae" type="int"/>
+	<description>Compare the lower single-precision (32-bit) floating-point elements in "a" and "b", store the maximum value in the lower element of "dst", and copy the upper element from "a" to the upper element of "dst".
+	Pass __MM_FROUND_NO_EXC to "sae" to suppress all exceptions.</description>
+	<operation>
+dst[31:0] := MAX(a[31:0], b[31:0])
+dst[127:32] := a[127:32]
+dst[MAX:128] := 0
+	</operation>
+	<instruction name='vmaxss' form='xmm {k}, xmm, xmm {sae}'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512d" name="_mm512_mask_min_pd">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Special Math Functions</category>
+	<parameter varname="src" type="__m512d"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m512d"/>
+	<parameter varname="b" type="__m512d"/>
+	<description>Compare packed double-precision (64-bit) floating-point elements in "a" and "b", and store packed minimum values in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 7
+	i := j*64
+	IF k[j]
+		dst[i+63:i] := MIN(a[i+63:i], b[i+63:i])
+	ELSE
+		dst[i+63:i] := src[i+63:i]
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vminpd' form='zmm {k}, zmm, zmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512d" name="_mm512_mask_min_round_pd">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Special Math Functions</category>
+	<parameter varname="src" type="__m512d"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m512d"/>
+	<parameter varname="b" type="__m512d"/>
+	<parameter varname="sae" type="int"/>
+	<description>Compare packed double-precision (64-bit) floating-point elements in "a" and "b", and store packed minimum values in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).
+	Pass __MM_FROUND_NO_EXC to "sae" to suppress all exceptions.</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*64
+	IF k[j]
+		dst[i+63:i] := MIN(a[i+63:i], b[i+63:i])
+	ELSE
+		dst[i+63:i] := src[i+63:i]
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vminpd' form='zmm {k}, zmm, zmm {sae}'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512d" name="_mm512_maskz_min_pd">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Special Math Functions</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m512d"/>
+	<parameter varname="b" type="__m512d"/>
+	<description>Compare packed double-precision (64-bit) floating-point elements in "a" and "b", and store packed minimum values in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*64
+	IF k[j]
+		dst[i+63:i] := MIN(a[i+63:i], b[i+63:i])
+	ELSE
+		dst[i+63:i] := 0
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vminpd' form='zmm {k}, zmm, zmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512d" name="_mm512_maskz_min_round_pd">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Special Math Functions</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m512d"/>
+	<parameter varname="b" type="__m512d"/>
+	<parameter varname="sae" type="int"/>
+	<description>Compare packed double-precision (64-bit) floating-point elements in "a" and "b", and store packed minimum values in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). 
+	Pass __MM_FROUND_NO_EXC to "sae" to suppress all exceptions.</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*64
+	IF k[j]
+		dst[i+63:i] := MIN(a[i+63:i], b[i+63:i])
+	ELSE
+		dst[i+63:i] := 0
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vminpd' form='zmm {k}, zmm, zmm {sae}'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512d" name="_mm512_min_pd">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Special Math Functions</category>
+	<parameter varname="a" type="__m512d"/>
+	<parameter varname="b" type="__m512d"/>
+	<description>Compare packed double-precision (64-bit) floating-point elements in "a" and "b", and store packed minimum values in "dst".
+	</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*64
+	dst[i+63:i] := MIN(a[i+63:i], b[i+63:i])
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vminpd' form='zmm {k}, zmm, zmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512d" name="_mm512_min_round_pd">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Special Math Functions</category>
+	<parameter varname="a" type="__m512d"/>
+	<parameter varname="b" type="__m512d"/>
+	<parameter varname="sae" type="int"/>
+	<description>Compare packed double-precision (64-bit) floating-point elements in "a" and "b", and store packed minimum values in "dst". 
+	Pass __MM_FROUND_NO_EXC to "sae" to suppress all exceptions.</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*64
+	dst[i+63:i] := MIN(a[i+63:i], b[i+63:i])
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vminpd' form='zmm {k}, zmm, zmm {sae}'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512" name="_mm512_mask_min_ps">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Special Math Functions</category>
+	<parameter varname="src" type="__m512"/>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="a" type="__m512"/>
+	<parameter varname="b" type="__m512"/>
+	<description>Compare packed single-precision (32-bit) floating-point elements in "a" and "b", and store packed minimum values in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 15
+	i := j*32
+	IF k[j]
+		dst[i+31:i] := MIN(a[i+31:i], b[i+31:i])
+	ELSE
+		dst[i+31:i] := src[i+31:i]
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vminps' form='zmm {k}, zmm, zmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512" name="_mm512_mask_min_round_ps">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Special Math Functions</category>
+	<parameter varname="src" type="__m512"/>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="a" type="__m512"/>
+	<parameter varname="b" type="__m512"/>
+	<parameter varname="sae" type="int"/>
+	<description>Compare packed single-precision (32-bit) floating-point elements in "a" and "b", and store packed minimum values in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).
+	Pass __MM_FROUND_NO_EXC to "sae" to suppress all exceptions.</description>
+	<operation>
+FOR j := 0 to 15
+	i := j*32
+	IF k[j]
+		dst[i+31:i] := MIN(a[i+31:i], b[i+31:i])
+	ELSE
+		dst[i+31:i] := src[i+31:i]
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vminps' form='zmm {k}, zmm, zmm {sae}'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512" name="_mm512_maskz_min_ps">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Special Math Functions</category>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="a" type="__m512"/>
+	<parameter varname="b" type="__m512"/>
+	<description>Compare packed single-precision (32-bit) floating-point elements in "a" and "b", and store packed minimum values in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 15
+	i := j*32
+	IF k[j]
+		dst[i+31:i] := MIN(a[i+31:i], b[i+31:i])
+	ELSE
+		dst[i+31:i] := 0
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vminps' form='zmm {k}, zmm, zmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512" name="_mm512_maskz_min_round_ps">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Special Math Functions</category>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="a" type="__m512"/>
+	<parameter varname="b" type="__m512"/>
+	<parameter varname="sae" type="int"/>
+	<description>Compare packed single-precision (32-bit) floating-point elements in "a" and "b", and store packed minimum values in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). 
+	Pass __MM_FROUND_NO_EXC to "sae" to suppress all exceptions.</description>
+	<operation>
+FOR j := 0 to 15
+	i := j*32
+	IF k[j]
+		dst[i+31:i] := MIN(a[i+31:i], b[i+31:i])
+	ELSE
+		dst[i+31:i] := 0
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vminps' form='zmm {k}, zmm, zmm {sae}'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512" name="_mm512_min_ps">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Special Math Functions</category>
+	<parameter varname="a" type="__m512"/>
+	<parameter varname="b" type="__m512"/>
+	<description>Compare packed single-precision (32-bit) floating-point elements in "a" and "b", and store packed minimum values in "dst".
+	</description>
+	<operation>
+FOR j := 0 to 15
+	i := j*32
+	dst[i+31:i] := MIN(a[i+31:i], b[i+31:i])
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vminps' form='zmm {k}, zmm, zmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512" name="_mm512_min_round_ps">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Special Math Functions</category>
+	<parameter varname="a" type="__m512"/>
+	<parameter varname="b" type="__m512"/>
+	<parameter varname="sae" type="int"/>
+	<description>Compare packed single-precision (32-bit) floating-point elements in "a" and "b", and store packed minimum values in "dst". 
+	Pass __MM_FROUND_NO_EXC to "sae" to suppress all exceptions.</description>
+	<operation>
+FOR j := 0 to 15
+	i := j*32
+	dst[i+31:i] := MIN(a[i+31:i], b[i+31:i])
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vminps' form='zmm {k}, zmm, zmm {sae}'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128d" name="_mm_mask_min_round_sd">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Special Math Functions</category>
+	<parameter varname="src" type="__m128d"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128d"/>
+	<parameter varname="b" type="__m128d"/>
+	<parameter varname="sae" type="int"/>
+	<description>Compare the lower double-precision (64-bit) floating-point elements in "a" and "b", store the minimum value in the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst".
+	Pass __MM_FROUND_NO_EXC to "sae" to suppress all exceptions.</description>
+	<operation>
+IF k[0]
+	dst[63:0] := MIN(a[63:0], b[63:0])
+ELSE
+	dst[63:0] := src[63:0]
+FI
+dst[127:64] := a[127:64]
+dst[MAX:128] := 0
+	</operation>
+	<instruction name='vminsd' form='xmm {k}, xmm, xmm {sae}'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128d" name="_mm_mask_min_sd">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Special Math Functions</category>
+	<parameter varname="src" type="__m128d"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128d"/>
+	<parameter varname="b" type="__m128d"/>
+	<description>Compare the lower double-precision (64-bit) floating-point elements in "a" and "b", store the minimum value in the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst".</description>
+	<operation>
+IF k[0]
+	dst[63:0] := MIN(a[63:0], b[63:0])
+ELSE
+	dst[63:0] := src[63:0]
+FI
+dst[127:64] := a[127:64]
+dst[MAX:128] := 0
+	</operation>
+	<instruction name='vminsd' form='xmm {k}, xmm, xmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128d" name="_mm_maskz_min_round_sd">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Special Math Functions</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128d"/>
+	<parameter varname="b" type="__m128d"/>
+	<parameter varname="sae" type="int"/>
+	<description>Compare the lower double-precision (64-bit) floating-point elements in "a" and "b", store the minimum value in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst".
+	Pass __MM_FROUND_NO_EXC to "sae" to suppress all exceptions.</description>
+	<operation>
+IF k[0]
+	dst[63:0] := MIN(a[63:0], b[63:0])
+ELSE
+	dst[63:0] := 0
+FI
+dst[127:64] := a[127:64]
+dst[MAX:128] := 0
+	</operation>
+	<instruction name='vminsd' form='xmm {k}, xmm, xmm {sae}'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128d" name="_mm_maskz_min_sd">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Special Math Functions</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128d"/>
+	<parameter varname="b" type="__m128d"/>
+	<description>Compare the lower double-precision (64-bit) floating-point elements in "a" and "b", store the minimum value in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst".</description>
+	<operation>
+IF k[0]
+	dst[63:0] := MIN(a[63:0], b[63:0])
+ELSE
+	dst[63:0] := 0
+FI
+dst[127:64] := a[127:64]
+dst[MAX:128] := 0
+	</operation>
+	<instruction name='vminsd' form='xmm {k}, xmm, xmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128d" name="_mm_min_round_sd">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Special Math Functions</category>
+	<parameter varname="a" type="__m128d"/>
+	<parameter varname="b" type="__m128d"/>
+	<parameter varname="sae" type="int"/>
+	<description>Compare the lower double-precision (64-bit) floating-point elements in "a" and "b", store the minimum value in the lower element of "dst", and copy the upper element from "a" to the upper element of "dst".
+	Pass __MM_FROUND_NO_EXC to "sae" to suppress all exceptions.</description>
+	<operation>
+dst[63:0] := MIN(a[63:0], b[63:0])
+dst[127:64] := a[127:64]
+dst[MAX:128] := 0
+	</operation>
+	<instruction name='vminsd' form='xmm {k}, xmm, xmm {sae}'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128" name="_mm_mask_min_round_ss">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Special Math Functions</category>
+	<parameter varname="src" type="__m128"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128"/>
+	<parameter varname="b" type="__m128"/>
+	<parameter varname="sae" type="int"/>
+	<description>Compare the lower single-precision (32-bit) floating-point elements in "a" and "b", store the minimum value in the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst".
+	Pass __MM_FROUND_NO_EXC to "sae" to suppress all exceptions.</description>
+	<operation>
+IF k[0]
+	dst[31:0] := MIN(a[31:0], b[31:0])
+ELSE
+	dst[31:0] := src[31:0]
+FI
+dst[127:32] := a[127:32]
+dst[MAX:128] := 0
+	</operation>
+	<instruction name='vminss' form='xmm {k}, xmm, xmm {sae}'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128" name="_mm_mask_min_ss">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Special Math Functions</category>
+	<parameter varname="src" type="__m128"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128"/>
+	<parameter varname="b" type="__m128"/>
+	<description>Compare the lower single-precision (32-bit) floating-point elements in "a" and "b", store the minimum value in the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst".</description>
+	<operation>
+IF k[0]
+	dst[31:0] := MIN(a[31:0], b[31:0])
+ELSE
+	dst[31:0] := src[31:0]
+FI
+dst[127:32] := a[127:32]
+dst[MAX:128] := 0
+	</operation>
+	<instruction name='vminss' form='xmm {k}, xmm, xmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128" name="_mm_maskz_min_round_ss">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Special Math Functions</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128"/>
+	<parameter varname="b" type="__m128"/>
+	<parameter varname="sae" type="int"/>
+	<description>Compare the lower single-precision (32-bit) floating-point elements in "a" and "b", store the minimum value in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst".
+	Pass __MM_FROUND_NO_EXC to "sae" to suppress all exceptions.</description>
+	<operation>
+IF k[0]
+	dst[31:0] := MIN(a[31:0], b[31:0])
+ELSE
+	dst[31:0] := 0
+FI
+dst[127:32] := a[127:32]
+dst[MAX:128] := 0
+	</operation>
+	<instruction name='vminss' form='xmm {k}, xmm, xmm {sae}'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128" name="_mm_maskz_min_ss">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Special Math Functions</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128"/>
+	<parameter varname="b" type="__m128"/>
+	<description>Compare the lower single-precision (32-bit) floating-point elements in "a" and "b", store the minimum value in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst".</description>
+	<operation>
+IF k[0]
+	dst[31:0] := MIN(a[31:0], b[31:0])
+ELSE
+	dst[31:0] := 0
+FI
+dst[127:32] := a[127:32]
+dst[MAX:128] := 0
+	</operation>
+	<instruction name='vminss' form='xmm {k}, xmm, xmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128" name="_mm_min_round_ss">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Special Math Functions</category>
+	<parameter varname="a" type="__m128"/>
+	<parameter varname="b" type="__m128"/>
+	<parameter varname="sae" type="int"/>
+	<description>Compare the lower single-precision (32-bit) floating-point elements in "a" and "b", store the minimum value in the lower element of "dst", and copy the upper element from "a" to the upper element of "dst".
+	Pass __MM_FROUND_NO_EXC to "sae" to suppress all exceptions.</description>
+	<operation>
+dst[31:0] := MIN(a[31:0], b[31:0])
+dst[127:32] := a[127:32]
+dst[MAX:128] := 0
+	</operation>
+	<instruction name='vminss' form='xmm {k}, xmm, xmm {sae}'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" rettype="__m512d" name="_mm512_load_pd">
+	<type>Floating Point</type>
+	<CPUID>AVX512F/KNCNI</CPUID>
+	<category>Load</category>
+	<parameter varname="mem_addr" type="void const*"/>
+	<description>Load 512-bits (composed of 8 packed double-precision (64-bit) floating-point elements) from memory into "dst". 
+	"mem_addr" must be aligned on a 64-byte boundary or a general-protection exception may be generated.</description>
+	<operation>
+dst[511:0] := MEM[mem_addr+511:mem_addr]
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vmovapd' form='zmm {k}, m512'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" rettype="__m512d" name="_mm512_mask_load_pd">
+	<type>Floating Point</type>
+	<CPUID>AVX512F/KNCNI</CPUID>
+	<category>Load</category>
+	<parameter varname="src" type="__m512d"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="mem_addr" type="void const*"/>
+	<description>Load packed double-precision (64-bit) floating-point elements from memory into "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). "mem_addr" must be aligned on a 64-byte boundary or a general-protection exception may be generated.</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*64
+	IF k[j]
+		dst[i+63:i] := MEM[mem_addr+i+63:mem_addr+i]
+	ELSE
+		dst[i+63:i] := src[i+63:i]
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vmovapd' form='zmm {k}, m512'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" rettype="__m512d" name="_mm512_mask_mov_pd">
+	<type>Floating Point</type>
+	<CPUID>AVX512F/KNCNI</CPUID>
+	<category>Move</category>
+	<parameter varname="src" type="__m512d"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m512d"/>
+	<description>Move packed double-precision (64-bit) floating-point elements from "a" to "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 7
+	i := j*64
+	IF k[j]
+		dst[i+63:i] := a[i+63:i]
+	ELSE
+		dst[i+63:i] := src[i+63:i]
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vmovapd' form='zmm {k}, zmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" rettype="void" name="_mm512_mask_store_pd">
+	<type>Floating Point</type>
+	<CPUID>AVX512F/KNCNI</CPUID>
+	<category>Store</category>
+	<parameter varname="mem_addr" type="void*"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m512d"/>
+	<description>Store packed double-precision (64-bit) floating-point elements from "a" into memory using writemask "k".
+	"mem_addr" must be aligned on a 64-byte boundary or a general-protection exception may be generated.</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*64
+	IF k[j]
+		MEM[mem_addr+i+63:mem_addr+i] := a[i+63:i]
+	FI
+ENDFOR
+	</operation>
+	<instruction name='vmovapd' form='m512 {k}, zmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512d" name="_mm512_maskz_load_pd">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Load</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="mem_addr" type="void const*"/>
+	<description>Load packed double-precision (64-bit) floating-point elements from memory into "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). "mem_addr" must be aligned on a 64-byte boundary or a general-protection exception may be generated.</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*64
+	IF k[j]
+		dst[i+63:i] := MEM[mem_addr+i+63:mem_addr+i]
+	ELSE
+		dst[i+63:i] := 0
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vmovapd' form='zmm {k}, m512'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512d" name="_mm512_maskz_mov_pd">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Move</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m512d"/>
+	<description>Move packed double-precision (64-bit) floating-point elements from "a" into "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*64
+	IF k[j]
+		dst[i+63:i] := a[i+63:i]
+	ELSE
+		dst[i+63:i] := 0
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vmovapd' form='zmm {k}, zmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" rettype="void" name="_mm512_store_pd">
+	<type>Floating Point</type>
+	<CPUID>AVX512F/KNCNI</CPUID>
+	<category>Store</category>
+	<parameter varname="mem_addr" type="void*"/>
+	<parameter varname="a" type="__m512d"/>
+	<description>Store 512-bits (composed of 8 packed double-precision (64-bit) floating-point elements) from "a" into memory.
+	"mem_addr" must be aligned on a 64-byte boundary or a general-protection exception may be generated.</description>
+	<operation>
+MEM[mem_addr+511:mem_addr] := a[511:0]
+	</operation>
+	<instruction name='vmovapd' form='m512 {k}, zmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" rettype="__m512" name="_mm512_load_ps">
+	<type>Floating Point</type>
+	<CPUID>AVX512F/KNCNI</CPUID>
+	<category>Load</category>
+	<parameter varname="mem_addr" type="void const*"/>
+	<description>Load 512-bits (composed of 16 packed single-precision (32-bit) floating-point elements) from memory into "dst". 
+	"mem_addr" must be aligned on a 64-byte boundary or a general-protection exception may be generated.</description>
+	<operation>
+dst[511:0] := MEM[mem_addr+511:mem_addr]
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vmovaps' form='zmm {k}, m512'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" rettype="__m512" name="_mm512_mask_load_ps">
+	<type>Floating Point</type>
+	<CPUID>AVX512F/KNCNI</CPUID>
+	<category>Load</category>
+	<parameter varname="src" type="__m512"/>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="mem_addr" type="void const*"/>
+	<description>Load packed single-precision (32-bit) floating-point elements from memory into "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). "mem_addr" must be aligned on a 64-byte boundary or a general-protection exception may be generated.</description>
+	<operation>
+FOR j := 0 to 15
+	i := j*32
+	IF k[j]
+		dst[i+31:i] := MEM[mem_addr+i+31:mem_addr+i]
+	ELSE
+		dst[i+31:i] := src[i+31:i]
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vmovaps' form='zmm {k}, m512'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" rettype="__m512" name="_mm512_mask_mov_ps">
+	<type>Floating Point</type>
+	<CPUID>AVX512F/KNCNI</CPUID>
+	<category>Move</category>
+	<parameter varname="src" type="__m512"/>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="a" type="__m512"/>
+	<description>Move packed single-precision (32-bit) floating-point elements from "a" to "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 15
+	i := j*32
+	IF k[j]
+		dst[i+31:i] := a[i+31:i]
+	ELSE
+		dst[i+31:i] := src[i+31:i]
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vmovaps' form='zmm {k}, zmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" rettype="void" name="_mm512_mask_store_ps">
+	<type>Floating Point</type>
+	<CPUID>AVX512F/KNCNI</CPUID>
+	<category>Store</category>
+	<parameter varname="mem_addr" type="void*"/>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="a" type="__m512"/>
+	<description>Store packed single-precision (32-bit) floating-point elements from "a" into memory using writemask "k".
+	"mem_addr" must be aligned on a 64-byte boundary or a general-protection exception may be generated.</description>
+	<operation>
+FOR j := 0 to 15
+	i := j*32
+	IF k[j]
+		MEM[mem_addr+i+31:mem_addr+i] := a[i+31:i]
+	FI
+ENDFOR
+	</operation>
+	<instruction name='vmovaps' form='m512 {k}, zmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512" name="_mm512_maskz_load_ps">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Load</category>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="mem_addr" type="void const*"/>
+	<description>Load packed single-precision (32-bit) floating-point elements from memory into "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). "mem_addr" must be aligned on a 64-byte boundary or a general-protection exception may be generated.</description>
+	<operation>
+FOR j := 0 to 15
+	i := j*32
+	IF k[j]
+		dst[i+31:i] := MEM[mem_addr+i+31:mem_addr+i]
+	ELSE
+		dst[i+31:i] := 0
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vmovaps' form='zmm {k}, m512'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512" name="_mm512_maskz_mov_ps">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Move</category>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="a" type="__m512"/>
+	<description>Move packed single-precision (32-bit) floating-point elements from "a" into "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 15
+	i := j*32
+	IF k[j]
+		dst[i+31:i] := a[i+31:i]
+	ELSE
+		dst[i+31:i] := 0
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vmovaps' form='zmm {k}, zmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" rettype="void" name="_mm512_store_ps">
+	<type>Floating Point</type>
+	<CPUID>AVX512F/KNCNI</CPUID>
+	<category>Store</category>
+	<parameter varname="mem_addr" type="void*"/>
+	<parameter varname="a" type="__m512"/>
+	<description>Store 512-bits (composed of 16 packed single-precision (32-bit) floating-point elements) from "a" into memory. 
+	"mem_addr" must be aligned on a 64-byte boundary or a general-protection exception may be generated.</description>
+	<operation>
+MEM[mem_addr+511:mem_addr] := a[511:0]
+	</operation>
+	<instruction name='vmovaps' form='m512 {k}, zmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512d" name="_mm512_mask_movedup_pd">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Move</category>
+	<parameter varname="src" type="__m512d"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m512d"/>
+	<description>Duplicate even-indexed double-precision (64-bit) floating-point elements from "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+	<operation>
+tmp[63:0] := a[63:0]
+tmp[127:64] := a[63:0]
+tmp[191:128] := a[191:128]
+tmp[255:192] := a[191:128]
+tmp[319:256] := a[319:256] 
+tmp[383:320] := a[319:256] 
+tmp[447:384] := a[447:384]
+tmp[511:448] := a[447:384]
+FOR j := 0 to 7
+	i := j*64
+	IF k[j]
+		dst[i+63:i] := tmp[i+63:i]
+	ELSE
+		dst[i+63:i] := src[i+63:i]
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vmovddup' form='zmm {k}, zmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512d" name="_mm512_maskz_movedup_pd">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Move</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m512d"/>
+	<description>Duplicate even-indexed double-precision (64-bit) floating-point elements from "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). </description>
+	<operation>
+tmp[63:0] := a[63:0]
+tmp[127:64] := a[63:0]
+tmp[191:128] := a[191:128]
+tmp[255:192] := a[191:128]
+tmp[319:256] := a[319:256] 
+tmp[383:320] := a[319:256] 
+tmp[447:384] := a[447:384]
+tmp[511:448] := a[447:384]
+FOR j := 0 to 7
+	i := j*64
+	IF k[j]
+		dst[i+63:i] := tmp[i+63:i]
+	ELSE
+		dst[i+63:i] := 0
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vmovddup' form='zmm {k}, zmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512d" name="_mm512_movedup_pd">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Move</category>
+	<parameter varname="a" type="__m512d"/>
+	<description>Duplicate even-indexed double-precision (64-bit) floating-point elements from "a", and store the results in "dst".
+	</description>
+	<operation>
+dst[63:0] := a[63:0]
+dst[127:64] := a[63:0]
+dst[191:128] := a[191:128]
+dst[255:192] := a[191:128]
+dst[319:256] := a[319:256]
+dst[383:320] := a[319:256]
+dst[447:384] := a[447:384]
+dst[511:448] := a[447:384]
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vmovddup' form='zmm {k}, zmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" rettype="__m512i" name="_mm512_load_epi32">
+	<type>Integer</type>
+	<CPUID>AVX512F/KNCNI</CPUID>
+	<category>Load</category>
+	<parameter varname="mem_addr" type="void const*"/>
+	<description>Load 512-bits (composed of 16 packed 32-bit integers) from memory into "dst". 
+	"mem_addr" must be aligned on a 64-byte boundary or a general-protection exception may be generated.</description>
+	<operation>
+dst[511:0] := MEM[mem_addr+511:mem_addr]
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vmovdqa32' form='zmm {k}, m512'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" rettype="__m512i" name="_mm512_load_si512">
+	<type>Integer</type>
+	<CPUID>AVX512F/KNCNI</CPUID>
+	<category>Load</category>
+	<parameter varname="mem_addr" type="void const*"/>
+	<description>Load 512-bits of integer data from memory into "dst". 
+	"mem_addr" must be aligned on a 64-byte boundary or a general-protection exception may be generated.</description>
+	<operation>
+dst[511:0] := MEM[mem_addr+511:mem_addr]
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vmovdqa32' form='zmm {k}, m512'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" rettype="__m512i" name="_mm512_mask_load_epi32">
+	<type>Integer</type>
+	<CPUID>AVX512F/KNCNI</CPUID>
+	<category>Load</category>
+	<parameter varname="src" type="__m512i"/>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="mem_addr" type="void const*"/>
+	<description>Load packed 32-bit integers from memory into "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).
+	"mem_addr" must be aligned on a 64-byte boundary or a general-protection exception may be generated.</description>
+	<operation>
+FOR j := 0 to 15
+	i := j*32
+	IF k[j]
+		dst[i+31:i] := MEM[mem_addr+i+31:mem_addr+i]
+	ELSE
+		dst[i+31:i] := src[i+31:i]
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vmovdqa32' form='zmm {k}, m512'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" rettype="__m512i" name="_mm512_mask_mov_epi32">
+	<type>Integer</type>
+	<CPUID>AVX512F/KNCNI</CPUID>
+	<category>Move</category>
+	<parameter varname="src" type="__m512i"/>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="a" type="__m512i"/>
+	<description>Move packed 32-bit integers from "a" to "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 15
+	i := j*32
+	IF k[j]
+		dst[i+31:i] := a[i+31:i]
+	ELSE
+		dst[i+31:i] := src[i+31:i]
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vmovdqa32' form='zmm {k}, zmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" rettype="void" name="_mm512_mask_store_epi32">
+	<type>Integer</type>
+	<CPUID>AVX512F/KNCNI</CPUID>
+	<category>Store</category>
+	<parameter varname="mem_addr" type="void*"/>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="a" type="__m512i"/>
+	<description>Store packed 32-bit integers from "a" into memory using writemask "k".
+	"mem_addr" must be aligned on a 64-byte boundary or a general-protection exception may be generated.
+	</description>
+	<operation>
+FOR j := 0 to 15
+	i := j*32
+	IF k[j]
+		MEM[mem_addr+i+31:mem_addr+i] := a[i+31:i]
+	FI
+ENDFOR
+	</operation>
+	<instruction name='vmovdqa32' form='m512 {k}, zmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512i" name="_mm512_maskz_load_epi32">
+	<type>Integer</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Load</category>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="mem_addr" type="void const*"/>
+	<description>Load packed 32-bit integers from memory into "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). 
+	"mem_addr" must be aligned on a 64-byte boundary or a general-protection exception may be generated.</description>
+	<operation>
+FOR j := 0 to 15
+	i := j*32
+	IF k[j]
+		dst[i+31:i] := MEM[mem_addr+i+31:mem_addr+i]
+	ELSE
+		dst[i+31:i] := 0
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vmovdqa32' form='zmm {k}, m512'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512i" name="_mm512_maskz_mov_epi32">
+	<type>Integer</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Move</category>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="a" type="__m512i"/>
+	<description>Move packed 32-bit integers from "a" into "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 15
+	i := j*32
+	IF k[j]
+		dst[i+31:i] := a[i+31:i]
+	ELSE
+		dst[i+31:i] := 0
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vmovdqa32' form='zmm {k}, zmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" rettype="void" name="_mm512_store_epi32">
+	<type>Integer</type>
+	<CPUID>AVX512F/KNCNI</CPUID>
+	<category>Store</category>
+	<parameter varname="mem_addr" type="void*"/>
+	<parameter varname="a" type="__m512i"/>
+	<description>Store 512-bits (composed of 16 packed 32-bit integers) from "a" into memory. 
+	"mem_addr" must be aligned on a 64-byte boundary or a general-protection exception may be generated.</description>
+	<operation>
+MEM[mem_addr+511:mem_addr] := a[511:0]
+	</operation>
+	<instruction name='vmovdqa32' form='m512 {k}, zmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" rettype="void" name="_mm512_store_si512">
+	<type>Integer</type>
+	<CPUID>AVX512F/KNCNI</CPUID>
+	<category>Store</category>
+	<parameter varname="mem_addr" type="void*"/>
+	<parameter varname="a" type="__m512i"/>
+	<description>Store 512-bits of integer data from "a" into memory. 
+	"mem_addr" must be aligned on a 64-byte boundary or a general-protection exception may be generated.</description>
+	<operation>
+MEM[mem_addr+511:mem_addr] := a[511:0]
+	</operation>
+	<instruction name='vmovdqa32' form='m512 {k}, zmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" rettype="__m512i" name="_mm512_load_epi64">
+	<type>Integer</type>
+	<CPUID>AVX512F/KNCNI</CPUID>
+	<category>Load</category>
+	<parameter varname="mem_addr" type="void const*"/>
+	<description>Load 512-bits (composed of 8 packed 64-bit integers) from memory into "dst". 
+	"mem_addr" must be aligned on a 64-byte boundary or a general-protection exception may be generated.</description>
+	<operation>
+dst[511:0] := MEM[mem_addr+511:mem_addr]
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vmovdqa64' form='zmm {k}, m512'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" rettype="__m512i" name="_mm512_mask_load_epi64">
+	<type>Integer</type>
+	<CPUID>AVX512F/KNCNI</CPUID>
+	<category>Load</category>
+	<parameter varname="src" type="__m512i"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="mem_addr" type="void const*"/>
+	<description>Load packed 64-bit integers from memory into "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). 
+	"mem_addr" must be aligned on a 64-byte boundary or a general-protection exception may be generated.</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*64
+	IF k[j]
+		dst[i+63:i] := MEM[mem_addr+i+63:mem_addr+i]
+	ELSE
+		dst[i+63:i] := src[i+63:i]
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vmovdqa64' form='zmm {k}, m512'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" rettype="__m512i" name="_mm512_mask_mov_epi64">
+	<type>Integer</type>
+	<CPUID>AVX512F/KNCNI</CPUID>
+	<category>Move</category>
+	<parameter varname="src" type="__m512i"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m512i"/>
+	<description>Move packed 64-bit integers from "a" to "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 7
+	i := j*64
+	IF k[j]
+		dst[i+63:i] := a[i+63:i]
+	ELSE
+		dst[i+63:i] := src[i+63:i]
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vmovdqa64' form='zmm {k}, zmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" rettype="void" name="_mm512_mask_store_epi64">
+	<type>Integer</type>
+	<CPUID>AVX512F/KNCNI</CPUID>
+	<category>Store</category>
+	<parameter varname="mem_addr" type="void*"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m512i"/>
+	<description>Store packed 64-bit integers from "a" into memory using writemask "k".
+	"mem_addr" must be aligned on a 64-byte boundary or a general-protection exception may be generated.
+	</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*64
+	IF k[j]
+		MEM[mem_addr+i+63:mem_addr+i] := a[i+63:i]
+	FI
+ENDFOR
+	</operation>
+	<instruction name='vmovdqa64' form='m512 {k}, zmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512i" name="_mm512_maskz_load_epi64">
+	<type>Integer</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Load</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="mem_addr" type="void const*"/>
+	<description>Load packed 64-bit integers from memory into "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). 
+	"mem_addr" must be aligned on a 64-byte boundary or a general-protection exception may be generated.
+	</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*64
+	IF k[j]
+		dst[i+63:i] := MEM[mem_addr+i+63:mem_addr+i]
+	ELSE
+		dst[i+63:i] := 0
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vmovdqa64' form='zmm {k}, m512'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512i" name="_mm512_maskz_mov_epi64">
+	<type>Integer</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Move</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m512i"/>
+	<description>Move packed 64-bit integers from "a" into "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*64
+	IF k[j]
+		dst[i+63:i] := a[i+63:i]
+	ELSE
+		dst[i+63:i] := 0
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vmovdqa64' form='zmm {k}, zmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" rettype="void" name="_mm512_store_epi64">
+	<type>Integer</type>
+	<CPUID>AVX512F/KNCNI</CPUID>
+	<category>Store</category>
+	<parameter varname="mem_addr" type="void*"/>
+	<parameter varname="a" type="__m512i"/>
+	<description>Store 512-bits (composed of 8 packed 64-bit integers) from "a" into memory. 
+	"mem_addr" must be aligned on a 64-byte boundary or a general-protection exception may be generated.</description>
+	<operation>
+MEM[mem_addr+511:mem_addr] := a[511:0]
+	</operation>
+	<instruction name='vmovdqa64' form='m512 {k}, zmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512i" name="_mm512_loadu_si512">
+	<type>Integer</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Load</category>
+	<parameter varname="mem_addr" type="void const*"/>
+	<description>Load 512-bits of integer data from memory into "dst".
+	"mem_addr" does not need to be aligned on any particular boundary.
+	</description>
+	<operation>
+dst[511:0] := MEM[mem_addr+511:mem_addr]
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vmovdqu32' form='zmm {k}, m512'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512i" name="_mm512_mask_loadu_epi32">
+	<type>Integer</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Load</category>
+	<parameter varname="src" type="__m512i"/>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="mem_addr" type="void const*"/>
+	<description>Load packed 32-bit integers from memory into "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). 
+	"mem_addr" does not need to be aligned on any particular boundary.
+	</description>
+	<operation>
+FOR j := 0 to 15
+	i := j*32
+	IF k[j]
+		dst[i+31:i] := MEM[mem_addr+i+31:mem_addr+i]
+	ELSE
+		dst[i+31:i] := src[i+31:i]
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vmovdqu32' form='zmm {k}, m512'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="void" name="_mm512_mask_storeu_epi32">
+	<type>Integer</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Store</category>
+	<parameter varname="mem_addr" type="void*"/>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="a" type="__m512i"/>
+	<description>Store packed 32-bit integers from "a" into memory using writemask "k".
+	"mem_addr" does not need to be aligned on any particular boundary.
+	</description>
+	<operation>
+FOR j := 0 to 15
+	i := j*32
+	IF k[j]
+		MEM[mem_addr+i+31:mem_addr+i] := a[i+31:i]
+	FI
+ENDFOR
+	</operation>
+	<instruction name='vmovdqu32' form='m512 {k}, zmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512i" name="_mm512_maskz_loadu_epi32">
+	<type>Integer</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Load</category>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="mem_addr" type="void const*"/>
+	<description>Load packed 32-bit integers from memory into "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).
+	"mem_addr" does not need to be aligned on any particular boundary.
+	</description>
+	<operation>
+FOR j := 0 to 15
+	i := j*32
+	IF k[j]
+		dst[i+31:i] := MEM[mem_addr+i+31:mem_addr+i]
+	ELSE
+		dst[i+31:i] := 0
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vmovdqu32' form='zmm {k}, m512'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="void" name="_mm512_storeu_si512">
+	<type>Integer</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Store</category>
+	<parameter varname="mem_addr" type="void*"/>
+	<parameter varname="a" type="__m512i"/>
+	<description>Store 512-bits of integer data from "a" into memory.
+	"mem_addr" does not need to be aligned on any particular boundary.
+	</description>
+	<operation>
+MEM[mem_addr+511:mem_addr] := a[511:0]
+	</operation>
+	<instruction name='vmovdqu32' form='m512 {k}, zmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512i" name="_mm512_mask_loadu_epi64">
+	<type>Integer</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Load</category>
+	<parameter varname="src" type="__m512i"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="mem_addr" type="void const*"/>
+	<description>Load packed 64-bit integers from memory into "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). 
+	"mem_addr" does not need to be aligned on any particular boundary.
+	</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*64
+	IF k[j]
+		dst[i+63:i] := MEM[mem_addr+i+63:mem_addr+i]
+	ELSE
+		dst[i+63:i] := src[i+63:i]
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vmovdqu64' form='zmm {k}, m512'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="void" name="_mm512_mask_storeu_epi64">
+	<type>Integer</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Store</category>
+	<parameter varname="mem_addr" type="void*"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m512i"/>
+	<description>Store packed 64-bit integers from "a" into memory using writemask "k".
+	"mem_addr" does not need to be aligned on any particular boundary.
+	</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*64
+	IF k[j]
+		MEM[mem_addr+i+63:mem_addr+i] := a[i+63:i]
+	FI
+ENDFOR
+	</operation>
+	<instruction name='vmovdqu64' form='m512 {k}, zmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512i" name="_mm512_maskz_loadu_epi64">
+	<type>Integer</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Load</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="mem_addr" type="void const*"/>
+	<description>Load packed 64-bit integers from memory into "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).
+	"mem_addr" does not need to be aligned on any particular boundary.
+	</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*64
+	IF k[j]
+		dst[i+63:i] := MEM[mem_addr+i+63:mem_addr+i]
+	ELSE
+		dst[i+63:i] := 0
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vmovdqu64' form='zmm {k}, m512'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512i" name="_mm512_stream_load_si512">
+	<type>Integer</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Load</category>
+	<parameter varname="mem_addr" type="void const*"/>
+	<description>Load 512-bits of integer data from memory into "dst" using a non-temporal memory hint. 
+	"mem_addr" must be aligned on a 64-byte boundary or a general-protection exception may be generated.</description>
+	<operation>
+dst[511:0] := MEM[mem_addr+511:mem_addr]
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vmovntdqa' form='zmm, m512'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="void" name="_mm512_stream_si512">
+	<type>Integer</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Store</category>
+	<parameter varname="mem_addr" type="void*"/>
+	<parameter varname="a" type="__m512i"/>
+	<description>Store 512-bits of integer data from "a" into memory using a non-temporal memory hint. 
+	"mem_addr" must be aligned on a 64-byte boundary or a general-protection exception may be generated.</description>
+	<operation>
+MEM[mem_addr+511:mem_addr] := a[511:0]
+	</operation>
+	<instruction name='vmovntdq' form='m512, zmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="void" name="_mm512_stream_pd">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Store</category>
+	<parameter varname="mem_addr" type="void*"/>
+	<parameter varname="a" type="__m512d"/>
+	<description>Store 512-bits (composed of 8 packed double-precision (64-bit) floating-point elements) from "a" into memory using a non-temporal memory hint. 
+	"mem_addr" must be aligned on a 64-byte boundary or a general-protection exception may be generated.</description>
+	<operation>
+MEM[mem_addr+511:mem_addr] := a[511:0]
+	</operation>
+	<instruction name='vmovntpd' form='m512, zmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="void" name="_mm512_stream_ps">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Store</category>
+	<parameter varname="mem_addr" type="void*"/>
+	<parameter varname="a" type="__m512"/>
+	<description>Store 512-bits (composed of 16 packed single-precision (32-bit) floating-point elements) from "a" into memory using a non-temporal memory hint. 
+	"mem_addr" must be aligned on a 64-byte boundary or a general-protection exception may be generated.</description>
+	<operation>
+MEM[mem_addr+511:mem_addr] := a[511:0]
+	</operation>
+	<instruction name='vmovntps' form='m512, zmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128d" name="_mm_mask_load_sd">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Load</category>
+	<parameter varname="src" type="__m128d"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="mem_addr" type="const double*"/>
+	<description>Load a double-precision (64-bit) floating-point element from memory into the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and set the upper element of "dst" to zero. "mem_addr" must be aligned on a 16-byte boundary or a general-protection exception may be generated.</description>
+	<operation>
+IF k[0]
+	dst[63:0] := MEM[mem_addr+63:mem_addr]
+ELSE
+	dst[63:0] := src[63:0]
+FI
+dst[MAX:64] := 0
+	</operation>
+	<instruction name='vmovsd' form='xmm {k}, m64'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128d" name="_mm_mask_move_sd">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Move</category>
+	<parameter varname="src" type="__m128d"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128d"/>
+	<parameter varname="b" type="__m128d"/>
+	<description>Move the lower double-precision (64-bit) floating-point element from "b" to the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst".</description>
+	<operation>
+IF k[0]
+	dst[63:0] := b[63:0]
+ELSE
+	dst[63:0] := src[63:0]
+FI
+dst[127:64] := a[127:64]
+dst[MAX:128] := 0
+	</operation>
+	<instruction name='vmovsd' form='xmm {k}, xmm, xmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="void" name="_mm_mask_store_sd">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Store</category>
+	<parameter varname="mem_addr" type="double*"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128d"/>
+	<description>Store the lower double-precision (64-bit) floating-point element from "a" into memory using writemask "k".
+	"mem_addr" must be aligned on a 16-byte boundary or a general-protection exception may be generated.</description>
+	<operation>
+IF k[0]
+	MEM[mem_addr+63:mem_addr] := a[63:0]
+FI
+	</operation>
+	<instruction name='vmovsd' form='m64 {k}, xmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128d" name="_mm_maskz_load_sd">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Load</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="mem_addr" type="const double*"/>
+	<description>Load a double-precision (64-bit) floating-point element from memory into the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and set the upper element of "dst" to zero. "mem_addr" must be aligned on a 16-byte boundary or a general-protection exception may be generated.</description>
+	<operation>
+IF k[0]
+	dst[63:0] := MEM[mem_addr+63:mem_addr]
+ELSE
+	dst[63:0] := 0
+FI
+dst[MAX:64] := 0
+	</operation>
+	<instruction name='vmovsd' form='xmm {k}, m64'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128d" name="_mm_maskz_move_sd">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Move</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128d"/>
+	<parameter varname="b" type="__m128d"/>
+	<description>Move the lower double-precision (64-bit) floating-point element from "b" to the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst".</description>
+	<operation>
+IF k[0]
+	dst[63:0] := b[63:0]
+ELSE
+	dst[63:0] := 0
+FI
+dst[127:64] := a[127:64]
+dst[MAX:128] := 0
+	</operation>
+	<instruction name='vmovsd' form='xmm {k}, xmm, xmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512" name="_mm512_mask_movehdup_ps">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Move</category>
+	<parameter varname="src" type="__m512"/>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="a" type="__m512"/>
+	<description>Duplicate odd-indexed single-precision (32-bit) floating-point elements from "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+	<operation>
+tmp[31:0] := a[63:32] 
+tmp[63:32] := a[63:32] 
+tmp[95:64] := a[127:96] 
+tmp[127:96] := a[127:96]
+tmp[159:128] := a[191:160] 
+tmp[191:160] := a[191:160] 
+tmp[223:192] := a[255:224] 
+tmp[255:224] := a[255:224]
+tmp[287:256] := a[319:288] 
+tmp[319:288] := a[319:288] 
+tmp[351:320] := a[383:352] 
+tmp[383:352] := a[383:352] 
+tmp[415:384] := a[447:416] 
+tmp[447:416] := a[447:416] 
+tmp[479:448] := a[511:480]
+tmp[511:480] := a[511:480]
+FOR j := 0 to 15
+	i := j*32
+	IF k[j]
+		dst[i+31:i] := tmp[i+31:i]
+	ELSE
+		dst[i+31:i] := src[i+31:i]
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vmovshdup' form='zmm {k}, zmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512" name="_mm512_maskz_movehdup_ps">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Move</category>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="a" type="__m512"/>
+	<description>Duplicate odd-indexed single-precision (32-bit) floating-point elements from "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). </description>
+	<operation>
+tmp[31:0] := a[63:32] 
+tmp[63:32] := a[63:32] 
+tmp[95:64] := a[127:96] 
+tmp[127:96] := a[127:96]
+tmp[159:128] := a[191:160] 
+tmp[191:160] := a[191:160] 
+tmp[223:192] := a[255:224] 
+tmp[255:224] := a[255:224]
+tmp[287:256] := a[319:288] 
+tmp[319:288] := a[319:288] 
+tmp[351:320] := a[383:352] 
+tmp[383:352] := a[383:352] 
+tmp[415:384] := a[447:416] 
+tmp[447:416] := a[447:416] 
+tmp[479:448] := a[511:480]
+tmp[511:480] := a[511:480]
+FOR j := 0 to 15
+	i := j*32
+	IF k[j]
+		dst[i+31:i] := tmp[i+31:i]
+	ELSE
+		dst[i+31:i] := 0
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vmovshdup' form='zmm {k}, zmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512" name="_mm512_movehdup_ps">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Move</category>
+	<parameter varname="a" type="__m512"/>
+	<description>Duplicate odd-indexed single-precision (32-bit) floating-point elements from "a", and store the results in "dst".
+	</description>
+	<operation>
+dst[31:0] := a[63:32] 
+dst[63:32] := a[63:32] 
+dst[95:64] := a[127:96] 
+dst[127:96] := a[127:96]
+dst[159:128] := a[191:160] 
+dst[191:160] := a[191:160] 
+dst[223:192] := a[255:224] 
+dst[255:224] := a[255:224]
+dst[287:256] := a[319:288] 
+dst[319:288] := a[319:288] 
+dst[351:320] := a[383:352] 
+dst[383:352] := a[383:352] 
+dst[415:384] := a[447:416] 
+dst[447:416] := a[447:416] 
+dst[479:448] := a[511:480]
+dst[511:480] := a[511:480]
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vmovshdup' form='zmm {k}, zmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512" name="_mm512_mask_moveldup_ps">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Move</category>
+	<parameter varname="src" type="__m512"/>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="a" type="__m512"/>
+	<description>Duplicate even-indexed single-precision (32-bit) floating-point elements from "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+	<operation>
+tmp[31:0] := a[31:0] 
+tmp[63:32] := a[31:0] 
+tmp[95:64] := a[95:64] 
+tmp[127:96] := a[95:64]
+tmp[159:128] := a[159:128] 
+tmp[191:160] := a[159:128] 
+tmp[223:192] := a[223:192] 
+tmp[255:224] := a[223:192]
+tmp[287:256] := a[287:256] 
+tmp[319:288] := a[287:256] 
+tmp[351:320] := a[351:320] 
+tmp[383:352] := a[351:320] 
+tmp[415:384] := a[415:384] 
+tmp[447:416] := a[415:384] 
+tmp[479:448] := a[479:448]
+tmp[511:480] := a[479:448]
+FOR j := 0 to 15
+	i := j*32
+	IF k[j]
+		dst[i+31:i] := tmp[i+31:i]
+	ELSE
+		dst[i+31:i] := src[i+31:i]
+	FI
+ENDFOR	
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vmovsldup' form='zmm {k}, zmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512" name="_mm512_maskz_moveldup_ps">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Move</category>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="a" type="__m512"/>
+	<description>Duplicate even-indexed single-precision (32-bit) floating-point elements from "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). </description>
+	<operation>
+tmp[31:0] := a[31:0] 
+tmp[63:32] := a[31:0] 
+tmp[95:64] := a[95:64] 
+tmp[127:96] := a[95:64]
+tmp[159:128] := a[159:128] 
+tmp[191:160] := a[159:128] 
+tmp[223:192] := a[223:192] 
+tmp[255:224] := a[223:192]
+tmp[287:256] := a[287:256] 
+tmp[319:288] := a[287:256] 
+tmp[351:320] := a[351:320] 
+tmp[383:352] := a[351:320] 
+tmp[415:384] := a[415:384] 
+tmp[447:416] := a[415:384] 
+tmp[479:448] := a[479:448]
+tmp[511:480] := a[479:448]
+FOR j := 0 to 15
+	i := j*32
+	IF k[j]
+		dst[i+31:i] := tmp[i+31:i]
+	ELSE
+		dst[i+31:i] := 0
+	FI
+ENDFOR
+dst[MAX:512] := 0	
+	</operation>
+	<instruction name='vmovsldup' form='zmm {k}, zmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512" name="_mm512_moveldup_ps">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Move</category>
+	<parameter varname="a" type="__m512"/>
+	<description>Duplicate even-indexed single-precision (32-bit) floating-point elements from "a", and store the results in "dst".
+	</description>
+	<operation>
+dst[31:0] := a[31:0] 
+dst[63:32] := a[31:0] 
+dst[95:64] := a[95:64] 
+dst[127:96] := a[95:64]
+dst[159:128] := a[159:128] 
+dst[191:160] := a[159:128] 
+dst[223:192] := a[223:192] 
+dst[255:224] := a[223:192]
+dst[287:256] := a[287:256] 
+dst[319:288] := a[287:256] 
+dst[351:320] := a[351:320] 
+dst[383:352] := a[351:320] 
+dst[415:384] := a[415:384] 
+dst[447:416] := a[415:384] 
+dst[479:448] := a[479:448]
+dst[511:480] := a[479:448]
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vmovsldup' form='zmm {k}, zmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128" name="_mm_mask_load_ss">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Load</category>
+	<parameter varname="src" type="__m128"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="mem_addr" type="const float*"/>
+	<description>Load a single-precision (32-bit) floating-point element from memory into the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and set the upper elements of "dst" to zero. "mem_addr" must be aligned on a 16-byte boundary or a general-protection exception may be generated.</description>
+	<operation>
+IF k[0]
+	dst[31:0] := MEM[mem_addr+31:mem_addr]
+ELSE
+	dst[31:0] := src[31:0]
+FI
+dst[MAX:32] := 0
+	</operation>
+	<instruction name='vmovss' form='xmm {k}, m32'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128" name="_mm_mask_move_ss">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Move</category>
+	<parameter varname="src" type="__m128"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128"/>
+	<parameter varname="b" type="__m128"/>
+	<description>Move the lower single-precision (32-bit) floating-point element from "b" to the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper 3 packed elements from "a" to the upper elements of "dst".</description>
+	<operation>
+IF k[0]
+	dst[31:0] := b[31:0]
+ELSE
+	dst[31:0] := src[31:0]
+FI
+dst[127:32] := a[127:32]
+dst[MAX:128] := 0
+	</operation>
+	<instruction name='vmovss' form='xmm {k}, xmm, xmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="void" name="_mm_mask_store_ss">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Store</category>
+	<parameter varname="mem_addr" type="float*"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128"/>
+	<description>Store the lower single-precision (32-bit) floating-point element from "a" into memory using writemask "k".
+	"mem_addr" must be aligned on a 16-byte boundary or a general-protection exception may be generated.</description>
+	<operation>
+IF k[0]
+	MEM[mem_addr+31:mem_addr] := a[31:0]
+FI
+	</operation>
+	<instruction name='vmovss' form='m32 {k}, xmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128" name="_mm_maskz_load_ss">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Load</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="mem_addr" type="const float*"/>
+	<description>Load a single-precision (32-bit) floating-point element from memory into the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and set the upper elements of "dst" to zero. "mem_addr" must be aligned on a 16-byte boundary or a general-protection exception may be generated.</description>
+	<operation>
+IF k[0]
+	dst[31:0] := MEM[mem_addr+31:mem_addr]
+ELSE
+	dst[31:0] := 0
+FI
+dst[MAX:32] := 0
+	</operation>
+	<instruction name='vmovss' form='xmm {k}, m32'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128" name="_mm_maskz_move_ss">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Move</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128"/>
+	<parameter varname="b" type="__m128"/>
+	<description>Move the lower single-precision (32-bit) floating-point element from "b" to the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from "a" to the upper elements of "dst".</description>
+	<operation>
+IF k[0]
+	dst[31:0] := b[31:0]
+ELSE
+	dst[31:0] := 0
+FI
+dst[127:32] := a[127:32]
+dst[MAX:128] := 0
+	</operation>
+	<instruction name='vmovss' form='xmm {k}, xmm, xmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512d" name="_mm512_loadu_pd">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Load</category>
+	<parameter varname="mem_addr" type="void const*"/>
+	<description>Load 512-bits (composed of 8 packed double-precision (64-bit) floating-point elements) from memory into "dst". 
+	"mem_addr" does not need to be aligned on any particular boundary.</description>
+	<operation>
+dst[511:0] := MEM[mem_addr+511:mem_addr]
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vmovupd' form='zmm {k}, m512'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512d" name="_mm512_mask_loadu_pd">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Load</category>
+	<parameter varname="src" type="__m512d"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="mem_addr" type="void const*"/>
+	<description>Load packed double-precision (64-bit) floating-point elements from memory into "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).
+	"mem_addr" does not need to be aligned on any particular boundary. </description>
+	<operation>
+FOR j := 0 to 7
+	i := j*64
+	IF k[j]
+		dst[i+63:i] := MEM[mem_addr+i+63:mem_addr+i]
+	ELSE
+		dst[i+63:i] := src[i+63:i]
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vmovupd' form='zmm {k}, m512'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="void" name="_mm512_mask_storeu_pd">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Store</category>
+	<parameter varname="mem_addr" type="void*"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m512d"/>
+	<description>Store packed double-precision (64-bit) floating-point elements from "a" into memory using writemask "k".
+	"mem_addr" does not need to be aligned on any particular boundary.</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*64
+	IF k[j]
+		MEM[mem_addr+i+63:mem_addr+i] := a[i+63:i]
+	FI
+ENDFOR
+	</operation>
+	<instruction name='vmovupd' form='m512 {k}, zmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512d" name="_mm512_maskz_loadu_pd">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Load</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="mem_addr" type="void const*"/>
+	<description>Load packed double-precision (64-bit) floating-point elements from memory into "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).
+	"mem_addr" does not need to be aligned on any particular boundary.
+	</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*64
+	IF k[j]
+		dst[i+63:i] := MEM[mem_addr+i+63:mem_addr+i]
+	ELSE
+		dst[i+63:i] := 0
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vmovupd' form='zmm {k}, m512'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="void" name="_mm512_storeu_pd">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Store</category>
+	<parameter varname="mem_addr" type="void*"/>
+	<parameter varname="a" type="__m512d"/>
+	<description>Store 512-bits (composed of 8 packed double-precision (64-bit) floating-point elements) from "a" into memory. 
+	"mem_addr" does not need to be aligned on any particular boundary.</description>
+	<operation>
+MEM[mem_addr+511:mem_addr] := a[511:0]
+	</operation>
+	<instruction name='vmovupd' form='m512 {k}, zmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512" name="_mm512_loadu_ps">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Load</category>
+	<parameter varname="mem_addr" type="void const*"/>
+	<description>Load 512-bits (composed of 16 packed single-precision (32-bit) floating-point elements) from memory into "dst". 
+	"mem_addr" does not need to be aligned on any particular boundary.</description>
+	<operation>
+dst[511:0] := MEM[mem_addr+511:mem_addr]
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vmovups' form='zmm {k}, m512'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512" name="_mm512_mask_loadu_ps">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Load</category>
+	<parameter varname="src" type="__m512"/>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="mem_addr" type="void const*"/>
+	<description>Load packed single-precision (32-bit) floating-point elements from memory into "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).
+	"mem_addr" does not need to be aligned on any particular boundary.
+	</description>
+	<operation>
+FOR j := 0 to 15
+	i := j*32
+	IF k[j]
+		dst[i+31:i] := MEM[mem_addr+i+31:mem_addr+i]
+	ELSE
+		dst[i+31:i] := src[i+31:i]
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vmovups' form='zmm {k}, m512'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="void" name="_mm512_mask_storeu_ps">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Store</category>
+	<parameter varname="mem_addr" type="void*"/>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="a" type="__m512"/>
+	<description>Store packed single-precision (32-bit) floating-point elements from "a" into memory using writemask "k".
+	"mem_addr" does not need to be aligned on any particular boundary.
+	</description>
+	<operation>
+FOR j := 0 to 15
+	i := j*32
+	IF k[j]
+		MEM[mem_addr+i+31:mem_addr+i] := a[i+31:i]
+	FI
+ENDFOR
+	</operation>
+	<instruction name='vmovups' form='m512 {k}, zmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512" name="_mm512_maskz_loadu_ps">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Load</category>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="mem_addr" type="void const*"/>
+	<description>Load packed single-precision (32-bit) floating-point elements from memory into "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).
+	"mem_addr" does not need to be aligned on any particular boundary.
+	</description>
+	<operation>
+FOR j := 0 to 15
+	i := j*32
+	IF k[j]
+		dst[i+31:i] := MEM[mem_addr+i+31:mem_addr+i]
+	ELSE
+		dst[i+31:i] := 0
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vmovups' form='zmm {k}, m512'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="void" name="_mm512_storeu_ps">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Store</category>
+	<parameter varname="mem_addr" type="void*"/>
+	<parameter varname="a" type="__m512"/>
+	<description>Store 512-bits (composed of 16 packed single-precision (32-bit) floating-point elements) from "a" into memory. 
+	"mem_addr" does not need to be aligned on any particular boundary.</description>
+	<operation>
+MEM[mem_addr+511:mem_addr] := a[511:0]
+	</operation>
+	<instruction name='vmovups' form='m512 {k}, zmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" rettype="__m512d" name="_mm512_mask_mul_pd">
+	<type>Floating Point</type>
+	<CPUID>AVX512F/KNCNI</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="src" type="__m512d"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m512d"/>
+	<parameter varname="b" type="__m512d"/>
+	<description>Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*64
+	IF k[j]
+		dst[i+63:i] := a[i+63:i] * b[i+63:i]
+	ELSE
+		dst[i+63:i] := src[i+63:i]
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vmulpd' form='zmm {k}, zmm, zmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" rettype="__m512d" name="_mm512_mask_mul_round_pd">
+	<type>Floating Point</type>
+	<CPUID>AVX512F/KNCNI</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="src" type="__m512d"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m512d"/>
+	<parameter varname="b" type="__m512d"/>
+	<parameter varname="rounding" type="int"/>
+	<description>Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).  
+	[round_note]</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*64
+	IF k[j]
+		dst[i+63:i] := a[i+63:i] * b[i+63:i]
+	ELSE
+		dst[i+63:i] := src[i+63:i]
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vmulpd' form='zmm {k}, zmm, zmm {er}'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512d" name="_mm512_maskz_mul_pd">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m512d"/>
+	<parameter varname="b" type="__m512d"/>
+	<description>Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*64
+	IF k[j]
+		dst[i+63:i] := a[i+63:i] * b[i+63:i]
+	ELSE
+		dst[i+63:i] := 0
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vmulpd' form='zmm {k}, zmm, zmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512d" name="_mm512_maskz_mul_round_pd">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m512d"/>
+	<parameter varname="b" type="__m512d"/>
+	<parameter varname="rounding" type="int"/>
+	<description>Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). 
+	[round_note]</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*64
+	IF k[j]
+		dst[i+63:i] := a[i+63:i] * b[i+63:i]
+	ELSE
+		dst[i+63:i] := 0
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vmulpd' form='zmm {k}, zmm, zmm {er}'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" rettype="__m512d" name="_mm512_mul_pd">
+	<type>Floating Point</type>
+	<CPUID>AVX512F/KNCNI</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="a" type="__m512d"/>
+	<parameter varname="b" type="__m512d"/>
+	<description>Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", and store the results in "dst".
+	</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*64
+	dst[i+63:i] := a[i+63:i] * b[i+63:i]
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vmulpd' form='zmm {k}, zmm, zmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" rettype="__m512d" name="_mm512_mul_round_pd">
+	<type>Floating Point</type>
+	<CPUID>AVX512F/KNCNI</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="a" type="__m512d"/>
+	<parameter varname="b" type="__m512d"/>
+	<parameter varname="rounding" type="int"/>
+	<description>Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", and store the results in "dst". 
+	[round_note]
+	 </description>
+	<operation>
+FOR j := 0 to 7
+	i := j*64
+	dst[i+63:i] := a[i+63:i] * b[i+63:i]
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vmulpd' form='zmm {k}, zmm, zmm {er}'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" rettype="__m512" name="_mm512_mask_mul_ps">
+	<type>Floating Point</type>
+	<CPUID>AVX512F/KNCNI</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="src" type="__m512"/>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="a" type="__m512"/>
+	<parameter varname="b" type="__m512"/>
+	<description>Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 15
+	i := j*32
+	IF k[j]
+		dst[i+31:i] := a[i+31:i] * b[i+31:i]
+	ELSE
+		dst[i+31:i] := src[i+31:i]
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vmulps' form='zmm {k}, zmm, zmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" rettype="__m512" name="_mm512_mask_mul_round_ps">
+	<type>Floating Point</type>
+	<CPUID>AVX512F/KNCNI</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="src" type="__m512"/>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="a" type="__m512"/>
+	<parameter varname="b" type="__m512"/>
+	<parameter varname="rounding" type="int"/>
+	<description>Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).
+	 [round_note]</description>
+	<operation>
+FOR j := 0 to 15
+	i := j*32
+	IF k[j]
+		dst[i+31:i] := a[i+31:i] * b[i+31:i]
+	ELSE
+		dst[i+31:i] := src[i+31:i]
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vmulps' form='zmm {k}, zmm, zmm {er}'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512" name="_mm512_maskz_mul_ps">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="a" type="__m512"/>
+	<parameter varname="b" type="__m512"/>
+	<description>Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 15
+	i := j*32
+	IF k[j]
+		dst[i+31:i] := a[i+31:i] * b[i+31:i]
+	ELSE
+		dst[i+31:i] := 0
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vmulps' form='zmm {k}, zmm, zmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512" name="_mm512_maskz_mul_round_ps">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="a" type="__m512"/>
+	<parameter varname="b" type="__m512"/>
+	<parameter varname="rounding" type="int"/>
+	<description>Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). 
+	[round_note]</description>
+	<operation>
+FOR j := 0 to 15
+	i := j*32
+	IF k[j]
+		dst[i+31:i] := a[i+31:i] * b[i+31:i]
+	ELSE
+		dst[i+31:i] := 0
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vmulps' form='zmm {k}, zmm, zmm {er}'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" rettype="__m512" name="_mm512_mul_ps">
+	<type>Floating Point</type>
+	<CPUID>AVX512F/KNCNI</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="a" type="__m512"/>
+	<parameter varname="b" type="__m512"/>
+	<description>Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", and store the results in "dst".
+	</description>
+	<operation>
+FOR j := 0 to 15
+	i := j*32
+	dst[i+31:i] := a[i+31:i] * b[i+31:i]
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vmulps' form='zmm {k}, zmm, zmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" rettype="__m512" name="_mm512_mul_round_ps">
+	<type>Floating Point</type>
+	<CPUID>AVX512F/KNCNI</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="a" type="__m512"/>
+	<parameter varname="b" type="__m512"/>
+	<parameter varname="rounding" type="int"/>
+	<description>Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", and store the results in "dst". 
+	[round_note]
+	 </description>
+	<operation>
+FOR j := 0 to 15
+	i := j*32
+	dst[i+31:i] := a[i+31:i] * b[i+31:i]
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vmulps' form='zmm {k}, zmm, zmm {er}'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128d" name="_mm_mask_mul_round_sd">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="src" type="__m128d"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128d"/>
+	<parameter varname="b" type="__m128d"/>
+	<parameter varname="rounding" type="int"/>
+	<description>Multiply the lower double-precision (64-bit) floating-point element in "a" and "b", store the result in the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst".
+		[round_note]
+		</description>
+	<operation>
+IF k[0]
+	dst[63:0] := a[63:0] * b[63:0]
+ELSE
+	dst[63:0] := src[63:0]
+FI
+dst[127:64] := a[127:64]
+dst[MAX:128] := 0
+	</operation>
+	<instruction name='vmulsd' form='xmm {k}, xmm, xmm {er}'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128d" name="_mm_mask_mul_sd">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="src" type="__m128d"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128d"/>
+	<parameter varname="b" type="__m128d"/>
+	<description>Multiply the lower double-precision (64-bit) floating-point element in "a" and "b", store the result in the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst".</description>
+	<operation>
+IF k[0]
+	dst[63:0] := a[63:0] * b[63:0]
+ELSE
+	dst[63:0] := src[63:0]
+FI
+dst[127:64] := a[127:64]
+dst[MAX:128] := 0
+	</operation>
+	<instruction name='vmulsd' form='xmm {k}, xmm, xmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128d" name="_mm_maskz_mul_round_sd">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128d"/>
+	<parameter varname="b" type="__m128d"/>
+	<parameter varname="rounding" type="int"/>
+	<description>Multiply the lower double-precision (64-bit) floating-point element in "a" and "b", store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst".
+		[round_note]
+		</description>
+	<operation>
+IF k[0]
+	dst[63:0] := a[63:0] * b[63:0]
+ELSE
+	dst[63:0] := 0
+FI
+dst[127:64] := a[127:64]
+dst[MAX:128] := 0
+	</operation>
+	<instruction name='vmulsd' form='xmm {k}, xmm, xmm {er}'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128d" name="_mm_maskz_mul_sd">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128d"/>
+	<parameter varname="b" type="__m128d"/>
+	<description>Multiply the lower double-precision (64-bit) floating-point element in "a" and "b", store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst".</description>
+	<operation>
+IF k[0]
+	dst[63:0] := a[63:0] * b[63:0]
+ELSE
+	dst[63:0] := 0
+FI
+dst[127:64] := a[127:64]
+dst[MAX:128] := 0
+	</operation>
+	<instruction name='vmulsd' form='xmm {k}, xmm, xmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128d" name="_mm_mul_round_sd">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="a" type="__m128d"/>
+	<parameter varname="b" type="__m128d"/>
+	<parameter varname="rounding" type="int"/>
+	<description>Multiply the lower double-precision (64-bit) floating-point element in "a" and "b", store the result in the lower element of "dst", and copy the upper element from "a" to the upper element of "dst".
+		[round_note]
+		</description>
+	<operation>
+dst[63:0] := a[63:0] * b[63:0]
+dst[127:64] := a[127:64]
+dst[MAX:128] := 0
+	</operation>
+	<instruction name='vmulsd' form='xmm {k}, xmm, xmm {er}'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128" name="_mm_mask_mul_round_ss">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="src" type="__m128"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128"/>
+	<parameter varname="b" type="__m128"/>
+	<parameter varname="rounding" type="int"/>
+	<description>Multiply the lower single-precision (32-bit) floating-point element in "a" and "b", store the result in the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper 3 packed elements from "a" to the upper elements of "dst".
+		[round_note]
+		</description>
+	<operation>
+IF k[0]
+	dst[31:0] := a[31:0] * b[31:0]
+ELSE
+	dst[31:0] := src[31:0]
+FI
+dst[127:32] := a[127:32]
+dst[MAX:128] := 0
+	</operation>
+	<instruction name='vmulss' form='xmm {k}, xmm, xmm {er}'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128" name="_mm_mask_mul_ss">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="src" type="__m128"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128"/>
+	<parameter varname="b" type="__m128"/>
+	<description>Multiply the lower single-precision (32-bit) floating-point element in "a" and "b", store the result in the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper 3 packed elements from "a" to the upper elements of "dst".</description>
+	<operation>
+IF k[0]
+	dst[31:0] := a[31:0] * b[31:0]
+ELSE
+	dst[31:0] := src[31:0]
+FI
+dst[127:32] := a[127:32]
+dst[MAX:128] := 0
+	</operation>
+	<instruction name='vmulss' form='xmm {k}, xmm, xmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128" name="_mm_maskz_mul_round_ss">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128"/>
+	<parameter varname="b" type="__m128"/>
+	<parameter varname="rounding" type="int"/>
+	<description>Multiply the lower single-precision (32-bit) floating-point element in "a" and "b", store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from "a" to the upper elements of "dst".
+		[round_note]
+		</description>
+	<operation>
+IF k[0]
+	dst[31:0] := a[31:0] * b[31:0]
+ELSE
+	dst[31:0] := 0
+FI
+dst[127:32] := a[127:32]
+dst[MAX:128] := 0
+	</operation>
+	<instruction name='vmulss' form='xmm {k}, xmm, xmm {er}'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128" name="_mm_maskz_mul_ss">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128"/>
+	<parameter varname="b" type="__m128"/>
+	<description>Multiply the lower single-precision (32-bit) floating-point element in "a" and "b", store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from "a" to the upper elements of "dst".</description>
+	<operation>
+IF k[0]
+	dst[31:0] := a[31:0] * b[31:0]
+ELSE
+	dst[31:0] := 0
+FI
+dst[127:32] := a[127:32]
+dst[MAX:128] := 0
+	</operation>
+	<instruction name='vmulss' form='xmm {k}, xmm, xmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128" name="_mm_mul_round_ss">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="a" type="__m128"/>
+	<parameter varname="b" type="__m128"/>
+	<parameter varname="rounding" type="int"/>
+	<description>Multiply the lower single-precision (32-bit) floating-point element in "a" and "b", store the result in the lower element of "dst", and copy the upper 3 packed elements from "a" to the upper elements of "dst".
+		[round_note]
+		</description>
+	<operation>
+dst[31:0] := a[31:0] * b[31:0]
+dst[127:32] := a[127:32]
+dst[MAX:128] := 0
+	</operation>
+	<instruction name='vmulss' form='xmm {k}, xmm, xmm {er}'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512i" name="_mm512_abs_epi32">
+	<type>Integer</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Special Math Functions</category>
+	<parameter varname="a" type="__m512i"/>
+	<description>Compute the absolute value of packed 32-bit integers in "a", and store the unsigned results in "dst". </description>
+	<operation>
+FOR j := 0 to 15
+	i := j*32
+	dst[i+31:i] := ABS(a[i+31:i])
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vpabsd' form='zmm {k}, zmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512i" name="_mm512_mask_abs_epi32">
+	<type>Integer</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Special Math Functions</category>
+	<parameter varname="src" type="__m512i"/>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="a" type="__m512i"/>
+	<description>Compute the absolute value of packed 32-bit integers in "a", and store the unsigned results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 15
+	i := j*32
+	IF k[j]
+		dst[i+31:i] := ABS(a[i+31:i])
+	ELSE
+		dst[i+31:i] := src[i+31:i]
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vpabsd' form='zmm {k}, zmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512i" name="_mm512_maskz_abs_epi32">
+	<type>Integer</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Special Math Functions</category>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="a" type="__m512i"/>
+	<description>Compute the absolute value of packed 32-bit integers in "a", and store the unsigned results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 15
+	i := j*32
+	IF k[j]
+		dst[i+31:i] := ABS(a[i+31:i])
+	ELSE
+		dst[i+31:i] := 0
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vpabsd' form='zmm {k}, zmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512i" name="_mm512_abs_epi64">
+	<type>Integer</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Special Math Functions</category>
+	<parameter varname="a" type="__m512i"/>
+	<description>Compute the absolute value of packed 64-bit integers in "a", and store the unsigned results in "dst". </description>
+	<operation>
+FOR j := 0 to 7
+	i := j*64
+	dst[i+63:i] := ABS(a[i+63:i])
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vpabsq' form='zmm {k}, zmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512i" name="_mm512_mask_abs_epi64">
+	<type>Integer</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Special Math Functions</category>
+	<parameter varname="src" type="__m512i"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m512i"/>
+	<description>Compute the absolute value of packed 64-bit integers in "a", and store the unsigned results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 7
+	i := j*64
+	IF k[j]
+		dst[i+63:i] := ABS(a[i+63:i])
+	ELSE
+		dst[i+63:i] := src[i+63:i]
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vpabsq' form='zmm {k}, zmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512i" name="_mm512_maskz_abs_epi64">
+	<type>Integer</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Special Math Functions</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m512i"/>
+	<description>Compute the absolute value of packed 64-bit integers in "a", and store the unsigned results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 7
+	i := j*64
+	IF k[j]
+		dst[i+63:i] := ABS(a[i+63:i])
+	ELSE
+		dst[i+63:i] := 0
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vpabsq' form='zmm {k}, zmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" rettype="__m512i" name="_mm512_add_epi32">
+	<type>Integer</type>
+	<CPUID>AVX512F/KNCNI</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="a" type="__m512i"/>
+	<parameter varname="b" type="__m512i"/>
+	<description>Add packed 32-bit integers in "a" and "b", and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 15
+	i := j*32
+	dst[i+31:i] := a[i+31:i] + b[i+31:i]
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vpaddd' form='zmm {k}, zmm, zmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" rettype="__m512i" name="_mm512_mask_add_epi32">
+	<type>Integer</type>
+	<CPUID>AVX512F/KNCNI</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="src" type="__m512i"/>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="a" type="__m512i"/>
+	<parameter varname="b" type="__m512i"/>
+	<description>Add packed 32-bit integers in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 15
+	i := j*32
+	IF k[j]
+		dst[i+31:i] := a[i+31:i] + b[i+31:i]
+	ELSE
+		dst[i+31:i] := src[i+31:i]
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vpaddd' form='zmm {k}, zmm, zmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512i" name="_mm512_maskz_add_epi32">
+	<type>Integer</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="a" type="__m512i"/>
+	<parameter varname="b" type="__m512i"/>
+	<description>Add packed 32-bit integers in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 15
+	i := j*32
+	IF k[j]
+		dst[i+31:i] := a[i+31:i] + b[i+31:i]
+	ELSE
+		dst[i+31:i] := 0
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vpaddd' form='zmm {k}, zmm, zmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512i" name="_mm512_add_epi64">
+	<type>Integer</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="a" type="__m512i"/>
+	<parameter varname="b" type="__m512i"/>
+	<description>Add packed 64-bit integers in "a" and "b", and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*64
+	dst[i+63:i] := a[i+63:i] + b[i+63:i]
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vpaddq' form='zmm {k}, zmm, zmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512i" name="_mm512_mask_add_epi64">
+	<type>Integer</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="src" type="__m512i"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m512i"/>
+	<parameter varname="b" type="__m512i"/>
+	<description>Add packed 64-bit integers in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 7
+	i := j*64
+	IF k[j]
+		dst[i+63:i] := a[i+63:i] + b[i+63:i]
+	ELSE
+		dst[i+63:i] := src[i+63:i]
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vpaddq' form='zmm {k}, zmm, zmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512i" name="_mm512_maskz_add_epi64">
+	<type>Integer</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m512i"/>
+	<parameter varname="b" type="__m512i"/>
+	<description>Add packed 64-bit integers in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*64
+	IF k[j]
+		dst[i+63:i] := a[i+63:i] + b[i+63:i]
+	ELSE
+		dst[i+63:i] := 0
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vpaddq' form='zmm {k}, zmm, zmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" rettype="__m512i" name="_mm512_and_epi32">
+	<type>Integer</type>
+	<CPUID>AVX512F/KNCNI</CPUID>
+	<category>Logical</category>
+	<parameter varname="a" type="__m512i"/>
+	<parameter varname="b" type="__m512i"/>
+	<description>Compute the bitwise AND of packed 32-bit integers in "a" and "b", and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 15
+	i := j*32
+	dst[i+31:i] := a[i+31:i] AND b[i+31:i]
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vpandd' form='zmm {k}, zmm, zmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" rettype="__m512i" name="_mm512_and_si512">
+	<type>Integer</type>
+	<CPUID>AVX512F/KNCNI</CPUID>
+	<category>Logical</category>
+	<parameter varname="a" type="__m512i"/>
+	<parameter varname="b" type="__m512i"/>
+	<description>Compute the bitwise AND of 512 bits (representing integer data) in "a" and "b", and store the result in "dst".</description>
+	<operation>
+dst[511:0] := (a[511:0] AND b[511:0])
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vpandd' form='zmm {k}, zmm, zmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512i" name="_mm512_maskz_and_epi32">
+	<type>Integer</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Logical</category>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="a" type="__m512i"/>
+	<parameter varname="b" type="__m512i"/>
+	<description>Compute the bitwise AND of packed 32-bit integers in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 15
+	i := j*32
+	IF k[j]
+		dst[i+31:i] := a[i+31:i] AND b[i+31:i]
+	ELSE
+		dst[i+31:i] := 0
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vpandd' form='zmm {k}, zmm, zmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" rettype="__m512i" name="_mm512_andnot_epi32">
+	<type>Integer</type>
+	<CPUID>AVX512F/KNCNI</CPUID>
+	<category>Logical</category>
+	<parameter varname="a" type="__m512i"/>
+	<parameter varname="b" type="__m512i"/>
+	<description>Compute the bitwise NOT of packed 32-bit integers in "a" and then AND with "b", and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 15
+	i := j*32
+	dst[i+31:i] := (NOT a[i+31:i]) AND b[i+31:i]
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vpandnd' form='zmm {k}, zmm, zmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" rettype="__m512i" name="_mm512_andnot_si512">
+	<type>Integer</type>
+	<CPUID>AVX512F/KNCNI</CPUID>
+	<category>Logical</category>
+	<parameter varname="a" type="__m512i"/>
+	<parameter varname="b" type="__m512i"/>
+	<description>Compute the bitwise NOT of 512 bits (representing integer data) in "a" and then AND with "b", and store the result in "dst".</description>
+	<operation>
+dst[511:0] := ((NOT a[511:0]) AND b[511:0])
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vpandnd' form='zmm {k}, zmm, zmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" rettype="__m512i" name="_mm512_mask_andnot_epi32">
+	<type>Integer</type>
+	<CPUID>AVX512F/KNCNI</CPUID>
+	<category>Logical</category>
+	<parameter varname="src" type="__m512i"/>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="a" type="__m512i"/>
+	<parameter varname="b" type="__m512i"/>
+	<description>Compute the bitwise NOT of packed 32-bit integers in "a" and then AND with "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 15
+	i := j*32
+	IF k[j]
+		dst[i+31:i] := ((NOT a[i+31:i]) AND b[i+31:i])
+	ELSE
+		dst[i+31:i] := src[i+31:i]
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vpandnd' form='zmm {k}, zmm, zmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512i" name="_mm512_maskz_andnot_epi32">
+	<type>Integer</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Logical</category>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="a" type="__m512i"/>
+	<parameter varname="b" type="__m512i"/>
+	<description>Compute the bitwise NOT of packed 32-bit integers in "a" and then AND with "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 15
+	i := j*32
+	IF k[j]
+		dst[i+31:i] := (NOT a[i+31:i]) AND b[i+31:i]
+	ELSE
+		dst[i+31:i] := 0
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vpandnd' form='zmm {k}, zmm, zmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" rettype="__m512i" name="_mm512_andnot_epi64">
+	<type>Integer</type>
+	<CPUID>AVX512F/KNCNI</CPUID>
+	<category>Logical</category>
+	<parameter varname="a" type="__m512i"/>
+	<parameter varname="b" type="__m512i"/>
+	<description>Compute the bitwise NOT of 512 bits (composed of packed 64-bit integers) in "a" and then AND with "b", and store the results in "dst".</description>
+	<operation>
+dst[511:0] := ((NOT a[511:0]) AND b[511:0])
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vpandnq' form='zmm {k}, zmm, zmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" rettype="__m512i" name="_mm512_mask_andnot_epi64">
+	<type>Integer</type>
+	<CPUID>AVX512F/KNCNI</CPUID>
+	<category>Logical</category>
+	<parameter varname="src" type="__m512i"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m512i"/>
+	<parameter varname="b" type="__m512i"/>
+	<description>Compute the bitwise NOT of packed 64-bit integers in "a" and then AND with "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 7
+	i := j*64
+	IF k[j]
+		dst[i+63:i] := ((NOT a[i+63:i]) AND b[i+63:i])
+	ELSE
+		dst[i+63:i] := src[i+63:i]
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vpandnq' form='zmm {k}, zmm, zmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512i" name="_mm512_maskz_andnot_epi64">
+	<type>Integer</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Logical</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m512i"/>
+	<parameter varname="b" type="__m512i"/>
+	<description>Compute the bitwise NOT of packed 64-bit integers in "a" and then AND with "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*64
+	IF k[j]
+		dst[i+63:i] := (NOT a[i+63:i]) AND b[i+63:i]
+	ELSE
+		dst[i+63:i] := 0
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vpandnq' form='zmm {k}, zmm, zmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" rettype="__m512i" name="_mm512_and_epi64">
+	<type>Integer</type>
+	<CPUID>AVX512F/KNCNI</CPUID>
+	<category>Logical</category>
+	<parameter varname="a" type="__m512i"/>
+	<parameter varname="b" type="__m512i"/>
+	<description>Compute the bitwise AND of 512 bits (composed of packed 64-bit integers) in "a" and "b", and store the results in "dst".</description>
+	<operation>
+dst[511:0] := (a[511:0] AND b[511:0])
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vpandq' form='zmm {k}, zmm, zmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" rettype="__m512i" name="_mm512_mask_and_epi64">
+	<type>Integer</type>
+	<CPUID>AVX512F/KNCNI</CPUID>
+	<category>Logical</category>
+	<parameter varname="src" type="__m512i"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m512i"/>
+	<parameter varname="b" type="__m512i"/>
+	<description>Compute the bitwise AND of packed 64-bit integers in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*64
+	IF k[j]
+		dst[i+63:i] := a[i+63:i] AND b[i+63:i]
+	ELSE
+		dst[i+63:i] := src[i+63:i]
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vpandq' form='zmm {k}, zmm, zmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512i" name="_mm512_maskz_and_epi64">
+	<type>Integer</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Logical</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m512i"/>
+	<parameter varname="b" type="__m512i"/>
+	<description>Compute the bitwise AND of packed 64-bit integers in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*64
+	IF k[j]
+		dst[i+63:i] := a[i+63:i] AND b[i+63:i]
+	ELSE
+		dst[i+63:i] := 0
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vpandq' form='zmm {k}, zmm, zmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" rettype="__m512i" name="_mm512_mask_blend_epi32">
+	<type>Integer</type>
+	<CPUID>AVX512F/KNCNI</CPUID>
+	<category>Swizzle</category>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="a" type="__m512i"/>
+	<parameter varname="b" type="__m512i"/>
+	<description>Blend packed 32-bit integers from "a" and "b" using control mask "k", and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 15
+	i := j*32
+	IF k[j]
+		dst[i+31:i] := b[i+31:i]
+	ELSE
+		dst[i+31:i] := a[i+31:i]
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vpblendmd' form='zmm {k}, zmm, zmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" rettype="__m512i" name="_mm512_mask_blend_epi64">
+	<type>Integer</type>
+	<CPUID>AVX512F/KNCNI</CPUID>
+	<category>Swizzle</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m512i"/>
+	<parameter varname="b" type="__m512i"/>
+	<description>Blend packed 64-bit integers from "a" and "b" using control mask "k", and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*64
+	IF k[j]
+		dst[i+63:i] := b[i+63:i]
+	ELSE
+		dst[i+63:i] := a[i+63:i]
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vpblendmq' form='zmm {k}, zmm, zmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512i" name="_mm512_set1_epi8" sequence="true">
+	<type>Integer</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Set</category>
+	<parameter varname="a" type="char"/>
+	<description>Broadcast 8-bit integer "a" to all elements of "dst".</description>
+	<operation>
+FOR j := 0 to 63
+	i := j*8
+	dst[i+7:i] := a[7:0]
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vpbroadcastb' form='ymm, xmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512i" name="_mm512_broadcastd_epi32">
+	<type>Integer</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Swizzle</category>
+	<parameter varname="a" type="__m128i"/>
+	<description>Broadcast the low packed 32-bit integer from "a" to all elements of "dst".</description>
+	<operation>
+FOR j := 0 to 15
+	i := j*32
+	dst[i+31:i] := a[31:0]
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vpbroadcastd' form='zmm {k}, xmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512i" name="_mm512_mask_broadcastd_epi32">
+	<type>Integer</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Swizzle</category>
+	<parameter varname="src" type="__m512i"/>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="a" type="__m128i"/>
+	<description>Broadcast the low packed 32-bit integer from "a" to all elements of "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 15
+	i := j*32
+	IF k[j]
+		dst[i+31:i] := a[31:0]
+	ELSE
+		dst[i+31:i] := src[i+31:i]
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vpbroadcastd' form='zmm {k}, xmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512i" name="_mm512_mask_set1_epi32">
+	<type>Integer</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Set</category>
+	<parameter varname="src" type="__m512i"/>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="a" type="int"/>
+	<description>Broadcast 32-bit integer "a" to all elements of "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 15
+	i := j*32
+	IF k[j]
+		dst[i+31:i] := a[31:0]
+	ELSE
+		dst[i+31:i] := src[i+31:i]
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vpbroadcastd' form='zmm {k}, r32'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512i" name="_mm512_maskz_broadcastd_epi32">
+	<type>Integer</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Swizzle</category>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="a" type="__m128i"/>
+	<description>Broadcast the low packed 32-bit integer from "a" to all elements of "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 15
+	i := j*32
+	IF k[j]
+		dst[i+31:i] := a[31:0]
+	ELSE
+		dst[i+31:i] := 0
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vpbroadcastd' form='zmm {k}, xmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512i" name="_mm512_maskz_set1_epi32">
+	<type>Integer</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Set</category>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="a" type="int"/>
+	<description>Broadcast 32-bit integer "a" to all elements of "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 15
+	i := j*32
+	IF k[j]
+		dst[i+31:i] := a[31:0]
+	ELSE
+		dst[i+31:i] := 0
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vpbroadcastd' form='zmm {k}, r32'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512i" name="_mm512_set1_epi32">
+	<type>Integer</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Set</category>
+	<parameter varname="a" type="int"/>
+	<description>Broadcast 32-bit integer "a" to all elements of "dst".</description>
+	<operation>
+FOR j := 0 to 15
+	i := j*32
+	dst[i+31:i] := a[31:0]
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vpbroadcastd' form='zmm {k}, r32'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512i" name="_mm512_broadcastmb_epi64">
+	<type>Integer</type>
+	<CPUID>AVX512CD</CPUID>
+	<category>Swizzle</category>
+	<parameter varname="k" type="__mmask8"/>
+	<description>Broadcast the low 8-bits from input mask "k" to all 64-bit elements of "dst".</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*64
+	dst[i+63:i] := ZeroExtend(k[7:0])
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vpbroadcastmb2q' form='zmm, k'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512i" name="_mm512_broadcastmw_epi32">
+	<type>Integer</type>
+	<CPUID>AVX512CD</CPUID>
+	<category>Swizzle</category>
+	<parameter varname="k" type="__mmask16"/>
+	<description>Broadcast the low 16-bits from input mask "k" to all 32-bit elements of "dst".</description>
+	<operation>
+FOR j := 0 to 15
+	i := j*32
+	dst[i+31:i] := ZeroExtend(k[15:0])
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vpbroadcastmw2d' form='zmm, k'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512i" name="_mm512_broadcastq_epi64">
+	<type>Integer</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Swizzle</category>
+	<parameter varname="a" type="__m128i"/>
+	<description>Broadcast the low packed 64-bit integer from "a" to all elements of "dst". </description>
+	<operation>
+FOR j := 0 to 7
+	i := j*64
+	dst[i+63:i] := a[63:0]
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vpbroadcastq' form='zmm {k}, xmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512i" name="_mm512_mask_broadcastq_epi64">
+	<type>Integer</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Swizzle</category>
+	<parameter varname="src" type="__m512i"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128i"/>
+	<description>Broadcast the low packed 64-bit integer from "a" to all elements of "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 7
+	i := j*64
+	IF k[j]
+		dst[i+63:i] := a[63:0]
+	ELSE
+		dst[i+63:i] := src[i+63:i]
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vpbroadcastq' form='zmm {k}, xmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512i" name="_mm512_mask_set1_epi64">
+	<type>Integer</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Set</category>
+	<parameter varname="src" type="__m512i"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__int64"/>
+	<description>Broadcast 64-bit integer "a" to all elements of "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*64
+	IF k[j]
+		dst[i+63:i] := a[63:0]
+	ELSE
+		dst[i+63:i] := src[i+63:i]
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vpbroadcastq' form='zmm {k}, r64'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512i" name="_mm512_maskz_broadcastq_epi64">
+	<type>Integer</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Swizzle</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128i"/>
+	<description>Broadcast the low packed 64-bit integer from "a" to all elements of "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 7
+	i := j*64
+	IF k[j]
+		dst[i+63:i] := a[63:0]
+	ELSE
+		dst[i+63:i] := 0
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vpbroadcastq' form='zmm {k}, xmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512i" name="_mm512_maskz_set1_epi64">
+	<type>Integer</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Set</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__int64"/>
+	<description>Broadcast 64-bit integer "a" to all elements of "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 7
+	i := j*64
+	IF k[j]
+		dst[i+63:i] := a[63:0]
+	ELSE
+		dst[i+63:i] := 0
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vpbroadcastq' form='zmm {k}, r64'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512i" name="_mm512_set1_epi64">
+	<type>Integer</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Set</category>
+	<parameter varname="a" type="__int64"/>
+	<description>Broadcast 64-bit integer "a" to all elements of "dst".</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*64
+	dst[i+63:i] := a[63:0]
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vpbroadcastq' form='zmm {k}, r64'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512i" name="_mm512_set1_epi16" sequence="true">
+	<type>Integer</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Set</category>
+	<parameter varname="a" type="short"/>
+	<description>Broadcast the low packed 16-bit integer from "a" to all elements of "dst".
+	</description>
+	<operation>
+FOR j := 0 to 31
+	i := j*16
+	dst[i+15:i] := a[15:0]
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vpbroadcastw' form='ymm, xmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" rettype="__mmask16" name="_mm512_cmp_epi32_mask">
+	<type>Integer</type>
+	<type>Mask</type>
+	<CPUID>AVX512F/KNCNI</CPUID>
+	<category>Compare</category>
+	<parameter varname="a" type="__m512i"/>
+	<parameter varname="b" type="__m512i"/>
+	<parameter varname="imm8" type="const _MM_CMPINT_ENUM"/>
+	<description>Compare packed 32-bit integers in "a" and "b" based on the comparison operand specified by "imm8", and store the results in mask vector "k".</description>
+	<operation>
+CASE (imm8[7:0]) OF
+0: OP := _MM_CMPINT_EQ
+1: OP := _MM_CMPINT_LT
+2: OP := _MM_CMPINT_LE
+3: OP := _MM_CMPINT_FALSE
+4: OP := _MM_CMPINT_NE
+5: OP := _MM_CMPINT_NLT
+6: OP := _MM_CMPINT_NLE
+7: OP := _MM_CMPINT_TRUE
+ESAC
+FOR j := 0 to 15
+	i := j*32
+	k[j] := ( a[i+31:i] OP b[i+31:i] ) ? 1 : 0
+ENDFOR
+k[MAX:16] := 0
+	</operation>
+	<instruction name='vpcmpd' form='k {k}, zmm, zmm, imm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" rettype="__mmask16" name="_mm512_cmpeq_epi32_mask">
+	<type>Integer</type>
+	<type>Mask</type>
+	<CPUID>AVX512F/KNCNI</CPUID>
+	<category>Compare</category>
+	<parameter varname="a" type="__m512i"/>
+	<parameter varname="b" type="__m512i"/>
+	<description>Compare packed 32-bit integers in "a" and "b" for equality, and store the results in mask vector "k".</description>
+	<operation>
+FOR j := 0 to 15
+	i := j*32
+	k[j] := ( a[i+31:i] == b[i+31:i] ) ? 1 : 0
+ENDFOR
+k[MAX:16] := 0
+	</operation>
+	<instruction name='vpcmpeqd' form='k {k}, zmm, zmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" rettype="__mmask16" name="_mm512_cmpge_epi32_mask">
+	<type>Integer</type>
+	<type>Mask</type>
+	<CPUID>AVX512F/KNCNI</CPUID>
+	<category>Compare</category>
+	<parameter varname="a" type="__m512i"/>
+	<parameter varname="b" type="__m512i"/>
+	<description>Compare packed 32-bit integers in "a" and "b" for greater-than-or-equal, and store the results in mask vector "k".</description>
+	<operation>
+FOR j := 0 to 15
+	i := j*32
+	k[j] := ( a[i+31:i] &gt;= b[i+31:i] ) ? 1 : 0
+ENDFOR
+k[MAX:16] := 0
+	</operation>
+	<instruction name='vpcmpd' form='k {k}, zmm, zmm, imm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" rettype="__mmask16" name="_mm512_cmpgt_epi32_mask">
+	<type>Integer</type>
+	<type>Mask</type>
+	<CPUID>AVX512F/KNCNI</CPUID>
+	<category>Compare</category>
+	<parameter varname="a" type="__m512i"/>
+	<parameter varname="b" type="__m512i"/>
+	<description>Compare packed 32-bit integers in "a" and "b" for greater-than, and store the results in mask vector "k".</description>
+	<operation>
+FOR j := 0 to 15
+	i := j*32
+	k[j] := ( a[i+31:i] &gt; b[i+31:i] ) ? 1 : 0
+ENDFOR
+k[MAX:16] := 0
+	</operation>
+	<instruction name='vpcmpgtd' form='k {k}, zmm, zmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" rettype="__mmask16" name="_mm512_cmple_epi32_mask">
+	<type>Integer</type>
+	<type>Mask</type>
+	<CPUID>AVX512F/KNCNI</CPUID>
+	<category>Compare</category>
+	<parameter varname="a" type="__m512i"/>
+	<parameter varname="b" type="__m512i"/>
+	<description>Compare packed 32-bit integers in "a" and "b" for less-than-or-equal, and store the results in mask vector "k".</description>
+	<operation>
+FOR j := 0 to 15
+	i := j*32
+	k[j] := ( a[i+31:i] &lt;= b[i+31:i] ) ? 1 : 0
+ENDFOR
+k[MAX:16] := 0
+	</operation>
+	<instruction name='vpcmpd' form='k {k}, zmm, zmm, imm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__mmask16" name="_mm512_cmplt_epi32_mask">
+	<type>Integer</type>
+	<type>Mask</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Compare</category>
+	<parameter varname="a" type="__m512i"/>
+	<parameter varname="b" type="__m512i"/>
+	<description>Compare packed 32-bit integers in "a" and "b" for less-than, and store the results in mask vector "k".</description>
+	<operation>
+FOR j := 0 to 15
+	i := j*32
+	k[j] := ( a[i+31:i] &lt; b[i+31:i] ) ? 1 : 0
+ENDFOR
+k[MAX:16] := 0
+	</operation>
+	<instruction name='vpcmpd' form='k {k}, zmm, zmm, imm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="KNC" rettype="__mmask16" name="_mm512_cmplt_epi32_mask">
+	<type>Integer</type>
+	<type>Mask</type>
+	<CPUID>KNCNI</CPUID>
+	<category>Compare</category>
+	<parameter varname="a" type="__m512i"/>
+	<parameter varname="b" type="__m512i"/>
+	<description>Compare packed 32-bit integers in "a" and "b" for less-than, and store the results in mask vector "k".</description>
+	<operation>
+FOR j := 0 to 15
+	i := j*32
+	k[j] := ( a[i+31:i] &lt; b[i+31:i] ) ? 1 : 0
+ENDFOR
+k[MAX:16] := 0
+	</operation>
+	<instruction name='vpcmpltd' form='k {k}, zmm, zmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" rettype="__mmask16" name="_mm512_cmpneq_epi32_mask">
+	<type>Integer</type>
+	<type>Mask</type>
+	<CPUID>AVX512F/KNCNI</CPUID>
+	<category>Compare</category>
+	<parameter varname="a" type="__m512i"/>
+	<parameter varname="b" type="__m512i"/>
+	<description>Compare packed 32-bit integers in "a" and "b" for not-equal, and store the results in mask vector "k".</description>
+	<operation>
+FOR j := 0 to 15
+	i := j*32
+	k[j] := ( a[i+31:i] != b[i+31:i] ) ? 1 : 0
+ENDFOR
+k[MAX:16] := 0
+	</operation>
+	<instruction name='vpcmpd' form='k {k}, zmm, zmm, imm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" rettype="__mmask16" name="_mm512_mask_cmp_epi32_mask">
+	<type>Integer</type>
+	<type>Mask</type>
+	<CPUID>AVX512F/KNCNI</CPUID>
+	<category>Compare</category>
+	<parameter varname="k1" type="__mmask16"/>
+	<parameter varname="a" type="__m512i"/>
+	<parameter varname="b" type="__m512i"/>
+	<parameter varname="imm8" type="const _MM_CMPINT_ENUM"/>
+	<description>Compare packed 32-bit integers in "a" and "b" based on the comparison operand specified by "imm8", and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set).</description>
+	<operation>
+CASE (imm8[7:0]) OF
+0: OP := _MM_CMPINT_EQ
+1: OP := _MM_CMPINT_LT
+2: OP := _MM_CMPINT_LE
+3: OP := _MM_CMPINT_FALSE
+4: OP := _MM_CMPINT_NE
+5: OP := _MM_CMPINT_NLT
+6: OP := _MM_CMPINT_NLE
+7: OP := _MM_CMPINT_TRUE
+ESAC
+FOR j := 0 to 15
+	i := j*32
+	IF k1[j]
+		k[j] := ( a[i+31:i] OP b[i+31:i] ) ? 1 : 0
+	ELSE 
+		k[j] := 0
+	FI
+ENDFOR
+k[MAX:16] := 0
+	</operation>
+	<instruction name='vpcmpd' form='k {k}, zmm, zmm, imm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" rettype="__mmask16" name="_mm512_mask_cmpeq_epi32_mask">
+	<type>Integer</type>
+	<type>Mask</type>
+	<CPUID>AVX512F/KNCNI</CPUID>
+	<category>Compare</category>
+	<parameter varname="k1" type="__mmask16"/>
+	<parameter varname="a" type="__m512i"/>
+	<parameter varname="b" type="__m512i"/>
+	<description>Compare packed 32-bit integers in "a" and "b" for equality, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 15
+	i := j*32
+	IF k1[j]
+		k[j] := ( a[i+31:i] == b[i+31:i] ) ? 1 : 0
+	ELSE 
+		k[j] := 0
+	FI
+ENDFOR
+k[MAX:16] := 0
+	</operation>
+	<instruction name='vpcmpeqd' form='k {k}, zmm, zmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" rettype="__mmask16" name="_mm512_mask_cmpge_epi32_mask">
+	<type>Integer</type>
+	<type>Mask</type>
+	<CPUID>AVX512F/KNCNI</CPUID>
+	<category>Compare</category>
+	<parameter varname="k1" type="__mmask16"/>
+	<parameter varname="a" type="__m512i"/>
+	<parameter varname="b" type="__m512i"/>
+	<description>Compare packed 32-bit integers in "a" and "b" for greater-than-or-equal, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 15
+	i := j*32
+	IF k1[j]
+		k[j] := ( a[i+31:i] &gt;= b[i+31:i] ) ? 1 : 0
+	ELSE 
+		k[j] := 0
+	FI
+ENDFOR
+k[MAX:16] := 0
+	</operation>
+	<instruction name='vpcmpd' form='k {k}, zmm, zmm, imm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" rettype="__mmask16" name="_mm512_mask_cmpgt_epi32_mask">
+	<type>Integer</type>
+	<type>Mask</type>
+	<CPUID>AVX512F/KNCNI</CPUID>
+	<category>Compare</category>
+	<parameter varname="k1" type="__mmask16"/>
+	<parameter varname="a" type="__m512i"/>
+	<parameter varname="b" type="__m512i"/>
+	<description>Compare packed 32-bit integers in "a" and "b" for greater-than, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 15
+	i := j*32
+	IF k1[j]
+		k[j] := ( a[i+31:i] &gt; b[i+31:i] ) ? 1 : 0
+	ELSE 
+		k[j] := 0
+	FI
+ENDFOR
+k[MAX:16] := 0
+	</operation>
+	<instruction name='vpcmpgtd' form='k {k}, zmm, zmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" rettype="__mmask16" name="_mm512_mask_cmple_epi32_mask">
+	<type>Integer</type>
+	<type>Mask</type>
+	<CPUID>AVX512F/KNCNI</CPUID>
+	<category>Compare</category>
+	<parameter varname="k1" type="__mmask16"/>
+	<parameter varname="a" type="__m512i"/>
+	<parameter varname="b" type="__m512i"/>
+	<description>Compare packed 32-bit integers in "a" and "b" for less-than-or-equal, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 15
+	i := j*32
+	IF k1[j]
+		k[j] := ( a[i+31:i] &lt;= b[i+31:i] ) ? 1 : 0
+	ELSE 
+		k[j] := 0
+	FI
+ENDFOR
+k[MAX:16] := 0
+	</operation>
+	<instruction name='vpcmpd' form='k {k}, zmm, zmm, imm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__mmask16" name="_mm512_mask_cmplt_epi32_mask">
+	<type>Integer</type>
+	<type>Mask</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Compare</category>
+	<parameter varname="k1" type="__mmask16"/>
+	<parameter varname="a" type="__m512i"/>
+	<parameter varname="b" type="__m512i"/>
+	<description>Compare packed 32-bit integers in "a" and "b" for less-than, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 15
+	i := j*32
+	IF k1[j]
+		k[j] := ( a[i+31:i] &lt; b[i+31:i] ) ? 1 : 0
+	ELSE 
+		k[j] := 0
+	FI
+ENDFOR
+k[MAX:16] := 0
+	</operation>
+	<instruction name='vpcmpd' form='k {k}, zmm, zmm, imm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="KNC" rettype="__mmask16" name="_mm512_mask_cmplt_epi32_mask">
+	<type>Integer</type>
+	<type>Mask</type>
+	<CPUID>KNCNI</CPUID>
+	<category>Compare</category>
+	<parameter varname="k1" type="__mmask16"/>
+	<parameter varname="a" type="__m512i"/>
+	<parameter varname="b" type="__m512i"/>
+	<description>Compare packed 32-bit integers in "a" and "b" for less-than, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 15
+	i := j*32
+	IF k1[j]
+		k[j] := ( a[i+31:i] &lt; b[i+31:i] ) ? 1 : 0
+	ELSE 
+		k[j] := 0
+	FI
+ENDFOR
+k[MAX:16] := 0
+	</operation>
+	<instruction name='vpcmpltd' form='k {k}, zmm, zmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" rettype="__mmask16" name="_mm512_mask_cmpneq_epi32_mask">
+	<type>Integer</type>
+	<type>Mask</type>
+	<CPUID>AVX512F/KNCNI</CPUID>
+	<category>Compare</category>
+	<parameter varname="k1" type="__mmask16"/>
+	<parameter varname="a" type="__m512i"/>
+	<parameter varname="b" type="__m512i"/>
+	<description>Compare packed 32-bit integers in "a" and "b" for not-equal, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 15
+	i := j*32
+	IF k1[j]
+		k[j] := ( a[i+31:i] != b[i+31:i] ) ? 1 : 0
+	ELSE 
+		k[j] := 0
+	FI
+ENDFOR
+k[MAX:16] := 0
+	</operation>
+	<instruction name='vpcmpd' form='k {k}, zmm, zmm, imm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__mmask8" name="_mm512_cmp_epi64_mask">
+	<type>Integer</type>
+	<type>Mask</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Compare</category>
+	<parameter varname="a" type="__m512i"/>
+	<parameter varname="b" type="__m512i"/>
+	<parameter varname="imm8" type="const _MM_CMPINT_ENUM"/>
+	<description>Compare packed 64-bit integers in "a" and "b" based on the comparison operand specified by "imm8", and store the results in mask vector "k".</description>
+	<operation>
+CASE (imm8[7:0]) OF
+0: OP := _MM_CMPINT_EQ
+1: OP := _MM_CMPINT_LT
+2: OP := _MM_CMPINT_LE
+3: OP := _MM_CMPINT_FALSE
+4: OP := _MM_CMPINT_NE
+5: OP := _MM_CMPINT_NLT
+6: OP := _MM_CMPINT_NLE
+7: OP := _MM_CMPINT_TRUE
+ESAC
+FOR j := 0 to 7
+	i := j*64
+	k[j] := ( a[i+63:i] OP b[i+63:i] ) ? 1 : 0
+ENDFOR
+k[MAX:8] := 0
+	</operation>
+	<instruction name='vpcmpq' form='k {k}, zmm, zmm, imm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__mmask8" name="_mm512_cmpeq_epi64_mask">
+	<type>Integer</type>
+	<type>Mask</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Compare</category>
+	<parameter varname="a" type="__m512i"/>
+	<parameter varname="b" type="__m512i"/>
+	<description>Compare packed 64-bit integers in "a" and "b" for equality, and store the results in mask vector "k".</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*64
+	k[j] := ( a[i+63:i] == b[i+63:i] ) ? 1 : 0
+ENDFOR
+k[MAX:8] := 0
+	</operation>
+	<instruction name='vpcmpeqq' form='k {k}, zmm, zmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__mmask8" name="_mm512_cmpge_epi64_mask">
+	<type>Integer</type>
+	<type>Mask</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Compare</category>
+	<parameter varname="a" type="__m512i"/>
+	<parameter varname="b" type="__m512i"/>
+	<description>Compare packed 64-bit integers in "a" and "b" for greater-than-or-equal, and store the results in mask vector "k".</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*64
+	k[j] := ( a[i+63:i] &gt;= b[i+63:i] ) ? 1 : 0
+ENDFOR
+k[MAX:8] := 0
+	</operation>
+	<instruction name='vpcmpq' form='k {k}, zmm, zmm, imm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__mmask8" name="_mm512_cmpgt_epi64_mask">
+	<type>Integer</type>
+	<type>Mask</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Compare</category>
+	<parameter varname="a" type="__m512i"/>
+	<parameter varname="b" type="__m512i"/>
+	<description>Compare packed 64-bit integers in "a" and "b" for greater-than, and store the results in mask vector "k".</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*64
+	k[j] := ( a[i+63:i] &gt; b[i+63:i] ) ? 1 : 0
+ENDFOR
+k[MAX:8] := 0
+	</operation>
+	<instruction name='vpcmpgtq' form='k {k}, zmm, zmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__mmask8" name="_mm512_cmple_epi64_mask">
+	<type>Integer</type>
+	<type>Mask</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Compare</category>
+	<parameter varname="a" type="__m512i"/>
+	<parameter varname="b" type="__m512i"/>
+	<description>Compare packed 64-bit integers in "a" and "b" for less-than-or-equal, and store the results in mask vector "k".</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*64
+	k[j] := ( a[i+63:i] &lt;= b[i+63:i] ) ? 1 : 0
+ENDFOR
+k[MAX:8] := 0
+	</operation>
+	<instruction name='vpcmpq' form='k {k}, zmm, zmm, imm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__mmask8" name="_mm512_cmplt_epi64_mask">
+	<type>Integer</type>
+	<type>Mask</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Compare</category>
+	<parameter varname="a" type="__m512i"/>
+	<parameter varname="b" type="__m512i"/>
+	<description>Compare packed 64-bit integers in "a" and "b" for less-than, and store the results in mask vector "k".</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*64
+	k[j] := ( a[i+63:i] &lt; b[i+63:i] ) ? 1 : 0
+ENDFOR
+k[MAX:8] := 0
+	</operation>
+	<instruction name='vpcmpq' form='k {k}, zmm, zmm, imm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__mmask8" name="_mm512_cmpneq_epi64_mask">
+	<type>Integer</type>
+	<type>Mask</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Compare</category>
+	<parameter varname="a" type="__m512i"/>
+	<parameter varname="b" type="__m512i"/>
+	<description>Compare packed 64-bit integers in "a" and "b" for not-equal, and store the results in mask vector "k".</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*64
+	k[j] := ( a[i+63:i] != b[i+63:i] ) ? 1 : 0
+ENDFOR
+k[MAX:8] := 0
+	</operation>
+	<instruction name='vpcmpq' form='k {k}, zmm, zmm, imm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__mmask8" name="_mm512_mask_cmp_epi64_mask">
+	<type>Integer</type>
+	<type>Mask</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Compare</category>
+	<parameter varname="k1" type="__mmask8"/>
+	<parameter varname="a" type="__m512i"/>
+	<parameter varname="b" type="__m512i"/>
+	<parameter varname="imm8" type="const _MM_CMPINT_ENUM"/>
+	<description>Compare packed 64-bit integers in "a" and "b" based on the comparison operand specified by "imm8", and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set).</description>
+	<operation>
+CASE (imm8[7:0]) OF
+0: OP := _MM_CMPINT_EQ
+1: OP := _MM_CMPINT_LT
+2: OP := _MM_CMPINT_LE
+3: OP := _MM_CMPINT_FALSE
+4: OP := _MM_CMPINT_NE
+5: OP := _MM_CMPINT_NLT
+6: OP := _MM_CMPINT_NLE
+7: OP := _MM_CMPINT_TRUE
+ESAC
+FOR j := 0 to 7
+	i := j*64
+	IF k1[j]
+		k[j] := ( a[i+63:i] OP b[i+63:i] ) ? 1 : 0
+	ELSE 
+		k[j] := 0
+	FI
+ENDFOR
+k[MAX:8] := 0
+	</operation>
+	<instruction name='vpcmpq' form='k {k}, zmm, zmm, imm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__mmask8" name="_mm512_mask_cmpeq_epi64_mask">
+	<type>Integer</type>
+	<type>Mask</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Compare</category>
+	<parameter varname="k1" type="__mmask8"/>
+	<parameter varname="a" type="__m512i"/>
+	<parameter varname="b" type="__m512i"/>
+	<description>Compare packed 64-bit integers in "a" and "b" for equality, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*64
+	IF k1[j]
+		k[j] := ( a[i+63:i] == b[i+63:i] ) ? 1 : 0
+	ELSE 
+		k[j] := 0
+	FI
+ENDFOR
+k[MAX:8] := 0
+	</operation>
+	<instruction name='vpcmpeqq' form='k {k}, zmm, zmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__mmask8" name="_mm512_mask_cmpge_epi64_mask">
+	<type>Integer</type>
+	<type>Mask</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Compare</category>
+	<parameter varname="k1" type="__mmask8"/>
+	<parameter varname="a" type="__m512i"/>
+	<parameter varname="b" type="__m512i"/>
+	<description>Compare packed 64-bit integers in "a" and "b" for greater-than-or-equal, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*64
+	IF k1[j]
+		k[j] := ( a[i+63:i] &gt;= b[i+63:i] ) ? 1 : 0
+	ELSE 
+		k[j] := 0
+	FI
+ENDFOR
+k[MAX:8] := 0
+	</operation>
+	<instruction name='vpcmpq' form='k {k}, zmm, zmm, imm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__mmask8" name="_mm512_mask_cmpgt_epi64_mask">
+	<type>Integer</type>
+	<type>Mask</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Compare</category>
+	<parameter varname="k1" type="__mmask8"/>
+	<parameter varname="a" type="__m512i"/>
+	<parameter varname="b" type="__m512i"/>
+	<description>Compare packed 64-bit integers in "a" and "b" for greater-than, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*64
+	IF k1[j]
+		k[j] := ( a[i+63:i] &gt; b[i+63:i] ) ? 1 : 0
+	ELSE 
+		k[j] := 0
+	FI
+ENDFOR
+k[MAX:8] := 0
+	</operation>
+	<instruction name='vpcmpgtq' form='k {k}, zmm, zmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__mmask8" name="_mm512_mask_cmple_epi64_mask">
+	<type>Integer</type>
+	<type>Mask</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Compare</category>
+	<parameter varname="k1" type="__mmask8"/>
+	<parameter varname="a" type="__m512i"/>
+	<parameter varname="b" type="__m512i"/>
+	<description>Compare packed 64-bit integers in "a" and "b" for less-than-or-equal, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*64
+	IF k1[j]
+		k[j] := ( a[i+63:i] &lt;= b[i+63:i] ) ? 1 : 0
+	ELSE 
+		k[j] := 0
+	FI
+ENDFOR
+k[MAX:8] := 0
+	</operation>
+	<instruction name='vpcmpq' form='k {k}, zmm, zmm, imm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__mmask8" name="_mm512_mask_cmplt_epi64_mask">
+	<type>Integer</type>
+	<type>Mask</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Compare</category>
+	<parameter varname="k1" type="__mmask8"/>
+	<parameter varname="a" type="__m512i"/>
+	<parameter varname="b" type="__m512i"/>
+	<description>Compare packed 64-bit integers in "a" and "b" for less-than, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*64
+	IF k1[j]
+		k[j] := ( a[i+63:i] &lt; b[i+63:i] ) ? 1 : 0
+	ELSE 
+		k[j] := 0
+	FI
+ENDFOR
+k[MAX:8] := 0
+	</operation>
+	<instruction name='vpcmpq' form='k {k}, zmm, zmm, imm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__mmask8" name="_mm512_mask_cmpneq_epi64_mask">
+	<type>Integer</type>
+	<type>Mask</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Compare</category>
+	<parameter varname="k1" type="__mmask8"/>
+	<parameter varname="a" type="__m512i"/>
+	<parameter varname="b" type="__m512i"/>
+	<description>Compare packed 64-bit integers in "a" and "b" for not-equal, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*64
+	IF k1[j]
+		k[j] := ( a[i+63:i] != b[i+63:i] ) ? 1 : 0
+	ELSE 
+		k[j] := 0
+	FI
+ENDFOR
+k[MAX:8] := 0
+	</operation>
+	<instruction name='vpcmpq' form='k {k}, zmm, zmm, imm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" rettype="__mmask16" name="_mm512_cmp_epu32_mask">
+	<type>Integer</type>
+	<type>Mask</type>
+	<CPUID>AVX512F/KNCNI</CPUID>
+	<category>Compare</category>
+	<parameter varname="a" type="__m512i"/>
+	<parameter varname="b" type="__m512i"/>
+	<parameter varname="imm8" type="const _MM_CMPINT_ENUM"/>
+	<description>Compare packed unsigned 32-bit integers in "a" and "b" based on the comparison operand specified by "imm8", and store the results in mask vector "k".</description>
+	<operation>
+CASE (imm8[7:0]) OF
+0: OP := _MM_CMPINT_EQ
+1: OP := _MM_CMPINT_LT
+2: OP := _MM_CMPINT_LE
+3: OP := _MM_CMPINT_FALSE
+4: OP := _MM_CMPINT_NE
+5: OP := _MM_CMPINT_NLT
+6: OP := _MM_CMPINT_NLE
+7: OP := _MM_CMPINT_TRUE
+ESAC
+FOR j := 0 to 15
+	i := j*32
+	k[j] := ( a[i+31:i] OP b[i+31:i] ) ? 1 : 0
+ENDFOR
+k[MAX:16] := 0
+	</operation>
+	<instruction name='vpcmpud' form='k {k}, zmm, zmm, imm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" rettype="__mmask16" name="_mm512_cmpeq_epu32_mask">
+	<type>Integer</type>
+	<type>Mask</type>
+	<CPUID>AVX512F/KNCNI</CPUID>
+	<category>Compare</category>
+	<parameter varname="a" type="__m512i"/>
+	<parameter varname="b" type="__m512i"/>
+	<description>Compare packed unsigned 32-bit integers in "a" and "b" for equality, and store the results in mask vector "k".</description>
+	<operation>
+FOR j := 0 to 15
+	i := j*32
+	k[j] := ( a[i+31:i] == b[i+31:i] ) ? 1 : 0
+ENDFOR
+k[MAX:16] := 0
+	</operation>
+	<instruction name='vpcmpud' form='k {k}, zmm, zmm, imm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" rettype="__mmask16" name="_mm512_cmpge_epu32_mask">
+	<type>Integer</type>
+	<type>Mask</type>
+	<CPUID>AVX512F/KNCNI</CPUID>
+	<category>Compare</category>
+	<parameter varname="a" type="__m512i"/>
+	<parameter varname="b" type="__m512i"/>
+	<description>Compare packed unsigned 32-bit integers in "a" and "b" for greater-than-or-equal, and store the results in mask vector "k".</description>
+	<operation>
+FOR j := 0 to 15
+	i := j*32
+	k[j] := ( a[i+31:i] &gt;= b[i+31:i] ) ? 1 : 0
+ENDFOR
+k[MAX:16] := 0
+	</operation>
+	<instruction name='vpcmpud' form='k {k}, zmm, zmm, imm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" rettype="__mmask16" name="_mm512_cmpgt_epu32_mask">
+	<type>Integer</type>
+	<type>Mask</type>
+	<CPUID>AVX512F/KNCNI</CPUID>
+	<category>Compare</category>
+	<parameter varname="a" type="__m512i"/>
+	<parameter varname="b" type="__m512i"/>
+	<description>Compare packed unsigned 32-bit integers in "a" and "b" for greater-than, and store the results in mask vector "k".</description>
+	<operation>
+FOR j := 0 to 15
+	i := j*32
+	k[j] := ( a[i+31:i] &gt; b[i+31:i] ) ? 1 : 0
+ENDFOR
+k[MAX:16] := 0
+	</operation>
+	<instruction name='vpcmpud' form='k {k}, zmm, zmm, imm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" rettype="__mmask16" name="_mm512_cmple_epu32_mask">
+	<type>Integer</type>
+	<type>Mask</type>
+	<CPUID>AVX512F/KNCNI</CPUID>
+	<category>Compare</category>
+	<parameter varname="a" type="__m512i"/>
+	<parameter varname="b" type="__m512i"/>
+	<description>Compare packed unsigned 32-bit integers in "a" and "b" for less-than-or-equal, and store the results in mask vector "k".</description>
+	<operation>
+FOR j := 0 to 15
+	i := j*32
+	k[j] := ( a[i+31:i] &lt;= b[i+31:i] ) ? 1 : 0
+ENDFOR
+k[MAX:16] := 0
+	</operation>
+	<instruction name='vpcmpud' form='k {k}, zmm, zmm, imm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" rettype="__mmask16" name="_mm512_cmplt_epu32_mask">
+	<type>Integer</type>
+	<type>Mask</type>
+	<CPUID>AVX512F/KNCNI</CPUID>
+	<category>Compare</category>
+	<parameter varname="a" type="__m512i"/>
+	<parameter varname="b" type="__m512i"/>
+	<description>Compare packed unsigned 32-bit integers in "a" and "b" for less-than, and store the results in mask vector "k".</description>
+	<operation>
+FOR j := 0 to 15
+	i := j*32
+	k[j] := ( a[i+31:i] &lt; b[i+31:i] ) ? 1 : 0
+ENDFOR
+k[MAX:16] := 0
+	</operation>
+	<instruction name='vpcmpud' form='k {k}, zmm, zmm, imm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" rettype="__mmask16" name="_mm512_cmpneq_epu32_mask">
+	<type>Integer</type>
+	<type>Mask</type>
+	<CPUID>AVX512F/KNCNI</CPUID>
+	<category>Compare</category>
+	<parameter varname="a" type="__m512i"/>
+	<parameter varname="b" type="__m512i"/>
+	<description>Compare packed unsigned 32-bit integers in "a" and "b" for not-equal, and store the results in mask vector "k".</description>
+	<operation>
+FOR j := 0 to 15
+	i := j*32
+	k[j] := ( a[i+31:i] != b[i+31:i] ) ? 1 : 0
+ENDFOR
+k[MAX:16] := 0
+	</operation>
+	<instruction name='vpcmpud' form='k {k}, zmm, zmm, imm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" rettype="__mmask16" name="_mm512_mask_cmp_epu32_mask">
+	<type>Integer</type>
+	<type>Mask</type>
+	<CPUID>AVX512F/KNCNI</CPUID>
+	<category>Compare</category>
+	<parameter varname="k1" type="__mmask16"/>
+	<parameter varname="a" type="__m512i"/>
+	<parameter varname="b" type="__m512i"/>
+	<parameter varname="imm8" type="const _MM_CMPINT_ENUM"/>
+	<description>Compare packed unsigned 32-bit integers in "a" and "b" based on the comparison operand specified by "imm8", and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set).</description>
+	<operation>
+CASE (imm8[7:0]) OF
+0: OP := _MM_CMPINT_EQ
+1: OP := _MM_CMPINT_LT
+2: OP := _MM_CMPINT_LE
+3: OP := _MM_CMPINT_FALSE
+4: OP := _MM_CMPINT_NE
+5: OP := _MM_CMPINT_NLT
+6: OP := _MM_CMPINT_NLE
+7: OP := _MM_CMPINT_TRUE
+ESAC
+FOR j := 0 to 15
+	i := j*32
+	IF k1[j]
+		k[j] := ( a[i+31:i] OP b[i+31:i] ) ? 1 : 0
+	ELSE 
+		k[j] := 0
+	FI
+ENDFOR
+k[MAX:16] := 0
+	</operation>
+	<instruction name='vpcmpud' form='k {k}, zmm, zmm, imm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" rettype="__mmask16" name="_mm512_mask_cmpeq_epu32_mask">
+	<type>Integer</type>
+	<type>Mask</type>
+	<CPUID>AVX512F/KNCNI</CPUID>
+	<category>Compare</category>
+	<parameter varname="k1" type="__mmask16"/>
+	<parameter varname="a" type="__m512i"/>
+	<parameter varname="b" type="__m512i"/>
+	<description>Compare packed unsigned 32-bit integers in "a" and "b" for equality, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 15
+	i := j*32
+	IF k1[j]
+		k[j] := ( a[i+31:i] == b[i+31:i] ) ? 1 : 0
+	ELSE 
+		k[j] := 0
+	FI
+ENDFOR
+k[MAX:16] := 0
+	</operation>
+	<instruction name='vpcmpud' form='k {k}, zmm, zmm, imm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" rettype="__mmask16" name="_mm512_mask_cmpge_epu32_mask">
+	<type>Integer</type>
+	<type>Mask</type>
+	<CPUID>AVX512F/KNCNI</CPUID>
+	<category>Compare</category>
+	<parameter varname="k1" type="__mmask16"/>
+	<parameter varname="a" type="__m512i"/>
+	<parameter varname="b" type="__m512i"/>
+	<description>Compare packed unsigned 32-bit integers in "a" and "b" for greater-than-or-equal, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 15
+	i := j*32
+	IF k1[j]
+		k[j] := ( a[i+31:i] &gt;= b[i+31:i] ) ? 1 : 0
+	ELSE 
+		k[j] := 0
+	FI
+ENDFOR
+k[MAX:16] := 0
+	</operation>
+	<instruction name='vpcmpud' form='k {k}, zmm, zmm, imm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" rettype="__mmask16" name="_mm512_mask_cmpgt_epu32_mask">
+	<type>Integer</type>
+	<type>Mask</type>
+	<CPUID>AVX512F/KNCNI</CPUID>
+	<category>Compare</category>
+	<parameter varname="k1" type="__mmask16"/>
+	<parameter varname="a" type="__m512i"/>
+	<parameter varname="b" type="__m512i"/>
+	<description>Compare packed unsigned 32-bit integers in "a" and "b" for greater-than, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 15
+	i := j*32
+	IF k1[j]
+		k[j] := ( a[i+31:i] &gt; b[i+31:i] ) ? 1 : 0
+	ELSE 
+		k[j] := 0
+	FI
+ENDFOR
+k[MAX:16] := 0
+	</operation>
+	<instruction name='vpcmpud' form='k {k}, zmm, zmm, imm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" rettype="__mmask16" name="_mm512_mask_cmple_epu32_mask">
+	<type>Integer</type>
+	<type>Mask</type>
+	<CPUID>AVX512F/KNCNI</CPUID>
+	<category>Compare</category>
+	<parameter varname="k1" type="__mmask16"/>
+	<parameter varname="a" type="__m512i"/>
+	<parameter varname="b" type="__m512i"/>
+	<description>Compare packed unsigned 32-bit integers in "a" and "b" for less-than-or-equal, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 15
+	i := j*32
+	IF k1[j]
+		k[j] := ( a[i+31:i] &lt;= b[i+31:i] ) ? 1 : 0
+	ELSE 
+		k[j] := 0
+	FI
+ENDFOR
+k[MAX:16] := 0
+	</operation>
+	<instruction name='vpcmpud' form='k {k}, zmm, zmm, imm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" rettype="__mmask16" name="_mm512_mask_cmplt_epu32_mask">
+	<type>Integer</type>
+	<type>Mask</type>
+	<CPUID>AVX512F/KNCNI</CPUID>
+	<category>Compare</category>
+	<parameter varname="k1" type="__mmask16"/>
+	<parameter varname="a" type="__m512i"/>
+	<parameter varname="b" type="__m512i"/>
+	<description>Compare packed unsigned 32-bit integers in "a" and "b" for less-than, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 15
+	i := j*32
+	IF k1[j]
+		k[j] := ( a[i+31:i] &lt; b[i+31:i] ) ? 1 : 0
+	ELSE 
+		k[j] := 0
+	FI
+ENDFOR
+k[MAX:16] := 0
+	</operation>
+	<instruction name='vpcmpud' form='k {k}, zmm, zmm, imm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" rettype="__mmask16" name="_mm512_mask_cmpneq_epu32_mask">
+	<type>Integer</type>
+	<type>Mask</type>
+	<CPUID>AVX512F/KNCNI</CPUID>
+	<category>Compare</category>
+	<parameter varname="k1" type="__mmask16"/>
+	<parameter varname="a" type="__m512i"/>
+	<parameter varname="b" type="__m512i"/>
+	<description>Compare packed unsigned 32-bit integers in "a" and "b" for not-equal, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 15
+	i := j*32
+	IF k1[j]
+		k[j] := ( a[i+31:i] != b[i+31:i] ) ? 1 : 0
+	ELSE 
+		k[j] := 0
+	FI
+ENDFOR
+k[MAX:16] := 0
+	</operation>
+	<instruction name='vpcmpud' form='k {k}, zmm, zmm, imm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__mmask8" name="_mm512_cmp_epu64_mask">
+	<type>Integer</type>
+	<type>Mask</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Compare</category>
+	<parameter varname="a" type="__m512i"/>
+	<parameter varname="b" type="__m512i"/>
+	<parameter varname="imm8" type="const _MM_CMPINT_ENUM"/>
+	<description>Compare packed unsigned 64-bit integers in "a" and "b" based on the comparison operand specified by "imm8", and store the results in mask vector "k".</description>
+	<operation>
+CASE (imm8[7:0]) OF
+0: OP := _MM_CMPINT_EQ
+1: OP := _MM_CMPINT_LT
+2: OP := _MM_CMPINT_LE
+3: OP := _MM_CMPINT_FALSE
+4: OP := _MM_CMPINT_NE
+5: OP := _MM_CMPINT_NLT
+6: OP := _MM_CMPINT_NLE
+7: OP := _MM_CMPINT_TRUE
+ESAC
+FOR j := 0 to 7
+	i := j*64
+	k[j] := ( a[i+63:i] OP b[i+63:i] ) ? 1 : 0
+ENDFOR
+k[MAX:8] := 0
+	</operation>
+	<instruction name='vpcmpuq' form='k {k}, zmm, zmm, imm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__mmask8" name="_mm512_cmpeq_epu64_mask">
+	<type>Integer</type>
+	<type>Mask</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Compare</category>
+	<parameter varname="a" type="__m512i"/>
+	<parameter varname="b" type="__m512i"/>
+	<description>Compare packed unsigned 64-bit integers in "a" and "b" for equality, and store the results in mask vector "k".</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*64
+	k[j] := ( a[i+63:i] == b[i+63:i] ) ? 1 : 0
+ENDFOR
+k[MAX:8] := 0
+	</operation>
+	<instruction name='vpcmpuq' form='k {k}, zmm, zmm, imm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__mmask8" name="_mm512_cmpge_epu64_mask">
+	<type>Integer</type>
+	<type>Mask</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Compare</category>
+	<parameter varname="a" type="__m512i"/>
+	<parameter varname="b" type="__m512i"/>
+	<description>Compare packed unsigned 64-bit integers in "a" and "b" for greater-than-or-equal, and store the results in mask vector "k".</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*64
+	k[j] := ( a[i+63:i] &gt;= b[i+63:i] ) ? 1 : 0
+ENDFOR
+k[MAX:8] := 0
+	</operation>
+	<instruction name='vpcmpuq' form='k {k}, zmm, zmm, imm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__mmask8" name="_mm512_cmpgt_epu64_mask">
+	<type>Integer</type>
+	<type>Mask</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Compare</category>
+	<parameter varname="a" type="__m512i"/>
+	<parameter varname="b" type="__m512i"/>
+	<description>Compare packed unsigned 64-bit integers in "a" and "b" for greater-than, and store the results in mask vector "k".</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*64
+	k[j] := ( a[i+63:i] &gt; b[i+63:i] ) ? 1 : 0
+ENDFOR
+k[MAX:8] := 0
+	</operation>
+	<instruction name='vpcmpuq' form='k {k}, zmm, zmm, imm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__mmask8" name="_mm512_cmple_epu64_mask">
+	<type>Integer</type>
+	<type>Mask</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Compare</category>
+	<parameter varname="a" type="__m512i"/>
+	<parameter varname="b" type="__m512i"/>
+	<description>Compare packed unsigned 64-bit integers in "a" and "b" for less-than-or-equal, and store the results in mask vector "k".</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*64
+	k[j] := ( a[i+63:i] &lt;= b[i+63:i] ) ? 1 : 0
+ENDFOR
+k[MAX:8] := 0
+	</operation>
+	<instruction name='vpcmpuq' form='k {k}, zmm, zmm, imm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__mmask8" name="_mm512_cmplt_epu64_mask">
+	<type>Integer</type>
+	<type>Mask</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Compare</category>
+	<parameter varname="a" type="__m512i"/>
+	<parameter varname="b" type="__m512i"/>
+	<description>Compare packed unsigned 64-bit integers in "a" and "b" for less-than, and store the results in mask vector "k".</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*64
+	k[j] := ( a[i+63:i] &lt; b[i+63:i] ) ? 1 : 0
+ENDFOR
+k[MAX:8] := 0
+	</operation>
+	<instruction name='vpcmpuq' form='k {k}, zmm, zmm, imm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__mmask8" name="_mm512_cmpneq_epu64_mask">
+	<type>Integer</type>
+	<type>Mask</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Compare</category>
+	<parameter varname="a" type="__m512i"/>
+	<parameter varname="b" type="__m512i"/>
+	<description>Compare packed unsigned 64-bit integers in "a" and "b" for not-equal, and store the results in mask vector "k".</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*64
+	k[j] := ( a[i+63:i] != b[i+63:i] ) ? 1 : 0
+ENDFOR
+k[MAX:8] := 0
+	</operation>
+	<instruction name='vpcmpuq' form='k {k}, zmm, zmm, imm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__mmask8" name="_mm512_mask_cmp_epu64_mask">
+	<type>Integer</type>
+	<type>Mask</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Compare</category>
+	<parameter varname="k1" type="__mmask8"/>
+	<parameter varname="a" type="__m512i"/>
+	<parameter varname="b" type="__m512i"/>
+	<parameter varname="imm8" type="const _MM_CMPINT_ENUM"/>
+	<description>Compare packed unsigned 64-bit integers in "a" and "b" based on the comparison operand specified by "imm8", and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set).</description>
+	<operation>
+CASE (imm8[7:0]) OF
+0: OP := _MM_CMPINT_EQ
+1: OP := _MM_CMPINT_LT
+2: OP := _MM_CMPINT_LE
+3: OP := _MM_CMPINT_FALSE
+4: OP := _MM_CMPINT_NE
+5: OP := _MM_CMPINT_NLT
+6: OP := _MM_CMPINT_NLE
+7: OP := _MM_CMPINT_TRUE
+ESAC
+FOR j := 0 to 7
+	i := j*64
+	IF k1[j]
+		k[j] := ( a[i+63:i] OP b[i+63:i] ) ? 1 : 0
+	ELSE 
+		k[j] := 0
+	FI
+ENDFOR
+k[MAX:8] := 0
+	</operation>
+	<instruction name='vpcmpuq' form='k {k}, zmm, zmm, imm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__mmask8" name="_mm512_mask_cmpeq_epu64_mask">
+	<type>Integer</type>
+	<type>Mask</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Compare</category>
+	<parameter varname="k1" type="__mmask8"/>
+	<parameter varname="a" type="__m512i"/>
+	<parameter varname="b" type="__m512i"/>
+	<description>Compare packed unsigned 64-bit integers in "a" and "b" for equality, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*64
+	IF k1[j]
+		k[j] := ( a[i+63:i] == b[i+63:i] ) ? 1 : 0
+	ELSE 
+		k[j] := 0
+	FI
+ENDFOR
+k[MAX:8] := 0
+	</operation>
+	<instruction name='vpcmpuq' form='k {k}, zmm, zmm, imm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__mmask8" name="_mm512_mask_cmpge_epu64_mask">
+	<type>Integer</type>
+	<type>Mask</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Compare</category>
+	<parameter varname="k1" type="__mmask8"/>
+	<parameter varname="a" type="__m512i"/>
+	<parameter varname="b" type="__m512i"/>
+	<description>Compare packed unsigned 64-bit integers in "a" and "b" for greater-than-or-equal, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*64
+	IF k1[j]
+		k[j] := ( a[i+63:i] &gt;= b[i+63:i] ) ? 1 : 0
+	ELSE 
+		k[j] := 0
+	FI
+ENDFOR
+k[MAX:8] := 0
+	</operation>
+	<instruction name='vpcmpuq' form='k {k}, zmm, zmm, imm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__mmask8" name="_mm512_mask_cmpgt_epu64_mask">
+	<type>Integer</type>
+	<type>Mask</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Compare</category>
+	<parameter varname="k1" type="__mmask8"/>
+	<parameter varname="a" type="__m512i"/>
+	<parameter varname="b" type="__m512i"/>
+	<description>Compare packed unsigned 64-bit integers in "a" and "b" for greater-than, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*64
+	IF k1[j]
+		k[j] := ( a[i+63:i] &gt; b[i+63:i] ) ? 1 : 0
+	ELSE 
+		k[j] := 0
+	FI
+ENDFOR
+k[MAX:8] := 0
+	</operation>
+	<instruction name='vpcmpuq' form='k {k}, zmm, zmm, imm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__mmask8" name="_mm512_mask_cmple_epu64_mask">
+	<type>Integer</type>
+	<type>Mask</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Compare</category>
+	<parameter varname="k1" type="__mmask8"/>
+	<parameter varname="a" type="__m512i"/>
+	<parameter varname="b" type="__m512i"/>
+	<description>Compare packed unsigned 64-bit integers in "a" and "b" for less-than-or-equal, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*64
+	IF k1[j]
+		k[j] := ( a[i+63:i] &lt;= b[i+63:i] ) ? 1 : 0
+	ELSE 
+		k[j] := 0
+	FI
+ENDFOR
+k[MAX:8] := 0
+	</operation>
+	<instruction name='vpcmpuq' form='k {k}, zmm, zmm, imm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__mmask8" name="_mm512_mask_cmplt_epu64_mask">
+	<type>Integer</type>
+	<type>Mask</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Compare</category>
+	<parameter varname="k1" type="__mmask8"/>
+	<parameter varname="a" type="__m512i"/>
+	<parameter varname="b" type="__m512i"/>
+	<description>Compare packed unsigned 64-bit integers in "a" and "b" for less-than, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*64
+	IF k1[j]
+		k[j] := ( a[i+63:i] &lt; b[i+63:i] ) ? 1 : 0
+	ELSE 
+		k[j] := 0
+	FI
+ENDFOR
+k[MAX:8] := 0
+	</operation>
+	<instruction name='vpcmpuq' form='k {k}, zmm, zmm, imm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__mmask8" name="_mm512_mask_cmpneq_epu64_mask">
+	<type>Integer</type>
+	<type>Mask</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Compare</category>
+	<parameter varname="k1" type="__mmask8"/>
+	<parameter varname="a" type="__m512i"/>
+	<parameter varname="b" type="__m512i"/>
+	<description>Compare packed unsigned 64-bit integers in "a" and "b" for not-equal, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*64
+	IF k1[j]
+		k[j] := ( a[i+63:i] != b[i+63:i] ) ? 1 : 0
+	ELSE 
+		k[j] := 0
+	FI
+ENDFOR
+k[MAX:8] := 0
+	</operation>
+	<instruction name='vpcmpuq' form='k {k}, zmm, zmm, imm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512i" name="_mm512_mask_compress_epi32">
+	<type>Integer</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Swizzle</category>
+	<parameter varname="src" type="__m512i"/>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="a" type="__m512i"/>
+	<description>Contiguously store the active 32-bit integers in "a" (those with their respective bit set in writemask "k") to "dst", and pass through the remaining elements from "src".</description>
+	<operation>
+size := 32
+m := 0
+FOR j := 0 to 15
+	i := j*32
+	IF k[j]
+		dst[m+size-1:m] := a[i+31:i]
+		m := m + size
+	FI
+ENDFOR
+dst[511:m] := src[511:m]
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vpcompressd' form='zmm {k}, zmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="void" name="_mm512_mask_compressstoreu_epi32">
+	<type>Integer</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Store</category>
+	<category>Swizzle</category>
+	<parameter varname="base_addr" type="void*"/>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="a" type="__m512i"/>
+	<description>Contiguously store the active 32-bit integers in "a" (those with their respective bit set in writemask "k") to unaligned memory at "base_addr".</description>
+	<operation>
+size := 32
+m := base_addr
+FOR j := 0 to 15
+	i := j*32
+	IF k[j]
+		MEM[m+size-1:m] := a[i+31:i]
+		m := m + size
+	FI
+ENDFOR
+	</operation>
+	<instruction name='vpcompressd' form='m32 {k}, zmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512i" name="_mm512_maskz_compress_epi32">
+	<type>Integer</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Swizzle</category>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="a" type="__m512i"/>
+	<description>Contiguously store the active 32-bit integers in "a" (those with their respective bit set in zeromask "k") to "dst", and set the remaining elements to zero.</description>
+	<operation>
+size := 32
+m := 0
+FOR j := 0 to 15
+	i := j*32
+	IF k[j]
+		dst[m+size-1:m] := a[i+31:i]
+		m := m + size
+	FI
+ENDFOR
+dst[511:m] := 0
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vpcompressd' form='zmm {k}, zmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512i" name="_mm512_mask_compress_epi64">
+	<type>Integer</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Swizzle</category>
+	<parameter varname="src" type="__m512i"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m512i"/>
+	<description>Contiguously store the active 64-bit integers in "a" (those with their respective bit set in writemask "k") to "dst", and pass through the remaining elements from "src".</description>
+	<operation>
+size := 64
+m := 0
+FOR j := 0 to 7
+	i := j*64
+	IF k[j]
+		dst[m+size-1:m] := a[i+63:i]
+		m := m + size
+	FI
+ENDFOR
+dst[511:m] := src[511:m]
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vpcompressq' form='zmm {k}, zmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="void" name="_mm512_mask_compressstoreu_epi64">
+	<type>Integer</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Store</category>
+	<category>Swizzle</category>
+	<parameter varname="base_addr" type="void*"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m512i"/>
+	<description>Contiguously store the active 64-bit integers in "a" (those with their respective bit set in writemask "k") to unaligned memory at "base_addr".</description>
+	<operation>
+size := 64
+m := base_addr
+FOR j := 0 to 7
+	i := j*64
+	IF k[j]
+		MEM[m+size-1:m] := a[i+63:i]
+		m := m + size
+	FI
+ENDFOR
+	</operation>
+	<instruction name='vpcompressq' form='m64 {k}, zmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512i" name="_mm512_maskz_compress_epi64">
+	<type>Integer</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Swizzle</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m512i"/>
+	<description>Contiguously store the active 64-bit integers in "a" (those with their respective bit set in zeromask "k") to "dst", and set the remaining elements to zero.</description>
+	<operation>
+size := 64
+m := 0
+FOR j := 0 to 7
+	i := j*64
+	IF k[j]
+		dst[m+size-1:m] := a[i+63:i]
+		m := m + size
+	FI
+ENDFOR
+dst[511:m] := 0
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vpcompressq' form='zmm {k}, zmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512i" name="_mm512_conflict_epi32">
+	<type>Integer</type>
+	<CPUID>AVX512CD</CPUID>
+	<category>Compare</category>
+	<parameter varname="a" type="__m512i"/>
+	<description>Test each 32-bit element of "a" for equality with all other elements in "a" closer to the least significant bit. Each element's comparison forms a zero extended bit vector in "dst".</description>
+	<operation>
+FOR j := 0 to 15
+	i := j*32
+	FOR k := 0 to j-1
+		m := k*32
+		dst[i+k] := (a[i+31:i] == a[m+31:m]) ? 1 : 0
+	ENDFOR
+	dst[i+31:i+j] := 0
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vpconflictd' form='zmm {k}, zmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512i" name="_mm512_mask_conflict_epi32">
+	<type>Integer</type>
+	<CPUID>AVX512CD</CPUID>
+	<category>Compare</category>
+	<parameter varname="src" type="__m512i"/>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="a" type="__m512i"/>
+	<description>Test each 32-bit element of "a" for equality with all other elements in "a" closer to the least significant bit using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). Each element's comparison forms a zero extended bit vector in "dst".</description>
+	<operation>
+FOR j := 0 to 15
+	i := j*32
+	IF k[j]
+		FOR l := 0 to j-1
+			m := l*32
+			dst[i+l] := (a[i+31:i] == a[m+31:m]) ? 1 : 0
+		ENDFOR
+		dst[i+31:i+j] := 0
+	ELSE
+		dst[i+31:i] := src[i+31:i]
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vpconflictd' form='zmm {k}, zmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512i" name="_mm512_maskz_conflict_epi32">
+	<type>Integer</type>
+	<CPUID>AVX512CD</CPUID>
+	<category>Compare</category>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="a" type="__m512i"/>
+	<description>Test each 32-bit element of "a" for equality with all other elements in "a" closer to the least significant bit using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). Each element's comparison forms a zero extended bit vector in "dst".</description>
+	<operation>
+FOR j := 0 to 15
+	i := j*32
+	IF k[j]
+		FOR l := 0 to j-1
+			m := l*32
+			dst[i+l] := (a[i+31:i] == a[m+31:m]) ? 1 : 0
+		ENDFOR
+		dst[i+31:i+j] := 0
+	ELSE
+		dst[i+31:i] := 0
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vpconflictd' form='zmm {k}, zmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512i" name="_mm512_conflict_epi64">
+	<type>Integer</type>
+	<CPUID>AVX512CD</CPUID>
+	<category>Compare</category>
+	<parameter varname="a" type="__m512i"/>
+	<description>Test each 64-bit element of "a" for equality with all other elements in "a" closer to the least significant bit. Each element's comparison forms a zero extended bit vector in "dst".</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*64
+	FOR k := 0 to j-1
+		m := k*64
+		dst[i+k] := (a[i+63:i] == a[m+63:m]) ? 1 : 0
+	ENDFOR
+	dst[i+63:i+j] := 0
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vpconflictq' form='zmm {k}, zmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512i" name="_mm512_mask_conflict_epi64">
+	<type>Integer</type>
+	<CPUID>AVX512CD</CPUID>
+	<category>Compare</category>
+	<parameter varname="src" type="__m512i"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m512i"/>
+	<description>Test each 64-bit element of "a" for equality with all other elements in "a" closer to the least significant bit using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). Each element's comparison forms a zero extended bit vector in "dst".</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*64
+	IF k[j]
+		FOR l := 0 to j-1
+			m := l*64
+			dst[i+l] := (a[i+63:i] == a[m+63:m]) ? 1 : 0
+		ENDFOR
+		dst[i+63:i+j] := 0
+	ELSE
+		dst[i+63:i] := src[i+63:i]
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vpconflictq' form='zmm {k}, zmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512i" name="_mm512_maskz_conflict_epi64">
+	<type>Integer</type>
+	<CPUID>AVX512CD</CPUID>
+	<category>Compare</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m512i"/>
+	<description>Test each 64-bit element of "a" for equality with all other elements in "a" closer to the least significant bit using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). Each element's comparison forms a zero extended bit vector in "dst".</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*64
+	IF k[j]
+		FOR l := 0 to j-1
+			m := l*64
+			dst[i+l] := (a[i+63:i] == a[m+63:m]) ? 1 : 0
+		ENDFOR
+		dst[i+63:i+j] := 0
+	ELSE
+		dst[i+63:i] := 0
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vpconflictq' form='zmm {k}, zmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" rettype="__m512i" name="_mm512_mask_permutevar_epi32">
+	<type>Integer</type>
+	<CPUID>AVX512F/KNCNI</CPUID>
+	<category>Swizzle</category>
+	<parameter varname="src" type="__m512i"/>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="idx" type="__m512i"/>
+	<parameter varname="a" type="__m512i"/>
+	<description>Shuffle 32-bit integers in "a" across lanes using the corresponding index in "idx", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). Note that this intrinsic shuffles across 128-bit lanes, unlike past intrinsics that use the "permutevar" name. This intrinsic is identical to "_mm512_mask_permutexvar_epi32", and it is recommended that you use that intrinsic name.</description>
+	<operation>
+FOR j := 0 to 15
+	i := j*32
+	id := idx[i+3:i]*32
+	IF k[j]
+		dst[i+31:i] := a[id+31:id]
+	ELSE
+		dst[i+31:i] := src[i+31:i]
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vpermd' form='zmm {k}, zmm, zmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512i" name="_mm512_mask_permutexvar_epi32">
+	<type>Integer</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Swizzle</category>
+	<parameter varname="src" type="__m512i"/>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="idx" type="__m512i"/>
+	<parameter varname="a" type="__m512i"/>
+	<description>Shuffle 32-bit integers in "a" across lanes using the corresponding index in "idx", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 15
+	i := j*32
+	id := idx[i+3:i]*32
+	IF k[j]
+		dst[i+31:i] := a[id+31:id]
+	ELSE
+		dst[i+31:i] := src[i+31:i]
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vpermd' form='zmm {k}, zmm, zmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+
+<intrinsic tech="AVX-512" rettype="__m512i" name="_mm512_maskz_permutexvar_epi32">
+	<type>Integer</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Swizzle</category>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="idx" type="__m512i"/>
+	<parameter varname="a" type="__m512i"/>
+	<description>Shuffle 32-bit integers in "a" across lanes using the corresponding index in "idx", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 15
+	i := j*32
+	id := idx[i+3:i]*32
+	IF k[j]
+		dst[i+31:i] := a[id+31:id]
+	ELSE
+		dst[i+31:i] := 0
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vpermd' form='zmm {k}, zmm, zmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" rettype="__m512i" name="_mm512_permutevar_epi32">
+	<type>Integer</type>
+	<CPUID>AVX512F/KNCNI</CPUID>
+	<category>Swizzle</category>
+	<parameter varname="idx" type="__m512i"/>
+	<parameter varname="a" type="__m512i"/>
+	<description>Shuffle 32-bit integers in "a" across lanes using the corresponding index in "idx", and store the results in "dst". Note that this intrinsic shuffles across 128-bit lanes, unlike past intrinsics that use the "permutevar" name. This intrinsic is identical to "_mm512_permutexvar_epi32", and it is recommended that you use that intrinsic name.
+	</description>
+	<operation>
+FOR j := 0 to 15
+	i := j*32
+	id := idx[i+3:i]*32
+	dst[i+31:i] := a[id+31:id]
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vpermd' form='zmm {k}, zmm, zmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512i" name="_mm512_permutexvar_epi32">
+	<type>Integer</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Swizzle</category>
+	<parameter varname="idx" type="__m512i"/>
+	<parameter varname="a" type="__m512i"/>
+	<description>Shuffle 32-bit integers in "a" across lanes using the corresponding index in "idx", and store the results in "dst".
+	</description>
+	<operation>
+FOR j := 0 to 15
+	i := j*32
+	id := idx[i+3:i]*32
+	dst[i+31:i] := a[id+31:id]
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vpermd' form='zmm {k}, zmm, zmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512i" name="_mm512_mask2_permutex2var_epi32">
+	<type>Integer</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Swizzle</category>
+	<parameter varname="a" type="__m512i"/>
+	<parameter varname="idx" type="__m512i"/>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="b" type="__m512i"/>
+	<description>Shuffle 32-bit integers in "a" and "b" across lanes using the corresponding selector and index in "idx", and store the results in "dst" using writemask "k" (elements are copied from "idx" when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 15
+	i := j*32
+	off := idx[i+3:i]*32
+	IF k[j]
+		dst[i+31:i] := idx[i+4] ? b[off+31:off] : a[off+31:off]
+	ELSE
+		dst[i+31:i] := idx[i+31:i]
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vpermi2d' form='zmm {k}, zmm, zmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512i" name="_mm512_mask_permutex2var_epi32">
+	<type>Integer</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Swizzle</category>
+	<parameter varname="a" type="__m512i"/>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="idx" type="__m512i"/>
+	<parameter varname="b" type="__m512i"/>
+	<description>Shuffle 32-bit integers in "a" and "b" across lanes using the corresponding selector and index in "idx", and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 15
+	i := j*32
+	off := idx[i+3:i]*32
+	IF k[j]
+		dst[i+31:i] := idx[i+4] ? b[off+31:off] : a[off+31:off]
+	ELSE
+		dst[i+31:i] := a[i+31:i]
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vpermt2d' form='zmm {k}, zmm, zmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512i" name="_mm512_maskz_permutex2var_epi32">
+	<type>Integer</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Swizzle</category>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="a" type="__m512i"/>
+	<parameter varname="idx" type="__m512i"/>
+	<parameter varname="b" type="__m512i"/>
+	<description>Shuffle 32-bit integers in "a" and "b" across lanes using the corresponding selector and index in "idx", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 15
+	i := j*32
+	off := idx[i+3:i]*32
+	IF k[j]
+		dst[i+31:i] := (idx[i+4]) ? b[off+31:off] : a[off+31:off]
+	ELSE
+		dst[i+31:i] := 0
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vpermi2d' form='zmm {k}, zmm, zmm'/>
+	<instruction name='vpermt2d' form='zmm {k}, zmm, zmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512i" name="_mm512_permutex2var_epi32">
+	<type>Integer</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Swizzle</category>
+	<parameter varname="a" type="__m512i"/>
+	<parameter varname="idx" type="__m512i"/>
+	<parameter varname="b" type="__m512i"/>
+	<description>Shuffle 32-bit integers in "a" and "b" across lanes using the corresponding selector and index in "idx", and store the results in "dst".
+	</description>
+	<operation>
+FOR j := 0 to 15
+	i := j*32
+	off := idx[i+3:i]*32
+	dst[i+31:i] := idx[i+4] ? b[off+31:off] : a[off+31:off]
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vpermi2d' form='zmm {k}, zmm, zmm'/>
+	<instruction name='vpermt2d' form='zmm {k}, zmm, zmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512d" name="_mm512_mask2_permutex2var_pd">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Swizzle</category>
+	<parameter varname="a" type="__m512d"/>
+	<parameter varname="idx" type="__m512i"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="b" type="__m512d"/>
+	<description>Shuffle double-precision (64-bit) floating-point elements in "a" and "b" across lanes using the corresponding selector and index in "idx", and store the results in "dst" using writemask "k" (elements are copied from "idx" when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*64
+	off := idx[i+2:i]*64
+	IF k[j]
+		dst[i+63:i] := idx[i+3] ? b[off+63:off] : a[off+63:off]
+	ELSE
+		dst[i+63:i] := idx[i+63:i]
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vpermi2pd' form='zmm {k}, zmm, zmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512d" name="_mm512_mask_permutex2var_pd">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Swizzle</category>
+	<parameter varname="a" type="__m512d"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="idx" type="__m512i"/>
+	<parameter varname="b" type="__m512d"/>
+	<description>Shuffle double-precision (64-bit) floating-point elements in "a" and "b" across lanes using the corresponding selector and index in "idx", and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*64
+	off := idx[i+2:i]*64
+	IF k[j]
+		dst[i+63:i] := idx[i+3] ? b[off+63:off] : a[off+63:off]
+	ELSE
+		dst[i+63:i] := a[i+63:i]
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vpermt2pd' form='zmm {k}, zmm, zmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512d" name="_mm512_maskz_permutex2var_pd">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Swizzle</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m512d"/>
+	<parameter varname="idx" type="__m512i"/>
+	<parameter varname="b" type="__m512d"/>
+	<description>Shuffle double-precision (64-bit) floating-point elements in "a" and "b" across lanes using the corresponding selector and index in "idx", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*64
+	off := idx[i+2:i]*64
+	IF k[j]
+		dst[i+63:i] := (idx[i+3]) ? b[off+63:off] : a[off+63:off]
+	ELSE
+		dst[i+63:i] := 0
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vpermi2pd' form='zmm {k}, zmm, zmm'/>
+	<instruction name='vpermt2pd' form='zmm {k}, zmm, zmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512d" name="_mm512_permutex2var_pd">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Swizzle</category>
+	<parameter varname="a" type="__m512d"/>
+	<parameter varname="idx" type="__m512i"/>
+	<parameter varname="b" type="__m512d"/>
+	<description>Shuffle double-precision (64-bit) floating-point elements in "a" and "b" across lanes using the corresponding selector and index in "idx", and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*64
+	off := idx[i+2:i]*64
+	dst[i+63:i] := idx[i+3] ? b[off+63:off] : a[off+63:off]
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vpermi2pd' form='zmm, zmm, zmm'/>
+	<instruction name='vpermt2pd' form='zmm, zmm, zmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512" name="_mm512_mask2_permutex2var_ps">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Swizzle</category>
+	<parameter varname="a" type="__m512"/>
+	<parameter varname="idx" type="__m512i"/>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="b" type="__m512"/>
+	<description>Shuffle single-precision (32-bit) floating-point elements in "a" and "b" across lanes using the corresponding selector and index in "idx", and store the results in "dst" using writemask "k" (elements are copied from "idx" when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 15
+	i := j*32
+	off := idx[i+3:i]*32
+	IF k[j]
+		dst[i+31:i] := idx[i+4] ? b[off+31:off] : a[off+31:off]
+	ELSE
+		dst[i+31:i] := idx[i+31:i]
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vpermi2ps' form='zmm {k}, zmm, zmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512" name="_mm512_mask_permutex2var_ps">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Swizzle</category>
+	<parameter varname="a" type="__m512"/>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="idx" type="__m512i"/>
+	<parameter varname="b" type="__m512"/>
+	<description>Shuffle single-precision (32-bit) floating-point elements in "a" and "b" across lanes using the corresponding selector and index in "idx", and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 15
+	i := j*32
+	off := idx[i+3:i]*32
+	IF k[j]
+		dst[i+31:i] := idx[i+4] ? b[off+31:off] : a[off+31:off]
+	ELSE
+		dst[i+31:i] := a[i+31:i]
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vpermt2ps' form='zmm {k}, zmm, zmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512" name="_mm512_maskz_permutex2var_ps">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Swizzle</category>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="a" type="__m512"/>
+	<parameter varname="idx" type="__m512i"/>
+	<parameter varname="b" type="__m512"/>
+	<description>Shuffle single-precision (32-bit) floating-point elements in "a" and "b" across lanes using the corresponding selector and index in "idx", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 15
+	i := j*32
+	off := idx[i+3:i]*32
+	IF k[j]
+		dst[i+31:i] := (idx[i+4]) ? b[off+31:off] : a[off+31:off]
+	ELSE
+		dst[i+31:i] := 0
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vpermi2ps' form='zmm {k}, zmm, zmm'/>
+	<instruction name='vpermt2ps' form='zmm {k}, zmm, zmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512" name="_mm512_permutex2var_ps">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Swizzle</category>
+	<parameter varname="a" type="__m512"/>
+	<parameter varname="idx" type="__m512i"/>
+	<parameter varname="b" type="__m512"/>
+	<description>Shuffle single-precision (32-bit) floating-point elements in "a" and "b" across lanes using the corresponding selector and index in "idx", and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 15
+	i := j*32
+	off := idx[i+3:i]*32
+	dst[i+31:i] := idx[i+4] ? b[off+31:off] : a[off+31:off]
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vpermi2ps' form='zmm, zmm, zmm'/>
+	<instruction name='vpermt2ps' form='zmm, zmm, zmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512i" name="_mm512_mask2_permutex2var_epi64">
+	<type>Integer</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Swizzle</category>
+	<parameter varname="a" type="__m512i"/>
+	<parameter varname="idx" type="__m512i"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="b" type="__m512i"/>
+	<description>Shuffle 64-bit integers in "a" and "b" across lanes using the corresponding selector and index in "idx", and store the results in "dst" using writemask "k" (elements are copied from "idx" when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*64
+	off := idx[i+2:i]*64
+	IF k[j]
+		dst[i+63:i] := idx[i+3] ? b[off+63:off] : a[off+63:off]
+	ELSE
+		dst[i+63:i] := idx[i+63:i]
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vpermi2q' form='zmm {k}, zmm, zmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512i" name="_mm512_mask_permutex2var_epi64">
+	<type>Integer</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Swizzle</category>
+	<parameter varname="a" type="__m512i"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="idx" type="__m512i"/>
+	<parameter varname="b" type="__m512i"/>
+	<description>Shuffle 64-bit integers in "a" and "b" across lanes using the corresponding selector and index in "idx", and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*64
+	off := idx[i+2:i]*64
+	IF k[j]
+		dst[i+63:i] := idx[i+3] ? b[off+63:off] : a[off+63:off]
+	ELSE
+		dst[i+63:i] := a[i+63:i]
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vpermt2q' form='zmm {k}, zmm, zmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512i" name="_mm512_maskz_permutex2var_epi64">
+	<type>Integer</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Swizzle</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m512i"/>
+	<parameter varname="idx" type="__m512i"/>
+	<parameter varname="b" type="__m512i"/>
+	<description>Shuffle 64-bit integers in "a" and "b" across lanes using the corresponding selector and index in "idx", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*64
+	off := idx[i+2:i]*64
+	IF k[j]
+		dst[i+63:i] := (idx[i+3]) ? b[off+63:off] : a[off+63:off]
+	ELSE
+		dst[i+63:i] := 0
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vpermi2q' form='zmm {k}, zmm, zmm'/>
+	<instruction name='vpermt2q' form='zmm {k}, zmm, zmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512i" name="_mm512_permutex2var_epi64">
+	<type>Integer</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Swizzle</category>
+	<parameter varname="a" type="__m512i"/>
+	<parameter varname="idx" type="__m512i"/>
+	<parameter varname="b" type="__m512i"/>
+	<description>Shuffle 64-bit integers in "a" and "b" across lanes using the corresponding selector and index in "idx", and store the results in "dst".
+	</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*64
+	off := idx[i+2:i]*64
+	dst[i+63:i] := idx[i+3] ? b[off+63:off] : a[off+63:off]
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vpermi2q' form='zmm, zmm, zmm'/>
+	<instruction name='vpermt2q' form='zmm, zmm, zmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512d" name="_mm512_mask_permute_pd">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Swizzle</category>
+	<parameter varname="src" type="__m512d"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m512d"/>
+	<parameter varname="imm8" type="const int"/>
+	<description>Shuffle double-precision (64-bit) floating-point elements in "a" within 128-bit lanes using the control in "imm8", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+	<operation>
+IF (imm8[0] == 0) tmp_dst[63:0] := a[63:0]
+IF (imm8[0] == 1) tmp_dst[63:0] := a[127:64]
+IF (imm8[1] == 0) tmp_dst[127:64] := a[63:0]
+IF (imm8[1] == 1) tmp_dst[127:64] := a[127:64]
+IF (imm8[2] == 0) tmp_dst[191:128] := a[191:128]
+IF (imm8[2] == 1) tmp_dst[191:128] := a[255:192]
+IF (imm8[3] == 0) tmp_dst[255:192] := a[191:128]
+IF (imm8[3] == 1) tmp_dst[255:192] := a[255:192]
+IF (imm8[4] == 0) tmp_dst[319:256] := a[319:256]
+IF (imm8[4] == 1) tmp_dst[319:256] := a[383:320]
+IF (imm8[5] == 0) tmp_dst[383:320] := a[319:256]
+IF (imm8[5] == 1) tmp_dst[383:320] := a[383:320]
+IF (imm8[6] == 0) tmp_dst[447:384] := a[447:384]
+IF (imm8[6] == 1) tmp_dst[447:384] := a[511:448]
+IF (imm8[7] == 0) tmp_dst[511:448] := a[447:384]
+IF (imm8[7] == 1) tmp_dst[511:448] := a[511:448]
+FOR j := 0 to 7
+	i := j*64
+	IF k[j]
+		dst[i+63:i] := tmp_dst[i+63:i]
+	ELSE
+		dst[i+63:i] := src[i+63:i]
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vpermilpd' form='zmm {k}, zmm, imm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512d" name="_mm512_mask_permutevar_pd">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Swizzle</category>
+	<parameter varname="src" type="__m512d"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m512d"/>
+	<parameter varname="b" type="__m512i"/>
+	<description>Shuffle double-precision (64-bit) floating-point elements in "a" within 128-bit lanes using the control in "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+	<operation>
+IF (b[1] == 0) tmp_dst[63:0] := a[63:0]
+IF (b[1] == 1) tmp_dst[63:0] := a[127:64]
+IF (b[65] == 0) tmp_dst[127:64] := a[63:0]
+IF (b[65] == 1) tmp_dst[127:64] := a[127:64]
+IF (b[129] == 0) tmp_dst[191:128] := a[191:128]
+IF (b[129] == 1) tmp_dst[191:128] := a[255:192]
+IF (b[193] == 0) tmp_dst[255:192] := a[191:128]
+IF (b[193] == 1) tmp_dst[255:192] := a[255:192]
+IF (b[257] == 0) tmp_dst[319:256] := a[319:256]
+IF (b[257] == 1) tmp_dst[319:256] := a[383:320]
+IF (b[321] == 0) tmp_dst[383:320] := a[319:256]
+IF (b[321] == 1) tmp_dst[383:320] := a[383:320]
+IF (b[385] == 0) tmp_dst[447:384] := a[447:384]
+IF (b[385] == 1) tmp_dst[447:384] := a[511:448]
+IF (b[449] == 0) tmp_dst[511:448] := a[447:384]
+IF (b[449] == 1) tmp_dst[511:448] := a[511:448]
+FOR j := 0 to 7
+	i := j*64
+	IF k[j]
+		dst[i+63:i] := tmp_dst[i+63:i]
+	ELSE
+		dst[i+63:i] := src[i+63:i]
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vpermilpd' form='zmm {k}, zmm, zmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512d" name="_mm512_maskz_permute_pd">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Swizzle</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m512d"/>
+	<parameter varname="imm8" type="const int"/>
+	<description>Shuffle double-precision (64-bit) floating-point elements in "a" within 128-bit lanes using the control in "imm8", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+	<operation>
+IF (imm8[0] == 0) tmp_dst[63:0] := a[63:0]
+IF (imm8[0] == 1) tmp_dst[63:0] := a[127:64]
+IF (imm8[1] == 0) tmp_dst[127:64] := a[63:0]
+IF (imm8[1] == 1) tmp_dst[127:64] := a[127:64]
+IF (imm8[2] == 0) tmp_dst[191:128] := a[191:128]
+IF (imm8[2] == 1) tmp_dst[191:128] := a[255:192]
+IF (imm8[3] == 0) tmp_dst[255:192] := a[191:128]
+IF (imm8[3] == 1) tmp_dst[255:192] := a[255:192]
+IF (imm8[4] == 0) tmp_dst[319:256] := a[319:256]
+IF (imm8[4] == 1) tmp_dst[319:256] := a[383:320]
+IF (imm8[5] == 0) tmp_dst[383:320] := a[319:256]
+IF (imm8[5] == 1) tmp_dst[383:320] := a[383:320]
+IF (imm8[6] == 0) tmp_dst[447:384] := a[447:384]
+IF (imm8[6] == 1) tmp_dst[447:384] := a[511:448]
+IF (imm8[7] == 0) tmp_dst[511:448] := a[447:384]
+IF (imm8[7] == 1) tmp_dst[511:448] := a[511:448]
+FOR j := 0 to 7
+	i := j*64
+	IF k[j]
+		dst[i+63:i] := tmp_dst[i+63:i]
+	ELSE
+		dst[i+63:i] := 0
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vpermilpd' form='zmm {k}, zmm, imm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512d" name="_mm512_maskz_permutevar_pd">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Swizzle</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m512d"/>
+	<parameter varname="b" type="__m512i"/>
+	<description>Shuffle double-precision (64-bit) floating-point elements in "a" within 128-bit lanes using the control in "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+	<operation>
+IF (b[1] == 0) tmp_dst[63:0] := a[63:0]
+IF (b[1] == 1) tmp_dst[63:0] := a[127:64]
+IF (b[65] == 0) tmp_dst[127:64] := a[63:0]
+IF (b[65] == 1) tmp_dst[127:64] := a[127:64]
+IF (b[129] == 0) tmp_dst[191:128] := a[191:128]
+IF (b[129] == 1) tmp_dst[191:128] := a[255:192]
+IF (b[193] == 0) tmp_dst[255:192] := a[191:128]
+IF (b[193] == 1) tmp_dst[255:192] := a[255:192]
+IF (b[257] == 0) tmp_dst[319:256] := a[319:256]
+IF (b[257] == 1) tmp_dst[319:256] := a[383:320]
+IF (b[321] == 0) tmp_dst[383:320] := a[319:256]
+IF (b[321] == 1) tmp_dst[383:320] := a[383:320]
+IF (b[385] == 0) tmp_dst[447:384] := a[447:384]
+IF (b[385] == 1) tmp_dst[447:384] := a[511:448]
+IF (b[449] == 0) tmp_dst[511:448] := a[447:384]
+IF (b[449] == 1) tmp_dst[511:448] := a[511:448]
+FOR j := 0 to 7
+	i := j*64
+	IF k[j]
+		dst[i+63:i] := tmp_dst[i+63:i]
+	ELSE
+		dst[i+63:i] := 0
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vpermilpd' form='zmm {k}, zmm, zmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512d" name="_mm512_permute_pd">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Swizzle</category>
+	<parameter varname="a" type="__m512d"/>
+	<parameter varname="imm8" type="const int"/>
+	<description>Shuffle double-precision (64-bit) floating-point elements in "a" within 128-bit lanes using the control in "imm8", and store the results in "dst".</description>
+	<operation>
+IF (imm8[0] == 0) dst[63:0] := a[63:0]
+IF (imm8[0] == 1) dst[63:0] := a[127:64]
+IF (imm8[1] == 0) dst[127:64] := a[63:0]
+IF (imm8[1] == 1) dst[127:64] := a[127:64]
+IF (imm8[2] == 0) dst[191:128] := a[191:128]
+IF (imm8[2] == 1) dst[191:128] := a[255:192]
+IF (imm8[3] == 0) dst[255:192] := a[191:128]
+IF (imm8[3] == 1) dst[255:192] := a[255:192]
+IF (imm8[4] == 0) dst[319:256] := a[319:256]
+IF (imm8[4] == 1) dst[319:256] := a[383:320]
+IF (imm8[5] == 0) dst[383:320] := a[319:256]
+IF (imm8[5] == 1) dst[383:320] := a[383:320]
+IF (imm8[6] == 0) dst[447:384] := a[447:384]
+IF (imm8[6] == 1) dst[447:384] := a[511:448]
+IF (imm8[7] == 0) dst[511:448] := a[447:384]
+IF (imm8[7] == 1) dst[511:448] := a[511:448]
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vpermilpd' form='zmm, zmm, imm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512d" name="_mm512_permutevar_pd">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Swizzle</category>
+	<parameter varname="a" type="__m512d"/>
+	<parameter varname="b" type="__m512i"/>
+	<description>Shuffle double-precision (64-bit) floating-point elements in "a" within 128-bit lanes using the control in "b", and store the results in "dst".</description>
+	<operation>
+IF (b[1] == 0) dst[63:0] := a[63:0]
+IF (b[1] == 1) dst[63:0] := a[127:64]
+IF (b[65] == 0) dst[127:64] := a[63:0]
+IF (b[65] == 1) dst[127:64] := a[127:64]
+IF (b[129] == 0) dst[191:128] := a[191:128]
+IF (b[129] == 1) dst[191:128] := a[255:192]
+IF (b[193] == 0) dst[255:192] := a[191:128]
+IF (b[193] == 1) dst[255:192] := a[255:192]
+IF (b[257] == 0) dst[319:256] := a[319:256]
+IF (b[257] == 1) dst[319:256] := a[383:320]
+IF (b[321] == 0) dst[383:320] := a[319:256]
+IF (b[321] == 1) dst[383:320] := a[383:320]
+IF (b[385] == 0) dst[447:384] := a[447:384]
+IF (b[385] == 1) dst[447:384] := a[511:448]
+IF (b[449] == 0) dst[511:448] := a[447:384]
+IF (b[449] == 1) dst[511:448] := a[511:448]
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vpermilpd' form='zmm, zmm, zmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512" name="_mm512_mask_permute_ps">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Swizzle</category>
+	<parameter varname="src" type="__m512"/>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="a" type="__m512"/>
+	<parameter varname="imm8" type="const int"/>
+	<description>Shuffle single-precision (32-bit) floating-point elements in "a" within 128-bit lanes using the control in "imm8", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+	<operation>
+SELECT4(src, control){
+	CASE(control[1:0])
+	0:	tmp[31:0] := src[31:0]
+	1:	tmp[31:0] := src[63:32]
+	2:	tmp[31:0] := src[95:64]
+	3:	tmp[31:0] := src[127:96]
+	ESAC
+	RETURN tmp[31:0]
+}
+
+tmp_dst[31:0] := SELECT4(a[127:0], imm8[1:0])
+tmp_dst[63:32] := SELECT4(a[127:0], imm8[3:2])
+tmp_dst[95:64] := SELECT4(a[127:0], imm8[5:4])
+tmp_dst[127:96] := SELECT4(a[127:0], imm8[7:6])
+tmp_dst[159:128] := SELECT4(a[255:128], imm8[1:0])
+tmp_dst[191:160] := SELECT4(a[255:128], imm8[3:2])
+tmp_dst[223:192] := SELECT4(a[255:128], imm8[5:4])
+tmp_dst[255:224] := SELECT4(a[255:128], imm8[7:6])
+tmp_dst[287:256] := SELECT4(a[383:256], imm8[1:0])
+tmp_dst[319:288] := SELECT4(a[383:256], imm8[3:2])
+tmp_dst[351:320] := SELECT4(a[383:256], imm8[5:4])
+tmp_dst[383:352] := SELECT4(a[383:256], imm8[7:6])
+tmp_dst[415:384] := SELECT4(a[511:384], imm8[1:0])
+tmp_dst[447:416] := SELECT4(a[511:384], imm8[3:2])
+tmp_dst[479:448] := SELECT4(a[511:384], imm8[5:4])
+tmp_dst[511:480] := SELECT4(a[511:384], imm8[7:6])
+FOR j := 0 to 15
+	i := j*32
+	IF k[j]
+		dst[i+31:i] := tmp_dst[i+31:i]
+	ELSE
+		dst[i+31:i] := src[i+31:i]
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vpermilps' form='zmm {k}, zmm, imm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512" name="_mm512_mask_permutevar_ps">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Swizzle</category>
+	<parameter varname="src" type="__m512"/>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="a" type="__m512"/>
+	<parameter varname="b" type="__m512i"/>
+	<description>Shuffle single-precision (32-bit) floating-point elements in "a" within 128-bit lanes using the control in "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+	<operation>
+SELECT4(src, control){
+	CASE(control[1:0])
+	0:	tmp[31:0] := src[31:0]
+	1:	tmp[31:0] := src[63:32]
+	2:	tmp[31:0] := src[95:64]
+	3:	tmp[31:0] := src[127:96]
+	ESAC
+	RETURN tmp[31:0]
+}
+
+tmp_dst[31:0] := SELECT4(a[127:0], b[1:0])
+tmp_dst[63:32] := SELECT4(a[127:0], b[33:32])
+tmp_dst[95:64] := SELECT4(a[127:0], b[65:64])
+tmp_dst[127:96] := SELECT4(a[127:0], b[97:96])
+tmp_dst[159:128] := SELECT4(a[255:128], b[129:128])
+tmp_dst[191:160] := SELECT4(a[255:128], b[161:160])
+tmp_dst[223:192] := SELECT4(a[255:128], b[193:192])
+tmp_dst[255:224] := SELECT4(a[255:128], b[225:224])
+tmp_dst[287:256] := SELECT4(a[383:256], b[257:256])
+tmp_dst[319:288] := SELECT4(a[383:256], b[289:288])
+tmp_dst[351:320] := SELECT4(a[383:256], b[321:320])
+tmp_dst[383:352] := SELECT4(a[383:256], b[353:352])
+tmp_dst[415:384] := SELECT4(a[511:384], b[385:384])
+tmp_dst[447:416] := SELECT4(a[511:384], b[417:416])
+tmp_dst[479:448] := SELECT4(a[511:384], b[449:448])
+tmp_dst[511:480] := SELECT4(a[511:384], b[481:480])
+FOR j := 0 to 15
+	i := j*32
+	IF k[j]
+		dst[i+31:i] := tmp_dst[i+31:i]
+	ELSE
+		dst[i+31:i] := src[i+31:i]
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vpermilps' form='zmm {k}, zmm, zmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512" name="_mm512_maskz_permute_ps">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Swizzle</category>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="a" type="__m512"/>
+	<parameter varname="imm8" type="const int"/>
+	<description>Shuffle single-precision (32-bit) floating-point elements in "a" within 128-bit lanes using the control in "imm8", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+	<operation>
+SELECT4(src, control){
+	CASE(control[1:0])
+	0:	tmp[31:0] := src[31:0]
+	1:	tmp[31:0] := src[63:32]
+	2:	tmp[31:0] := src[95:64]
+	3:	tmp[31:0] := src[127:96]
+	ESAC
+	RETURN tmp[31:0]
+}
+
+tmp_dst[31:0] := SELECT4(a[127:0], imm8[1:0])
+tmp_dst[63:32] := SELECT4(a[127:0], imm8[3:2])
+tmp_dst[95:64] := SELECT4(a[127:0], imm8[5:4])
+tmp_dst[127:96] := SELECT4(a[127:0], imm8[7:6])
+tmp_dst[159:128] := SELECT4(a[255:128], imm8[1:0])
+tmp_dst[191:160] := SELECT4(a[255:128], imm8[3:2])
+tmp_dst[223:192] := SELECT4(a[255:128], imm8[5:4])
+tmp_dst[255:224] := SELECT4(a[255:128], imm8[7:6])
+tmp_dst[287:256] := SELECT4(a[383:256], imm8[1:0])
+tmp_dst[319:288] := SELECT4(a[383:256], imm8[3:2])
+tmp_dst[351:320] := SELECT4(a[383:256], imm8[5:4])
+tmp_dst[383:352] := SELECT4(a[383:256], imm8[7:6])
+tmp_dst[415:384] := SELECT4(a[511:384], imm8[1:0])
+tmp_dst[447:416] := SELECT4(a[511:384], imm8[3:2])
+tmp_dst[479:448] := SELECT4(a[511:384], imm8[5:4])
+tmp_dst[511:480] := SELECT4(a[511:384], imm8[7:6])
+FOR j := 0 to 15
+	i := j*32
+	IF k[j]
+		dst[i+31:i] := tmp_dst[i+31:i]
+	ELSE
+		dst[i+31:i] := 0
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vpermilps' form='zmm {k}, zmm, imm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512" name="_mm512_maskz_permutevar_ps">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Swizzle</category>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="a" type="__m512"/>
+	<parameter varname="b" type="__m512i"/>
+	<description>Shuffle single-precision (32-bit) floating-point elements in "a" within 128-bit lanes using the control in "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+	<operation>
+SELECT4(src, control){
+	CASE(control[1:0])
+	0:	tmp[31:0] := src[31:0]
+	1:	tmp[31:0] := src[63:32]
+	2:	tmp[31:0] := src[95:64]
+	3:	tmp[31:0] := src[127:96]
+	ESAC
+	RETURN tmp[31:0]
+}
+
+tmp_dst[31:0] := SELECT4(a[127:0], b[1:0])
+tmp_dst[63:32] := SELECT4(a[127:0], b[33:32])
+tmp_dst[95:64] := SELECT4(a[127:0], b[65:64])
+tmp_dst[127:96] := SELECT4(a[127:0], b[97:96])
+tmp_dst[159:128] := SELECT4(a[255:128], b[129:128])
+tmp_dst[191:160] := SELECT4(a[255:128], b[161:160])
+tmp_dst[223:192] := SELECT4(a[255:128], b[193:192])
+tmp_dst[255:224] := SELECT4(a[255:128], b[225:224])
+tmp_dst[287:256] := SELECT4(a[383:256], b[257:256])
+tmp_dst[319:288] := SELECT4(a[383:256], b[289:288])
+tmp_dst[351:320] := SELECT4(a[383:256], b[321:320])
+tmp_dst[383:352] := SELECT4(a[383:256], b[353:352])
+tmp_dst[415:384] := SELECT4(a[511:384], b[385:384])
+tmp_dst[447:416] := SELECT4(a[511:384], b[417:416])
+tmp_dst[479:448] := SELECT4(a[511:384], b[449:448])
+tmp_dst[511:480] := SELECT4(a[511:384], b[481:480])
+FOR j := 0 to 15
+	i := j*32
+	IF k[j]
+		dst[i+31:i] := tmp_dst[i+31:i]
+	ELSE
+		dst[i+31:i] := 0
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vpermilps' form='zmm {k}, zmm, zmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512" name="_mm512_permute_ps">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Swizzle</category>
+	<parameter varname="a" type="__m512"/>
+	<parameter varname="imm8" type="const int"/>
+	<description>Shuffle single-precision (32-bit) floating-point elements in "a" within 128-bit lanes using the control in "imm8", and store the results in "dst".</description>
+	<operation>
+SELECT4(src, control){
+	CASE(control[1:0])
+	0:	tmp[31:0] := src[31:0]
+	1:	tmp[31:0] := src[63:32]
+	2:	tmp[31:0] := src[95:64]
+	3:	tmp[31:0] := src[127:96]
+	ESAC
+	RETURN tmp[31:0]
+}
+
+dst[31:0] := SELECT4(a[127:0], imm8[1:0])
+dst[63:32] := SELECT4(a[127:0], imm8[3:2])
+dst[95:64] := SELECT4(a[127:0], imm8[5:4])
+dst[127:96] := SELECT4(a[127:0], imm8[7:6])
+dst[159:128] := SELECT4(a[255:128], imm8[1:0])
+dst[191:160] := SELECT4(a[255:128], imm8[3:2])
+dst[223:192] := SELECT4(a[255:128], imm8[5:4])
+dst[255:224] := SELECT4(a[255:128], imm8[7:6])
+dst[287:256] := SELECT4(a[383:256], imm8[1:0])
+dst[319:288] := SELECT4(a[383:256], imm8[3:2])
+dst[351:320] := SELECT4(a[383:256], imm8[5:4])
+dst[383:352] := SELECT4(a[383:256], imm8[7:6])
+dst[415:384] := SELECT4(a[511:384], imm8[1:0])
+dst[447:416] := SELECT4(a[511:384], imm8[3:2])
+dst[479:448] := SELECT4(a[511:384], imm8[5:4])
+dst[511:480] := SELECT4(a[511:384], imm8[7:6])
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vpermilps' form='zmm, zmm, imm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512" name="_mm512_permutevar_ps">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Swizzle</category>
+	<parameter varname="a" type="__m512"/>
+	<parameter varname="b" type="__m512i"/>
+	<description>Shuffle single-precision (32-bit) floating-point elements in "a" within 128-bit lanes using the control in "b", and store the results in "dst".</description>
+	<operation>
+SELECT4(src, control){
+	CASE(control[1:0])
+	0:	tmp[31:0] := src[31:0]
+	1:	tmp[31:0] := src[63:32]
+	2:	tmp[31:0] := src[95:64]
+	3:	tmp[31:0] := src[127:96]
+	ESAC
+	RETURN tmp[31:0]
+}
+
+dst[31:0] := SELECT4(a[127:0], b[1:0])
+dst[63:32] := SELECT4(a[127:0], b[33:32])
+dst[95:64] := SELECT4(a[127:0], b[65:64])
+dst[127:96] := SELECT4(a[127:0], b[97:96])
+dst[159:128] := SELECT4(a[255:128], b[129:128])
+dst[191:160] := SELECT4(a[255:128], b[161:160])
+dst[223:192] := SELECT4(a[255:128], b[193:192])
+dst[255:224] := SELECT4(a[255:128], b[225:224])
+dst[287:256] := SELECT4(a[383:256], b[257:256])
+dst[319:288] := SELECT4(a[383:256], b[289:288])
+dst[351:320] := SELECT4(a[383:256], b[321:320])
+dst[383:352] := SELECT4(a[383:256], b[353:352])
+dst[415:384] := SELECT4(a[511:384], b[385:384])
+dst[447:416] := SELECT4(a[511:384], b[417:416])
+dst[479:448] := SELECT4(a[511:384], b[449:448])
+dst[511:480] := SELECT4(a[511:384], b[481:480])
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vpermilps' form='zmm, zmm, zmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512d" name="_mm512_mask_permutex_pd">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Swizzle</category>
+	<parameter varname="src" type="__m512d"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m512d"/>
+	<parameter varname="imm8" type="const int"/>
+	<description>Shuffle double-precision (64-bit) floating-point elements in "a" within 256-bit lanes using the control in "imm8", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+	<operation>
+SELECT4(src, control){
+	CASE(control[1:0])
+	0:	tmp[63:0] := src[63:0]
+	1:	tmp[63:0] := src[127:64]
+	2:	tmp[63:0] := src[191:128]
+	3:	tmp[63:0] := src[255:192]
+	ESAC
+	RETURN tmp[63:0]
+}
+
+tmp_dst[63:0] := SELECT4(a[255:0], imm8[1:0])
+tmp_dst[127:64] := SELECT4(a[255:0], imm8[3:2])
+tmp_dst[191:128] := SELECT4(a[255:0], imm8[5:4])
+tmp_dst[255:192] := SELECT4(a[255:0], imm8[7:6])
+tmp_dst[319:256] := SELECT4(a[511:256], imm8[1:0])
+tmp_dst[383:320] := SELECT4(a[511:256], imm8[3:2])
+tmp_dst[447:384] := SELECT4(a[511:256], imm8[5:4])
+tmp_dst[511:448] := SELECT4(a[511:256], imm8[7:6])
+FOR j := 0 to 7
+	i := j*64
+	IF k[j]
+		dst[i+63:i] := tmp_dst[i+63:i]
+	ELSE
+		dst[i+63:i] := src[i+63:i]
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vpermpd' form='zmm {k}, zmm, imm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512d" name="_mm512_mask_permutexvar_pd">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Swizzle</category>
+	<parameter varname="src" type="__m512d"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="idx" type="__m512i"/>
+	<parameter varname="a" type="__m512d"/>
+	<description>Shuffle double-precision (64-bit) floating-point elements in "a" across lanes using the corresponding index in "idx", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*64
+	id := idx[i+2:i]*64
+	IF k[j]
+		dst[i+63:i] := a[id+63:id]
+	ELSE
+		dst[i+63:i] := src[i+63:i]
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vpermpd' form='zmm {k}, zmm, zmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512d" name="_mm512_maskz_permutex_pd">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Swizzle</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m512d"/>
+	<parameter varname="imm8" type="const int"/>
+	<description>Shuffle double-precision (64-bit) floating-point elements in "a" within 256-bit lanes using the control in "imm8", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+	<operation>
+SELECT4(src, control){
+	CASE(control[1:0])
+	0:	tmp[63:0] := src[63:0]
+	1:	tmp[63:0] := src[127:64]
+	2:	tmp[63:0] := src[191:128]
+	3:	tmp[63:0] := src[255:192]
+	ESAC
+	RETURN tmp[63:0]
+}
+
+tmp_dst[63:0] := SELECT4(a[255:0], imm8[1:0])
+tmp_dst[127:64] := SELECT4(a[255:0], imm8[3:2])
+tmp_dst[191:128] := SELECT4(a[255:0], imm8[5:4])
+tmp_dst[255:192] := SELECT4(a[255:0], imm8[7:6])
+tmp_dst[319:256] := SELECT4(a[511:256], imm8[1:0])
+tmp_dst[383:320] := SELECT4(a[511:256], imm8[3:2])
+tmp_dst[447:384] := SELECT4(a[511:256], imm8[5:4])
+tmp_dst[511:448] := SELECT4(a[511:256], imm8[7:6])
+FOR j := 0 to 7
+	i := j*64
+	IF k[j]
+		dst[i+63:i] := tmp_dst[i+63:i]
+	ELSE
+		dst[i+63:i] := 0
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vpermpd' form='zmm {k}, zmm, imm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512d" name="_mm512_maskz_permutexvar_pd">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Swizzle</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="idx" type="__m512i"/>
+	<parameter varname="a" type="__m512d"/>
+	<description>Shuffle double-precision (64-bit) floating-point elements in "a" across lanes using the corresponding index in "idx", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*64
+	id := idx[i+2:i]*64
+	IF k[j]
+		dst[i+63:i] := a[id+63:id]
+	ELSE
+		dst[i+63:i] := 0
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vpermpd' form='zmm {k}, zmm, zmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512d" name="_mm512_permutex_pd">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Swizzle</category>
+	<parameter varname="a" type="__m512d"/>
+	<parameter varname="imm8" type="const int"/>
+	<description>Shuffle double-precision (64-bit) floating-point elements in "a" within 256-bit lanes using the control in "imm8", and store the results in "dst".</description>
+	<operation>
+SELECT4(src, control){
+	CASE(control[1:0])
+	0:	tmp[63:0] := src[63:0]
+	1:	tmp[63:0] := src[127:64]
+	2:	tmp[63:0] := src[191:128]
+	3:	tmp[63:0] := src[255:192]
+	ESAC
+	RETURN tmp[63:0]
+}
+
+dst[63:0] := SELECT4(a[255:0], imm8[1:0])
+dst[127:64] := SELECT4(a[255:0], imm8[3:2])
+dst[191:128] := SELECT4(a[255:0], imm8[5:4])
+dst[255:192] := SELECT4(a[255:0], imm8[7:6])
+dst[319:256] := SELECT4(a[511:256], imm8[1:0])
+dst[383:320] := SELECT4(a[511:256], imm8[3:2])
+dst[447:384] := SELECT4(a[511:256], imm8[5:4])
+dst[511:448] := SELECT4(a[511:256], imm8[7:6])
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vpermpd' form='zmm {k}, zmm, imm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512d" name="_mm512_permutexvar_pd">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Swizzle</category>
+	<parameter varname="idx" type="__m512i"/>
+	<parameter varname="a" type="__m512d"/>
+	<description>Shuffle double-precision (64-bit) floating-point elements in "a" across lanes using the corresponding index in "idx", and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*64
+	id := idx[i+2:i]*64
+	dst[i+63:i] := a[id+63:id]
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vpermpd' form='zmm {k}, zmm, zmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512" name="_mm512_mask_permutexvar_ps">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Swizzle</category>
+	<parameter varname="src" type="__m512"/>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="idx" type="__m512i"/>
+	<parameter varname="a" type="__m512"/>
+	<description>Shuffle single-precision (32-bit) floating-point elements in "a" across lanes using the corresponding index in "idx", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 15
+	i := j*32
+	id := idx[i+3:i]*32
+	IF k[j]
+		dst[i+31:i] := a[id+31:id]
+	ELSE
+		dst[i+31:i] := src[i+31:i]
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vpermps' form='zmm {k}, zmm, zmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512" name="_mm512_maskz_permutexvar_ps">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Swizzle</category>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="idx" type="__m512i"/>
+	<parameter varname="a" type="__m512"/>
+	<description>Shuffle single-precision (32-bit) floating-point elements in "a" across lanes using the corresponding index in "idx", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 15
+	i := j*32
+	id := idx[i+3:i]*32
+	IF k[j]
+		dst[i+31:i] := a[id+31:id]
+	ELSE
+		dst[i+31:i] := 0
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vpermps' form='zmm {k}, zmm, zmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512" name="_mm512_permutexvar_ps">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Swizzle</category>
+	<parameter varname="idx" type="__m512i"/>
+	<parameter varname="a" type="__m512"/>
+	<description>Shuffle single-precision (32-bit) floating-point elements in "a" across lanes using the corresponding index in "idx", and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 15
+	i := j*32
+	id := idx[i+3:i]*32
+	dst[i+31:i] := a[id+31:id]
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vpermps' form='zmm {k}, zmm, zmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512i" name="_mm512_mask_permutex_epi64">
+	<type>Integer</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Swizzle</category>
+	<parameter varname="src" type="__m512i"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m512i"/>
+	<parameter varname="imm8" type="const int"/>
+	<description>Shuffle 64-bit integers in "a" within 256-bit lanes using the control in "imm8", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+	<operation>
+SELECT4(src, control){
+	CASE(control[1:0])
+	0:	tmp[63:0] := src[63:0]
+	1:	tmp[63:0] := src[127:64]
+	2:	tmp[63:0] := src[191:128]
+	3:	tmp[63:0] := src[255:192]
+	ESAC
+	RETURN tmp[63:0]
+}
+
+tmp_dst[63:0] := SELECT4(a[255:0], imm8[1:0])
+tmp_dst[127:64] := SELECT4(a[255:0], imm8[3:2])
+tmp_dst[191:128] := SELECT4(a[255:0], imm8[5:4])
+tmp_dst[255:192] := SELECT4(a[255:0], imm8[7:6])
+tmp_dst[319:256] := SELECT4(a[511:256], imm8[1:0])
+tmp_dst[383:320] := SELECT4(a[511:256], imm8[3:2])
+tmp_dst[447:384] := SELECT4(a[511:256], imm8[5:4])
+tmp_dst[511:448] := SELECT4(a[511:256], imm8[7:6])
+FOR j := 0 to 7
+	i := j*64
+	IF k[j]
+		dst[i+63:i] := tmp_dst[i+63:i]
+	ELSE
+		dst[i+63:i] := src[i+63:i]
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vpermq' form='zmm {k}, zmm, imm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512i" name="_mm512_mask_permutexvar_epi64">
+	<type>Integer</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Swizzle</category>
+	<parameter varname="src" type="__m512i"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="idx" type="__m512i"/>
+	<parameter varname="a" type="__m512i"/>
+	<description>Shuffle 64-bit integers in "a" across lanes using the corresponding index in "idx", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*64
+	id := idx[i+2:i]*64
+	IF k[j]
+		dst[i+63:i] := a[id+63:id]
+	ELSE
+		dst[i+63:i] := src[i+63:i]
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vpermq' form='zmm {k}, zmm, zmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512i" name="_mm512_maskz_permutex_epi64">
+	<type>Integer</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Swizzle</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m512i"/>
+	<parameter varname="imm8" type="const int"/>
+	<description>Shuffle 64-bit integers in "a" within 256-bit lanes using the control in "imm8", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+	<operation>
+SELECT4(src, control){
+	CASE(control[1:0])
+	0:	tmp[63:0] := src[63:0]
+	1:	tmp[63:0] := src[127:64]
+	2:	tmp[63:0] := src[191:128]
+	3:	tmp[63:0] := src[255:192]
+	ESAC
+	RETURN tmp[63:0]
+}
+
+tmp_dst[63:0] := SELECT4(a[255:0], imm8[1:0])
+tmp_dst[127:64] := SELECT4(a[255:0], imm8[3:2])
+tmp_dst[191:128] := SELECT4(a[255:0], imm8[5:4])
+tmp_dst[255:192] := SELECT4(a[255:0], imm8[7:6])
+tmp_dst[319:256] := SELECT4(a[511:256], imm8[1:0])
+tmp_dst[383:320] := SELECT4(a[511:256], imm8[3:2])
+tmp_dst[447:384] := SELECT4(a[511:256], imm8[5:4])
+tmp_dst[511:448] := SELECT4(a[511:256], imm8[7:6])
+FOR j := 0 to 7
+	i := j*64
+	IF k[j]
+		dst[i+63:i] := tmp_dst[i+63:i]
+	ELSE
+		dst[i+63:i] := 0
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vpermq' form='zmm {k}, zmm, imm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512i" name="_mm512_maskz_permutexvar_epi64">
+	<type>Integer</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Swizzle</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="idx" type="__m512i"/>
+	<parameter varname="a" type="__m512i"/>
+	<description>Shuffle 64-bit integers in "a" across lanes using the corresponding index in "idx", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*64
+	id := idx[i+2:i]*64
+	IF k[j]
+		dst[i+63:i] := a[id+63:id]
+	ELSE
+		dst[i+63:i] := 0
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vpermq' form='zmm {k}, zmm, zmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512i" name="_mm512_permutex_epi64">
+	<type>Integer</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Swizzle</category>
+	<parameter varname="a" type="__m512i"/>
+	<parameter varname="imm8" type="const int"/>
+	<description>Shuffle 64-bit integers in "a" within 256-bit lanes using the control in "imm8", and store the results in "dst".</description>
+	<operation>
+SELECT4(src, control){
+	CASE(control[1:0])
+	0:	tmp[63:0] := src[63:0]
+	1:	tmp[63:0] := src[127:64]
+	2:	tmp[63:0] := src[191:128]
+	3:	tmp[63:0] := src[255:192]
+	ESAC
+	RETURN tmp[63:0]
+}
+
+dst[63:0] := SELECT4(a[255:0], imm8[1:0])
+dst[127:64] := SELECT4(a[255:0], imm8[3:2])
+dst[191:128] := SELECT4(a[255:0], imm8[5:4])
+dst[255:192] := SELECT4(a[255:0], imm8[7:6])
+dst[319:256] := SELECT4(a[511:256], imm8[1:0])
+dst[383:320] := SELECT4(a[511:256], imm8[3:2])
+dst[447:384] := SELECT4(a[511:256], imm8[5:4])
+dst[511:448] := SELECT4(a[511:256], imm8[7:6])
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vpermq' form='zmm {k}, zmm, imm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512i" name="_mm512_permutexvar_epi64">
+	<type>Integer</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Swizzle</category>
+	<parameter varname="idx" type="__m512i"/>
+	<parameter varname="a" type="__m512i"/>
+	<description>Shuffle 64-bit integers in "a" across lanes using the corresponding index in "idx", and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*64
+	id := idx[i+2:i]*64
+	dst[i+63:i] := a[id+63:id]
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vpermq' form='zmm {k}, zmm, zmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512i" name="_mm512_mask_expand_epi32">
+	<type>Integer</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Swizzle</category>
+	<parameter varname="src" type="__m512i"/>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="a" type="__m512i"/>
+	<description>Load contiguous active 32-bit integers from "a" (those with their respective bit set in mask "k"), and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+	<operation>
+m := 0
+FOR j := 0 to 15
+	i := j*32
+	IF k[j]
+		dst[i+31:i] := a[m+31:m]
+		m := m + 32
+	ELSE
+		dst[i+31:i] := src[i+31:i]
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vpexpandd' form='zmm {k}, zmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512i" name="_mm512_mask_expandloadu_epi32">
+	<type>Integer</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Load</category>
+	<category>Swizzle</category>
+	<parameter varname="src" type="__m512i"/>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="mem_addr" type="void const*"/>
+	<description>Load contiguous active 32-bit integers from unaligned memory at "mem_addr" (those with their respective bit set in mask "k"), and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+	<operation>
+m := 0
+FOR j := 0 to 15
+	i := j*32
+	IF k[j]
+		dst[i+31:i] := MEM[mem_addr+m+31:mem_addr+m]
+		m := m + 32
+	ELSE
+		dst[i+31:i] := src[i+31:i]
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vpexpandd' form='zmm {k}, m32'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512i" name="_mm512_maskz_expand_epi32">
+	<type>Integer</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Swizzle</category>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="a" type="__m512i"/>
+	<description>Load contiguous active 32-bit integers from "a" (those with their respective bit set in mask "k"), and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+	<operation>
+m := 0
+FOR j := 0 to 15
+	i := j*32
+	IF k[j]
+		dst[i+31:i] := a[m+31:m]
+		m := m + 32
+	ELSE
+		dst[i+31:i] := 0
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vpexpandd' form='zmm {k}, zmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512i" name="_mm512_maskz_expandloadu_epi32">
+	<type>Integer</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Load</category>
+	<category>Swizzle</category>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="mem_addr" type="void const*"/>
+	<description>Load contiguous active 32-bit integers from unaligned memory at "mem_addr" (those with their respective bit set in mask "k"), and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+	<operation>
+m := 0
+FOR j := 0 to 15
+	i := j*32
+	IF k[j]
+		dst[i+31:i] := MEM[mem_addr+m+31:mem_addr+m]
+		m := m + 32
+	ELSE
+		dst[i+31:i] := 0
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vpexpandd' form='zmm {k}, m32'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512i" name="_mm512_mask_expand_epi64">
+	<type>Integer</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Swizzle</category>
+	<parameter varname="src" type="__m512i"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m512i"/>
+	<description>Load contiguous active 64-bit integers from "a" (those with their respective bit set in mask "k"), and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+	<operation>
+m := 0
+FOR j := 0 to 7
+	i := j*64
+	IF k[j]
+		dst[i+63:i] := a[m+63:m]
+		m := m + 64
+	ELSE
+		dst[i+63:i] := src[i+63:i]
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vpexpandq' form='zmm {k}, zmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512i" name="_mm512_mask_expandloadu_epi64">
+	<type>Integer</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Load</category>
+	<category>Swizzle</category>
+	<parameter varname="src" type="__m512i"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="mem_addr" type="void const*"/>
+	<description>Load contiguous active 64-bit integers from unaligned memory at "mem_addr" (those with their respective bit set in mask "k"), and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+	<operation>
+m := 0
+FOR j := 0 to 7
+	i := j*64
+	IF k[j]
+		dst[i+63:i] := MEM[mem_addr+m+63:mem_addr+m]
+		m := m + 64
+	ELSE
+		dst[i+63:i] := src[i+63:i]
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vpexpandq' form='zmm {k}, m64'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512i" name="_mm512_maskz_expand_epi64">
+	<type>Integer</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Swizzle</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m512i"/>
+	<description>Load contiguous active 64-bit integers from "a" (those with their respective bit set in mask "k"), and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+	<operation>
+m := 0
+FOR j := 0 to 7
+	i := j*64
+	IF k[j]
+		dst[i+63:i] := a[m+63:m]
+		m := m + 64
+	ELSE
+		dst[i+63:i] := 0
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vpexpandq' form='zmm {k}, zmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512i" name="_mm512_maskz_expandloadu_epi64">
+	<type>Integer</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Load</category>
+	<category>Swizzle</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="mem_addr" type="void const*"/>
+	<description>Load contiguous active 64-bit integers from unaligned memory at "mem_addr" (those with their respective bit set in mask "k"), and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+	<operation>
+m := 0
+FOR j := 0 to 7
+	i := j*64
+	IF k[j]
+		dst[i+63:i] := MEM[mem_addr+m+63:mem_addr+m]
+		m := m + 64
+	ELSE
+		dst[i+63:i] := 0
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vpexpandq' form='zmm {k}, m64'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" rettype="__m512i" name="_mm512_i32gather_epi32">
+	<type>Integer</type>
+	<CPUID>AVX512F/KNCNI</CPUID>
+	<category>Load</category>
+	<parameter varname="vindex" type="__m512i"/>
+	<parameter varname="base_addr" type="void const*"/>
+	<parameter varname="scale" type="int"/>
+	<description>
+	Gather 32-bit integers from memory using 32-bit indices. 32-bit elements are loaded from addresses starting at "base_addr" and offset by each 32-bit element in "vindex" (each index is scaled by the factor in "scale"). Gathered elements are merged into "dst". "scale" should be 1, 2, 4 or 8.
+	</description>
+	<operation>
+FOR j := 0 to 15
+	i := j*32
+	dst[i+31:i] := MEM[base_addr + SignExtend(vindex[i+31:i])*scale]
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vpgatherdd' form='zmm {k}, vm32z'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" rettype="__m512i" name="_mm512_mask_i32gather_epi32">
+	<type>Integer</type>
+	<CPUID>AVX512F/KNCNI</CPUID>
+	<category>Load</category>
+	<parameter varname="src" type="__m512i"/>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="vindex" type="__m512i"/>
+	<parameter varname="base_addr" type="void const*"/>
+	<parameter varname="scale" type="int"/>
+	<description>
+	Gather 32-bit integers from memory using 32-bit indices. 32-bit elements are loaded from addresses starting at "base_addr" and offset by each 32-bit element in "vindex" (each index is scaled by the factor in "scale"). Gathered elements are merged into "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). "scale" should be 1, 2, 4 or 8.
+	</description>
+	<operation>
+FOR j := 0 to 15
+	i := j*32
+	IF k[j]
+		dst[i+31:i] := MEM[base_addr + SignExtend(vindex[i+31:i])*scale]
+		k[j] := 0
+	ELSE
+		dst[i+31:i] := src[i+31:i]
+	FI
+ENDFOR
+k[MAX:16] := 0
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vpgatherdd' form='zmm {k}, vm32z'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512i" name="_mm512_i32gather_epi64">
+	<type>Integer</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Load</category>
+	<parameter varname="vindex" type="__m256i"/>
+	<parameter varname="base_addr" type="void const*"/>
+	<parameter varname="scale" type="int"/>
+	<description>Gather 64-bit integers from memory using 32-bit indices. 64-bit elements are loaded from addresses starting at "base_addr" and offset by each 32-bit element in "vindex" (each index is scaled by the factor in "scale"). Gathered elements are merged into "dst". "scale" should be 1, 2, 4 or 8.
+	</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*64
+	m := j*32
+	dst[i+63:i] := MEM[base_addr + SignExtend(vindex[m+31:m])*scale]
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vpgatherdq' form='zmm {k}, vm32y'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512i" name="_mm512_mask_i32gather_epi64">
+	<type>Integer</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Load</category>
+	<parameter varname="src" type="__m512i"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="vindex" type="__m256i"/>
+	<parameter varname="base_addr" type="void const*"/>
+	<parameter varname="scale" type="int"/>
+	<description>Gather 64-bit integers from memory using 32-bit indices. 64-bit elements are loaded from addresses starting at "base_addr" and offset by each 32-bit element in "vindex" (each index is scaled by the factor in "scale"). Gathered elements are merged into "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). "scale" should be 1, 2, 4 or 8.
+	</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*64
+	m := j*32
+	IF k[j]
+		dst[i+63:i] := MEM[base_addr + SignExtend(vindex[m+31:m])*scale]
+		k[j] := 0
+	ELSE
+		dst[i+63:i] := src[i+63:i]
+	FI
+ENDFOR
+k[MAX:8] := 0
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vpgatherdq' form='zmm {k}, vm32y'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256i" name="_mm512_i64gather_epi32">
+	<type>Integer</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Load</category>
+	<parameter varname="vindex" type="__m512i"/>
+	<parameter varname="base_addr" type="void const*"/>
+	<parameter varname="scale" type="int"/>
+	<description>Gather 32-bit integers from memory using 64-bit indices. 32-bit elements are loaded from addresses starting at "base_addr" and offset by each 64-bit element in "vindex" (each index is scaled by the factor in "scale"). Gathered elements are merged into "dst". "scale" should be 1, 2, 4 or 8.
+	</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*32
+	m := j*64
+	dst[i+31:i] := MEM[base_addr + SignExtend(vindex[m+63:m])*scale]
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name='vpgatherqd' form='ymm {k}, vm64z'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256i" name="_mm512_mask_i64gather_epi32">
+	<type>Integer</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Load</category>
+	<parameter varname="src" type="__m256i"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="vindex" type="__m512i"/>
+	<parameter varname="base_addr" type="void const*"/>
+	<parameter varname="scale" type="int"/>
+	<description>Gather 32-bit integers from memory using 64-bit indices. 32-bit elements are loaded from addresses starting at "base_addr" and offset by each 64-bit element in "vindex" (each index is scaled by the factor in "scale"). Gathered elements are merged into "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). "scale" should be 1, 2, 4 or 8.
+	</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*32
+	m := j*64
+	IF k[j]
+		dst[i+31:i] := MEM[base_addr + SignExtend(vindex[m+63:m])*scale]
+		k[j] := 0
+	ELSE
+		dst[i+31:i] := src[i+31:i]
+	FI
+ENDFOR
+k[MAX:8] := 0
+dst[MAX:256] := 0
+	</operation>
+	<instruction name='vpgatherqd' form='ymm {k}, vm64z'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512i" name="_mm512_i64gather_epi64">
+	<type>Integer</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Load</category>
+	<parameter varname="vindex" type="__m512i"/>
+	<parameter varname="base_addr" type="void const*"/>
+	<parameter varname="scale" type="int"/>
+	<description>Gather 64-bit integers from memory using 64-bit indices. 64-bit elements are loaded from addresses starting at "base_addr" and offset by each 64-bit element in "vindex" (each index is scaled by the factor in "scale"). Gathered elements are merged into "dst". "scale" should be 1, 2, 4 or 8.
+	</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*64
+	dst[i+63:i] := MEM[base_addr + SignExtend(vindex[i+63:i])*scale]
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vpgatherqq' form='zmm {k}, vm64z'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512i" name="_mm512_mask_i64gather_epi64">
+	<type>Integer</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Load</category>
+	<parameter varname="src" type="__m512i"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="vindex" type="__m512i"/>
+	<parameter varname="base_addr" type="void const*"/>
+	<parameter varname="scale" type="int"/>
+	<description>Gather 64-bit integers from memory using 64-bit indices. 64-bit elements are loaded from addresses starting at "base_addr" and offset by each 64-bit element in "vindex" (each index is scaled by the factor in "scale"). Gathered elements are merged into "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). "scale" should be 1, 2, 4 or 8.
+	</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*64
+	IF k[j]
+		dst[i+63:i] := MEM[base_addr + SignExtend(vindex[i+63:i])*scale]
+		k[j] := 0
+	ELSE
+		dst[i+63:i] := src[i+63:i]
+	FI
+ENDFOR
+k[MAX:8] := 0
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vpgatherqq' form='zmm {k}, vm64z'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512i" name="_mm512_lzcnt_epi32">
+	<type>Integer</type>
+	<CPUID>AVX512CD</CPUID>
+	<category>Bit Manipulation</category>
+	<parameter varname="a" type="__m512i"/>
+	<description>Count the number of leading zero bits in each packed 32-bit integer in "a", and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 15
+	i := j*32
+	tmp := 31
+	dst[i+31:i] := 0
+	DO WHILE (tmp &gt;= 0 AND a[i+tmp] == 0)
+		tmp := tmp - 1
+		dst[i+31:i] := dst[i+31:i] + 1
+	OD
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vplzcntd' form='zmm {k}, zmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512i" name="_mm512_mask_lzcnt_epi32">
+	<type>Integer</type>
+	<CPUID>AVX512CD</CPUID>
+	<category>Bit Manipulation</category>
+	<parameter varname="src" type="__m512i"/>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="a" type="__m512i"/>
+	<description>Count the number of leading zero bits in each packed 32-bit integer in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 15
+	i := j*32
+	IF k[j]
+		tmp := 31
+		dst[i+31:i] := 0
+		DO WHILE (tmp &gt;= 0 AND a[i+tmp] == 0)
+			tmp := tmp - 1
+			dst[i+31:i] := dst[i+31:i] + 1
+		OD
+	ELSE
+		dst[i+31:i] := src[i+31:i]
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vplzcntd' form='zmm {k}, zmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512i" name="_mm512_maskz_lzcnt_epi32">
+	<type>Integer</type>
+	<CPUID>AVX512CD</CPUID>
+	<category>Bit Manipulation</category>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="a" type="__m512i"/>
+	<description>Count the number of leading zero bits in each packed 32-bit integer in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 15
+	i := j*32
+	IF k[j]
+		tmp := 31
+		dst[i+31:i] := 0
+		DO WHILE (tmp &gt;= 0 AND a[i+tmp] == 0)
+			tmp := tmp - 1
+			dst[i+31:i] := dst[i+31:i] + 1
+		OD
+	ELSE
+		dst[i+31:i] := 0
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vplzcntd' form='zmm {k}, zmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512i" name="_mm512_lzcnt_epi64">
+	<type>Integer</type>
+	<CPUID>AVX512CD</CPUID>
+	<category>Bit Manipulation</category>
+	<parameter varname="a" type="__m512i"/>
+	<description>Count the number of leading zero bits in each packed 64-bit integer in "a", and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*64
+	tmp := 63
+	dst[i+63:i] := 0
+	DO WHILE (tmp &gt;= 0 AND a[i+tmp] == 0)
+		tmp := tmp - 1
+		dst[i+63:i] := dst[i+63:i] + 1
+	OD
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vplzcntq' form='zmm {k}, zmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512i" name="_mm512_mask_lzcnt_epi64">
+	<type>Integer</type>
+	<CPUID>AVX512CD</CPUID>
+	<category>Bit Manipulation</category>
+	<parameter varname="src" type="__m512i"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m512i"/>
+	<description>Count the number of leading zero bits in each packed 64-bit integer in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*64
+	IF k[j]
+		tmp := 63
+		dst[i+63:i] := 0
+		DO WHILE (tmp &gt;= 0 AND a[i+tmp] == 0)
+			tmp := tmp - 1
+			dst[i+63:i] := dst[i+63:i] + 1
+		OD
+	ELSE
+		dst[i+63:i] := src[i+63:i]
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vplzcntq' form='zmm {k}, zmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512i" name="_mm512_maskz_lzcnt_epi64">
+	<type>Integer</type>
+	<CPUID>AVX512CD</CPUID>
+	<category>Bit Manipulation</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m512i"/>
+	<description>Count the number of leading zero bits in each packed 64-bit integer in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*64
+	IF k[j]
+		tmp := 63
+		dst[i+63:i] := 0
+		DO WHILE (tmp &gt;= 0 AND a[i+tmp] == 0)
+			tmp := tmp - 1
+			dst[i+63:i] := dst[i+63:i] + 1
+		OD
+	ELSE
+		dst[i+63:i] := 0
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vplzcntq' form='zmm {k}, zmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" rettype="__m512i" name="_mm512_mask_max_epi32">
+	<type>Integer</type>
+	<CPUID>AVX512F/KNCNI</CPUID>
+	<category>Special Math Functions</category>
+	<parameter varname="src" type="__m512i"/>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="a" type="__m512i"/>
+	<parameter varname="b" type="__m512i"/>
+	<description>Compare packed 32-bit integers in "a" and "b", and store packed maximum values in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 15
+	i := j*32
+	IF k[j]
+		IF a[i+31:i] &gt; b[i+31:i]
+			dst[i+31:i] := a[i+31:i]
+		ELSE
+			dst[i+31:i] := b[i+31:i]
+		FI
+	ELSE
+		dst[i+31:i] := src[i+31:i]
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vpmaxsd' form='zmm {k}, zmm, zmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512i" name="_mm512_maskz_max_epi32">
+	<type>Integer</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Special Math Functions</category>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="a" type="__m512i"/>
+	<parameter varname="b" type="__m512i"/>
+	<description>Compare packed 32-bit integers in "a" and "b", and store packed maximum values in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 15
+	i := j*32
+	IF k[j]
+		IF a[i+31:i] &gt; b[i+31:i]
+			dst[i+31:i] := a[i+31:i]
+		ELSE
+			dst[i+31:i] := b[i+31:i]
+		FI
+	ELSE
+		dst[i+31:i] := 0 
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vpmaxsd' form='zmm {k}, zmm, zmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" rettype="__m512i" name="_mm512_max_epi32">
+	<type>Integer</type>
+	<CPUID>AVX512F/KNCNI</CPUID>
+	<category>Special Math Functions</category>
+	<parameter varname="a" type="__m512i"/>
+	<parameter varname="b" type="__m512i"/>
+	<description>Compare packed 32-bit integers in "a" and "b", and store packed maximum values in "dst".</description>
+	<operation>
+FOR j := 0 to 15
+	i := j*32
+	IF a[i+31:i] &gt; b[i+31:i]
+		dst[i+31:i] := a[i+31:i]
+	ELSE
+		dst[i+31:i] := b[i+31:i]
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vpmaxsd' form='zmm {k}, zmm, zmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512i" name="_mm512_mask_max_epi64">
+	<type>Integer</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Special Math Functions</category>
+	<parameter varname="src" type="__m512i"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m512i"/>
+	<parameter varname="b" type="__m512i"/>
+	<description>Compare packed signed 64-bit integers in "a" and "b", and store packed maximum values in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*64
+	IF k[j]
+		IF a[i+63:i] &gt; b[i+63:i]
+			dst[i+63:i] := a[i+63:i]
+		ELSE
+			dst[i+63:i] := b[i+63:i]
+		FI
+	ELSE
+		dst[i+63:i] := src[i+63:i]
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vpmaxsq' form='zmm {k}, zmm, zmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512i" name="_mm512_maskz_max_epi64">
+	<type>Integer</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Special Math Functions</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m512i"/>
+	<parameter varname="b" type="__m512i"/>
+	<description>Compare packed signed 64-bit integers in "a" and "b", and store packed maximum values in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*64
+	IF k[j]
+		IF a[i+63:i] &gt; b[i+63:i]
+			dst[i+63:i] := a[i+63:i]
+		ELSE
+			dst[i+63:i] := b[i+63:i]
+		FI
+	ELSE
+		dst[i+63:i] := 0
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vpmaxsq' form='zmm {k}, zmm, zmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512i" name="_mm512_max_epi64">
+	<type>Integer</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Special Math Functions</category>
+	<parameter varname="a" type="__m512i"/>
+	<parameter varname="b" type="__m512i"/>
+	<description>Compare packed signed 64-bit integers in "a" and "b", and store packed maximum values in "dst".</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*64
+	IF a[i+63:i] &gt; b[i+63:i]
+		dst[i+63:i] := a[i+63:i]
+	ELSE
+		dst[i+63:i] := b[i+63:i]
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vpmaxsq' form='zmm {k}, zmm, zmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" rettype="__m512i" name="_mm512_mask_max_epu32">
+	<type>Integer</type>
+	<CPUID>AVX512F/KNCNI</CPUID>
+	<category>Special Math Functions</category>
+	<parameter varname="src" type="__m512i"/>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="a" type="__m512i"/>
+	<parameter varname="b" type="__m512i"/>
+	<description>Compare packed unsigned 32-bit integers in "a" and "b", and store packed maximum values in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 15
+	i := j*32
+	IF k[j]
+		IF a[i+31:i] &gt; b[i+31:i]
+			dst[i+31:i] := a[i+31:i]
+		ELSE
+			dst[i+31:i] := b[i+31:i]
+		FI
+	ELSE
+		dst[i+31:i] := src[i+31:i]
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vpmaxud' form='zmm {k}, zmm, zmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512i" name="_mm512_maskz_max_epu32">
+	<type>Integer</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Special Math Functions</category>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="a" type="__m512i"/>
+	<parameter varname="b" type="__m512i"/>
+	<description>Compare packed unsigned 32-bit integers in "a" and "b", and store packed maximum values in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 15
+	i := j*32
+	IF k[j]
+		IF a[i+31:i] &gt; b[i+31:i]
+			dst[i+31:i] := a[i+31:i]
+		ELSE
+			dst[i+31:i] := b[i+31:i]
+		FI
+	ELSE
+		dst[i+31:i] := 0
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vpmaxud' form='zmm {k}, zmm, zmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" rettype="__m512i" name="_mm512_max_epu32">
+	<type>Integer</type>
+	<CPUID>AVX512F/KNCNI</CPUID>
+	<category>Special Math Functions</category>
+	<parameter varname="a" type="__m512i"/>
+	<parameter varname="b" type="__m512i"/>
+	<description>Compare packed unsigned 32-bit integers in "a" and "b", and store packed maximum values in "dst".</description>
+	<operation>
+FOR j := 0 to 15
+	i := j*32
+	IF a[i+31:i] &gt; b[i+31:i]
+		dst[i+31:i] := a[i+31:i]
+	ELSE
+		dst[i+31:i] := b[i+31:i]
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vpmaxud' form='zmm {k}, zmm, zmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512i" name="_mm512_mask_max_epu64">
+	<type>Integer</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Special Math Functions</category>
+	<parameter varname="src" type="__m512i"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m512i"/>
+	<parameter varname="b" type="__m512i"/>
+	<description>Compare packed unsigned 64-bit integers in "a" and "b", and store packed maximum values in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 7
+	i := j*64
+	IF k[j]
+		IF a[i+63:i] &gt; b[i+63:i]
+			dst[i+63:i] := a[i+63:i]
+		ELSE
+			dst[i+63:i] := b[i+63:i]
+		FI
+	ELSE
+		dst[i+63:i] := src[i+63:i]
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vpmaxuq' form='zmm {k}, zmm, zmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512i" name="_mm512_maskz_max_epu64">
+	<type>Integer</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Special Math Functions</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m512i"/>
+	<parameter varname="b" type="__m512i"/>
+	<description>Compare packed unsigned 64-bit integers in "a" and "b", and store packed maximum values in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*64
+	IF k[j]
+		IF a[i+63:i] &gt; b[i+63:i]
+			dst[i+63:i] := a[i+63:i]
+		ELSE
+			dst[i+63:i] := b[i+63:i]
+		FI
+	ELSE
+		dst[i+63:i] := 0
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vpmaxuq' form='zmm {k}, zmm, zmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512i" name="_mm512_max_epu64">
+	<type>Integer</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Special Math Functions</category>
+	<parameter varname="a" type="__m512i"/>
+	<parameter varname="b" type="__m512i"/>
+	<description>Compare packed unsigned 64-bit integers in "a" and "b", and store packed maximum values in "dst". </description>
+	<operation>
+FOR j := 0 to 7
+	i := j*64
+	IF a[i+63:i] &gt; b[i+63:i]
+		dst[i+63:i] := a[i+63:i]
+	ELSE
+		dst[i+63:i] := b[i+63:i]
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vpmaxuq' form='zmm {k}, zmm, zmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" rettype="__m512i" name="_mm512_mask_min_epi32">
+	<type>Integer</type>
+	<CPUID>AVX512F/KNCNI</CPUID>
+	<category>Special Math Functions</category>
+	<parameter varname="src" type="__m512i"/>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="a" type="__m512i"/>
+	<parameter varname="b" type="__m512i"/>
+	<description>Compare packed signed 32-bit integers in "a" and "b", and store packed minimum values in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 15
+	i := j*32
+	IF k[j]
+		IF a[i+31:i] &lt; b[i+31:i]
+			dst[i+31:i] := a[i+31:i]
+		ELSE
+			dst[i+31:i] := b[i+31:i]
+		FI
+	ELSE
+		dst[i+31:i] := src[i+31:i]
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vpminsd' form='zmm {k}, zmm, zmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512i" name="_mm512_maskz_min_epi32">
+	<type>Integer</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Special Math Functions</category>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="a" type="__m512i"/>
+	<parameter varname="b" type="__m512i"/>
+	<description>Compare packed signed 32-bit integers in "a" and "b", and store packed minimum values in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 15
+	i := j*32
+	IF k[j]
+		IF a[i+31:i] &lt; b[i+31:i]
+			dst[i+31:i] := a[i+31:i]
+		ELSE
+			dst[i+31:i] := b[i+31:i]
+		FI
+	ELSE
+		dst[i+31:i] := 0
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vpminsd' form='zmm {k}, zmm, zmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" rettype="__m512i" name="_mm512_min_epi32">
+	<type>Integer</type>
+	<CPUID>AVX512F/KNCNI</CPUID>
+	<category>Special Math Functions</category>
+	<parameter varname="a" type="__m512i"/>
+	<parameter varname="b" type="__m512i"/>
+	<description>Compare packed signed 32-bit integers in "a" and "b", and store packed minimum values in "dst".
+	</description>
+	<operation>
+FOR j := 0 to 15
+	i := j*32
+	IF a[i+31:i] &lt; b[i+31:i]
+		dst[i+31:i] := a[i+31:i]
+	ELSE
+		dst[i+31:i] := b[i+31:i]
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vpminsd' form='zmm {k}, zmm, zmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512i" name="_mm512_mask_min_epi64">
+	<type>Integer</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Special Math Functions</category>
+	<parameter varname="src" type="__m512i"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m512i"/>
+	<parameter varname="b" type="__m512i"/>
+	<description>Compare packed signed 64-bit integers in "a" and "b", and store packed minimum values in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*64
+	IF k[j]
+		IF a[i+63:i] &lt; b[i+63:i]
+			dst[i+63:i] := a[i+63:i]
+		ELSE
+			dst[i+63:i] := b[i+63:i]
+		FI
+	ELSE
+		dst[i+63:i] := src[i+63:i]
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vpminsq' form='zmm {k}, zmm, zmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512i" name="_mm512_maskz_min_epi64">
+	<type>Integer</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Special Math Functions</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m512i"/>
+	<parameter varname="b" type="__m512i"/>
+	<description>Compare packed signed 64-bit integers in "a" and "b", and store packed minimum values in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*64
+	IF k[j]
+		IF a[i+63:i] &lt; b[i+63:i]
+			dst[i+63:i] := a[i+63:i]
+		ELSE
+			dst[i+63:i] := b[i+63:i]
+		FI
+	ELSE
+		dst[i+63:i] := 0
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vpminsq' form='zmm {k}, zmm, zmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512i" name="_mm512_min_epi64">
+	<type>Integer</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Special Math Functions</category>
+	<parameter varname="a" type="__m512i"/>
+	<parameter varname="b" type="__m512i"/>
+	<description>Compare packed signed 64-bit integers in "a" and "b", and store packed minimum values in "dst".</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*64
+	IF a[i+63:i] &lt; b[i+63:i]
+		dst[i+63:i] := a[i+63:i]
+	ELSE
+		dst[i+63:i] := b[i+63:i]
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vpminsq' form='zmm {k}, zmm, zmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" rettype="__m512i" name="_mm512_mask_min_epu32">
+	<type>Integer</type>
+	<CPUID>AVX512F/KNCNI</CPUID>
+	<category>Special Math Functions</category>
+	<parameter varname="src" type="__m512i"/>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="a" type="__m512i"/>
+	<parameter varname="b" type="__m512i"/>
+	<description>Compare packed unsigned 32-bit integers in "a" and "b", and store packed minimum values in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 15
+	i := j*32
+	IF k[j]
+		IF a[i+31:i] &lt; b[i+31:i]
+			dst[i+31:i] := a[i+31:i]
+		ELSE
+			dst[i+31:i] := b[i+31:i]
+		FI
+	ELSE
+		dst[i+31:i] := src[i+31:i]
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vpminud' form='zmm {k}, zmm, zmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512i" name="_mm512_maskz_min_epu32">
+	<type>Integer</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Special Math Functions</category>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="a" type="__m512i"/>
+	<parameter varname="b" type="__m512i"/>
+	<description>Compare packed unsigned 32-bit integers in "a" and "b", and store packed minimum values in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 15
+	i := j*32
+	IF k[j]
+		IF a[i+31:i] &lt; b[i+31:i]
+			dst[i+31:i] := a[i+31:i]
+		ELSE
+			dst[i+31:i] := b[i+31:i]
+		FI
+	ELSE
+		dst[i+31:i] := 0
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vpminud' form='zmm {k}, zmm, zmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" rettype="__m512i" name="_mm512_min_epu32">
+	<type>Integer</type>
+	<CPUID>AVX512F/KNCNI</CPUID>
+	<category>Special Math Functions</category>
+	<parameter varname="a" type="__m512i"/>
+	<parameter varname="b" type="__m512i"/>
+	<description>Compare packed unsigned 32-bit integers in "a" and "b", and store packed minimum values in "dst".
+	</description>
+	<operation>
+FOR j := 0 to 15
+	i := j*32
+	IF a[i+31:i] &lt; b[i+31:i]
+		dst[i+31:i] := a[i+31:i]
+	ELSE
+		dst[i+31:i] := b[i+31:i]
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vpminud' form='zmm {k}, zmm, zmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512i" name="_mm512_mask_min_epu64">
+	<type>Integer</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Special Math Functions</category>
+	<parameter varname="src" type="__m512i"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m512i"/>
+	<parameter varname="b" type="__m512i"/>
+	<description>Compare packed unsigned 64-bit integers in "a" and "b", and store packed minimum values in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 7
+	i := j*64
+	IF k[j]
+		IF a[i+63:i] &lt; b[i+63:i]
+			dst[i+63:i] := a[i+63:i]
+		ELSE
+			dst[i+63:i] := b[i+63:i]
+		FI
+	ELSE
+		dst[i+63:i] := src[i+63:i]
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vpminuq' form='zmm {k}, zmm, zmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512i" name="_mm512_maskz_min_epu64">
+	<type>Integer</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Special Math Functions</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m512i"/>
+	<parameter varname="b" type="__m512i"/>
+	<description>Compare packed unsigned 64-bit integers in "a" and "b", and store packed minimum values in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*64
+	IF k[j]
+		IF a[i+63:i] &lt; b[i+63:i]
+			dst[i+63:i] := a[i+63:i]
+		ELSE
+			dst[i+63:i] := b[i+63:i]
+		FI
+	ELSE
+		dst[i+63:i] := 0
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vpminuq' form='zmm {k}, zmm, zmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512i" name="_mm512_min_epu64">
+	<type>Integer</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Special Math Functions</category>
+	<parameter varname="a" type="__m512i"/>
+	<parameter varname="b" type="__m512i"/>
+	<description>Compare packed unsigned 64-bit integers in "a" and "b", and store packed minimum values in "dst".</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*64
+	IF a[i+63:i] &lt; b[i+63:i]
+		dst[i+63:i] := a[i+63:i]
+	ELSE
+		dst[i+63:i] := b[i+63:i]
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vpminuq' form='zmm {k}, zmm, zmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm512_cvtepi32_epi8">
+	<type>Integer</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Convert</category>
+	<parameter varname="a" type="__m512i"/>
+	<description>Convert packed 32-bit integers in "a" to packed 8-bit integers with truncation, and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 15
+	i := 32*j
+	k := 8*j
+	dst[k+7:k] := Truncate_Int32_To_Int8(a[i+31:i])
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name='vpmovdb' form='xmm {k}, zmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm512_mask_cvtepi32_epi8">
+	<type>Integer</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Convert</category>
+	<parameter varname="src" type="__m128i"/>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="a" type="__m512i"/>
+	<description>Convert packed 32-bit integers in "a" to packed 8-bit integers with truncation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 15
+	i := 32*j
+	l := 8*j
+	IF k[j]
+		dst[l+7:l] := Truncate_Int32_To_Int8(a[i+31:i])
+	ELSE
+		dst[l+7:l] := src[l+7:l]
+	FI
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name='vpmovdb' form='xmm {k}, zmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="void" name="_mm512_mask_cvtepi32_storeu_epi8">
+	<type>Integer</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Convert</category>
+	<category>Store</category>
+	<parameter varname="base_addr" type="void*"/>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="a" type="__m512i"/>
+	<description>Convert packed 32-bit integers in "a" to packed 8-bit integers with truncation, and store the active results (those with their respective bit set in writemask "k") to unaligned memory at "base_addr".</description>
+	<operation>
+FOR j := 0 to 15
+	i := 32*j
+	l := 8*j
+	IF k[j]
+		MEM[base_addr+l+7:base_addr+l] := Truncate_Int32_To_Int8(a[i+31:i])
+	FI
+ENDFOR
+	</operation>
+	<instruction name='vpmovdb' form='m128 {k}, zmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm512_maskz_cvtepi32_epi8">
+	<type>Integer</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Convert</category>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="a" type="__m512i"/>
+	<description>Convert packed 32-bit integers in "a" to packed 8-bit integers with truncation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 15
+	i := 32*j
+	l := 8*j
+	IF k[j]
+		dst[l+7:l] := Truncate_Int32_To_Int8(a[i+31:i])
+	ELSE
+		dst[l+7:l] := 0
+	FI
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name='vpmovdb' form='xmm {k}, zmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256i" name="_mm512_cvtepi32_epi16">
+	<type>Integer</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Convert</category>
+	<parameter varname="a" type="__m512i"/>
+	<description>Convert packed 32-bit integers in "a" to packed 16-bit integers with truncation, and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 15
+	i := 32*j
+	k := 16*j
+	dst[k+15:k] := Truncate_Int32_To_Int16(a[i+31:i])
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name='vpmovdw' form='ymm {k}, zmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256i" name="_mm512_mask_cvtepi32_epi16">
+	<type>Integer</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Convert</category>
+	<parameter varname="src" type="__m256i"/>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="a" type="__m512i"/>
+	<description>Convert packed 32-bit integers in "a" to packed 16-bit integers with truncation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 15
+	i := 32*j
+	l := 16*j
+	IF k[j]
+		dst[l+15:l] := Truncate_Int32_To_Int16(a[i+31:i])
+	ELSE
+		dst[l+15:l] := src[l+15:l]
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name='vpmovdw' form='ymm {k}, zmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="void" name="_mm512_mask_cvtepi32_storeu_epi16">
+	<type>Integer</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Convert</category>
+	<category>Store</category>
+	<parameter varname="base_addr" type="void*"/>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="a" type="__m512i"/>
+	<description>Convert packed 32-bit integers in "a" to packed 16-bit integers with truncation, and store the active results (those with their respective bit set in writemask "k") to unaligned memory at "base_addr".</description>
+	<operation>
+FOR j := 0 to 15
+	i := 32*j
+	l := 16*j
+	IF k[j]
+		MEM[base_addr+l+15:base_addr+l] := Truncate_Int32_To_Int16(a[i+31:i])
+	FI
+ENDFOR
+	</operation>
+	<instruction name='vpmovdw' form='m256 {k}, zmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256i" name="_mm512_maskz_cvtepi32_epi16">
+	<type>Integer</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Convert</category>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="a" type="__m512i"/>
+	<description>Convert packed 32-bit integers in "a" to packed 16-bit integers with truncation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 15
+	i := 32*j
+	l := 16*j
+	IF k[j]
+		dst[l+15:l] := Truncate_Int32_To_Int16(a[i+31:i])
+	ELSE
+		dst[l+15:l] := 0
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name='vpmovdw' form='ymm {k}, zmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm512_cvtepi64_epi8">
+	<type>Integer</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Convert</category>
+	<parameter varname="a" type="__m512i"/>
+	<description>Convert packed 64-bit integers in "a" to packed 8-bit integers with truncation, and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 7
+	i := 64*j
+	k := 8*j
+	dst[k+7:k] := Truncate_Int64_To_Int8(a[i+63:i])
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name='vpmovqb' form='xmm {k}, zmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm512_mask_cvtepi64_epi8">
+	<type>Integer</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Convert</category>
+	<parameter varname="src" type="__m128i"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m512i"/>
+	<description>Convert packed 64-bit integers in "a" to packed 8-bit integers with truncation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 7
+	i := 64*j
+	l := 8*j
+	IF k[j]
+		dst[l+7:l] := Truncate_Int64_To_Int8(a[i+63:i])
+	ELSE
+		dst[l+7:l] := src[l+7:l]
+	FI
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name='vpmovqb' form='xmm {k}, zmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="void" name="_mm512_mask_cvtepi64_storeu_epi8">
+	<type>Integer</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Convert</category>
+	<category>Store</category>
+	<parameter varname="base_addr" type="void*"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m512i"/>
+	<description>Convert packed 64-bit integers in "a" to packed 8-bit integers with truncation, and store the active results (those with their respective bit set in writemask "k") to unaligned memory at "base_addr".</description>
+	<operation>
+FOR j := 0 to 7
+	i := 64*j
+	l := 8*j
+	IF k[j]
+		MEM[base_addr+l+7:base_addr+l] := Truncate_Int64_To_Int8(a[i+63:i])
+	FI
+ENDFOR
+	</operation>
+	<instruction name='vpmovqb' form='m64 {k}, zmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm512_maskz_cvtepi64_epi8">
+	<type>Integer</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Convert</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m512i"/>
+	<description>Convert packed 64-bit integers in "a" to packed 8-bit integers with truncation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 7
+	i := 64*j
+	l := 8*j
+	IF k[j]
+		dst[l+7:l] := Truncate_Int64_To_Int8(a[i+63:i])
+	ELSE
+		dst[l+7:l] := 0
+	FI
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name='vpmovqb' form='xmm {k}, zmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256i" name="_mm512_cvtepi64_epi32">
+	<type>Integer</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Convert</category>
+	<parameter varname="a" type="__m512i"/>
+	<description>Convert packed 64-bit integers in "a" to packed 32-bit integers with truncation, and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 7
+	i := 64*j
+	k := 32*j
+	dst[k+31:k] := Truncate_Int64_To_Int32(a[i+63:i])
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name='vpmovqd' form='ymm {k}, zmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256i" name="_mm512_mask_cvtepi64_epi32">
+	<type>Integer</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Convert</category>
+	<parameter varname="src" type="__m256i"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m512i"/>
+	<description>Convert packed 64-bit integers in "a" to packed 32-bit integers with truncation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 7
+	i := 64*j
+	l := 32*j
+	IF k[j]
+		dst[l+31:l] := Truncate_Int64_To_Int32(a[i+63:i])
+	ELSE
+		dst[l+31:l] := src[l+31:l]
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name='vpmovqd' form='ymm {k}, zmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="void" name="_mm512_mask_cvtepi64_storeu_epi32">
+	<type>Integer</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Convert</category>
+	<category>Store</category>
+	<parameter varname="base_addr" type="void*"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m512i"/>
+	<description>Convert packed 64-bit integers in "a" to packed 32-bit integers with truncation, and store the active results (those with their respective bit set in writemask "k") to unaligned memory at "base_addr".</description>
+	<operation>
+FOR j := 0 to 7
+	i := 64*j
+	l := 32*j
+	IF k[j]
+		MEM[base_addr+l+31:base_addr+l] := Truncate_Int64_To_Int32(a[i+63:i])
+	FI
+ENDFOR
+	</operation>
+	<instruction name='vpmovqd' form='m256 {k}, zmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256i" name="_mm512_maskz_cvtepi64_epi32">
+	<type>Integer</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Convert</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m512i"/>
+	<description>Convert packed 64-bit integers in "a" to packed 32-bit integers with truncation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 7
+	i := 64*j
+	l := 32*j
+	IF k[j]
+		dst[l+31:l] := Truncate_Int64_To_Int32(a[i+63:i])
+	ELSE
+		dst[l+31:l] := 0
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name='vpmovqd' form='ymm {k}, zmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm512_cvtepi64_epi16">
+	<type>Integer</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Convert</category>
+	<parameter varname="a" type="__m512i"/>
+	<description>Convert packed 64-bit integers in "a" to packed 16-bit integers with truncation, and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 7
+	i := 64*j
+	k := 16*j
+	dst[k+15:k] := Truncate_Int64_To_Int16(a[i+63:i])
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name='vpmovqw' form='xmm {k}, zmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm512_mask_cvtepi64_epi16">
+	<type>Integer</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Convert</category>
+	<parameter varname="src" type="__m128i"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m512i"/>
+	<description>Convert packed 64-bit integers in "a" to packed 16-bit integers with truncation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 7
+	i := 64*j
+	l := 16*j
+	IF k[j]
+		dst[l+15:l] := Truncate_Int64_To_Int16(a[i+63:i])
+	ELSE
+		dst[l+15:l] := src[l+15:l]
+	FI
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name='vpmovqw' form='xmm {k}, zmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="void" name="_mm512_mask_cvtepi64_storeu_epi16">
+	<type>Integer</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Convert</category>
+	<category>Store</category>
+	<parameter varname="base_addr" type="void*"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m512i"/>
+	<description>Convert packed 64-bit integers in "a" to packed 16-bit integers with truncation, and store the active results (those with their respective bit set in writemask "k") to unaligned memory at "base_addr".</description>
+	<operation>
+FOR j := 0 to 7
+	i := 64*j
+	l := 16*j
+	IF k[j]
+		MEM[base_addr+l+15:base_addr+l] := Truncate_Int64_To_Int16(a[i+63:i])
+	FI
+ENDFOR
+	</operation>
+	<instruction name='vpmovqw' form='m128 {k}, zmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm512_maskz_cvtepi64_epi16">
+	<type>Integer</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Convert</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m512i"/>
+	<description>Convert packed 64-bit integers in "a" to packed 16-bit integers with truncation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 7
+	i := 64*j
+	l := 16*j
+	IF k[j]
+		dst[l+15:l] := Truncate_Int64_To_Int16(a[i+63:i])
+	ELSE
+		dst[l+15:l] := 0
+	FI
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name='vpmovqw' form='xmm {k}, zmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm512_cvtsepi32_epi8">
+	<type>Integer</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Convert</category>
+	<parameter varname="a" type="__m512i"/>
+	<description>Convert packed 32-bit integers in "a" to packed 8-bit integers with signed saturation, and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 15
+	i := 32*j
+	k := 8*j
+	dst[k+7:k] := Saturate_Int32_To_Int8(a[i+31:i])
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name='vpmovsdb' form='xmm {k}, zmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm512_mask_cvtsepi32_epi8">
+	<type>Integer</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Convert</category>
+	<parameter varname="src" type="__m128i"/>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="a" type="__m512i"/>
+	<description>Convert packed 32-bit integers in "a" to packed 8-bit integers with signed saturation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 15
+	i := 32*j
+	l := 8*j
+	IF k[j]
+		dst[l+7:l] := Saturate_Int32_To_Int8(a[i+31:i])
+	ELSE
+		dst[l+7:l] := src[l+7:l]
+	FI
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name='vpmovsdb' form='xmm {k}, zmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="void" name="_mm512_mask_cvtsepi32_storeu_epi8">
+	<type>Integer</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Convert</category>
+	<category>Store</category>
+	<parameter varname="base_addr" type="void*"/>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="a" type="__m512i"/>
+	<description>Convert packed 32-bit integers in "a" to packed 8-bit integers with signed saturation, and store the active results (those with their respective bit set in writemask "k") to unaligned memory at "base_addr".</description>
+	<operation>
+FOR j := 0 to 15
+	i := 32*j
+	l := 8*j
+	IF k[j]
+		MEM[base_addr+l+7:base_addr+l] := Saturate_Int32_To_Int8(a[i+31:i])
+	FI
+ENDFOR
+	</operation>
+	<instruction name='vpmovsdb' form='m128 {k}, zmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm512_maskz_cvtsepi32_epi8">
+	<type>Integer</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Convert</category>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="a" type="__m512i"/>
+	<description>Convert packed 32-bit integers in "a" to packed 8-bit integers with signed saturation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 15
+	i := 32*j
+	l := 8*j
+	IF k[j]
+		dst[l+7:l] := Saturate_Int32_To_Int8(a[i+31:i])
+	ELSE
+		dst[l+7:l] := 0
+	FI
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name='vpmovsdb' form='xmm {k}, zmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256i" name="_mm512_cvtsepi32_epi16">
+	<type>Integer</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Convert</category>
+	<parameter varname="a" type="__m512i"/>
+	<description>Convert packed 32-bit integers in "a" to packed 16-bit integers with signed saturation, and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 15
+	i := 32*j
+	k := 16*j
+	dst[k+15:k] := Saturate_Int32_To_Int16(a[i+31:i])
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name='vpmovsdw' form='ymm {k}, zmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256i" name="_mm512_mask_cvtsepi32_epi16">
+	<type>Integer</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Convert</category>
+	<parameter varname="src" type="__m256i"/>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="a" type="__m512i"/>
+	<description>Convert packed 32-bit integers in "a" to packed 16-bit integers with signed saturation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 15
+	i := 32*j
+	l := 16*j
+	IF k[j]
+		dst[l+15:l] := Saturate_Int32_To_Int16(a[i+31:i])
+	ELSE
+		dst[l+15:l] := src[l+15:l]
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name='vpmovsdw' form='ymm {k}, zmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="void" name="_mm512_mask_cvtsepi32_storeu_epi16">
+	<type>Integer</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Convert</category>
+	<category>Store</category>
+	<parameter varname="base_addr" type="void*"/>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="a" type="__m512i"/>
+	<description>Convert packed 32-bit integers in "a" to packed 16-bit integers with signed saturation, and store the active results (those with their respective bit set in writemask "k") to unaligned memory at "base_addr".</description>
+	<operation>
+FOR j := 0 to 15
+	i := 32*j
+	l := 16*j
+	IF k[j]
+		MEM[base_addr+l+15:base_addr+l] := Saturate_Int32_To_Int16(a[i+31:i])
+	FI
+ENDFOR
+	</operation>
+	<instruction name='vpmovsdw' form='m256 {k}, zmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256i" name="_mm512_maskz_cvtsepi32_epi16">
+	<type>Integer</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Convert</category>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="a" type="__m512i"/>
+	<description>Convert packed 32-bit integers in "a" to packed 16-bit integers with signed saturation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 15
+	i := 32*j
+	l := 16*j
+	IF k[j]
+		dst[l+15:l] := Saturate_Int32_To_Int16(a[i+31:i])
+	ELSE
+		dst[l+15:l] := 0
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name='vpmovsdw' form='ymm {k}, zmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm512_cvtsepi64_epi8">
+	<type>Integer</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Convert</category>
+	<parameter varname="a" type="__m512i"/>
+	<description>Convert packed 64-bit integers in "a" to packed 8-bit integers with signed saturation, and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 7
+	i := 64*j
+	k := 8*j
+	dst[k+7:k] := Saturate_Int64_To_Int8(a[i+63:i])
+ENDFOR
+dst[MAX:64] := 0
+	</operation>
+	<instruction name='vpmovsqb' form='xmm {k}, zmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm512_mask_cvtsepi64_epi8">
+	<type>Integer</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Convert</category>
+	<parameter varname="src" type="__m128i"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m512i"/>
+	<description>Convert packed 64-bit integers in "a" to packed 8-bit integers with signed saturation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 7
+	i := 64*j
+	l := 8*j
+	IF k[j]
+		dst[l+7:l] := Saturate_Int64_To_Int8(a[i+63:i])
+	ELSE
+		dst[l+7:l] := src[l+7:l]
+	FI
+ENDFOR
+dst[MAX:64] := 0
+	</operation>
+	<instruction name='vpmovsqb' form='xmm {k}, zmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="void" name="_mm512_mask_cvtsepi64_storeu_epi8">
+	<type>Integer</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Convert</category>
+	<category>Store</category>
+	<parameter varname="base_addr" type="void*"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m512i"/>
+	<description>Convert packed 64-bit integers in "a" to packed 8-bit integers with signed saturation, and store the active results (those with their respective bit set in writemask "k") to unaligned memory at "base_addr".</description>
+	<operation>
+FOR j := 0 to 7
+	i := 64*j
+	l := 8*j
+	IF k[j]
+		MEM[base_addr+l+7:base_addr+l] := Saturate_Int64_To_Int8(a[i+63:i])
+	FI
+ENDFOR
+	</operation>
+	<instruction name='vpmovsqb' form='m64 {k}, zmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm512_maskz_cvtsepi64_epi8">
+	<type>Integer</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Convert</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m512i"/>
+	<description>Convert packed 64-bit integers in "a" to packed 8-bit integers with signed saturation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 7
+	i := 64*j
+	l := 8*j
+	IF k[j]
+		dst[l+7:l] := Saturate_Int64_To_Int8(a[i+63:i])
+	ELSE
+		dst[l+7:l] := 0
+	FI
+ENDFOR
+dst[MAX:64] := 0
+	</operation>
+	<instruction name='vpmovsqb' form='xmm {k}, zmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256i" name="_mm512_cvtsepi64_epi32">
+	<type>Integer</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Convert</category>
+	<parameter varname="a" type="__m512i"/>
+	<description>Convert packed 64-bit integers in "a" to packed 32-bit integers with signed saturation, and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 7
+	i := 64*j
+	k := 32*j
+	dst[k+31:k] := Saturate_Int64_To_Int32(a[i+63:i])
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name='vpmovsqd' form='ymm {k}, zmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256i" name="_mm512_mask_cvtsepi64_epi32">
+	<type>Integer</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Convert</category>
+	<parameter varname="src" type="__m256i"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m512i"/>
+	<description>Convert packed 64-bit integers in "a" to packed 32-bit integers with signed saturation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 7
+	i := 64*j
+	l := 32*j
+	IF k[j]
+		dst[l+31:l] := Saturate_Int64_To_Int32(a[i+63:i])
+	ELSE
+		dst[l+31:l] := src[l+31:l]
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name='vpmovsqd' form='ymm {k}, zmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="void" name="_mm512_mask_cvtsepi64_storeu_epi32">
+	<type>Integer</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Convert</category>
+	<category>Store</category>
+	<parameter varname="base_addr" type="void*"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m512i"/>
+	<description>Convert packed 64-bit integers in "a" to packed 32-bit integers with signed saturation, and store the active results (those with their respective bit set in writemask "k") to unaligned memory at "base_addr".</description>
+	<operation>
+FOR j := 0 to 7
+	i := 64*j
+	l := 32*j
+	IF k[j]
+		MEM[base_addr+l+31:base_addr+l] := Saturate_Int64_To_Int32(a[i+63:i])
+	FI
+ENDFOR
+	</operation>
+	<instruction name='vpmovsqd' form='m256 {k}, zmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256i" name="_mm512_maskz_cvtsepi64_epi32">
+	<type>Integer</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Convert</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m512i"/>
+	<description>Convert packed 64-bit integers in "a" to packed 32-bit integers with signed saturation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 7
+	i := 64*j
+	l := 32*j
+	IF k[j]
+		dst[l+31:l] := Saturate_Int64_To_Int32(a[i+63:i])
+	ELSE
+		dst[l+31:l] := 0
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name='vpmovsqd' form='ymm {k}, zmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm512_cvtsepi64_epi16">
+	<type>Integer</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Convert</category>
+	<parameter varname="a" type="__m512i"/>
+	<description>Convert packed 64-bit integers in "a" to packed 16-bit integers with signed saturation, and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 7
+	i := 64*j
+	k := 16*j
+	dst[k+15:k] := Saturate_Int64_To_Int16(a[i+63:i])
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name='vpmovsqw' form='xmm {k}, zmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm512_mask_cvtsepi64_epi16">
+	<type>Integer</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Convert</category>
+	<parameter varname="src" type="__m128i"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m512i"/>
+	<description>Convert packed 64-bit integers in "a" to packed 16-bit integers with signed saturation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 7
+	i := 64*j
+	l := 16*j
+	IF k[j]
+		dst[l+15:l] := Saturate_Int64_To_Int16(a[i+63:i])
+	ELSE
+		dst[l+15:l] := src[l+15:l]
+	FI
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name='vpmovsqw' form='xmm {k}, zmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="void" name="_mm512_mask_cvtsepi64_storeu_epi16">
+	<type>Integer</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Convert</category>
+	<category>Store</category>
+	<parameter varname="base_addr" type="void*"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m512i"/>
+	<description>Convert packed 64-bit integers in "a" to packed 16-bit integers with signed saturation, and store the active results (those with their respective bit set in writemask "k") to unaligned memory at "base_addr".</description>
+	<operation>
+FOR j := 0 to 7
+	i := 64*j
+	l := 16*j
+	IF k[j]
+		MEM[base_addr+l+15:base_addr+l] := Saturate_Int64_To_Int16(a[i+63:i])
+	FI
+ENDFOR
+	</operation>
+	<instruction name='vpmovsqw' form='m128 {k}, zmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm512_maskz_cvtsepi64_epi16">
+	<type>Integer</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Convert</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m512i"/>
+	<description>Convert packed 64-bit integers in "a" to packed 16-bit integers with signed saturation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 7
+	i := 64*j
+	l := 16*j
+	IF k[j]
+		dst[l+15:l] := Saturate_Int64_To_Int16(a[i+63:i])
+	ELSE
+		dst[l+15:l] := 0
+	FI
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name='vpmovsqw' form='xmm {k}, zmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512i" name="_mm512_cvtepi8_epi32">
+	<type>Integer</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Convert</category>
+	<parameter varname="a" type="__m128i"/>
+	<description>Sign extend packed 8-bit integers in "a" to packed 32-bit integers, and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 15
+	i := 32*j
+	k := 8*j
+	dst[i+31:i] := SignExtend(a[k+7:k])
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vpmovsxbd' form='zmm {k}, xmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512i" name="_mm512_mask_cvtepi8_epi32">
+	<type>Integer</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Convert</category>
+	<parameter varname="src" type="__m512i"/>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="a" type="__m128i"/>
+	<description>Sign extend packed 8-bit integers in "a" to packed 32-bit integers, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 15
+	i := 32*j
+	l := 8*j
+	IF k[j]
+		dst[i+31:i] := SignExtend(a[l+7:l])
+	ELSE
+		dst[i+31:i] := src[i+31:i]
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vpmovsxbd' form='zmm {k}, xmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512i" name="_mm512_maskz_cvtepi8_epi32">
+	<type>Integer</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Convert</category>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="a" type="__m128i"/>
+	<description>Sign extend packed 8-bit integers in "a" to packed 32-bit integers, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 15
+	i := 32*j
+	l := 8*j
+	IF k[j]
+		dst[i+31:i] := SignExtend(a[l+7:l])
+	ELSE
+		dst[i+31:i] := 0
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vpmovsxbd' form='zmm {k}, xmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512i" name="_mm512_cvtepi8_epi64">
+	<type>Integer</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Convert</category>
+	<parameter varname="a" type="__m128i"/>
+	<description>Sign extend packed 8-bit integers in the low 8 bytes of "a" to packed 64-bit integers, and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 7
+	i := 64*j
+	k := 8*j
+	dst[i+63:i] := SignExtend(a[k+7:k])
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vpmovsxbq' form='zmm {k}, xmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512i" name="_mm512_mask_cvtepi8_epi64">
+	<type>Integer</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Convert</category>
+	<parameter varname="src" type="__m512i"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128i"/>
+	<description>Sign extend packed 8-bit integers in the low 8 bytes of "a" to packed 64-bit integers, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 7
+	i := 64*j
+	l := 8*j
+	IF k[j]
+		dst[i+63:i] := SignExtend(a[l+7:l])
+	ELSE
+		dst[i+63:i] := src[i+63:i]
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vpmovsxbq' form='zmm {k}, xmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512i" name="_mm512_maskz_cvtepi8_epi64">
+	<type>Integer</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Convert</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128i"/>
+	<description>Sign extend packed 8-bit integers in the low 8 bytes of "a" to packed 64-bit integers, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 7
+	i := 64*j
+	l := 8*j
+	IF k[j]
+		dst[i+63:i] := SignExtend(a[l+7:l])
+	ELSE
+		dst[i+63:i] := 0
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vpmovsxbq' form='zmm {k}, xmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512i" name="_mm512_cvtepi32_epi64">
+	<type>Integer</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Convert</category>
+	<parameter varname="a" type="__m256i"/>
+	<description>Sign extend packed 32-bit integers in "a" to packed 64-bit integers, and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 7
+	i := 64*j
+	k := 32*j
+	dst[i+63:i] := SignExtend(a[k+31:k])
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vpmovsxdq' form='zmm {k}, ymm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512i" name="_mm512_mask_cvtepi32_epi64">
+	<type>Integer</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Convert</category>
+	<parameter varname="src" type="__m512i"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m256i"/>
+	<description>Sign extend packed 32-bit integers in "a" to packed 64-bit integers, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 7
+	i := 64*j
+	l := 32*j
+	IF k[j]
+		dst[i+63:i] := SignExtend(a[l+31:l])
+	ELSE
+		dst[i+63:i] := src[i+63:i]
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vpmovsxdq' form='zmm {k}, ymm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512i" name="_mm512_maskz_cvtepi32_epi64">
+	<type>Integer</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Convert</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m256i"/>
+	<description>Sign extend packed 32-bit integers in "a" to packed 64-bit integers, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 7
+	i := 64*j
+	l := 32*j
+	IF k[j]
+		dst[i+63:i] := SignExtend(a[l+31:l])
+	ELSE
+		dst[i+63:i] := 0
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vpmovsxdq' form='zmm {k}, ymm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512i" name="_mm512_cvtepi16_epi32">
+	<type>Integer</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Convert</category>
+	<parameter varname="a" type="__m256i"/>
+	<description>Sign extend packed 16-bit integers in "a" to packed 32-bit integers, and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 15
+	i := 32*j
+	k := 16*j
+	dst[i+31:i] := SignExtend(a[k+15:k])
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vpmovsxwd' form='zmm {k}, ymm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512i" name="_mm512_mask_cvtepi16_epi32">
+	<type>Integer</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Convert</category>
+	<parameter varname="src" type="__m512i"/>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="a" type="__m256i"/>
+	<description>Sign extend packed 16-bit integers in "a" to packed 32-bit integers, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 15
+	i := j*32
+	l := j*16
+	IF k[j]
+		dst[i+31:i] := SignExtend(a[l+15:l])
+	ELSE
+		dst[i+31:i] := src[i+31:i]
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vpmovsxwd' form='zmm {k}, ymm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512i" name="_mm512_maskz_cvtepi16_epi32">
+	<type>Integer</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Convert</category>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="a" type="__m256i"/>
+	<description>Sign extend packed 16-bit integers in "a" to packed 32-bit integers, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 15
+	i := 32*j
+	l := 16*j
+	IF k[j]
+		dst[i+31:i] := SignExtend(a[l+15:l])
+	ELSE
+		dst[i+31:i] := 0
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vpmovsxwd' form='zmm {k}, ymm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512i" name="_mm512_cvtepi16_epi64">
+	<type>Integer</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Convert</category>
+	<parameter varname="a" type="__m128i"/>
+	<description>Sign extend packed 16-bit integers in "a" to packed 64-bit integers, and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 7
+	i := 64*j
+	k := 16*j
+	dst[i+63:i] := SignExtend(a[k+15:k])
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vpmovsxwq' form='zmm {k}, xmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512i" name="_mm512_mask_cvtepi16_epi64">
+	<type>Integer</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Convert</category>
+	<parameter varname="src" type="__m512i"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128i"/>
+	<description>Sign extend packed 16-bit integers in "a" to packed 64-bit integers, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 7
+	i := 64*j
+	l := 16*j
+	IF k[j]
+		dst[i+63:i] := SignExtend(a[l+15:l])
+	ELSE
+		dst[i+63:i] := src[i+63:i]
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vpmovsxwq' form='zmm {k}, xmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512i" name="_mm512_maskz_cvtepi16_epi64">
+	<type>Integer</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Convert</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128i"/>
+	<description>Sign extend packed 16-bit integers in "a" to packed 64-bit integers, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 7
+	i := 64*j
+	l := 16*j
+	IF k[j]
+		dst[i+63:i] := SignExtend(a[l+15:l])
+	ELSE
+		dst[i+63:i] := 0
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vpmovsxwq' form='zmm {k}, xmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm512_cvtusepi32_epi8">
+	<type>Integer</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Convert</category>
+	<parameter varname="a" type="__m512i"/>
+	<description>Convert packed unsigned 32-bit integers in "a" to packed unsigned 8-bit integers with unsigned saturation, and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 15
+	i := 32*j
+	k := 8*j
+	dst[k+7:k] := Saturate_UnsignedInt32_To_Int8(a[i+31:i])
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name='vpmovusdb' form='xmm {k}, zmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm512_mask_cvtusepi32_epi8">
+	<type>Integer</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Convert</category>
+	<parameter varname="src" type="__m128i"/>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="a" type="__m512i"/>
+	<description>Convert packed unsigned 32-bit integers in "a" to packed unsigned 8-bit integers with unsigned saturation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 15
+	i := 32*j
+	l := 8*j
+	IF k[j]
+		dst[l+7:l] := Saturate_UnsignedInt32_To_Int8(a[i+31:i])
+	ELSE
+		dst[l+7:l] := src[l+7:l]
+	FI
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name='vpmovusdb' form='xmm {k}, zmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="void" name="_mm512_mask_cvtusepi32_storeu_epi8">
+	<type>Integer</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Convert</category>
+	<category>Store</category>
+	<parameter varname="base_addr" type="void*"/>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="a" type="__m512i"/>
+	<description>Convert packed unsigned 32-bit integers in "a" to packed unsigned 8-bit integers with unsigned saturation, and store the active results (those with their respective bit set in writemask "k") to unaligned memory at "base_addr".</description>
+	<operation>
+FOR j := 0 to 15
+	i := 32*j
+	l := 8*j
+	IF k[j]
+		MEM[base_addr+l+7:base_addr+l] := Saturate_UnsignedInt32_To_Int8(a[i+31:i])
+	FI
+ENDFOR
+	</operation>
+	<instruction name='vpmovusdb' form='m128 {k}, zmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm512_maskz_cvtusepi32_epi8">
+	<type>Integer</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Convert</category>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="a" type="__m512i"/>
+	<description>Convert packed unsigned 32-bit integers in "a" to packed unsigned 8-bit integers with unsigned saturation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 15
+	i := 32*j
+	l := 8*j
+	IF k[j]
+		dst[l+7:l] := Saturate_UnsignedInt32_To_Int8(a[i+31:i])
+	ELSE
+		dst[l+7:l] := 0
+	FI
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name='vpmovusdb' form='xmm {k}, zmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256i" name="_mm512_cvtusepi32_epi16">
+	<type>Integer</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Convert</category>
+	<parameter varname="a" type="__m512i"/>
+	<description>Convert packed unsigned 32-bit integers in "a" to packed unsigned 16-bit integers with unsigned saturation, and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 15
+	i := 32*j
+	k := 16*j
+	dst[k+15:k] := Saturate_UnsignedInt32_To_Int16(a[i+31:i])
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name='vpmovusdw' form='ymm {k}, zmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256i" name="_mm512_mask_cvtusepi32_epi16">
+	<type>Integer</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Convert</category>
+	<parameter varname="src" type="__m256i"/>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="a" type="__m512i"/>
+	<description>Convert packed unsigned 32-bit integers in "a" to packed unsigned 16-bit integers with unsigned saturation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 15
+	i := 32*j
+	l := 16*j
+	IF k[j]
+		dst[l+15:l] := Saturate_UnsignedInt32_To_Int16(a[i+31:i])
+	ELSE
+		dst[l+15:l] := src[l+15:l]
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name='vpmovusdw' form='ymm {k}, zmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="void" name="_mm512_mask_cvtusepi32_storeu_epi16">
+	<type>Integer</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Convert</category>
+	<category>Store</category>
+	<parameter varname="base_addr" type="void*"/>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="a" type="__m512i"/>
+	<description>Convert packed unsigned 32-bit integers in "a" to packed unsigned 16-bit integers with unsigned saturation, and store the active results (those with their respective bit set in writemask "k") to unaligned memory at "base_addr".</description>
+	<operation>
+FOR j := 0 to 15
+	i := 32*j
+	l := 16*j
+	IF k[j]
+		MEM[base_addr+l+15:base_addr+l] := Saturate_UnsignedInt32_To_Int16(a[i+31:i])
+	FI
+ENDFOR
+	</operation>
+	<instruction name='vpmovusdw' form='m256 {k}, zmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256i" name="_mm512_maskz_cvtusepi32_epi16">
+	<type>Integer</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Convert</category>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="a" type="__m512i"/>
+	<description>Convert packed unsigned 32-bit integers in "a" to packed unsigned 16-bit integers with unsigned saturation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 15
+	i := 32*j
+	l := 16*j
+	IF k[j]
+		dst[l+15:l] := Saturate_UnsignedInt32_To_Int16(a[i+31:i])
+	ELSE
+		dst[l+15:l] := 0
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name='vpmovusdw' form='ymm {k}, zmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm512_cvtusepi64_epi8">
+	<type>Integer</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Convert</category>
+	<parameter varname="a" type="__m512i"/>
+	<description>Convert packed unsigned 64-bit integers in "a" to packed unsigned 8-bit integers with unsigned saturation, and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 7
+	i := 64*j
+	k := 8*j
+	dst[k+7:k] := Saturate_UnsignedInt64_To_Int8(a[i+63:i])
+ENDFOR
+dst[MAX:64] := 0
+	</operation>
+	<instruction name='vpmovusqb' form='xmm {k}, zmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm512_mask_cvtusepi64_epi8">
+	<type>Integer</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Convert</category>
+	<parameter varname="src" type="__m128i"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m512i"/>
+	<description>Convert packed unsigned 64-bit integers in "a" to packed unsigned 8-bit integers with unsigned saturation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 7
+	i := 64*j
+	l := 8*j
+	IF k[j]
+		dst[l+7:l] := Saturate_UnsignedInt64_To_Int8(a[i+63:i])
+	ELSE
+		dst[l+7:l] := src[l+7:l]
+	FI
+ENDFOR
+dst[MAX:64] := 0
+	</operation>
+	<instruction name='vpmovusqb' form='xmm {k}, zmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="void" name="_mm512_mask_cvtusepi64_storeu_epi8">
+	<type>Integer</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Convert</category>
+	<category>Store</category>
+	<parameter varname="base_addr" type="void*"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m512i"/>
+	<description>Convert packed unsigned 64-bit integers in "a" to packed 8-bit integers with unsigned saturation, and store the active results (those with their respective bit set in writemask "k") to unaligned memory at "base_addr".</description>
+	<operation>
+FOR j := 0 to 7
+	i := 64*j
+	l := 8*j
+	IF k[j]
+		MEM[base_addr+l+7:base_addr+l] := Saturate_UnsignedInt64_To_Int8(a[i+63:i])
+	FI
+ENDFOR
+	</operation>
+	<instruction name='vpmovusqb' form='m64 {k}, zmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm512_maskz_cvtusepi64_epi8">
+	<type>Integer</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Convert</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m512i"/>
+	<description>Convert packed unsigned 64-bit integers in "a" to packed unsigned 8-bit integers with unsigned saturation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 7
+	i := 64*j
+	l := 8*j
+	IF k[j]
+		dst[l+7:l] := Saturate_UnsignedInt64_To_Int8(a[i+63:i])
+	ELSE
+		dst[l+7:l] := 0
+	FI
+ENDFOR
+dst[MAX:64] := 0
+	</operation>
+	<instruction name='vpmovusqb' form='xmm {k}, zmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256i" name="_mm512_cvtusepi64_epi32">
+	<type>Integer</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Convert</category>
+	<parameter varname="a" type="__m512i"/>
+	<description>Convert packed unsigned 64-bit integers in "a" to packed unsigned 32-bit integers with unsigned saturation, and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 7
+	i := 64*j
+	k := 32*j
+	dst[k+31:k] := Saturate_UnsignedInt64_To_Int32(a[i+63:i])
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name='vpmovusqd' form='ymm {k}, zmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256i" name="_mm512_mask_cvtusepi64_epi32">
+	<type>Integer</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Convert</category>
+	<parameter varname="src" type="__m256i"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m512i"/>
+	<description>Convert packed unsigned 64-bit integers in "a" to packed unsigned 32-bit integers with unsigned saturation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 7
+	i := 64*j
+	l := 32*j
+	IF k[j]
+		dst[l+31:l] := Saturate_UnsignedInt64_To_Int32(a[i+63:i])
+	ELSE
+		dst[l+31:l] := src[l+31:l]
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name='vpmovusqd' form='ymm {k}, zmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="void" name="_mm512_mask_cvtusepi64_storeu_epi32">
+	<type>Integer</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Convert</category>
+	<category>Store</category>
+	<parameter varname="base_addr" type="void*"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m512i"/>
+	<description>Convert packed unsigned 64-bit integers in "a" to packed 32-bit integers with unsigned saturation, and store the active results (those with their respective bit set in writemask "k") to unaligned memory at "base_addr".</description>
+	<operation>
+FOR j := 0 to 7
+	i := 64*j
+	l := 32*j
+	IF k[j]
+		MEM[base_addr+l+31:base_addr+l] := Saturate_UnsignedInt64_To_Int32(a[i+63:i])
+	FI
+ENDFOR
+	</operation>
+	<instruction name='vpmovusqd' form='m256 {k}, zmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256i" name="_mm512_maskz_cvtusepi64_epi32">
+	<type>Integer</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Convert</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m512i"/>
+	<description>Convert packed unsigned 64-bit integers in "a" to packed unsigned 32-bit integers with unsigned saturation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 7
+	i := 64*j
+	l := 32*j
+	IF k[j]
+		dst[l+31:l] := Saturate_UnsignedInt64_To_Int32(a[i+63:i])
+	ELSE
+		dst[l+31:l] := 0
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name='vpmovusqd' form='ymm {k}, zmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm512_cvtusepi64_epi16">
+	<type>Integer</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Convert</category>
+	<parameter varname="a" type="__m512i"/>
+	<description>Convert packed unsigned 64-bit integers in "a" to packed unsigned 16-bit integers with unsigned saturation, and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 7
+	i := 64*j
+	k := 16*j
+	dst[k+15:k] := Saturate_UnsignedInt64_To_Int16(a[i+63:i])
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name='vpmovusqw' form='xmm {k}, zmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm512_mask_cvtusepi64_epi16">
+	<type>Integer</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Convert</category>
+	<parameter varname="src" type="__m128i"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m512i"/>
+	<description>Convert packed unsigned 64-bit integers in "a" to packed unsigned 16-bit integers with unsigned saturation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 7
+	i := 64*j
+	l := 16*j
+	IF k[j]
+		dst[l+15:l] := Saturate_UnsignedInt64_To_Int16(a[i+63:i])
+	ELSE
+		dst[l+15:l] := src[l+15:l]
+	FI
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name='vpmovusqw' form='xmm {k}, zmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="void" name="_mm512_mask_cvtusepi64_storeu_epi16">
+	<type>Integer</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Convert</category>
+	<category>Store</category>
+	<parameter varname="base_addr" type="void*"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m512i"/>
+	<description>Convert packed unsigned 64-bit integers in "a" to packed 16-bit integers with unsigned saturation, and store the active results (those with their respective bit set in writemask "k") to unaligned memory at "base_addr".</description>
+	<operation>
+FOR j := 0 to 7
+	i := 64*j
+	l := 16*j
+	IF k[j]
+		MEM[base_addr+l+15:base_addr+l] := Saturate_UnsignedInt64_To_Int16(a[i+63:i])
+	FI
+ENDFOR
+	</operation>
+	<instruction name='vpmovusqw' form='m128 {k}, zmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm512_maskz_cvtusepi64_epi16">
+	<type>Integer</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Convert</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m512i"/>
+	<description>Convert packed unsigned 64-bit integers in "a" to packed unsigned 16-bit integers with unsigned saturation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 7
+	i := 64*j
+	l := 16*j
+	IF k[j]
+		dst[l+15:l] := Saturate_UnsignedInt64_To_Int16(a[i+63:i])
+	ELSE
+		dst[l+15:l] := 0
+	FI
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name='vpmovusqw' form='xmm {k}, zmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512i" name="_mm512_cvtepu8_epi32">
+	<type>Integer</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Convert</category>
+	<parameter varname="a" type="__m128i"/>
+	<description>Zero extend packed unsigned 8-bit integers in "a" to packed 32-bit integers, and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 15
+	i := 32*j
+	k := 8*j
+	dst[i+31:i] := ZeroExtend(a[k+7:k])
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vpmovzxbd' form='zmm {k}, xmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512i" name="_mm512_mask_cvtepu8_epi32">
+	<type>Integer</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Convert</category>
+	<parameter varname="src" type="__m512i"/>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="a" type="__m128i"/>
+	<description>Zero extend packed unsigned 8-bit integers in "a" to packed 32-bit integers, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 15
+	i := 32*j
+	l := 8*j
+	IF k[j]
+		dst[i+31:i] := ZeroExtend(a[l+7:l])
+	ELSE
+		dst[i+31:i] := src[i+31:i]
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vpmovzxbd' form='zmm {k}, xmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512i" name="_mm512_maskz_cvtepu8_epi32">
+	<type>Integer</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Convert</category>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="a" type="__m128i"/>
+	<description>Zero extend packed unsigned 8-bit integers in "a" to packed 32-bit integers, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 15
+	i := 32*j
+	l := 8*j
+	IF k[j]
+		dst[i+31:i] := ZeroExtend(a[l+7:l])
+	ELSE
+		dst[i+31:i] := 0
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vpmovzxbd' form='zmm {k}, xmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512i" name="_mm512_cvtepu8_epi64">
+	<type>Integer</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Convert</category>
+	<parameter varname="a" type="__m128i"/>
+	<description>Zero extend packed unsigned 8-bit integers in the low 8 bytes of "a" to packed 64-bit integers, and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 7
+	i := 64*j
+	k := 8*j
+	dst[i+63:i] := ZeroExtend(a[k+7:k])
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vpmovzxbq' form='zmm {k}, xmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512i" name="_mm512_mask_cvtepu8_epi64">
+	<type>Integer</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Convert</category>
+	<parameter varname="src" type="__m512i"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128i"/>
+	<description>Zero extend packed unsigned 8-bit integers in the low 8 bytes of "a" to packed 64-bit integers, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 7
+	i := 64*j
+	l := 8*j
+	IF k[j]
+		dst[i+63:i] := ZeroExtend(a[l+7:l])
+	ELSE
+		dst[i+63:i] := src[i+63:i]
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vpmovzxbq' form='zmm {k}, xmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512i" name="_mm512_maskz_cvtepu8_epi64">
+	<type>Integer</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Convert</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128i"/>
+	<description>Zero extend packed unsigned 8-bit integers in the low 8 bytes of "a" to packed 64-bit integers, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 7
+	i := 64*j
+	l := 8*j
+	IF k[j]
+		dst[i+63:i] := ZeroExtend(a[l+7:l])
+	ELSE
+		dst[i+63:i] := 0
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vpmovzxbq' form='zmm {k}, xmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512i" name="_mm512_cvtepu32_epi64">
+	<type>Integer</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Convert</category>
+	<parameter varname="a" type="__m256i"/>
+	<description>Zero extend packed unsigned 32-bit integers in "a" to packed 64-bit integers, and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 7
+	i := 64*j
+	k := 32*j
+	dst[i+63:i] := ZeroExtend(a[k+31:k])
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vpmovzxdq' form='zmm {k}, ymm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512i" name="_mm512_mask_cvtepu32_epi64">
+	<type>Integer</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Convert</category>
+	<parameter varname="src" type="__m512i"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m256i"/>
+	<description>Zero extend packed unsigned 32-bit integers in "a" to packed 64-bit integers, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 7
+	i := 64*j
+	l := 32*j
+	IF k[j]
+		dst[i+63:i] := ZeroExtend(a[l+31:l])
+	ELSE
+		dst[i+63:i] := src[i+63:i]
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vpmovzxdq' form='zmm {k}, ymm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512i" name="_mm512_maskz_cvtepu32_epi64">
+	<type>Integer</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Convert</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m256i"/>
+	<description>Zero extend packed unsigned 32-bit integers in "a" to packed 64-bit integers, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 7
+	i := 64*j
+	l := 32*j
+	IF k[j]
+		dst[i+63:i] := ZeroExtend(a[l+31:l])
+	ELSE 
+		dst[i+63:i] := 0
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vpmovzxdq' form='zmm {k}, ymm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512i" name="_mm512_cvtepu16_epi32">
+	<type>Integer</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Convert</category>
+	<parameter varname="a" type="__m256i"/>
+	<description>Zero extend packed unsigned 16-bit integers in "a" to packed 32-bit integers, and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 15
+	i := 32*j
+	k := 16*j
+	dst[i+31:i] := ZeroExtend(a[k+15:k])
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vpmovzxwd' form='zmm {k}, ymm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512i" name="_mm512_mask_cvtepu16_epi32">
+	<type>Integer</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Convert</category>
+	<parameter varname="src" type="__m512i"/>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="a" type="__m256i"/>
+	<description>Zero extend packed unsigned 16-bit integers in "a" to packed 32-bit integers, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 15
+	i := 32*j
+	l := 16*j
+	IF k[j]
+		dst[i+31:i] := ZeroExtend(a[l+15:l])
+	ELSE
+		dst[i+31:i] := src[i+31:i]
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vpmovzxwd' form='zmm {k}, ymm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512i" name="_mm512_maskz_cvtepu16_epi32">
+	<type>Integer</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Convert</category>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="a" type="__m256i"/>
+	<description>Zero extend packed unsigned 16-bit integers in "a" to packed 32-bit integers, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 15
+	i := 32*j
+	l := 16*j
+	IF k[j]
+		dst[i+31:i] := ZeroExtend(a[l+15:l])
+	ELSE
+		dst[i+31:i] := 0
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vpmovzxwd' form='zmm {k}, ymm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512i" name="_mm512_cvtepu16_epi64">
+	<type>Integer</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Convert</category>
+	<parameter varname="a" type="__m128i"/>
+	<description>Zero extend packed unsigned 16-bit integers in "a" to packed 64-bit integers, and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 7
+	i := 64*j
+	k := 16*j
+	dst[i+63:i] := ZeroExtend(a[k+15:k])
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vpmovzxwq' form='zmm {k}, xmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512i" name="_mm512_mask_cvtepu16_epi64">
+	<type>Integer</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Convert</category>
+	<parameter varname="src" type="__m512i"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128i"/>
+	<description>Zero extend packed unsigned 16-bit integers in "a" to packed 64-bit integers, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 7
+	i := 64*j
+	l := 16*j
+	IF k[j]
+		dst[i+63:i] := ZeroExtend(a[l+15:l])
+	ELSE
+		dst[i+63:i] := src[i+63:i]
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vpmovzxwq' form='zmm {k}, xmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512i" name="_mm512_maskz_cvtepu16_epi64">
+	<type>Integer</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Convert</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128i"/>
+	<description>Zero extend packed unsigned 16-bit integers in "a" to packed 64-bit integers, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 7
+	i := 64*j
+	l := 16*j
+	IF k[j]
+		dst[i+63:i] := ZeroExtend(a[l+15:l])
+	ELSE
+		dst[i+63:i] := 0
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vpmovzxwq' form='zmm {k}, xmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512i" name="_mm512_mask_mul_epi32">
+	<type>Integer</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="src" type="__m512i"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m512i"/>
+	<parameter varname="b" type="__m512i"/>
+	<description>Multiply the low 32-bit integers from each packed 64-bit element in "a" and "b", and store the signed 64-bit results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 7
+	i := j*64
+	IF k[j]
+		dst[i+63:i] := a[i+31:i] * b[i+31:i]
+	ELSE
+		dst[i+63:i] := src[i+63:i]
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vpmuldq' form='zmm {k}, zmm, zmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512i" name="_mm512_maskz_mul_epi32">
+	<type>Integer</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m512i"/>
+	<parameter varname="b" type="__m512i"/>
+	<description>Multiply the low 32-bit integers from each packed 64-bit element in "a" and "b", and store the signed 64-bit results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 7
+	i := j*64
+	IF k[j]
+		dst[i+63:i] := a[i+31:i] * b[i+31:i]
+	ELSE
+		dst[i+63:i] := 0
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vpmuldq' form='zmm {k}, zmm, zmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512i" name="_mm512_mul_epi32">
+	<type>Integer</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="a" type="__m512i"/>
+	<parameter varname="b" type="__m512i"/>
+	<description>Multiply the low 32-bit integers from each packed 64-bit element in "a" and "b", and store the signed 64-bit results in "dst". </description>
+	<operation>
+FOR j := 0 to 7
+	i := j*64
+	dst[i+63:i] := a[i+31:i] * b[i+31:i]
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vpmuldq' form='zmm {k}, zmm, zmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" rettype="__m512i" name="_mm512_mask_mullo_epi32">
+	<type>Integer</type>
+	<CPUID>AVX512F/KNCNI</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="src" type="__m512i"/>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="a" type="__m512i"/>
+	<parameter varname="b" type="__m512i"/>
+	<description>Multiply the packed 32-bit integers in "a" and "b", producing intermediate 64-bit integers, and store the low 32 bits of the intermediate integers in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 15
+	i := j*32
+	IF k[j]
+		tmp[63:0] := a[i+31:i] * b[i+31:i]
+		dst[i+31:i] := tmp[31:0]
+	ELSE
+		dst[i+31:i] := src[i+31:i]
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vpmulld' form='zmm {k}, zmm, zmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" rettype="__m512i" name="_mm512_mullo_epi32">
+	<type>Integer</type>
+	<CPUID>AVX512F/KNCNI</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="a" type="__m512i"/>
+	<parameter varname="b" type="__m512i"/>
+	<description>Multiply the packed 32-bit integers in "a" and "b", producing intermediate 64-bit integers, and store the low 32 bits of the intermediate integers in "dst". </description>
+	<operation>
+FOR j := 0 to 15
+	i := j*32
+	tmp[63:0] := a[i+31:i] * b[i+31:i]
+	dst[i+31:i] := tmp[31:0]
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vpmulld' form='zmm {k}, zmm, zmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512i" name="_mm512_mask_mul_epu32">
+	<type>Integer</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="src" type="__m512i"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m512i"/>
+	<parameter varname="b" type="__m512i"/>
+	<description>Multiply the low unsigned 32-bit integers from each packed 64-bit element in "a" and "b", and store the unsigned 64-bit results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 7
+	i := j*64
+	IF k[j]
+		dst[i+63:i] := a[i+31:i] * b[i+31:i]
+	ELSE
+		dst[i+63:i] := src[i+63:i]
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vpmuludq' form='zmm {k}, zmm, zmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512i" name="_mm512_maskz_mul_epu32">
+	<type>Integer</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m512i"/>
+	<parameter varname="b" type="__m512i"/>
+	<description>Multiply the low unsigned 32-bit integers from each packed 64-bit element in "a" and "b", and store the unsigned 64-bit results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 7
+	i := j*64
+	IF k[j]
+		dst[i+63:i] := a[i+31:i] * b[i+31:i]
+	ELSE
+		dst[i+63:i] := 0
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vpmuludq' form='zmm {k}, zmm, zmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512i" name="_mm512_mul_epu32">
+	<type>Integer</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="a" type="__m512i"/>
+	<parameter varname="b" type="__m512i"/>
+	<description>Multiply the low unsigned 32-bit integers from each packed 64-bit element in "a" and "b", and store the unsigned 64-bit results in "dst". </description>
+	<operation>
+FOR j := 0 to 7
+	i := j*64
+	dst[i+63:i] := a[i+31:i] * b[i+31:i]
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vpmuludq' form='zmm {k}, zmm, zmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" rettype="__m512i" name="_mm512_mask_or_epi32">
+	<type>Integer</type>
+	<CPUID>AVX512F/KNCNI</CPUID>
+	<category>Logical</category>
+	<parameter varname="src" type="__m512i"/>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="a" type="__m512i"/>
+	<parameter varname="b" type="__m512i"/>
+	<description>Compute the bitwise OR of packed 32-bit integers in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 15
+	i := j*32
+	IF k[j]
+		dst[i+31:i] := a[i+31:i] OR b[i+31:i]
+	ELSE
+		dst[i+31:i] := src[i+31:i]
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vpord' form='zmm {k}, zmm, zmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512i" name="_mm512_maskz_or_epi32">
+	<type>Integer</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Logical</category>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="a" type="__m512i"/>
+	<parameter varname="b" type="__m512i"/>
+	<description>Compute the bitwise OR of packed 32-bit integers in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 15
+	i := j*32
+	IF k[j]
+		dst[i+31:i] := a[i+31:i] OR b[i+31:i]
+	ELSE
+		dst[i+31:i] := 0
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vpord' form='zmm {k}, zmm, zmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" rettype="__m512i" name="_mm512_or_epi32">
+	<type>Integer</type>
+	<CPUID>AVX512F/KNCNI</CPUID>
+	<category>Logical</category>
+	<parameter varname="a" type="__m512i"/>
+	<parameter varname="b" type="__m512i"/>
+	<description>Compute the bitwise OR of packed 32-bit integers in "a" and "b", and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 15
+	i := j*32
+	dst[i+31:i] := a[i+31:i] OR b[i+31:i]
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vpord' form='zmm {k}, zmm, zmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" rettype="__m512i" name="_mm512_or_si512">
+	<type>Integer</type>
+	<CPUID>AVX512F/KNCNI</CPUID>
+	<category>Logical</category>
+	<parameter varname="a" type="__m512i"/>
+	<parameter varname="b" type="__m512i"/>
+	<description>Compute the bitwise OR of 512 bits (representing integer data) in "a" and "b", and store the result in "dst".</description>
+	<operation>
+dst[511:0] := (a[511:0] OR b[511:0])
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vpord' form='zmm {k}, zmm, zmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" rettype="__m512i" name="_mm512_mask_or_epi64">
+	<type>Integer</type>
+	<CPUID>AVX512F/KNCNI</CPUID>
+	<category>Logical</category>
+	<parameter varname="src" type="__m512i"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m512i"/>
+	<parameter varname="b" type="__m512i"/>
+	<description>Compute the bitwise OR of packed 64-bit integers in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 7
+	i := j*64
+	IF k[j]
+		dst[i+63:i] := a[i+63:i] OR b[i+63:i]
+	ELSE
+		dst[i+63:i] := src[i+63:i]
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vporq' form='zmm {k}, zmm, zmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512i" name="_mm512_maskz_or_epi64">
+	<type>Integer</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Logical</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m512i"/>
+	<parameter varname="b" type="__m512i"/>
+	<description>Compute the bitwise OR of packed 64-bit integers in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*64
+	IF k[j]
+		dst[i+63:i] := a[i+63:i] OR b[i+63:i]
+	ELSE
+		dst[i+63:i] := 0
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vporq' form='zmm {k}, zmm, zmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" rettype="__m512i" name="_mm512_or_epi64">
+	<type>Integer</type>
+	<CPUID>AVX512F/KNCNI</CPUID>
+	<category>Logical</category>
+	<parameter varname="a" type="__m512i"/>
+	<parameter varname="b" type="__m512i"/>
+	<description>Compute the bitwise OR of packed 64-bit integers in "a" and "b", and store the results in "dst".
+	</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*64
+	dst[i+63:i] := a[i+63:i] OR b[i+63:i]
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vporq' form='zmm {k}, zmm, zmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512i" name="_mm512_mask_rol_epi32">
+	<type>Integer</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Shift</category>
+	<parameter varname="src" type="__m512i"/>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="a" type="__m512i"/>
+	<parameter varname="imm8" type="const int"/>
+	<description>Rotate the bits in each packed 32-bit integer in "a" to the left by the number of bits specified in "imm8", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). </description>
+	<operation>
+LEFT_ROTATE_DWORDS(src, count_src){
+	count := count_src modulo 32
+	RETURN (src &lt;&lt; count) OR (src &gt;&gt; (32 - count))
+}
+FOR j := 0 to 15
+	i := j*32
+	IF k[j]
+		dst[i+31:i] := LEFT_ROTATE_DWORDS(a[i+31:i], imm8[7:0])
+	ELSE
+		dst[i+31:i] := src[i+31:i]
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vprold' form='zmm {k}, zmm, imm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512i" name="_mm512_maskz_rol_epi32">
+	<type>Integer</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Shift</category>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="a" type="__m512i"/>
+	<parameter varname="imm8" type="const int"/>
+	<description>Rotate the bits in each packed 32-bit integer in "a" to the left by the number of bits specified in "imm8", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). </description>
+	<operation>
+LEFT_ROTATE_DWORDS(src, count_src){
+	count := count_src modulo 32
+	RETURN (src &lt;&lt; count) OR (src &gt;&gt; (32 - count))
+}
+FOR j := 0 to 15
+	i := j*32
+	IF k[j]
+		dst[i+31:i] := LEFT_ROTATE_DWORDS(a[i+31:i], imm8[7:0])
+	ELSE
+		dst[i+31:i] := 0
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vprold' form='zmm {k}, zmm, imm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512i" name="_mm512_rol_epi32">
+	<type>Integer</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Shift</category>
+	<parameter varname="a" type="__m512i"/>
+	<parameter varname="imm8" type="const int"/>
+	<description>Rotate the bits in each packed 32-bit integer in "a" to the left by the number of bits specified in "imm8", and store the results in "dst". </description>
+	<operation>
+LEFT_ROTATE_DWORDS(src, count_src){
+	count := count_src modulo 32
+	RETURN (src &lt;&lt; count) OR (src &gt;&gt; (32 - count))
+}
+FOR j := 0 to 15
+	i := j*32
+	dst[i+31:i] := LEFT_ROTATE_DWORDS(a[i+31:i], imm8[7:0])
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vprold' form='zmm {k}, zmm, imm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512i" name="_mm512_mask_rol_epi64">
+	<type>Integer</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Shift</category>
+	<parameter varname="src" type="__m512i"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m512i"/>
+	<parameter varname="imm8" type="const int"/>
+	<description>Rotate the bits in each packed 64-bit integer in "a" to the left by the number of bits specified in "imm8", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). </description>
+	<operation>
+LEFT_ROTATE_QWORDS(src, count_src){
+	count := count_src modulo 64
+	RETURN (src &lt;&lt; count) OR (src &gt;&gt; (64 - count))
+}
+FOR j := 0 to 7
+	i := j*64
+	IF k[j]
+		dst[i+63:i] := LEFT_ROTATE_QWORDS(a[i+63:i], imm8[7:0])
+	ELSE
+		dst[i+63:i] := src[i+63:i]
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vprolq' form='zmm {k}, zmm, imm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512i" name="_mm512_maskz_rol_epi64">
+	<type>Integer</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Shift</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m512i"/>
+	<parameter varname="imm8" type="const int"/>
+	<description>Rotate the bits in each packed 64-bit integer in "a" to the left by the number of bits specified in "imm8", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). </description>
+	<operation>
+LEFT_ROTATE_QWORDS(src, count_src){
+	count := count_src modulo 64
+	RETURN (src &lt;&lt; count) OR (src &gt;&gt; (64 - count))
+}
+FOR j := 0 to 7
+	i := j*64
+	IF k[j]
+		dst[i+63:i] := LEFT_ROTATE_QWORDS(a[i+63:i], imm8[7:0])
+	ELSE
+		dst[i+63:i] := 0
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vprolq' form='zmm {k}, zmm, imm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512i" name="_mm512_rol_epi64">
+	<type>Integer</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Shift</category>
+	<parameter varname="a" type="__m512i"/>
+	<parameter varname="imm8" type="const int"/>
+	<description>Rotate the bits in each packed 64-bit integer in "a" to the left by the number of bits specified in "imm8", and store the results in "dst". </description>
+	<operation>
+LEFT_ROTATE_QWORDS(src, count_src){
+	count := count_src modulo 64
+	RETURN (src &lt;&lt; count) OR (src &gt;&gt; (64 - count))
+}
+FOR j := 0 to 7
+	i := j*64
+	dst[i+63:i] := LEFT_ROTATE_QWORDS(a[i+63:i], imm8[7:0])
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vprolq' form='zmm {k}, zmm, imm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512i" name="_mm512_mask_rolv_epi32">
+	<type>Integer</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Shift</category>
+	<parameter varname="src" type="__m512i"/>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="a" type="__m512i"/>
+	<parameter varname="b" type="__m512i"/>
+	<description>Rotate the bits in each packed 32-bit integer in "a" to the left by the number of bits specified in the corresponding element of "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). </description>
+	<operation>
+LEFT_ROTATE_DWORDS(src, count_src){
+	count := count_src modulo 32
+	RETURN (src &lt;&lt; count) OR (src &gt;&gt; (32 - count))
+}
+FOR j := 0 to 15
+	i := j*32
+	IF k[j]
+		dst[i+31:i] := LEFT_ROTATE_DWORDS(a[i+31:i], b[i+31:i])
+	ELSE
+		dst[i+31:i] := src[i+31:i]
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vprolvd' form='zmm {k}, zmm, zmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512i" name="_mm512_maskz_rolv_epi32">
+	<type>Integer</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Shift</category>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="a" type="__m512i"/>
+	<parameter varname="b" type="__m512i"/>
+	<description>Rotate the bits in each packed 32-bit integer in "a" to the left by the number of bits specified in the corresponding element of "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). </description>
+	<operation>
+LEFT_ROTATE_DWORDS(src, count_src){
+	count := count_src modulo 32
+	RETURN (src &lt;&lt; count) OR (src &gt;&gt; (32 - count))
+}
+FOR j := 0 to 15
+	i := j*32
+	IF k[j]
+		dst[i+31:i] := LEFT_ROTATE_DWORDS(a[i+31:i], b[i+31:i])
+	ELSE
+		dst[i+31:i] := 0
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vprolvd' form='zmm {k}, zmm, zmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512i" name="_mm512_rolv_epi32">
+	<type>Integer</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Shift</category>
+	<parameter varname="a" type="__m512i"/>
+	<parameter varname="b" type="__m512i"/>
+	<description>Rotate the bits in each packed 32-bit integer in "a" to the left by the number of bits specified in the corresponding element of "b", and store the results in "dst". </description>
+	<operation>
+LEFT_ROTATE_DWORDS(src, count_src){
+	count := count_src modulo 32
+	RETURN (src &lt;&lt; count) OR (src &gt;&gt; (32 - count))
+}
+FOR j := 0 to 15
+	i := j*32
+	dst[i+31:i] := LEFT_ROTATE_DWORDS(a[i+31:i], b[i+31:i])
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vprolvd' form='zmm {k}, zmm, zmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512i" name="_mm512_mask_rolv_epi64">
+	<type>Integer</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Shift</category>
+	<parameter varname="src" type="__m512i"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m512i"/>
+	<parameter varname="b" type="__m512i"/>
+	<description>Rotate the bits in each packed 64-bit integer in "a" to the left by the number of bits specified in the corresponding element of "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). </description>
+	<operation>
+LEFT_ROTATE_QWORDS(src, count_src){
+	count := count_src modulo 64
+	RETURN (src &lt;&lt; count) OR (src &gt;&gt; (64 - count))
+}
+FOR j := 0 to 7
+	i := j*64
+	IF k[j]
+		dst[i+63:i] := LEFT_ROTATE_QWORDS(a[i+63:i], b[i+63:i])
+	ELSE
+		dst[i+63:i] := src[i+63:i]
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vprolvq' form='zmm {k}, zmm, zmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512i" name="_mm512_maskz_rolv_epi64">
+	<type>Integer</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Shift</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m512i"/>
+	<parameter varname="b" type="__m512i"/>
+	<description>Rotate the bits in each packed 64-bit integer in "a" to the left by the number of bits specified in the corresponding element of "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). </description>
+	<operation>
+LEFT_ROTATE_QWORDS(src, count_src){
+	count := count_src modulo 64
+	RETURN (src &lt;&lt; count) OR (src &gt;&gt; (64 - count))
+}
+FOR j := 0 to 7
+	i := j*64
+	IF k[j]
+		dst[i+63:i] := LEFT_ROTATE_QWORDS(a[i+63:i], b[i+63:i])
+	ELSE
+		dst[i+63:i] := 0
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vprolvq' form='zmm {k}, zmm, zmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512i" name="_mm512_rolv_epi64">
+	<type>Integer</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Shift</category>
+	<parameter varname="a" type="__m512i"/>
+	<parameter varname="b" type="__m512i"/>
+	<description>Rotate the bits in each packed 64-bit integer in "a" to the left by the number of bits specified in the corresponding element of "b", and store the results in "dst". </description>
+	<operation>
+LEFT_ROTATE_QWORDS(src, count_src){
+	count := count_src modulo 64
+	RETURN (src &lt;&lt; count) OR (src &gt;&gt; (64 - count))
+}
+FOR j := 0 to 7
+	i := j*64
+	dst[i+63:i] := LEFT_ROTATE_QWORDS(a[i+63:i], b[i+63:i])
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vprolvq' form='zmm {k}, zmm, zmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512i" name="_mm512_mask_ror_epi32">
+	<type>Integer</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Shift</category>
+	<parameter varname="src" type="__m512i"/>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="a" type="__m512i"/>
+	<parameter varname="imm8" type="int"/>
+	<description>Rotate the bits in each packed 32-bit integer in "a" to the right by the number of bits specified in "imm8", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). </description>
+	<operation>
+RIGHT_ROTATE_DWORDS(src, count_src){
+	count := count_src modulo 32
+	RETURN (src &gt;&gt;count) OR (src &lt;&lt; (32 - count))
+}
+FOR j := 0 to 15
+	i := j*32
+	IF k[j]
+		dst[i+31:i] := RIGHT_ROTATE_DWORDS(a[i+31:i], imm8[7:0])
+	ELSE
+		dst[i+31:i] := src[i+31:i]
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vprord' form='zmm {k}, zmm, imm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512i" name="_mm512_maskz_ror_epi32">
+	<type>Integer</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Shift</category>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="a" type="__m512i"/>
+	<parameter varname="imm8" type="int"/>
+	<description>Rotate the bits in each packed 32-bit integer in "a" to the right by the number of bits specified in "imm8", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). </description>
+	<operation>
+RIGHT_ROTATE_DWORDS(src, count_src){
+	count := count_src modulo 32
+	RETURN (src &gt;&gt;count) OR (src &lt;&lt; (32 - count))
+}
+FOR j := 0 to 15
+	i := j*32
+	IF k[j]
+		dst[i+31:i] := RIGHT_ROTATE_DWORDS(a[i+31:i], imm8[7:0])
+	ELSE
+		dst[i+31:i] := 0
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vprord' form='zmm {k}, zmm, imm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512i" name="_mm512_ror_epi32">
+	<type>Integer</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Shift</category>
+	<parameter varname="a" type="__m512i"/>
+	<parameter varname="imm8" type="int"/>
+	<description>Rotate the bits in each packed 32-bit integer in "a" to the right by the number of bits specified in "imm8", and store the results in "dst". </description>
+	<operation>
+RIGHT_ROTATE_DWORDS(src, count_src){
+	count := count_src modulo 32
+	RETURN (src &gt;&gt;count) OR (src &lt;&lt; (32 - count))
+}
+FOR j := 0 to 15
+	i := j*32
+	dst[i+31:i] := RIGHT_ROTATE_DWORDS(a[i+31:i], imm8[7:0])
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vprord' form='zmm {k}, zmm, imm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512i" name="_mm512_mask_ror_epi64">
+	<type>Integer</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Shift</category>
+	<parameter varname="src" type="__m512i"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m512i"/>
+	<parameter varname="imm8" type="int"/>
+	<description>Rotate the bits in each packed 64-bit integer in "a" to the right by the number of bits specified in "imm8", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). </description>
+	<operation>
+RIGHT_ROTATE_QWORDS(src, count_src){
+	count := count_src modulo 64
+	RETURN (src &gt;&gt; count) OR (src &lt;&lt; (64 - count))
+}
+FOR j := 0 to 7
+	i := j*64
+	IF k[j]
+		dst[i+63:i] := RIGHT_ROTATE_QWORDS(a[i+63:i], imm8[7:0])
+	ELSE
+		dst[i+63:i] := src[i+63:i]
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vprorq' form='zmm {k}, zmm, imm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512i" name="_mm512_maskz_ror_epi64">
+	<type>Integer</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Shift</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m512i"/>
+	<parameter varname="imm8" type="int"/>
+	<description>Rotate the bits in each packed 64-bit integer in "a" to the right by the number of bits specified in "imm8", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). </description>
+	<operation>
+RIGHT_ROTATE_QWORDS(src, count_src){
+	count := count_src modulo 64
+	RETURN (src &gt;&gt; count) OR (src &lt;&lt; (64 - count))
+}
+FOR j := 0 to 7
+	i := j*64
+	IF k[j]
+		dst[i+63:i] := RIGHT_ROTATE_QWORDS(a[i+63:i], imm8[7:0])
+	ELSE
+		dst[i+63:i] := 0
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vprorq' form='zmm {k}, zmm, imm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512i" name="_mm512_ror_epi64">
+	<type>Integer</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Shift</category>
+	<parameter varname="a" type="__m512i"/>
+	<parameter varname="imm8" type="int"/>
+	<description>Rotate the bits in each packed 64-bit integer in "a" to the right by the number of bits specified in "imm8", and store the results in "dst". </description>
+	<operation>
+RIGHT_ROTATE_QWORDS(src, count_src){
+	count := count_src modulo 64
+	RETURN (src &gt;&gt; count) OR (src &lt;&lt; (64 - count))
+}
+FOR j := 0 to 7
+	i := j*64
+	dst[i+63:i] := RIGHT_ROTATE_QWORDS(a[i+63:i], imm8[7:0])
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vprorq' form='zmm {k}, zmm, imm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512i" name="_mm512_mask_rorv_epi32">
+	<type>Integer</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Shift</category>
+	<parameter varname="src" type="__m512i"/>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="a" type="__m512i"/>
+	<parameter varname="b" type="__m512i"/>
+	<description>Rotate the bits in each packed 32-bit integer in "a" to the right by the number of bits specified in the corresponding element of "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). </description>
+	<operation>
+RIGHT_ROTATE_DWORDS(src, count_src){
+	count := count_src modulo 32
+	RETURN (src &gt;&gt;count) OR (src &lt;&lt; (32 - count))
+}
+FOR j := 0 to 15
+	i := j*32
+	IF k[j]
+		dst[i+31:i] := RIGHT_ROTATE_DWORDS(a[i+31:i], b[i+31:i])
+	ELSE
+		dst[i+31:i] := src[i+31:i]
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vprorvd' form='zmm {k}, zmm, zmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512i" name="_mm512_maskz_rorv_epi32">
+	<type>Integer</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Shift</category>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="a" type="__m512i"/>
+	<parameter varname="b" type="__m512i"/>
+	<description>Rotate the bits in each packed 32-bit integer in "a" to the right by the number of bits specified in the corresponding element of "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). </description>
+	<operation>
+RIGHT_ROTATE_DWORDS(src, count_src){
+	count := count_src modulo 32
+	RETURN (src &gt;&gt;count) OR (src &lt;&lt; (32 - count))
+}
+FOR j := 0 to 15
+	i := j*32
+	IF k[j]
+		dst[i+31:i] := RIGHT_ROTATE_DWORDS(a[i+31:i], b[i+31:i])
+	ELSE
+		dst[i+31:i] := 0
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vprorvd' form='zmm {k}, zmm, zmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512i" name="_mm512_rorv_epi32">
+	<type>Integer</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Shift</category>
+	<parameter varname="a" type="__m512i"/>
+	<parameter varname="b" type="__m512i"/>
+	<description>Rotate the bits in each packed 32-bit integer in "a" to the right by the number of bits specified in the corresponding element of "b", and store the results in "dst". </description>
+	<operation>
+RIGHT_ROTATE_DWORDS(src, count_src){
+	count := count_src modulo 32
+	RETURN (src &gt;&gt;count) OR (src &lt;&lt; (32 - count))
+}
+FOR j := 0 to 15
+	i := j*32
+	dst[i+31:i] := RIGHT_ROTATE_DWORDS(a[i+31:i], b[i+31:i])
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vprorvd' form='zmm {k}, zmm, zmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512i" name="_mm512_mask_rorv_epi64">
+	<type>Integer</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Shift</category>
+	<parameter varname="src" type="__m512i"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m512i"/>
+	<parameter varname="b" type="__m512i"/>
+	<description>Rotate the bits in each packed 64-bit integer in "a" to the right by the number of bits specified in the corresponding element of "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). </description>
+	<operation>
+RIGHT_ROTATE_QWORDS(src, count_src){
+	count := count_src modulo 64
+	RETURN (src &gt;&gt; count) OR (src &lt;&lt; (64 - count))
+}
+FOR j := 0 to 7
+	i := j*64
+	IF k[j]
+		dst[i+63:i] := RIGHT_ROTATE_QWORDS(a[i+63:i], b[i+63:i])
+	ELSE
+		dst[i+63:i] := src[i+63:i]
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vprorvq' form='zmm {k}, zmm, zmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512i" name="_mm512_maskz_rorv_epi64">
+	<type>Integer</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Shift</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m512i"/>
+	<parameter varname="b" type="__m512i"/>
+	<description>Rotate the bits in each packed 64-bit integer in "a" to the right by the number of bits specified in the corresponding element of "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). </description>
+	<operation>
+RIGHT_ROTATE_QWORDS(src, count_src){
+	count := count_src modulo 64
+	RETURN (src &gt;&gt; count) OR (src &lt;&lt; (64 - count))
+}
+FOR j := 0 to 7
+	i := j*64
+	IF k[j]
+		dst[i+63:i] := RIGHT_ROTATE_QWORDS(a[i+63:i], b[i+63:i])
+	ELSE
+		dst[i+63:i] := 0
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vprorvq' form='zmm {k}, zmm, zmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512i" name="_mm512_rorv_epi64">
+	<type>Integer</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Shift</category>
+	<parameter varname="a" type="__m512i"/>
+	<parameter varname="b" type="__m512i"/>
+	<description>Rotate the bits in each packed 64-bit integer in "a" to the right by the number of bits specified in the corresponding element of "b", and store the results in "dst". </description>
+	<operation>
+RIGHT_ROTATE_QWORDS(src, count_src){
+	count := count_src modulo 64
+	RETURN (src &gt;&gt; count) OR (src &lt;&lt; (64 - count))
+}
+FOR j := 0 to 7
+	i := j*64
+	dst[i+63:i] := RIGHT_ROTATE_QWORDS(a[i+63:i], b[i+63:i])
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vprorvq' form='zmm {k}, zmm, zmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" rettype="void" name="_mm512_i32scatter_epi32">
+	<type>Integer</type>
+	<CPUID>AVX512F/KNCNI</CPUID>
+	<category>Store</category>
+	<parameter varname="base_addr" type="void*"/>
+	<parameter varname="vindex" type="__m512i"/>
+	<parameter varname="a" type="__m512i"/>
+	<parameter varname="scale" type="int"/>
+	<description>Scatter 32-bit integers from "a" into memory using 32-bit indices. 32-bit elements are stored at addresses starting at "base_addr" and offset by each 32-bit element in "vindex" (each index is scaled by the factor in "scale"). "scale" should be 1, 2, 4 or 8.</description>
+	<operation>
+FOR j := 0 to 15
+	i := j*32
+	MEM[base_addr + SignExtend(vindex[i+31:i])*scale] := a[i+31:i]
+ENDFOR
+	</operation>
+	<instruction name='vpscatterdd' form='vm32z {k}, zmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" rettype="void" name="_mm512_mask_i32scatter_epi32">
+	<type>Integer</type>
+	<CPUID>AVX512F/KNCNI</CPUID>
+	<category>Store</category>
+	<parameter varname="base_addr" type="void*"/>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="vindex" type="__m512i"/>
+	<parameter varname="a" type="__m512i"/>
+	<parameter varname="scale" type="int"/>
+	<description>Scatter 32-bit integers from "a" into memory using 32-bit indices. 32-bit elements are stored at addresses starting at "base_addr" and offset by each 32-bit element in "vindex" (each index is scaled by the factor in "scale") subject to mask "k" (elements are not stored when the corresponding mask bit is not set). "scale" should be 1, 2, 4 or 8.
+	</description>
+	<operation>
+FOR j := 0 to 15
+	i := j*32
+	IF k[j]
+		MEM[base_addr + SignExtend(vindex[i+31:i])*scale] := a[i+31:i]
+		k[j] := 0
+	FI
+ENDFOR
+k[MAX:16] := 0
+	</operation>
+	<instruction name='vpscatterdd' form='vm32z {k}, zmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="void" name="_mm512_i32scatter_epi64">
+	<type>Integer</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Store</category>
+	<parameter varname="base_addr" type="void*"/>
+	<parameter varname="vindex" type="__m256i"/>
+	<parameter varname="a" type="__m512i"/>
+	<parameter varname="scale" type="int"/>
+	<description>Scatter 64-bit integers from "a" into memory using 32-bit indices. 64-bit elements are stored at addresses starting at "base_addr" and offset by each 32-bit element in "vindex" (each index is scaled by the factor in "scale"). "scale" should be 1, 2, 4 or 8.
+	</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*64
+	l := j*32
+	MEM[base_addr + SignExtend(vindex[l+31:l])*scale] := a[i+63:i]
+ENDFOR
+	</operation>
+	<instruction name='vpscatterdq' form='vz32y {k}, zmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="void" name="_mm512_mask_i32scatter_epi64">
+	<type>Integer</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Store</category>
+	<parameter varname="base_addr" type="void*"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="vindex" type="__m256i"/>
+	<parameter varname="a" type="__m512i"/>
+	<parameter varname="scale" type="int"/>
+	<description>Scatter 64-bit integers from "a" into memory using 32-bit indices. 64-bit elements are stored at addresses starting at "base_addr" and offset by each 32-bit element in "vindex" (each index is scaled by the factor in "scale") subject to mask "k" (elements are not stored when the corresponding mask bit is not set). "scale" should be 1, 2, 4 or 8.
+	</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*64
+	l := j*32
+	IF k[j]
+		MEM[base_addr + SignExtend(vindex[l+31:l])*scale] := a[i+63:i]
+		k[j] := 0
+	FI
+ENDFOR
+k[MAX:8] := 0
+	</operation>
+	<instruction name='vpscatterdq' form='vz32y {k}, zmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="void" name="_mm512_i64scatter_epi32">
+	<type>Integer</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Store</category>
+	<parameter varname="base_addr" type="void*"/>
+	<parameter varname="vindex" type="__m512i"/>
+	<parameter varname="a" type="__m256i"/>
+	<parameter varname="scale" type="int"/>
+	<description>Scatter 32-bit integers from "a" into memory using 64-bit indices. 32-bit elements are stored at addresses starting at "base_addr" and offset by each 64-bit element in "vindex" (each index is scaled by the factor in "scale"). "scale" should be 1, 2, 4 or 8.
+	</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*32
+	l := j*64
+	MEM[base_addr + SignExtend(vindex[l+63:l])*scale] := a[i+31:i]
+ENDFOR
+	</operation>
+	<instruction name='vpscatterqd' form='vm64z {k}, ymm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="void" name="_mm512_mask_i64scatter_epi32">
+	<type>Integer</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Store</category>
+	<parameter varname="base_addr" type="void*"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="vindex" type="__m512i"/>
+	<parameter varname="a" type="__m256i"/>
+	<parameter varname="scale" type="int"/>
+	<description>Scatter 32-bit integers from "a" into memory using 64-bit indices. 32-bit elements are stored at addresses starting at "base_addr" and offset by each 64-bit element in "vindex" (each index is scaled by the factor in "scale") subject to mask "k" (elements are not stored when the corresponding mask bit is not set). "scale" should be 1, 2, 4 or 8.
+	</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*32
+	l := j*64
+	IF k[j]
+		MEM[base_addr + SignExtend(vindex[l+63:l])*scale] := a[i+31:i]
+		k[j] := 0
+	FI
+ENDFOR
+k[MAX:8] := 0
+	</operation>
+	<instruction name='vpscatterqd' form='vm64z {k}, ymm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="void" name="_mm512_i64scatter_epi64">
+	<type>Integer</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Store</category>
+	<parameter varname="base_addr" type="void*"/>
+	<parameter varname="vindex" type="__m512i"/>
+	<parameter varname="a" type="__m512i"/>
+	<parameter varname="scale" type="int"/>
+	<description>Scatter 64-bit integers from "a" into memory using 64-bit indices. 64-bit elements are stored at addresses starting at "base_addr" and offset by each 64-bit element in "vindex" (each index is scaled by the factor in "scale"). "scale" should be 1, 2, 4 or 8.
+	</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*64
+	MEM[base_addr + SignExtend(vindex[i+63:i])*scale] := a[i+63:i]
+ENDFOR
+	</operation>
+	<instruction name='vpscatterqq' form='vm64z {k}, zmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="void" name="_mm512_mask_i64scatter_epi64">
+	<type>Integer</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Store</category>
+	<parameter varname="base_addr" type="void*"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="vindex" type="__m512i"/>
+	<parameter varname="a" type="__m512i"/>
+	<parameter varname="scale" type="int"/>
+	<description>Scatter 64-bit integers from "a" into memory using 64-bit indices. 64-bit elements are stored at addresses starting at "base_addr" and offset by each 64-bit element in "vindex" (each index is scaled by the factor in "scale") subject to mask "k" (elements are not stored when the corresponding mask bit is not set). "scale" should be 1, 2, 4 or 8.
+	</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*64
+	IF k[j]
+		MEM[base_addr + SignExtend(vindex[i+63:i])*scale] := a[i+63:i]
+		k[j] := 0
+	FI
+ENDFOR
+k[MAX:8] := 0
+	</operation>
+	<instruction name='vpscatterqq' form='vm64z {k}, zmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" rettype="__m512i" name="_mm512_mask_shuffle_epi32">
+	<type>Integer</type>
+	<CPUID>AVX512F/KNCNI</CPUID>
+	<category>Swizzle</category>
+	<parameter varname="src" type="__m512i"/>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="a" type="__m512i"/>
+	<parameter varname="imm8" type="_MM_PERM_ENUM"/>
+	<description>Shuffle 32-bit integers in "a" within 128-bit lanes using the control in "imm8", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+	<operation>
+SELECT4(src, control){
+	CASE(control[1:0])
+	0:	tmp[31:0] := src[31:0]
+	1:	tmp[31:0] := src[63:32]
+	2:	tmp[31:0] := src[95:64]
+	3:	tmp[31:0] := src[127:96]
+	ESAC
+	RETURN tmp[31:0]
+}
+
+tmp_dst[31:0] := SELECT4(a[127:0], imm8[1:0])
+tmp_dst[63:32] := SELECT4(a[127:0], imm8[3:2])
+tmp_dst[95:64] := SELECT4(a[127:0], imm8[5:4])
+tmp_dst[127:96] := SELECT4(a[127:0], imm8[7:6])
+tmp_dst[159:128] := SELECT4(a[255:128], imm8[1:0])
+tmp_dst[191:160] := SELECT4(a[255:128], imm8[3:2])
+tmp_dst[223:192] := SELECT4(a[255:128], imm8[5:4])
+tmp_dst[255:224] := SELECT4(a[255:128], imm8[7:6])
+tmp_dst[287:256] := SELECT4(a[383:256], imm8[1:0])
+tmp_dst[319:288] := SELECT4(a[383:256], imm8[3:2])
+tmp_dst[351:320] := SELECT4(a[383:256], imm8[5:4])
+tmp_dst[383:352] := SELECT4(a[383:256], imm8[7:6])
+tmp_dst[415:384] := SELECT4(a[511:384], imm8[1:0])
+tmp_dst[447:416] := SELECT4(a[511:384], imm8[3:2])
+tmp_dst[479:448] := SELECT4(a[511:384], imm8[5:4])
+tmp_dst[511:480] := SELECT4(a[511:384], imm8[7:6])
+FOR j := 0 to 15
+	i := j*32
+	IF k[j]
+		dst[i+31:i] := tmp_dst[i+31:i]
+	ELSE
+		dst[i+31:i] := src[i+31:i]
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vpshufd' form='zmm {k}, zmm, imm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512i" name="_mm512_maskz_shuffle_epi32">
+	<type>Integer</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Swizzle</category>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="a" type="__m512i"/>
+	<parameter varname="imm8" type="_MM_PERM_ENUM"/>
+	<description>Shuffle 32-bit integers in "a" within 128-bit lanes using the control in "imm8", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+	<operation>
+SELECT4(src, control){
+	CASE(control[1:0])
+	0:	tmp[31:0] := src[31:0]
+	1:	tmp[31:0] := src[63:32]
+	2:	tmp[31:0] := src[95:64]
+	3:	tmp[31:0] := src[127:96]
+	ESAC
+	RETURN tmp[31:0]
+}
+
+tmp_dst[31:0] := SELECT4(a[127:0], imm8[1:0])
+tmp_dst[63:32] := SELECT4(a[127:0], imm8[3:2])
+tmp_dst[95:64] := SELECT4(a[127:0], imm8[5:4])
+tmp_dst[127:96] := SELECT4(a[127:0], imm8[7:6])
+tmp_dst[159:128] := SELECT4(a[255:128], imm8[1:0])
+tmp_dst[191:160] := SELECT4(a[255:128], imm8[3:2])
+tmp_dst[223:192] := SELECT4(a[255:128], imm8[5:4])
+tmp_dst[255:224] := SELECT4(a[255:128], imm8[7:6])
+tmp_dst[287:256] := SELECT4(a[383:256], imm8[1:0])
+tmp_dst[319:288] := SELECT4(a[383:256], imm8[3:2])
+tmp_dst[351:320] := SELECT4(a[383:256], imm8[5:4])
+tmp_dst[383:352] := SELECT4(a[383:256], imm8[7:6])
+tmp_dst[415:384] := SELECT4(a[511:384], imm8[1:0])
+tmp_dst[447:416] := SELECT4(a[511:384], imm8[3:2])
+tmp_dst[479:448] := SELECT4(a[511:384], imm8[5:4])
+tmp_dst[511:480] := SELECT4(a[511:384], imm8[7:6])
+FOR j := 0 to 15
+	i := j*32
+	IF k[j]
+		dst[i+31:i] := tmp_dst[i+31:i]
+	ELSE
+		dst[i+31:i] := 0
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vpshufd' form='zmm {k}, zmm, imm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" rettype="__m512i" name="_mm512_shuffle_epi32">
+	<type>Integer</type>
+	<CPUID>AVX512F/KNCNI</CPUID>
+	<category>Swizzle</category>
+	<parameter varname="a" type="__m512i"/>
+	<parameter varname="imm8" type="_MM_PERM_ENUM"/>
+	<description>Shuffle 32-bit integers in "a" within 128-bit lanes using the control in "imm8", and store the results in "dst".</description>
+	<operation>
+SELECT4(src, control){
+	CASE(control[1:0])
+	0:	tmp[31:0] := src[31:0]
+	1:	tmp[31:0] := src[63:32]
+	2:	tmp[31:0] := src[95:64]
+	3:	tmp[31:0] := src[127:96]
+	ESAC
+	RETURN tmp[31:0]
+}
+
+dst[31:0] := SELECT4(a[127:0], imm8[1:0])
+dst[63:32] := SELECT4(a[127:0], imm8[3:2])
+dst[95:64] := SELECT4(a[127:0], imm8[5:4])
+dst[127:96] := SELECT4(a[127:0], imm8[7:6])
+dst[159:128] := SELECT4(a[255:128], imm8[1:0])
+dst[191:160] := SELECT4(a[255:128], imm8[3:2])
+dst[223:192] := SELECT4(a[255:128], imm8[5:4])
+dst[255:224] := SELECT4(a[255:128], imm8[7:6])
+dst[287:256] := SELECT4(a[383:256], imm8[1:0])
+dst[319:288] := SELECT4(a[383:256], imm8[3:2])
+dst[351:320] := SELECT4(a[383:256], imm8[5:4])
+dst[383:352] := SELECT4(a[383:256], imm8[7:6])
+dst[415:384] := SELECT4(a[511:384], imm8[1:0])
+dst[447:416] := SELECT4(a[511:384], imm8[3:2])
+dst[479:448] := SELECT4(a[511:384], imm8[5:4])
+dst[511:480] := SELECT4(a[511:384], imm8[7:6])
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vpshufd' form='zmm {k}, zmm, imm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512i" name="_mm512_mask_sll_epi32">
+	<type>Integer</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Shift</category>
+	<parameter varname="src" type="__m512i"/>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="a" type="__m512i"/>
+	<parameter varname="count" type="__m128i"/>
+	<description>Shift packed 32-bit integers in "a" left by "count" while shifting in zeros, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 15
+	i := j*32
+	IF k[j]
+		IF count[63:0] &gt; 31
+			dst[i+31:i] := 0
+		ELSE
+			dst[i+31:i] := ZeroExtend(a[i+31:i] &lt;&lt; count[63:0])
+		FI
+	ELSE
+		dst[i+31:i] := src[i+31:i]
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vpslld' form='zmm {k}, zmm, xmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" rettype="__m512i" name="_mm512_mask_slli_epi32">
+	<type>Integer</type>
+	<CPUID>AVX512F/KNCNI</CPUID>
+	<category>Shift</category>
+	<parameter varname="src" type="__m512i"/>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="a" type="__m512i"/>
+	<parameter varname="imm8" type="unsigned int"/>
+	<description>Shift packed 32-bit integers in "a" left by "imm8" while shifting in zeros, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 15
+	i := j*32
+	IF k[j]
+		IF imm8[7:0] &gt; 31
+			dst[i+31:i] := 0
+		ELSE
+			dst[i+31:i] := ZeroExtend(a[i+31:i] &lt;&lt; imm8[7:0])
+		FI
+	ELSE
+		dst[i+31:i] := src[i+31:i]
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vpslld' form='zmm {k}, zmm, imm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512i" name="_mm512_maskz_sll_epi32">
+	<type>Integer</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Shift</category>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="a" type="__m512i"/>
+	<parameter varname="count" type="__m128i"/>
+	<description>Shift packed 32-bit integers in "a" left by "count" while shifting in zeros, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 15
+	i := j*32
+	IF k[j]
+		IF count[63:0] &gt; 31
+			dst[i+31:i] := 0
+		ELSE
+			dst[i+31:i] := ZeroExtend(a[i+31:i] &lt;&lt; count[63:0])
+		FI
+	ELSE
+		dst[i+31:i] := 0
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vpslld' form='zmm {k}, zmm, xmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512i" name="_mm512_maskz_slli_epi32">
+	<type>Integer</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Shift</category>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="a" type="__m512i"/>
+	<parameter varname="imm8" type="unsigned int"/>
+	<description>Shift packed 32-bit integers in "a" left by "imm8" while shifting in zeros, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 15
+	i := j*32
+	IF k[j]
+		IF imm8[7:0] &gt; 31
+			dst[i+31:i] := 0
+		ELSE
+			dst[i+31:i] := ZeroExtend(a[i+31:i] &lt;&lt; imm8[7:0])
+		FI
+	ELSE
+		dst[i+31:i] := 0
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vpslld' form='zmm {k}, zmm, imm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512i" name="_mm512_sll_epi32">
+	<type>Integer</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Shift</category>
+	<parameter varname="a" type="__m512i"/>
+	<parameter varname="count" type="__m128i"/>
+	<description>Shift packed 32-bit integers in "a" left by "count" while shifting in zeros, and store the results in "dst". </description>
+	<operation>
+FOR j := 0 to 15
+	i := j*32
+	IF count[63:0] &gt; 31
+		dst[i+31:i] := 0
+	ELSE
+		dst[i+31:i] := ZeroExtend(a[i+31:i] &lt;&lt; count[63:0])
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vpslld' form='zmm {k}, zmm, xmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" rettype="__m512i" name="_mm512_slli_epi32">
+	<type>Integer</type>
+	<CPUID>AVX512F/KNCNI</CPUID>
+	<category>Shift</category>
+	<parameter varname="a" type="__m512i"/>
+	<parameter varname="imm8" type="unsigned int"/>
+	<description>Shift packed 32-bit integers in "a" left by "imm8" while shifting in zeros, and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 15
+	i := j*32
+	IF imm8[7:0] &gt; 31
+		dst[i+31:i] := 0
+	ELSE
+		dst[i+31:i] := ZeroExtend(a[i+31:i] &lt;&lt; imm8[7:0])
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vpslld' form='zmm {k}, zmm, imm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512i" name="_mm512_mask_sll_epi64">
+	<type>Integer</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Shift</category>
+	<parameter varname="src" type="__m512i"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m512i"/>
+	<parameter varname="count" type="__m128i"/>
+	<description>Shift packed 64-bit integers in "a" left by "count" while shifting in zeros, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 7
+	i := j*64
+	IF k[j]
+		IF count[63:0] &gt; 63
+			dst[i+63:i] := 0
+		ELSE
+			dst[i+63:i] := ZeroExtend(a[i+63:i] &lt;&lt; count[63:0])
+		FI
+	ELSE
+		dst[i+63:i] := src[i+63:i]
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vpsllq' form='zmm {k}, zmm, xmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512i" name="_mm512_mask_slli_epi64">
+	<type>Integer</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Shift</category>
+	<parameter varname="src" type="__m512i"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m512i"/>
+	<parameter varname="imm8" type="unsigned int"/>
+	<description>Shift packed 64-bit integers in "a" left by "imm8" while shifting in zeros, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 7
+	i := j*64
+	IF k[j]
+		IF imm8[7:0] &gt; 63
+			dst[i+63:i] := 0
+		ELSE
+			dst[i+63:i] := ZeroExtend(a[i+63:i] &lt;&lt; imm8[7:0])
+		FI
+	ELSE
+		dst[i+63:i] := src[i+63:i]
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vpsllq' form='zmm {k}, zmm, imm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512i" name="_mm512_maskz_sll_epi64">
+	<type>Integer</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Shift</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m512i"/>
+	<parameter varname="count" type="__m128i"/>
+	<description>Shift packed 64-bit integers in "a" left by "count" while shifting in zeros, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 7
+	i := j*64
+	IF k[j]
+		IF count[63:0] &gt; 63
+			dst[i+63:i] := 0
+		ELSE
+			dst[i+63:i] := ZeroExtend(a[i+63:i] &lt;&lt; count[63:0])
+		FI
+	ELSE
+		dst[i+63:i] := 0
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vpsllq' form='zmm {k}, zmm, xmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512i" name="_mm512_maskz_slli_epi64">
+	<type>Integer</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Shift</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m512i"/>
+	<parameter varname="imm8" type="unsigned int"/>
+	<description>Shift packed 64-bit integers in "a" left by "imm8" while shifting in zeros, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 7
+	i := j*64
+	IF k[j]
+		IF imm8[7:0] &gt; 63
+			dst[i+63:i] := 0
+		ELSE
+			dst[i+63:i] := ZeroExtend(a[i+63:i] &lt;&lt; imm8[7:0])
+		FI
+	ELSE
+		dst[i+63:i] := 0
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vpsllq' form='zmm {k}, zmm, imm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512i" name="_mm512_sll_epi64">
+	<type>Integer</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Shift</category>
+	<parameter varname="a" type="__m512i"/>
+	<parameter varname="count" type="__m128i"/>
+	<description>Shift packed 64-bit integers in "a" left by "count" while shifting in zeros, and store the results in "dst". </description>
+	<operation>
+FOR j := 0 to 7
+	i := j*64
+	IF count[63:0] &gt; 63
+		dst[i+63:i] := 0
+	ELSE
+		dst[i+63:i] := ZeroExtend(a[i+63:i] &lt;&lt; count[63:0])
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vpsllq' form='zmm {k}, zmm, xmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512i" name="_mm512_slli_epi64">
+	<type>Integer</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Shift</category>
+	<parameter varname="a" type="__m512i"/>
+	<parameter varname="imm8" type="unsigned int"/>
+	<description>Shift packed 64-bit integers in "a" left by "imm8" while shifting in zeros, and store the results in "dst". </description>
+	<operation>
+FOR j := 0 to 7
+	i := j*64
+	IF imm8[7:0] &gt; 63
+		dst[i+63:i] := 0
+	ELSE
+		dst[i+63:i] := ZeroExtend(a[i+63:i] &lt;&lt; imm8[7:0])
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vpsllq' form='zmm {k}, zmm, imm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" rettype="__m512i" name="_mm512_mask_sllv_epi32">
+	<type>Integer</type>
+	<CPUID>AVX512F/KNCNI</CPUID>
+	<category>Shift</category>
+	<parameter varname="src" type="__m512i"/>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="a" type="__m512i"/>
+	<parameter varname="count" type="__m512i"/>
+	<description>Shift packed 32-bit integers in "a" left by the amount specified by the corresponding element in "count" while shifting in zeros, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 15
+	i := j*32
+	IF k[j]
+		dst[i+31:i] := ZeroExtend(a[i+31:i] &lt;&lt; count[i+31:i])
+	ELSE
+		dst[i+31:i] := src[i+31:i]
+	FI	
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vpsllvd' form='zmm {k}, zmm, zmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512i" name="_mm512_maskz_sllv_epi32">
+	<type>Integer</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Shift</category>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="a" type="__m512i"/>
+	<parameter varname="count" type="__m512i"/>
+	<description>Shift packed 32-bit integers in "a" left by the amount specified by the corresponding element in "count" while shifting in zeros, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 15
+	i := j*32
+	IF k[j]
+		dst[i+31:i] := ZeroExtend(a[i+31:i] &lt;&lt; count[i+31:i])
+	ELSE
+		dst[i+31:i] := 0
+	FI	
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vpsllvd' form='zmm {k}, zmm, zmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" rettype="__m512i" name="_mm512_sllv_epi32">
+	<type>Integer</type>
+	<CPUID>AVX512F/KNCNI</CPUID>
+	<category>Shift</category>
+	<parameter varname="a" type="__m512i"/>
+	<parameter varname="count" type="__m512i"/>
+	<description>Shift packed 32-bit integers in "a" left by the amount specified by the corresponding element in "count" while shifting in zeros, and store the results in "dst". </description>
+	<operation>
+FOR j := 0 to 15
+	i := j*32
+	dst[i+31:i] := ZeroExtend(a[i+31:i] &lt;&lt; count[i+31:i])
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vpsllvd' form='zmm {k}, zmm, zmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512i" name="_mm512_mask_sllv_epi64">
+	<type>Integer</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Shift</category>
+	<parameter varname="src" type="__m512i"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m512i"/>
+	<parameter varname="count" type="__m512i"/>
+	<description>Shift packed 64-bit integers in "a" left by the amount specified by the corresponding element in "count" while shifting in zeros, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 7
+	i := j*64
+	IF k[j]
+		dst[i+63:i] := ZeroExtend(a[i+63:i] &lt;&lt; count[i+63:i])
+	ELSE
+		dst[i+63:i] := src[i+63:i]
+	FI	
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vpsllvq' form='zmm {k}, zmm, zmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512i" name="_mm512_maskz_sllv_epi64">
+	<type>Integer</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Shift</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m512i"/>
+	<parameter varname="count" type="__m512i"/>
+	<description>Shift packed 64-bit integers in "a" left by the amount specified by the corresponding element in "count" while shifting in zeros, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 7
+	i := j*64
+	IF k[j]
+		dst[i+63:i] := ZeroExtend(a[i+63:i] &lt;&lt; count[i+63:i])
+	ELSE
+		dst[i+63:i] := 0
+	FI	
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vpsllvq' form='zmm {k}, zmm, zmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512i" name="_mm512_sllv_epi64">
+	<type>Integer</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Shift</category>
+	<parameter varname="a" type="__m512i"/>
+	<parameter varname="count" type="__m512i"/>
+	<description>Shift packed 64-bit integers in "a" left by the amount specified by the corresponding element in "count" while shifting in zeros, and store the results in "dst". </description>
+	<operation>
+FOR j := 0 to 7
+	i := j*64
+	dst[i+63:i] := ZeroExtend(a[i+63:i] &lt;&lt; count[i+63:i])
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vpsllvq' form='zmm {k}, zmm, zmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512i" name="_mm512_mask_sra_epi32">
+	<type>Integer</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Shift</category>
+	<parameter varname="src" type="__m512i"/>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="a" type="__m512i"/>
+	<parameter varname="count" type="__m128i"/>
+	<description>Shift packed 32-bit integers in "a" right by "count" while shifting in sign bits, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 15
+	i := j*32
+	IF k[j]
+		IF count[63:0] &gt; 31
+			dst[i+31:i] := SignBit
+		ELSE
+			dst[i+31:i] := SignExtend(a[i+31:i] &gt;&gt; count[63:0])
+		FI
+	ELSE
+		dst[i+31:i] := src[i+31:i]
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vpsrad' form='zmm {k}, zmm, xmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" rettype="__m512i" name="_mm512_mask_srai_epi32">
+	<type>Integer</type>
+	<CPUID>AVX512F/KNCNI</CPUID>
+	<category>Shift</category>
+	<parameter varname="src" type="__m512i"/>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="a" type="__m512i"/>
+	<parameter varname="imm8" type="unsigned int"/>
+	<description>Shift packed 32-bit integers in "a" right by "imm8" while shifting in sign bits, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 15
+	i := j*32
+	IF k[j]
+		IF imm8[7:0] &gt; 31
+			dst[i+31:i] := SignBit
+		ELSE
+			dst[i+31:i] := SignExtend(a[i+31:i] &gt;&gt; imm8[7:0])
+		FI
+	ELSE
+		dst[i+31:i] := src[i+31:i]
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vpsrad' form='zmm {k}, zmm, imm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512i" name="_mm512_maskz_sra_epi32">
+	<type>Integer</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Shift</category>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="a" type="__m512i"/>
+	<parameter varname="count" type="__m128i"/>
+	<description>Shift packed 32-bit integers in "a" right by "count" while shifting in sign bits, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 15
+	i := j*32
+	IF k[j]
+		IF count[63:0] &gt; 31
+			dst[i+31:i] := SignBit
+		ELSE
+			dst[i+31:i] := SignExtend(a[i+31:i] &gt;&gt; count[63:0])
+		FI
+	ELSE
+		dst[i+31:i] := 0
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vpsrad' form='zmm {k}, zmm, xmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512i" name="_mm512_maskz_srai_epi32">
+	<type>Integer</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Shift</category>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="a" type="__m512i"/>
+	<parameter varname="imm8" type="unsigned int"/>
+	<description>Shift packed 32-bit integers in "a" right by "imm8" while shifting in sign bits, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 15
+	i := j*32
+	IF k[j]
+		IF imm8[7:0] &gt; 31
+			dst[i+31:i] := SignBit
+		ELSE
+			dst[i+31:i] := SignExtend(a[i+31:i] &gt;&gt; imm8[7:0])
+		FI
+	ELSE
+		dst[i+31:i] := 0
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vpsrad' form='zmm {k}, zmm, imm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512i" name="_mm512_sra_epi32">
+	<type>Integer</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Shift</category>
+	<parameter varname="a" type="__m512i"/>
+	<parameter varname="count" type="__m128i"/>
+	<description>Shift packed 32-bit integers in "a" right by "count" while shifting in sign bits, and store the results in "dst". </description>
+	<operation>
+FOR j := 0 to 15
+	i := j*32
+	IF count[63:0] &gt; 31
+		dst[i+31:i] := SignBit
+	ELSE
+		dst[i+31:i] := SignExtend(a[i+31:i] &gt;&gt; count[63:0])
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vpsrad' form='zmm {k}, zmm, xmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" rettype="__m512i" name="_mm512_srai_epi32">
+	<type>Integer</type>
+	<CPUID>AVX512F/KNCNI</CPUID>
+	<category>Shift</category>
+	<parameter varname="a" type="__m512i"/>
+	<parameter varname="imm8" type="unsigned int"/>
+	<description>Shift packed 32-bit integers in "a" right by "imm8" while shifting in sign bits, and store the results in "dst". </description>
+	<operation>
+FOR j := 0 to 15
+	i := j*32
+	IF imm8[7:0] &gt; 31
+		dst[i+31:i] := SignBit
+	ELSE
+		dst[i+31:i] := SignExtend(a[i+31:i] &gt;&gt; imm8[7:0])
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vpsrad' form='zmm {k}, zmm, imm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512i" name="_mm512_mask_sra_epi64">
+	<type>Integer</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Shift</category>
+	<parameter varname="src" type="__m512i"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m512i"/>
+	<parameter varname="count" type="__m128i"/>
+	<description>Shift packed 64-bit integers in "a" right by "count" while shifting in sign bits, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 7
+	i := j*64
+	IF k[j]
+		IF count[63:0] &gt; 63
+			dst[i+63:i] := SignBit
+		ELSE
+			dst[i+63:i] := SignExtend(a[i+63:i] &gt;&gt; count[63:0])
+		FI
+	ELSE
+		dst[i+63:i] := src[i+63:i]
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vpsraq' form='zmm {k}, zmm, xmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512i" name="_mm512_mask_srai_epi64">
+	<type>Integer</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Shift</category>
+	<parameter varname="src" type="__m512i"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m512i"/>
+	<parameter varname="imm8" type="unsigned int"/>
+	<description>Shift packed 64-bit integers in "a" right by "imm8" while shifting in sign bits, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 7
+	i := j*64
+	IF k[j]
+		IF imm8[7:0] &gt; 63
+			dst[i+63:i] := SignBit
+		ELSE
+			dst[i+63:i] := SignExtend(a[i+63:i] &gt;&gt; imm8[7:0])
+		FI
+	ELSE
+		dst[i+63:i] := src[i+63:i]
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vpsraq' form='zmm {k}, zmm, imm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512i" name="_mm512_maskz_sra_epi64">
+	<type>Integer</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Shift</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m512i"/>
+	<parameter varname="count" type="__m128i"/>
+	<description>Shift packed 64-bit integers in "a" right by "count" while shifting in sign bits, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 7
+	i := j*64
+	IF k[j]
+		IF count[63:0] &gt; 63
+			dst[i+63:i] := SignBit
+		ELSE
+			dst[i+63:i] := SignExtend(a[i+63:i] &gt;&gt; count[63:0])
+		FI
+	ELSE
+		dst[i+63:i] := 0
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vpsraq' form='zmm {k}, zmm, xmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512i" name="_mm512_maskz_srai_epi64">
+	<type>Integer</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Shift</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m512i"/>
+	<parameter varname="imm8" type="unsigned int"/>
+	<description>Shift packed 64-bit integers in "a" right by "imm8" while shifting in sign bits, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 7
+	i := j*64
+	IF k[j]
+		IF imm8[7:0] &gt; 63
+			dst[i+63:i] := SignBit
+		ELSE
+			dst[i+63:i] := SignExtend(a[i+63:i] &gt;&gt; imm8[7:0])
+		FI
+	ELSE
+		dst[i+63:i] := 0
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vpsraq' form='zmm {k}, zmm, imm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512i" name="_mm512_sra_epi64">
+	<type>Integer</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Shift</category>
+	<parameter varname="a" type="__m512i"/>
+	<parameter varname="count" type="__m128i"/>
+	<description>Shift packed 64-bit integers in "a" right by "count" while shifting in sign bits, and store the results in "dst". </description>
+	<operation>
+FOR j := 0 to 7
+	i := j*64
+	IF count[63:0] &gt; 63
+		dst[i+63:i] := SignBit
+	ELSE
+		dst[i+63:i] := SignExtend(a[i+63:i] &gt;&gt; count[63:0])
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vpsraq' form='zmm {k}, zmm, xmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512i" name="_mm512_srai_epi64">
+	<type>Integer</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Shift</category>
+	<parameter varname="a" type="__m512i"/>
+	<parameter varname="imm8" type="unsigned int"/>
+	<description>Shift packed 64-bit integers in "a" right by "imm8" while shifting in sign bits, and store the results in "dst". </description>
+	<operation>
+FOR j := 0 to 7
+	i := j*64
+	IF imm8[7:0] &gt; 63
+		dst[i+63:i] := SignBit
+	ELSE
+		dst[i+63:i] := SignExtend(a[i+63:i] &gt;&gt; imm8[7:0])
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vpsraq' form='zmm {k}, zmm, imm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" rettype="__m512i" name="_mm512_mask_srav_epi32">
+	<type>Integer</type>
+	<CPUID>AVX512F/KNCNI</CPUID>
+	<category>Shift</category>
+	<parameter varname="src" type="__m512i"/>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="a" type="__m512i"/>
+	<parameter varname="count" type="__m512i"/>
+	<description>Shift packed 32-bit integers in "a" right by the amount specified by the corresponding element in "count" while shifting in sign bits, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 15
+	i := j*32
+	IF k[j]
+		dst[i+31:i] := SignExtend(a[i+31:i] &gt;&gt; count[i+31:i])
+	ELSE
+		dst[i+31:i] := src[i+31:i]
+	FI	
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vpsravd' form='zmm {k}, zmm, zmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512i" name="_mm512_maskz_srav_epi32">
+	<type>Integer</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Shift</category>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="a" type="__m512i"/>
+	<parameter varname="count" type="__m512i"/>
+	<description>Shift packed 32-bit integers in "a" right by the amount specified by the corresponding element in "count" while shifting in sign bits, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 15
+	i := j*32
+	IF k[j]
+		dst[i+31:i] := SignExtend(a[i+31:i] &gt;&gt; count[i+31:i])
+	ELSE
+		dst[i+31:i] := 0
+	FI	
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vpsravd' form='zmm {k}, zmm, zmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" rettype="__m512i" name="_mm512_srav_epi32">
+	<type>Integer</type>
+	<CPUID>AVX512F/KNCNI</CPUID>
+	<category>Shift</category>
+	<parameter varname="a" type="__m512i"/>
+	<parameter varname="count" type="__m512i"/>
+	<description>Shift packed 32-bit integers in "a" right by the amount specified by the corresponding element in "count" while shifting in sign bits, and store the results in "dst". </description>
+	<operation>
+FOR j := 0 to 15
+	i := j*32
+	dst[i+31:i] := SignExtend(a[i+31:i] &gt;&gt; count[i+31:i])
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vpsravd' form='zmm {k}, zmm, zmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512i" name="_mm512_mask_srav_epi64">
+	<type>Integer</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Shift</category>
+	<parameter varname="src" type="__m512i"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m512i"/>
+	<parameter varname="count" type="__m512i"/>
+	<description>Shift packed 64-bit integers in "a" right by the amount specified by the corresponding element in "count" while shifting in sign bits, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 7
+	i := j*64
+	IF k[j]
+		dst[i+63:i] := SignExtend(a[i+63:i] &gt;&gt; count[i+63:i])
+	ELSE
+		dst[i+63:i] := src[i+63:i]
+	FI	
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vpsravq' form='zmm {k}, zmm, zmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512i" name="_mm512_maskz_srav_epi64">
+	<type>Integer</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Shift</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m512i"/>
+	<parameter varname="count" type="__m512i"/>
+	<description>Shift packed 64-bit integers in "a" right by the amount specified by the corresponding element in "count" while shifting in sign bits, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 7
+	i := j*64
+	IF k[j]
+		dst[i+63:i] := SignExtend(a[i+63:i] &gt;&gt; count[i+63:i])
+	ELSE
+		dst[i+63:i] := 0
+	FI	
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vpsravq' form='zmm {k}, zmm, zmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512i" name="_mm512_srav_epi64">
+	<type>Integer</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Shift</category>
+	<parameter varname="a" type="__m512i"/>
+	<parameter varname="count" type="__m512i"/>
+	<description>Shift packed 64-bit integers in "a" right by the amount specified by the corresponding element in "count" while shifting in sign bits, and store the results in "dst". </description>
+	<operation>
+FOR j := 0 to 7
+	i := j*64
+	dst[i+63:i] := SignExtend(a[i+63:i] &gt;&gt; count[i+63:i])
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vpsravq' form='zmm {k}, zmm, zmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512i" name="_mm512_mask_srl_epi32">
+	<type>Integer</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Shift</category>
+	<parameter varname="src" type="__m512i"/>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="a" type="__m512i"/>
+	<parameter varname="count" type="__m128i"/>
+	<description>Shift packed 32-bit integers in "a" right by "count" while shifting in zeros, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 15
+	i := j*32
+	IF k[j]
+		IF count[63:0] &gt; 31
+			dst[i+31:i] := 0
+		ELSE
+			dst[i+31:i] := ZeroExtend(a[i+31:i] &gt;&gt; count[63:0])
+		FI
+	ELSE
+		dst[i+31:i] := src[i+31:i]
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vpsrld' form='zmm {k}, zmm, xmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" rettype="__m512i" name="_mm512_mask_srli_epi32">
+	<type>Integer</type>
+	<CPUID>AVX512F/KNCNI</CPUID>
+	<category>Shift</category>
+	<parameter varname="src" type="__m512i"/>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="a" type="__m512i"/>
+	<parameter varname="imm8" type="unsigned int"/>
+	<description>Shift packed 32-bit integers in "a" right by "imm8" while shifting in zeros, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 15
+	i := j*32
+	IF k[j]
+		IF imm8[7:0] &gt; 31
+			dst[i+31:i] := 0
+		ELSE
+			dst[i+31:i] := ZeroExtend(a[i+31:i] &gt;&gt; imm8[7:0])
+		FI
+	ELSE
+		dst[i+31:i] := src[i+31:i]
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vpsrld' form='zmm {k}, zmm, imm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512i" name="_mm512_maskz_srl_epi32">
+	<type>Integer</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Shift</category>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="a" type="__m512i"/>
+	<parameter varname="count" type="__m128i"/>
+	<description>Shift packed 32-bit integers in "a" right by "count" while shifting in zeros, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 15
+	i := j*32
+	IF k[j]
+		IF count[63:0] &gt; 31
+			dst[i+31:i] := 0
+		ELSE
+			dst[i+31:i] := ZeroExtend(a[i+31:i] &gt;&gt; count[63:0])
+		FI
+	ELSE
+		dst[i+31:i] := 0
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vpsrld' form='zmm {k}, zmm, xmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512i" name="_mm512_maskz_srli_epi32">
+	<type>Integer</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Shift</category>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="a" type="__m512i"/>
+	<parameter varname="imm8" type="unsigned int"/>
+	<description>Shift packed 32-bit integers in "a" right by "imm8" while shifting in zeros, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 15
+	i := j*32
+	IF k[j]
+		IF imm8[7:0] &gt; 31
+			dst[i+31:i] := 0
+		ELSE
+			dst[i+31:i] := ZeroExtend(a[i+31:i] &gt;&gt; imm8[7:0])
+		FI
+	ELSE
+		dst[i+31:i] := 0
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vpsrld' form='zmm {k}, zmm, imm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512i" name="_mm512_srl_epi32">
+	<type>Integer</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Shift</category>
+	<parameter varname="a" type="__m512i"/>
+	<parameter varname="count" type="__m128i"/>
+	<description>Shift packed 32-bit integers in "a" right by "count" while shifting in zeros, and store the results in "dst". </description>
+	<operation>
+FOR j := 0 to 15
+	i := j*32
+	IF count[63:0] &gt; 31
+		dst[i+31:i] := 0
+	ELSE
+		dst[i+31:i] := ZeroExtend(a[i+31:i] &gt;&gt; count[63:0])
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vpsrld' form='zmm {k}, zmm, xmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" rettype="__m512i" name="_mm512_srli_epi32">
+	<type>Integer</type>
+	<CPUID>AVX512F/KNCNI</CPUID>
+	<category>Shift</category>
+	<parameter varname="a" type="__m512i"/>
+	<parameter varname="imm8" type="unsigned int"/>
+	<description>Shift packed 32-bit integers in "a" right by "imm8" while shifting in zeros, and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 15
+	i := j*32
+	IF imm8[7:0] &gt; 31
+		dst[i+31:i] := 0
+	ELSE
+		dst[i+31:i] := ZeroExtend(a[i+31:i] &gt;&gt; imm8[7:0])
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vpsrld' form='zmm {k}, zmm, imm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512i" name="_mm512_mask_srl_epi64">
+	<type>Integer</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Shift</category>
+	<parameter varname="src" type="__m512i"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m512i"/>
+	<parameter varname="count" type="__m128i"/>
+	<description>Shift packed 64-bit integers in "a" right by "count" while shifting in zeros, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 7
+	i := j*64
+	IF k[j]
+		IF count[63:0] &gt; 63
+			dst[i+63:i] := 0
+		ELSE
+			dst[i+63:i] := ZeroExtend(a[i+63:i] &gt;&gt; count[63:0])
+		FI
+	ELSE
+		dst[i+63:i] := src[i+63:i]
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vpsrlq' form='zmm {k}, zmm, xmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512i" name="_mm512_mask_srli_epi64">
+	<type>Integer</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Shift</category>
+	<parameter varname="src" type="__m512i"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m512i"/>
+	<parameter varname="imm8" type="unsigned int"/>
+	<description>Shift packed 64-bit integers in "a" right by "imm8" while shifting in zeros, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 7
+	i := j*64
+	IF k[j]
+		IF imm8[7:0] &gt; 63
+			dst[i+63:i] := 0
+		ELSE
+			dst[i+63:i] := ZeroExtend(a[i+63:i] &gt;&gt; imm8[7:0])
+		FI
+	ELSE
+		dst[i+63:i] := src[i+63:i]
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vpsrlq' form='zmm {k}, zmm, imm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512i" name="_mm512_maskz_srl_epi64">
+	<type>Integer</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Shift</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m512i"/>
+	<parameter varname="count" type="__m128i"/>
+	<description>Shift packed 64-bit integers in "a" right by "count" while shifting in zeros, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 7
+	i := j*64
+	IF k[j]
+		IF count[63:0] &gt; 63
+			dst[i+63:i] := 0
+		ELSE
+			dst[i+63:i] := ZeroExtend(a[i+63:i] &gt;&gt; count[63:0])
+		FI
+	ELSE
+		dst[i+63:i] := 0
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vpsrlq' form='zmm {k}, zmm, xmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512i" name="_mm512_maskz_srli_epi64">
+	<type>Integer</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Shift</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m512i"/>
+	<parameter varname="imm8" type="unsigned int"/>
+	<description>Shift packed 64-bit integers in "a" right by "imm8" while shifting in zeros, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 7
+	i := j*64
+	IF k[j]
+		IF imm8[7:0] &gt; 63
+			dst[i+63:i] := 0
+		ELSE
+			dst[i+63:i] := ZeroExtend(a[i+63:i] &gt;&gt; imm8[7:0])
+		FI
+	ELSE
+		dst[i+63:i] := 0
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vpsrlq' form='zmm {k}, zmm, imm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512i" name="_mm512_srl_epi64">
+	<type>Integer</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Shift</category>
+	<parameter varname="a" type="__m512i"/>
+	<parameter varname="count" type="__m128i"/>
+	<description>Shift packed 64-bit integers in "a" right by "count" while shifting in zeros, and store the results in "dst". </description>
+	<operation>
+FOR j := 0 to 7
+	i := j*64
+	IF count[63:0] &gt; 63
+		dst[i+63:i] := 0
+	ELSE
+		dst[i+63:i] := ZeroExtend(a[i+63:i] &gt;&gt; count[63:0])
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vpsrlq' form='zmm {k}, zmm, xmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512i" name="_mm512_srli_epi64">
+	<type>Integer</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Shift</category>
+	<parameter varname="a" type="__m512i"/>
+	<parameter varname="imm8" type="unsigned int"/>
+	<description>Shift packed 64-bit integers in "a" right by "imm8" while shifting in zeros, and store the results in "dst". </description>
+	<operation>
+FOR j := 0 to 7
+	i := j*64
+	IF imm8[7:0] &gt; 63
+		dst[i+63:i] := 0
+	ELSE
+		dst[i+63:i] := ZeroExtend(a[i+63:i] &gt;&gt; imm8[7:0])
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vpsrlq' form='zmm {k}, zmm, imm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" rettype="__m512i" name="_mm512_mask_srlv_epi32">
+	<type>Integer</type>
+	<CPUID>AVX512F/KNCNI</CPUID>
+	<category>Shift</category>
+	<parameter varname="src" type="__m512i"/>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="a" type="__m512i"/>
+	<parameter varname="count" type="__m512i"/>
+	<description>Shift packed 32-bit integers in "a" right by the amount specified by the corresponding element in "count" while shifting in zeros, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 15
+	i := j*32
+	IF k[j]
+		dst[i+31:i] := ZeroExtend(a[i+31:i] &gt;&gt; count[i+31:i])
+	ELSE
+		dst[i+31:i] := src[i+31:i]
+	FI	
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vpsrlvd' form='zmm {k}, zmm, zmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512i" name="_mm512_maskz_srlv_epi32">
+	<type>Integer</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Shift</category>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="a" type="__m512i"/>
+	<parameter varname="count" type="__m512i"/>
+	<description>Shift packed 32-bit integers in "a" right by the amount specified by the corresponding element in "count" while shifting in zeros, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 15
+	i := j*32
+	IF k[j]
+		dst[i+31:i] := ZeroExtend(a[i+31:i] &gt;&gt; count[i+31:i])
+	ELSE
+		dst[i+31:i] := 0
+	FI	
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vpsrlvd' form='zmm {k}, zmm, zmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" rettype="__m512i" name="_mm512_srlv_epi32">
+	<type>Integer</type>
+	<CPUID>AVX512F/KNCNI</CPUID>
+	<category>Shift</category>
+	<parameter varname="a" type="__m512i"/>
+	<parameter varname="count" type="__m512i"/>
+	<description>Shift packed 32-bit integers in "a" right by the amount specified by the corresponding element in "count" while shifting in zeros, and store the results in "dst". </description>
+	<operation>
+FOR j := 0 to 15
+	i := j*32
+	dst[i+31:i] := ZeroExtend(a[i+31:i] &gt;&gt; count[i+31:i])
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vpsrlvd' form='zmm {k}, zmm, zmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512i" name="_mm512_mask_srlv_epi64">
+	<type>Integer</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Shift</category>
+	<parameter varname="src" type="__m512i"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m512i"/>
+	<parameter varname="count" type="__m512i"/>
+	<description>Shift packed 64-bit integers in "a" right by the amount specified by the corresponding element in "count" while shifting in zeros, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 7
+	i := j*64
+	IF k[j]
+		dst[i+63:i] := ZeroExtend(a[i+63:i] &gt;&gt; count[i+63:i])
+	ELSE
+		dst[i+63:i] := src[i+63:i]
+	FI	
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vpsrlvq' form='zmm {k}, zmm, zmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512i" name="_mm512_maskz_srlv_epi64">
+	<type>Integer</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Shift</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m512i"/>
+	<parameter varname="count" type="__m512i"/>
+	<description>Shift packed 64-bit integers in "a" right by the amount specified by the corresponding element in "count" while shifting in zeros, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 7
+	i := j*64
+	IF k[j]
+		dst[i+63:i] := ZeroExtend(a[i+63:i] &gt;&gt; count[i+63:i])
+	ELSE
+		dst[i+63:i] := 0
+	FI	
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vpsrlvq' form='zmm {k}, zmm, zmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512i" name="_mm512_srlv_epi64">
+	<type>Integer</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Shift</category>
+	<parameter varname="a" type="__m512i"/>
+	<parameter varname="count" type="__m512i"/>
+	<description>Shift packed 64-bit integers in "a" right by the amount specified by the corresponding element in "count" while shifting in zeros, and store the results in "dst". </description>
+	<operation>
+FOR j := 0 to 7
+	i := j*64
+	dst[i+63:i] := ZeroExtend(a[i+63:i] &gt;&gt; count[i+63:i])
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vpsrlvq' form='zmm {k}, zmm, zmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" rettype="__m512i" name="_mm512_mask_sub_epi32">
+	<type>Integer</type>
+	<CPUID>AVX512F/KNCNI</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="src" type="__m512i"/>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="a" type="__m512i"/>
+	<parameter varname="b" type="__m512i"/>
+	<description>Subtract packed 32-bit integers in "b" from packed 32-bit integers in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 15
+	i := j*32
+	IF k[j]
+		dst[i+31:i] := a[i+31:i] - b[i+31:i]
+	ELSE
+		dst[i+31:i] := src[i+31:i]
+	FI	
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vpsubd' form='zmm {k}, zmm, zmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512i" name="_mm512_maskz_sub_epi32">
+	<type>Integer</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="a" type="__m512i"/>
+	<parameter varname="b" type="__m512i"/>
+	<description>Subtract packed 32-bit integers in "b" from packed 32-bit integers in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 15
+	i := j*32
+	IF k[j]
+		dst[i+31:i] := a[i+31:i] - b[i+31:i]
+	ELSE
+		dst[i+31:i] := 0
+	FI	
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vpsubd' form='zmm {k}, zmm, zmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" rettype="__m512i" name="_mm512_sub_epi32">
+	<type>Integer</type>
+	<CPUID>AVX512F/KNCNI</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="a" type="__m512i"/>
+	<parameter varname="b" type="__m512i"/>
+	<description>Subtract packed 32-bit integers in "b" from packed 32-bit integers in "a", and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 15
+	i := j*32
+	dst[i+31:i] := a[i+31:i] - b[i+31:i]
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vpsubd' form='zmm {k}, zmm, zmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512i" name="_mm512_mask_sub_epi64">
+	<type>Integer</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="src" type="__m512i"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m512i"/>
+	<parameter varname="b" type="__m512i"/>
+	<description>Subtract packed 64-bit integers in "b" from packed 64-bit integers in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*64
+	IF k[j]
+		dst[i+63:i] := a[i+63:i] - b[i+63:i]
+	ELSE
+		dst[i+63:i] := src[i+63:i]
+	FI	
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vpsubq' form='zmm {k}, zmm, zmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512i" name="_mm512_maskz_sub_epi64">
+	<type>Integer</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m512i"/>
+	<parameter varname="b" type="__m512i"/>
+	<description>Subtract packed 64-bit integers in "b" from packed 64-bit integers in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*64
+	IF k[j]
+		dst[i+63:i] := a[i+63:i] - b[i+63:i]
+	ELSE
+		dst[i+63:i] := 0
+	FI	
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vpsubq' form='zmm {k}, zmm, zmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512i" name="_mm512_sub_epi64">
+	<type>Integer</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="a" type="__m512i"/>
+	<parameter varname="b" type="__m512i"/>
+	<description>Subtract packed 64-bit integers in "b" from packed 64-bit integers in "a", and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*64
+	dst[i+63:i] := a[i+63:i] - b[i+63:i]
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vpsubq' form='zmm {k}, zmm, zmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512i" name="_mm512_mask_ternarylogic_epi32">
+	<type>Integer</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Logical</category>
+	<parameter varname="src" type="__m512i"/>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="a" type="__m512i"/>
+	<parameter varname="b" type="__m512i"/>
+	<parameter varname="imm8" type="int"/>
+	<description>Bitwise ternary logic that provides the capability to implement any three-operand binary function; the specific binary function is specified by value in "imm8". For each bit in each packed 32-bit integer, the corresponding bit from "src", "a", and "b" are used to form a 3 bit index into "imm8", and the value at that bit in "imm8" is written to the corresponding bit in "dst" using writemask "k" at 32-bit granularity (32-bit elements are copied from "src" when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 15
+	i := j*32
+	IF k[j]
+		FOR h := 0 to 31
+			index[2:0] := (src[i+h] &lt;&lt; 2) OR (a[i+h] &lt;&lt; 1) OR b[i+h]
+			dst[i+h] := imm8[index[2:0]]
+		ENDFOR
+	ELSE
+		dst[i+31:i] := src[i+31:i]
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vpternlogd' form='zmm {k}, zmm, zmm, imm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512i" name="_mm512_maskz_ternarylogic_epi32">
+	<type>Integer</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Logical</category>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="a" type="__m512i"/>
+	<parameter varname="b" type="__m512i"/>
+	<parameter varname="c" type="__m512i"/>
+	<parameter varname="imm8" type="int"/>
+	<description>Bitwise ternary logic that provides the capability to implement any three-operand binary function; the specific binary function is specified by value in "imm8". For each bit in each packed 32-bit integer, the corresponding bit from "a", "b", and "c" are used to form a 3 bit index into "imm8", and the value at that bit in "imm8" is written to the corresponding bit in "dst" using zeromask "k" at 32-bit granularity (32-bit elements are zeroed out when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 15
+	i := j*32
+	IF k[j]
+		FOR h := 0 to 31
+			index[2:0] := (a[i+h] &lt;&lt; 2) OR (b[i+h] &lt;&lt; 1) OR c[i+h]
+			dst[i+h] := imm8[index[2:0]]
+		ENDFOR
+	ELSE
+		dst[i+31:i] := 0
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vpternlogd' form='zmm {k}, zmm, zmm, imm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512i" name="_mm512_ternarylogic_epi32">
+	<type>Integer</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Logical</category>
+	<parameter varname="a" type="__m512i"/>
+	<parameter varname="b" type="__m512i"/>
+	<parameter varname="c" type="__m512i"/>
+	<parameter varname="imm8" type="int"/>
+	<description>Bitwise ternary logic that provides the capability to implement any three-operand binary function; the specific binary function is specified by value in "imm8". For each bit in each packed 32-bit integer, the corresponding bit from "a", "b", and "c" are used to form a 3 bit index into "imm8", and the value at that bit in "imm8" is written to the corresponding bit in "dst".</description>
+	<operation>
+FOR j := 0 to 15
+	i := j*32
+	FOR h := 0 to 31
+		index[2:0] := (a[i+h] &lt;&lt; 2) OR (b[i+h] &lt;&lt; 1) OR c[i+h]
+		dst[i+h] := imm8[index[2:0]]
+	ENDFOR
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vpternlogd' form='zmm {k}, zmm, zmm, imm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512i" name="_mm512_mask_ternarylogic_epi64">
+	<type>Integer</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Logical</category>
+	<parameter varname="src" type="__m512i"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m512i"/>
+	<parameter varname="b" type="__m512i"/>
+	<parameter varname="imm8" type="int"/>
+	<description>Bitwise ternary logic that provides the capability to implement any three-operand binary function; the specific binary function is specified by value in "imm8". For each bit in each packed 64-bit integer, the corresponding bit from "src", "a", and "b" are used to form a 3 bit index into "imm8", and the value at that bit in "imm8" is written to the corresponding bit in "dst" using writemask "k" at 64-bit granularity (64-bit elements are copied from "src" when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*64
+	IF k[j]
+		FOR h := 0 to 63
+			index[2:0] := (src[i+h] &lt;&lt; 2) OR (a[i+h] &lt;&lt; 1) OR b[i+h]
+			dst[i+h] := imm8[index[2:0]]
+		ENDFOR
+	ELSE
+		dst[i+63:i] := src[i+63:i]
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vpternlogq' form='zmm {k}, zmm, zmm, imm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512i" name="_mm512_maskz_ternarylogic_epi64">
+	<type>Integer</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Logical</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m512i"/>
+	<parameter varname="b" type="__m512i"/>
+	<parameter varname="c" type="__m512i"/>
+	<parameter varname="imm8" type="int"/>
+	<description>Bitwise ternary logic that provides the capability to implement any three-operand binary function; the specific binary function is specified by value in "imm8". For each bit in each packed 64-bit integer, the corresponding bit from "a", "b", and "c" are used to form a 3 bit index into "imm8", and the value at that bit in "imm8" is written to the corresponding bit in "dst" using zeromask "k" at 64-bit granularity (64-bit elements are zeroed out when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*64
+	IF k[j]
+		FOR h := 0 to 63
+			index[2:0] := (a[i+h] &lt;&lt; 2) OR (b[i+h] &lt;&lt; 1) OR c[i+h]
+			dst[i+h] := imm8[index[2:0]]
+		ENDFOR
+	ELSE
+		dst[i+63:i] := 0
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vpternlogq' form='zmm {k}, zmm, zmm, imm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512i" name="_mm512_ternarylogic_epi64">
+	<type>Integer</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Logical</category>
+	<parameter varname="a" type="__m512i"/>
+	<parameter varname="b" type="__m512i"/>
+	<parameter varname="c" type="__m512i"/>
+	<parameter varname="imm8" type="int"/>
+	<description>Bitwise ternary logic that provides the capability to implement any three-operand binary function; the specific binary function is specified by value in "imm8". For each bit in each packed 64-bit integer, the corresponding bit from "a", "b", and "c" are used to form a 3 bit index into "imm8", and the value at that bit in "imm8" is written to the corresponding bit in "dst".</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*64
+	FOR h := 0 to 63
+		index[2:0] := (a[i+h] &lt;&lt; 2) OR (b[i+h] &lt;&lt; 1) OR c[i+h]
+		dst[i+h] := imm8[index[2:0]]
+	ENDFOR
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vpternlogq' form='zmm {k}, zmm, zmm, imm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" rettype="__mmask16" name="_mm512_mask_test_epi32_mask">
+	<type>Integer</type>
+	<type>Mask</type>
+	<CPUID>AVX512F/KNCNI</CPUID>
+	<category>Logical</category>
+	<parameter varname="k1" type="__mmask16"/>
+	<parameter varname="a" type="__m512i"/>
+	<parameter varname="b" type="__m512i"/>
+	<description>Compute the bitwise AND of packed 32-bit integers in "a" and "b", producing intermediate 32-bit values, and set the corresponding bit in result mask "k" (subject to writemask "k1") if the intermediate value is non-zero.</description>
+	<operation>
+FOR j := 0 to 15
+	i := j*32
+	IF k1[j]
+		k[j] := ((a[i+31:i] AND b[i+31:i]) != 0) ? 1 : 0
+	ELSE
+		k[j] := 0
+	FI
+ENDFOR
+k[MAX:16] := 0
+	</operation>
+	<instruction name='vptestmd' form='k {k}, zmm, zmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" rettype="__mmask16" name="_mm512_test_epi32_mask">
+	<type>Integer</type>
+	<type>Mask</type>
+	<CPUID>AVX512F/KNCNI</CPUID>
+	<category>Logical</category>
+	<parameter varname="a" type="__m512i"/>
+	<parameter varname="b" type="__m512i"/>
+	<description>Compute the bitwise AND of packed 32-bit integers in "a" and "b", producing intermediate 32-bit values, and set the corresponding bit in result mask "k" if the intermediate value is non-zero.</description>
+	<operation>
+FOR j := 0 to 15
+	i := j*32
+	k[j] := ((a[i+31:i] AND b[i+31:i]) != 0) ? 1 : 0
+ENDFOR
+k[MAX:16] := 0
+	</operation>
+	<instruction name='vptestmd' form='k {k}, zmm, zmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__mmask8" name="_mm512_mask_test_epi64_mask">
+	<type>Integer</type>
+	<type>Mask</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Logical</category>
+	<parameter varname="k1" type="__mmask8"/>
+	<parameter varname="a" type="__m512i"/>
+	<parameter varname="b" type="__m512i"/>
+	<description>Compute the bitwise AND of packed 64-bit integers in "a" and "b", producing intermediate 64-bit values, and set the corresponding bit in result mask "k" (subject to writemask "k1") if the intermediate value is non-zero.</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*64
+	IF k1[j]
+		k[j] := ((a[i+63:i] AND b[i+63:i]) != 0) ? 1 : 0
+	ELSE
+		k[j] := 0
+	FI
+ENDFOR
+k[MAX:8] := 0
+	</operation>
+	<instruction name='vptestmq' form='k {k}, zmm, zmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__mmask8" name="_mm512_test_epi64_mask">
+	<type>Integer</type>
+	<type>Mask</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Logical</category>
+	<parameter varname="a" type="__m512i"/>
+	<parameter varname="b" type="__m512i"/>
+	<description>Compute the bitwise AND of packed 64-bit integers in "a" and "b", producing intermediate 64-bit values, and set the corresponding bit in result mask "k" if the intermediate value is non-zero.</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*64
+	k[j] := ((a[i+63:i] AND b[i+63:i]) != 0) ? 1 : 0
+ENDFOR
+k[MAX:8] := 0
+	</operation>
+	<instruction name='vptestmq' form='k {k}, zmm, zmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__mmask16" name="_mm512_mask_testn_epi32_mask">
+	<type>Integer</type>
+	<type>Mask</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Logical</category>
+	<parameter varname="k1" type="__mmask16"/>
+	<parameter varname="a" type="__m512i"/>
+	<parameter varname="b" type="__m512i"/>
+	<description>Compute the bitwise AND of packed 32-bit integers in "a" and "b", producing intermediate 32-bit values, and set the corresponding bit in result mask "k" (subject to writemask "k1") if the intermediate value is zero.</description>
+	<operation>
+FOR j := 0 to 15
+	i := j*32
+	IF k1[j]
+		k[j] := ((a[i+31:i] AND b[i+31:i]) == 0) ? 1 : 0
+	ELSE
+		k[j] := 0
+	FI
+ENDFOR
+k[MAX:16] := 0
+	</operation>
+	<instruction name='vptestnmd' form='k {k}, zmm, zmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__mmask16" name="_mm512_testn_epi32_mask">
+	<type>Integer</type>
+	<type>Mask</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Logical</category>
+	<parameter varname="a" type="__m512i"/>
+	<parameter varname="b" type="__m512i"/>
+	<description>Compute the bitwise AND of packed 32-bit integers in "a" and "b", producing intermediate 32-bit values, and set the corresponding bit in result mask "k" if the intermediate value is zero.</description>
+	<operation>
+FOR j := 0 to 15
+	i := j*32
+	k[j] := ((a[i+31:i] AND b[i+31:i]) == 0) ? 1 : 0
+ENDFOR
+k[MAX:16] := 0
+	</operation>
+	<instruction name='vptestnmd' form='k {k}, zmm, zmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__mmask8" name="_mm512_mask_testn_epi64_mask">
+	<type>Integer</type>
+	<type>Mask</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Logical</category>
+	<parameter varname="k1" type="__mmask8"/>
+	<parameter varname="a" type="__m512i"/>
+	<parameter varname="b" type="__m512i"/>
+	<description>Compute the bitwise AND of packed 64-bit integers in "a" and "b", producing intermediate 64-bit values, and set the corresponding bit in result mask "k" (subject to writemask "k1") if the intermediate value is zero.</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*64
+	IF k1[j]
+		k[j] := ((a[i+63:i] AND b[i+63:i]) == 0) ? 1 : 0
+	ELSE
+		k[j] := 0
+	FI
+ENDFOR
+k[MAX:8] := 0
+	</operation>
+	<instruction name='vptestnmq' form='k {k}, zmm, zmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__mmask8" name="_mm512_testn_epi64_mask">
+	<type>Integer</type>
+	<type>Mask</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Logical</category>
+	<parameter varname="a" type="__m512i"/>
+	<parameter varname="b" type="__m512i"/>
+	<description>Compute the bitwise AND of packed 64-bit integers in "a" and "b", producing intermediate 64-bit values, and set the corresponding bit in result mask "k" if the intermediate value is zero.</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*64
+	k[j] := ((a[i+63:i] AND b[i+63:i]) == 0) ? 1 : 0
+ENDFOR
+k[MAX:8] := 0
+	</operation>
+	<instruction name='vptestnmq' form='k {k}, zmm, zmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512i" name="_mm512_mask_unpackhi_epi32">
+	<type>Integer</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Swizzle</category>
+	<parameter varname="src" type="__m512i"/>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="a" type="__m512i"/>
+	<parameter varname="b" type="__m512i"/>
+	<description>Unpack and interleave 32-bit integers from the high half of each 128-bit lane in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). </description>
+	<operation>
+INTERLEAVE_HIGH_DWORDS(src1[127:0], src2[127:0]){
+	dst[31:0] := src1[95:64] 
+	dst[63:32] := src2[95:64] 
+	dst[95:64] := src1[127:96] 
+	dst[127:96] := src2[127:96] 
+	RETURN dst[127:0]
+}
+
+tmp_dst[127:0] := INTERLEAVE_HIGH_DWORDS(a[127:0], b[127:0])
+tmp_dst[255:128] := INTERLEAVE_HIGH_DWORDS(a[255:128], b[255:128])
+tmp_dst[383:256] := INTERLEAVE_HIGH_DWORDS(a[383:256], b[383:256])
+tmp_dst[511:384] := INTERLEAVE_HIGH_DWORDS(a[511:384], b[511:384])
+
+FOR j := 0 to 15
+	i := j*32
+	IF k[j]
+		dst[i+31:i] := tmp_dst[i+31:i]
+	ELSE
+		dst[i+31:i] := src[i+31:i]
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vpunpckhdq' form='zmm {k}, zmm, zmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512i" name="_mm512_maskz_unpackhi_epi32">
+	<type>Integer</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Swizzle</category>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="a" type="__m512i"/>
+	<parameter varname="b" type="__m512i"/>
+	<description>Unpack and interleave 32-bit integers from the high half of each 128-bit lane in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). </description>
+	<operation>
+INTERLEAVE_HIGH_DWORDS(src1[127:0], src2[127:0]){
+	dst[31:0] := src1[95:64] 
+	dst[63:32] := src2[95:64] 
+	dst[95:64] := src1[127:96] 
+	dst[127:96] := src2[127:96] 
+	RETURN dst[127:0]
+}
+
+tmp_dst[127:0] := INTERLEAVE_HIGH_DWORDS(a[127:0], b[127:0])
+tmp_dst[255:128] := INTERLEAVE_HIGH_DWORDS(a[255:128], b[255:128])
+tmp_dst[383:256] := INTERLEAVE_HIGH_DWORDS(a[383:256], b[383:256])
+tmp_dst[511:384] := INTERLEAVE_HIGH_DWORDS(a[511:384], b[511:384])
+
+FOR j := 0 to 15
+	i := j*32
+	IF k[j]
+		dst[i+31:i] := tmp_dst[i+31:i]
+	ELSE
+		dst[i+31:i] := 0
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vpunpckhdq' form='zmm {k}, zmm, zmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512i" name="_mm512_unpackhi_epi32">
+	<type>Integer</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Swizzle</category>
+	<parameter varname="a" type="__m512i"/>
+	<parameter varname="b" type="__m512i"/>
+	<description>Unpack and interleave 32-bit integers from the high half of each 128-bit lane in "a" and "b", and store the results in "dst". </description>
+	<operation>
+INTERLEAVE_HIGH_DWORDS(src1[127:0], src2[127:0]){
+	dst[31:0] := src1[95:64] 
+	dst[63:32] := src2[95:64] 
+	dst[95:64] := src1[127:96] 
+	dst[127:96] := src2[127:96] 
+	RETURN dst[127:0]
+}	
+
+dst[127:0] := INTERLEAVE_HIGH_DWORDS(a[127:0], b[127:0])
+dst[255:128] := INTERLEAVE_HIGH_DWORDS(a[255:128], b[255:128])
+dst[383:256] := INTERLEAVE_HIGH_DWORDS(a[383:256], b[383:256])
+dst[511:384] := INTERLEAVE_HIGH_DWORDS(a[511:384], b[511:384])
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vpunpckhdq' form='zmm {k}, zmm, zmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512i" name="_mm512_mask_unpackhi_epi64">
+	<type>Integer</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Swizzle</category>
+	<parameter varname="src" type="__m512i"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m512i"/>
+	<parameter varname="b" type="__m512i"/>
+	<description>Unpack and interleave 64-bit integers from the high half of each 128-bit lane in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). </description>
+	<operation>
+INTERLEAVE_HIGH_QWORDS(src1[127:0], src2[127:0]){
+	dst[63:0] := src1[127:64] 
+	dst[127:64] := src2[127:64] 
+	RETURN dst[127:0]
+}	
+
+tmp_dst[127:0] := INTERLEAVE_HIGH_QWORDS(a[127:0], b[127:0])
+tmp_dst[255:128] := INTERLEAVE_HIGH_QWORDS(a[255:128], b[255:128])
+tmp_dst[383:256] := INTERLEAVE_HIGH_QWORDS(a[383:256], b[383:256])
+tmp_dst[511:384] := INTERLEAVE_HIGH_QWORDS(a[511:384], b[511:384])
+
+FOR j := 0 to 7
+	i := j*64
+	IF k[j]
+		dst[i+63:i] := tmp_dst[i+63:i]
+	ELSE
+		dst[i+63:i] := src[i+63:i]
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vpunpckhqdq' form='zmm {k}, zmm, zmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512i" name="_mm512_maskz_unpackhi_epi64">
+	<type>Integer</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Swizzle</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m512i"/>
+	<parameter varname="b" type="__m512i"/>
+	<description>Unpack and interleave 64-bit integers from the high half of each 128-bit lane in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). </description>
+	<operation>
+INTERLEAVE_HIGH_QWORDS(src1[127:0], src2[127:0]){
+	dst[63:0] := src1[127:64] 
+	dst[127:64] := src2[127:64] 
+	RETURN dst[127:0]
+}	
+
+tmp_dst[127:0] := INTERLEAVE_HIGH_QWORDS(a[127:0], b[127:0])
+tmp_dst[255:128] := INTERLEAVE_HIGH_QWORDS(a[255:128], b[255:128])
+tmp_dst[383:256] := INTERLEAVE_HIGH_QWORDS(a[383:256], b[383:256])
+tmp_dst[511:384] := INTERLEAVE_HIGH_QWORDS(a[511:384], b[511:384])
+
+FOR j := 0 to 7
+	i := j*64
+	IF k[j]
+		dst[i+63:i] := tmp_dst[i+63:i]
+	ELSE
+		dst[i+63:i] := 0
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vpunpckhqdq' form='zmm {k}, zmm, zmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512i" name="_mm512_unpackhi_epi64">
+	<type>Integer</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Swizzle</category>
+	<parameter varname="a" type="__m512i"/>
+	<parameter varname="b" type="__m512i"/>
+	<description>Unpack and interleave 64-bit integers from the high half of each 128-bit lane in "a" and "b", and store the results in "dst". </description>
+	<operation>
+INTERLEAVE_HIGH_QWORDS(src1[127:0], src2[127:0]){
+	dst[63:0] := src1[127:64] 
+	dst[127:64] := src2[127:64] 
+	RETURN dst[127:0]
+}	
+
+dst[127:0] := INTERLEAVE_HIGH_QWORDS(a[127:0], b[127:0])
+dst[255:128] := INTERLEAVE_HIGH_QWORDS(a[255:128], b[255:128])
+dst[383:256] := INTERLEAVE_HIGH_QWORDS(a[383:256], b[383:256])
+dst[511:384] := INTERLEAVE_HIGH_QWORDS(a[511:384], b[511:384])
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vpunpckhqdq' form='zmm {k}, zmm, zmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512i" name="_mm512_mask_unpacklo_epi32">
+	<type>Integer</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Swizzle</category>
+	<parameter varname="src" type="__m512i"/>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="a" type="__m512i"/>
+	<parameter varname="b" type="__m512i"/>
+	<description>Unpack and interleave 32-bit integers from the low half of each 128-bit lane in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). </description>
+	<operation>
+INTERLEAVE_DWORDS(src1[127:0], src2[127:0]){
+	dst[31:0] := src1[31:0] 
+	dst[63:32] := src2[31:0] 
+	dst[95:64] := src1[63:32] 
+	dst[127:96] := src2[63:32] 
+	RETURN dst[127:0]
+}
+
+tmp_dst[127:0] := INTERLEAVE_DWORDS(a[127:0], b[127:0])
+tmp_dst[255:128] := INTERLEAVE_DWORDS(a[255:128], b[255:128])
+tmp_dst[383:256] := INTERLEAVE_DWORDS(a[383:256], b[383:256])
+tmp_dst[511:384] := INTERLEAVE_DWORDS(a[511:384], b[511:384])
+
+FOR j := 0 to 15
+	i := j*32
+	IF k[j]
+		dst[i+31:i] := tmp_dst[i+31:i]
+	ELSE
+		dst[i+31:i] := src[i+31:i]
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vpunpckldq' form='zmm {k}, zmm, zmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512i" name="_mm512_maskz_unpacklo_epi32">
+	<type>Integer</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Swizzle</category>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="a" type="__m512i"/>
+	<parameter varname="b" type="__m512i"/>
+	<description>Unpack and interleave 32-bit integers from the low half of each 128-bit lane in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). </description>
+	<operation>
+INTERLEAVE_DWORDS(src1[127:0], src2[127:0]){
+	dst[31:0] := src1[31:0] 
+	dst[63:32] := src2[31:0] 
+	dst[95:64] := src1[63:32] 
+	dst[127:96] := src2[63:32] 
+	RETURN dst[127:0]
+}	
+
+tmp_dst[127:0] := INTERLEAVE_DWORDS(a[127:0], b[127:0])
+tmp_dst[255:128] := INTERLEAVE_DWORDS(a[255:128], b[255:128])
+tmp_dst[383:256] := INTERLEAVE_DWORDS(a[383:256], b[383:256])
+tmp_dst[511:384] := INTERLEAVE_DWORDS(a[511:384], b[511:384])
+
+FOR j := 0 to 15
+	i := j*32
+	IF k[j]
+		dst[i+31:i] := tmp_dst[i+31:i]
+	ELSE
+		dst[i+31:i] := 0
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vpunpckldq' form='zmm {k}, zmm, zmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512i" name="_mm512_unpacklo_epi32">
+	<type>Integer</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Swizzle</category>
+	<parameter varname="a" type="__m512i"/>
+	<parameter varname="b" type="__m512i"/>
+	<description>Unpack and interleave 32-bit integers from the low half of each 128-bit lane in "a" and "b", and store the results in "dst". </description>
+	<operation>
+INTERLEAVE_DWORDS(src1[127:0], src2[127:0]){
+	dst[31:0] := src1[31:0] 
+	dst[63:32] := src2[31:0] 
+	dst[95:64] := src1[63:32] 
+	dst[127:96] := src2[63:32] 
+	RETURN dst[127:0]
+}	
+
+dst[127:0] := INTERLEAVE_DWORDS(a[127:0], b[127:0])
+dst[255:128] := INTERLEAVE_DWORDS(a[255:128], b[255:128])
+dst[383:256] := INTERLEAVE_DWORDS(a[383:256], b[383:256])
+dst[511:384] := INTERLEAVE_DWORDS(a[511:384], b[511:384])
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vpunpckldq' form='zmm {k}, zmm, zmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512i" name="_mm512_mask_unpacklo_epi64">
+	<type>Integer</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Swizzle</category>
+	<parameter varname="src" type="__m512i"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m512i"/>
+	<parameter varname="b" type="__m512i"/>
+	<description>Unpack and interleave 64-bit integers from the low half of each 128-bit lane in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). </description>
+	<operation>
+INTERLEAVE_QWORDS(src1[127:0], src2[127:0]){
+	dst[63:0] := src1[63:0] 
+	dst[127:64] := src2[63:0] 
+	RETURN dst[127:0]
+}
+
+tmp_dst[127:0] := INTERLEAVE_QWORDS(a[127:0], b[127:0])
+tmp_dst[255:128] := INTERLEAVE_QWORDS(a[255:128], b[255:128])
+tmp_dst[383:256] := INTERLEAVE_QWORDS(a[383:256], b[383:256])
+tmp_dst[511:384] := INTERLEAVE_QWORDS(a[511:384], b[511:384])
+
+FOR j := 0 to 7
+	i := j*64
+	IF k[j]
+		dst[i+63:i] := tmp_dst[i+63:i]
+	ELSE
+		dst[i+63:i] := src[i+63:i]
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vpunpcklqdq' form='zmm {k}, zmm, zmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512i" name="_mm512_maskz_unpacklo_epi64">
+	<type>Integer</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Swizzle</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m512i"/>
+	<parameter varname="b" type="__m512i"/>
+	<description>Unpack and interleave 64-bit integers from the low half of each 128-bit lane in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). </description>
+	<operation>
+INTERLEAVE_QWORDS(src1[127:0], src2[127:0]){
+	dst[63:0] := src1[63:0] 
+	dst[127:64] := src2[63:0] 
+	RETURN dst[127:0]
+}
+
+tmp_dst[127:0] := INTERLEAVE_QWORDS(a[127:0], b[127:0])
+tmp_dst[255:128] := INTERLEAVE_QWORDS(a[255:128], b[255:128])
+tmp_dst[383:256] := INTERLEAVE_QWORDS(a[383:256], b[383:256])
+tmp_dst[511:384] := INTERLEAVE_QWORDS(a[511:384], b[511:384])
+
+FOR j := 0 to 7
+	i := j*64
+	IF k[j]
+		dst[i+63:i] := tmp_dst[i+63:i]
+	ELSE
+		dst[i+63:i] := 0
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vpunpcklqdq' form='zmm {k}, zmm, zmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512i" name="_mm512_unpacklo_epi64">
+	<type>Integer</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Swizzle</category>
+	<parameter varname="a" type="__m512i"/>
+	<parameter varname="b" type="__m512i"/>
+	<description>Unpack and interleave 64-bit integers from the low half of each 128-bit lane in "a" and "b", and store the results in "dst". </description>
+	<operation>
+INTERLEAVE_QWORDS(src1[127:0], src2[127:0]){
+	dst[63:0] := src1[63:0] 
+	dst[127:64] := src2[63:0] 
+	RETURN dst[127:0]
+}
+
+dst[127:0] := INTERLEAVE_QWORDS(a[127:0], b[127:0])
+dst[255:128] := INTERLEAVE_QWORDS(a[255:128], b[255:128])
+dst[383:256] := INTERLEAVE_QWORDS(a[383:256], b[383:256])
+dst[511:384] := INTERLEAVE_QWORDS(a[511:384], b[511:384])
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vpunpcklqdq' form='zmm {k}, zmm, zmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" rettype="__m512i" name="_mm512_mask_xor_epi32">
+	<type>Integer</type>
+	<CPUID>AVX512F/KNCNI</CPUID>
+	<category>Logical</category>
+	<parameter varname="src" type="__m512i"/>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="a" type="__m512i"/>
+	<parameter varname="b" type="__m512i"/>
+	<description>Compute the bitwise XOR of packed 32-bit integers in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 15
+	i := j*32
+	IF k[j]
+		dst[i+31:i] := a[i+31:i] XOR b[i+31:i]
+	ELSE
+		dst[i+31:i] := src[i+31:i]
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vpxord' form='zmm {k}, zmm, zmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512i" name="_mm512_maskz_xor_epi32">
+	<type>Integer</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Logical</category>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="a" type="__m512i"/>
+	<parameter varname="b" type="__m512i"/>
+	<description>Compute the bitwise XOR of packed 32-bit integers in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 15
+	i := j*32
+	IF k[j]
+		dst[i+31:i] := a[i+31:i] XOR b[i+31:i]
+	ELSE
+		dst[i+31:i] := 0
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vpxord' form='zmm {k}, zmm, zmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" rettype="__m512i" name="_mm512_xor_epi32">
+	<type>Integer</type>
+	<CPUID>AVX512F/KNCNI</CPUID>
+	<category>Logical</category>
+	<parameter varname="a" type="__m512i"/>
+	<parameter varname="b" type="__m512i"/>
+	<description>Compute the bitwise XOR of packed 32-bit integers in "a" and "b", and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 15
+	i := j*32
+	dst[i+31:i] := a[i+31:i] XOR b[i+31:i]
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vpxord' form='zmm {k}, zmm, zmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" rettype="__m512i" name="_mm512_xor_si512">
+	<type>Integer</type>
+	<CPUID>AVX512F/KNCNI</CPUID>
+	<category>Logical</category>
+	<parameter varname="a" type="__m512i"/>
+	<parameter varname="b" type="__m512i"/>
+	<description>Compute the bitwise XOR of 512 bits (representing integer data) in "a" and "b", and store the result in "dst".</description>
+	<operation>
+dst[511:0] := (a[511:0] XOR b[511:0])
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vpxord' form='zmm {k}, zmm, zmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" rettype="__m512i" name="_mm512_mask_xor_epi64">
+	<type>Integer</type>
+	<CPUID>AVX512F/KNCNI</CPUID>
+	<category>Logical</category>
+	<parameter varname="src" type="__m512i"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m512i"/>
+	<parameter varname="b" type="__m512i"/>
+	<description>Compute the bitwise XOR of packed 64-bit integers in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 7
+	i := j*64
+	IF k[j]
+		dst[i+63:i] := a[i+63:i] XOR b[i+63:i]
+	ELSE
+		dst[i+63:i] := src[i+63:i]
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vpxorq' form='zmm {k}, zmm, zmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512i" name="_mm512_maskz_xor_epi64">
+	<type>Integer</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Logical</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m512i"/>
+	<parameter varname="b" type="__m512i"/>
+	<description>Compute the bitwise XOR of packed 64-bit integers in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*64
+	IF k[j]
+		dst[i+63:i] := a[i+63:i] XOR b[i+63:i]
+	ELSE
+		dst[i+63:i] := 0
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vpxorq' form='zmm {k}, zmm, zmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" rettype="__m512i" name="_mm512_xor_epi64">
+	<type>Integer</type>
+	<CPUID>AVX512F/KNCNI</CPUID>
+	<category>Logical</category>
+	<parameter varname="a" type="__m512i"/>
+	<parameter varname="b" type="__m512i"/>
+	<description>Compute the bitwise XOR of packed 64-bit integers in "a" and "b", and store the results in "dst".
+	</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*64
+	dst[i+63:i] := a[i+63:i] XOR b[i+63:i]
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vpxorq' form='zmm {k}, zmm, zmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512d" name="_mm512_mask_rcp14_pd">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Elementary Math Functions</category>
+	<parameter varname="src" type="__m512d"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m512d"/>
+	<description>Compute the approximate reciprocal of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 2^-14.</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*64
+	IF k[j]
+		dst[i+63:i] := APPROXIMATE(1.0/a[i+63:i])
+	ELSE
+		dst[i+63:i] := src[i+63:i]
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vrcp14pd' form='zmm {k}, zmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512d" name="_mm512_maskz_rcp14_pd">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Elementary Math Functions</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m512d"/>
+	<description>Compute the approximate reciprocal of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 2^-14.</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*64
+	IF k[j]
+		dst[i+63:i] := APPROXIMATE(1.0/a[i+63:i])
+	ELSE
+		dst[i+63:i] := 0
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vrcp14pd' form='zmm {k}, zmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512d" name="_mm512_rcp14_pd">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Elementary Math Functions</category>
+	<parameter varname="a" type="__m512d"/>
+	<description>Compute the approximate reciprocal of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst". The maximum relative error for this approximation is less than 2^-14.</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*64
+	dst[i+63:i] := APPROXIMATE(1.0/a[i+63:i])
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vrcp14pd' form='zmm {k}, zmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512" name="_mm512_mask_rcp14_ps">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Elementary Math Functions</category>
+	<parameter varname="src" type="__m512"/>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="a" type="__m512"/>
+	<description>Compute the approximate reciprocal of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 2^-14.</description>
+	<operation>
+FOR j := 0 to 15
+	i := j*32
+	IF k[j]
+		dst[i+31:i] := APPROXIMATE(1.0/a[i+31:i])
+	ELSE
+		dst[i+31:i] := src[i+31:i]
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vrcp14ps' form='zmm {k}, zmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512" name="_mm512_maskz_rcp14_ps">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Elementary Math Functions</category>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="a" type="__m512"/>
+	<description>Compute the approximate reciprocal of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 2^-14.</description>
+	<operation>
+FOR j := 0 to 15
+	i := j*32
+	IF k[j]
+		dst[i+31:i] := APPROXIMATE(1.0/a[i+31:i])
+	ELSE
+		dst[i+31:i] := 0
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vrcp14ps' form='zmm {k}, zmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512" name="_mm512_rcp14_ps">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Elementary Math Functions</category>
+	<parameter varname="a" type="__m512"/>
+	<description>Compute the approximate reciprocal of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst". The maximum relative error for this approximation is less than 2^-14.</description>
+	<operation>
+FOR j := 0 to 15
+	i := j*32
+	dst[i+31:i] := APPROXIMATE(1.0/a[i+31:i])
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vrcp14ps' form='zmm {k}, zmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128d" name="_mm_mask_rcp14_sd">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Elementary Math Functions</category>
+	<parameter varname="src" type="__m128d"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128d"/>
+	<parameter varname="b" type="__m128d"/>
+	<description>Compute the approximate reciprocal of the lower double-precision (64-bit) floating-point element in "b", store the result in the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst". The maximum relative error for this approximation is less than 2^-14.</description>
+	<operation>
+IF k[0]
+	dst[63:0] := APPROXIMATE(1.0/b[63:0])
+ELSE
+	dst[63:0] := src[63:0]
+FI
+dst[127:64] := a[127:64]
+dst[MAX:128] := 0
+	</operation>
+	<instruction name='vrcp14sd' form='xmm {k}, xmm, xmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128d" name="_mm_maskz_rcp14_sd">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Elementary Math Functions</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128d"/>
+	<parameter varname="b" type="__m128d"/>
+	<description>Compute the approximate reciprocal of the lower double-precision (64-bit) floating-point element in "b", store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst". The maximum relative error for this approximation is less than 2^-14.</description>
+	<operation>
+IF k[0]
+	dst[63:0] := APPROXIMATE(1.0/b[63:0])
+ELSE
+	dst[63:0] := 0
+FI
+dst[127:64] := a[127:64]
+dst[MAX:128] := 0
+	</operation>
+	<instruction name='vrcp14sd' form='xmm {k}, xmm, xmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128d" name="_mm_rcp14_sd">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Elementary Math Functions</category>
+	<parameter varname="a" type="__m128d"/>
+	<parameter varname="b" type="__m128d"/>
+	<description>Compute the approximate reciprocal of the lower double-precision (64-bit) floating-point element in "b", store the result in the lower element of "dst", and copy the upper element from "a" to the upper element of "dst". The maximum relative error for this approximation is less than 2^-14.</description>
+	<operation>
+dst[63:0] := APPROXIMATE(1.0/b[63:0])
+dst[127:64] := a[127:64]
+dst[MAX:128] := 0
+	</operation>
+	<instruction name='vrcp14sd' form='xmm {k}, xmm, xmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128" name="_mm_mask_rcp14_ss">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Elementary Math Functions</category>
+	<parameter varname="src" type="__m128"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128"/>
+	<parameter varname="b" type="__m128"/>
+	<description>Compute the approximate reciprocal of the lower single-precision (32-bit) floating-point element in "b", store the result in the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper 3 packed elements from "a" to the upper elements of "dst". The maximum relative error for this approximation is less than 2^-14.</description>
+	<operation>
+IF k[0]
+	dst[31:0] := APPROXIMATE(1.0/b[31:0])
+ELSE
+	dst[31:0] := src[31:0]
+FI
+dst[127:32] := a[127:32]
+dst[MAX:128] := 0
+	</operation>
+	<instruction name='vrcp14ss' form='xmm {k}, xmm, xmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128" name="_mm_maskz_rcp14_ss">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Elementary Math Functions</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128"/>
+	<parameter varname="b" type="__m128"/>
+	<description>Compute the approximate reciprocal of the lower single-precision (32-bit) floating-point element in "b", store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from "a" to the upper elements of "dst". The maximum relative error for this approximation is less than 2^-14.</description>
+	<operation>
+IF k[0]
+	dst[31:0] := APPROXIMATE(1.0/b[31:0])
+ELSE
+	dst[31:0] := 0
+FI
+dst[127:32] := a[127:32]
+dst[MAX:128] := 0
+	</operation>
+	<instruction name='vrcp14ss' form='xmm {k}, xmm, xmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128" name="_mm_rcp14_ss">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Elementary Math Functions</category>
+	<parameter varname="a" type="__m128"/>
+	<parameter varname="b" type="__m128"/>
+	<description>Compute the approximate reciprocal of the lower single-precision (32-bit) floating-point element in "b", store the result in the lower element of "dst", and copy the upper 3 packed elements from "a" to the upper elements of "dst". The maximum relative error for this approximation is less than 2^-14.</description>
+	<operation>
+dst[31:0] := APPROXIMATE(1.0/b[31:0])
+dst[127:32] := a[127:32]
+dst[MAX:128] := 0
+	</operation>
+	<instruction name='vrcp14ss' form='xmm {k}, xmm, xmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512d" name="_mm512_mask_roundscale_pd">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="src" type="__m512d"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m512d"/>
+	<parameter varname="imm8" type="int"/>
+	<description>Round packed double-precision (64-bit) floating-point elements in "a" to the number of fraction bits specified by "imm8", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). </description>
+	<operation>
+RoundTo_IntegerPD(src[63:0], imm8[7:0]){
+	IF(imm8[2] == 1)
+		rounding_direction := MXCSR.RC //Use the rounding mode specified by MXCSR.RC
+	ELSE
+		rounding_direction := imm8[1:0] //Use the rounding mode specified by imm8[1:0]
+	FI
+	
+	M := imm8[7:4] // The scaling factor (number of fraction bits to round to)
+	
+	CASE(rounding_direction)
+	0: tmp[63:0] := round_to_nearest_even_integer(2^M * src[63:0])
+	1: tmp[63:0] := round_to_equal_or_smaller_integer(2^M * src[63:0])
+	2: tmp[63:0] := round_to_equal_or_larger_integer(2^M * src[63:0])
+	3: tmp[63:0] := round_to_nearest_smallest_magnitude_integer(2^M * src[63:0])
+	ESAC
+	
+	dst[63:0] := 2^-M * tmp[63:0] // scale back down
+	
+	IF imm8[3] == 0 //check SPE
+		IF src[63:0] != dst[63:0] //check if precision has been lost
+			set_precision() //set #PE
+		FI
+	FI
+	RETURN dst[63:0]
+}	
+
+FOR j := 0 to 7
+	i := j*64
+	IF k[j]
+		dst[i+63:i] := RoundTo_IntegerPD(a[i+63:i], imm8[7:0])
+	ELSE
+		dst[i+63:i] := src[i+63:i]
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vrndscalepd' form='zmm {k}, zmm, imm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512d" name="_mm512_mask_roundscale_round_pd">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="src" type="__m512d"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m512d"/>
+	<parameter varname="imm8" type="int"/>
+	<parameter varname="rounding" type="int"/>
+	<description>Round packed double-precision (64-bit) floating-point elements in "a" to the number of fraction bits specified by "imm8", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).
+	[round_note]</description>
+	<operation>
+RoundTo_IntegerPD(src[63:0], imm8[7:0]){
+	IF(imm8[2] == 1)
+		rounding_direction := MXCSR.RC //Use the rounding mode specified by MXCSR.RC
+	ELSE
+		rounding_direction := imm8[1:0] //Use the rounding mode specified by imm8[1:0]
+	FI
+	
+	M := imm8[7:4] // The scaling factor (number of fraction bits to round to)
+	
+	CASE(rounding_direction)
+	0: tmp[63:0] := round_to_nearest_even_integer(2^M * src[63:0])
+	1: tmp[63:0] := round_to_equal_or_smaller_integer(2^M * src[63:0])
+	2: tmp[63:0] := round_to_equal_or_larger_integer(2^M * src[63:0])
+	3: tmp[63:0] := round_to_nearest_smallest_magnitude_integer(2^M * src[63:0])
+	ESAC
+	
+	dst[63:0] := 2^-M * tmp[63:0] // scale back down
+	
+	IF imm8[3] == 0 //check SPE
+		IF src[63:0] != dst[63:0] //check if precision has been lost
+			set_precision() //set #PE
+		FI
+	FI
+	RETURN dst[63:0]
+}	
+
+FOR j := 0 to 7
+	i := j*64
+	IF k[j]
+		dst[i+63:i] := RoundTo_IntegerPD(a[i+63:i], imm8[7:0])
+	ELSE
+		dst[i+63:i] := src[i+63:i]
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vrndscalepd' form='zmm {k}, zmm, imm {er}'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512d" name="_mm512_maskz_roundscale_pd">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m512d"/>
+	<parameter varname="imm8" type="int"/>
+	<description>Round packed double-precision (64-bit) floating-point elements in "a" to the number of fraction bits specified by "imm8", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+	<operation>
+RoundTo_IntegerPD(src[63:0], imm8[7:0]){
+	IF(imm8[2] == 1)
+		rounding_direction := MXCSR.RC //Use the rounding mode specified by MXCSR.RC
+	ELSE
+		rounding_direction := imm8[1:0] //Use the rounding mode specified by imm8[1:0]
+	FI
+	
+	M := imm8[7:4] // The scaling factor (number of fraction bits to round to)
+	
+	CASE(rounding_direction)
+	0: tmp[63:0] := round_to_nearest_even_integer(2^M * src[63:0])
+	1: tmp[63:0] := round_to_equal_or_smaller_integer(2^M * src[63:0])
+	2: tmp[63:0] := round_to_equal_or_larger_integer(2^M * src[63:0])
+	3: tmp[63:0] := round_to_nearest_smallest_magnitude_integer(2^M * src[63:0])
+	ESAC
+	
+	dst[63:0] := 2^-M * tmp[63:0] // scale back down
+	
+	IF imm8[3] == 0 //check SPE
+		IF src[63:0] != dst[63:0] //check if precision has been lost
+			set_precision() //set #PE
+		FI
+	FI
+	RETURN dst[63:0]
+}	
+
+FOR j := 0 to 7
+	i := j*64
+	IF k[j]
+		dst[i+63:i] := RoundTo_IntegerPD(a[i+63:i], imm8[7:0])
+	ELSE
+		dst[i+63:i] := 0
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vrndscalepd' form='zmm {k}, zmm, imm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512d" name="_mm512_maskz_roundscale_round_pd">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m512d"/>
+	<parameter varname="imm8" type="int"/>
+	<parameter varname="rounding" type="int"/>
+	<description>Round packed double-precision (64-bit) floating-point elements in "a" to the number of fraction bits specified by "imm8", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).
+	[round_note]</description>
+	<operation>
+RoundTo_IntegerPD(src[63:0], imm8[7:0]){
+	IF(imm8[2] == 1)
+		rounding_direction := MXCSR.RC //Use the rounding mode specified by MXCSR.RC
+	ELSE
+		rounding_direction := imm8[1:0] //Use the rounding mode specified by imm8[1:0]
+	FI
+	
+	M := imm8[7:4] // The scaling factor (number of fraction bits to round to)
+	
+	CASE(rounding_direction)
+	0: tmp[63:0] := round_to_nearest_even_integer(2^M * src[63:0])
+	1: tmp[63:0] := round_to_equal_or_smaller_integer(2^M * src[63:0])
+	2: tmp[63:0] := round_to_equal_or_larger_integer(2^M * src[63:0])
+	3: tmp[63:0] := round_to_nearest_smallest_magnitude_integer(2^M * src[63:0])
+	ESAC
+	
+	dst[63:0] := 2^-M * tmp[63:0] // scale back down
+	
+	IF imm8[3] == 0 //check SPE
+		IF src[63:0] != dst[63:0] //check if precision has been lost
+			set_precision() //set #PE
+		FI
+	FI
+	RETURN dst[63:0]
+}	
+
+FOR j := 0 to 7
+	i := j*64
+	IF k[j]
+		dst[i+63:i] := RoundTo_IntegerPD(a[i+63:i], imm8[7:0])
+	ELSE
+		dst[i+63:i] := 0
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vrndscalepd' form='zmm {k}, zmm, imm {er}'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512d" name="_mm512_roundscale_pd">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="a" type="__m512d"/>
+	<parameter varname="imm8" type="int"/>
+	<description>Round packed double-precision (64-bit) floating-point elements in "a" to the number of fraction bits specified by "imm8", and store the results in "dst".
+	</description>
+	<operation>
+RoundTo_IntegerPD(src[63:0], imm8[7:0]){
+	IF(imm8[2] == 1)
+		rounding_direction := MXCSR.RC //Use the rounding mode specified by MXCSR.RC
+	ELSE
+		rounding_direction := imm8[1:0] //Use the rounding mode specified by imm8[1:0]
+	FI
+	
+	M := imm8[7:4] // The scaling factor (number of fraction bits to round to)
+	
+	CASE(rounding_direction)
+	0: tmp[63:0] := round_to_nearest_even_integer(2^M * src[63:0])
+	1: tmp[63:0] := round_to_equal_or_smaller_integer(2^M * src[63:0])
+	2: tmp[63:0] := round_to_equal_or_larger_integer(2^M * src[63:0])
+	3: tmp[63:0] := round_to_nearest_smallest_magnitude_integer(2^M * src[63:0])
+	ESAC
+	
+	dst[63:0] := 2^-M * tmp[63:0] // scale back down
+	
+	IF imm8[3] == 0 //check SPE
+		IF src[63:0] != dst[63:0] //check if precision has been lost
+			set_precision() //set #PE
+		FI
+	FI
+	RETURN dst[63:0]
+}	
+
+FOR j := 0 to 7
+	i := j*64
+	dst[i+63:i] := RoundTo_IntegerPD(a[i+63:i], imm8[7:0])
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vrndscalepd' form='zmm {k}, zmm, imm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512d" name="_mm512_roundscale_round_pd">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="a" type="__m512d"/>
+	<parameter varname="imm8" type="int"/>
+	<parameter varname="rounding" type="int"/>
+	<description>Round packed double-precision (64-bit) floating-point elements in "a" to the number of fraction bits specified by "imm8", and store the results in "dst".
+	[round_note]</description>
+	<operation>
+RoundTo_IntegerPD(src[63:0], imm8[7:0]){
+	IF(imm8[2] == 1)
+		rounding_direction := MXCSR.RC //Use the rounding mode specified by MXCSR.RC
+	ELSE
+		rounding_direction := imm8[1:0] //Use the rounding mode specified by imm8[1:0]
+	FI
+	
+	M := imm8[7:4] // The scaling factor (number of fraction bits to round to)
+	
+	CASE(rounding_direction)
+	0: tmp[63:0] := round_to_nearest_even_integer(2^M * src[63:0])
+	1: tmp[63:0] := round_to_equal_or_smaller_integer(2^M * src[63:0])
+	2: tmp[63:0] := round_to_equal_or_larger_integer(2^M * src[63:0])
+	3: tmp[63:0] := round_to_nearest_smallest_magnitude_integer(2^M * src[63:0])
+	ESAC
+	
+	dst[63:0] := 2^-M * tmp[63:0] // scale back down
+	
+	IF imm8[3] == 0 //check SPE
+		IF src[63:0] != dst[63:0] //check if precision has been lost
+			set_precision() //set #PE
+		FI
+	FI
+	RETURN dst[63:0]
+}	
+
+FOR j := 0 to 7
+	i := j*64
+	dst[i+63:i] := RoundTo_IntegerPD(a[i+63:i], imm8[7:0])
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vrndscalepd' form='zmm {k}, zmm, imm {er}'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512" name="_mm512_mask_roundscale_ps">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="src" type="__m512"/>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="a" type="__m512"/>
+	<parameter varname="imm8" type="int"/>
+	<description>Round packed single-precision (32-bit) floating-point elements in "a" to the number of fraction bits specified by "imm8", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). </description>
+	<operation>
+RoundTo_IntegerPS(src[31:0], imm8[7:0]){
+	IF(imm8[2] == 1)
+		rounding_direction := MXCSR.RC //Use the rounding mode specified by MXCSR.RC
+	ELSE
+		rounding_direction := imm8[1:0] //Use the rounding mode specified by imm8[1:0]
+	FI
+	
+	M := imm8[7:4] // The scaling factor (number of fraction bits to round to)
+	
+	CASE(rounding_direction)
+	0: tmp[31:0] := round_to_nearest_even_integer(2^M * src[31:0])
+	1: tmp[31:0] := round_to_equal_or_smaller_integer(2^M * src[31:0])
+	2: tmp[31:0] := round_to_equal_or_larger_integer(2^M * src[31:0])
+	3: tmp[31:0] := round_to_nearest_smallest_magnitude_integer(2^M * src[31:0])
+	ESAC
+	
+	dst[31:0] := 2^-M * tmp[31:0] // scale back down
+	
+	IF imm8[3] == 0 //check SPE
+		IF src[31:0] != dst[31:0] //check if precision has been lost
+			set_precision() //set #PE
+		FI
+	FI
+	RETURN dst[31:0]
+}	
+
+FOR j := 0 to 15
+	i := j*32
+	IF k[j]
+		dst[i+31:i] := RoundTo_IntegerPS(a[i+31:i], imm8[7:0])
+	ELSE
+		dst[i+31:i] := src[i+31:i]
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vrndscaleps' form='zmm {k}, zmm, imm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512" name="_mm512_mask_roundscale_round_ps">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="src" type="__m512"/>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="a" type="__m512"/>
+	<parameter varname="imm8" type="int"/>
+	<parameter varname="rounding" type="int"/>
+	<description>Round packed single-precision (32-bit) floating-point elements in "a" to the number of fraction bits specified by "imm8", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).
+	[round_note]</description>
+	<operation>
+RoundTo_IntegerPS(src[31:0], imm8[7:0]){
+	IF(imm8[2] == 1)
+		rounding_direction := MXCSR.RC //Use the rounding mode specified by MXCSR.RC
+	ELSE
+		rounding_direction := imm8[1:0] //Use the rounding mode specified by imm8[1:0]
+	FI
+	
+	M := imm8[7:4] // The scaling factor (number of fraction bits to round to)
+	
+	CASE(rounding_direction)
+	0: tmp[31:0] := round_to_nearest_even_integer(2^M * src[31:0])
+	1: tmp[31:0] := round_to_equal_or_smaller_integer(2^M * src[31:0])
+	2: tmp[31:0] := round_to_equal_or_larger_integer(2^M * src[31:0])
+	3: tmp[31:0] := round_to_nearest_smallest_magnitude_integer(2^M * src[31:0])
+	ESAC
+	
+	dst[31:0] := 2^-M * tmp[31:0] // scale back down
+	
+	IF imm8[3] == 0 //check SPE
+		IF src[31:0] != dst[31:0] //check if precision has been lost
+			set_precision() //set #PE
+		FI
+	FI
+	RETURN dst[31:0]
+}	
+
+FOR j := 0 to 15
+	i := j*32
+	IF k[j]
+		dst[i+31:i] := RoundTo_IntegerPS(a[i+31:i], imm8[7:0])
+	ELSE
+		dst[i+31:i] := src[i+31:i]
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vrndscaleps' form='zmm {k}, zmm, imm {er}'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512" name="_mm512_maskz_roundscale_ps">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="a" type="__m512"/>
+	<parameter varname="imm8" type="int"/>
+	<description>Round packed single-precision (32-bit) floating-point elements in "a" to the number of fraction bits specified by "imm8", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+	<operation>
+RoundTo_IntegerPS(src[31:0], imm8[7:0]){
+	IF(imm8[2] == 1)
+		rounding_direction := MXCSR.RC //Use the rounding mode specified by MXCSR.RC
+	ELSE
+		rounding_direction := imm8[1:0] //Use the rounding mode specified by imm8[1:0]
+	FI
+	
+	M := imm8[7:4] // The scaling factor (number of fraction bits to round to)
+	
+	CASE(rounding_direction)
+	0: tmp[31:0] := round_to_nearest_even_integer(2^M * src[31:0])
+	1: tmp[31:0] := round_to_equal_or_smaller_integer(2^M * src[31:0])
+	2: tmp[31:0] := round_to_equal_or_larger_integer(2^M * src[31:0])
+	3: tmp[31:0] := round_to_nearest_smallest_magnitude_integer(2^M * src[31:0])
+	ESAC
+	
+	dst[31:0] := 2^-M * tmp[31:0] // scale back down
+	
+	IF imm8[3] == 0 //check SPE
+		IF src[31:0] != dst[31:0] //check if precision has been lost
+			set_precision() //set #PE
+		FI
+	FI
+	RETURN dst[31:0]
+}	
+
+FOR j := 0 to 15
+	i := j*32
+	IF k[j]
+		dst[i+31:i] := RoundTo_IntegerPS(a[i+31:i], imm8[7:0])
+	ELSE
+		dst[i+31:i] := 0
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vrndscaleps' form='zmm {k}, zmm, imm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512" name="_mm512_maskz_roundscale_round_ps">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="a" type="__m512"/>
+	<parameter varname="imm8" type="int"/>
+	<parameter varname="rounding" type="int"/>
+	<description>Round packed single-precision (32-bit) floating-point elements in "a" to the number of fraction bits specified by "imm8", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).
+	[round_note]</description>
+	<operation>
+RoundTo_IntegerPS(src[31:0], imm8[7:0]){
+	IF(imm8[2] == 1)
+		rounding_direction := MXCSR.RC //Use the rounding mode specified by MXCSR.RC
+	ELSE
+		rounding_direction := imm8[1:0] //Use the rounding mode specified by imm8[1:0]
+	FI
+	
+	M := imm8[7:4] // The scaling factor (number of fraction bits to round to)
+	
+	CASE(rounding_direction)
+	0: tmp[31:0] := round_to_nearest_even_integer(2^M * src[31:0])
+	1: tmp[31:0] := round_to_equal_or_smaller_integer(2^M * src[31:0])
+	2: tmp[31:0] := round_to_equal_or_larger_integer(2^M * src[31:0])
+	3: tmp[31:0] := round_to_nearest_smallest_magnitude_integer(2^M * src[31:0])
+	ESAC
+	
+	dst[31:0] := 2^-M * tmp[31:0] // scale back down
+	
+	IF imm8[3] == 0 //check SPE
+		IF src[31:0] != dst[31:0] //check if precision has been lost
+			set_precision() //set #PE
+		FI
+	FI
+	RETURN dst[31:0]
+}	
+
+FOR j := 0 to 15
+	i := j*32
+	IF k[j]
+		dst[i+31:i] := RoundTo_IntegerPS(a[i+31:i], imm8[7:0])
+	ELSE
+		dst[i+31:i] := 0
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vrndscaleps' form='zmm {k}, zmm, imm {er}'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512" name="_mm512_roundscale_ps">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="a" type="__m512"/>
+	<parameter varname="imm8" type="int"/>
+	<description>Round packed single-precision (32-bit) floating-point elements in "a" to the number of fraction bits specified by "imm8", and store the results in "dst".
+	</description>
+	<operation>
+RoundTo_IntegerPS(src[31:0], imm8[7:0]){
+	IF(imm8[2] == 1)
+		rounding_direction := MXCSR.RC //Use the rounding mode specified by MXCSR.RC
+	ELSE
+		rounding_direction := imm8[1:0] //Use the rounding mode specified by imm8[1:0]
+	FI
+	
+	M := imm8[7:4] // The scaling factor (number of fraction bits to round to)
+	
+	CASE(rounding_direction)
+	0: tmp[31:0] := round_to_nearest_even_integer(2^M * src[31:0])
+	1: tmp[31:0] := round_to_equal_or_smaller_integer(2^M * src[31:0])
+	2: tmp[31:0] := round_to_equal_or_larger_integer(2^M * src[31:0])
+	3: tmp[31:0] := round_to_nearest_smallest_magnitude_integer(2^M * src[31:0])
+	ESAC
+	
+	dst[31:0] := 2^-M * tmp[31:0] // scale back down
+	
+	IF imm8[3] == 0 //check SPE
+		IF src[31:0] != dst[31:0] //check if precision has been lost
+			set_precision() //set #PE
+		FI
+	FI
+	RETURN dst[31:0]
+}	
+
+FOR j := 0 to 15
+	i := j*32
+	dst[i+31:i] := RoundTo_IntegerPS(a[i+31:i], imm8[7:0])
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vrndscaleps' form='zmm {k}, zmm, imm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512" name="_mm512_roundscale_round_ps">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="a" type="__m512"/>
+	<parameter varname="imm8" type="int"/>
+	<parameter varname="rounding" type="int"/>
+	<description>Round packed single-precision (32-bit) floating-point elements in "a" to the number of fraction bits specified by "imm8", and store the results in "dst".
+	[round_note]</description>
+	<operation>
+RoundTo_IntegerPS(src[31:0], imm8[7:0]){
+	IF(imm8[2] == 1)
+		rounding_direction := MXCSR.RC //Use the rounding mode specified by MXCSR.RC
+	ELSE
+		rounding_direction := imm8[1:0] //Use the rounding mode specified by imm8[1:0]
+	FI
+	
+	M := imm8[7:4] // The scaling factor (number of fraction bits to round to)
+	
+	CASE(rounding_direction)
+	0: tmp[31:0] := round_to_nearest_even_integer(2^M * src[31:0])
+	1: tmp[31:0] := round_to_equal_or_smaller_integer(2^M * src[31:0])
+	2: tmp[31:0] := round_to_equal_or_larger_integer(2^M * src[31:0])
+	3: tmp[31:0] := round_to_nearest_smallest_magnitude_integer(2^M * src[31:0])
+	ESAC
+	
+	dst[31:0] := 2^-M * tmp[31:0] // scale back down
+	
+	IF imm8[3] == 0 //check SPE
+		IF src[31:0] != dst[31:0] //check if precision has been lost
+			set_precision() //set #PE
+		FI
+	FI
+	RETURN dst[31:0]
+}	
+
+FOR j := 0 to 15
+	i := j*32
+	dst[i+31:i] := RoundTo_IntegerPS(a[i+31:i], imm8[7:0])
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vrndscaleps' form='zmm {k}, zmm, imm {er}'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128d" name="_mm_mask_roundscale_round_sd">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="src" type="__m128d"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128d"/>
+	<parameter varname="b" type="__m128d"/>
+	<parameter varname="imm8" type="const int"/>
+	<parameter varname="rounding" type="const int"/>
+	<description>Round the lower double-precision (64-bit) floating-point element in "a" to the number of fraction bits specified by "imm8", store the result in the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper element from "b" to the upper element of "dst".
+	[round_note]</description>
+	<operation>
+RoundTo_IntegerPD(src[63:0], imm8[7:0]){
+	IF(imm8[2] == 1)
+		rounding_direction := MXCSR.RC //Use the rounding mode specified by MXCSR.RC
+	ELSE
+		rounding_direction := imm8[1:0] //Use the rounding mode specified by imm8[1:0]
+	FI
+	
+	M := imm8[7:4] // The scaling factor (number of fraction bits to round to)
+	
+	CASE(rounding_direction)
+	0: tmp[63:0] := round_to_nearest_even_integer(2^M * src[63:0])
+	1: tmp[63:0] := round_to_equal_or_smaller_integer(2^M * src[63:0])
+	2: tmp[63:0] := round_to_equal_or_larger_integer(2^M * src[63:0])
+	3: tmp[63:0] := round_to_nearest_smallest_magnitude_integer(2^M * src[63:0])
+	ESAC
+	
+	dst[63:0] := 2^-M * tmp[63:0] // scale back down
+	
+	IF imm8[3] == 0 //check SPE
+		IF src[63:0] != dst[63:0] //check if precision has been lost
+			set_precision() //set #PE
+		FI
+	FI
+	RETURN dst[63:0]
+}		
+
+IF k[0]
+	dst[63:0] := RoundTo_IntegerPD(a[63:0], imm8[7:0])
+ELSE
+	dst[63:0] := src[63:0]
+FI
+dst[127:64] := b[127:64]
+dst[MAX:128] := 0
+	</operation>
+	<instruction name='vrndscalesd' form='xmm {k}, xmm, xmm, imm {er}'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128d" name="_mm_mask_roundscale_sd">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="src" type="__m128d"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128d"/>
+	<parameter varname="b" type="__m128d"/>
+	<parameter varname="imm8" type="const int"/>
+	<description>Round the lower double-precision (64-bit) floating-point element in "a" to the number of fraction bits specified by "imm8", store the result in the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper element from "b" to the upper element of "dst".</description>
+	<operation>
+RoundTo_IntegerPD(src[63:0], imm8[7:0]){
+	IF(imm8[2] == 1)
+		rounding_direction := MXCSR.RC //Use the rounding mode specified by MXCSR.RC
+	ELSE
+		rounding_direction := imm8[1:0] //Use the rounding mode specified by imm8[1:0]
+	FI
+	
+	M := imm8[7:4] // The scaling factor (number of fraction bits to round to)
+	
+	CASE(rounding_direction)
+	0: tmp[63:0] := round_to_nearest_even_integer(2^M * src[63:0])
+	1: tmp[63:0] := round_to_equal_or_smaller_integer(2^M * src[63:0])
+	2: tmp[63:0] := round_to_equal_or_larger_integer(2^M * src[63:0])
+	3: tmp[63:0] := round_to_nearest_smallest_magnitude_integer(2^M * src[63:0])
+	ESAC
+	
+	dst[63:0] := 2^-M * tmp[63:0] // scale back down
+	
+	IF imm8[3] == 0 //check SPE
+		IF src[63:0] != dst[63:0] //check if precision has been lost
+			set_precision() //set #PE
+		FI
+	FI
+	RETURN dst[63:0]
+}		
+
+IF k[0]
+	dst[63:0] := RoundTo_IntegerPD(a[63:0], imm8[7:0])
+ELSE
+	dst[63:0] := src[63:0]
+FI
+dst[127:64] := b[127:64]
+dst[MAX:128] := 0
+	</operation>
+	<instruction name='vrndscalesd' form='xmm {k}, xmm, xmm, imm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128d" name="_mm_maskz_roundscale_round_sd">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128d"/>
+	<parameter varname="b" type="__m128d"/>
+	<parameter varname="imm8" type="const int"/>
+	<parameter varname="rounding" type="const int"/>
+	<description>Round the lower double-precision (64-bit) floating-point element in "a" to the number of fraction bits specified by "imm8", store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper element from "b" to the upper element of "dst".
+	[round_note]</description>
+	<operation>
+RoundTo_IntegerPD(src[63:0], imm8[7:0]){
+	IF(imm8[2] == 1)
+		rounding_direction := MXCSR.RC //Use the rounding mode specified by MXCSR.RC
+	ELSE
+		rounding_direction := imm8[1:0] //Use the rounding mode specified by imm8[1:0]
+	FI
+	
+	M := imm8[7:4] // The scaling factor (number of fraction bits to round to)
+	
+	CASE(rounding_direction)
+	0: tmp[63:0] := round_to_nearest_even_integer(2^M * src[63:0])
+	1: tmp[63:0] := round_to_equal_or_smaller_integer(2^M * src[63:0])
+	2: tmp[63:0] := round_to_equal_or_larger_integer(2^M * src[63:0])
+	3: tmp[63:0] := round_to_nearest_smallest_magnitude_integer(2^M * src[63:0])
+	ESAC
+	
+	dst[63:0] := 2^-M * tmp[63:0] // scale back down
+	
+	IF imm8[3] == 0 //check SPE
+		IF src[63:0] != dst[63:0] //check if precision has been lost
+			set_precision() //set #PE
+		FI
+	FI
+	RETURN dst[63:0]
+}		
+
+IF k[0]
+	dst[63:0] := RoundTo_IntegerPD(a[63:0], imm8[7:0])
+ELSE
+	dst[63:0] := 0
+FI
+dst[127:64] := b[127:64]
+dst[MAX:128] := 0
+	</operation>
+	<instruction name='vrndscalesd' form='xmm {k}, xmm, xmm, imm {er}'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128d" name="_mm_maskz_roundscale_sd">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128d"/>
+	<parameter varname="b" type="__m128d"/>
+	<parameter varname="imm8" type="const int"/>
+	<description>Round the lower double-precision (64-bit) floating-point element in "a" to the number of fraction bits specified by "imm8", store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper element from "b" to the upper element of "dst".</description>
+	<operation>
+RoundTo_IntegerPD(src[63:0], imm8[7:0]){
+	IF(imm8[2] == 1)
+		rounding_direction := MXCSR.RC //Use the rounding mode specified by MXCSR.RC
+	ELSE
+		rounding_direction := imm8[1:0] //Use the rounding mode specified by imm8[1:0]
+	FI
+	
+	M := imm8[7:4] // The scaling factor (number of fraction bits to round to)
+	
+	CASE(rounding_direction)
+	0: tmp[63:0] := round_to_nearest_even_integer(2^M * src[63:0])
+	1: tmp[63:0] := round_to_equal_or_smaller_integer(2^M * src[63:0])
+	2: tmp[63:0] := round_to_equal_or_larger_integer(2^M * src[63:0])
+	3: tmp[63:0] := round_to_nearest_smallest_magnitude_integer(2^M * src[63:0])
+	ESAC
+	
+	dst[63:0] := 2^-M * tmp[63:0] // scale back down
+	
+	IF imm8[3] == 0 //check SPE
+		IF src[63:0] != dst[63:0] //check if precision has been lost
+			set_precision() //set #PE
+		FI
+	FI
+	RETURN dst[63:0]
+}		
+
+IF k[0]
+	dst[63:0] := RoundTo_IntegerPD(a[63:0], imm8[7:0])
+ELSE
+	dst[63:0] := 0
+FI
+dst[127:64] := b[127:64]
+dst[MAX:128] := 0
+	</operation>
+	<instruction name='vrndscalesd' form='xmm {k}, xmm, xmm, imm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128d" name="_mm_roundscale_round_sd">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="a" type="__m128d"/>
+	<parameter varname="b" type="__m128d"/>
+	<parameter varname="imm8" type="const int"/>
+	<parameter varname="rounding" type="const int"/>
+	<description>Round the lower double-precision (64-bit) floating-point element in "a" to the number of fraction bits specified by "imm8", store the result in the lower element of "dst", and copy the upper element from "b" to the upper element of "dst".
+	[round_note]</description>
+	<operation>
+RoundTo_IntegerPD(src[63:0], imm8[7:0]){
+	IF(imm8[2] == 1)
+		rounding_direction := MXCSR.RC //Use the rounding mode specified by MXCSR.RC
+	ELSE
+		rounding_direction := imm8[1:0] //Use the rounding mode specified by imm8[1:0]
+	FI
+	
+	M := imm8[7:4] // The scaling factor (number of fraction bits to round to)
+	
+	CASE(rounding_direction)
+	0: tmp[63:0] := round_to_nearest_even_integer(2^M * src[63:0])
+	1: tmp[63:0] := round_to_equal_or_smaller_integer(2^M * src[63:0])
+	2: tmp[63:0] := round_to_equal_or_larger_integer(2^M * src[63:0])
+	3: tmp[63:0] := round_to_nearest_smallest_magnitude_integer(2^M * src[63:0])
+	ESAC
+	
+	dst[63:0] := 2^-M * tmp[63:0] // scale back down
+	
+	IF imm8[3] == 0 //check SPE
+		IF src[63:0] != dst[63:0] //check if precision has been lost
+			set_precision() //set #PE
+		FI
+	FI
+	RETURN dst[63:0]
+}		
+
+dst[63:0] := RoundTo_IntegerPD(a[63:0], imm8[7:0])
+dst[127:64] := b[127:64]
+dst[MAX:128] := 0
+	</operation>
+	<instruction name='vrndscalesd' form='xmm {k}, xmm, xmm, imm {er}'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128d" name="_mm_roundscale_sd">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="a" type="__m128d"/>
+	<parameter varname="b" type="__m128d"/>
+	<parameter varname="imm8" type="const int"/>
+	<description>Round the lower double-precision (64-bit) floating-point element in "a" to the number of fraction bits specified by "imm8", store the result in the lower element of "dst", and copy the upper element from "b" to the upper element of "dst".</description>
+	<operation>
+RoundTo_IntegerPD(src[63:0], imm8[7:0]){
+	IF(imm8[2] == 1)
+		rounding_direction := MXCSR.RC //Use the rounding mode specified by MXCSR.RC
+	ELSE
+		rounding_direction := imm8[1:0] //Use the rounding mode specified by imm8[1:0]
+	FI
+	
+	M := imm8[7:4] // The scaling factor (number of fraction bits to round to)
+	
+	CASE(rounding_direction)
+	0: tmp[63:0] := round_to_nearest_even_integer(2^M * src[63:0])
+	1: tmp[63:0] := round_to_equal_or_smaller_integer(2^M * src[63:0])
+	2: tmp[63:0] := round_to_equal_or_larger_integer(2^M * src[63:0])
+	3: tmp[63:0] := round_to_nearest_smallest_magnitude_integer(2^M * src[63:0])
+	ESAC
+	
+	dst[63:0] := 2^-M * tmp[63:0] // scale back down
+	
+	IF imm8[3] == 0 //check SPE
+		IF src[63:0] != dst[63:0] //check if precision has been lost
+			set_precision() //set #PE
+		FI
+	FI
+	RETURN dst[63:0]
+}		
+
+dst[63:0] := RoundTo_IntegerPD(a[63:0], imm8[7:0])
+dst[127:64] := b[127:64]
+dst[MAX:128] := 0
+	</operation>
+	<instruction name='vrndscalesd' form='xmm {k}, xmm, xmm, imm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128" name="_mm_mask_roundscale_round_ss">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="src" type="__m128"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128"/>
+	<parameter varname="b" type="__m128"/>
+	<parameter varname="imm8" type="const int"/>
+	<parameter varname="rounding" type="const int"/>
+	<description>Round the lower single-precision (32-bit) floating-point element in "a" to the number of fraction bits specified by "imm8", store the result in the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper 3 packed elements from "b" to the upper elements of "dst".
+	[round_note]</description>
+	<operation>
+RoundTo_IntegerPS(src[31:0], imm8[7:0]){
+	IF(imm8[2] == 1)
+		rounding_direction := MXCSR.RC //Use the rounding mode specified by MXCSR.RC
+	ELSE
+		rounding_direction := imm8[1:0] //Use the rounding mode specified by imm8[1:0]
+	FI
+	
+	M := imm8[7:4] // The scaling factor (number of fraction bits to round to)
+	
+	CASE(rounding_direction)
+	0: tmp[31:0] := round_to_nearest_even_integer(2^M * src[31:0])
+	1: tmp[31:0] := round_to_equal_or_smaller_integer(2^M * src[31:0])
+	2: tmp[31:0] := round_to_equal_or_larger_integer(2^M * src[31:0])
+	3: tmp[31:0] := round_to_nearest_smallest_magnitude_integer(2^M * src[31:0])
+	ESAC
+	
+	dst[31:0] := 2^-M * tmp[31:0] // scale back down
+	
+	IF imm8[3] == 0 //check SPE
+		IF src[31:0] != dst[31:0] //check if precision has been lost
+			set_precision() //set #PE
+		FI
+	FI
+	RETURN dst[31:0]
+}
+
+IF k[0]
+	dst[31:0] := RoundTo_IntegerPS(a[31:0], imm8[7:0])
+ELSE
+	dst[31:0] := src[31:0]
+FI
+dst[127:32] := b[127:32]
+dst[MAX:128] := 0
+	</operation>
+	<instruction name='vrndscaless' form='xmm {k}, xmm, xmm, imm {er}'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128" name="_mm_mask_roundscale_ss">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="src" type="__m128"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128"/>
+	<parameter varname="b" type="__m128"/>
+	<parameter varname="imm8" type="const int"/>
+	<description>Round the lower single-precision (32-bit) floating-point element in "a" to the number of fraction bits specified by "imm8", store the result in the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper 3 packed elements from "b" to the upper elements of "dst".</description>
+	<operation>
+RoundTo_IntegerPS(src[31:0], imm8[7:0]){
+	IF(imm8[2] == 1)
+		rounding_direction := MXCSR.RC //Use the rounding mode specified by MXCSR.RC
+	ELSE
+		rounding_direction := imm8[1:0] //Use the rounding mode specified by imm8[1:0]
+	FI
+	
+	M := imm8[7:4] // The scaling factor (number of fraction bits to round to)
+	
+	CASE(rounding_direction)
+	0: tmp[31:0] := round_to_nearest_even_integer(2^M * src[31:0])
+	1: tmp[31:0] := round_to_equal_or_smaller_integer(2^M * src[31:0])
+	2: tmp[31:0] := round_to_equal_or_larger_integer(2^M * src[31:0])
+	3: tmp[31:0] := round_to_nearest_smallest_magnitude_integer(2^M * src[31:0])
+	ESAC
+	
+	dst[31:0] := 2^-M * tmp[31:0] // scale back down
+	
+	IF imm8[3] == 0 //check SPE
+		IF src[31:0] != dst[31:0] //check if precision has been lost
+			set_precision() //set #PE
+		FI
+	FI
+	RETURN dst[31:0]
+}
+
+IF k[0]
+	dst[31:0] := RoundTo_IntegerPS(a[31:0], imm8[7:0])
+ELSE
+	dst[31:0] := src[31:0]
+FI
+dst[127:32] := b[127:32]
+dst[MAX:128] := 0
+	</operation>
+	<instruction name='vrndscaless' form='xmm {k}, xmm, xmm, imm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128" name="_mm_maskz_roundscale_round_ss">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128"/>
+	<parameter varname="b" type="__m128"/>
+	<parameter varname="imm8" type="const int"/>
+	<parameter varname="rounding" type="const int"/>
+	<description>Round the lower single-precision (32-bit) floating-point element in "a" to the number of fraction bits specified by "imm8", store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from "b" to the upper elements of "dst".
+	[round_note]</description>
+	<operation>
+RoundTo_IntegerPS(src[31:0], imm8[7:0]){
+	IF(imm8[2] == 1)
+		rounding_direction := MXCSR.RC //Use the rounding mode specified by MXCSR.RC
+	ELSE
+		rounding_direction := imm8[1:0] //Use the rounding mode specified by imm8[1:0]
+	FI
+	
+	M := imm8[7:4] // The scaling factor (number of fraction bits to round to)
+	
+	CASE(rounding_direction)
+	0: tmp[31:0] := round_to_nearest_even_integer(2^M * src[31:0])
+	1: tmp[31:0] := round_to_equal_or_smaller_integer(2^M * src[31:0])
+	2: tmp[31:0] := round_to_equal_or_larger_integer(2^M * src[31:0])
+	3: tmp[31:0] := round_to_nearest_smallest_magnitude_integer(2^M * src[31:0])
+	ESAC
+	
+	dst[31:0] := 2^-M * tmp[31:0] // scale back down
+	
+	IF imm8[3] == 0 //check SPE
+		IF src[31:0] != dst[31:0] //check if precision has been lost
+			set_precision() //set #PE
+		FI
+	FI
+	RETURN dst[31:0]
+}
+
+IF k[0]
+	dst[31:0] := RoundTo_IntegerPS(a[31:0], imm8[7:0])
+ELSE
+	dst[31:0] := 0
+FI
+dst[127:32] := b[127:32]
+dst[MAX:128] := 0
+	</operation>
+	<instruction name='vrndscaless' form='xmm {k}, xmm, xmm, imm {er}'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128" name="_mm_maskz_roundscale_ss">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128"/>
+	<parameter varname="b" type="__m128"/>
+	<parameter varname="imm8" type="const int"/>
+	<description>Round the lower single-precision (32-bit) floating-point element in "a" to the number of fraction bits specified by "imm8", store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from "b" to the upper elements of "dst".</description>
+	<operation>
+RoundTo_IntegerPS(src[31:0], imm8[7:0]){
+	IF(imm8[2] == 1)
+		rounding_direction := MXCSR.RC //Use the rounding mode specified by MXCSR.RC
+	ELSE
+		rounding_direction := imm8[1:0] //Use the rounding mode specified by imm8[1:0]
+	FI
+	
+	M := imm8[7:4] // The scaling factor (number of fraction bits to round to)
+	
+	CASE(rounding_direction)
+	0: tmp[31:0] := round_to_nearest_even_integer(2^M * src[31:0])
+	1: tmp[31:0] := round_to_equal_or_smaller_integer(2^M * src[31:0])
+	2: tmp[31:0] := round_to_equal_or_larger_integer(2^M * src[31:0])
+	3: tmp[31:0] := round_to_nearest_smallest_magnitude_integer(2^M * src[31:0])
+	ESAC
+	
+	dst[31:0] := 2^-M * tmp[31:0] // scale back down
+	
+	IF imm8[3] == 0 //check SPE
+		IF src[31:0] != dst[31:0] //check if precision has been lost
+			set_precision() //set #PE
+		FI
+	FI
+	RETURN dst[31:0]
+}
+
+IF k[0]
+	dst[31:0] := RoundTo_IntegerPS(a[31:0], imm8[7:0])
+ELSE
+	dst[31:0] := 0
+FI
+dst[127:32] := b[127:32]
+dst[MAX:128] := 0
+	</operation>
+	<instruction name='vrndscaless' form='xmm {k}, xmm, xmm, imm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128" name="_mm_roundscale_round_ss">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="a" type="__m128"/>
+	<parameter varname="b" type="__m128"/>
+	<parameter varname="imm8" type="const int"/>
+	<parameter varname="rounding" type="const int"/>
+	<description>Round the lower single-precision (32-bit) floating-point element in "a" to the number of fraction bits specified by "imm8", store the result in the lower element of "dst", and copy the upper 3 packed elements from "b" to the upper elements of "dst".
+	[round_note]</description>
+	<operation>
+RoundTo_IntegerPS(src[31:0], imm8[7:0]){
+	IF(imm8[2] == 1)
+		rounding_direction := MXCSR.RC //Use the rounding mode specified by MXCSR.RC
+	ELSE
+		rounding_direction := imm8[1:0] //Use the rounding mode specified by imm8[1:0]
+	FI
+	
+	M := imm8[7:4] // The scaling factor (number of fraction bits to round to)
+	
+	CASE(rounding_direction)
+	0: tmp[31:0] := round_to_nearest_even_integer(2^M * src[31:0])
+	1: tmp[31:0] := round_to_equal_or_smaller_integer(2^M * src[31:0])
+	2: tmp[31:0] := round_to_equal_or_larger_integer(2^M * src[31:0])
+	3: tmp[31:0] := round_to_nearest_smallest_magnitude_integer(2^M * src[31:0])
+	ESAC
+	
+	dst[31:0] := 2^-M * tmp[31:0] // scale back down
+	
+	IF imm8[3] == 0 //check SPE
+		IF src[31:0] != dst[31:0] //check if precision has been lost
+			set_precision() //set #PE
+		FI
+	FI
+	RETURN dst[31:0]
+}
+
+dst[31:0] := RoundTo_IntegerPS(a[31:0], imm8[7:0])
+dst[127:32] := b[127:32]
+dst[MAX:128] := 0
+	</operation>
+	<instruction name='vrndscaless' form='xmm {k}, xmm, xmm, imm {er}'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128" name="_mm_roundscale_ss">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="a" type="__m128"/>
+	<parameter varname="b" type="__m128"/>
+	<parameter varname="imm8" type="const int"/>
+	<description>Round the lower single-precision (32-bit) floating-point element in "a" to the number of fraction bits specified by "imm8", store the result in the lower element of "dst", and copy the upper 3 packed elements from "b" to the upper elements of "dst".</description>
+	<operation>
+RoundTo_IntegerPS(src[31:0], imm8[7:0]){
+	IF(imm8[2] == 1)
+		rounding_direction := MXCSR.RC //Use the rounding mode specified by MXCSR.RC
+	ELSE
+		rounding_direction := imm8[1:0] //Use the rounding mode specified by imm8[1:0]
+	FI
+	
+	M := imm8[7:4] // The scaling factor (number of fraction bits to round to)
+	
+	CASE(rounding_direction)
+	0: tmp[31:0] := round_to_nearest_even_integer(2^M * src[31:0])
+	1: tmp[31:0] := round_to_equal_or_smaller_integer(2^M * src[31:0])
+	2: tmp[31:0] := round_to_equal_or_larger_integer(2^M * src[31:0])
+	3: tmp[31:0] := round_to_nearest_smallest_magnitude_integer(2^M * src[31:0])
+	ESAC
+	
+	dst[31:0] := 2^-M * tmp[31:0] // scale back down
+	
+	IF imm8[3] == 0 //check SPE
+		IF src[31:0] != dst[31:0] //check if precision has been lost
+			set_precision() //set #PE
+		FI
+	FI
+	RETURN dst[31:0]
+}
+
+dst[31:0] := RoundTo_IntegerPS(a[31:0], imm8[7:0])
+dst[127:32] := b[127:32]
+dst[MAX:128] := 0
+	</operation>
+	<instruction name='vrndscaless' form='xmm {k}, xmm, xmm, imm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512d" name="_mm512_mask_rsqrt14_pd">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Elementary Math Functions</category>
+	<parameter varname="src" type="__m512d"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m512d"/>
+	<description>Compute the approximate reciprocal square root of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 2^-14.</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*64
+	IF k[j]
+		dst[i+63:i] := APPROXIMATE(1.0 / SQRT(a[i+63:i]))
+	ELSE
+		dst[i+63:i] := src[i+63:i]
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vrsqrt14pd' form='zmm {k}, zmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512d" name="_mm512_maskz_rsqrt14_pd">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Elementary Math Functions</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m512d"/>
+	<description>Compute the approximate reciprocal square root of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 2^-14.</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*64
+	IF k[j]
+		dst[i+63:i] := APPROXIMATE(1.0 / SQRT(a[i+63:i]))
+	ELSE
+		dst[i+63:i] := 0
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vrsqrt14pd' form='zmm {k}, zmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512d" name="_mm512_rsqrt14_pd">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Elementary Math Functions</category>
+	<parameter varname="a" type="__m512d"/>
+	<description>Compute the approximate reciprocal square root of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst". The maximum relative error for this approximation is less than 2^-14.</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*64
+	dst[i+63:i] := APPROXIMATE(1.0 / SQRT(a[i+63:i]))
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vrsqrt14pd' form='zmm {k}, zmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512" name="_mm512_mask_rsqrt14_ps">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Elementary Math Functions</category>
+	<parameter varname="src" type="__m512"/>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="a" type="__m512"/>
+	<description>Compute the approximate reciprocal square root of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 2^-14.</description>
+	<operation>
+FOR j := 0 to 15
+	i := j*32
+	IF k[j]
+		dst[i+31:i] := APPROXIMATE(1.0 / SQRT(a[i+31:i]))
+	ELSE
+		dst[i+31:i] := src[i+31:i]
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vrsqrt14ps' form='zmm {k}, zmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512" name="_mm512_maskz_rsqrt14_ps">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Elementary Math Functions</category>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="a" type="__m512"/>
+	<description>Compute the approximate reciprocal square root of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 2^-14.</description>
+	<operation>
+FOR j := 0 to 15
+	i := j*32
+	IF k[j]
+		dst[i+31:i] := APPROXIMATE(1.0 / SQRT(a[i+31:i]))
+	ELSE
+		dst[i+31:i] := 0
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vrsqrt14ps' form='zmm {k}, zmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512" name="_mm512_rsqrt14_ps">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Elementary Math Functions</category>
+	<parameter varname="a" type="__m512"/>
+	<description>Compute the approximate reciprocal square root of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst". The maximum relative error for this approximation is less than 2^-14.</description>
+	<operation>
+FOR j := 0 to 15
+	i := j*32
+	dst[i+31:i] := APPROXIMATE(1.0 / SQRT(a[i+31:i]))
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vrsqrt14ps' form='zmm {k}, zmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128d" name="_mm_mask_rsqrt14_sd">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Elementary Math Functions</category>
+	<parameter varname="src" type="__m128d"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128d"/>
+	<parameter varname="b" type="__m128d"/>
+	<description>Compute the approximate reciprocal square root of the lower double-precision (64-bit) floating-point element in "b", store the result in the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst". The maximum relative error for this approximation is less than 2^-14.</description>
+	<operation>
+IF k[0]
+	dst[63:0] := APPROXIMATE(1.0 / SQRT(b[63:0]))
+ELSE
+	dst[63:0] := src[63:0]
+FI
+dst[127:64] := a[127:64]
+dst[MAX:128] := 0
+	</operation>
+	<instruction name='vrsqrt14sd' form='xmm {k}, xmm, xmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128d" name="_mm_maskz_rsqrt14_sd">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Elementary Math Functions</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128d"/>
+	<parameter varname="b" type="__m128d"/>
+	<description>Compute the approximate reciprocal square root of the lower double-precision (64-bit) floating-point element in "b", store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst". The maximum relative error for this approximation is less than 2^-14.</description>
+	<operation>
+IF k[0]
+	dst[63:0] := APPROXIMATE(1.0 / SQRT(b[63:0]))
+ELSE
+	dst[63:0] := 0
+FI
+dst[127:64] := a[127:64]
+dst[MAX:128] := 0
+	</operation>
+	<instruction name='vrsqrt14sd' form='xmm {k}, xmm, xmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128d" name="_mm_rsqrt14_sd">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Elementary Math Functions</category>
+	<parameter varname="a" type="__m128d"/>
+	<parameter varname="b" type="__m128d"/>
+	<description>Compute the approximate reciprocal square root of the lower double-precision (64-bit) floating-point element in "b", store the result in the lower element of "dst", and copy the upper element from "a" to the upper element of "dst". The maximum relative error for this approximation is less than 2^-14.</description>
+	<operation>
+dst[63:0] := APPROXIMATE(1.0 / SQRT(b[63:0]))
+dst[127:64] := a[127:64]
+dst[MAX:128] := 0
+	</operation>
+	<instruction name='vrsqrt14sd' form='xmm {k}, xmm, xmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128" name="_mm_mask_rsqrt14_ss">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Elementary Math Functions</category>
+	<parameter varname="src" type="__m128"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128"/>
+	<parameter varname="b" type="__m128"/>
+	<description>Compute the approximate reciprocal square root of the lower single-precision (32-bit) floating-point element in "b", store the result in the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper 3 packed elements from "a" to the upper elements of "dst". The maximum relative error for this approximation is less than 2^-14.</description>
+	<operation>
+IF k[0]
+	dst[31:0] := APPROXIMATE(1.0 / SQRT(b[31:0]))
+ELSE
+	dst[31:0] := src[31:0]
+FI
+dst[127:32] := a[127:32]
+dst[MAX:128] := 0
+	</operation>
+	<instruction name='vrsqrt14ss' form='xmm {k}, xmm, xmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128" name="_mm_maskz_rsqrt14_ss">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Elementary Math Functions</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128"/>
+	<parameter varname="b" type="__m128"/>
+	<description>Compute the approximate reciprocal square root of the lower single-precision (32-bit) floating-point element in "b", store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from "a" to the upper elements of "dst". The maximum relative error for this approximation is less than 2^-14.</description>
+	<operation>
+IF k[0]
+	dst[31:0] := APPROXIMATE(1.0 / SQRT(b[31:0]))
+ELSE
+	dst[31:0] := 0
+FI
+dst[127:32] := a[127:32]
+dst[MAX:128] := 0
+	</operation>
+	<instruction name='vrsqrt14ss' form='xmm {k}, xmm, xmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128" name="_mm_rsqrt14_ss">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Elementary Math Functions</category>
+	<parameter varname="a" type="__m128"/>
+	<parameter varname="b" type="__m128"/>
+	<description>Compute the approximate reciprocal square root of the lower single-precision (32-bit) floating-point element in "b", store the result in the lower element of "dst", and copy the upper 3 packed elements from "a" to the upper elements of "dst". The maximum relative error for this approximation is less than 2^-14.</description>
+	<operation>
+dst[31:0] := APPROXIMATE(1.0 / SQRT(b[31:0]))
+dst[127:32] := a[127:32]
+dst[MAX:128] := 0
+	</operation>
+	<instruction name='vrsqrt14ss' form='xmm {k}, xmm, xmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512d" name="_mm512_mask_scalef_pd">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="src" type="__m512d"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m512d"/>
+	<parameter varname="b" type="__m512d"/>
+	<description>Scale the packed double-precision (64-bit) floating-point elements in "a" using values from "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+	<operation>
+SCALE(src1, src2){
+	IF (src2 == NaN)
+		IF (src2 == SNaN)
+			RETURN QNAN(src2)
+		FI
+	ELSE IF (src1 == NaN)
+		IF (src1 == SNaN)
+			RETURN QNAN(src1)
+		FI
+		IF (src2 != INF)
+			RETURN QNAN(src1)
+		FI
+	ELSE
+		tmp_src2 := src2
+		tmp_src1 := src1
+		IF (src2 is denormal AND MXCSR.DAZ)
+			tmp_src2 := 0
+		FI
+		IF (src1 is denormal AND MXCSR.DAZ)
+			tmp_src1 := 0
+		FI
+	FI
+	dst[63:0] := tmp_src1[63:0] * POW(2, FLOOR(tmp_src2[63:0]))
+	RETURN dst[63:0]
+}
+
+FOR j := 0 to 7
+	i := j*64
+	IF k[j]
+		dst[i+63:i] := SCALE(a[i+63:i], b[i+63:i])
+	ELSE
+		dst[i+63:i] := src[i+63:i]
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vscalefpd' form='zmm {k}, zmm, zmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512d" name="_mm512_mask_scalef_round_pd">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="src" type="__m512d"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m512d"/>
+	<parameter varname="b" type="__m512d"/>
+	<parameter varname="rounding" type="int"/>
+	<description>Scale the packed double-precision (64-bit) floating-point elements in "a" using values from "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).
+	[round_note]</description>
+	<operation>
+SCALE(src1, src2){
+	IF (src2 == NaN)
+		IF (src2 == SNaN)
+			RETURN QNAN(src2)
+		FI
+	ELSE IF (src1 == NaN)
+		IF (src1 == SNaN)
+			RETURN QNAN(src1)
+		FI
+		IF (src2 != INF)
+			RETURN QNAN(src1)
+		FI
+	ELSE
+		tmp_src2 := src2
+		tmp_src1 := src1
+		IF (src2 is denormal AND MXCSR.DAZ)
+			tmp_src2 := 0
+		FI
+		IF (src1 is denormal AND MXCSR.DAZ)
+			tmp_src1 := 0
+		FI
+	FI
+	dst[63:0] := tmp_src1[63:0] * POW(2, FLOOR(tmp_src2[63:0]))
+	RETURN dst[63:0]
+}
+
+FOR j := 0 to 7
+	i := j*64
+	IF k[j]
+		dst[i+63:i] := SCALE(a[i+63:i], b[i+63:i])
+	ELSE
+		dst[i+63:i] := src[i+63:i]
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vscalefpd' form='zmm {k}, zmm, zmm {er}'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512d" name="_mm512_maskz_scalef_pd">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m512d"/>
+	<parameter varname="b" type="__m512d"/>
+	<description>Scale the packed double-precision (64-bit) floating-point elements in "a" using values from "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+	<operation>
+SCALE(src1, src2){
+	IF (src2 == NaN)
+		IF (src2 == SNaN)
+			RETURN QNAN(src2)
+		FI
+	ELSE IF (src1 == NaN)
+		IF (src1 == SNaN)
+			RETURN QNAN(src1)
+		FI
+		IF (src2 != INF)
+			RETURN QNAN(src1)
+		FI
+	ELSE
+		tmp_src2 := src2
+		tmp_src1 := src1
+		IF (src2 is denormal AND MXCSR.DAZ)
+			tmp_src2 := 0
+		FI
+		IF (src1 is denormal AND MXCSR.DAZ)
+			tmp_src1 := 0
+		FI
+	FI
+	dst[63:0] := tmp_src1[63:0] * POW(2, FLOOR(tmp_src2[63:0]))
+	RETURN dst[63:0]
+}
+
+FOR j := 0 to 7
+	i := j*64
+	IF k[j]
+		dst[i+63:i] := SCALE(a[i+63:i], b[i+63:i])
+	ELSE
+		dst[i+63:i] := 0
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vscalefpd' form='zmm {k}, zmm, zmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512d" name="_mm512_maskz_scalef_round_pd">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m512d"/>
+	<parameter varname="b" type="__m512d"/>
+	<parameter varname="rounding" type="int"/>
+	<description>Scale the packed double-precision (64-bit) floating-point elements in "a" using values from "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).
+	[round_note]</description>
+	<operation>
+SCALE(src1, src2){
+	IF (src2 == NaN)
+		IF (src2 == SNaN)
+			RETURN QNAN(src2)
+		FI
+	ELSE IF (src1 == NaN)
+		IF (src1 == SNaN)
+			RETURN QNAN(src1)
+		FI
+		IF (src2 != INF)
+			RETURN QNAN(src1)
+		FI
+	ELSE
+		tmp_src2 := src2
+		tmp_src1 := src1
+		IF (src2 is denormal AND MXCSR.DAZ)
+			tmp_src2 := 0
+		FI
+		IF (src1 is denormal AND MXCSR.DAZ)
+			tmp_src1 := 0
+		FI
+	FI
+	dst[63:0] := tmp_src1[63:0] * POW(2, FLOOR(tmp_src2[63:0]))
+	RETURN dst[63:0]
+}
+
+FOR j := 0 to 7
+	i := j*64
+	IF k[j]
+		dst[i+63:i] := SCALE(a[i+63:i], b[i+63:i])
+	ELSE
+		dst[i+63:i] := 0
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vscalefpd' form='zmm {k}, zmm, zmm {er}'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512d" name="_mm512_scalef_pd">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="a" type="__m512d"/>
+	<parameter varname="b" type="__m512d"/>
+	<description>Scale the packed double-precision (64-bit) floating-point elements in "a" using values from "b", and store the results in "dst".</description>
+	<operation>
+SCALE(src1, src2){
+	IF (src2 == NaN)
+		IF (src2 == SNaN)
+			RETURN QNAN(src2)
+		FI
+	ELSE IF (src1 == NaN)
+		IF (src1 == SNaN)
+			RETURN QNAN(src1)
+		FI
+		IF (src2 != INF)
+			RETURN QNAN(src1)
+		FI
+	ELSE
+		tmp_src2 := src2
+		tmp_src1 := src1
+		IF (src2 is denormal AND MXCSR.DAZ)
+			tmp_src2 := 0
+		FI
+		IF (src1 is denormal AND MXCSR.DAZ)
+			tmp_src1 := 0
+		FI
+	FI
+	dst[63:0] := tmp_src1[63:0] * POW(2, FLOOR(tmp_src2[63:0]))
+	RETURN dst[63:0]
+}
+
+FOR j := 0 to 7
+	i := j*64
+	dst[i+63:i] := SCALE(a[i+63:i], b[i+63:i])
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vscalefpd' form='zmm {k}, zmm, zmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512d" name="_mm512_scalef_round_pd">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="a" type="__m512d"/>
+	<parameter varname="b" type="__m512d"/>
+	<parameter varname="rounding" type="int"/>
+	<description>Scale the packed double-precision (64-bit) floating-point elements in "a" using values from "b", and store the results in "dst".
+	[round_note]</description>
+	<operation>
+SCALE(src1, src2){
+	IF (src2 == NaN)
+		IF (src2 == SNaN)
+			RETURN QNAN(src2)
+		FI
+	ELSE IF (src1 == NaN)
+		IF (src1 == SNaN)
+			RETURN QNAN(src1)
+		FI
+		IF (src2 != INF)
+			RETURN QNAN(src1)
+		FI
+	ELSE
+		tmp_src2 := src2
+		tmp_src1 := src1
+		IF (src2 is denormal AND MXCSR.DAZ)
+			tmp_src2 := 0
+		FI
+		IF (src1 is denormal AND MXCSR.DAZ)
+			tmp_src1 := 0
+		FI
+	FI
+	dst[63:0] := tmp_src1[63:0] * POW(2, FLOOR(tmp_src2[63:0]))
+	RETURN dst[63:0]
+}
+
+FOR j := 0 to 7
+	i := j*64
+	dst[i+63:i] := SCALE(a[i+63:i], b[i+63:i])
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vscalefpd' form='zmm {k}, zmm, zmm {er}'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512" name="_mm512_mask_scalef_ps">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="src" type="__m512"/>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="a" type="__m512"/>
+	<parameter varname="b" type="__m512"/>
+	<description>Scale the packed single-precision (32-bit) floating-point elements in "a" using values from "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+	<operation>
+SCALE(src1, src2){
+	IF (src2 == NaN)
+		IF (src2 == SNaN)
+			RETURN QNAN(src2)
+		FI
+	ELSE IF (src1 == NaN)
+		IF (src1 == SNaN)
+			RETURN QNAN(src1)
+		FI
+		IF (src2 != INF)
+			RETURN QNAN(src1)
+		FI
+	ELSE
+		tmp_src2 := src2
+		tmp_src1 := src1
+		IF (src2 is denormal AND MXCSR.DAZ)
+			tmp_src2 := 0
+		FI
+		IF (src1 is denormal AND MXCSR.DAZ)
+			tmp_src1 := 0
+		FI
+	FI
+	dst[31:0] := tmp_src1[31:0] * POW(2, FLOOR(tmp_src2[31:0]))
+	RETURN dst[31:0]
+}
+
+FOR j := 0 to 15
+	i := j*32
+	IF k[j]
+		dst[i+31:i] := SCALE(a[i+31:i], b[i+31:i])
+	ELSE
+		dst[i+31:i] := src[i+31:i]
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vscalefps' form='zmm {k}, zmm, zmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512" name="_mm512_mask_scalef_round_ps">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="src" type="__m512"/>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="a" type="__m512"/>
+	<parameter varname="b" type="__m512"/>
+	<parameter varname="rounding" type="int"/>
+	<description>Scale the packed single-precision (32-bit) floating-point elements in "a" using values from "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).
+	[round_note]</description>
+	<operation>
+SCALE(src1, src2){
+	IF (src2 == NaN)
+		IF (src2 == SNaN)
+			RETURN QNAN(src2)
+		FI
+	ELSE IF (src1 == NaN)
+		IF (src1 == SNaN)
+			RETURN QNAN(src1)
+		FI
+		IF (src2 != INF)
+			RETURN QNAN(src1)
+		FI
+	ELSE
+		tmp_src2 := src2
+		tmp_src1 := src1
+		IF (src2 is denormal AND MXCSR.DAZ)
+			tmp_src2 := 0
+		FI
+		IF (src1 is denormal AND MXCSR.DAZ)
+			tmp_src1 := 0
+		FI
+	FI
+	dst[31:0] := tmp_src1[31:0] * POW(2, FLOOR(tmp_src2[31:0]))
+	RETURN dst[31:0]
+}
+
+FOR j := 0 to 15
+	i := j*32
+	IF k[j]
+		dst[i+31:i] := SCALE(a[i+31:i], b[i+31:i])
+	ELSE
+		dst[i+31:i] := src[i+31:i]
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vscalefps' form='zmm {k}, zmm, zmm {er}'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512" name="_mm512_maskz_scalef_ps">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="a" type="__m512"/>
+	<parameter varname="b" type="__m512"/>
+	<description>Scale the packed single-precision (32-bit) floating-point elements in "a" using values from "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+	<operation>
+SCALE(src1, src2){
+	IF (src2 == NaN)
+		IF (src2 == SNaN)
+			RETURN QNAN(src2)
+		FI
+	ELSE IF (src1 == NaN)
+		IF (src1 == SNaN)
+			RETURN QNAN(src1)
+		FI
+		IF (src2 != INF)
+			RETURN QNAN(src1)
+		FI
+	ELSE
+		tmp_src2 := src2
+		tmp_src1 := src1
+		IF (src2 is denormal AND MXCSR.DAZ)
+			tmp_src2 := 0
+		FI
+		IF (src1 is denormal AND MXCSR.DAZ)
+			tmp_src1 := 0
+		FI
+	FI
+	dst[31:0] := tmp_src1[31:0] * POW(2, FLOOR(tmp_src2[31:0]))
+	RETURN dst[31:0]
+}
+
+FOR j := 0 to 15
+	i := j*32
+	IF k[j]
+		dst[i+31:i] := SCALE(a[i+31:i], b[i+31:i])
+	ELSE
+		dst[i+31:i] := 0
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vscalefps' form='zmm {k}, zmm, zmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512" name="_mm512_maskz_scalef_round_ps">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="a" type="__m512"/>
+	<parameter varname="b" type="__m512"/>
+	<parameter varname="rounding" type="int"/>
+	<description>Scale the packed single-precision (32-bit) floating-point elements in "a" using values from "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).
+	[round_note]</description>
+	<operation>
+SCALE(src1, src2){
+	IF (src2 == NaN)
+		IF (src2 == SNaN)
+			RETURN QNAN(src2)
+		FI
+	ELSE IF (src1 == NaN)
+		IF (src1 == SNaN)
+			RETURN QNAN(src1)
+		FI
+		IF (src2 != INF)
+			RETURN QNAN(src1)
+		FI
+	ELSE
+		tmp_src2 := src2
+		tmp_src1 := src1
+		IF (src2 is denormal AND MXCSR.DAZ)
+			tmp_src2 := 0
+		FI
+		IF (src1 is denormal AND MXCSR.DAZ)
+			tmp_src1 := 0
+		FI
+	FI
+	dst[31:0] := tmp_src1[31:0] * POW(2, FLOOR(tmp_src2[31:0]))
+	RETURN dst[31:0]
+}
+
+FOR j := 0 to 15
+	i := j*32
+	IF k[j]
+		dst[i+31:i] := SCALE(a[i+31:i], b[i+31:i])
+	ELSE
+		dst[i+31:i] := 0
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vscalefps' form='zmm {k}, zmm, zmm {er}'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512" name="_mm512_scalef_ps">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="a" type="__m512"/>
+	<parameter varname="b" type="__m512"/>
+	<description>Scale the packed single-precision (32-bit) floating-point elements in "a" using values from "b", and store the results in "dst".</description>
+	<operation>
+SCALE(src1, src2){
+	IF (src2 == NaN)
+		IF (src2 == SNaN)
+			RETURN QNAN(src2)
+		FI
+	ELSE IF (src1 == NaN)
+		IF (src1 == SNaN)
+			RETURN QNAN(src1)
+		FI
+		IF (src2 != INF)
+			RETURN QNAN(src1)
+		FI
+	ELSE
+		tmp_src2 := src2
+		tmp_src1 := src1
+		IF (src2 is denormal AND MXCSR.DAZ)
+			tmp_src2 := 0
+		FI
+		IF (src1 is denormal AND MXCSR.DAZ)
+			tmp_src1 := 0
+		FI
+	FI
+	dst[31:0] := tmp_src1[31:0] * POW(2, FLOOR(tmp_src2[31:0]))
+	RETURN dst[31:0]
+}
+
+FOR j := 0 to 15
+	i := j*32
+	dst[i+31:i] := SCALE(a[i+31:i], b[i+31:i])
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vscalefps' form='zmm {k}, zmm, zmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512" name="_mm512_scalef_round_ps">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="a" type="__m512"/>
+	<parameter varname="b" type="__m512"/>
+	<parameter varname="rounding" type="int"/>
+	<description>Scale the packed single-precision (32-bit) floating-point elements in "a" using values from "b", and store the results in "dst".
+	[round_note]</description>
+	<operation>
+SCALE(src1, src2){
+	IF (src2 == NaN)
+		IF (src2 == SNaN)
+			RETURN QNAN(src2)
+		FI
+	ELSE IF (src1 == NaN)
+		IF (src1 == SNaN)
+			RETURN QNAN(src1)
+		FI
+		IF (src2 != INF)
+			RETURN QNAN(src1)
+		FI
+	ELSE
+		tmp_src2 := src2
+		tmp_src1 := src1
+		IF (src2 is denormal AND MXCSR.DAZ)
+			tmp_src2 := 0
+		FI
+		IF (src1 is denormal AND MXCSR.DAZ)
+			tmp_src1 := 0
+		FI
+	FI
+	dst[31:0] := tmp_src1[31:0] * POW(2, FLOOR(tmp_src2[31:0]))
+	RETURN dst[31:0]
+}
+
+FOR j := 0 to 15
+	i := j*32
+	dst[i+31:i] := SCALE(a[i+31:i], b[i+31:i])
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vscalefps' form='zmm {k}, zmm, zmm {er}'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128d" name="_mm_mask_scalef_round_sd">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="src" type="__m128d"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128d"/>
+	<parameter varname="b" type="__m128d"/>
+	<parameter varname="rounding" type="int"/>
+	<description>Scale the lower double-precision (64-bit) floating-point element in "a" using the lower element of "b", store the result in the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper element from "b" to the upper element of "dst".
+	[round_note]</description>
+	<operation>
+SCALE(src1, src2){
+	IF (src2 == NaN)
+		IF (src2 == SNaN)
+			RETURN QNAN(src2)
+		FI
+	ELSE IF (src1 == NaN)
+		IF (src1 == SNaN)
+			RETURN QNAN(src1)
+		FI
+		IF (src2 != INF)
+			RETURN QNAN(src1)
+		FI
+	ELSE
+		tmp_src2 := src2
+		tmp_src1 := src1
+		IF (src2 is denormal AND MXCSR.DAZ)
+			tmp_src2 := 0
+		FI
+		IF (src1 is denormal AND MXCSR.DAZ)
+			tmp_src1 := 0
+		FI
+	FI
+	dst[63:0] := tmp_src1[63:0] * POW(2, FLOOR(tmp_src2[63:0]))
+	RETURN dst[63:0]
+}
+
+IF k[0]
+	dst[63:0] := SCALE(a[63:0], b[63:0])
+ELSE
+	dst[63:0] := src[63:0]
+FI
+dst[127:64] := b[127:64]
+dst[MAX:128] := 0
+	</operation>
+	<instruction name='vscalefsd' form='xmm {k}, xmm, xmm {er}'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128d" name="_mm_mask_scalef_sd">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="src" type="__m128d"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128d"/>
+	<parameter varname="b" type="__m128d"/>
+	<description>Scale the lower double-precision (64-bit) floating-point element in "a" using the lower element of "b", store the result in the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper element from "b" to the upper element of "dst".</description>
+	<operation>
+SCALE(src1, src2){
+	IF (src2 == NaN)
+		IF (src2 == SNaN)
+			RETURN QNAN(src2)
+		FI
+	ELSE IF (src1 == NaN)
+		IF (src1 == SNaN)
+			RETURN QNAN(src1)
+		FI
+		IF (src2 != INF)
+			RETURN QNAN(src1)
+		FI
+	ELSE
+		tmp_src2 := src2
+		tmp_src1 := src1
+		IF (src2 is denormal AND MXCSR.DAZ)
+			tmp_src2 := 0
+		FI
+		IF (src1 is denormal AND MXCSR.DAZ)
+			tmp_src1 := 0
+		FI
+	FI
+	dst[63:0] := tmp_src1[63:0] * POW(2, FLOOR(tmp_src2[63:0]))
+	RETURN dst[63:0]
+}
+
+IF k[0]
+	dst[63:0] := SCALE(a[63:0], b[63:0])
+ELSE
+	dst[63:0] := src[63:0]
+FI
+dst[127:64] := b[127:64]
+dst[MAX:128] := 0
+	</operation>
+	<instruction name='vscalefsd' form='xmm {k}, xmm, xmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128d" name="_mm_maskz_scalef_round_sd">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128d"/>
+	<parameter varname="b" type="__m128d"/>
+	<parameter varname="rounding" type="int"/>
+	<description>Scale the lower double-precision (64-bit) floating-point element in "a" using the lower element of "b", store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper element from "b" to the upper element of "dst".
+	[round_note]</description>
+	<operation>
+SCALE(src1, src2){
+	IF (src2 == NaN)
+		IF (src2 == SNaN)
+			RETURN QNAN(src2)
+		FI
+	ELSE IF (src1 == NaN)
+		IF (src1 == SNaN)
+			RETURN QNAN(src1)
+		FI
+		IF (src2 != INF)
+			RETURN QNAN(src1)
+		FI
+	ELSE
+		tmp_src2 := src2
+		tmp_src1 := src1
+		IF (src2 is denormal AND MXCSR.DAZ)
+			tmp_src2 := 0
+		FI
+		IF (src1 is denormal AND MXCSR.DAZ)
+			tmp_src1 := 0
+		FI
+	FI
+	dst[63:0] := tmp_src1[63:0] * POW(2, FLOOR(tmp_src2[63:0]))
+	RETURN dst[63:0]
+}
+
+IF k[0]
+	dst[63:0] := SCALE(a[63:0], b[63:0])
+ELSE
+	dst[63:0] := 0
+FI
+dst[127:64] := b[127:64]
+dst[MAX:128] := 0
+	</operation>
+	<instruction name='vscalefsd' form='xmm {k}, xmm, xmm {er}'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128d" name="_mm_maskz_scalef_sd">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128d"/>
+	<parameter varname="b" type="__m128d"/>
+	<description>Scale the lower double-precision (64-bit) floating-point element in "a" using the lower element of "b", store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper element from "b" to the upper element of "dst".</description>
+	<operation>
+SCALE(src1, src2){
+	IF (src2 == NaN)
+		IF (src2 == SNaN)
+			RETURN QNAN(src2)
+		FI
+	ELSE IF (src1 == NaN)
+		IF (src1 == SNaN)
+			RETURN QNAN(src1)
+		FI
+		IF (src2 != INF)
+			RETURN QNAN(src1)
+		FI
+	ELSE
+		tmp_src2 := src2
+		tmp_src1 := src1
+		IF (src2 is denormal AND MXCSR.DAZ)
+			tmp_src2 := 0
+		FI
+		IF (src1 is denormal AND MXCSR.DAZ)
+			tmp_src1 := 0
+		FI
+	FI
+	dst[63:0] := tmp_src1[63:0] * POW(2, FLOOR(tmp_src2[63:0]))
+	RETURN dst[63:0]
+}
+
+IF k[0]
+	dst[63:0] := SCALE(a[63:0], b[63:0])
+ELSE
+	dst[63:0] := 0
+FI
+dst[127:64] := b[127:64]
+dst[MAX:128] := 0
+	</operation>
+	<instruction name='vscalefsd' form='xmm {k}, xmm, xmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128d" name="_mm_scalef_round_sd">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="a" type="__m128d"/>
+	<parameter varname="b" type="__m128d"/>
+	<parameter varname="rounding" type="int"/>
+	<description>Scale the lower double-precision (64-bit) floating-point element in "a" using the lower element of "b", store the result in the lower element of "dst", and copy the upper element from "b" to the upper element of "dst".
+	[round_note]</description>
+	<operation>
+SCALE(src1, src2){
+	IF (src2 == NaN)
+		IF (src2 == SNaN)
+			RETURN QNAN(src2)
+		FI
+	ELSE IF (src1 == NaN)
+		IF (src1 == SNaN)
+			RETURN QNAN(src1)
+		FI
+		IF (src2 != INF)
+			RETURN QNAN(src1)
+		FI
+	ELSE
+		tmp_src2 := src2
+		tmp_src1 := src1
+		IF (src2 is denormal AND MXCSR.DAZ)
+			tmp_src2 := 0
+		FI
+		IF (src1 is denormal AND MXCSR.DAZ)
+			tmp_src1 := 0
+		FI
+	FI
+	dst[63:0] := tmp_src1[63:0] * POW(2, FLOOR(tmp_src2[63:0]))
+	RETURN dst[63:0]
+}
+
+dst[63:0] := SCALE(a[63:0], b[63:0])
+dst[127:64] := b[127:64]
+dst[MAX:128] := 0
+	</operation>
+	<instruction name='vscalefsd' form='xmm {k}, xmm, xmm {er}'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128d" name="_mm_scalef_sd">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="a" type="__m128d"/>
+	<parameter varname="b" type="__m128d"/>
+	<description>Scale the lower double-precision (64-bit) floating-point element in "a" using the lower element of "b", store the result in the lower element of "dst", and copy the upper element from "b" to the upper element of "dst".</description>
+	<operation>
+SCALE(src1, src2){
+	IF (src2 == NaN)
+		IF (src2 == SNaN)
+			RETURN QNAN(src2)
+		FI
+	ELSE IF (src1 == NaN)
+		IF (src1 == SNaN)
+			RETURN QNAN(src1)
+		FI
+		IF (src2 != INF)
+			RETURN QNAN(src1)
+		FI
+	ELSE
+		tmp_src2 := src2
+		tmp_src1 := src1
+		IF (src2 is denormal AND MXCSR.DAZ)
+			tmp_src2 := 0
+		FI
+		IF (src1 is denormal AND MXCSR.DAZ)
+			tmp_src1 := 0
+		FI
+	FI
+	dst[63:0] := tmp_src1[63:0] * POW(2, FLOOR(tmp_src2[63:0]))
+	RETURN dst[63:0]
+}
+
+dst[63:0] := SCALE(a[63:0], b[63:0])
+dst[127:64] := b[127:64]
+dst[MAX:128] := 0
+	</operation>
+	<instruction name='vscalefsd' form='xmm {k}, xmm, xmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128" name="_mm_mask_scalef_round_ss">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="src" type="__m128"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128"/>
+	<parameter varname="b" type="__m128"/>
+	<parameter varname="rounding" type="int"/>
+	<description>Scale the lower single-precision (32-bit) floating-point element in "a" using the lower element of "b", store the result in the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper 3 packed elements from "b" to the upper elements of "dst".
+	[round_note]</description>
+	<operation>
+SCALE(src1, src2){
+	IF (src2 == NaN)
+		IF (src2 == SNaN)
+			RETURN QNAN(src2)
+		FI
+	ELSE IF (src1 == NaN)
+		IF (src1 == SNaN)
+			RETURN QNAN(src1)
+		FI
+		IF (src2 != INF)
+			RETURN QNAN(src1)
+		FI
+	ELSE
+		tmp_src2 := src2
+		tmp_src1 := src1
+		IF (src2 is denormal AND MXCSR.DAZ)
+			tmp_src2 := 0
+		FI
+		IF (src1 is denormal AND MXCSR.DAZ)
+			tmp_src1 := 0
+		FI
+	FI
+	dst[31:0] := tmp_src1[31:0] * POW(2, FLOOR(tmp_src2[31:0]))
+	RETURN dst[31:0]
+}
+
+IF k[0]
+	dst[31:0] := SCALE(a[31:0], b[31:0])
+ELSE
+	dst[31:0] := src[31:0]
+FI
+dst[127:32] := b[127:32]
+dst[MAX:128] := 0
+	</operation>
+	<instruction name='vscalefss' form='xmm {k}, xmm, xmm {er}'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128" name="_mm_mask_scalef_ss">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="src" type="__m128"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128"/>
+	<parameter varname="b" type="__m128"/>
+	<description>Scale the lower single-precision (32-bit) floating-point element in "a" using the lower element of "b", store the result in the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper 3 packed elements from "b" to the upper elements of "dst".</description>
+	<operation>
+SCALE(src1, src2){
+	IF (src2 == NaN)
+		IF (src2 == SNaN)
+			RETURN QNAN(src2)
+		FI
+	ELSE IF (src1 == NaN)
+		IF (src1 == SNaN)
+			RETURN QNAN(src1)
+		FI
+		IF (src2 != INF)
+			RETURN QNAN(src1)
+		FI
+	ELSE
+		tmp_src2 := src2
+		tmp_src1 := src1
+		IF (src2 is denormal AND MXCSR.DAZ)
+			tmp_src2 := 0
+		FI
+		IF (src1 is denormal AND MXCSR.DAZ)
+			tmp_src1 := 0
+		FI
+	FI
+	dst[31:0] := tmp_src1[31:0] * POW(2, FLOOR(tmp_src2[31:0]))
+	RETURN dst[31:0]
+}
+
+IF k[0]
+	dst[31:0] := SCALE(a[31:0], b[31:0])
+ELSE
+	dst[31:0] := src[31:0]
+FI
+dst[127:32] := b[127:32]
+dst[MAX:128] := 0
+	</operation>
+	<instruction name='vscalefss' form='xmm {k}, xmm, xmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128" name="_mm_maskz_scalef_round_ss">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128"/>
+	<parameter varname="b" type="__m128"/>
+	<parameter varname="rounding" type="int"/>
+	<description>Scale the lower single-precision (32-bit) floating-point element in "a" using the lower element of "b", store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from "b" to the upper elements of "dst".
+	[round_note]</description>
+	<operation>
+SCALE(src1, src2){
+	IF (src2 == NaN)
+		IF (src2 == SNaN)
+			RETURN QNAN(src2)
+		FI
+	ELSE IF (src1 == NaN)
+		IF (src1 == SNaN)
+			RETURN QNAN(src1)
+		FI
+		IF (src2 != INF)
+			RETURN QNAN(src1)
+		FI
+	ELSE
+		tmp_src2 := src2
+		tmp_src1 := src1
+		IF (src2 is denormal AND MXCSR.DAZ)
+			tmp_src2 := 0
+		FI
+		IF (src1 is denormal AND MXCSR.DAZ)
+			tmp_src1 := 0
+		FI
+	FI
+	dst[31:0] := tmp_src1[31:0] * POW(2, FLOOR(tmp_src2[31:0]))
+	RETURN dst[31:0]
+}
+
+IF k[0]
+	dst[31:0] := SCALE(a[31:0], b[31:0])
+ELSE
+	dst[31:0] := 0
+FI
+dst[127:32] := b[127:32]
+dst[MAX:128] := 0
+	</operation>
+	<instruction name='vscalefss' form='xmm {k}, xmm, xmm {er}'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128" name="_mm_maskz_scalef_ss">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128"/>
+	<parameter varname="b" type="__m128"/>
+	<description>Scale the lower single-precision (32-bit) floating-point element in "a" using the lower element of "b", store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from "b" to the upper elements of "dst".</description>
+	<operation>
+SCALE(src1, src2){
+	IF (src2 == NaN)
+		IF (src2 == SNaN)
+			RETURN QNAN(src2)
+		FI
+	ELSE IF (src1 == NaN)
+		IF (src1 == SNaN)
+			RETURN QNAN(src1)
+		FI
+		IF (src2 != INF)
+			RETURN QNAN(src1)
+		FI
+	ELSE
+		tmp_src2 := src2
+		tmp_src1 := src1
+		IF (src2 is denormal AND MXCSR.DAZ)
+			tmp_src2 := 0
+		FI
+		IF (src1 is denormal AND MXCSR.DAZ)
+			tmp_src1 := 0
+		FI
+	FI
+	dst[31:0] := tmp_src1[31:0] * POW(2, FLOOR(tmp_src2[31:0]))
+	RETURN dst[31:0]
+}
+
+IF k[0]
+	dst[31:0] := SCALE(a[31:0], b[31:0])
+ELSE
+	dst[31:0] := 0
+FI
+dst[127:32] := b[127:32]
+dst[MAX:128] := 0
+	</operation>
+	<instruction name='vscalefss' form='xmm {k}, xmm, xmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128" name="_mm_scalef_round_ss">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="a" type="__m128"/>
+	<parameter varname="b" type="__m128"/>
+	<parameter varname="rounding" type="int"/>
+	<description>Scale the lower single-precision (32-bit) floating-point element in "a" using the lower element of "b", store the result in the lower element of "dst", and copy the upper 3 packed elements from "b" to the upper elements of "dst".
+	[round_note]</description>
+	<operation>
+SCALE(src1, src2){
+	IF (src2 == NaN)
+		IF (src2 == SNaN)
+			RETURN QNAN(src2)
+		FI
+	ELSE IF (src1 == NaN)
+		IF (src1 == SNaN)
+			RETURN QNAN(src1)
+		FI
+		IF (src2 != INF)
+			RETURN QNAN(src1)
+		FI
+	ELSE
+		tmp_src2 := src2
+		tmp_src1 := src1
+		IF (src2 is denormal AND MXCSR.DAZ)
+			tmp_src2 := 0
+		FI
+		IF (src1 is denormal AND MXCSR.DAZ)
+			tmp_src1 := 0
+		FI
+	FI
+	dst[31:0] := tmp_src1[31:0] * POW(2, FLOOR(tmp_src2[31:0]))
+	RETURN dst[31:0]
+}
+
+dst[31:0] := SCALE(a[31:0], b[31:0])
+dst[127:32] := b[127:32]
+dst[MAX:128] := 0
+	</operation>
+	<instruction name='vscalefss' form='xmm {k}, xmm, xmm {er}'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128" name="_mm_scalef_ss">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="a" type="__m128"/>
+	<parameter varname="b" type="__m128"/>
+	<description>Scale the lower single-precision (32-bit) floating-point element in "a" using the lower element of "b", store the result in the lower element of "dst", and copy the upper 3 packed elements from "b" to the upper elements of "dst".</description>
+	<operation>
+SCALE(src1, src2){
+	IF (src2 == NaN)
+		IF (src2 == SNaN)
+			RETURN QNAN(src2)
+		FI
+	ELSE IF (src1 == NaN)
+		IF (src1 == SNaN)
+			RETURN QNAN(src1)
+		FI
+		IF (src2 != INF)
+			RETURN QNAN(src1)
+		FI
+	ELSE
+		tmp_src2 := src2
+		tmp_src1 := src1
+		IF (src2 is denormal AND MXCSR.DAZ)
+			tmp_src2 := 0
+		FI
+		IF (src1 is denormal AND MXCSR.DAZ)
+			tmp_src1 := 0
+		FI
+	FI
+	dst[31:0] := tmp_src1[31:0] * POW(2, FLOOR(tmp_src2[31:0]))
+	RETURN dst[31:0]
+}
+
+dst[31:0] := SCALE(a[31:0], b[31:0])
+dst[127:32] := b[127:32]
+dst[MAX:128] := 0
+	</operation>
+	<instruction name='vscalefss' form='xmm {k}, xmm, xmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="void" name="_mm512_i32scatter_pd">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Store</category>
+	<parameter varname="base_addr" type="void*"/>
+	<parameter varname="vindex" type="__m256i"/>
+	<parameter varname="a" type="__m512d"/>
+	<parameter varname="scale" type="int"/>
+	<description>Scatter double-precision (64-bit) floating-point elements from "a" into memory using 32-bit indices. 64-bit elements are stored at addresses starting at "base_addr" and offset by each 32-bit element in "vindex" (each index is scaled by the factor in "scale"). "scale" should be 1, 2, 4 or 8.
+	</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*64
+	l := j*32
+	MEM[base_addr + SignExtend(vindex[l+31:l])*scale] := a[i+63:i]
+ENDFOR
+	</operation>
+	<instruction name='vscatterdpd' form='vm32y {k}, zmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="void" name="_mm512_mask_i32scatter_pd">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Store</category>
+	<parameter varname="base_addr" type="void*"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="vindex" type="__m256i"/>
+	<parameter varname="a" type="__m512d"/>
+	<parameter varname="scale" type="int"/>
+	<description>Scatter double-precision (64-bit) floating-point elements from "a" into memory using 32-bit indices. 64-bit elements are stored at addresses starting at "base_addr" and offset by each 32-bit element in "vindex" (each index is scaled by the factor in "scale") subject to mask "k" (elements are not stored when the corresponding mask bit is not set). "scale" should be 1, 2, 4 or 8.
+	</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*64
+	l := j*32
+	IF k[j]
+		MEM[base_addr + SignExtend(vindex[l+31:l])*scale] := a[i+63:i]
+		k[j] := 0
+	FI
+ENDFOR
+k[MAX:8] := 0
+	</operation>
+	<instruction name='vscatterdpd' form='vm32y {k}, zmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" rettype="void" name="_mm512_i32scatter_ps">
+	<type>Floating Point</type>
+	<CPUID>AVX512F/KNCNI</CPUID>
+	<category>Store</category>
+	<parameter varname="base_addr" type="void*"/>
+	<parameter varname="vindex" type="__m512i"/>
+	<parameter varname="a" type="__m512"/>
+	<parameter varname="scale" type="int"/>
+	<description>Scatter single-precision (32-bit) floating-point elements from "a" into memory using 32-bit indices. 32-bit elements are stored at addresses starting at "base_addr" and offset by each 32-bit element in "vindex" (each index is scaled by the factor in "scale"). "scale" should be 1, 2, 4 or 8.</description>
+	<operation>
+FOR j := 0 to 15
+	i := j*32
+	MEM[base_addr + SignExtend(vindex[i+31:i])*scale] := a[i+31:i]
+ENDFOR
+	</operation>
+	<instruction name='vscatterdps' form='vm32z {k}, zmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" rettype="void" name="_mm512_mask_i32scatter_ps">
+	<type>Floating Point</type>
+	<CPUID>AVX512F/KNCNI</CPUID>
+	<category>Store</category>
+	<parameter varname="base_addr" type="void*"/>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="vindex" type="__m512i"/>
+	<parameter varname="a" type="__m512"/>
+	<parameter varname="scale" type="int"/>
+	<description>Scatter single-precision (32-bit) floating-point elements from "a" into memory using 32-bit indices. 32-bit elements are stored at addresses starting at "base_addr" and offset by each 32-bit element in "vindex" (each index is scaled by the factor in "scale") subject to mask "k" (elements are not stored when the corresponding mask bit is not set). "scale" should be 1, 2, 4 or 8.</description>
+	<operation>
+FOR j := 0 to 15
+	i := j*32
+	IF k[j]
+		MEM[base_addr + SignExtend(vindex[i+31:i])*scale] := a[i+31:i]
+		k[j] := 0
+	FI
+ENDFOR
+k[MAX:16] := 0
+	</operation>
+	<instruction name='vscatterdps' form='vm32z {k}, zmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="void" name="_mm512_i64scatter_pd">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Store</category>
+	<parameter varname="base_addr" type="void*"/>
+	<parameter varname="vindex" type="__m512i"/>
+	<parameter varname="a" type="__m512d"/>
+	<parameter varname="scale" type="int"/>
+	<description>Scatter double-precision (64-bit) floating-point elements from "a" into memory using 64-bit indices. 64-bit elements are stored at addresses starting at "base_addr" and offset by each 64-bit element in "vindex" (each index is scaled by the factor in "scale"). "scale" should be 1, 2, 4 or 8.
+	</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*64
+	MEM[base_addr + SignExtend(vindex[i+63:i])*scale] := a[i+63:i]
+ENDFOR
+	</operation>
+	<instruction name='vscatterqpd' form='vm32z {k}, zmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="void" name="_mm512_mask_i64scatter_pd">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Store</category>
+	<parameter varname="base_addr" type="void*"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="vindex" type="__m512i"/>
+	<parameter varname="a" type="__m512d"/>
+	<parameter varname="scale" type="int"/>
+	<description>Scatter double-precision (64-bit) floating-point elements from "a" into memory using 64-bit indices. 64-bit elements are stored at addresses starting at "base_addr" and offset by each 64-bit element in "vindex" (each index is scaled by the factor in "scale") subject to mask "k" (elements are not stored when the corresponding mask bit is not set). "scale" should be 1, 2, 4 or 8.
+	</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*64
+	IF k[j]
+		MEM[base_addr + SignExtend(vindex[i+63:i])*scale] := a[i+63:i]
+		k[j] := 0
+	FI
+ENDFOR
+k[MAX:8] := 0
+	</operation>
+	<instruction name='vscatterqpd' form='vm32z {k}, zmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="void" name="_mm512_i64scatter_ps">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Store</category>
+	<parameter varname="base_addr" type="void*"/>
+	<parameter varname="vindex" type="__m512i"/>
+	<parameter varname="a" type="__m256"/>
+	<parameter varname="scale" type="int"/>
+	<description>Scatter single-precision (32-bit) floating-point elements from "a" into memory using 64-bit indices. 32-bit elements are stored at addresses starting at "base_addr" and offset by each 64-bit element in "vindex" (each index is scaled by the factor in "scale"). "scale" should be 1, 2, 4 or 8.
+	</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*32
+	l := j*64
+	MEM[base_addr + SignExtend(vindex[l+63:l])*scale] := a[i+31:i]
+ENDFOR
+	</operation>
+	<instruction name='vscatterqps' form='vm32z {k}, ymm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="void" name="_mm512_mask_i64scatter_ps">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Store</category>
+	<parameter varname="base_addr" type="void*"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="vindex" type="__m512i"/>
+	<parameter varname="a" type="__m256"/>
+	<parameter varname="scale" type="int"/>
+	<description>Scatter single-precision (32-bit) floating-point elements from "a" into memory using 64-bit indices. 32-bit elements are stored at addresses starting at "base_addr" and offset by each 64-bit element in "vindex" (each index is scaled by the factor in "scale") subject to mask "k" (elements are not stored when the corresponding mask bit is not set). "scale" should be 1, 2, 4 or 8.
+	</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*32
+	l := j*64
+	IF k[j]
+		MEM[base_addr + SignExtend(vindex[l+63:l])*scale] := a[i+31:i]
+		k[j] := 0
+	FI
+ENDFOR
+k[MAX:8] := 0
+	</operation>
+	<instruction name='vscatterqps' form='vm32z {k}, ymm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512" name="_mm512_mask_shuffle_f32x4">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Swizzle</category>
+	<parameter varname="src" type="__m512"/>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="a" type="__m512"/>
+	<parameter varname="b" type="__m512"/>
+	<parameter varname="imm8" type="const int"/>
+	<description>Shuffle 128-bits (composed of 4 single-precision (32-bit) floating-point elements) selected by "imm8" from "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). </description>
+	<operation>
+SELECT4(src, control){
+	CASE(control[1:0])
+	0:	tmp[127:0] := src[127:0]
+	1:	tmp[127:0] := src[255:128]
+	2:	tmp[127:0] := src[383:256]
+	3:	tmp[127:0] := src[511:384]
+	ESAC
+	RETURN tmp[127:0]
+}
+
+tmp_dst[127:0] := SELECT4(a[511:0], imm8[1:0])
+tmp_dst[255:128] := SELECT4(a[511:0], imm8[3:2])
+tmp_dst[383:256] := SELECT4(b[511:0], imm8[5:4])
+tmp_dst[511:384] := SELECT4(b[511:0], imm8[7:6])
+
+FOR j := 0 to 15
+	i := j*32
+	IF k[j]
+		dst[i+31:i] := tmp_dst[i+31:i]
+	ELSE
+		dst[i+31:i] := src[i+31:i]
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vshuff32x4' form='zmm {k}, zmm, zmm, imm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512" name="_mm512_maskz_shuffle_f32x4">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Swizzle</category>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="a" type="__m512"/>
+	<parameter varname="b" type="__m512"/>
+	<parameter varname="imm8" type="const int"/>
+	<description>Shuffle 128-bits (composed of 4 single-precision (32-bit) floating-point elements) selected by "imm8" from "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+	<operation>
+SELECT4(src, control){
+	CASE(control[1:0])
+	0:	tmp[127:0] := src[127:0]
+	1:	tmp[127:0] := src[255:128]
+	2:	tmp[127:0] := src[383:256]
+	3:	tmp[127:0] := src[511:384]
+	ESAC
+	RETURN tmp[127:0]
+}
+
+tmp_dst[127:0] := SELECT4(a[511:0], imm8[1:0])
+tmp_dst[255:128] := SELECT4(a[511:0], imm8[3:2])
+tmp_dst[383:256] := SELECT4(b[511:0], imm8[5:4])
+tmp_dst[511:384] := SELECT4(b[511:0], imm8[7:6])
+
+FOR j := 0 to 15
+	i := j*32
+	IF k[j]
+		dst[i+31:i] := tmp_dst[i+31:i]
+	ELSE
+		dst[i+31:i] := 0
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vshuff32x4' form='zmm {k}, zmm, zmm, imm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512" name="_mm512_shuffle_f32x4">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Swizzle</category>
+	<parameter varname="a" type="__m512"/>
+	<parameter varname="b" type="__m512"/>
+	<parameter varname="imm8" type="const int"/>
+	<description>Shuffle 128-bits (composed of 4 single-precision (32-bit) floating-point elements) selected by "imm8" from "a" and "b", and store the results in "dst".
+	</description>
+	<operation>
+SELECT4(src, control){
+	CASE(control[1:0])
+	0:	tmp[127:0] := src[127:0]
+	1:	tmp[127:0] := src[255:128]
+	2:	tmp[127:0] := src[383:256]
+	3:	tmp[127:0] := src[511:384]
+	ESAC
+	RETURN tmp[127:0]
+}
+
+dst[127:0] := SELECT4(a[511:0], imm8[1:0])
+dst[255:128] := SELECT4(a[511:0], imm8[3:2])
+dst[383:256] := SELECT4(b[511:0], imm8[5:4])
+dst[511:384] := SELECT4(b[511:0], imm8[7:6])
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vshuff32x4' form='zmm {k}, zmm, zmm, imm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512d" name="_mm512_mask_shuffle_f64x2">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Swizzle</category>
+	<parameter varname="src" type="__m512d"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m512d"/>
+	<parameter varname="b" type="__m512d"/>
+	<parameter varname="imm8" type="const int"/>
+	<description>Shuffle 128-bits (composed of 2 double-precision (64-bit) floating-point elements) selected by "imm8" from "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). </description>
+	<operation>
+SELECT4(src, control){
+	CASE(control[1:0])
+	0:	tmp[127:0] := src[127:0]
+	1:	tmp[127:0] := src[255:128]
+	2:	tmp[127:0] := src[383:256]
+	3:	tmp[127:0] := src[511:384]
+	ESAC
+	RETURN tmp[127:0]
+}
+
+tmp_dst[127:0] := SELECT4(a[511:0], imm8[1:0])
+tmp_dst[255:128] := SELECT4(a[511:0], imm8[3:2])
+tmp_dst[383:256] := SELECT4(b[511:0], imm8[5:4])
+tmp_dst[511:384] := SELECT4(b[511:0], imm8[7:6])
+
+FOR j := 0 to 7
+	i := j*64
+	IF k[j]
+		dst[i+63:i] := tmp_dst[i+63:i]
+	ELSE
+		dst[i+63:i] := src[i+63:i]
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vshuff64x2' form='zmm {k}, zmm, zmm, imm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512d" name="_mm512_maskz_shuffle_f64x2">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Swizzle</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m512d"/>
+	<parameter varname="b" type="__m512d"/>
+	<parameter varname="imm8" type="const int"/>
+	<description>Shuffle 128-bits (composed of 2 double-precision (64-bit) floating-point elements) selected by "imm8" from "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+	<operation>
+SELECT4(src, control){
+	CASE(control[1:0])
+	0:	tmp[127:0] := src[127:0]
+	1:	tmp[127:0] := src[255:128]
+	2:	tmp[127:0] := src[383:256]
+	3:	tmp[127:0] := src[511:384]
+	ESAC
+	RETURN tmp[127:0]
+}
+
+tmp_dst[127:0] := SELECT4(a[511:0], imm8[1:0])
+tmp_dst[255:128] := SELECT4(a[511:0], imm8[3:2])
+tmp_dst[383:256] := SELECT4(b[511:0], imm8[5:4])
+tmp_dst[511:384] := SELECT4(b[511:0], imm8[7:6])
+
+FOR j := 0 to 7
+	i := j*64
+	IF k[j]
+		dst[i+63:i] := tmp_dst[i+63:i]
+	ELSE
+		dst[i+63:i] := 0
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vshuff64x2' form='zmm {k}, zmm, zmm, imm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512d" name="_mm512_shuffle_f64x2">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Swizzle</category>
+	<parameter varname="a" type="__m512d"/>
+	<parameter varname="b" type="__m512d"/>
+	<parameter varname="imm8" type="const int"/>
+	<description>Shuffle 128-bits (composed of 2 double-precision (64-bit) floating-point elements) selected by "imm8" from "a" and "b", and store the results in "dst".</description>
+	<operation>
+SELECT4(src, control){
+	CASE(control[1:0])
+	0:	tmp[127:0] := src[127:0]
+	1:	tmp[127:0] := src[255:128]
+	2:	tmp[127:0] := src[383:256]
+	3:	tmp[127:0] := src[511:384]
+	ESAC
+	RETURN tmp[127:0]
+}
+
+dst[127:0] := SELECT4(a[511:0], imm8[1:0])
+dst[255:128] := SELECT4(a[511:0], imm8[3:2])
+dst[383:256] := SELECT4(b[511:0], imm8[5:4])
+dst[511:384] := SELECT4(b[511:0], imm8[7:6])
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vshuff64x2' form='zmm {k}, zmm, zmm, imm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512i" name="_mm512_mask_shuffle_i32x4">
+	<type>Integer</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Swizzle</category>
+	<parameter varname="src" type="__m512i"/>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="a" type="__m512i"/>
+	<parameter varname="b" type="__m512i"/>
+	<parameter varname="imm8" type="const int"/>
+	<description>Shuffle 128-bits (composed of 4 32-bit integers) selected by "imm8" from "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). </description>
+	<operation>
+SELECT4(src, control){
+	CASE(control[1:0])
+	0:	tmp[127:0] := src[127:0]
+	1:	tmp[127:0] := src[255:128]
+	2:	tmp[127:0] := src[383:256]
+	3:	tmp[127:0] := src[511:384]
+	ESAC
+	RETURN tmp[127:0]
+}
+
+tmp_dst[127:0] := SELECT4(a[511:0], imm8[1:0])
+tmp_dst[255:128] := SELECT4(a[511:0], imm8[3:2])
+tmp_dst[383:256] := SELECT4(b[511:0], imm8[5:4])
+tmp_dst[511:384] := SELECT4(b[511:0], imm8[7:6])
+
+FOR j := 0 to 15
+	i := j*32
+	IF k[j]
+		dst[i+31:i] := tmp_dst[i+31:i]
+	ELSE
+		dst[i+31:i] := src[i+31:i]
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vshufi32x4' form='zmm {k}, zmm, zmm, imm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512i" name="_mm512_maskz_shuffle_i32x4">
+	<type>Integer</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Swizzle</category>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="a" type="__m512i"/>
+	<parameter varname="b" type="__m512i"/>
+	<parameter varname="imm8" type="const int"/>
+	<description>Shuffle 128-bits (composed of 4 32-bit integers) selected by "imm8" from "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+	<operation>
+SELECT4(src, control){
+	CASE(control[1:0])
+	0:	tmp[127:0] := src[127:0]
+	1:	tmp[127:0] := src[255:128]
+	2:	tmp[127:0] := src[383:256]
+	3:	tmp[127:0] := src[511:384]
+	ESAC
+	RETURN tmp[127:0]
+}
+
+tmp_dst[127:0] := SELECT4(a[511:0], imm8[1:0])
+tmp_dst[255:128] := SELECT4(a[511:0], imm8[3:2])
+tmp_dst[383:256] := SELECT4(b[511:0], imm8[5:4])
+tmp_dst[511:384] := SELECT4(b[511:0], imm8[7:6])
+
+FOR j := 0 to 15
+	i := j*32
+	IF k[j]
+		dst[i+31:i] := tmp_dst[i+31:i]
+	ELSE
+		dst[i+31:i] := 0
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vshufi32x4' form='zmm {k}, zmm, zmm, imm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512i" name="_mm512_shuffle_i32x4">
+	<type>Integer</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Swizzle</category>
+	<parameter varname="a" type="__m512i"/>
+	<parameter varname="b" type="__m512i"/>
+	<parameter varname="imm8" type="const int"/>
+	<description>Shuffle 128-bits (composed of 4 32-bit integers) selected by "imm8" from "a" and "b", and store the results in "dst".</description>
+	<operation>
+SELECT4(src, control){
+	CASE(control[1:0])
+	0:	tmp[127:0] := src[127:0]
+	1:	tmp[127:0] := src[255:128]
+	2:	tmp[127:0] := src[383:256]
+	3:	tmp[127:0] := src[511:384]
+	ESAC
+	RETURN tmp[127:0]
+}
+
+dst[127:0] := SELECT4(a[511:0], imm8[1:0])
+dst[255:128] := SELECT4(a[511:0], imm8[3:2])
+dst[383:256] := SELECT4(b[511:0], imm8[5:4])
+dst[511:384] := SELECT4(b[511:0], imm8[7:6])
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vshufi32x4' form='zmm {k}, zmm, zmm, imm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512i" name="_mm512_mask_shuffle_i64x2">
+	<type>Integer</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Swizzle</category>
+	<parameter varname="src" type="__m512i"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m512i"/>
+	<parameter varname="b" type="__m512i"/>
+	<parameter varname="imm8" type="const int"/>
+	<description>Shuffle 128-bits (composed of 2 64-bit integers) selected by "imm8" from "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). </description>
+	<operation>
+SELECT4(src, control){
+	CASE(control[1:0])
+	0:	tmp[127:0] := src[127:0]
+	1:	tmp[127:0] := src[255:128]
+	2:	tmp[127:0] := src[383:256]
+	3:	tmp[127:0] := src[511:384]
+	ESAC
+	RETURN tmp[127:0]
+}
+
+tmp_dst[127:0] := SELECT4(a[511:0], imm8[1:0])
+tmp_dst[255:128] := SELECT4(a[511:0], imm8[3:2])
+tmp_dst[383:256] := SELECT4(b[511:0], imm8[5:4])
+tmp_dst[511:384] := SELECT4(b[511:0], imm8[7:6])
+
+FOR j := 0 to 7
+	i := j*64
+	IF k[j]
+		dst[i+63:i] := tmp_dst[i+63:i]
+	ELSE
+		dst[i+63:i] := src[i+63:i]
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vshufi64x2' form='zmm {k}, zmm, zmm, imm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512i" name="_mm512_maskz_shuffle_i64x2">
+	<type>Integer</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Swizzle</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m512i"/>
+	<parameter varname="b" type="__m512i"/>
+	<parameter varname="imm8" type="const int"/>
+	<description>Shuffle 128-bits (composed of 2 64-bit integers) selected by "imm8" from "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+	<operation>
+SELECT4(src, control){
+	CASE(control[1:0])
+	0:	tmp[127:0] := src[127:0]
+	1:	tmp[127:0] := src[255:128]
+	2:	tmp[127:0] := src[383:256]
+	3:	tmp[127:0] := src[511:384]
+	ESAC
+	RETURN tmp[127:0]
+}
+
+tmp_dst[127:0] := SELECT4(a[511:0], imm8[1:0])
+tmp_dst[255:128] := SELECT4(a[511:0], imm8[3:2])
+tmp_dst[383:256] := SELECT4(b[511:0], imm8[5:4])
+tmp_dst[511:384] := SELECT4(b[511:0], imm8[7:6])
+
+FOR j := 0 to 7
+	i := j*64
+	IF k[j]
+		dst[i+63:i] := tmp_dst[i+63:i]
+	ELSE
+		dst[i+63:i] := 0
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vshufi64x2' form='zmm {k}, zmm, zmm, imm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512i" name="_mm512_shuffle_i64x2">
+	<type>Integer</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Swizzle</category>
+	<parameter varname="a" type="__m512i"/>
+	<parameter varname="b" type="__m512i"/>
+	<parameter varname="imm8" type="const int"/>
+	<description>Shuffle 128-bits (composed of 2 64-bit integers) selected by "imm8" from "a" and "b", and store the results in "dst".</description>
+	<operation>
+SELECT4(src, control){
+	CASE(control[1:0])
+	0:	tmp[127:0] := src[127:0]
+	1:	tmp[127:0] := src[255:128]
+	2:	tmp[127:0] := src[383:256]
+	3:	tmp[127:0] := src[511:384]
+	ESAC
+	RETURN tmp[127:0]
+}
+
+dst[127:0] := SELECT4(a[511:0], imm8[1:0])
+dst[255:128] := SELECT4(a[511:0], imm8[3:2])
+dst[383:256] := SELECT4(b[511:0], imm8[5:4])
+dst[511:384] := SELECT4(b[511:0], imm8[7:6])
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vshufi64x2' form='zmm {k}, zmm, zmm, imm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512d" name="_mm512_mask_shuffle_pd">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Swizzle</category>
+	<parameter varname="src" type="__m512d"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m512d"/>
+	<parameter varname="b" type="__m512d"/>
+	<parameter varname="imm8" type="const int"/>
+	<description>Shuffle double-precision (64-bit) floating-point elements within 128-bit lanes using the control in "imm8", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). </description>
+	<operation>
+tmp_dst[63:0] := (imm8[0] == 0) ? a[63:0] : a[127:64]
+tmp_dst[127:64] := (imm8[1] == 0) ? b[63:0] : b[127:64]
+tmp_dst[191:128] := (imm8[2] == 0) ? a[191:128] : a[255:192]
+tmp_dst[255:192] := (imm8[3] == 0) ? b[191:128] : b[255:192]
+tmp_dst[319:256] := (imm8[4] == 0) ? a[319:256] : a[383:320]
+tmp_dst[383:320] := (imm8[5] == 0) ? b[319:256] : b[383:320]
+tmp_dst[447:384] := (imm8[6] == 0) ? a[447:384] : a[511:448]
+tmp_dst[511:448] := (imm8[7] == 0) ? b[447:384] : b[511:448]
+
+FOR j := 0 to 7
+	i := j*64
+	IF k[j]
+		dst[i+63:i] := tmp_dst[i+63:i]
+	ELSE
+		dst[i+63:i] := src[i+63:i]
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vshufpd' form='zmm {k}, zmm, zmm, imm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512d" name="_mm512_maskz_shuffle_pd">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Swizzle</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m512d"/>
+	<parameter varname="b" type="__m512d"/>
+	<parameter varname="imm8" type="const int"/>
+	<description>Shuffle double-precision (64-bit) floating-point elements within 128-bit lanes using the control in "imm8", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). </description>
+	<operation>
+tmp_dst[63:0] := (imm8[0] == 0) ? a[63:0] : a[127:64]
+tmp_dst[127:64] := (imm8[1] == 0) ? b[63:0] : b[127:64]
+tmp_dst[191:128] := (imm8[2] == 0) ? a[191:128] : a[255:192]
+tmp_dst[255:192] := (imm8[3] == 0) ? b[191:128] : b[255:192]
+tmp_dst[319:256] := (imm8[4] == 0) ? a[319:256] : a[383:320]
+tmp_dst[383:320] := (imm8[5] == 0) ? b[319:256] : b[383:320]
+tmp_dst[447:384] := (imm8[6] == 0) ? a[447:384] : a[511:448]
+tmp_dst[511:448] := (imm8[7] == 0) ? b[447:384] : b[511:448]
+
+FOR j := 0 to 7
+	i := j*64
+	IF k[j]
+		dst[i+63:i] := tmp_dst[i+63:i]
+	ELSE
+		dst[i+63:i] := 0
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vshufpd' form='zmm {k}, zmm, zmm, imm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512d" name="_mm512_shuffle_pd">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Swizzle</category>
+	<parameter varname="a" type="__m512d"/>
+	<parameter varname="b" type="__m512d"/>
+	<parameter varname="imm8" type="const int"/>
+	<description>Shuffle double-precision (64-bit) floating-point elements within 128-bit lanes using the control in "imm8", and store the results in "dst". </description>
+	<operation>
+dst[63:0] := (imm8[0] == 0) ? a[63:0] : a[127:64]
+dst[127:64] := (imm8[1] == 0) ? b[63:0] : b[127:64]
+dst[191:128] := (imm8[2] == 0) ? a[191:128] : a[255:192]
+dst[255:192] := (imm8[3] == 0) ? b[191:128] : b[255:192]
+dst[319:256] := (imm8[4] == 0) ? a[319:256] : a[383:320]
+dst[383:320] := (imm8[5] == 0) ? b[319:256] : b[383:320]
+dst[447:384] := (imm8[6] == 0) ? a[447:384] : a[511:448]
+dst[511:448] := (imm8[7] == 0) ? b[447:384] : b[511:448]
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vshufpd' form='zmm {k}, zmm, zmm, imm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512" name="_mm512_mask_shuffle_ps">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Swizzle</category>
+	<parameter varname="src" type="__m512"/>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="a" type="__m512"/>
+	<parameter varname="b" type="__m512"/>
+	<parameter varname="imm8" type="const int"/>
+	<description>Shuffle single-precision (32-bit) floating-point elements in "a" and "b" within 128-bit lanes using the control in "imm8", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+	<operation>
+SELECT4(src, control){
+	CASE(control[1:0])
+	0:	tmp[31:0] := src[31:0]
+	1:	tmp[31:0] := src[63:32]
+	2:	tmp[31:0] := src[95:64]
+	3:	tmp[31:0] := src[127:96]
+	ESAC
+	RETURN tmp[31:0]
+}
+
+tmp_dst[31:0] := SELECT4(a[127:0], imm8[1:0])
+tmp_dst[63:32] := SELECT4(a[127:0], imm8[3:2])
+tmp_dst[95:64] := SELECT4(b[127:0], imm8[5:4])
+tmp_dst[127:96] := SELECT4(b[127:0], imm8[7:6])
+tmp_dst[159:128] := SELECT4(a[255:128], imm8[1:0])
+tmp_dst[191:160] := SELECT4(a[255:128], imm8[3:2])
+tmp_dst[223:192] := SELECT4(b[255:128], imm8[5:4])
+tmp_dst[255:224] := SELECT4(b[255:128], imm8[7:6])
+tmp_dst[287:256] := SELECT4(a[383:256], imm8[1:0])
+tmp_dst[319:288] := SELECT4(a[383:256], imm8[3:2])
+tmp_dst[351:320] := SELECT4(b[383:256], imm8[5:4])
+tmp_dst[383:352] := SELECT4(b[383:256], imm8[7:6])
+tmp_dst[415:384] := SELECT4(a[511:384], imm8[1:0])
+tmp_dst[447:416] := SELECT4(a[511:384], imm8[3:2])
+tmp_dst[479:448] := SELECT4(b[511:384], imm8[5:4])
+tmp_dst[511:480] := SELECT4(b[511:384], imm8[7:6])
+
+FOR j := 0 to 15
+	i := j*32
+	IF k[j]
+		dst[i+31:i] := tmp_dst[i+31:i]
+	ELSE
+		dst[i+31:i] := src[i+31:i]
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vshufps' form='zmm {k}, zmm, zmm, imm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512" name="_mm512_maskz_shuffle_ps">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Swizzle</category>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="a" type="__m512"/>
+	<parameter varname="b" type="__m512"/>
+	<parameter varname="imm8" type="const int"/>
+	<description>Shuffle single-precision (32-bit) floating-point elements in "a" and "b" within 128-bit lanes using the control in "imm8", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+	<operation>
+SELECT4(src, control){
+	CASE(control[1:0])
+	0:	tmp[31:0] := src[31:0]
+	1:	tmp[31:0] := src[63:32]
+	2:	tmp[31:0] := src[95:64]
+	3:	tmp[31:0] := src[127:96]
+	ESAC
+	RETURN tmp[31:0]
+}
+
+tmp_dst[31:0] := SELECT4(a[127:0], imm8[1:0])
+tmp_dst[63:32] := SELECT4(a[127:0], imm8[3:2])
+tmp_dst[95:64] := SELECT4(b[127:0], imm8[5:4])
+tmp_dst[127:96] := SELECT4(b[127:0], imm8[7:6])
+tmp_dst[159:128] := SELECT4(a[255:128], imm8[1:0])
+tmp_dst[191:160] := SELECT4(a[255:128], imm8[3:2])
+tmp_dst[223:192] := SELECT4(b[255:128], imm8[5:4])
+tmp_dst[255:224] := SELECT4(b[255:128], imm8[7:6])
+tmp_dst[287:256] := SELECT4(a[383:256], imm8[1:0])
+tmp_dst[319:288] := SELECT4(a[383:256], imm8[3:2])
+tmp_dst[351:320] := SELECT4(b[383:256], imm8[5:4])
+tmp_dst[383:352] := SELECT4(b[383:256], imm8[7:6])
+tmp_dst[415:384] := SELECT4(a[511:384], imm8[1:0])
+tmp_dst[447:416] := SELECT4(a[511:384], imm8[3:2])
+tmp_dst[479:448] := SELECT4(b[511:384], imm8[5:4])
+tmp_dst[511:480] := SELECT4(b[511:384], imm8[7:6])
+
+FOR j := 0 to 15
+	i := j*32
+	IF k[j]
+		dst[i+31:i] := tmp_dst[i+31:i]
+	ELSE
+		dst[i+31:i] := 0
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vshufps' form='zmm {k}, zmm, zmm, imm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512" name="_mm512_shuffle_ps">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Swizzle</category>
+	<parameter varname="a" type="__m512"/>
+	<parameter varname="b" type="__m512"/>
+	<parameter varname="imm8" type="const int"/>
+	<description>Shuffle single-precision (32-bit) floating-point elements in "a" and "b" within 128-bit lanes using the control in "imm8", and store the results in "dst".</description>
+	<operation>
+SELECT4(src, control){
+	CASE(control[1:0])
+	0:	tmp[31:0] := src[31:0]
+	1:	tmp[31:0] := src[63:32]
+	2:	tmp[31:0] := src[95:64]
+	3:	tmp[31:0] := src[127:96]
+	ESAC
+	RETURN tmp[31:0]
+}
+
+dst[31:0] := SELECT4(a[127:0], imm8[1:0])
+dst[63:32] := SELECT4(a[127:0], imm8[3:2])
+dst[95:64] := SELECT4(b[127:0], imm8[5:4])
+dst[127:96] := SELECT4(b[127:0], imm8[7:6])
+dst[159:128] := SELECT4(a[255:128], imm8[1:0])
+dst[191:160] := SELECT4(a[255:128], imm8[3:2])
+dst[223:192] := SELECT4(b[255:128], imm8[5:4])
+dst[255:224] := SELECT4(b[255:128], imm8[7:6])
+dst[287:256] := SELECT4(a[383:256], imm8[1:0])
+dst[319:288] := SELECT4(a[383:256], imm8[3:2])
+dst[351:320] := SELECT4(b[383:256], imm8[5:4])
+dst[383:352] := SELECT4(b[383:256], imm8[7:6])
+dst[415:384] := SELECT4(a[511:384], imm8[1:0])
+dst[447:416] := SELECT4(a[511:384], imm8[3:2])
+dst[479:448] := SELECT4(b[511:384], imm8[5:4])
+dst[511:480] := SELECT4(b[511:384], imm8[7:6])
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vshufps' form='zmm {k}, zmm, zmm, imm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512d" name="_mm512_mask_sqrt_pd">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Elementary Math Functions</category>
+	<parameter varname="src" type="__m512d"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m512d"/>
+	<description>Compute the square root of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*64
+	IF k[j]
+		dst[i+63:i] := SQRT(a[i+63:i])
+	ELSE
+		dst[i+63:i] := src[i+63:i]
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vsqrtpd' form='zmm {k}, zmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512d" name="_mm512_mask_sqrt_round_pd">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Elementary Math Functions</category>
+	<parameter varname="src" type="__m512d"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m512d"/>
+	<parameter varname="rounding" type="int"/>
+	<description>Compute the square root of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).
+	[round_note]</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*64
+	IF k[j]
+		dst[i+63:i] := SQRT(a[i+63:i])
+	ELSE
+		dst[i+63:i] := src[i+63:i]
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vsqrtpd' form='zmm {k}, zmm {er}'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512d" name="_mm512_maskz_sqrt_pd">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Elementary Math Functions</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m512d"/>
+	<description>Compute the square root of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*64
+	IF k[j]
+		dst[i+63:i] := SQRT(a[i+63:i])
+	ELSE
+		dst[i+63:i] := 0
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vsqrtpd' form='zmm {k}, zmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512d" name="_mm512_maskz_sqrt_round_pd">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Elementary Math Functions</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m512d"/>
+	<parameter varname="rounding" type="int"/>
+	<description>Compute the square root of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).
+	[round_note].</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*64
+	IF k[j]
+		dst[i+63:i] := SQRT(a[i+63:i])
+	ELSE
+		dst[i+63:i] := 0
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vsqrtpd' form='zmm {k}, zmm {er}'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512d" name="_mm512_sqrt_pd">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Elementary Math Functions</category>
+	<parameter varname="a" type="__m512d"/>
+	<description>Compute the square root of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*64
+	dst[i+63:i] := SQRT(a[i+63:i])
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vsqrtpd' form='zmm {k}, zmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512d" name="_mm512_sqrt_round_pd">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Elementary Math Functions</category>
+	<parameter varname="a" type="__m512d"/>
+	<parameter varname="rounding" type="int"/>
+	<description>Compute the square root of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst".
+	[round_note].</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*64
+	dst[i+63:i] := SQRT(a[i+63:i])
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vsqrtpd' form='zmm {k}, zmm {er}'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512" name="_mm512_mask_sqrt_ps">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Elementary Math Functions</category>
+	<parameter varname="src" type="__m512"/>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="a" type="__m512"/>
+	<description>Compute the square root of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 15
+	i := j*32
+	IF k[j]
+		dst[i+31:i] := SQRT(a[i+31:i])
+	ELSE
+		dst[i+31:i] := src[i+31:i]
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vsqrtps' form='zmm {k}, zmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512" name="_mm512_mask_sqrt_round_ps">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Elementary Math Functions</category>
+	<parameter varname="src" type="__m512"/>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="a" type="__m512"/>
+	<parameter varname="rounding" type="int"/>
+	<description>Compute the square root of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).
+	[round_note]</description>
+	<operation>
+FOR j := 0 to 15
+	i := j*32
+	IF k[j]
+		dst[i+31:i] := SQRT(a[i+31:i])
+	ELSE
+		dst[i+31:i] := src[i+31:i]
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vsqrtps' form='zmm {k}, zmm {er}'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512" name="_mm512_maskz_sqrt_ps">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Elementary Math Functions</category>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="a" type="__m512"/>
+	<description>Compute the square root of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 15
+	i := j*32
+	IF k[j]
+		dst[i+31:i] := SQRT(a[i+31:i])
+	ELSE
+		dst[i+31:i] := 0
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vsqrtps' form='zmm {k}, zmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512" name="_mm512_maskz_sqrt_round_ps">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Elementary Math Functions</category>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="a" type="__m512"/>
+	<parameter varname="rounding" type="int"/>
+	<description>Compute the square root of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).
+	[round_note]</description>
+	<operation>
+FOR j := 0 to 15
+	i := j*32
+	IF k[j]
+		dst[i+31:i] := SQRT(a[i+31:i])
+	ELSE
+		dst[i+31:i] := 0
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vsqrtps' form='zmm {k}, zmm {er}'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512" name="_mm512_sqrt_ps">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Elementary Math Functions</category>
+	<parameter varname="a" type="__m512"/>
+	<description>Compute the square root of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 15
+	i := j*32
+	dst[i+31:i] := SQRT(a[i+31:i])
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vsqrtps' form='zmm {k}, zmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512" name="_mm512_sqrt_round_ps">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Elementary Math Functions</category>
+	<parameter varname="a" type="__m512"/>
+	<parameter varname="rounding" type="int"/>
+	<description>Compute the square root of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst".
+	[round_note].</description>
+	<operation>
+FOR j := 0 to 15
+	i := j*32
+	dst[i+31:i] := SQRT(a[i+31:i])
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vsqrtps' form='zmm {k}, zmm {er}'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128d" name="_mm_mask_sqrt_round_sd">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Elementary Math Functions</category>
+	<parameter varname="src" type="__m128d"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128d"/>
+	<parameter varname="b" type="__m128d"/>
+	<parameter varname="rounding" type="int"/>
+	<description>Compute the square root of the lower double-precision (64-bit) floating-point element in "b", store the result in the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst".
+	[round_note]
+	</description>
+	<operation>
+IF k[0]
+	dst[63:0] := SQRT(b[63:0])
+ELSE
+	dst[63:0] := src[63:0]
+FI
+dst[127:64] := a[127:64]
+dst[MAX:128] := 0
+	</operation>
+	<instruction name='vsqrtsd' form='xmm {k}, xmm, xmm {er}'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128d" name="_mm_mask_sqrt_sd">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Elementary Math Functions</category>
+	<parameter varname="src" type="__m128d"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128d"/>
+	<parameter varname="b" type="__m128d"/>
+	<description>Compute the square root of the lower double-precision (64-bit) floating-point element in "b", store the result in the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst".</description>
+	<operation>
+IF k[0]
+	dst[63:0] := SQRT(b[63:0])
+ELSE
+	dst[63:0] := src[63:0]
+FI
+dst[127:64] := a[127:64]
+dst[MAX:128] := 0
+	</operation>
+	<instruction name='vsqrtsd' form='xmm {k}, xmm, xmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128d" name="_mm_maskz_sqrt_round_sd">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Elementary Math Functions</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128d"/>
+	<parameter varname="b" type="__m128d"/>
+	<parameter varname="rounding" type="int"/>
+	<description>Compute the square root of the lower double-precision (64-bit) floating-point element in "b", store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst".
+	[round_note]
+	</description>
+	<operation>
+IF k[0]
+	dst[63:0] := SQRT(b[63:0])
+ELSE
+	dst[63:0] := 0
+FI
+dst[127:64] := a[127:64]
+dst[MAX:128] := 0
+	</operation>
+	<instruction name='vsqrtsd' form='xmm {k}, xmm, xmm {er}'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128d" name="_mm_maskz_sqrt_sd">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Elementary Math Functions</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128d"/>
+	<parameter varname="b" type="__m128d"/>
+	<description>Compute the square root of the lower double-precision (64-bit) floating-point element in "b", store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst".</description>
+	<operation>
+IF k[0]
+	dst[63:0] := SQRT(b[63:0])
+ELSE
+	dst[63:0] := 0
+FI
+dst[127:64] := a[127:64]
+dst[MAX:128] := 0
+	</operation>
+	<instruction name='vsqrtsd' form='xmm {k}, xmm, xmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128d" name="_mm_sqrt_round_sd">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Elementary Math Functions</category>
+	<parameter varname="a" type="__m128d"/>
+	<parameter varname="b" type="__m128d"/>
+	<parameter varname="rounding" type="int"/>
+	<description>Compute the square root of the lower double-precision (64-bit) floating-point element in "b", store the result in the lower element of "dst", and copy the upper element from "a" to the upper element of "dst".
+	[round_note]
+	</description>
+	<operation>
+dst[63:0] := SQRT(b[63:0])
+dst[127:64] := a[127:64]
+dst[MAX:128] := 0
+	</operation>
+	<instruction name='vsqrtsd' form='xmm {k}, xmm, xmm {er}'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128" name="_mm_mask_sqrt_round_ss">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Elementary Math Functions</category>
+	<parameter varname="src" type="__m128"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128"/>
+	<parameter varname="b" type="__m128"/>
+	<parameter varname="rounding" type="int"/>
+	<description>Compute the square root of the lower single-precision (32-bit) floating-point element in "b", store the result in the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper 3 packed elements from "a" to the upper elements of "dst".
+	[round_note]
+	</description>
+	<operation>
+IF k[0]
+	dst[31:0] := SQRT(b[31:0])
+ELSE
+	dst[31:0] := src[31:0]
+FI
+dst[127:32] := a[127:32]
+dst[MAX:128] := 0
+	</operation>
+	<instruction name='vsqrtss' form='xmm {k}, xmm, xmm {er}'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128" name="_mm_mask_sqrt_ss">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Elementary Math Functions</category>
+	<parameter varname="src" type="__m128"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128"/>
+	<parameter varname="b" type="__m128"/>
+	<description>Compute the square root of the lower single-precision (32-bit) floating-point element in "b", store the result in the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper 3 packed elements from "a" to the upper elements of "dst".</description>
+	<operation>
+IF k[0]
+	dst[31:0] := SQRT(b[31:0])
+ELSE
+	dst[31:0] := src[31:0]
+FI
+dst[127:32] := a[127:32]
+dst[MAX:128] := 0
+	</operation>
+	<instruction name='vsqrtss' form='xmm {k}, xmm, xmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128" name="_mm_maskz_sqrt_round_ss">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Elementary Math Functions</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128"/>
+	<parameter varname="b" type="__m128"/>
+	<parameter varname="rounding" type="int"/>
+	<description>Compute the square root of the lower single-precision (32-bit) floating-point element in "b", store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from "a" to the upper elements of "dst".
+	[round_note]
+	</description>
+	<operation>
+IF k[0]
+	dst[31:0] := SQRT(b[31:0])
+ELSE
+	dst[31:0] := 0
+FI
+dst[127:32] := a[127:32]
+dst[MAX:128] := 0
+	</operation>
+	<instruction name='vsqrtss' form='xmm {k}, xmm, xmm {er}'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128" name="_mm_maskz_sqrt_ss">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Elementary Math Functions</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128"/>
+	<parameter varname="b" type="__m128"/>
+	<description>Compute the square root of the lower single-precision (32-bit) floating-point element in "b", store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from "a" to the upper elements of "dst".</description>
+	<operation>
+IF k[0]
+	dst[31:0] := SQRT(b[31:0])
+ELSE
+	dst[31:0] := 0
+FI
+dst[127:32] := a[127:32]
+dst[MAX:128] := 0
+	</operation>
+	<instruction name='vsqrtss' form='xmm {k}, xmm, xmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128" name="_mm_sqrt_round_ss">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Elementary Math Functions</category>
+	<parameter varname="a" type="__m128"/>
+	<parameter varname="b" type="__m128"/>
+	<parameter varname="rounding" type="int"/>
+	<description>Compute the square root of the lower single-precision (32-bit) floating-point element in "b", store the result in the lower element of "dst", and copy the upper 3 packed elements from "a" to the upper elements of "dst".
+	[round_note]
+	</description>
+	<operation>
+dst[31:0] := SQRT(b[31:0])
+dst[127:32] := a[127:32]
+dst[MAX:128] := 0
+	</operation>
+	<instruction name='vsqrtss' form='xmm {k}, xmm, xmm {er}'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" rettype="__m512d" name="_mm512_mask_sub_pd">
+	<type>Floating Point</type>
+	<CPUID>AVX512F/KNCNI</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="src" type="__m512d"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m512d"/>
+	<parameter varname="b" type="__m512d"/>
+	<description>Subtract packed double-precision (64-bit) floating-point elements in "b" from packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*64
+	IF k[j]
+		dst[i+63:i] := a[i+63:i] - b[i+63:i]
+	ELSE
+		dst[i+63:i] := src[i+63:i]
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vsubpd' form='zmm {k}, zmm, zmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" rettype="__m512d" name="_mm512_mask_sub_round_pd">
+	<type>Floating Point</type>
+	<CPUID>AVX512F/KNCNI</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="src" type="__m512d"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m512d"/>
+	<parameter varname="b" type="__m512d"/>
+	<parameter varname="rounding" type="int"/>
+	<description>Subtract packed double-precision (64-bit) floating-point elements in "b" from packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).
+	[round_note]</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*64
+	IF k[j]
+		dst[i+63:i] := a[i+63:i] - b[i+63:i]
+	ELSE
+		dst[i+63:i] := src[i+63:i]
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vsubpd' form='zmm {k}, zmm, zmm {er}'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512d" name="_mm512_maskz_sub_pd">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m512d"/>
+	<parameter varname="b" type="__m512d"/>
+	<description>Subtract packed double-precision (64-bit) floating-point elements in "b" from packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*64
+	IF k[j]
+		dst[i+63:i] := a[i+63:i] - b[i+63:i]
+	ELSE
+		dst[i+63:i] := 0
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vsubpd' form='zmm {k}, zmm, zmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512d" name="_mm512_maskz_sub_round_pd">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m512d"/>
+	<parameter varname="b" type="__m512d"/>
+	<parameter varname="rounding" type="int"/>
+	<description>Subtract packed double-precision (64-bit) floating-point elements in "b" from packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).
+	[round_note]</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*64
+	IF k[j]
+		dst[i+63:i] := a[i+63:i] - b[i+63:i]
+	ELSE
+		dst[i+63:i] := 0
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vsubpd' form='zmm {k}, zmm, zmm {er}'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" rettype="__m512d" name="_mm512_sub_pd">
+	<type>Floating Point</type>
+	<CPUID>AVX512F/KNCNI</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="a" type="__m512d"/>
+	<parameter varname="b" type="__m512d"/>
+	<description>Subtract packed double-precision (64-bit) floating-point elements in "b" from packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*64
+	dst[i+63:i] := a[i+63:i] - b[i+63:i]
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vsubpd' form='zmm {k}, zmm, zmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" rettype="__m512d" name="_mm512_sub_round_pd">
+	<type>Floating Point</type>
+	<CPUID>AVX512F/KNCNI</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="a" type="__m512d"/>
+	<parameter varname="b" type="__m512d"/>
+	<parameter varname="rounding" type="int"/>
+	<description>Subtract packed double-precision (64-bit) floating-point elements in "b" from packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst".
+	[round_note]</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*64
+	dst[i+63:i] := a[i+63:i] - b[i+63:i]
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vsubpd' form='zmm {k}, zmm, zmm {er}'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" rettype="__m512" name="_mm512_mask_sub_ps">
+	<type>Floating Point</type>
+	<CPUID>AVX512F/KNCNI</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="src" type="__m512"/>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="a" type="__m512"/>
+	<parameter varname="b" type="__m512"/>
+	<description>Subtract packed single-precision (32-bit) floating-point elements in "b" from packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 15
+	i := j*32
+	IF k[j]
+		dst[i+31:i] := a[i+31:i] - b[i+31:i]
+	ELSE
+		dst[i+31:i] := src[i+31:i]
+	FI	
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vsubps' form='zmm {k}, zmm, zmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" rettype="__m512" name="_mm512_mask_sub_round_ps">
+	<type>Floating Point</type>
+	<CPUID>AVX512F/KNCNI</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="src" type="__m512"/>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="a" type="__m512"/>
+	<parameter varname="b" type="__m512"/>
+	<parameter varname="rounding" type="int"/>
+	<description>Subtract packed single-precision (32-bit) floating-point elements in "b" from packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).
+	[round_note]</description>
+	<operation>
+FOR j := 0 to 15
+	i := j*32
+	IF k[j]
+		dst[i+31:i] := a[i+31:i] - b[i+31:i]
+	ELSE
+		dst[i+31:i] := src[i+31:i]
+	FI	
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vsubps' form='zmm {k}, zmm, zmm {er}'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512" name="_mm512_maskz_sub_ps">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="a" type="__m512"/>
+	<parameter varname="b" type="__m512"/>
+	<description>Subtract packed single-precision (32-bit) floating-point elements in "b" from packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 15
+	i := j*32
+	IF k[j]
+		dst[i+31:i] := a[i+31:i] - b[i+31:i]
+	ELSE
+		dst[i+31:i] := 0
+	FI	
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vsubps' form='zmm {k}, zmm, zmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512" name="_mm512_maskz_sub_round_ps">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="a" type="__m512"/>
+	<parameter varname="b" type="__m512"/>
+	<parameter varname="rounding" type="int"/>
+	<description>Subtract packed single-precision (32-bit) floating-point elements in "b" from packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).
+	[round_note]</description>
+	<operation>
+FOR j := 0 to 15
+	i := j*32
+	IF k[j]
+		dst[i+31:i] := a[i+31:i] - b[i+31:i]
+	ELSE
+		dst[i+31:i] := 0
+	FI	
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vsubps' form='zmm {k}, zmm, zmm {er}'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" rettype="__m512" name="_mm512_sub_ps">
+	<type>Floating Point</type>
+	<CPUID>AVX512F/KNCNI</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="a" type="__m512"/>
+	<parameter varname="b" type="__m512"/>
+	<description>Subtract packed single-precision (32-bit) floating-point elements in "b" from packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 15
+	i := j*32
+	dst[i+31:i] := a[i+31:i] - b[i+31:i]
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vsubps' form='zmm {k}, zmm, zmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" rettype="__m512" name="_mm512_sub_round_ps">
+	<type>Floating Point</type>
+	<CPUID>AVX512F/KNCNI</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="a" type="__m512"/>
+	<parameter varname="b" type="__m512"/>
+	<parameter varname="rounding" type="int"/>
+	<description>Subtract packed single-precision (32-bit) floating-point elements in "b" from packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst".
+	[round_note]</description>
+	<operation>
+FOR j := 0 to 15
+	i := j*32
+	dst[i+31:i] := a[i+31:i] - b[i+31:i]
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vsubps' form='zmm {k}, zmm, zmm {er}'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128d" name="_mm_mask_sub_round_sd">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="src" type="__m128d"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128d"/>
+	<parameter varname="b" type="__m128d"/>
+	<parameter varname="rounding" type="int"/>
+	<description>Subtract the lower double-precision (64-bit) floating-point element in "b" from the lower double-precision (64-bit) floating-point element in "a", store the result in the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst".
+	[round_note]
+	</description>
+	<operation>
+IF k[0]
+	dst[63:0] := a[63:0] - b[63:0]
+ELSE
+	dst[63:0] := src[63:0]
+FI
+dst[127:64] := a[127:64]
+dst[MAX:128] := 0
+	</operation>
+	<instruction name='vsubsd' form='xmm {k}, xmm, xmm {er}'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128d" name="_mm_mask_sub_sd">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="src" type="__m128d"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128d"/>
+	<parameter varname="b" type="__m128d"/>
+	<description>Subtract the lower double-precision (64-bit) floating-point element in "b" from the lower double-precision (64-bit) floating-point element in "a", store the result in the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst".</description>
+	<operation>
+IF k[0]
+	dst[63:0] := a[63:0] - b[63:0]
+ELSE
+	dst[63:0] := src[63:0]
+FI
+dst[127:64] := a[127:64]
+dst[MAX:128] := 0
+	</operation>
+	<instruction name='vsubsd' form='xmm {k}, xmm, xmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128d" name="_mm_maskz_sub_round_sd">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128d"/>
+	<parameter varname="b" type="__m128d"/>
+	<parameter varname="rounding" type="int"/>
+	<description>Subtract the lower double-precision (64-bit) floating-point element in "b" from the lower double-precision (64-bit) floating-point element in "a", store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst".
+	[round_note]
+	</description>
+	<operation>
+IF k[0]
+	dst[63:0] := a[63:0] - b[63:0]
+ELSE
+	dst[63:0] := 0
+FI
+dst[127:64] := a[127:64]
+dst[MAX:128] := 0
+	</operation>
+	<instruction name='vsubsd' form='xmm {k}, xmm, xmm {er}'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128d" name="_mm_maskz_sub_sd">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128d"/>
+	<parameter varname="b" type="__m128d"/>
+	<description>Subtract the lower double-precision (64-bit) floating-point element in "b" from the lower double-precision (64-bit) floating-point element in "a", store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst".</description>
+	<operation>
+IF k[0]
+	dst[63:0] := a[63:0] - b[63:0]
+ELSE
+	dst[63:0] := 0
+FI
+dst[127:64] := a[127:64]
+dst[MAX:128] := 0
+	</operation>
+	<instruction name='vsubsd' form='xmm {k}, xmm, xmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128d" name="_mm_sub_round_sd">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="a" type="__m128d"/>
+	<parameter varname="b" type="__m128d"/>
+	<parameter varname="rounding" type="int"/>
+	<description>Subtract the lower double-precision (64-bit) floating-point element in "b" from the lower double-precision (64-bit) floating-point element in "a", store the result in the lower element of "dst", and copy the upper element from "a" to the upper element of "dst".
+	[round_note]
+	</description>
+	<operation>
+dst[63:0] := a[63:0] - b[63:0]
+dst[127:64] := a[127:64]
+dst[MAX:128] := 0
+	</operation>
+	<instruction name='vsubsd' form='xmm {k}, xmm, xmm {er}'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128" name="_mm_mask_sub_round_ss">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="src" type="__m128"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128"/>
+	<parameter varname="b" type="__m128"/>
+	<parameter varname="rounding" type="int"/>
+	<description>Subtract the lower single-precision (32-bit) floating-point element in "b" from the lower single-precision (32-bit) floating-point element in "a", store the result in the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper 3 packed elements from "a" to the upper elements of "dst".
+	[round_note]
+	</description>
+	<operation>
+IF k[0]
+	dst[31:0] := a[31:0] - b[31:0]
+ELSE
+	dst[31:0] := src[31:0]
+FI
+dst[127:32] := a[127:32]
+dst[MAX:128] := 0
+	</operation>
+	<instruction name='vsubss' form='xmm {k}, xmm, xmm {er}'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128" name="_mm_mask_sub_ss">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="src" type="__m128"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128"/>
+	<parameter varname="b" type="__m128"/>
+	<description>Subtract the lower single-precision (32-bit) floating-point element in "b" from the lower single-precision (32-bit) floating-point element in "a", store the result in the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper 3 packed elements from "a" to the upper elements of "dst".</description>
+	<operation>
+IF k[0]
+	dst[31:0] := a[31:0] - b[31:0]
+ELSE
+	dst[31:0] := src[31:0]
+FI
+dst[127:32] := a[127:32]
+dst[MAX:128] := 0
+	</operation>
+	<instruction name='vsubss' form='xmm {k}, xmm, xmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128" name="_mm_maskz_sub_round_ss">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128"/>
+	<parameter varname="b" type="__m128"/>
+	<parameter varname="rounding" type="int"/>
+	<description>Subtract the lower single-precision (32-bit) floating-point element in "b" from the lower single-precision (32-bit) floating-point element in "a", store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from "a" to the upper elements of "dst".
+	[round_note]
+	</description>
+	<operation>
+IF k[0]
+	dst[31:0] := a[31:0] - b[31:0]
+ELSE
+	dst[31:0] := 0
+FI
+dst[127:32] := a[127:32]
+dst[MAX:128] := 0
+	</operation>
+	<instruction name='vsubss' form='xmm {k}, xmm, xmm {er}'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128" name="_mm_maskz_sub_ss">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128"/>
+	<parameter varname="b" type="__m128"/>
+	<description>Subtract the lower single-precision (32-bit) floating-point element in "b" from the lower single-precision (32-bit) floating-point element in "a", store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from "a" to the upper elements of "dst".</description>
+	<operation>
+IF k[0]
+	dst[31:0] := a[31:0] - b[31:0]
+ELSE
+	dst[31:0] := 0
+FI
+dst[127:32] := a[127:32]
+dst[MAX:128] := 0
+	</operation>
+	<instruction name='vsubss' form='xmm {k}, xmm, xmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128" name="_mm_sub_round_ss">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="a" type="__m128"/>
+	<parameter varname="b" type="__m128"/>
+	<parameter varname="rounding" type="int"/>
+	<description>Subtract the lower single-precision (32-bit) floating-point element in "b" from the lower single-precision (32-bit) floating-point element in "a", store the result in the lower element of "dst", and copy the upper 3 packed elements from "a" to the upper elements of "dst".
+	[round_note]
+	</description>
+	<operation>
+dst[31:0] := a[31:0] - b[31:0]
+dst[127:32] := a[127:32]
+dst[MAX:128] := 0
+	</operation>
+	<instruction name='vsubss' form='xmm {k}, xmm, xmm {er}'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512d" name="_mm512_mask_unpackhi_pd">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Swizzle</category>
+	<parameter varname="src" type="__m512d"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m512d"/>
+	<parameter varname="b" type="__m512d"/>
+	<description>Unpack and interleave double-precision (64-bit) floating-point elements from the high half of each 128-bit lane in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). </description>
+	<operation>
+INTERLEAVE_HIGH_QWORDS(src1[127:0], src2[127:0]){
+	dst[63:0] := src1[127:64] 
+	dst[127:64] := src2[127:64] 
+	RETURN dst[127:0]
+}	
+
+tmp_dst[127:0] := INTERLEAVE_HIGH_QWORDS(a[127:0], b[127:0])
+tmp_dst[255:128] := INTERLEAVE_HIGH_QWORDS(a[255:128], b[255:128])
+tmp_dst[383:256] := INTERLEAVE_HIGH_QWORDS(a[383:256], b[383:256])
+tmp_dst[511:384] := INTERLEAVE_HIGH_QWORDS(a[511:384], b[511:384])
+
+FOR j := 0 to 7
+	i := j*64
+	IF k[j]
+		dst[i+63:i] := tmp_dst[i+63:i]
+	ELSE
+		dst[i+63:i] := src[i+63:i]
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vunpckhpd' form='zmm {k}, zmm, zmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512d" name="_mm512_maskz_unpackhi_pd">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Swizzle</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m512d"/>
+	<parameter varname="b" type="__m512d"/>
+	<description>Unpack and interleave double-precision (64-bit) floating-point elements from the high half of each 128-bit lane in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). </description>
+	<operation>
+INTERLEAVE_HIGH_QWORDS(src1[127:0], src2[127:0]){
+	dst[63:0] := src1[127:64] 
+	dst[127:64] := src2[127:64] 
+	RETURN dst[127:0]
+}	
+
+tmp_dst[127:0] := INTERLEAVE_HIGH_QWORDS(a[127:0], b[127:0])
+tmp_dst[255:128] := INTERLEAVE_HIGH_QWORDS(a[255:128], b[255:128])
+tmp_dst[383:256] := INTERLEAVE_HIGH_QWORDS(a[383:256], b[383:256])
+tmp_dst[511:384] := INTERLEAVE_HIGH_QWORDS(a[511:384], b[511:384])
+
+FOR j := 0 to 7
+	i := j*64
+	IF k[j]
+		dst[i+63:i] := tmp_dst[i+63:i]
+	ELSE
+		dst[i+63:i] := 0
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vunpckhpd' form='zmm {k}, zmm, zmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512d" name="_mm512_unpackhi_pd">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Swizzle</category>
+	<parameter varname="a" type="__m512d"/>
+	<parameter varname="b" type="__m512d"/>
+	<description>Unpack and interleave double-precision (64-bit) floating-point elements from the high half of each 128-bit lane in "a" and "b", and store the results in "dst". </description>
+	<operation>
+INTERLEAVE_HIGH_QWORDS(src1[127:0], src2[127:0]){
+	dst[63:0] := src1[127:64] 
+	dst[127:64] := src2[127:64] 
+	RETURN dst[127:0]
+}	
+
+dst[127:0] := INTERLEAVE_HIGH_QWORDS(a[127:0], b[127:0])
+dst[255:128] := INTERLEAVE_HIGH_QWORDS(a[255:128], b[255:128])
+dst[383:256] := INTERLEAVE_HIGH_QWORDS(a[383:256], b[383:256])
+dst[511:384] := INTERLEAVE_HIGH_QWORDS(a[511:384], b[511:384])
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vunpckhpd' form='zmm {k}, zmm, zmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512" name="_mm512_mask_unpackhi_ps">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Swizzle</category>
+	<parameter varname="src" type="__m512"/>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="a" type="__m512"/>
+	<parameter varname="b" type="__m512"/>
+	<description>Unpack and interleave single-precision (32-bit) floating-point elements from the high half of each 128-bit lane in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). </description>
+	<operation>
+INTERLEAVE_HIGH_DWORDS(src1[127:0], src2[127:0]){
+	dst[31:0] := src1[95:64] 
+	dst[63:32] := src2[95:64] 
+	dst[95:64] := src1[127:96] 
+	dst[127:96] := src2[127:96] 
+	RETURN dst[127:0]
+}	
+
+tmp_dst[127:0] := INTERLEAVE_HIGH_DWORDS(a[127:0], b[127:0])
+tmp_dst[255:128] := INTERLEAVE_HIGH_DWORDS(a[255:128], b[255:128])
+tmp_dst[383:256] := INTERLEAVE_HIGH_DWORDS(a[383:256], b[383:256])
+tmp_dst[511:384] := INTERLEAVE_HIGH_DWORDS(a[511:384], b[511:384])
+
+FOR j := 0 to 15
+	i := j*32
+	IF k[j]
+		dst[i+31:i] := tmp_dst[i+31:i]
+	ELSE
+		dst[i+31:i] := src[i+31:i]
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vunpckhps' form='zmm {k}, zmm, zmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512" name="_mm512_maskz_unpackhi_ps">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Swizzle</category>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="a" type="__m512"/>
+	<parameter varname="b" type="__m512"/>
+	<description>Unpack and interleave single-precision (32-bit) floating-point elements from the high half of each 128-bit lane in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). </description>
+	<operation>
+INTERLEAVE_HIGH_DWORDS(src1[127:0], src2[127:0]){
+	dst[31:0] := src1[95:64] 
+	dst[63:32] := src2[95:64] 
+	dst[95:64] := src1[127:96] 
+	dst[127:96] := src2[127:96] 
+	RETURN dst[127:0]
+}	
+
+tmp_dst[127:0] := INTERLEAVE_HIGH_DWORDS(a[127:0], b[127:0])
+tmp_dst[255:128] := INTERLEAVE_HIGH_DWORDS(a[255:128], b[255:128])
+tmp_dst[383:256] := INTERLEAVE_HIGH_DWORDS(a[383:256], b[383:256])
+tmp_dst[511:384] := INTERLEAVE_HIGH_DWORDS(a[511:384], b[511:384])
+
+FOR j := 0 to 15
+	i := j*32
+	IF k[j]
+		dst[i+31:i] := tmp_dst[i+31:i]
+	ELSE
+		dst[i+31:i] := 0
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vunpckhps' form='zmm {k}, zmm, zmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512" name="_mm512_unpackhi_ps">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Swizzle</category>
+	<parameter varname="a" type="__m512"/>
+	<parameter varname="b" type="__m512"/>
+	<description>Unpack and interleave single-precision (32-bit) floating-point elements from the high half of each 128-bit lane in "a" and "b", and store the results in "dst". </description>
+	<operation>
+INTERLEAVE_HIGH_DWORDS(src1[127:0], src2[127:0]){
+	dst[31:0] := src1[95:64] 
+	dst[63:32] := src2[95:64] 
+	dst[95:64] := src1[127:96] 
+	dst[127:96] := src2[127:96] 
+	RETURN dst[127:0]
+}	
+
+dst[127:0] := INTERLEAVE_HIGH_DWORDS(a[127:0], b[127:0])
+dst[255:128] := INTERLEAVE_HIGH_DWORDS(a[255:128], b[255:128])
+dst[383:256] := INTERLEAVE_HIGH_DWORDS(a[383:256], b[383:256])
+dst[511:384] := INTERLEAVE_HIGH_DWORDS(a[511:384], b[511:384])
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vunpckhps' form='zmm {k}, zmm, zmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512d" name="_mm512_mask_unpacklo_pd">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Swizzle</category>
+	<parameter varname="src" type="__m512d"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m512d"/>
+	<parameter varname="b" type="__m512d"/>
+	<description>Unpack and interleave double-precision (64-bit) floating-point elements from the low half of each 128-bit lane in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). </description>
+	<operation>
+INTERLEAVE_QWORDS(src1[127:0], src2[127:0]){
+	dst[63:0] := src1[63:0] 
+	dst[127:64] := src2[63:0] 
+	RETURN dst[127:0]
+}
+
+tmp_dst[127:0] := INTERLEAVE_QWORDS(a[127:0], b[127:0])
+tmp_dst[255:128] := INTERLEAVE_QWORDS(a[255:128], b[255:128])
+tmp_dst[383:256] := INTERLEAVE_QWORDS(a[383:256], b[383:256])
+tmp_dst[511:384] := INTERLEAVE_QWORDS(a[511:384], b[511:384])
+
+FOR j := 0 to 7
+	i := j*64
+	IF k[j]
+		dst[i+63:i] := tmp_dst[i+63:i]
+	ELSE
+		dst[i+63:i] := src[i+63:i]
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vunpcklpd' form='zmm {k}, zmm, zmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512d" name="_mm512_maskz_unpacklo_pd">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Swizzle</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m512d"/>
+	<parameter varname="b" type="__m512d"/>
+	<description>Unpack and interleave double-precision (64-bit) floating-point elements from the low half of each 128-bit lane in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). </description>
+	<operation>
+INTERLEAVE_QWORDS(src1[127:0], src2[127:0]){
+	dst[63:0] := src1[63:0] 
+	dst[127:64] := src2[63:0] 
+	RETURN dst[127:0]
+}
+
+tmp_dst[127:0] := INTERLEAVE_QWORDS(a[127:0], b[127:0])
+tmp_dst[255:128] := INTERLEAVE_QWORDS(a[255:128], b[255:128])
+tmp_dst[383:256] := INTERLEAVE_QWORDS(a[383:256], b[383:256])
+tmp_dst[511:384] := INTERLEAVE_QWORDS(a[511:384], b[511:384])
+
+FOR j := 0 to 7
+	i := j*64
+	IF k[j]
+		dst[i+63:i] := tmp_dst[i+63:i]
+	ELSE
+		dst[i+63:i] := 0
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vunpcklpd' form='zmm {k}, zmm, zmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512d" name="_mm512_unpacklo_pd">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Swizzle</category>
+	<parameter varname="a" type="__m512d"/>
+	<parameter varname="b" type="__m512d"/>
+	<description>Unpack and interleave double-precision (64-bit) floating-point elements from the low half of each 128-bit lane in "a" and "b", and store the results in "dst". </description>
+	<operation>
+INTERLEAVE_QWORDS(src1[127:0], src2[127:0]){
+	dst[63:0] := src1[63:0] 
+	dst[127:64] := src2[63:0] 
+	RETURN dst[127:0]
+}
+
+dst[127:0] := INTERLEAVE_QWORDS(a[127:0], b[127:0])
+dst[255:128] := INTERLEAVE_QWORDS(a[255:128], b[255:128])
+dst[383:256] := INTERLEAVE_QWORDS(a[383:256], b[383:256])
+dst[511:384] := INTERLEAVE_QWORDS(a[511:384], b[511:384])
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vunpcklpd' form='zmm {k}, zmm, zmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512" name="_mm512_mask_unpacklo_ps">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Swizzle</category>
+	<parameter varname="src" type="__m512"/>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="a" type="__m512"/>
+	<parameter varname="b" type="__m512"/>
+	<description>Unpack and interleave single-precision (32-bit) floating-point elements from the low half of each 128-bit lane in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). </description>
+	<operation>
+INTERLEAVE_DWORDS(src1[127:0], src2[127:0]){
+	dst[31:0] := src1[31:0] 
+	dst[63:32] := src2[31:0] 
+	dst[95:64] := src1[63:32] 
+	dst[127:96] := src2[63:32] 
+	RETURN dst[127:0]
+}	
+
+tmp_dst[127:0] := INTERLEAVE_DWORDS(a[127:0], b[127:0])
+tmp_dst[255:128] := INTERLEAVE_DWORDS(a[255:128], b[255:128])
+tmp_dst[383:256] := INTERLEAVE_DWORDS(a[383:256], b[383:256])
+tmp_dst[511:384] := INTERLEAVE_DWORDS(a[511:384], b[511:384])
+
+FOR j := 0 to 15
+	i := j*32
+	IF k[j]
+		dst[i+31:i] := tmp_dst[i+31:i]
+	ELSE
+		dst[i+31:i] := src[i+31:i]
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vunpcklps' form='zmm {k}, zmm, zmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512" name="_mm512_maskz_unpacklo_ps">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Swizzle</category>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="a" type="__m512"/>
+	<parameter varname="b" type="__m512"/>
+	<description>Unpack and interleave single-precision (32-bit) floating-point elements from the low half of each 128-bit lane in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). </description>
+	<operation>
+INTERLEAVE_DWORDS(src1[127:0], src2[127:0]){
+	dst[31:0] := src1[31:0] 
+	dst[63:32] := src2[31:0] 
+	dst[95:64] := src1[63:32] 
+	dst[127:96] := src2[63:32] 
+	RETURN dst[127:0]
+}	
+
+tmp_dst[127:0] := INTERLEAVE_DWORDS(a[127:0], b[127:0])
+tmp_dst[255:128] := INTERLEAVE_DWORDS(a[255:128], b[255:128])
+tmp_dst[383:256] := INTERLEAVE_DWORDS(a[383:256], b[383:256])
+tmp_dst[511:384] := INTERLEAVE_DWORDS(a[511:384], b[511:384])
+
+FOR j := 0 to 15
+	i := j*32
+	IF k[j]
+		dst[i+31:i] := tmp_dst[i+31:i]
+	ELSE
+		dst[i+31:i] := 0
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vunpcklps' form='zmm {k}, zmm, zmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512" name="_mm512_unpacklo_ps">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Swizzle</category>
+	<parameter varname="a" type="__m512"/>
+	<parameter varname="b" type="__m512"/>
+	<description>Unpack and interleave single-precision (32-bit) floating-point elements from the low half of each 128-bit lane in "a" and "b", and store the results in "dst". </description>
+	<operation>
+INTERLEAVE_DWORDS(src1[127:0], src2[127:0]){
+	dst[31:0] := src1[31:0] 
+	dst[63:32] := src2[31:0] 
+	dst[95:64] := src1[63:32] 
+	dst[127:96] := src2[63:32] 
+	RETURN dst[127:0]
+}	
+
+dst[127:0] := INTERLEAVE_DWORDS(a[127:0], b[127:0])
+dst[255:128] := INTERLEAVE_DWORDS(a[255:128], b[255:128])
+dst[383:256] := INTERLEAVE_DWORDS(a[383:256], b[383:256])
+dst[511:384] := INTERLEAVE_DWORDS(a[511:384], b[511:384])
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vunpcklps' form='zmm {k}, zmm, zmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512d" name="_mm512_castpd128_pd512">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Cast</category>
+	<parameter varname="a" type="__m128d"/>
+	<description>Cast vector of type __m128d to type __m512d; the upper 384 bits of the result are undefined. 
+	This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency.</description>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512d" name="_mm512_castpd256_pd512">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Cast</category>
+	<parameter varname="a" type="__m256d"/>
+	<description>Cast vector of type __m256d to type __m512d; the upper 256 bits of the result are undefined. 
+	This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency.</description>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128d" name="_mm512_castpd512_pd128">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Cast</category>
+	<parameter varname="a" type="__m512d"/>
+	<description>Cast vector of type __m512d to type __m128d. 
+	This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency.</description>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128" name="_mm512_castps512_ps128">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Cast</category>
+	<parameter varname="a" type="__m512"/>
+	<description>Cast vector of type __m512 to type __m128. 
+	This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency.</description>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256d" name="_mm512_castpd512_pd256">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Cast</category>
+	<parameter varname="a" type="__m512d"/>
+	<description>Cast vector of type __m512d to type __m256d. 
+	This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency.</description>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" rettype="__m512" name="_mm512_castpd_ps">
+	<type>Floating Point</type>
+	<CPUID>AVX512F/KNCNI</CPUID>
+	<category>Cast</category>
+	<parameter varname="a" type="__m512d"/>
+	<description>Cast vector of type __m512d to type __m512.
+	This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency.</description>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" rettype="__m512i" name="_mm512_castpd_si512">
+	<type>Floating Point</type>
+	<type>Integer</type>
+	<CPUID>AVX512F/KNCNI</CPUID>
+	<category>Cast</category>
+	<parameter varname="a" type="__m512d"/>
+	<description>Cast vector of type __m512d to type __m512i.
+	This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency.</description>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512" name="_mm512_castps128_ps512">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Cast</category>
+	<parameter varname="a" type="__m128"/>
+	<description>Cast vector of type __m128 to type __m512; the upper 384 bits of the result are undefined. 
+	This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency.</description>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512" name="_mm512_castps256_ps512">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Cast</category>
+	<parameter varname="a" type="__m256"/>
+	<description>Cast vector of type __m256 to type __m512; the upper 256 bits of the result are undefined. 
+	This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency.</description>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256" name="_mm512_castps512_ps256">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Cast</category>
+	<parameter varname="a" type="__m512"/>
+	<description>Cast vector of type __m512 to type __m256. 
+	This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency.</description>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" rettype="__m512d" name="_mm512_castps_pd">
+	<type>Floating Point</type>
+	<CPUID>AVX512F/KNCNI</CPUID>
+	<category>Cast</category>
+	<parameter varname="a" type="__m512"/>
+	<description>Cast vector of type __m512 to type __m512d.
+	This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency.</description>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" rettype="__m512i" name="_mm512_castps_si512">
+	<type>Floating Point</type>
+	<type>Integer</type>
+	<CPUID>AVX512F/KNCNI</CPUID>
+	<category>Cast</category>
+	<parameter varname="a" type="__m512"/>
+	<description>Cast vector of type __m512 to type __m512i.
+	This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency.</description>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512i" name="_mm512_castsi128_si512">
+	<type>Integer</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Cast</category>
+	<parameter varname="a" type="__m128i"/>
+	<description>Cast vector of type __m128i to type __m512i; the upper 384 bits of the result are undefined. 
+	This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency.</description>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512i" name="_mm512_castsi256_si512">
+	<type>Integer</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Cast</category>
+	<parameter varname="a" type="__m256i"/>
+	<description>Cast vector of type __m256i to type __m512i; the upper 256 bits of the result are undefined.
+	 This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency.</description>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" rettype="__m512d" name="_mm512_castsi512_pd">
+	<type>Floating Point</type>
+	<CPUID>AVX512F/KNCNI</CPUID>
+	<category>Cast</category>
+	<parameter varname="a" type="__m512i"/>
+	<description>Cast vector of type __m512i to type __m512d.
+	This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency.</description>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" rettype="__m512" name="_mm512_castsi512_ps">
+	<type>Floating Point</type>
+	<CPUID>AVX512F/KNCNI</CPUID>
+	<category>Cast</category>
+	<parameter varname="a" type="__m512i"/>
+	<description>Cast vector of type __m512i to type __m512.
+	This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency.</description>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm512_castsi512_si128">
+	<type>Integer</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Cast</category>
+	<parameter varname="a" type="__m512i"/>
+	<description>Cast vector of type __m512i to type __m128i.
+	 This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency.</description>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256i" name="_mm512_castsi512_si256">
+	<type>Integer</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Cast</category>
+	<parameter varname="a" type="__m512i"/>
+	<description>Cast vector of type __m512i to type __m256i.
+	 This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency.</description>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='AVX-512' rettype='__m512d' name='_mm512_zextpd128_pd512'>
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Cast</category>
+	<parameter varname='a' type='__m128d'/>
+	<description>Casts vector of type __m128d to type __m512d; the upper 384 bits of the result are zeroed. This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency.
+	</description>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='AVX-512' rettype='__m512' name='_mm512_zextps128_ps512'>
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Cast</category>
+	<parameter varname='a' type='__m128'/>
+	<description>Casts vector of type __m128 to type __m512; the upper 384 bits of the result are zeroed. This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency.
+	</description>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='AVX-512' rettype='__m512i' name='_mm512_zextsi128_si512'>
+	<type>Integer</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Cast</category>
+	<parameter varname='a' type='__m128i'/>
+	<description>Casts vector of type __m128i to type __m512i; the upper 384 bits of the result are zeroed. This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency.
+	</description>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='AVX-512' rettype='__m512d' name='_mm512_zextpd256_pd512'>
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Cast</category>
+	<parameter varname='a' type='__m256d'/>
+	<description>Casts vector of type __m256d to type __m512d; the upper 256 bits of the result are zeroed. This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency.
+	</description>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='AVX-512' rettype='__m512' name='_mm512_zextps256_ps512'>
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Cast</category>
+	<parameter varname='a' type='__m256'/>
+	<description>Casts vector of type __m256 to type __m512; the upper 256 bits of the result are zeroed. This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency.
+	</description>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='AVX-512' rettype='__m512i' name='_mm512_zextsi256_si512'>
+	<type>Integer</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Cast</category>
+	<parameter varname='a' type='__m256i'/>
+	<description>Casts vector of type __m256i to type __m512i; the upper 256 bits of the result are zeroed. This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency.
+	</description>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" sequence="true" rettype="int" name="_mm512_mask_reduce_add_epi32">
+	<type>Integer</type>
+	<CPUID>AVX512F/KNCNI</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="a" type="__m512i"/>
+	<description>Reduce the packed 32-bit integers in "a" by addition using mask "k". Returns the sum of all active elements in "a".</description>
+	<operation>
+sum[31:0] := 0
+FOR j := 0 to 15
+	i := j*32
+	IF k[j]
+		sum[31:0] := sum[31:0] + a[i+31:i]
+	FI
+ENDFOR
+RETURN sum[31:0]
+	</operation>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" sequence="true" rettype="__int64" name="_mm512_mask_reduce_add_epi64">
+	<type>Integer</type>
+	<CPUID>AVX512F/KNCNI</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m512i"/>
+	<description>Reduce the packed 64-bit integers in "a" by addition using mask "k". Returns the sum of all active elements in "a".</description>
+	<operation>
+sum[63:0] := 0
+FOR j := 0 to 7
+	i := j*64
+	IF k[j]
+		sum[63:0] := sum[63:0] + a[i+63:i]
+	FI
+ENDFOR
+RETURN sum[63:0]
+	</operation>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" sequence="true" rettype="double" name="_mm512_mask_reduce_add_pd">
+	<type>Floating Point</type>
+	<CPUID>AVX512F/KNCNI</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m512d"/>
+	<description>Reduce the packed double-precision (64-bit) floating-point elements in "a" by addition using mask "k". Returns the sum of all active elements in "a".</description>
+	<operation>
+sum[63:0] := 0
+FOR j := 0 to 7
+	i := j*64
+	IF k[j]
+		sum[63:0] := sum[63:0] + a[i+63:i]
+	FI
+ENDFOR
+RETURN sum[63:0]
+	</operation>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" sequence="true" rettype="float" name="_mm512_mask_reduce_add_ps">
+	<type>Floating Point</type>
+	<CPUID>AVX512F/KNCNI</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="a" type="__m512"/>
+	<description>Reduce the packed single-precision (32-bit) floating-point elements in "a" by addition using mask "k". Returns the sum of all active elements in "a".</description>
+	<operation>
+sum[31:0] := 0
+FOR j := 0 to 15
+	i := j*32
+	IF k[j]
+		sum[31:0] := sum[31:0] + a[i+31:i]
+	FI
+ENDFOR
+RETURN sum[31:0]
+	</operation>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" sequence="true" rettype="int" name="_mm512_mask_reduce_and_epi32">
+	<type>Integer</type>
+	<CPUID>AVX512F/KNCNI</CPUID>
+	<category>Logical</category>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="a" type="__m512i"/>
+	<description>Reduce the packed 32-bit integers in "a" by bitwise AND using mask "k". Returns the bitwise AND of all active elements in "a".</description>
+	<operation>
+reduced[31:0] := 0xFFFFFFFF
+FOR j := 0 to 15
+	i := j*32
+	IF k[j]
+		reduced[31:0] := reduced[31:0] AND a[i+31:i]
+	FI
+ENDFOR
+RETURN reduced[31:0]
+	</operation>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" sequence="true" rettype="__int64" name="_mm512_mask_reduce_and_epi64">
+	<type>Integer</type>
+	<CPUID>AVX512F/KNCNI</CPUID>
+	<category>Logical</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m512i"/>
+	<description>Reduce the packed 64-bit integers in "a" by bitwise AND using mask "k". Returns the bitwise AND of all active elements in "a".</description>
+	<operation>
+reduced[63:0] := 0xFFFFFFFFFFFFFFFF
+FOR j := 0 to 7
+	i := j*64
+	IF k[j]
+		reduced[63:0] := reduced[63:0] AND a[i+63:i]
+	FI
+ENDFOR
+RETURN reduced[63:0]
+	</operation>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" sequence="true" rettype="int" name="_mm512_mask_reduce_max_epi32">
+	<type>Integer</type>
+	<CPUID>AVX512F/KNCNI</CPUID>
+	<category>Special Math Functions</category>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="a" type="__m512i"/>
+	<description>Reduce the packed 32-bit integers in "a" by maximum using mask "k". Returns the maximum of all active elements in "a".</description>
+	<operation>
+max[31:0] := MIN_INT
+FOR j := 0 to 15
+	i := j*32
+	IF k[j]
+		max[31:0] := MAXIMUM(max[31:0], a[i+31:i])
+	FI
+ENDFOR
+RETURN max[31:0]
+	</operation>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" sequence="true" rettype="__int64" name="_mm512_mask_reduce_max_epi64">
+	<type>Integer</type>
+	<CPUID>AVX512F/KNCNI</CPUID>
+	<category>Special Math Functions</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m512i"/>
+	<description>Reduce the packed 64-bit integers in "a" by maximum using mask "k". Returns the maximum of all active elements in "a".</description>
+	<operation>
+max[63:0] := MIN_INT
+FOR j := 0 to 7
+	i := j*64
+	IF k[j]
+		max[63:0] := MAXIMUM(max[63:0], a[i+63:i])
+	FI
+ENDFOR
+RETURN max[63:0]
+	</operation>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" sequence="true" rettype="unsigned int" name="_mm512_mask_reduce_max_epu32">
+	<type>Integer</type>
+	<CPUID>AVX512F/KNCNI</CPUID>
+	<category>Special Math Functions</category>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="a" type="__m512i"/>
+	<description>Reduce the packed unsigned 32-bit integers in "a" by maximum using mask "k". Returns the maximum of all active elements in "a".</description>
+	<operation>
+max[31:0] := 0
+FOR j := 0 to 15
+	i := j*32
+	IF k[j]
+		max[31:0] := MAXIMUM(max[31:0], a[i+31:i])
+	FI
+ENDFOR
+RETURN max[31:0]
+	</operation>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" sequence="true" rettype="unsigned __int64" name="_mm512_mask_reduce_max_epu64">
+	<type>Integer</type>
+	<CPUID>AVX512F/KNCNI</CPUID>
+	<category>Special Math Functions</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m512i"/>
+	<description>Reduce the packed unsigned 64-bit integers in "a" by maximum using mask "k". Returns the maximum of all active elements in "a".</description>
+	<operation>
+max[63:0] := 0
+FOR j := 0 to 7
+	i := j*64
+	IF k[j]
+		max[63:0] := MAXIMUM(max[63:0], a[i+63:i])
+	FI
+ENDFOR
+RETURN max[63:0]
+	</operation>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" sequence="true" rettype="double" name="_mm512_mask_reduce_max_pd">
+	<type>Floating Point</type>
+	<CPUID>AVX512F/KNCNI</CPUID>
+	<category>Special Math Functions</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m512d"/>
+	<description>Reduce the packed double-precision (64-bit) floating-point elements in "a" by maximum using mask "k". Returns the maximum of all active elements in "a".</description>
+	<operation>
+max[63:0] := MIN_DOUBLE
+FOR j := 0 to 7
+	i := j*64
+	IF k[j]
+		max[63:0] := MAXIMUM(max[63:0], a[i+63:i])
+	FI
+ENDFOR
+RETURN max[63:0]
+	</operation>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" sequence="true" rettype="float" name="_mm512_mask_reduce_max_ps">
+	<type>Floating Point</type>
+	<CPUID>AVX512F/KNCNI</CPUID>
+	<category>Special Math Functions</category>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="a" type="__m512"/>
+	<description>Reduce the packed single-precision (32-bit) floating-point elements in "a" by maximum using mask "k". Returns the maximum of all active elements in "a".</description>
+	<operation>
+max[31:0] := MIN_FLOAT
+FOR j := 0 to 15
+	i := j*32
+	IF k[j]
+		max[31:0] := MAXIMUM(max[31:0], a[i+31:i])
+	FI
+ENDFOR
+RETURN max[31:0]
+	</operation>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" sequence="true" rettype="int" name="_mm512_mask_reduce_min_epi32">
+	<type>Integer</type>
+	<CPUID>AVX512F/KNCNI</CPUID>
+	<category>Special Math Functions</category>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="a" type="__m512i"/>
+	<description>Reduce the packed 32-bit integers in "a" by minimum using mask "k". Returns the minimum of all active elements in "a".</description>
+	<operation>
+min[31:0] := MAX_INT
+FOR j := 0 to 15
+	i := j*32
+	IF k[j]
+		min[31:0] := MINIMUM(min[31:0], a[i+31:i])
+	FI
+ENDFOR
+RETURN min[31:0]
+	</operation>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" sequence="true" rettype="__int64" name="_mm512_mask_reduce_min_epi64">
+	<type>Integer</type>
+	<CPUID>AVX512F/KNCNI</CPUID>
+	<category>Special Math Functions</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m512i"/>
+	<description>Reduce the packed 64-bit integers in "a" by minimum using mask "k". Returns the minimum of all active elements in "a".</description>
+	<operation>
+min[63:0] := MAX_INT
+FOR j := 0 to 7
+	i := j*64
+	IF k[j]
+		min[63:0] := MINIMUM(min[63:0], a[i+63:i])
+	FI
+ENDFOR
+RETURN min[63:0]
+	</operation>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" sequence="true" rettype="unsigned int" name="_mm512_mask_reduce_min_epu32">
+	<type>Integer</type>
+	<CPUID>AVX512F/KNCNI</CPUID>
+	<category>Special Math Functions</category>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="a" type="__m512i"/>
+	<description>Reduce the packed unsigned 32-bit integers in "a" by minimum using mask "k". Returns the minimum of all active elements in "a".</description>
+	<operation>
+min[31:0] := MAX_UINT
+FOR j := 0 to 15
+	i := j*32
+	IF k[j]
+		min[31:0] := MINIMUM(min[31:0], a[i+31:i])
+	FI
+ENDFOR
+RETURN min[31:0]
+	</operation>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" sequence="true" rettype="unsigned __int64" name="_mm512_mask_reduce_min_epu64">
+	<type>Integer</type>
+	<CPUID>AVX512F/KNCNI</CPUID>
+	<category>Special Math Functions</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m512i"/>
+	<description>Reduce the packed unsigned 64-bit integers in "a" by minimum using mask "k". Returns the minimum of all active elements in "a".</description>
+	<operation>
+min[63:0] := MAX_UINT
+FOR j := 0 to 7
+	i := j*64
+	IF k[j]
+		min[63:0] := MINIMUM(min[63:0], a[i+63:i])
+	FI
+ENDFOR
+RETURN min[63:0]
+	</operation>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" sequence="true" rettype="double" name="_mm512_mask_reduce_min_pd">
+	<type>Floating Point</type>
+	<CPUID>AVX512F/KNCNI</CPUID>
+	<category>Special Math Functions</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m512d"/>
+	<description>Reduce the packed double-precision (64-bit) floating-point elements in "a" by minimum using mask "k". Returns the minimum of all active elements in "a".</description>
+	<operation>
+min[63:0] := MAX_DOUBLE
+FOR j := 0 to 7
+	i := j*64
+	IF k[j]
+		min[63:0] := MINIMUM(min[63:0], a[i+63:i])
+	FI
+ENDFOR
+RETURN min[63:0]
+	</operation>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" sequence="true" rettype="float" name="_mm512_mask_reduce_min_ps">
+	<type>Floating Point</type>
+	<CPUID>AVX512F/KNCNI</CPUID>
+	<category>Special Math Functions</category>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="a" type="__m512"/>
+	<description>Reduce the packed single-precision (32-bit) floating-point elements in "a" by minimum using mask "k". Returns the minimum of all active elements in "a".</description>
+	<operation>
+min[31:0] := MAX_FLOAT
+FOR j := 0 to 15
+	i := j*32
+	IF k[j]
+		min[31:0] := MINIMUM(min[31:0], a[i+31:i])
+	FI
+ENDFOR
+RETURN min[31:0]
+	</operation>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" sequence="true" rettype="int" name="_mm512_mask_reduce_mul_epi32">
+	<type>Integer</type>
+	<CPUID>AVX512F/KNCNI</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="a" type="__m512i"/>
+	<description>Reduce the packed 32-bit integers in "a" by multiplication using mask "k". Returns the product of all active elements in "a".</description>
+	<operation>
+prod[31:0] := 1
+FOR j := 0 to 15
+	i := j*32
+	IF k[j]
+		prod[31:0] := prod[31:0] * a[i+31:i]
+	FI
+ENDFOR
+RETURN prod[31:0]
+	</operation>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" sequence="true" rettype="__int64" name="_mm512_mask_reduce_mul_epi64">
+	<type>Integer</type>
+	<CPUID>AVX512F/KNCNI</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m512i"/>
+	<description>Reduce the packed 64-bit integers in "a" by multiplication using mask "k". Returns the product of all active elements in "a".</description>
+	<operation>
+prod[63:0] := 1
+FOR j := 0 to 7
+	i := j*64
+	IF k[j]
+		prod[63:0] := prod[63:0] * a[i+63:i]
+	FI
+ENDFOR
+RETURN prod[63:0]
+	</operation>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" sequence="true" rettype="double" name="_mm512_mask_reduce_mul_pd">
+	<type>Floating Point</type>
+	<CPUID>AVX512F/KNCNI</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m512d"/>
+	<description>Reduce the packed double-precision (64-bit) floating-point elements in "a" by multiplication using mask "k". Returns the product of all active elements in "a".</description>
+	<operation>
+prod[63:0] := 1
+FOR j := 0 to 7
+	i := j*64
+	IF k[j]
+		prod[63:0] := prod[63:0] * a[i+63:i]
+	FI
+ENDFOR
+RETURN prod[63:0]
+	</operation>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" sequence="true" rettype="float" name="_mm512_mask_reduce_mul_ps">
+	<type>Floating Point</type>
+	<CPUID>AVX512F/KNCNI</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="a" type="__m512"/>
+	<description>Reduce the packed single-precision (32-bit) floating-point elements in "a" by multiplication using mask "k". Returns the product of all active elements in "a".</description>
+	<operation>
+prod[31:0] := 1
+FOR j := 0 to 15
+	i := j*32
+	IF k[j]
+		prod[31:0] := prod[31:0] * a[i+31:i]
+	FI
+ENDFOR
+RETURN prod[31:0]
+	</operation>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" sequence="true" rettype="int" name="_mm512_mask_reduce_or_epi32">
+	<type>Integer</type>
+	<CPUID>AVX512F/KNCNI</CPUID>
+	<category>Logical</category>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="a" type="__m512i"/>
+	<description>Reduce the packed 32-bit integers in "a" by bitwise OR using mask "k". Returns the bitwise OR of all active elements in "a".</description>
+	<operation>
+reduced[31:0] := 0
+FOR j := 0 to 15
+	i := j*32
+	IF k[j]
+		reduced[31:0] := reduced[31:0] OR a[i+31:i]
+	FI
+ENDFOR
+RETURN reduced[31:0]
+	</operation>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" sequence="true" rettype="__int64" name="_mm512_mask_reduce_or_epi64">
+	<type>Integer</type>
+	<CPUID>AVX512F/KNCNI</CPUID>
+	<category>Logical</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m512i"/>
+	<description>Reduce the packed 64-bit integers in "a" by bitwise OR using mask "k". Returns the bitwise OR of all active elements in "a".</description>
+	<operation>
+reduced[63:0] := 0
+FOR j := 0 to 7
+	i := j*64
+	IF k[j]
+		reduced[63:0] := reduced[63:0] OR a[i+63:i]
+	FI
+ENDFOR
+RETURN reduced[63:0]
+	</operation>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" sequence="true" rettype="int" name="_mm512_reduce_add_epi32">
+	<type>Integer</type>
+	<CPUID>AVX512F/KNCNI</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="a" type="__m512i"/>
+	<description>Reduce the packed 32-bit integers in "a" by addition. Returns the sum of all elements in "a".</description>
+	<operation>
+sum[31:0] := 0
+FOR j := 0 to 15
+	i := j*32
+	sum[31:0] := sum[31:0] + a[i+31:i]
+ENDFOR
+RETURN sum[31:0]
+	</operation>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" sequence="true" rettype="__int64" name="_mm512_reduce_add_epi64">
+	<type>Integer</type>
+	<CPUID>AVX512F/KNCNI</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="a" type="__m512i"/>
+	<description>Reduce the packed 64-bit integers in "a" by addition. Returns the sum of all elements in "a".</description>
+	<operation>
+sum[63:0] := 0
+FOR j := 0 to 7
+	i := j*64
+	sum[63:0] := sum[63:0] + a[i+63:i]
+ENDFOR
+RETURN sum[63:0]
+	</operation>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" sequence="true" rettype="double" name="_mm512_reduce_add_pd">
+	<type>Floating Point</type>
+	<CPUID>AVX512F/KNCNI</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="a" type="__m512d"/>
+	<description>Reduce the packed double-precision (64-bit) floating-point elements in "a" by addition. Returns the sum of all elements in "a".</description>
+	<operation>
+sum[63:0] := 0
+FOR j := 0 to 7
+	i := j*64
+	sum[63:0] := sum[63:0] + a[i+63:i]
+ENDFOR
+RETURN sum[63:0]
+	</operation>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" sequence="true" rettype="float" name="_mm512_reduce_add_ps">
+	<type>Floating Point</type>
+	<CPUID>AVX512F/KNCNI</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="a" type="__m512"/>
+	<description>Reduce the packed single-precision (32-bit) floating-point elements in "a" by addition. Returns the sum of all elements in "a".</description>
+	<operation>
+sum[31:0] := 0
+FOR j := 0 to 15
+	i := j*32
+	sum[31:0] := sum[31:0] + a[i+31:i]
+ENDFOR
+RETURN sum[31:0]
+	</operation>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" sequence="true" rettype="int" name="_mm512_reduce_and_epi32">
+	<type>Integer</type>
+	<CPUID>AVX512F/KNCNI</CPUID>
+	<category>Logical</category>
+	<parameter varname="a" type="__m512i"/>
+	<description>Reduce the packed 32-bit integers in "a" by bitwise AND. Returns the bitwise AND of all elements in "a".</description>
+	<operation>
+reduced[31:0] := 0xFFFFFFFF
+FOR j := 0 to 15
+	i := j*32
+	reduced[31:0] := reduced[31:0] AND a[i+31:i]
+ENDFOR
+RETURN reduced[31:0]
+	</operation>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" sequence="true" rettype="__int64" name="_mm512_reduce_and_epi64">
+	<type>Integer</type>
+	<CPUID>AVX512F/KNCNI</CPUID>
+	<category>Logical</category>
+	<parameter varname="a" type="__m512i"/>
+	<description>Reduce the packed 64-bit integers in "a" by bitwise AND. Returns the bitwise AND of all elements in "a".</description>
+	<operation>
+reduced[63:0] := 0xFFFFFFFFFFFFFFFF
+FOR j := 0 to 7
+	i := j*64
+	reduced[63:0] := reduced[63:0] AND a[i+63:i]
+ENDFOR
+RETURN reduced[63:0]
+	</operation>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" sequence="true" rettype="int" name="_mm512_reduce_max_epi32">
+	<type>Integer</type>
+	<CPUID>AVX512F/KNCNI</CPUID>
+	<category>Special Math Functions</category>
+	<parameter varname="a" type="__m512i"/>
+	<description>Reduce the packed 32-bit integers in "a" by maximum. Returns the maximum of all elements in "a".</description>
+	<operation>
+max[31:0] := MIN_INT
+FOR j := 0 to 15
+	i := j*32
+	max[31:0] := MAXIMUM(max[31:0], a[i+31:i])
+ENDFOR
+RETURN max[31:0]
+	</operation>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" sequence="true" rettype="__int64" name="_mm512_reduce_max_epi64">
+	<type>Integer</type>
+	<CPUID>AVX512F/KNCNI</CPUID>
+	<category>Special Math Functions</category>
+	<parameter varname="a" type="__m512i"/>
+	<description>Reduce the packed 64-bit integers in "a" by maximum. Returns the maximum of all elements in "a".</description>
+	<operation>
+max[63:0] := MIN_INT
+FOR j := 0 to 7
+	i := j*64
+	max[63:0] := MAXIMUM(max[63:0], a[i+63:i])
+ENDFOR
+RETURN max[63:0]
+	</operation>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" sequence="true" rettype="unsigned int" name="_mm512_reduce_max_epu32">
+	<type>Integer</type>
+	<CPUID>AVX512F/KNCNI</CPUID>
+	<category>Special Math Functions</category>
+	<parameter varname="a" type="__m512i"/>
+	<description>Reduce the packed unsigned 32-bit integers in "a" by maximum. Returns the maximum of all elements in "a".</description>
+	<operation>
+max[31:0] := 0
+FOR j := 0 to 15
+	i := j*32
+	max[31:0] := MAXIMUM(max[31:0], a[i+31:i])
+ENDFOR
+RETURN max[31:0]
+	</operation>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" sequence="true" rettype="unsigned __int64" name="_mm512_reduce_max_epu64">
+	<type>Integer</type>
+	<CPUID>AVX512F/KNCNI</CPUID>
+	<category>Special Math Functions</category>
+	<parameter varname="a" type="__m512i"/>
+	<description>Reduce the packed unsigned 64-bit integers in "a" by maximum. Returns the maximum of all elements in "a".</description>
+	<operation>
+max[63:0] := 0
+FOR j := 0 to 7
+	i := j*64
+	max[63:0] := MAXIMUM(max[63:0], a[i+63:i])
+ENDFOR
+RETURN max[63:0]
+	</operation>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" sequence="true" rettype="double" name="_mm512_reduce_max_pd">
+	<type>Floating Point</type>
+	<CPUID>AVX512F/KNCNI</CPUID>
+	<category>Special Math Functions</category>
+	<parameter varname="a" type="__m512d"/>
+	<description>Reduce the packed double-precision (64-bit) floating-point elements in "a" by maximum. Returns the maximum of all elements in "a".</description>
+	<operation>
+max[63:0] := MIN_DOUBLE
+FOR j := 0 to 7
+	i := j*64
+	max[63:0] := MAXIMUM(max[63:0], a[i+63:i])
+ENDFOR
+RETURN max[63:0]
+	</operation>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" sequence="true" rettype="float" name="_mm512_reduce_max_ps">
+	<type>Floating Point</type>
+	<CPUID>AVX512F/KNCNI</CPUID>
+	<category>Special Math Functions</category>
+	<parameter varname="a" type="__m512"/>
+	<description>Reduce the packed single-precision (32-bit) floating-point elements in "a" by maximum. Returns the maximum of all elements in "a".</description>
+	<operation>
+max[31:0] := MIN_FLOAT
+FOR j := 0 to 15
+	i := j*32
+	max[31:0] := MAXIMUM(max[31:0], a[i+31:i])
+ENDFOR
+RETURN max[31:0]
+	</operation>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" sequence="true" rettype="int" name="_mm512_reduce_min_epi32">
+	<type>Integer</type>
+	<CPUID>AVX512F/KNCNI</CPUID>
+	<category>Special Math Functions</category>
+	<parameter varname="a" type="__m512i"/>
+	<description>Reduce the packed 32-bit integers in "a" by minimum. Returns the minimum of all elements in "a".</description>
+	<operation>
+min[31:0] := MAX_INT
+FOR j := 0 to 15
+	i := j*32
+	min[31:0] := MINIMUM(min[31:0], a[i+31:i])
+ENDFOR
+RETURN min[31:0]
+	</operation>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" sequence="true" rettype="__int64" name="_mm512_reduce_min_epi64">
+	<type>Integer</type>
+	<CPUID>AVX512F/KNCNI</CPUID>
+	<category>Special Math Functions</category>
+	<parameter varname="a" type="__m512i"/>
+	<description>Reduce the packed 64-bit integers in "a" by minimum. Returns the minimum of all elements in "a".</description>
+	<operation>
+min[63:0] := MAX_INT
+FOR j := 0 to 7
+	i := j*64
+	min[63:0] := MINIMUM(min[63:0], a[i+63:i])
+ENDFOR
+RETURN min[63:0]
+	</operation>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" sequence="true" rettype="unsigned int" name="_mm512_reduce_min_epu32">
+	<type>Integer</type>
+	<CPUID>AVX512F/KNCNI</CPUID>
+	<category>Special Math Functions</category>
+	<parameter varname="a" type="__m512i"/>
+	<description>Reduce the packed unsigned 32-bit integers in "a" by minimum. Returns the minimum of all elements in "a".</description>
+	<operation>
+min[31:0] := MAX_UINT
+FOR j := 0 to 15
+	i := j*32
+	min[31:0] := MINIMUM(min[31:0], a[i+31:i])
+ENDFOR
+RETURN min[31:0]
+	</operation>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" sequence="true" rettype="unsigned __int64" name="_mm512_reduce_min_epu64">
+	<type>Integer</type>
+	<CPUID>AVX512F/KNCNI</CPUID>
+	<category>Special Math Functions</category>
+	<parameter varname="a" type="__m512i"/>
+	<description>Reduce the packed unsigned 64-bit integers in "a" by minimum. Returns the minimum of all elements in "a".</description>
+	<operation>
+min[63:0] := MAX_UINT
+FOR j := 0 to 7
+	i := j*64
+	min[63:0] := MINIMUM(min[63:0], a[i+63:i])
+ENDFOR
+RETURN min[63:0]
+	</operation>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" sequence="true" rettype="double" name="_mm512_reduce_min_pd">
+	<type>Floating Point</type>
+	<CPUID>AVX512F/KNCNI</CPUID>
+	<category>Special Math Functions</category>
+	<parameter varname="a" type="__m512d"/>
+	<description>Reduce the packed double-precision (64-bit) floating-point elements in "a" by minimum. Returns the minimum of all elements in "a".</description>
+	<operation>
+min[63:0] := MAX_DOUBLE
+FOR j := 0 to 7
+	i := j*64
+	min[63:0] := MINIMUM(min[63:0], a[i+63:i])
+ENDFOR
+RETURN min[63:0]
+	</operation>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" sequence="true" rettype="float" name="_mm512_reduce_min_ps">
+	<type>Floating Point</type>
+	<CPUID>AVX512F/KNCNI</CPUID>
+	<category>Special Math Functions</category>
+	<parameter varname="a" type="__m512"/>
+	<description>Reduce the packed single-precision (32-bit) floating-point elements in "a" by minimum. Returns the minimum of all elements in "a".</description>
+	<operation>
+min[31:0] := MAX_FLOAT
+FOR j := 0 to 15
+	i := j*32
+	min[31:0] := MINIMUM(min[31:0], a[i+31:i])
+ENDFOR
+RETURN min[31:0]
+	</operation>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" sequence="true" rettype="int" name="_mm512_reduce_mul_epi32">
+	<type>Integer</type>
+	<CPUID>AVX512F/KNCNI</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="a" type="__m512i"/>
+	<description>Reduce the packed 32-bit integers in "a" by multiplication. Returns the product of all elements in "a".</description>
+	<operation>
+prod[31:0] := 1
+FOR j := 0 to 15
+	i := j*32
+	prod[31:0] := prod[31:0] * a[i+31:i]
+ENDFOR
+RETURN prod[31:0]
+	</operation>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" sequence="true" rettype="__int64" name="_mm512_reduce_mul_epi64">
+	<type>Integer</type>
+	<CPUID>AVX512F/KNCNI</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="a" type="__m512i"/>
+	<description>Reduce the packed 64-bit integers in "a" by multiplication. Returns the product of all elements in "a".</description>
+	<operation>
+prod[63:0] := 1
+FOR j := 0 to 7
+	i := j*64
+	prod[63:0] := prod[63:0] * a[i+63:i]
+ENDFOR
+RETURN prod[63:0]
+	</operation>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" sequence="true" rettype="double" name="_mm512_reduce_mul_pd">
+	<type>Floating Point</type>
+	<CPUID>AVX512F/KNCNI</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="a" type="__m512d"/>
+	<description>Reduce the packed double-precision (64-bit) floating-point elements in "a" by multiplication. Returns the product of all elements in "a".</description>
+	<operation>
+prod[63:0] := 1
+FOR j := 0 to 7
+	i := j*64
+	prod[63:0] := prod[63:0] * a[i+63:i]
+ENDFOR
+RETURN prod[63:0]
+	</operation>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" sequence="true" rettype="float" name="_mm512_reduce_mul_ps">
+	<type>Floating Point</type>
+	<CPUID>AVX512F/KNCNI</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="a" type="__m512"/>
+	<description>Reduce the packed single-precision (32-bit) floating-point elements in "a" by multiplication. Returns the product of all elements in "a".</description>
+	<operation>
+prod[31:0] := 1
+FOR j := 0 to 15
+	i := j*32
+	prod[31:0] := prod[31:0] * a[i+31:i]
+ENDFOR
+RETURN prod[31:0]
+	</operation>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" sequence="true" rettype="int" name="_mm512_reduce_or_epi32">
+	<type>Integer</type>
+	<CPUID>AVX512F/KNCNI</CPUID>
+	<category>Logical</category>
+	<parameter varname="a" type="__m512i"/>
+	<description>Reduce the packed 32-bit integers in "a" by bitwise OR. Returns the bitwise OR of all elements in "a".</description>
+	<operation>
+reduced[31:0] := 0
+FOR j := 0 to 15
+	i := j*32
+	reduced[31:0] := reduced[31:0] OR a[i+31:i]
+ENDFOR
+RETURN reduced[31:0]
+	</operation>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" sequence="true" rettype="__int64" name="_mm512_reduce_or_epi64">
+	<type>Integer</type>
+	<CPUID>AVX512F/KNCNI</CPUID>
+	<category>Logical</category>
+	<parameter varname="a" type="__m512i"/>
+	<description>Reduce the packed 64-bit integers in "a" by bitwise OR. Returns the bitwise OR of all elements in "a".</description>
+	<operation>
+reduced[63:0] := 0
+FOR j := 0 to 7
+	i := j*64
+	reduced[63:0] := reduced[63:0] OR a[i+63:i]
+ENDFOR
+RETURN reduced[63:0]
+	</operation>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" sequence="true" rettype="__m512d" name="_mm512_set1_pd">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Set</category>
+	<parameter varname="a" type="double"/>
+	<description>Broadcast double-precision (64-bit) floating-point value "a" to all elements of "dst".</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*64
+	dst[i+63:i] := a[63:0]
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" sequence="true" rettype="__m512" name="_mm512_set1_ps">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Set</category>
+	<parameter varname="a" type="float"/>
+	<description>Broadcast single-precision (32-bit) floating-point value "a" to all elements of "dst".</description>
+	<operation>
+FOR j := 0 to 15
+	i := j*32
+	dst[i+31:i] := a[31:0]
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" sequence="true" rettype="__m512i" name="_mm512_set4_epi32">
+	<type>Integer</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Set</category>
+	<parameter varname="d" type="int"/>
+	<parameter varname="c" type="int"/>
+	<parameter varname="b" type="int"/>
+	<parameter varname="a" type="int"/>
+	<description>Set packed 32-bit integers in "dst" with the repeated 4 element sequence.</description>
+	<operation>
+dst[31:0] := d
+dst[63:32] := c
+dst[95:64] := b
+dst[127:96] := a
+dst[159:128] := d
+dst[191:160] := c
+dst[223:192] := b
+dst[255:224] := a
+dst[287:256] := d
+dst[319:288] := c
+dst[351:320] := b
+dst[383:352] := a
+dst[415:384] := d
+dst[447:416] := c
+dst[479:448] := b
+dst[511:480] := a
+dst[MAX:512] := 0
+	</operation>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" sequence="true" rettype="__m512i" name="_mm512_set4_epi64">
+	<type>Integer</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Set</category>
+	<parameter varname="d" type="__int64"/>
+	<parameter varname="c" type="__int64"/>
+	<parameter varname="b" type="__int64"/>
+	<parameter varname="a" type="__int64"/>
+	<description>Set packed 64-bit integers in "dst" with the repeated 4 element sequence.</description>
+	<operation>
+dst[63:0] := d
+dst[127:64] := c
+dst[191:128] := b
+dst[255:192] := a
+dst[319:256] := d
+dst[383:320] := c
+dst[447:384] := b
+dst[511:448] := a
+dst[MAX:512] := 0
+	</operation>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" sequence="true" rettype="__m512d" name="_mm512_set4_pd">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Set</category>
+	<parameter varname="d" type="double"/>
+	<parameter varname="c" type="double"/>
+	<parameter varname="b" type="double"/>
+	<parameter varname="a" type="double"/>
+	<description>Set packed double-precision (64-bit) floating-point elements in "dst" with the repeated 4 element sequence.</description>
+	<operation>
+dst[63:0] := d
+dst[127:64] := c
+dst[191:128] := b
+dst[255:192] := a
+dst[319:256] := d
+dst[383:320] := c
+dst[447:384] := b
+dst[511:448] := a
+dst[MAX:512] := 0
+	</operation>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" sequence="true" rettype="__m512" name="_mm512_set4_ps">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Set</category>
+	<parameter varname="d" type="float"/>
+	<parameter varname="c" type="float"/>
+	<parameter varname="b" type="float"/>
+	<parameter varname="a" type="float"/>
+	<description>Set packed single-precision (32-bit) floating-point elements in "dst" with the repeated 4 element sequence.</description>
+	<operation>
+dst[31:0] := d
+dst[63:32] := c
+dst[95:64] := b
+dst[127:96] := a
+dst[159:128] := d
+dst[191:160] := c
+dst[223:192] := b
+dst[255:224] := a
+dst[287:256] := d
+dst[319:288] := c
+dst[351:320] := b
+dst[383:352] := a
+dst[415:384] := d
+dst[447:416] := c
+dst[479:448] := b
+dst[511:480] := a
+dst[MAX:512] := 0
+	</operation>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" sequence="true" rettype="__m512i" name="_mm512_set_epi8">
+	<type>Integer</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Set</category>
+	<parameter varname='e63' type='char'/>
+	<parameter varname='e62' type='char'/>
+	<parameter varname='e61' type='char'/>
+	<parameter varname='e60' type='char'/>
+	<parameter varname='e59' type='char'/>
+	<parameter varname='e58' type='char'/>
+	<parameter varname='e57' type='char'/>
+	<parameter varname='e56' type='char'/>
+	<parameter varname='e55' type='char'/>
+	<parameter varname='e54' type='char'/>
+	<parameter varname='e53' type='char'/>
+	<parameter varname='e52' type='char'/>
+	<parameter varname='e51' type='char'/>
+	<parameter varname='e50' type='char'/>
+	<parameter varname='e49' type='char'/>
+	<parameter varname='e48' type='char'/>
+	<parameter varname='e47' type='char'/>
+	<parameter varname='e46' type='char'/>
+	<parameter varname='e45' type='char'/>
+	<parameter varname='e44' type='char'/>
+	<parameter varname='e43' type='char'/>
+	<parameter varname='e42' type='char'/>
+	<parameter varname='e41' type='char'/>
+	<parameter varname='e40' type='char'/>
+	<parameter varname='e39' type='char'/>
+	<parameter varname='e38' type='char'/>
+	<parameter varname='e37' type='char'/>
+	<parameter varname='e36' type='char'/>
+	<parameter varname='e35' type='char'/>
+	<parameter varname='e34' type='char'/>
+	<parameter varname='e33' type='char'/>
+	<parameter varname='e32' type='char'/>
+	<parameter varname='e31' type='char'/>
+	<parameter varname='e30' type='char'/>
+	<parameter varname='e29' type='char'/>
+	<parameter varname='e28' type='char'/>
+	<parameter varname='e27' type='char'/>
+	<parameter varname='e26' type='char'/>
+	<parameter varname='e25' type='char'/>
+	<parameter varname='e24' type='char'/>
+	<parameter varname='e23' type='char'/>
+	<parameter varname='e22' type='char'/>
+	<parameter varname='e21' type='char'/>
+	<parameter varname='e20' type='char'/>
+	<parameter varname='e19' type='char'/>
+	<parameter varname='e18' type='char'/>
+	<parameter varname='e17' type='char'/>
+	<parameter varname='e16' type='char'/>
+	<parameter varname='e15' type='char'/>
+	<parameter varname='e14' type='char'/>
+	<parameter varname='e13' type='char'/>
+	<parameter varname='e12' type='char'/>
+	<parameter varname='e11' type='char'/>
+	<parameter varname='e10' type='char'/>
+	<parameter varname='e9' type='char'/>
+	<parameter varname='e8' type='char'/>
+	<parameter varname='e7' type='char'/>
+	<parameter varname='e6' type='char'/>
+	<parameter varname='e5' type='char'/>
+	<parameter varname='e4' type='char'/>
+	<parameter varname='e3' type='char'/>
+	<parameter varname='e2' type='char'/>
+	<parameter varname='e1' type='char'/>
+	<parameter varname='e0' type='char'/>
+	<description>Set packed 8-bit integers in "dst" with the supplied values.</description>
+	<operation>
+dst[7:0] := e0
+dst[15:8] := e1
+dst[23:16] := e2
+dst[31:24] := e3
+dst[39:32] := e4
+dst[47:40] := e5
+dst[55:48] := e6
+dst[63:56] := e7
+dst[71:64] := e8
+dst[79:72] := e9
+dst[87:80] := e10
+dst[95:88] := e11
+dst[103:96] := e12
+dst[111:104] := e13
+dst[119:112] := e14
+dst[127:120] := e15
+dst[135:128] := e16
+dst[143:136] := e17
+dst[151:144] := e18
+dst[159:152] := e19
+dst[167:160] := e20
+dst[175:168] := e21
+dst[183:176] := e22
+dst[191:184] := e23
+dst[199:192] := e24
+dst[207:200] := e25
+dst[215:208] := e26
+dst[223:216] := e27
+dst[231:224] := e28
+dst[239:232] := e29
+dst[247:240] := e30
+dst[255:248] := e31
+dst[263:256] := e32
+dst[271:264] := e33
+dst[279:272] := e34
+dst[287:280] := e35
+dst[295:288] := e36
+dst[303:296] := e37
+dst[311:304] := e38
+dst[319:312] := e39
+dst[327:320] := e40
+dst[335:328] := e41
+dst[343:336] := e42
+dst[351:344] := e43
+dst[359:352] := e44
+dst[367:360] := e45
+dst[375:368] := e46
+dst[383:376] := e47
+dst[391:384] := e48
+dst[399:392] := e49
+dst[407:400] := e50
+dst[415:408] := e51
+dst[423:416] := e52
+dst[431:424] := e53
+dst[439:432] := e54
+dst[447:440] := e55
+dst[455:448] := e56
+dst[463:456] := e57
+dst[471:464] := e58
+dst[479:472] := e59
+dst[487:480] := e60
+dst[495:488] := e61
+dst[503:496] := e62
+dst[511:504] := e63
+dst[MAX:512] := 0
+	</operation>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" sequence="true" rettype="__m512i" name="_mm512_set_epi16">
+	<type>Integer</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Set</category>
+	<parameter varname='e31' type='short'/>
+	<parameter varname='e30' type='short'/>
+	<parameter varname='e29' type='short'/>
+	<parameter varname='e28' type='short'/>
+	<parameter varname='e27' type='short'/>
+	<parameter varname='e26' type='short'/>
+	<parameter varname='e25' type='short'/>
+	<parameter varname='e24' type='short'/>
+	<parameter varname='e23' type='short'/>
+	<parameter varname='e22' type='short'/>
+	<parameter varname='e21' type='short'/>
+	<parameter varname='e20' type='short'/>
+	<parameter varname='e19' type='short'/>
+	<parameter varname='e18' type='short'/>
+	<parameter varname='e17' type='short'/>
+	<parameter varname='e16' type='short'/>
+	<parameter varname='e15' type='short'/>
+	<parameter varname='e14' type='short'/>
+	<parameter varname='e13' type='short'/>
+	<parameter varname='e12' type='short'/>
+	<parameter varname='e11' type='short'/>
+	<parameter varname='e10' type='short'/>
+	<parameter varname='e9' type='short'/>
+	<parameter varname='e8' type='short'/>
+	<parameter varname='e7' type='short'/>
+	<parameter varname='e6' type='short'/>
+	<parameter varname='e5' type='short'/>
+	<parameter varname='e4' type='short'/>
+	<parameter varname='e3' type='short'/>
+	<parameter varname='e2' type='short'/>
+	<parameter varname='e1' type='short'/>
+	<parameter varname='e0' type='short'/>
+	<description>Set packed 16-bit integers in "dst" with the supplied values.</description>
+	<operation>
+dst[15:0] := e0
+dst[31:16] := e1
+dst[47:32] := e2
+dst[63:48] := e3
+dst[79:64] := e4
+dst[95:80] := e5
+dst[111:96] := e6
+dst[127:112] := e7
+dst[143:128] := e8
+dst[159:144] := e9
+dst[175:160] := e10
+dst[191:176] := e11
+dst[207:192] := e12
+dst[223:208] := e13
+dst[239:224] := e14
+dst[255:240] := e15
+dst[271:256] := e16
+dst[287:272] := e17
+dst[303:288] := e18
+dst[319:304] := e19
+dst[335:320] := e20
+dst[351:336] := e21
+dst[367:352] := e22
+dst[383:368] := e23
+dst[399:384] := e24
+dst[415:400] := e25
+dst[431:416] := e26
+dst[447:432] := e27
+dst[463:448] := e28
+dst[479:464] := e29
+dst[495:480] := e30
+dst[511:496] := e31
+dst[MAX:512] := 0
+	</operation>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" sequence="true" rettype="__m512i" name="_mm512_set_epi32">
+	<type>Integer</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Set</category>
+	<parameter varname="e15" type="int"/>
+	<parameter varname="e14" type="int"/>
+	<parameter varname="e13" type="int"/>
+	<parameter varname="e12" type="int"/>
+	<parameter varname="e11" type="int"/>
+	<parameter varname="e10" type="int"/>
+	<parameter varname="e9" type="int"/>
+	<parameter varname="e8" type="int"/>
+	<parameter varname="e7" type="int"/>
+	<parameter varname="e6" type="int"/>
+	<parameter varname="e5" type="int"/>
+	<parameter varname="e4" type="int"/>
+	<parameter varname="e3" type="int"/>
+	<parameter varname="e2" type="int"/>
+	<parameter varname="e1" type="int"/>
+	<parameter varname="e0" type="int"/>
+	<description>Set packed 32-bit integers in "dst" with the supplied values.</description>
+	<operation>
+dst[31:0] := e0
+dst[63:32] := e1
+dst[95:64] := e2
+dst[127:96] := e3
+dst[159:128] := e4
+dst[191:160] := e5
+dst[223:192] := e6
+dst[255:224] := e7
+dst[287:256] := e8
+dst[319:288] := e9
+dst[351:320] := e10
+dst[383:352] := e11
+dst[415:384] := e12
+dst[447:416] := e13
+dst[479:448] := e14
+dst[511:480] := e15
+dst[MAX:512] := 0
+	</operation>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" sequence="true" rettype="__m512i" name="_mm512_set_epi64">
+	<type>Integer</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Set</category>
+	<parameter varname="e7" type="__int64"/>
+	<parameter varname="e6" type="__int64"/>
+	<parameter varname="e5" type="__int64"/>
+	<parameter varname="e4" type="__int64"/>
+	<parameter varname="e3" type="__int64"/>
+	<parameter varname="e2" type="__int64"/>
+	<parameter varname="e1" type="__int64"/>
+	<parameter varname="e0" type="__int64"/>
+	<description>Set packed 64-bit integers in "dst" with the supplied values.</description>
+	<operation>
+dst[63:0] := e0
+dst[127:64] := e1
+dst[191:128] := e2
+dst[255:192] := e3
+dst[319:256] := e4
+dst[383:320] := e5
+dst[447:384] := e6
+dst[511:448] := e7
+dst[MAX:512] := 0
+	</operation>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" sequence="true" rettype="__m512d" name="_mm512_set_pd">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Set</category>
+	<parameter varname="e7" type="double"/>
+	<parameter varname="e6" type="double"/>
+	<parameter varname="e5" type="double"/>
+	<parameter varname="e4" type="double"/>
+	<parameter varname="e3" type="double"/>
+	<parameter varname="e2" type="double"/>
+	<parameter varname="e1" type="double"/>
+	<parameter varname="e0" type="double"/>
+	<description>Set packed double-precision (64-bit) floating-point elements in "dst" with the supplied values.</description>
+	<operation>
+dst[63:0] := e0
+dst[127:64] := e1
+dst[191:128] := e2
+dst[255:192] := e3
+dst[319:256] := e4
+dst[383:320] := e5
+dst[447:384] := e6
+dst[511:448] := e7
+dst[MAX:512] := 0
+	</operation>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" sequence="true" rettype="__m512" name="_mm512_set_ps">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Set</category>
+	<parameter varname="e15" type="float"/>
+	<parameter varname="e14" type="float"/>
+	<parameter varname="e13" type="float"/>
+	<parameter varname="e12" type="float"/>
+	<parameter varname="e11" type="float"/>
+	<parameter varname="e10" type="float"/>
+	<parameter varname="e9" type="float"/>
+	<parameter varname="e8" type="float"/>
+	<parameter varname="e7" type="float"/>
+	<parameter varname="e6" type="float"/>
+	<parameter varname="e5" type="float"/>
+	<parameter varname="e4" type="float"/>
+	<parameter varname="e3" type="float"/>
+	<parameter varname="e2" type="float"/>
+	<parameter varname="e1" type="float"/>
+	<parameter varname="e0" type="float"/>
+	<description>Set packed single-precision (32-bit) floating-point elements in "dst" with the supplied values.</description>
+	<operation>
+dst[31:0] := e0
+dst[63:32] := e1
+dst[95:64] := e2
+dst[127:96] := e3
+dst[159:128] := e4
+dst[191:160] := e5
+dst[223:192] := e6
+dst[255:224] := e7
+dst[287:256] := e8
+dst[319:288] := e9
+dst[351:320] := e10
+dst[383:352] := e11
+dst[415:384] := e12
+dst[447:416] := e13
+dst[479:448] := e14
+dst[511:480] := e15
+dst[MAX:512] := 0
+	</operation>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" sequence="true" rettype="__m512i" name="_mm512_setr4_epi32">
+	<type>Integer</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Set</category>
+	<parameter varname="d" type="int"/>
+	<parameter varname="c" type="int"/>
+	<parameter varname="b" type="int"/>
+	<parameter varname="a" type="int"/>
+	<description>Set packed 32-bit integers in "dst" with the repeated 4 element sequence in reverse order.</description>
+	<operation>
+dst[31:0] := a
+dst[63:32] := b
+dst[95:64] := c
+dst[127:96] := d
+dst[159:128] := a
+dst[191:160] := b
+dst[223:192] := c
+dst[255:224] := d
+dst[287:256] := a
+dst[319:288] := b
+dst[351:320] := c
+dst[383:352] := d
+dst[415:384] := a
+dst[447:416] := b
+dst[479:448] := c
+dst[511:480] := d
+dst[MAX:512] := 0
+	</operation>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" sequence="true" rettype="__m512i" name="_mm512_setr4_epi64">
+	<type>Integer</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Set</category>
+	<parameter varname="d" type="__int64"/>
+	<parameter varname="c" type="__int64"/>
+	<parameter varname="b" type="__int64"/>
+	<parameter varname="a" type="__int64"/>
+	<description>Set packed 64-bit integers in "dst" with the repeated 4 element sequence in reverse order.</description>
+	<operation>
+dst[63:0] := a
+dst[127:64] := b
+dst[191:128] := c
+dst[255:192] := d
+dst[319:256] := a
+dst[383:320] := b
+dst[447:384] := c
+dst[511:448] := d
+dst[MAX:512] := 0
+	</operation>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" sequence="true" rettype="__m512d" name="_mm512_setr4_pd">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Set</category>
+	<parameter varname="d" type="double"/>
+	<parameter varname="c" type="double"/>
+	<parameter varname="b" type="double"/>
+	<parameter varname="a" type="double"/>
+	<description>Set packed double-precision (64-bit) floating-point elements in "dst" with the repeated 4 element sequence in reverse order.</description>
+	<operation>
+dst[63:0] := a
+dst[127:64] := b
+dst[191:128] := c
+dst[255:192] := d
+dst[319:256] := a
+dst[383:320] := b
+dst[447:384] := c
+dst[511:448] := d
+dst[MAX:512] := 0
+	</operation>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" sequence="true" rettype="__m512" name="_mm512_setr4_ps">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Set</category>
+	<parameter varname="d" type="float"/>
+	<parameter varname="c" type="float"/>
+	<parameter varname="b" type="float"/>
+	<parameter varname="a" type="float"/>
+	<description>Set packed single-precision (32-bit) floating-point elements in "dst" with the repeated 4 element sequence in reverse order.</description>
+	<operation>
+dst[31:0] := a
+dst[63:32] := b
+dst[95:64] := c
+dst[127:96] := d
+dst[159:128] := a
+dst[191:160] := b
+dst[223:192] := c
+dst[255:224] := d
+dst[287:256] := a
+dst[319:288] := b
+dst[351:320] := c
+dst[383:352] := d
+dst[415:384] := a
+dst[447:416] := b
+dst[479:448] := c
+dst[511:480] := d
+dst[MAX:512] := 0
+	</operation>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" sequence="true" rettype="__m512i" name="_mm512_setr_epi32">
+	<type>Integer</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Set</category>
+	<parameter varname="e15" type="int"/>
+	<parameter varname="e14" type="int"/>
+	<parameter varname="e13" type="int"/>
+	<parameter varname="e12" type="int"/>
+	<parameter varname="e11" type="int"/>
+	<parameter varname="e10" type="int"/>
+	<parameter varname="e9" type="int"/>
+	<parameter varname="e8" type="int"/>
+	<parameter varname="e7" type="int"/>
+	<parameter varname="e6" type="int"/>
+	<parameter varname="e5" type="int"/>
+	<parameter varname="e4" type="int"/>
+	<parameter varname="e3" type="int"/>
+	<parameter varname="e2" type="int"/>
+	<parameter varname="e1" type="int"/>
+	<parameter varname="e0" type="int"/>
+	<description>Set packed 32-bit integers in "dst" with the supplied values in reverse order.</description>
+	<operation>
+dst[31:0] := e15
+dst[63:32] := e14
+dst[95:64] := e13
+dst[127:96] := e12
+dst[159:128] := e11
+dst[191:160] := e10
+dst[223:192] := e9
+dst[255:224] := e8
+dst[287:256] := e7
+dst[319:288] := e6
+dst[351:320] := e5
+dst[383:352] := e4
+dst[415:384] := e3
+dst[447:416] := e2
+dst[479:448] := e1
+dst[511:480] := e0
+dst[MAX:512] := 0
+	</operation>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" sequence="true" rettype="__m512i" name="_mm512_setr_epi64">
+	<type>Integer</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Set</category>
+	<parameter varname="e7" type="__int64"/>
+	<parameter varname="e6" type="__int64"/>
+	<parameter varname="e5" type="__int64"/>
+	<parameter varname="e4" type="__int64"/>
+	<parameter varname="e3" type="__int64"/>
+	<parameter varname="e2" type="__int64"/>
+	<parameter varname="e1" type="__int64"/>
+	<parameter varname="e0" type="__int64"/>
+	<description>Set packed 64-bit integers in "dst" with the supplied values in reverse order.</description>
+	<operation>
+dst[63:0] := e7
+dst[127:64] := e6
+dst[191:128] := e5
+dst[255:192] := e4
+dst[319:256] := e3
+dst[383:320] := e2
+dst[447:384] := e1
+dst[511:448] := e0
+dst[MAX:512] := 0
+	</operation>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" sequence="true" rettype="__m512d" name="_mm512_setr_pd">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Set</category>
+	<parameter varname="e7" type="double"/>
+	<parameter varname="e6" type="double"/>
+	<parameter varname="e5" type="double"/>
+	<parameter varname="e4" type="double"/>
+	<parameter varname="e3" type="double"/>
+	<parameter varname="e2" type="double"/>
+	<parameter varname="e1" type="double"/>
+	<parameter varname="e0" type="double"/>
+	<description>Set packed double-precision (64-bit) floating-point elements in "dst" with the supplied values in reverse order.</description>
+	<operation>
+dst[63:0] := e7
+dst[127:64] := e6
+dst[191:128] := e5
+dst[255:192] := e4
+dst[319:256] := e3
+dst[383:320] := e2
+dst[447:384] := e1
+dst[511:448] := e0
+dst[MAX:512] := 0
+	</operation>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" sequence="true" rettype="__m512" name="_mm512_setr_ps">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Set</category>
+	<parameter varname="e15" type="float"/>
+	<parameter varname="e14" type="float"/>
+	<parameter varname="e13" type="float"/>
+	<parameter varname="e12" type="float"/>
+	<parameter varname="e11" type="float"/>
+	<parameter varname="e10" type="float"/>
+	<parameter varname="e9" type="float"/>
+	<parameter varname="e8" type="float"/>
+	<parameter varname="e7" type="float"/>
+	<parameter varname="e6" type="float"/>
+	<parameter varname="e5" type="float"/>
+	<parameter varname="e4" type="float"/>
+	<parameter varname="e3" type="float"/>
+	<parameter varname="e2" type="float"/>
+	<parameter varname="e1" type="float"/>
+	<parameter varname="e0" type="float"/>
+	<description>Set packed single-precision (32-bit) floating-point elements in "dst" with the supplied values in reverse order.</description>
+	<operation>
+dst[31:0] := e15
+dst[63:32] := e14
+dst[95:64] := e13
+dst[127:96] := e12
+dst[159:128] := e11
+dst[191:160] := e10
+dst[223:192] := e9
+dst[255:224] := e8
+dst[287:256] := e7
+dst[319:288] := e6
+dst[351:320] := e5
+dst[383:352] := e4
+dst[415:384] := e3
+dst[447:416] := e2
+dst[479:448] := e1
+dst[511:480] := e0
+dst[MAX:512] := 0
+	</operation>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512" name="_mm512_setzero">
+	<CPUID>AVX512F</CPUID>
+	<category>Set</category>
+	<parameter varname="" type="void"/>
+	<description>Return vector of type __m512 with all elements set to zero.</description>
+	<operation>
+dst[MAX:0] := 0
+	</operation>
+	<instruction name='vpxorq' form='zmm {k}, zmm, zmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512i" name="_mm512_setzero_epi32">
+	<type>Integer</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Set</category>
+	<description>Return vector of type __m512i with all elements set to zero.</description>
+	<operation>
+dst[MAX:0] := 0
+	</operation>
+	<instruction name='vpxorq' form='zmm {k}, zmm, zmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512d" name="_mm512_setzero_pd">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Set</category>
+	<description>Return vector of type __m512d with all elements set to zero.</description>
+	<operation>
+dst[MAX:0] := 0
+	</operation>
+	<instruction name='vpxorq' form='zmm {k}, zmm, zmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512" name="_mm512_setzero_ps">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Set</category>
+	<description>Return vector of type __m512 with all elements set to zero.</description>
+	<operation>
+dst[MAX:0] := 0
+	</operation>
+	<instruction name='vpxorq' form='zmm {k}, zmm, zmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512i" name="_mm512_setzero_si512">
+	<type>Integer</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Set</category>
+	<description>Return vector of type __m512i with all elements set to zero.</description>
+	<operation>
+dst[MAX:0] := 0
+	</operation>
+	<instruction name='vpxorq' form='zmm {k}, zmm, zmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512" name="_mm512_undefined">
+	<CPUID>AVX512F</CPUID>
+	<category>General Support</category>
+	<parameter varname="" type="void"/>
+	<description>Return vector of type __m512 with undefined elements.</description>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512i" name="_mm512_undefined_epi32">
+	<type>Integer</type>
+	<CPUID>AVX512F</CPUID>
+	<category>General Support</category>
+	<description>Return vector of type __m512i with undefined elements.</description>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512d" name="_mm512_undefined_pd">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>General Support</category>
+	<description>Return vector of type __m512d with undefined elements.</description>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512" name="_mm512_undefined_ps">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>General Support</category>
+	<description>Return vector of type __m512 with undefined elements.</description>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="void" name="_mm512_prefetch_i64gather_ps">
+	<type>Floating Point</type>
+	<CPUID>AVX512PF</CPUID>
+	<category>Load</category>
+	<parameter varname="vindex" type="__m512i"/>
+	<parameter varname="base_addr" type="void const*"/>
+	<parameter varname="scale" type="int"/>
+	<parameter varname="hint" type="int"/>
+	<description>Prefetch single-precision (32-bit) floating-point elements from memory using 64-bit indices. 32-bit elements are loaded from addresses starting at "base_addr" and offset by each 64-bit element in "vindex" (each index is scaled by the factor in "scale"). Gathered elements are merged in cache. "scale" should be 1, 2, 4 or 8. The "hint" parameter may be 1 (_MM_HINT_T0) for prefetching to L1 cache, or 2 (_MM_HINT_T1) for prefetching to L2 cache.</description>
+	<operation>
+FOR j:= 0 to 7
+	i := j*64;
+	Prefetch([base_addr + SignExtend(vindex[i+63:i]) * scale], hint, RFO=0);
+ENDFOR;
+	</operation>
+	<instruction name='vgatherpf0qps' form='vm64z {k}'/>
+	<instruction name='vgatherpf1qps' form='vm64z {k}'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="void" name="_mm512_mask_prefetch_i64gather_ps">
+	<type>Floating Point</type>
+	<CPUID>AVX512PF</CPUID>
+	<category>Load</category>
+	<parameter varname="vindex" type="__m512i"/>
+	<parameter varname="mask" type="__mmask8"/>
+	<parameter varname="base_addr" type="void const*"/>
+	<parameter varname="scale" type="int"/>
+	<parameter varname="hint" type="int"/>
+	<description>Prefetch single-precision (32-bit) floating-point elements from memory using 64-bit indices. 32-bit elements are loaded from addresses starting at "base_addr" and offset by each 64-bit element in "vindex" (each index is scaled by the factor in "scale"). Gathered elements are merged in cache using writemask "k" (elements are only brought into cache when their corresponding mask bit is set). "scale" should be 1, 2, 4 or 8. The "hint" parameter may be 1 (_MM_HINT_T0) for prefetching to L1 cache, or 2 (_MM_HINT_T1) for prefetching to L2 cache.</description>
+	<operation>
+FOR j:= 0 to 7
+	i := j*64;
+	IF mask[j] THEN
+		Prefetch([base_addr + SignExtend(vindex[i+63:i]) * scale], hint, RFO=0);
+	FI
+ENDFOR;
+	</operation>
+	<instruction name='vgatherpf0qps' form='vm64z {k}'/>
+	<instruction name='vgatherpf1qps' form='vm64z {k}'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="void" name="_mm512_prefetch_i64scatter_ps">
+	<type>Floating Point</type>
+	<CPUID>AVX512PF</CPUID>
+	<category>Store</category>
+	<parameter varname="base_addr" type="void*"/>
+	<parameter varname="vindex" type="__m512i"/>
+	<parameter varname="scale" type="int"/>
+	<parameter varname="hint" type="int"/>
+	<description>Prefetch single-precision (32-bit) floating-point elements with intent to write into memory using 64-bit indices. Elements are prefetched into cache level "hint", where "hint" is 0 or 1. 32-bit elements are stored at addresses starting at "base_addr" and offset by each 64-bit element in "vindex" (each index is scaled by the factor in "scale"). "scale" should be 1, 2, 4 or 8.</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*64;
+	Prefetch([base_addr + SignExtend(vindex[i+63:i]) * scale], Level=hint, RFO=1);
+ENDFOR;
+	</operation>
+	<instruction name='vscatterpf0qps' form='vm64z {k}'/>
+	<instruction name='vscatterpf1qps' form='vm64z {k}'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="void" name="_mm512_mask_prefetch_i64scatter_ps">
+	<type>Floating Point</type>
+	<CPUID>AVX512PF</CPUID>
+	<category>Store</category>
+	<parameter varname="base_addr" type="void*"/>
+	<parameter varname="mask" type="__mmask8"/>
+	<parameter varname="vindex" type="__m512i"/>
+	<parameter varname="scale" type="int"/>
+	<parameter varname="hint" type="int"/>
+	<description>Prefetch single-precision (32-bit) floating-point elements with intent to write into memory using 64-bit indices. The "hint" parameter may be 1 (_MM_HINT_T0) for prefetching to L1 cache, or 2 (_MM_HINT_T1) for prefetching to L2 cache. 32-bit elements are stored at addresses starting at "base_addr" and offset by each 64-bit element in "vindex" (each index is scaled by the factor in "scale") subject to mask "k" (elements are not brought into cache when the corresponding mask bit is not set). "scale" should be 1, 2, 4 or 8.</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*64;
+	IF mask[j] THEN
+		Prefetch([base_addr + SignExtend(vindex[i+63:i]) * scale], Level=hint, RFO=1);
+	FI
+ENDFOR;
+	</operation>
+	<instruction name='vscatterpf0qps' form='vm64z {k}'/>
+	<instruction name='vscatterpf1qps' form='vm64z {k}'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="void" name="_mm512_prefetch_i32gather_pd">
+	<type>Floating Point</type>
+	<CPUID>AVX512PF</CPUID>
+	<category>Load</category>
+	<parameter varname="vindex" type="__m256i"/>
+	<parameter varname="base_addr" type="void const*"/>
+	<parameter varname="scale" type="int"/>
+	<parameter varname="hint" type="int"/>
+	<description>Prefetch double-precision (64-bit) floating-point elements from memory using 32-bit indices. 64-bit elements are loaded from addresses starting at "base_addr" and offset by each 32-bit element in "vindex" (each index is scaled by the factor in "scale"). Gathered elements are merged in cache. "scale" should be 1, 2, 4 or 8. The "hint" parameter may be 1 (_MM_HINT_T0) for prefetching to L1 cache, or 2 (_MM_HINT_T1) for prefetching to L2 cache.</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*32;
+	Prefetch([base_addr + SignExtend(vindex[i+31:i]) * scale], hint, RFO=0);
+ENDFOR;
+	</operation>
+	<instruction name='vgatherpf0dpd' form='vm32y {k}'/>
+	<instruction name='vgatherpf1dpd' form='vm32y {k}'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="void" name="_mm512_mask_prefetch_i32gather_pd">
+	<type>Floating Point</type>
+	<CPUID>AVX512PF</CPUID>
+	<category>Load</category>
+	<parameter varname="vindex" type="__m256i"/>
+	<parameter varname="mask" type="__mmask8"/>
+	<parameter varname="base_addr" type="void const*"/>
+	<parameter varname="scale" type="int"/>
+	<parameter varname="hint" type="int"/>
+	<description>Prefetch double-precision (64-bit) floating-point elements from memory using 32-bit indices. 64-bit elements are loaded from addresses starting at "base_addr" and offset by each 32-bit element in "vindex" (each index is scaled by the factor in "scale"). Gathered elements are merged in cache using writemask "k" (elements are brought into cache only when their corresponding mask bits are set). "scale" should be 1, 2, 4 or 8. The "hint" parameter may be 1 (_MM_HINT_T0) for prefetching to L1 cache, or 2 (_MM_HINT_T1) for prefetching to L2 cache.</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*32;
+	IF mask[j] THEN
+		Prefetch([base_addr + SignExtend(vindex[i+31:i]) * scale], hint, RFO=0);
+	FI
+ENDFOR;
+	</operation>
+	<instruction name='vgatherpf0dpd' form='vm32y {k}'/>
+	<instruction name='vgatherpf1dpd' form='vm32y {k}'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" rettype="void" name="_mm512_mask_prefetch_i32gather_ps">
+	<type>Floating Point</type>
+	<CPUID>AVX512PF/KNCNI</CPUID>
+	<category>Load</category>
+	<parameter varname="vindex" type="__m512i"/>
+	<parameter varname="mask" type="__mmask16"/>
+	<parameter varname="base_addr" type="void const*"/>
+	<parameter varname="scale" type="int"/>
+	<parameter varname="hint" type="int"/>
+	<description>Prefetch single-precision (32-bit) floating-point elements from memory using 32-bit indices. 32-bit elements are loaded from addresses starting at "base_addr" and offset by each 32-bit element in "vindex" (each index is scaled by the factor in "scale"). Gathered elements are merged in cache using writemask "k" (elements are brought into cache only when their corresponding mask bits are set). "scale" should be 1, 2, 4 or 8. The "hint" parameter may be 1 (_MM_HINT_T0) for prefetching to L1 cache, or 2 (_MM_HINT_T1) for prefetching to L2 cache.</description>
+	<operation>
+FOR j := 0 to 15
+	i := j*32;
+	IF mask[j] THEN
+		Prefetch([base_addr + SignExtend(vindex[i+31:i]) * scale], hint, RFO=0);
+	FI
+ENDFOR;
+	</operation>
+	<instruction name='vgatherpf0dps' form='vm32y {k}'/>
+	<instruction name='vgatherpf1dps' form='vm32y {k}'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="void" name="_mm512_prefetch_i32scatter_pd">
+	<type>Floating Point</type>
+	<CPUID>AVX512PF</CPUID>
+	<category>Store</category>
+	<parameter varname="base_addr" type="void*"/>
+	<parameter varname="vindex" type="__m256i"/>
+	<parameter varname="scale" type="int"/>
+	<parameter varname="hint" type="int"/>
+	<description>Prefetch double-precision (64-bit) floating-point elements with intent to write using 32-bit indices. The "hint" parameter may be 1 (_MM_HINT_T0) for prefetching to L1 cache, or 2 (_MM_HINT_T1) for prefetching to L2 cache. 64-bit elements are brought into cache from addresses starting at "base_addr" and offset by each 32-bit element in "vindex" (each index is scaled by the factor in "scale"). "scale" should be 1, 2, 4 or 8.</description>
+	<operation>
+FOR j := 0 TO 7
+	i := j*32;
+	Prefetch([base_addr + SignExtend(vindex[i+31:i]) * scale], Level=hint, RFO=1);
+ENDFOR;
+	</operation>
+	<instruction name='vscatterpf0dpd' form='vm32y {k}'/>
+	<instruction name='vscatterpf1dpd' form='vm32y {k}'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="void" name="_mm512_mask_prefetch_i32scatter_pd">
+	<type>Floating Point</type>
+	<CPUID>AVX512PF</CPUID>
+	<category>Store</category>
+	<parameter varname="base_addr" type="void*"/>
+	<parameter varname="mask" type="__mmask8"/>
+	<parameter varname="vindex" type="__m256i"/>
+	<parameter varname="scale" type="int"/>
+	<parameter varname="hint" type="int"/>
+	<description>Prefetch double-precision (64-bit) floating-point elements with intent to write using 32-bit indices. The "hint" parameter may be 1 (_MM_HINT_T0) for prefetching to L1 cache, or 2 (_MM_HINT_T1) for prefetching to L2 cache. 64-bit elements are brought into cache from addresses starting at "base_addr" and offset by each 32-bit element in "vindex" (each index is scaled by the factor in "scale") subject to mask "k" (elements are not brought into cache when the corresponding mask bit is not set). "scale" should be 1, 2, 4 or 8.</description>
+	<operation>
+FOR j := 0 TO 7
+	i := j*32;
+	IF mask[j] THEN
+		Prefetch([base_addr + SignExtend(vindex[i+31:i]) * scale], Level=hint, RFO=1);
+	FI
+ENDFOR;
+	</operation>
+	<instruction name='vscatterpf0dpd' form='vm32y {k}'/>
+	<instruction name='vscatterpf1dpd' form='vm32y {k}'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="void" name="_mm512_prefetch_i64gather_pd">
+	<type>Floating Point</type>
+	<CPUID>AVX512PF</CPUID>
+	<category>Load</category>
+	<parameter varname="vindex" type="__m512i"/>
+	<parameter varname="base_addr" type="void const*"/>
+	<parameter varname="scale" type="int"/>
+	<parameter varname="hint" type="int"/>
+	<description>Prefetch double-precision (64-bit) floating-point elements from memory into cache level specified by "hint" using 64-bit indices. 64-bit elements are loaded from addresses starting at "base_addr" and offset by each 64-bit element in "vindex" (each index is scaled by the factor in "scale"). "scale" should be 1, 2, 4 or 8. The "hint" parameter may be 1 (_MM_HINT_T0) for prefetching to L1 cache, or 2 (_MM_HINT_T1) for prefetching to L2 cache.</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*64;
+	Prefetch([base_addr + SignExtend(vindex[i+63:i]) * scale], Level=hint, RFO=0);
+ENDFOR;
+	</operation>
+	<instruction name='vgatherpf0qpd' form='vm32z {k}'/>
+	<instruction name='vgatherpf1qpd' form='vm32z {k}'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="void" name="_mm512_mask_prefetch_i64gather_pd">
+	<type>Floating Point</type>
+	<CPUID>AVX512PF</CPUID>
+	<category>Load</category>
+	<parameter varname="vindex" type="__m512i"/>
+	<parameter varname="mask" type="__mmask8"/>
+	<parameter varname="base_addr" type="void const*"/>
+	<parameter varname="scale" type="int"/>
+	<parameter varname="hint" type="int"/>
+	<description>Prefetch double-precision (64-bit) floating-point elements from memory into cache level specified by "hint" using 64-bit indices. 64-bit elements are loaded from addresses starting at "base_addr" and offset by each 64-bit element in "vindex" (each index is scaled by the factor in "scale"). Prefetched elements are merged in cache using writemask "k" (elements are copied from memory when the corresponding mask bit is set). "scale" should be 1, 2, 4 or 8. The "hint" parameter may be 1 (_MM_HINT_T0) for prefetching to L1 cache, or 2 (_MM_HINT_T1) for prefetching to L2 cache.</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*64;
+	IF mask[j] THEN
+		Prefetch([base_addr + SignExtend(vindex[i+63:i]) * scale], Level=hint, RFO=0);
+	FI
+ENDFOR;
+	</operation>
+	<instruction name='vgatherpf0qpd' form='vm32z {k}'/>
+	<instruction name='vgatherpf1qpd' form='vm32z {k}'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="void" name="_mm512_prefetch_i64scatter_pd">
+	<type>Floating Point</type>
+	<CPUID>AVX512PF</CPUID>
+	<category>Store</category>
+	<parameter varname="base_addr" type="void*"/>
+	<parameter varname="vindex" type="__m512i"/>
+	<parameter varname="scale" type="int"/>
+	<parameter varname="hint" type="int"/>
+	<description>Prefetch double-precision (64-bit) floating-point elements with intent to write into memory using 64-bit indices. The "hint" parameter may be 1 (_MM_HINT_T0) for prefetching to L1 cache, or 2 (_MM_HINT_T1) for prefetching to L2 cache. 64-bit elements are brought into cache from addresses starting at "base_addr" and offset by each 64-bit element in "vindex" (each index is scaled by the factor in "scale"). "scale" should be 1, 2, 4 or 8.</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*64;
+	Prefetch([base_addr + SignExtend(vindex[i+63:i]) * scale], Level=hint, RFO=1);
+ENDFOR;
+	</operation>
+	<instruction name='vscatterpf0qpd' form='vm32z {k}'/>
+	<instruction name='vscatterpf1qpd' form='vm32z {k}'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="void" name="_mm512_mask_prefetch_i64scatter_pd">
+	<type>Floating Point</type>
+	<CPUID>AVX512PF</CPUID>
+	<category>Store</category>
+	<parameter varname="base_addr" type="void*"/>
+	<parameter varname="mask" type="__mmask8"/>
+	<parameter varname="vindex" type="__m512i"/>
+	<parameter varname="scale" type="int"/>
+	<parameter varname="hint" type="int"/>
+	<description>Prefetch double-precision (64-bit) floating-point elements with intent to write into memory using 64-bit indices. The "hint" parameter may be 1 (_MM_HINT_T0) for prefetching to L1 cache, or 2 (_MM_HINT_T1) for prefetching to L2 cache. 64-bit elements are brought into cache from addresses starting at "base_addr" and offset by each 64-bit element in "vindex" (each index is scaled by the factor in "scale") subject to mask "k" (elements are not brought into cache when the corresponding mask bit is not set). "scale" should be 1, 2, 4 or 8.</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*64;
+	IF mask[j] THEN
+		Prefetch([base_addr + SignExtend(vindex[i+63:i]) * scale], Level=hint, RFO=1);
+	FI
+ENDFOR;
+	</operation>
+	<instruction name='vscatterpf0qpd' form='vm32z {k}'/>
+	<instruction name='vscatterpf1qpd' form='vm32z {k}'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512" name="_mm512_exp2a23_round_ps">
+	<type>Floating Point</type>
+	<CPUID>AVX512ER</CPUID>
+	<category>Elementary Math Functions</category>
+	<parameter varname="a" type="__m512"/>
+	<parameter varname="rounding" type="int"/>
+	<description>Compute the approximate exponential value of 2 raised to the power of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst". The maximum relative error for this approximation is less than 2^-23. [round_note]</description>
+	<operation>
+FOR j := 0 to 15
+	i := j*32;
+	dst[i+31:i] := EXP_2_23_SP(a[i+31:i]);
+ENDFOR;
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vexp2ps' form='zmm {k}, zmm {er}'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512" name="_mm512_exp2a23_ps">
+	<type>Floating Point</type>
+	<CPUID>AVX512ER</CPUID>
+	<category>Elementary Math Functions</category>
+	<parameter varname="a" type="__m512"/>
+	<description>Compute the approximate exponential value of 2 raised to the power of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst". The maximum relative error for this approximation is less than 2^-23.</description>
+	<operation>
+FOR j := 0 to 15
+	i := j*32;
+	dst[i+31:i] := EXP_2_23_SP(a[i+31:i]);
+ENDFOR;
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vexp2ps' form='zmm {k}, zmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512" name="_mm512_mask_exp2a23_round_ps">
+	<type>Floating Point</type>
+	<CPUID>AVX512ER</CPUID>
+	<category>Elementary Math Functions</category>
+	<parameter varname="src" type="__m512"/>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="a" type="__m512"/>
+	<parameter varname="rounding" type="int"/>
+	<description>Compute the approximate exponential value of 2 raised to the power of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 2^-23. [round_note]</description>
+	<operation>
+FOR j := 0 to 15
+	i := j*32;
+	IF k[j] THEN
+		dst[i+31:i] := EXP_2_23_SP(a[i+31:i]);
+	ELSE
+		dst[i+31:i] := src[i+31:i];
+	FI
+ENDFOR;
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vexp2ps' form='zmm {k}, zmm {er}'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512" name="_mm512_mask_exp2a23_ps">
+	<type>Floating Point</type>
+	<CPUID>AVX512ER</CPUID>
+	<category>Elementary Math Functions</category>
+	<parameter varname="src" type="__m512"/>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="a" type="__m512"/>
+	<description>Compute the approximate exponential value of 2 raised to the power of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 2^-23.</description>
+	<operation>
+FOR j := 0 to 15
+	i := j*32;
+	IF k[j] THEN
+		dst[i+31:i] := EXP_2_23_SP(a[i+31:i]);
+	ELSE
+		dst[i+31:i] := src[i+31:i];
+	FI
+ENDFOR;
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vexp2ps' form='zmm {k}, zmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512" name="_mm512_maskz_exp2a23_round_ps">
+	<type>Floating Point</type>
+<CPUID>AVX512ER</CPUID>
+	<category>Elementary Math Functions</category>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="a" type="__m512"/>
+	<parameter varname="rounding" type="int"/>
+	<description>Compute the approximate exponential value of 2 raised to the power of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 2^-23. [round_note]</description>
+	<operation>
+FOR j := 0 to 15
+	i := j*32;
+	IF k[j] THEN
+		dst[i+31:i] := EXP_2_23_SP(a[i+31:i]);
+	ELSE
+		dst[i+31:i] := 0;
+	FI
+ENDFOR;
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vexp2ps' form='zmm {k}, zmm {er}'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512" name="_mm512_maskz_exp2a23_ps">
+	<type>Floating Point</type>
+	<CPUID>AVX512ER</CPUID>
+	<category>Elementary Math Functions</category>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="a" type="__m512"/>
+	<description>Compute the approximate exponential value of 2 raised to the power of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 2^-23.</description>
+	<operation>
+FOR j := 0 to 15
+	i := j*32;
+	IF k[j] THEN
+		dst[i+31:i] := EXP_2_23_SP(a[i+31:i]);
+	ELSE
+		dst[i+31:i] := 0;
+	FI
+ENDFOR;
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vexp2ps' form='zmm {k}, zmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512d" name="_mm512_exp2a23_round_pd">
+	<type>Floating Point</type>
+	<CPUID>AVX512ER</CPUID>
+	<category>Elementary Math Functions</category>
+	<parameter varname="a" type="__m512d"/>
+	<parameter varname="rounding" type="int"/>
+	<description>Compute the approximate exponential value of 2 raised to the power of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst". The maximum relative error for this approximation is less than 2^-23. [round_note]</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*64;
+	dst[i+63:i] := EXP_2_23_DP(a[i+63:i]);
+ENDFOR;
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vexp2pd' form='zmm {k}, zmm {er}'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512d" name="_mm512_exp2a23_pd">
+	<type>Floating Point</type>
+	<CPUID>AVX512ER</CPUID>
+	<category>Elementary Math Functions</category>
+	<parameter varname="a" type="__m512d"/>
+	<description>Compute the approximate exponential value of 2 raised to the power of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst". The maximum relative error for this approximation is less than 2^-23.</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*64;
+	dst[i+63:i] := EXP_2_23_DP(a[i+63:i]);
+ENDFOR;
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vexp2pd' form='zmm {k}, zmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512d" name="_mm512_mask_exp2a23_round_pd">
+	<type>Floating Point</type>
+	<CPUID>AVX512ER</CPUID>
+	<category>Elementary Math Functions</category>
+	<parameter varname="a" type="__m512d"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="src" type="__m512d"/>
+	<parameter varname="rounding" type="int"/>
+	<description>Compute the approximate exponential value of 2 raised to the power of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 2^-23. [round_note]</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*64;
+	IF k[j] THEN
+		dst[i+63:i] := EXP_2_23_DP(a[i+63:i]);
+	ELSE
+		dst[i+63:i] := src[i+63:i];
+	FI
+ENDFOR;
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vexp2pd' form='zmm {k}, zmm {er}'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512d" name="_mm512_mask_exp2a23_pd">
+	<type>Floating Point</type>
+	<CPUID>AVX512ER</CPUID>
+	<category>Elementary Math Functions</category>
+	<parameter varname="a" type="__m512d"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="src" type="__m512d"/>
+	<description>Compute the approximate exponential value of 2 raised to the power of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 2^-23.</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*64;
+	IF k[j] THEN
+		dst[i+63:i] := EXP_2_23_DP(a[i+63:i]);
+	ELSE
+		dst[i+63:i] := src[i+63:i];
+	FI
+ENDFOR;
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vexp2pd' form='zmm {k}, zmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512d" name="_mm512_maskz_exp2a23_round_pd">
+	<type>Floating Point</type>
+	<CPUID>AVX512ER</CPUID>
+	<category>Elementary Math Functions</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m512d"/>
+	<parameter varname="rounding" type="int"/>
+	<description>Compute the approximate exponential value of 2 raised to the power of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 2^-23. [round_note]</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*64;
+	IF k[j] THEN
+		dst[i+63:i] := EXP_2_23_DP(a[i+63:i]);
+	ELSE
+		dst[i+63:i] := 0;
+	FI
+ENDFOR;
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vexp2pd' form='zmm {k}, zmm {er}'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512d" name="_mm512_maskz_exp2a23_pd">
+	<type>Floating Point</type>
+	<CPUID>AVX512ER</CPUID>
+	<category>Elementary Math Functions</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m512d"/>
+	<description>Compute the approximate exponential value of 2 raised to the power of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 2^-23.</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*64;
+	IF k[j] THEN
+		dst[i+63:i] := EXP_2_23_DP(a[i+63:i]);
+	ELSE
+		dst[i+63:i] := 0;
+	FI
+ENDFOR;
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vexp2pd' form='zmm {k}, zmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128d" name="_mm_rcp28_round_sd">
+	<type>Floating Point</type>
+	<CPUID>AVX512ER</CPUID>
+	<category>Elementary Math Functions</category>
+	<parameter varname="a" type="__m128d"/>
+	<parameter varname="b" type="__m128d"/>
+	<parameter varname="rounding" type="int"/>
+	<description>Compute the approximate reciprocal of the lower double-precision (64-bit) floating-point element in "b", store the result in the lower element of "dst", and copy the upper element from "a" to the upper element of "dst". The maximum relative error for this approximation is less than 2^-28. [round_note]</description>
+	<operation>
+dst[63:0] := RCP_28_DP(1.0/b[63:0]);
+dst[127:64] := a[127:64];
+dst[MAX:128] := 0;
+	</operation>
+	<instruction name='vrcp28sd' form='xmm {k}, xmm, xmm {er}'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128d" name="_mm_rcp28_sd">
+	<type>Floating Point</type>
+	<CPUID>AVX512ER</CPUID>
+	<category>Elementary Math Functions</category>
+	<parameter varname="a" type="__m128d"/>
+	<parameter varname="b" type="__m128d"/>
+	<description>Compute the approximate reciprocal of the lower double-precision (64-bit) floating-point element in "b", store the result in the lower element of "dst", and copy the upper element from "a" to the upper element of "dst". The maximum relative error for this approximation is less than 2^-28.</description>
+	<operation>
+dst[63:0] := RCP_28_DP(1.0/b[63:0]);
+dst[127:64] := a[127:64];
+dst[MAX:128] := 0;
+	</operation>
+	<instruction name='vrcp28sd' form='xmm {k}, xmm, xmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128d" name="_mm_mask_rcp28_round_sd">
+	<type>Floating Point</type>
+	<CPUID>AVX512ER</CPUID>
+	<category>Elementary Math Functions</category>
+	<parameter varname="src" type="__m128d"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128d"/>
+	<parameter varname="b" type="__m128d"/>
+	<parameter varname="rounding" type="int"/>
+	<description>Compute the approximate reciprocal of the lower double-precision (64-bit) floating-point element in "b", store the result in the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst". The maximum relative error for this approximation is less than 2^-28. [round_note]</description>
+	<operation>
+IF k[0] THEN
+	dst[63:0] := RCP_28_DP(1.0/b[63:0]);
+ELSE
+	dst[63:0] := src[63:0];
+FI
+dst[127:64] := a[127:64];
+dst[MAX:128] := 0;
+	</operation>
+	<instruction name='vrcp28sd' form='xmm {k}, xmm, xmm {er}'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128d" name="_mm_mask_rcp28_sd">
+	<type>Floating Point</type>
+	<CPUID>AVX512ER</CPUID>
+	<category>Elementary Math Functions</category>
+	<parameter varname="src" type="__m128d"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128d"/>
+	<parameter varname="b" type="__m128d"/>
+	<description>Compute the approximate reciprocal of the lower double-precision (64-bit) floating-point element in "b", store the result in the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst". The maximum relative error for this approximation is less than 2^-28.</description>
+	<operation>
+IF k[0] THEN
+	dst[63:0] := RCP_28_DP(1.0/b[63:0]);
+ELSE
+	dst[63:0] := src[63:0];
+FI
+dst[127:64] := a[127:64];
+dst[MAX:128] := 0;
+	</operation>
+	<instruction name='vrcp28sd' form='xmm {k}, xmm, xmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128d" name="_mm_maskz_rcp28_round_sd">
+	<type>Floating Point</type>
+	<CPUID>AVX512ER</CPUID>
+	<category>Elementary Math Functions</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128d"/>
+	<parameter varname="b" type="__m128d"/>
+	<parameter varname="rounding" type="int"/>
+	<description>Compute the approximate reciprocal of the lower double-precision (64-bit) floating-point element in "b", store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst". The maximum relative error for this approximation is less than 2^-28. [round_note]</description>
+	<operation>
+IF k[0] THEN
+	dst[63:0] := RCP_28_DP(1.0/b[63:0]);
+ELSE
+	dst[63:0] := 0;
+FI
+dst[127:64] := a[127:64];
+dst[MAX:128] := 0;
+	</operation>
+	<instruction name='vrcp28sd' form='xmm {k}, xmm, xmm {er}'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128d" name="_mm_maskz_rcp28_sd">
+	<type>Floating Point</type>
+	<CPUID>AVX512ER</CPUID>
+	<category>Elementary Math Functions</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128d"/>
+	<parameter varname="b" type="__m128d"/>
+	<description>Compute the approximate reciprocal of the lower double-precision (64-bit) floating-point element in "b", store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst". The maximum relative error for this approximation is less than 2^-28.</description>
+	<operation>
+IF k[0] THEN
+	dst[63:0] := RCP_28_DP(1.0/b[63:0]);
+ELSE
+	dst[63:0] := 0;
+FI
+dst[127:64] := a[127:64];
+dst[MAX:128] := 0;
+	</operation>
+	<instruction name='vrcp28sd' form='xmm {k}, xmm, xmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128" name="_mm_rcp28_round_ss">
+	<type>Floating Point</type>
+	<CPUID>AVX512ER</CPUID>
+	<category>Elementary Math Functions</category>
+	<parameter varname="a" type="__m128"/>
+	<parameter varname="b" type="__m128"/>
+	<parameter varname="rounding" type="int"/>
+	<description>Compute the approximate reciprocal of the lower single-precision (32-bit) floating-point element in "b", store the result in the lower element of "dst", and copy the upper 3 packed elements from "a" to the upper elements of "dst". The maximum relative error for this approximation is less than 2^-28. [round_note]</description>
+	<operation>
+dst[31:0] := RCP_28_SP(1.0/b[31:0]);
+dst[127:32] := a[127:32];
+dst[MAX:128] := 0;
+	</operation>
+	<instruction name='vrcp28ss' form='xmm {k}, xmm, xmm {er}'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128" name="_mm_rcp28_ss">
+	<type>Floating Point</type>
+	<CPUID>AVX512ER</CPUID>
+	<category>Elementary Math Functions</category>
+	<parameter varname="a" type="__m128"/>
+	<parameter varname="b" type="__m128"/>
+	<description>Compute the approximate reciprocal of the lower single-precision (32-bit) floating-point element in "b", store the result in the lower element of "dst", and copy the upper 3 packed elements from "a" to the upper elements of "dst". The maximum relative error for this approximation is less than 2^-28.</description>
+	<operation>
+dst[31:0] := RCP_28_SP(1.0/b[31:0]);
+dst[127:32] := a[127:32];
+dst[MAX:128] := 0;
+	</operation>
+	<instruction name='vrcp28ss' form='xmm {k}, xmm, xmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128" name="_mm_mask_rcp28_round_ss">
+	<type>Floating Point</type>
+	<CPUID>AVX512ER</CPUID>
+	<category>Elementary Math Functions</category>
+	<parameter varname="src" type="__m128"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128"/>
+	<parameter varname="b" type="__m128"/>
+	<parameter varname="rounding" type="int"/>
+	<description>Compute the approximate reciprocal of the lower single-precision (32-bit) floating-point element in "b", store the result in the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper 3 packed elements from "a" to the upper elements of "dst". The maximum relative error for this approximation is less than 2^-28. [round_note]</description>
+	<operation>
+IF k[0] THEN
+	dst[31:0] := RCP_28_SP(1.0/b[31:0]);
+ELSE
+	dst[31:0] := src[31:0];
+FI
+dst[127:32] := a[127:32];
+dst[MAX:128] := 0;
+	</operation>
+	<instruction name='vrcp28ss' form='xmm {k}, xmm, xmm {er}'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128" name="_mm_mask_rcp28_ss">
+	<type>Floating Point</type>
+	<CPUID>AVX512ER</CPUID>
+	<category>Elementary Math Functions</category>
+	<parameter varname="src" type="__m128"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128"/>
+	<parameter varname="b" type="__m128"/>
+	<description>Compute the approximate reciprocal of the lower single-precision (32-bit) floating-point element in "b", store the result in the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper 3 packed elements from "a" to the upper elements of "dst". The maximum relative error for this approximation is less than 2^-28.</description>
+	<operation>
+IF k[0] THEN
+	dst[31:0] := RCP_28_SP(1.0/b[31:0]);
+ELSE
+	dst[31:0] := src[31:0];
+FI
+dst[127:32] := a[127:32];
+dst[MAX:128] := 0;
+	</operation>
+	<instruction name='vrcp28ss' form='xmm {k}, xmm, xmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128" name="_mm_maskz_rcp28_round_ss">
+	<type>Floating Point</type>
+	<CPUID>AVX512ER</CPUID>
+	<category>Elementary Math Functions</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128"/>
+	<parameter varname="b" type="__m128"/>
+	<parameter varname="rounding" type="int"/>
+	<description>Compute the approximate reciprocal of the lower single-precision (32-bit) floating-point element in "b", store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from "a" to the upper elements of "dst". The maximum relative error for this approximation is less than 2^-28. [round_note]</description>
+	<operation>
+IF k[0] THEN
+	dst[31:0] := RCP_28_SP(1.0/b[31:0]);
+ELSE
+	dst[31:0] := 0;
+FI
+dst[127:32] := a[127:32];
+dst[MAX:128] := 0;
+	</operation>
+	<instruction name='vrcp28ss' form='xmm {k}, xmm, xmm {er}'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128" name="_mm_maskz_rcp28_ss">
+	<type>Floating Point</type>
+	<CPUID>AVX512ER</CPUID>
+	<category>Elementary Math Functions</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128"/>
+	<parameter varname="b" type="__m128"/>
+	<description>Compute the approximate reciprocal of the lower single-precision (32-bit) floating-point element in "b", store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from "a" to the upper elements of "dst". The maximum relative error for this approximation is less than 2^-28.</description>
+	<operation>
+IF k[0] THEN
+	dst[31:0] := RCP_28_SP(1.0/b[31:0]);
+ELSE
+	dst[31:0] := 0;
+FI
+dst[127:32] := a[127:32];
+dst[MAX:128] := 0;
+	</operation>
+	<instruction name='vrcp28ss' form='xmm {k}, xmm, xmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512" name="_mm512_rcp28_round_ps">
+	<type>Floating Point</type>
+	<CPUID>AVX512ER</CPUID>
+	<category>Elementary Math Functions</category>
+	<parameter varname="a" type="__m512"/>
+	<parameter varname="rounding" type="int"/>
+	<description>Compute the approximate reciprocal of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst". The maximum relative error for this approximation is less than 2^-28. [round_note]</description>
+	<operation>
+FOR j := 0 to 15
+	i := j*32;
+	dst[i+31:i] := RCP_28_SP(1.0/a[i+31:i]);
+ENDFOR;
+	</operation>
+	<instruction name='vrcp28ps' form='zmm {k}, zmm {er}'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512" name="_mm512_rcp28_ps">
+	<type>Floating Point</type>
+	<CPUID>AVX512ER</CPUID>
+	<category>Elementary Math Functions</category>
+	<parameter varname="a" type="__m512"/>
+	<description>Compute the approximate reciprocal of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst". The maximum relative error for this approximation is less than 2^-28.</description>
+	<operation>
+FOR j := 0 to 15
+	i := j*32;
+	dst[i+31:i] := RCP_28_SP(1.0/a[i+31:i]);
+ENDFOR;
+	</operation>
+	<instruction name='vrcp28ps' form='zmm {k}, zmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512" name="_mm512_mask_rcp28_round_ps">
+	<type>Floating Point</type>
+	<CPUID>AVX512ER</CPUID>
+	<category>Elementary Math Functions</category>
+	<parameter varname="src" type="__m512"/>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="a" type="__m512"/>
+	<parameter varname="rounding" type="int"/>
+	<description>Compute the approximate reciprocal of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 2^-28. [round_note]</description>
+	<operation>
+FOR j := 0 to 15
+	i := j*32;
+	IF k[j] THEN
+		dst[i+31:i] := RCP_28_SP(1.0/a[i+31:i]);
+	ELSE
+		dst[i+31:i] := src[i+31:i];
+	FI
+ENDFOR;
+	</operation>
+	<instruction name='vrcp28ps' form='zmm {k}, zmm {er}'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512" name="_mm512_mask_rcp28_ps">
+	<type>Floating Point</type>
+	<CPUID>AVX512ER</CPUID>
+	<category>Elementary Math Functions</category>
+	<parameter varname="src" type="__m512"/>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="a" type="__m512"/>
+	<description>Compute the approximate reciprocal of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 2^-28.</description>
+	<operation>
+FOR j := 0 to 15
+	i := j*32;
+	IF k[j] THEN
+		dst[i+31:i] := RCP_28_SP(1.0/a[i+31:i]);
+	ELSE
+		dst[i+31:i] := src[i+31:i];
+	FI
+ENDFOR;
+	</operation>
+	<instruction name='vrcp28ps' form='zmm {k}, zmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512" name="_mm512_maskz_rcp28_round_ps">
+	<type>Floating Point</type>
+	<CPUID>AVX512ER</CPUID>
+	<category>Elementary Math Functions</category>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="a" type="__m512"/>
+	<parameter varname="rounding" type="int"/>
+	<description>Compute the approximate reciprocal of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 2^-28. [round_note]</description>
+	<operation>
+FOR j := 0 to 15
+	i := j*32;
+	IF k[j] THEN
+		dst[i+31:i] := RCP_28_SP(1.0/a[i+31:i]);
+	ELSE
+		dst[i+31:i] := 0;
+	FI
+ENDFOR;
+	</operation>
+	<instruction name='vrcp28ps' form='zmm {k}, zmm {er}'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512" name="_mm512_maskz_rcp28_ps">
+	<type>Floating Point</type>
+	<CPUID>AVX512ER</CPUID>
+	<category>Elementary Math Functions</category>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="a" type="__m512"/>
+	<description>Compute the approximate reciprocal of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 2^-28.</description>
+	<operation>
+FOR j := 0 to 15
+	i := j*32;
+	IF k[j] THEN
+		dst[i+31:i] := RCP_28_SP(1.0/a[i+31:i]);
+	ELSE
+		dst[i+31:i] := 0;
+	FI
+ENDFOR;
+	</operation>
+	<instruction name='vrcp28ps' form='zmm {k}, zmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512d" name="_mm512_rcp28_round_pd">
+	<type>Floating Point</type>
+	<CPUID>AVX512ER</CPUID>
+	<category>Elementary Math Functions</category>
+	<parameter varname="a" type="__m512d"/>
+	<parameter varname="rounding" type="int"/>
+	<description>Compute the approximate reciprocal of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst". The maximum relative error for this approximation is less than 2^-28. [round_note]</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*64;
+	dst[i+63:i] := RCP_28_DP(1.0/a[i+63:i]);
+ENDFOR;
+	</operation>
+	<instruction name='vrcp28pd' form='zmm {k}, zmm {er}'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512d" name="_mm512_rcp28_pd">
+	<type>Floating Point</type>
+	<CPUID>AVX512ER</CPUID>
+	<category>Elementary Math Functions</category>
+	<parameter varname="a" type="__m512d"/>
+	<description>Compute the approximate reciprocal of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst". The maximum relative error for this approximation is less than 2^-28.</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*64;
+	dst[i+63:i] := RCP_28_DP(1.0/a[i+63:i]);
+ENDFOR;
+	</operation>
+	<instruction name='vrcp28pd' form='zmm {k}, zmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512d" name="_mm512_mask_rcp28_round_pd">
+	<type>Floating Point</type>
+	<CPUID>AVX512ER</CPUID>
+	<category>Elementary Math Functions</category>
+	<parameter varname="src" type="__m512d"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m512d"/>
+	<parameter varname="rounding" type="int"/>
+	<description>Compute the approximate reciprocal of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 2^-28. [round_note]</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*64;
+	IF k[j] THEN
+		dst[i+63:i] := RCP_28_DP(1.0/a[i+63:i]);
+	ELSE
+		dst[i+63:i] := src[i+63:i];
+	FI
+ENDFOR;
+	</operation>
+	<instruction name='vrcp28pd' form='zmm {k}, zmm {er}'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512d" name="_mm512_mask_rcp28_pd">
+	<type>Floating Point</type>
+	<CPUID>AVX512ER</CPUID>
+	<category>Elementary Math Functions</category>
+	<parameter varname="src" type="__m512d"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m512d"/>
+	<description>Compute the approximate reciprocal of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 2^-28.</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*64;
+	IF k[j] THEN
+		dst[i+63:i] := RCP_28_DP(1.0/a[i+63:i]);
+	ELSE
+		dst[i+63:i] := src[i+63:i];
+	FI
+ENDFOR;
+	</operation>
+	<instruction name='vrcp28pd' form='zmm {k}, zmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512d" name="_mm512_maskz_rcp28_round_pd">
+	<type>Floating Point</type>
+	<CPUID>AVX512ER</CPUID>
+	<category>Elementary Math Functions</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m512d"/>
+	<parameter varname="rounding" type="int"/>
+	<description>Compute the approximate reciprocal of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 2^-28. [round_note]</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*64;
+	IF k[j] THEN
+		dst[i+63:i] := RCP_28_DP(1.0/a[i+63:i]);
+	ELSE
+		dst[i+63:i] := 0;
+	FI
+ENDFOR;
+	</operation>
+	<instruction name='vrcp28pd' form='zmm {k}, zmm {er}'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512d" name="_mm512_maskz_rcp28_pd">
+	<type>Floating Point</type>
+	<CPUID>AVX512ER</CPUID>
+	<category>Elementary Math Functions</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m512d"/>
+	<description>Compute the approximate reciprocal of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 2^-28.</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*64;
+	IF k[j] THEN
+		dst[i+63:i] := RCP_28_DP(1.0/a[i+63:i]);
+	ELSE
+		dst[i+63:i] := 0;
+	FI
+ENDFOR;
+	</operation>
+	<instruction name='vrcp28pd' form='zmm {k}, zmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128d" name="_mm_rsqrt28_round_sd">
+	<type>Floating Point</type>
+	<CPUID>AVX512ER</CPUID>
+	<category>Elementary Math Functions</category>
+	<parameter varname="a" type="__m128d"/>
+	<parameter varname="b" type="__m128d"/>
+	<parameter varname="rounding" type="int"/>
+	<description>Compute the approximate reciprocal square root of the lower double-precision (64-bit) floating-point element in "b", store the result in the lower element of "dst", and copy the upper element from "a" to the upper element of "dst". The maximum relative error for this approximation is less than 2^-28. [round_note]</description>
+	<operation>
+dst[63:0] := (1.0/SQRT(b[63:0]));
+dst[127:64] := a[127:64];
+dst[MAX:128] := 0;
+	</operation>
+	<instruction name='vrsqrt28sd' form='xmm {k}, xmm, xmm {er}'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128d" name="_mm_rsqrt28_sd">
+	<type>Floating Point</type>
+	<CPUID>AVX512ER</CPUID>
+	<category>Elementary Math Functions</category>
+	<parameter varname="a" type="__m128d"/>
+	<parameter varname="b" type="__m128d"/>
+	<description>Compute the approximate reciprocal square root of the lower double-precision (64-bit) floating-point element in "b", store the result in the lower element of "dst", and copy the upper element from "a" to the upper element of "dst". The maximum relative error for this approximation is less than 2^-28.</description>
+	<operation>
+dst[63:0] := (1.0/SQRT(b[63:0]));
+dst[127:64] := a[127:64];
+dst[MAX:128] := 0;
+	</operation>
+	<instruction name='vrsqrt28sd' form='xmm {k}, xmm, xmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128d" name="_mm_mask_rsqrt28_round_sd">
+	<type>Floating Point</type>
+	<CPUID>AVX512ER</CPUID>
+	<category>Elementary Math Functions</category>
+	<parameter varname="src" type="__m128d"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128d"/>
+	<parameter varname="b" type="__m128d"/>
+	<parameter varname="rounding" type="int"/>
+	<description>Compute the approximate reciprocal square root of the lower double-precision (64-bit) floating-point element in "b", store the result in the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst". The maximum relative error for this approximation is less than 2^-28. [round_note]</description>
+	<operation>
+IF k[0] THEN
+	dst[63:0] := (1.0/SQRT(b[63:0]));
+ELSE
+	dst[63:0] := src[63:0];
+FI
+dst[127:64] := a[127:64];
+dst[MAX:128] := 0;
+	</operation>
+	<instruction name='vrsqrt28sd' form='xmm {k}, xmm, xmm {er}'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128d" name="_mm_mask_rsqrt28_sd">
+	<type>Floating Point</type>
+	<CPUID>AVX512ER</CPUID>
+	<category>Elementary Math Functions</category>
+	<parameter varname="src" type="__m128d"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128d"/>
+	<parameter varname="b" type="__m128d"/>
+	<description>Compute the approximate reciprocal square root of the lower double-precision (64-bit) floating-point element in "b", store the result in the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst". The maximum relative error for this approximation is less than 2^-28.</description>
+	<operation>
+IF k[0] THEN
+	dst[63:0] := (1.0/SQRT(b[63:0]));
+ELSE
+	dst[63:0] := src[63:0];
+FI
+dst[127:64] := a[127:64];
+dst[MAX:128] := 0;
+	</operation>
+	<instruction name='vrsqrt28sd' form='xmm {k}, xmm, xmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128d" name="_mm_maskz_rsqrt28_round_sd">
+	<type>Floating Point</type>
+	<CPUID>AVX512ER</CPUID>
+	<category>Elementary Math Functions</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128d"/>
+	<parameter varname="b" type="__m128d"/>
+	<parameter varname="rounding" type="int"/>
+	<description>Compute the approximate reciprocal square root of the lower double-precision (64-bit) floating-point element in "b", store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst". The maximum relative error for this approximation is less than 2^-28. [round_note]</description>
+	<operation>
+IF k[0] THEN
+	dst[63:0] := (1.0/SQRT(b[63:0]));
+ELSE
+	dst[63:0] := 0;
+FI
+dst[127:64] := a[127:64];
+dst[MAX:128] := 0;
+	</operation>
+	<instruction name='vrsqrt28sd' form='xmm {k}, xmm, xmm {er}'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128d" name="_mm_maskz_rsqrt28_sd">
+	<type>Floating Point</type>
+	<CPUID>AVX512ER</CPUID>
+	<category>Elementary Math Functions</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128d"/>
+	<parameter varname="b" type="__m128d"/>
+	<description>Compute the approximate reciprocal square root of the lower double-precision (64-bit) floating-point element in "b", store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst". The maximum relative error for this approximation is less than 2^-28.</description>
+	<operation>
+IF k[0] THEN
+	dst[63:0] := (1.0/SQRT(b[63:0]));
+ELSE
+	dst[63:0] := 0;
+FI
+dst[127:64] := a[127:64];
+dst[MAX:128] := 0;
+	</operation>
+	<instruction name='vrsqrt28sd' form='xmm {k}, xmm, xmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128" name="_mm_rsqrt28_round_ss">
+	<type>Floating Point</type>
+	<CPUID>AVX512ER</CPUID>
+	<category>Elementary Math Functions</category>
+	<parameter varname="a" type="__m128"/>
+	<parameter varname="b" type="__m128"/>
+	<parameter varname="rounding" type="int"/>
+	<description>Compute the approximate reciprocal square root of the lower single-precision (32-bit) floating-point element in "b", store the result in the lower element of "dst", and copy the upper 3 packed elements from "a" to the upper elements of "dst". The maximum relative error for this approximation is less than 2^-28. [round_note]</description>
+	<operation>
+dst[31:0] := (1.0/SQRT(b[31:0]));
+dst[127:32] := a[127:32];
+dst[MAX:128] := 0;
+	</operation>
+	<instruction name='vrsqrt28ss' form='xmm {k}, xmm, xmm {er}'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128" name="_mm_rsqrt28_ss">
+	<type>Floating Point</type>
+	<CPUID>AVX512ER</CPUID>
+	<category>Elementary Math Functions</category>
+	<parameter varname="a" type="__m128"/>
+	<parameter varname="b" type="__m128"/>
+	<description>Compute the approximate reciprocal square root of the lower single-precision (32-bit) floating-point element in "b", store the result in the lower element of "dst", and copy the upper 3 packed elements from "a" to the upper elements of "dst". The maximum relative error for this approximation is less than 2^-28.</description>
+	<operation>
+dst[31:0] := (1.0/SQRT(b[31:0]));
+dst[127:32] := a[127:32];
+dst[MAX:128] := 0;
+	</operation>
+	<instruction name='vrsqrt28ss' form='xmm {k}, xmm, xmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128" name="_mm_mask_rsqrt28_round_ss">
+	<type>Floating Point</type>
+	<CPUID>AVX512ER</CPUID>
+	<category>Elementary Math Functions</category>
+	<parameter varname="src" type="__m128"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128"/>
+	<parameter varname="b" type="__m128"/>
+	<parameter varname="rounding" type="int"/>
+	<description>Compute the approximate reciprocal square root of the lower single-precision (32-bit) floating-point element in "b", store the result in the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper 3 packed elements from "a" to the upper elements of "dst". The maximum relative error for this approximation is less than 2^-28. [round_note]</description>
+	<operation>
+IF k[0] THEN
+	dst[31:0] := (1.0/SQRT(b[31:0]));
+ELSE
+	dst[31:0] := src[31:0];
+FI
+dst[127:32] := a[127:32];
+dst[MAX:128] := 0;
+	</operation>
+	<instruction name='vrsqrt28ss' form='xmm {k}, xmm, xmm {er}'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128" name="_mm_mask_rsqrt28_ss">
+	<type>Floating Point</type>
+	<CPUID>AVX512ER</CPUID>
+	<category>Elementary Math Functions</category>
+	<parameter varname="src" type="__m128"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128"/>
+	<parameter varname="b" type="__m128"/>
+	<description>Compute the approximate reciprocal square root of the lower single-precision (32-bit) floating-point element in "b", store the result in the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper 3 packed elements from "a" to the upper elements of "dst". The maximum relative error for this approximation is less than 2^-28.</description>
+	<operation>
+IF k[0] THEN
+	dst[31:0] := (1.0/SQRT(b[31:0]));
+ELSE
+	dst[31:0] := src[31:0];
+FI
+dst[127:32] := a[127:32];
+dst[MAX:128] := 0;
+	</operation>
+	<instruction name='vrsqrt28ss' form='xmm {k}, xmm, xmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128" name="_mm_maskz_rsqrt28_round_ss">
+	<type>Floating Point</type>
+	<CPUID>AVX512ER</CPUID>
+	<category>Elementary Math Functions</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128"/>
+	<parameter varname="b" type="__m128"/>
+	<parameter varname="rounding" type="int"/>
+	<description>Compute the approximate reciprocal square root of the lower single-precision (32-bit) floating-point element in "b", store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from "a" to the upper elements of "dst". The maximum relative error for this approximation is less than 2^-28. [round_note]</description>
+	<operation>
+IF k[0] THEN
+	dst[31:0] := (1.0/SQRT(b[31:0]));
+ELSE
+	dst[31:0] := 0;
+FI
+dst[127:32] := a[127:32];
+dst[MAX:128] := 0;
+	</operation>
+	<instruction name='vrsqrt28ss' form='xmm {k}, xmm, xmm {er}'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128" name="_mm_maskz_rsqrt28_ss">
+	<type>Floating Point</type>
+	<CPUID>AVX512ER</CPUID>
+	<category>Elementary Math Functions</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128"/>
+	<parameter varname="b" type="__m128"/>
+	<description>Compute the approximate reciprocal square root of the lower single-precision (32-bit) floating-point element in "b", store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from "a" to the upper elements of "dst". The maximum relative error for this approximation is less than 2^-28.</description>
+	<operation>
+IF k[0] THEN
+	dst[31:0] := (1.0/SQRT(b[31:0]));
+ELSE
+	dst[31:0] := 0;
+FI
+dst[127:32] := a[127:32];
+dst[MAX:128] := 0;
+	</operation>
+	<instruction name='vrsqrt28ss' form='xmm {k}, xmm, xmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512" name="_mm512_rsqrt28_round_ps">
+	<type>Floating Point</type>
+	<CPUID>AVX512ER</CPUID>
+	<category>Elementary Math Functions</category>
+	<parameter varname="a" type="__m512"/>
+	<parameter varname="rounding" type="int"/>
+	<description>Compute the approximate reciprocal square root of packed single-precision (32-bit) floating-point elements in "a", store the results in "dst". The maximum relative error for this approximation is less than 2^-28. [round_note]</description>
+	<operation>
+FOR j := 0 to 15
+	i := j*32;
+	dst[i+31:i] := (1.0/SQRT(a[i+31:i]));
+ENDFOR;
+	</operation>
+	<instruction name='vrsqrt28ps' form='zmm {k}, zmm {er}'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512" name="_mm512_rsqrt28_ps">
+	<type>Floating Point</type>
+	<CPUID>AVX512ER</CPUID>
+	<category>Elementary Math Functions</category>
+	<parameter varname="a" type="__m512"/>
+	<description>Compute the approximate reciprocal square root of packed single-precision (32-bit) floating-point elements in "a", store the results in "dst". The maximum relative error for this approximation is less than 2^-28.</description>
+	<operation>
+FOR j := 0 to 15
+	i := j*32;
+	dst[i+31:i] := (1.0/SQRT(a[i+31:i]));
+ENDFOR;
+	</operation>
+	<instruction name='vrsqrt28ps' form='zmm {k}, zmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512" name="_mm512_mask_rsqrt28_round_ps">
+	<type>Floating Point</type>
+	<CPUID>AVX512ER</CPUID>
+	<category>Elementary Math Functions</category>
+	<parameter varname="src" type="__m512"/>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="a" type="__m512"/>
+	<parameter varname="rounding" type="int"/>
+	<description>Compute the approximate reciprocal square root of packed single-precision (32-bit) floating-point elements in "a", store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 2^-28. [round_note]</description>
+	<operation>
+FOR j := 0 to 15
+	i := j*32;
+	IF k[j] THEN
+		dst[i+31:i] := (1.0/SQRT(a[i+31:i]));
+	ELSE
+		dst[i+31:i] := src[i+31:i];
+	FI
+ENDFOR;
+	</operation>
+	<instruction name='vrsqrt28ps' form='zmm {k}, zmm {er}'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512" name="_mm512_mask_rsqrt28_ps">
+	<type>Floating Point</type>
+	<CPUID>AVX512ER</CPUID>
+	<category>Elementary Math Functions</category>
+	<parameter varname="src" type="__m512"/>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="a" type="__m512"/>
+	<description>Compute the approximate reciprocal square root of packed single-precision (32-bit) floating-point elements in "a", store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 2^-28.</description>
+	<operation>
+FOR j := 0 to 15
+	i := j*32;
+	IF k[j] THEN
+		dst[i+31:i] := (1.0/SQRT(a[i+31:i]));
+	ELSE
+		dst[i+31:i] := src[i+31:i];
+	FI
+ENDFOR;
+	</operation>
+	<instruction name='vrsqrt28ps' form='zmm {k}, zmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512" name="_mm512_maskz_rsqrt28_round_ps">
+	<type>Floating Point</type>
+	<CPUID>AVX512ER</CPUID>
+	<category>Elementary Math Functions</category>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="a" type="__m512"/>
+	<parameter varname="rounding" type="int"/>
+	<description>Compute the approximate reciprocal square root of packed single-precision (32-bit) floating-point elements in "a", store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 2^-28. [round_note]</description>
+	<operation>
+FOR j := 0 to 15
+	i := j*32;
+	IF k[j] THEN
+		dst[i+31:i] := (1.0/SQRT(a[i+31:i]));
+	ELSE
+		dst[i+31:i] := 0;
+	FI
+ENDFOR;
+	</operation>
+	<instruction name='vrsqrt28ps' form='zmm {k}, zmm {er}'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512" name="_mm512_maskz_rsqrt28_ps">
+	<type>Floating Point</type>
+	<CPUID>AVX512ER</CPUID>
+	<category>Elementary Math Functions</category>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="a" type="__m512"/>
+	<description>Compute the approximate reciprocal square root of packed single-precision (32-bit) floating-point elements in "a", store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 2^-28.</description>
+	<operation>
+FOR j := 0 to 15
+	i := j*32;
+	IF k[j] THEN
+		dst[i+31:i] := (1.0/SQRT(a[i+31:i]));
+	ELSE
+		dst[i+31:i] := 0;
+	FI
+ENDFOR;
+	</operation>
+	<instruction name='vrsqrt28ps' form='zmm {k}, zmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512d" name="_mm512_rsqrt28_round_pd">
+	<type>Floating Point</type>
+	<CPUID>AVX512ER</CPUID>
+	<category>Elementary Math Functions</category>
+	<parameter varname="a" type="__m512d"/>
+	<parameter varname="rounding" type="int"/>
+	<description>Compute the approximate reciprocal square root of packed double-precision (64-bit) floating-point elements in "a", store the results in "dst". The maximum relative error for this approximation is less than 2^-28. [round_note]</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*64;
+	dst[i+63:i] := (1.0/SQRT(a[i+63:i]));
+ENDFOR;
+	</operation>
+	<instruction name='vrsqrt28pd' form='zmm {k}, zmm {er}'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512d" name="_mm512_rsqrt28_pd">
+	<type>Floating Point</type>
+	<CPUID>AVX512ER</CPUID>
+	<category>Elementary Math Functions</category>
+	<parameter varname="a" type="__m512d"/>
+	<description>Compute the approximate reciprocal square root of packed double-precision (64-bit) floating-point elements in "a", store the results in "dst". The maximum relative error for this approximation is less than 2^-28.</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*64;
+	dst[i+63:i] := (1.0/SQRT(a[i+63:i]));
+ENDFOR;
+	</operation>
+	<instruction name='vrsqrt28pd' form='zmm {k}, zmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512d" name="_mm512_mask_rsqrt28_round_pd">
+	<type>Floating Point</type>
+	<CPUID>AVX512ER</CPUID>
+	<category>Elementary Math Functions</category>
+	<parameter varname="src" type="__m512d"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m512d"/>
+	<parameter varname="rounding" type="int"/>
+	<description>Compute the approximate reciprocal square root of packed double-precision (64-bit) floating-point elements in "a", store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 2^-28. [round_note]</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*64;
+	IF k[j] THEN
+		dst[i+63:i] := (1.0/SQRT(a[i+63:i]));
+	ELSE
+		dst[i+63:i] := src[i+63:i];
+	FI
+ENDFOR;
+	</operation>
+	<instruction name='vrsqrt28pd' form='zmm {k}, zmm {er}'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512d" name="_mm512_mask_rsqrt28_pd">
+	<type>Floating Point</type>
+	<CPUID>AVX512ER</CPUID>
+	<category>Elementary Math Functions</category>
+	<parameter varname="src" type="__m512d"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m512d"/>
+	<description>Compute the approximate reciprocal square root of packed double-precision (64-bit) floating-point elements in "a", store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 2^-28.</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*64;
+	IF k[j] THEN
+		dst[i+63:i] := (1.0/SQRT(a[i+63:i]));
+	ELSE
+		dst[i+63:i] := src[i+63:i];
+	FI
+ENDFOR;
+	</operation>
+	<instruction name='vrsqrt28pd' form='zmm {k}, zmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512d" name="_mm512_maskz_rsqrt28_round_pd">
+	<type>Floating Point</type>
+	<CPUID>AVX512ER</CPUID>
+	<category>Elementary Math Functions</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m512d"/>
+	<parameter varname="rounding" type="int"/>
+	<description>Compute the approximate reciprocal square root of packed double-precision (64-bit) floating-point elements in "a", store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 2^-28. [round_note]</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*64;
+	IF k[j] THEN
+		dst[i+63:i] := (1.0/SQRT(a[i+63:i]));
+	ELSE
+		dst[i+63:i] := 0;
+	FI
+ENDFOR;
+	</operation>
+	<instruction name='vrsqrt28pd' form='zmm {k}, zmm {er}'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512d" name="_mm512_maskz_rsqrt28_pd">
+	<type>Floating Point</type>
+	<CPUID>AVX512ER</CPUID>
+	<category>Elementary Math Functions</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m512d"/>
+	<description>Compute the approximate reciprocal square root of packed double-precision (64-bit) floating-point elements in "a", store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 2^-28.</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*64;
+	IF k[j] THEN
+		dst[i+63:i] := (1.0/SQRT(a[i+63:i]));
+	ELSE
+		dst[i+63:i] := 0;
+	FI
+ENDFOR;
+	</operation>
+	<instruction name='vrsqrt28pd' form='zmm {k}, zmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="Other" rettype='int' name='_mm_tzcnt_32'>
+	<type>Integer</type>
+	<CPUID>BMI1</CPUID>
+	<category>Bit Manipulation</category>
+	<parameter type='unsigned int' varname='a' />
+	<description>Count the number of trailing zero bits in unsigned 32-bit integer "a", and return that count in "dst".</description>
+	<operation>
+tmp := 0
+dst := 0
+DO WHILE ((tmp &lt; 32) AND a[tmp] = 0)
+	tmp := tmp + 1
+	dst := dst + 1
+OD	
+	</operation>
+	<instruction name='tzcnt' form='r32, r32'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="Other" rettype='__int64' name='_mm_tzcnt_64'>
+	<type>Integer</type>
+	<CPUID>BMI1</CPUID>
+	<category>Bit Manipulation</category>
+	<parameter type='unsigned __int64' varname='a' />
+	<description>Count the number of trailing zero bits in unsigned 64-bit integer "a", and return that count in "dst".</description>
+	<operation>
+tmp := 0
+dst := 0
+DO WHILE ((tmp &lt; 64) AND a[tmp] = 0)
+	tmp := tmp + 1
+	dst := dst + 1
+OD	
+	</operation>
+	<instruction name='tzcnt' form='r64, r64'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="KNC" rettype="__m512" name="_mm512_extload_ps">
+	<type>Floating Point</type>
+	<CPUID>KNCNI</CPUID>
+	<category>Load</category>
+	<parameter varname="mt" type="void const *"/>
+	<parameter varname="conv" type="_MM_UPCONV_PS_ENUM"/>
+	<parameter varname="bc" type="_MM_BROADCAST32_ENUM"/>
+	<parameter varname="hint" type="int"/>
+	<description>Depending on "bc", loads 1, 4, or 16 elements of type and size determined by "conv" from memory address "mt" and converts all elements to single-precision (32-bit) floating-point elements, storing the results in "dst". "hint" indicates to the processor whether the data is non-temporal.</description>
+	<operation>
+addr = MEM[mt]
+FOR j := 0 to 15
+	i := j*32
+	CASE bc OF
+	_MM_BROADCAST32_NONE:
+		CASE conv OF
+		_MM_UPCONV_PS_NONE:
+			n	 := j*32
+			dst[i+31:i] := addr[n+31:n]
+		_MM_UPCONV_PS_FLOAT16:
+			n	 := j*16
+			dst[i+31:i] := Float16ToFloat32(addr[n+15:n])
+		_MM_UPCONV_PS_UINT8:
+			n	 := j*8
+			dst[i+31:i] := UInt8ToFloat32(addr[n+7:n])
+		_MM_UPCONV_PS_SINT8:
+			n	 := j*8
+			dst[i+31:i] := SInt8ToFloat32(addr[n+7:n])
+		_MM_UPCONV_PS_UINT16:
+			n	 := j*16
+			dst[i+31:i] := UInt16ToFloat32(addr[n+15:n])
+		_MM_UPCONV_PS_SINT16:
+			n	 := j*16
+			dst[i+31:i] := SInt16ToFloat32(addr[n+15:n])
+		ESAC
+	_MM_BROADCAST_1X16:
+		CASE conv OF
+		_MM_UPCONV_PS_NONE:
+			n	 := j*32
+			dst[i+31:i] := addr[31:0]
+		_MM_UPCONV_PS_FLOAT16:
+			n	 := j*16
+			dst[i+31:i] := Float16ToFloat32(addr[15:0])
+		_MM_UPCONV_PS_UINT8:
+			n	 := j*8
+			dst[i+31:i] := UInt8ToFloat32(addr[7:0])
+		_MM_UPCONV_PS_SINT8:
+			n	 := j*8
+			dst[i+31:i] := SInt8ToFloat32(addr[7:0])
+		_MM_UPCONV_PS_UINT16:
+			n	 := j*16
+			dst[i+31:i] := UInt16ToFloat32(addr[15:0])
+		_MM_UPCONV_PS_SINT16:
+			n	 := j*16
+			dst[i+31:i] := SInt16ToFloat32(addr[15:0])
+		ESAC
+	_MM_BROADCAST_4X16:
+		mod := j%4
+		CASE conv OF
+		_MM_UPCONV_PS_NONE:
+			n := mod*32
+			dst[i+31:i] := addr[n+31:n]
+		_MM_UPCONV_PS_FLOAT16:
+			n := mod*16
+			dst[i+31:i] := Float16ToFloat32(addr[n+15:n])
+		_MM_UPCONV_PS_UINT8:
+			n := mod*8
+			dst[i+31:i] := UInt8ToFloat32(addr[n+7:n])
+		_MM_UPCONV_PS_SINT8:
+			n := mod*8
+			dst[i+31:i] := SInt8ToFloat32(addr[n+7:n])
+		_MM_UPCONV_PS_UINT16:
+			n := mod*16
+			dst[i+31:i] := UInt16ToFloat32(addr[n+15:n])
+		_MM_UPCONV_PS_SINT16:
+			n := mod*16
+			dst[i+31:i] := SInt16ToFloat32(addr[n+15:n])
+		ESAC
+	ESAC
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name="vmovaps" form="zmm {k}, m512" xed=""/>
+	<instruction name="vbroadcastf32x4" form="zmm {k}, m512" xed=""/>
+	<instruction name="vbroadcastss" form="zmm {k}, m512" xed=""/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="KNC" rettype="__m512" name="_mm512_mask_extload_ps">
+	<type>Floating Point</type>
+	<CPUID>KNCNI</CPUID>
+	<category>Load</category>
+	<parameter varname="src" type="__m512"/>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="mt" type="void const *"/>
+	<parameter varname="conv" type="_MM_UPCONV_PS_ENUM"/>
+	<parameter varname="bc" type="_MM_BROADCAST32_ENUM"/>
+	<parameter varname="hint" type="int"/>
+	<description>Depending on "bc", loads 1, 4, or 16 elements of type and size determined by "conv" from memory address "mt" and converts all elements to single-precision (32-bit) floating-point elements, storing the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). "hint" indicates to the processor whether the data is non-temporal.</description>
+	<operation>
+addr = MEM[mt]
+FOR j := 0 to 15
+	i := j*32
+	IF k[j]
+		CASE bc OF
+		_MM_BROADCAST32_NONE:
+			CASE conv OF
+			_MM_UPCONV_PS_NONE:
+				n	 := j*32
+				dst[i+31:i] := addr[n+31:n]
+			_MM_UPCONV_PS_FLOAT16:
+				n	 := j*16
+				dst[i+31:i] := Float16ToFloat32(addr[n+15:n])
+			_MM_UPCONV_PS_UINT8:
+				n	 := j*8
+				dst[i+31:i] := UInt8ToFloat32(addr[n+7:n])
+			_MM_UPCONV_PS_SINT8:
+				n	 := j*8
+				dst[i+31:i] := SInt8ToFloat32(addr[n+7:n])
+			_MM_UPCONV_PS_UINT16:
+				n	 := j*16
+				dst[i+31:i] := UInt16ToFloat32(addr[n+15:n])
+			_MM_UPCONV_PS_SINT16:
+				n	 := j*16
+				dst[i+31:i] := SInt16ToFloat32(addr[n+15:n])
+			ESAC
+		_MM_BROADCAST_1X16:
+			CASE conv OF
+			_MM_UPCONV_PS_NONE:
+				n	 := j*32
+				dst[i+31:i] := addr[31:0]
+			_MM_UPCONV_PS_FLOAT16:
+				n	 := j*16
+				dst[i+31:i] := Float16ToFloat32(addr[15:0])
+			_MM_UPCONV_PS_UINT8:
+				n	 := j*8
+				dst[i+31:i] := UInt8ToFloat32(addr[7:0])
+			_MM_UPCONV_PS_SINT8:
+				n	 := j*8
+				dst[i+31:i] := SInt8ToFloat32(addr[7:0])
+			_MM_UPCONV_PS_UINT16:
+				n	 := j*16
+				dst[i+31:i] := UInt16ToFloat32(addr[15:0])
+			_MM_UPCONV_PS_SINT16:
+				n	 := j*16
+				dst[i+31:i] := SInt16ToFloat32(addr[15:0])
+			ESAC
+		_MM_BROADCAST_4X16:
+			mod := j%4
+			CASE conv OF
+			_MM_UPCONV_PS_NONE:
+				n := mod*32
+				dst[i+31:i] := addr[n+31:n]
+			_MM_UPCONV_PS_FLOAT16:
+				n := mod*16
+				dst[i+31:i] := Float16ToFloat32(addr[n+15:n])
+			_MM_UPCONV_PS_UINT8:
+				n := mod*8
+				dst[i+31:i] := UInt8ToFloat32(addr[n+7:n])
+			_MM_UPCONV_PS_SINT8:
+				n := mod*8
+				dst[i+31:i] := SInt8ToFloat32(addr[n+7:n])
+			_MM_UPCONV_PS_UINT16:
+				n := mod*16
+				dst[i+31:i] := UInt16ToFloat32(addr[n+15:n])
+			_MM_UPCONV_PS_SINT16:
+				n := mod*16
+				dst[i+31:i] := SInt16ToFloat32(addr[n+15:n])
+			ESAC
+		ESAC
+	ELSE
+		dst[i+31:i] := src[i+31:i]
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name="vmovaps" form="zmm {k}, m512" xed=""/>
+	<instruction name="vbroadcastf32x4" form="zmm {k}, m512" xed=""/>
+	<instruction name="vbroadcastss" form="zmm {k}, m512" xed=""/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="KNC" rettype="__m512i" name="_mm512_extload_epi32">
+	<type>Integer</type>
+	<CPUID>KNCNI</CPUID>
+	<category>Load</category>
+	<parameter varname="mt" type="void const *"/>
+	<parameter varname="conv" type="_MM_UPCONV_EPI32_ENUM"/>
+	<parameter varname="bc" type="_MM_BROADCAST32_ENUM"/>
+	<parameter varname="hint" type="int"/>
+	<description>Depending on "bc", loads 1, 4, or 16 elements of type and size determined by "conv" from memory address "mt" and converts all elements to 32-bit integer elements, storing the results in "dst". "hint" indicates to the processor whether the data is non-temporal.</description>
+	<operation>
+addr = MEM[mt]
+FOR j := 0 to 15
+	i := j*32
+	CASE bc OF
+	_MM_BROADCAST32_NONE:
+		CASE conv OF
+		_MM_UPCONV_EPI32_NONE:
+			n	 := j*32
+			dst[i+31:i] := addr[n+31:n]
+		_MM_UPCONV_EPI32_UINT8:
+			n	 := j*8
+			dst[i+31:i] := UInt8ToInt32(addr[n+7:n])
+		_MM_UPCONV_EPI32_SINT8:
+			n	 := j*8
+			dst[i+31:i] := SInt8ToInt32(addr[n+7:n])
+		_MM_UPCONV_EPI32_UINT16:
+			n	 := j*16
+			dst[i+31:i] := UInt16ToInt32(addr[n+15:n])
+		_MM_UPCONV_EPI32_SINT16:
+			n	 := j*16
+			dst[i+31:i] := SInt16ToInt32(addr[n+15:n])
+		ESAC
+	_MM_BROADCAST_1X16:
+		CASE conv OF
+		_MM_UPCONV_EPI32_NONE:
+			n	 := j*32
+			dst[i+31:i] := addr[31:0]
+		_MM_UPCONV_EPI32_UINT8:
+			n	 := j*8
+			dst[i+31:i] := UInt8ToInt32(addr[7:0])
+		_MM_UPCONV_EPI32_SINT8:
+			n	 := j*8
+			dst[i+31:i] := SInt8ToInt32(addr[7:0])
+		_MM_UPCONV_EPI32_UINT16:
+			n	 := j*16
+			dst[i+31:i] := UInt16ToInt32(addr[15:0])
+		_MM_UPCONV_EPI32_SINT16:
+			n	 := j*16
+			dst[i+31:i] := SInt16ToInt32(addr[15:0])
+		ESAC
+	_MM_BROADCAST_4X16:
+		mod := j%4
+		CASE conv OF
+		_MM_UPCONV_EPI32_NONE:
+			n := mod*32
+			dst[i+31:i] := addr[n+31:n]
+		_MM_UPCONV_EPI32_UINT8:
+			n := mod*8
+			dst[i+31:i] := UInt8ToInt32(addr[n+7:n])
+		_MM_UPCONV_EPI32_SINT8:
+			n := mod*8
+			dst[i+31:i] := SInt8ToInt32(addr[n+7:n])
+		_MM_UPCONV_EPI32_UINT16:
+			n := mod*16
+			dst[i+31:i] := UInt16ToInt32(addr[n+15:n])
+		_MM_UPCONV_EPI32_SINT16:
+			n := mod*16
+			dst[i+31:i] := SInt16ToInt32(addr[n+15:n])
+		ESAC
+	ESAC
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name="vmovdqa32" form="zmm {k}, m512" xed=""/>
+	<instruction name="vbroadcasti32x4" form="zmm {k}, m512" xed=""/>
+	<instruction name="vpbroadcastd" form="zmm {k}, m512" xed=""/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="KNC" rettype="__m512i" name="_mm512_mask_extload_epi32">
+	<type>Integer</type>
+	<CPUID>KNCNI</CPUID>
+	<category>Load</category>
+	<parameter varname="src" type="__m512i"/>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="mt" type="void const *"/>
+	<parameter varname="conv" type="_MM_UPCONV_EPI32_ENUM"/>
+	<parameter varname="bc" type="_MM_BROADCAST32_ENUM"/>
+	<parameter varname="hint" type="int"/>
+	<description>Depending on "bc", loads 1, 4, or 16 elements of type and size determined by "conv" from memory address "mt" and converts all elements to 32-bit integer elements, storing the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). "hint" indicates to the processor whether the data is non-temporal.</description>
+	<operation>
+addr = MEM[mt]
+FOR j := 0 to 15
+	i := j*32
+	IF k[j]
+		CASE bc OF
+		_MM_BROADCAST32_NONE:
+			CASE conv OF
+			_MM_UPCONV_EPI32_NONE:
+				n	 := j*32
+				dst[i+31:i] := addr[n+31:n]
+			_MM_UPCONV_EPI32_UINT8:
+				n	 := j*8
+				dst[i+31:i] := UInt8ToInt32(addr[n+7:n])
+			_MM_UPCONV_EPI32_SINT8:
+				n	 := j*8
+				dst[i+31:i] := SInt8ToInt32(addr[n+7:n])
+			_MM_UPCONV_EPI32_UINT16:
+				n	 := j*16
+				dst[i+31:i] := UInt16ToInt32(addr[n+15:n])
+			_MM_UPCONV_EPI32_SINT16:
+				n	 := j*16
+				dst[i+31:i] := SInt16ToInt32(addr[n+15:n])
+			ESAC
+		_MM_BROADCAST_1X16:
+			CASE conv OF
+			_MM_UPCONV_EPI32_NONE:
+				n	 := j*32
+				dst[i+31:i] := addr[31:0]
+			_MM_UPCONV_EPI32_UINT8:
+				n	 := j*8
+				dst[i+31:i] := UInt8ToInt32(addr[7:0])
+			_MM_UPCONV_EPI32_SINT8:
+				n	 := j*8
+				dst[i+31:i] := SInt8ToInt32(addr[7:0])
+			_MM_UPCONV_EPI32_UINT16:
+				n	 := j*16
+				dst[i+31:i] := UInt16ToInt32(addr[15:0])
+			_MM_UPCONV_EPI32_SINT16:
+				n	 := j*16
+				dst[i+31:i] := SInt16ToInt32(addr[15:0])
+			ESAC
+		_MM_BROADCAST_4X16:
+			mod := j%4
+			CASE conv OF
+			_MM_UPCONV_EPI32_NONE:
+				n := mod*32
+				dst[i+31:i] := addr[n+31:n]
+			_MM_UPCONV_EPI32_UINT8:
+				n := mod*8
+				dst[i+31:i] := UInt8ToInt32(addr[n+7:n])
+			_MM_UPCONV_EPI32_SINT8:
+				n := mod*8
+				dst[i+31:i] := SInt8ToInt32(addr[n+7:n])
+			_MM_UPCONV_EPI32_UINT16:
+				n := mod*16
+				dst[i+31:i] := UInt16ToInt32(addr[n+15:n])
+			_MM_UPCONV_EPI32_SINT16:
+				n := mod*16
+				dst[i+31:i] := SInt16ToInt32(addr[n+15:n])
+			ESAC
+		ESAC
+	ELSE
+		dst[i+31:i] := src[i+31:i]
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name="vmovdqa32" form="zmm {k}, m512" xed=""/>
+	<instruction name="vbroadcasti32x4" form="zmm {k}, m512" xed=""/>
+	<instruction name="vpbroadcastd" form="zmm {k}, m512" xed=""/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="KNC" rettype="__m512d" name="_mm512_extload_pd">
+	<type>Floating Point</type>
+	<CPUID>KNCNI</CPUID>
+	<category>Load</category>
+	<parameter varname="mt" type="void const *"/>
+	<parameter varname="conv" type="_MM_UPCONV_PD_ENUM"/>
+	<parameter varname="bc" type="_MM_BROADCAST64_ENUM"/>
+	<parameter varname="hint" type="int"/>
+	<description>Depending on "bc", loads 1, 4, or 8 elements of type and size determined by "conv" from memory address "mt" and converts all elements to double-precision (64-bit) floating-point elements, storing the results in "dst". "hint" indicates to the processor whether the data is non-temporal.</description>
+	<operation>
+addr = MEM[mt]
+FOR j := 0 to 7
+	i := j*64
+	CASE bc OF
+	_MM_BROADCAST64_NONE:
+		CASE conv OF
+		_MM_UPCONV_PD_NONE:
+			n := j*64
+			dst[i+63:i] := addr[n+63:n]
+		ESAC
+	_MM_BROADCAST_1X8:
+		CASE conv OF
+		_MM_UPCONV_PD_NONE:
+			n := j*64
+			dst[i+63:i] := addr[63:0]
+		ESAC
+	_MM_BROADCAST_4X8:
+		mod := j%4
+		CASE conv OF
+		_MM_UPCONV_PD_NONE:
+			n := mod*64
+			dst[i+63:i] := addr[n+63:n]
+		ESAC
+	ESAC
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name="vmovapd" form="zmm {k}, m512" xed=""/>
+	<instruction name="vbroadcastf64x4" form="zmm {k}, m512" xed=""/>
+	<instruction name="vbroadcastsd" form="zmm {k}, m512" xed=""/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="KNC" rettype="__m512d" name="_mm512_mask_extload_pd">
+	<type>Floating Point</type>
+	<CPUID>KNCNI</CPUID>
+	<category>Load</category>
+	<parameter varname="src" type="__m512d"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="mt" type="void const *"/>
+	<parameter varname="conv" type="_MM_UPCONV_PD_ENUM"/>
+	<parameter varname="bc" type="_MM_BROADCAST64_ENUM"/>
+	<parameter varname="hint" type="int"/>
+	<description>Depending on "bc", loads 1, 4, or 8 elements of type and size determined by "conv" from memory address "mt" and converts all elements to double-precision (64-bit) floating-point elements, storing the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). "hint" indicates to the processor whether the data is non-temporal.</description>
+	<operation>
+addr = MEM[mt]
+FOR j := 0 to 7
+	i := j*64
+	IF k[j]
+		CASE bc OF
+		_MM_BROADCAST64_NONE:
+			CASE conv OF
+			_MM_UPCONV_PD_NONE:
+				n := j*64
+				dst[i+63:i] := addr[n+63:n]
+			ESAC
+		_MM_BROADCAST_1X8:
+			CASE conv OF
+			_MM_UPCONV_PD_NONE:
+				n := j*64
+				dst[i+63:i] := addr[63:0]
+			ESAC
+		_MM_BROADCAST_4X8:
+			mod := j%4
+			CASE conv OF
+			_MM_UPCONV_PD_NONE:
+				n := mod*64
+				dst[i+63:i] := addr[n+63:n]
+			ESAC
+		ESAC
+	ELSE
+		dst[i+63:i] := src[i+63:i]
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name="vmovapd" form="zmm {k}, m512" xed=""/>
+	<instruction name="vbroadcastf64x4" form="zmm {k}, m512" xed=""/>
+	<instruction name="vbroadcastsd" form="zmm {k}, m512" xed=""/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="KNC" rettype="__m512i" name="_mm512_extload_epi64">
+	<type>Integer</type>
+	<CPUID>KNCNI</CPUID>
+	<category>Load</category>
+	<parameter varname="mt" type="void const *"/>
+	<parameter varname="conv" type="_MM_UPCONV_EPI64_ENUM"/>
+	<parameter varname="bc" type="_MM_BROADCAST64_ENUM"/>
+	<parameter varname="hint" type="int"/>
+	<description>Depending on "bc", loads 1, 4, or 8 elements of type and size determined by "conv" from memory address "mt" and converts all elements to 64-bit integer elements, storing the results in "dst". "hint" indicates to the processor whether the data is non-temporal.</description>
+	<operation>
+addr = MEM[mt]
+FOR j := 0 to 7
+	i := j*64
+	CASE bc OF
+	_MM_BROADCAST64_NONE:
+		CASE conv OF
+		_MM_UPCONV_EPI64_NONE:
+			n := j*64
+			dst[i+63:i] := addr[n+63:n]
+		ESAC
+	_MM_BROADCAST_1X8:
+		CASE conv OF
+		_MM_UPCONV_EPI64_NONE:
+			n := j*64
+			dst[i+63:i] := addr[63:0]
+		ESAC
+	_MM_BROADCAST_4X8:
+		mod := j%4
+		CASE conv OF
+		_MM_UPCONV_EPI64_NONE:
+			n := mod*64
+			dst[i+63:i] := addr[n+63:n]
+		ESAC
+	ESAC
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name="vmovdqa64" form="zmm {k}, m512" xed=""/>
+	<instruction name="vbroadcasti64x4" form="zmm {k}, m512" xed=""/>
+	<instruction name="vpbroadcastq" form="zmm {k}, m512" xed=""/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="KNC" rettype="__m512i" name="_mm512_mask_extload_epi64">
+	<type>Integer</type>
+	<CPUID>KNCNI</CPUID>
+	<category>Load</category>
+	<parameter varname="src" type="__m512i"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="mt" type="void const *"/>
+	<parameter varname="conv" type="_MM_UPCONV_EPI64_ENUM"/>
+	<parameter varname="bc" type="_MM_BROADCAST64_ENUM"/>
+	<parameter varname="hint" type="int"/>
+	<description>Depending on "bc", loads 1, 4, or 8 elements of type and size determined by "conv" from memory address "mt" and converts all elements to 64-bit integer elements, storing the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). "hint" indicates to the processor whether the data is non-temporal.</description>
+	<operation>
+addr = MEM[mt]
+FOR j := 0 to 7
+	i := j*64
+	IF k[j]
+		CASE bc OF
+		_MM_BROADCAST64_NONE:
+			CASE conv OF
+			_MM_UPCONV_EPI64_NONE:
+				n := j*64
+				dst[i+63:i] := addr[n+63:n]
+			ESAC
+		_MM_BROADCAST_1X8:
+			CASE conv OF
+			_MM_UPCONV_EPI64_NONE:
+				n := j*64
+				dst[i+63:i] := addr[63:0]
+			ESAC
+		_MM_BROADCAST_4X8:
+			mod := j%4
+			CASE conv OF
+			_MM_UPCONV_EPI64_NONE:
+				n := mod*64
+				dst[i+63:i] := addr[n+63:n]
+			ESAC
+		ESAC
+	ELSE
+		dst[i+63:i] := src[i+63:i]
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name="vmovdqa64" form="zmm {k}, m512" xed=""/>
+	<instruction name="vbroadcasti64x4" form="zmm {k}, m512" xed=""/>
+	<instruction name="vpbroadcastq" form="zmm {k}, m512" xed=""/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="KNC" rettype="__m512" name="_mm512_swizzle_ps" sequence="true">
+	<type>Floating Point</type>
+	<CPUID>KNCNI</CPUID>
+	<category>Swizzle</category>
+	<parameter varname="v" type="__m512"/>
+	<parameter varname="s" type="_MM_SWIZZLE_ENUM"/>
+	<description>Performs a swizzle transformation of each of the four groups of packed 4x single-precision (32-bit) floating-point elements in "v" using swizzle parameter "s", storing the results in "dst".</description>
+	<operation>
+CASE s OF
+_MM_SWIZ_REG_NONE:
+	dst[511:0] := v[511:0]
+_MM_SWIZ_REG_DCBA:
+	dst[511:0] := v[511:0]
+_MM_SWIZ_REG_CDAB:
+	FOR j := 0 to 7
+		i := j*64
+		dst[i+31:i]    := v[i+63:i+32]
+		dst[i+63:i+32] := v[i+31:i]
+	ENDFOR
+_MM_SWIZ_REG_BADC:
+	FOR j := 0 to 3
+		i := j*128
+		dst[i+31:i]     := v[i+95:i+64]
+		dst[i+63:i+32]  := v[i+127:i+96]
+		dst[i+95:i+64]  := v[i+31:i]
+		dst[i+127:i+96] := v[i+63:i+32]
+	ENDFOR
+_MM_SWIZ_REG_AAAA:
+	FOR j := 0 to 3
+		i := j*128
+		dst[i+31:i]     := v[i+31:i]
+		dst[i+63:i+32]  := v[i+31:i]
+		dst[i+95:i+64]  := v[i+31:i]
+		dst[i+127:i+96] := v[i+31:i]
+	ENDFOR
+_MM_SWIZ_REG_BBBB:
+	FOR j := 0 to 3
+		i := j*128
+		dst[i+31:i]     := v[i+63:i+32]
+		dst[i+63:i+32]  := v[i+63:i+32]
+		dst[i+95:i+64]  := v[i+63:i+32]
+		dst[i+127:i+96] := v[i+63:i+32]
+	ENDFOR
+_MM_SWIZ_REG_CCCC:
+	FOR j := 0 to 3
+		i := j*128
+		dst[i+31:i]     := v[i+95:i+64]
+		dst[i+63:i+32]  := v[i+95:i+64]
+		dst[i+95:i+64]  := v[i+95:i+64]
+		dst[i+127:i+96] := v[i+95:i+64]
+	ENDFOR
+_MM_SWIZ_REG_DDDD:
+	FOR j := 0 to 3
+		i := j*128
+		dst[i+31:i]     := v[i+127:i+96]
+		dst[i+63:i+32]  := v[i+127:i+96]
+		dst[i+95:i+64]  := v[i+127:i+96]
+		dst[i+127:i+96] := v[i+127:i+96]
+	ENDFOR
+_MM_SWIZ_REG_DACB:
+	FOR j := 0 to 3
+		i := j*128
+		dst[i+31:i]     := v[i+63:i+32]
+		dst[i+63:i+32]  := v[i+95:i+64]
+		dst[i+95:i+64]  := v[i+31:i]
+		dst[i+127:i+96] := v[i+127:i+96]
+	ENDFOR
+ESAC
+dst[MAX:512] := 0
+	</operation>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="KNC" rettype="__m512d" name="_mm512_swizzle_pd" sequence="true">
+	<type>Floating Point</type>
+	<CPUID>KNCNI</CPUID>
+	<category>Swizzle</category>
+	<parameter varname="v" type="__m512d"/>
+	<parameter varname="s" type="_MM_SWIZZLE_ENUM"/>
+	<description>Performs a swizzle transformation of each of the two groups of packed 4x double-precision (64-bit) floating-point elements in "v" using swizzle parameter "s", storing the results in "dst".</description>
+	<operation>
+CASE s OF
+_MM_SWIZ_REG_NONE:
+	dst[511:0] := v[511:0]
+_MM_SWIZ_REG_DCBA:
+	dst[511:0] := v[511:0]
+_MM_SWIZ_REG_CDAB:
+	FOR j := 0 to 3
+		i := j*128
+		dst[i+63:i]     := v[i+127:i+64]
+		dst[i+127:i+64] := v[i+63:i]
+	ENDFOR
+_MM_SWIZ_REG_BADC:
+	FOR j := 0 to 1
+		i := j*256
+		dst[i+63:i]      := v[i+191:i+128]
+		dst[i+127:i+64]  := v[i+255:i+192]
+		dst[i+191:i+128] := v[i+63:i]
+		dst[i+255:i+192] := v[i+127:i+64]
+	ENDFOR
+_MM_SWIZ_REG_AAAA:
+	FOR j := 0 to 1
+		i := j*256
+		dst[i+63:i]      := v[i+63:i]
+		dst[i+127:i+64]  := v[i+63:i]
+		dst[i+191:i+128] := v[i+63:i]
+		dst[i+255:i+192] := v[i+63:i]
+	ENDFOR
+_MM_SWIZ_REG_BBBB:
+	FOR j := 0 to 1
+		i := j*256
+		dst[i+63:i]      := v[i+127:i+64]
+		dst[i+127:i+64]  := v[i+127:i+64]
+		dst[i+191:i+128] := v[i+127:i+64]
+		dst[i+255:i+192] := v[i+127:i+64]
+	ENDFOR
+_MM_SWIZ_REG_CCCC:
+	FOR j := 0 to 1
+		i := j*256
+		dst[i+63:i]      := v[i+191:i+128]
+		dst[i+127:i+64]  := v[i+191:i+128]
+		dst[i+191:i+128] := v[i+191:i+128]
+		dst[i+255:i+192] := v[i+191:i+128]
+	ENDFOR
+_MM_SWIZ_REG_DDDD:
+	FOR j := 0 to 1
+		i := j*256
+		dst[i+63:i]	     := v[i+255:i+192]
+		dst[i+127:i+64]  := v[i+255:i+192]
+		dst[i+191:i+128] := v[i+255:i+192]
+		dst[i+255:i+192] := v[i+255:i+192]
+	ENDFOR
+_MM_SWIZ_REG_DACB:
+	FOR j := 0 to 1
+		i := j*256
+		dst[i+63:i]	     := v[i+127:i+64]
+		dst[i+127:i+64]  := v[i+191:i+128]
+		dst[i+191:i+128] := v[i+63:i]
+		dst[i+255:i+192] := v[i+255:i+192]
+	ENDFOR
+ESAC
+dst[MAX:512] := 0
+	</operation>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="KNC" rettype="__m512i" name="_mm512_swizzle_epi32" sequence="true">
+	<type>Integer</type>
+	<CPUID>KNCNI</CPUID>
+	<category>Swizzle</category>
+	<parameter varname="v" type="__m512i"/>
+	<parameter varname="s" type="_MM_SWIZZLE_ENUM"/>
+	<description>Performs a swizzle transformation of each of the four groups of packed 4x 32-bit integer elements in "v" using swizzle parameter "s", storing the results in "dst".</description>
+	<operation>
+CASE s OF
+_MM_SWIZ_REG_NONE:
+	dst[511:0] := v[511:0]
+_MM_SWIZ_REG_DCBA:
+	dst[511:0] := v[511:0]
+_MM_SWIZ_REG_CDAB:
+	FOR j := 0 to 7
+		i := j*64
+		dst[i+31:i]    := v[i+63:i+32]
+		dst[i+63:i+32] := v[i+31:i]
+	ENDFOR
+_MM_SWIZ_REG_BADC:
+	FOR j := 0 to 3
+		i := j*128
+		dst[i+31:i]	    := v[i+95:i+64]
+		dst[i+63:i+32]  := v[i+127:i+96]
+		dst[i+95:i+64]  := v[i+31:i]
+		dst[i+127:i+96] := v[i+63:i+32]
+	ENDFOR
+_MM_SWIZ_REG_AAAA:
+	FOR j := 0 to 3
+		i := j*128
+		dst[i+31:i]	    := v[i+31:i]
+		dst[i+63:i+32]  := v[i+31:i]
+		dst[i+95:i+64]  := v[i+31:i]
+		dst[i+127:i+96] := v[i+31:i]
+	ENDFOR
+_MM_SWIZ_REG_BBBB:
+	FOR j := 0 to 3
+		i := j*128
+		dst[i+31:i]	    := v[i+63:i+32]
+		dst[i+63:i+32]  := v[i+63:i+32]
+		dst[i+95:i+64]  := v[i+63:i+32]
+		dst[i+127:i+96] := v[i+63:i+32]
+	ENDFOR
+_MM_SWIZ_REG_CCCC:
+	FOR j := 0 to 3
+		i := j*128
+		dst[i+31:i]	    := v[i+95:i+64]
+		dst[i+63:i+32]  := v[i+95:i+64]
+		dst[i+95:i+64]  := v[i+95:i+64]
+		dst[i+127:i+96] := v[i+95:i+64]
+	ENDFOR
+_MM_SWIZ_REG_DDDD:
+	FOR j := 0 to 3
+		i := j*128
+		dst[i+31:i]	    := v[i+127:i+96]
+		dst[i+63:i+32]  := v[i+127:i+96]
+		dst[i+95:i+64]  := v[i+127:i+96]
+		dst[i+127:i+96] := v[i+127:i+96]
+	ENDFOR
+_MM_SWIZ_REG_DACB:
+	FOR j := 0 to 3
+		i := j*128
+		dst[i+31:i]	    := v[i+63:i+32]
+		dst[i+63:i+32]  := v[i+95:i+64]
+		dst[i+95:i+64]  := v[i+31:i]
+		dst[i+127:i+96] := v[i+127:i+96]
+	ENDFOR
+ESAC
+dst[MAX:512] := 0
+	</operation>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="KNC" rettype="__m512i" name="_mm512_swizzle_epi64" sequence="true">
+	<type>Integer</type>
+	<CPUID>KNCNI</CPUID>
+	<category>Swizzle</category>
+	<parameter varname="v" type="__m512i"/>
+	<parameter varname="s" type="_MM_SWIZZLE_ENUM"/>
+	<description>Performs a swizzle transformation of each of the two groups of packed 4x64-bit integer elements in "v" using swizzle parameter "s", storing the results in "dst".</description>
+	<operation>
+CASE s OF
+_MM_SWIZ_REG_NONE:
+	dst[511:0] := v[511:0]
+_MM_SWIZ_REG_DCBA:
+	dst[511:0] := v[511:0]
+_MM_SWIZ_REG_CDAB:
+	FOR j := 0 to 3
+		i := j*128
+		dst[i+63:i]	    := v[i+127:i+64]
+		dst[i+127:i+64] := v[i+63:i]
+	ENDFOR
+_MM_SWIZ_REG_BADC:
+	FOR j := 0 to 1
+		i := j*256
+		dst[i+63:i]	     := v[i+191:i+128]
+		dst[i+127:i+64]  := v[i+255:i+192]
+		dst[i+191:i+128] := v[i+63:i]
+		dst[i+255:i+192] := v[i+127:i+64]
+	ENDFOR
+_MM_SWIZ_REG_AAAA:
+	FOR j := 0 to 1
+		i := j*256
+		dst[i+63:i]	     := v[i+63:i]
+		dst[i+127:i+64]  := v[i+63:i]
+		dst[i+191:i+128] := v[i+63:i]
+		dst[i+255:i+192] := v[i+63:i]
+	ENDFOR
+_MM_SWIZ_REG_BBBB:
+	FOR j := 0 to 1
+		i := j*256
+		dst[i+63:i]	     := v[i+127:i+64]
+		dst[i+127:i+64]  := v[i+127:i+64]
+		dst[i+191:i+128] := v[i+127:i+64]
+		dst[i+255:i+192] := v[i+127:i+64]
+	ENDFOR
+_MM_SWIZ_REG_CCCC:
+	FOR j := 0 to 1
+		i := j*256
+		dst[i+63:i]	     := v[i+191:i+128]
+		dst[i+127:i+64]  := v[i+191:i+128]
+		dst[i+191:i+128] := v[i+191:i+128]
+		dst[i+255:i+192] := v[i+191:i+128]
+	ENDFOR
+_MM_SWIZ_REG_DDDD:
+	FOR j := 0 to 1
+		i := j*256
+		dst[i+63:i]	     := v[i+255:i+192]
+		dst[i+127:i+64]  := v[i+255:i+192]
+		dst[i+191:i+128] := v[i+255:i+192]
+		dst[i+255:i+192] := v[i+255:i+192]
+	ENDFOR
+_MM_SWIZ_REG_DACB:
+	FOR j := 0 to 1
+		i := j*256
+		dst[i+63:i]	     := v[i+127:i+64]
+		dst[i+127:i+64]  := v[i+191:i+128]
+		dst[i+191:i+128] := v[i+63:i]
+		dst[i+255:i+192] := v[i+255:i+192]
+	ENDFOR
+ESAC
+dst[MAX:512] := 0
+	</operation>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="KNC" rettype="__m512" name="_mm512_mask_swizzle_ps" sequence="true">
+	<type>Floating Point</type>
+	<CPUID>KNCNI</CPUID>
+	<category>Swizzle</category>
+	<parameter varname="src" type="__m512"/>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="v" type="__m512"/>
+	<parameter varname="s" type="_MM_SWIZZLE_ENUM"/>
+	<description>Performs a swizzle transformation of each of the four groups of packed 4x single-precision (32-bit) floating-point elements in "v" using swizzle parameter "s", storing the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+	<operation>
+CASE s OF
+_MM_SWIZ_REG_NONE:
+	dst[511:0] := v[511:0]
+_MM_SWIZ_REG_DCBA:
+	dst[511:0] := v[511:0]
+_MM_SWIZ_REG_CDAB:
+	FOR j := 0 to 7
+		i := j*64
+		IF k[j*2]
+			dst[i+31:i]	:= v[i+63:i+32]
+		ELSE
+			dst[i+31:i]	:= src[i+31:i]
+		FI
+		IF k[j*2+1]
+			dst[i+63:i+32] := v[i+31:i]
+		ELSE
+			dst[i+63:i+32] := src[i+63:i+32]
+		FI
+	ENDFOR
+_MM_SWIZ_REG_BADC:
+	FOR j := 0 to 3
+		i := j*128
+		IF k[j*4]
+			dst[i+31:i]	 := v[i+95:i+64]
+		ELSE
+			dst[i+31:i]	 := src[i+31:i]
+		FI
+		IF k[j*4+1]
+			dst[i+63:i+32]  := v[i+127:i+96]
+		ELSE
+			dst[i+63:i+32]  := src[i+63:i+32]
+		FI
+		IF k[j*4+2]
+			dst[i+95:i+64]  := v[i+31:i]
+		ELSE
+			dst[i+95:i+64]  := src[i+95:i+64]
+		FI
+		IF k[j*4+3]
+			dst[i+127:i+96] := v[i+63:i+32]
+		ELSE
+			dst[i+127:i+96] := src[i+127:i+96]
+		FI
+	ENDFOR
+_MM_SWIZ_REG_AAAA:
+	FOR j := 0 to 3
+		i := j*128
+		IF k[j*4]
+			dst[i+31:i]	 := v[i+31:i]
+		ELSE
+			dst[i+31:i]	 := src[i+31:i]
+		FI
+		IF k[j*4+1]
+			dst[i+63:i+32]  := v[i+31:i]
+		ELSE
+			dst[i+63:i+32]  := src[i+63:i+32]
+		FI
+		IF k[j*4+2]
+			dst[i+95:i+64]  := v[i+31:i]
+		ELSE
+			dst[i+95:i+64]  := src[i+95:i+64]
+		FI
+		IF k[j*4+3]
+			dst[i+127:i+96] := v[i+31:i]
+		ELSE
+			dst[i+127:i+96] := src[i+127:i+96]
+		FI
+	ENDFOR
+_MM_SWIZ_REG_BBBB:
+	FOR j := 0 to 3
+		i := j*128
+		IF k[j*4]
+			dst[i+31:i]	 := v[i+63:i+32]
+		ELSE
+			dst[i+31:i]	 := src[i+31:i]
+		FI
+		IF k[j*4+1]
+			dst[i+63:i+32]  := v[i+63:i+32]
+		ELSE
+			dst[i+63:i+32]  := src[i+63:i+32]
+		FI
+		IF k[j*4+2]
+			dst[i+95:i+64]  := v[i+63:i+32]
+		ELSE
+			dst[i+95:i+64]  := src[i+95:i+64]
+		FI
+		IF k[j*4+3]
+			dst[i+127:i+96] := v[i+63:i+32]
+		ELSE
+			dst[i+127:i+96] := src[i+127:i+96]
+		FI
+	ENDFOR
+_MM_SWIZ_REG_CCCC:
+	FOR j := 0 to 3
+		i := j*128
+		IF k[j*4]
+			dst[i+31:i]	 := v[i+95:i+64]
+		ELSE
+			dst[i+31:i]	 := src[i+31:i]
+		FI
+		IF k[j*4+1]
+			dst[i+63:i+32]  := v[i+95:i+64]
+		ELSE
+			dst[i+63:i+32]  := src[i+63:i+32]
+		FI
+		IF k[j*4+2]
+			dst[i+95:i+64]  := v[i+95:i+64]
+		ELSE
+			dst[i+95:i+64]  := src[i+95:i+64]
+		FI
+		IF k[j*4+3]
+			dst[i+127:i+96] := v[i+95:i+64]
+		ELSE
+			dst[i+127:i+96] := src[i+127:i+96]
+		FI
+	ENDFOR
+_MM_SWIZ_REG_DDDD:
+	FOR j := 0 to 3
+		i := j*128
+		IF k[j*4]
+			dst[i+31:i]	 := v[i+127:i+96]
+		ELSE
+			dst[i+31:i]	 := src[i+31:i]
+		FI
+		IF k[j*4+1]
+			dst[i+63:i+32]  := v[i+127:i+96]
+		ELSE
+			dst[i+63:i+32]  := src[i+63:i+32]
+		FI
+		IF k[j*4+2]
+			dst[i+95:i+64]  := v[i+127:i+96]
+		ELSE
+			dst[i+95:i+64]  := src[i+95:i+64]
+		FI
+		IF k[j*4+3]
+			dst[i+127:i+96] := v[i+127:i+96]
+		ELSE
+			dst[i+127:i+96] := src[i+127:i+96]
+		FI
+	ENDFOR
+_MM_SWIZ_REG_DACB:
+	FOR j := 0 to 3
+		i := j*128
+		IF k[j*4]
+			dst[i+31:i]	 := v[i+63:i+32]
+		ELSE
+			dst[i+31:i]	 := src[i+31:i]
+		FI
+		IF k[j*4+1]
+			dst[i+63:i+32]  := v[i+95:i+64]
+		ELSE
+			dst[i+63:i+32]  := src[i+63:i+32]
+		FI
+		IF k[j*4+2]
+			dst[i+95:i+64]  := v[i+31:i]
+		ELSE
+			dst[i+95:i+64]  := src[i+95:i+64]
+		FI
+		IF k[j*4+3]
+			dst[i+127:i+96] := v[i+127:i+96]
+		ELSE
+			dst[i+127:i+96] := src[i+127:i+96]
+		FI
+	ENDFOR
+ESAC
+dst[MAX:512] := 0
+	</operation>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="KNC" rettype="__m512d" name="_mm512_mask_swizzle_pd" sequence="true">
+	<type>Floating Point</type>
+	<CPUID>KNCNI</CPUID>
+	<category>Swizzle</category>
+	<parameter varname="src" type="__m512d"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="v" type="__m512d"/>
+	<parameter varname="s" type="_MM_SWIZZLE_ENUM"/>
+	<description>Performs a swizzle transformation of each of the two groups of packed 4x double-precision (64-bit) floating-point elements in "v" using swizzle parameter "s", storing the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+	<operation>
+CASE s OF
+_MM_SWIZ_REG_NONE:
+	dst[511:0] := v[511:0]
+_MM_SWIZ_REG_DCBA:
+	dst[511:0] := v[511:0]
+_MM_SWIZ_REG_CDAB:
+	FOR j := 0 to 3
+		i := j*128
+		IF k[j*2]
+			dst[i+63:i]	 := v[i+127:i+64]
+		ELSE
+			dst[i+63:i]	 := src[i+63:i]
+		FI
+		IF k[j*2+1]
+			dst[i+127:i+64] := v[i+63:i]
+		ELSE
+			dst[i+127:i+64] := src[i+127:i+64]
+		FI
+	ENDFOR
+_MM_SWIZ_REG_BADC:
+	FOR j := 0 to 1
+		i := j*256
+		IF k[j*4]
+			dst[i+63:i]	  := v[i+191:i+128]
+		ELSE
+			dst[i+63:i]	  := src[i+63:i]
+		FI
+		IF k[j*4+1]
+			dst[i+127:i+64]  := v[i+255:i+192]
+		ELSE
+			dst[i+127:i+64]  := src[i+127:i+64]
+		FI
+		IF k[j*4+2]
+			dst[i+191:i+128] := v[i+63:i]
+		ELSE
+			dst[i+191:i+128] := src[i+191:i+128]
+		FI
+		IF k[j*4+3]
+			dst[i+255:i+192] := v[i+127:i+64]
+		ELSE
+			dst[i+255:i+192] := src[i+255:i+192]
+		FI
+	ENDFOR
+_MM_SWIZ_REG_AAAA:
+	FOR j := 0 to 1
+		i := j*256
+		IF k[j*4]
+			dst[i+63:i]	  := v[i+63:i]
+		ELSE
+			dst[i+63:i]	  := src[i+63:i]
+		FI
+		IF k[j*4+1]
+			dst[i+127:i+64]  := v[i+63:i]
+		ELSE
+			dst[i+127:i+64]  := src[i+127:i+64]
+		FI
+		IF k[j*4+2]
+			dst[i+191:i+128] := v[i+63:i]
+		ELSE
+			dst[i+191:i+128] := src[i+191:i+128]
+		FI
+		IF k[j*4+3]
+			dst[i+255:i+192] := v[i+63:i]
+		ELSE
+			dst[i+255:i+192] := src[i+255:i+192]
+		FI
+	ENDFOR
+_MM_SWIZ_REG_BBBB:
+	FOR j := 0 to 1
+		i := j*256
+		IF k[j*4]
+			dst[i+63:i]	  := v[i+127:i+64]
+		ELSE
+			dst[i+63:i]	  := src[i+63:i]
+		FI
+		IF k[j*4+1]
+			dst[i+127:i+64]  := v[i+127:i+64]
+		ELSE
+			dst[i+127:i+64]  := src[i+127:i+64]
+		FI
+		IF k[j*4+2]
+			dst[i+191:i+128] := v[i+127:i+64]
+		ELSE
+			dst[i+191:i+128] := src[i+191:i+128]
+		FI
+		IF k[j*4+3]
+			dst[i+255:i+192] := v[i+127:i+64]
+		ELSE
+			dst[i+255:i+192] := src[i+255:i+192]
+		FI
+	ENDFOR
+_MM_SWIZ_REG_CCCC:
+	FOR j := 0 to 1
+		i := j*256
+		IF k[j*4]
+			dst[i+63:i]	  := v[i+191:i+128]
+		ELSE
+			dst[i+63:i]	  := src[i+63:i]
+		FI
+		IF k[j*4+1]
+			dst[i+127:i+64]  := v[i+191:i+128]
+		ELSE
+			dst[i+127:i+64]  := src[i+127:i+64]
+		FI
+		IF k[j*4+2]
+			dst[i+191:i+128] := v[i+191:i+128]
+		ELSE
+			dst[i+191:i+128] := src[i+191:i+128]
+		FI
+		IF k[j*4+3]
+			dst[i+255:i+192] := v[i+191:i+128]
+		ELSE
+			dst[i+255:i+192] := src[i+255:i+192]
+		FI
+	ENDFOR
+_MM_SWIZ_REG_DDDD:
+	FOR j := 0 to 1
+		i := j*256
+		IF k[j*4]
+			dst[i+63:i]	  := v[i+255:i+192]
+		ELSE
+			dst[i+63:i]	  := src[i+63:i]
+		FI
+		IF k[j*4+1]
+			dst[i+127:i+64]  := v[i+255:i+192]
+		ELSE
+			dst[i+127:i+64]  := src[i+127:i+64]
+		FI
+		IF k[j*4+2]
+			dst[i+191:i+128] := v[i+255:i+192]
+		ELSE
+			dst[i+191:i+128] := src[i+191:i+128]
+		FI
+		IF k[j*4+3]
+			dst[i+255:i+192] := v[i+255:i+192]
+		ELSE
+			dst[i+255:i+192] := src[i+255:i+192]
+		FI
+	ENDFOR
+_MM_SWIZ_REG_DACB:
+	FOR j := 0 to 1
+		i := j*256
+		IF k[j*4]
+			dst[i+63:i]	  := v[i+127:i+64]
+		ELSE
+			dst[i+63:i]	  := src[i+63:i]
+		FI
+		IF k[j*4+1]
+			dst[i+127:i+64]  := v[i+191:i+128]
+		ELSE
+			dst[i+127:i+64]  := src[i+127:i+64]
+		FI
+		IF k[j*4+2]
+			dst[i+191:i+128] := v[i+63:i]
+		ELSE
+			dst[i+191:i+128] := src[i+191:i+128]
+		FI
+		IF k[j*4+3]
+			dst[i+255:i+192] := v[i+255:i+192]
+		ELSE
+			dst[i+255:i+192] := src[i+255:i+192]
+		FI
+	ENDFOR
+ESAC
+dst[MAX:512] := 0
+	</operation>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="KNC" rettype="__m512i" name="_mm512_mask_swizzle_epi32" sequence="true">
+	<type>Integer</type>
+	<CPUID>KNCNI</CPUID>
+	<category>Swizzle</category>
+	<parameter varname="src" type="__m512i"/>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="v" type="__m512i"/>
+	<parameter varname="s" type="_MM_SWIZZLE_ENUM"/>
+	<description>Performs a swizzle transformation of each of the four groups of packed 4x32-bit integer elements in "v" using swizzle parameter "s", storing the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+	<operation>
+CASE s OF
+_MM_SWIZ_REG_NONE:
+	dst[511:0] := v[511:0]
+_MM_SWIZ_REG_DCBA:
+	dst[511:0] := v[511:0]
+_MM_SWIZ_REG_CDAB:
+	FOR j := 0 to 7
+		i := j*64
+		IF k[j*2]
+			dst[i+31:i]	:= v[i+63:i+32]
+		ELSE
+			dst[i+31:i]	:= src[i+31:i]
+		FI
+		IF k[j*2+1]
+			dst[i+63:i+32] := v[i+31:i]
+		ELSE
+			dst[i+63:i+32] := src[i+63:i+32]
+		FI
+	ENDFOR
+_MM_SWIZ_REG_BADC:
+	FOR j := 0 to 3
+		i := j*128
+		IF k[j*4]
+			dst[i+31:i]	 := v[i+95:i+64]
+		ELSE
+			dst[i+31:i]	 := src[i+31:i]
+		FI
+		IF k[j*4+1]
+			dst[i+63:i+32]  := v[i+127:i+96]
+		ELSE
+			dst[i+63:i+32]  := src[i+63:i+32]
+		FI
+		IF k[j*4+2]
+			dst[i+95:i+64]  := v[i+31:i]
+		ELSE
+			dst[i+95:i+64]  := src[i+95:i+64]
+		FI
+		IF k[j*4+3]
+			dst[i+127:i+96] := v[i+63:i+32]
+		ELSE
+			dst[i+127:i+96] := src[i+127:i+96]
+		FI
+	ENDFOR
+_MM_SWIZ_REG_AAAA:
+	FOR j := 0 to 3
+		i := j*128
+		IF k[j*4]
+			dst[i+31:i]	 := v[i+31:i]
+		ELSE
+			dst[i+31:i]	 := src[i+31:i]
+		FI
+		IF k[j*4+1]
+			dst[i+63:i+32]  := v[i+31:i]
+		ELSE
+			dst[i+63:i+32]  := src[i+63:i+32]
+		FI
+		IF k[j*4+2]
+			dst[i+95:i+64]  := v[i+31:i]
+		ELSE
+			dst[i+95:i+64]  := src[i+95:i+64]
+		FI
+		IF k[j*4+3]
+			dst[i+127:i+96] := v[i+31:i]
+		ELSE
+			dst[i+127:i+96] := src[i+127:i+96]
+		FI
+	ENDFOR
+_MM_SWIZ_REG_BBBB:
+	FOR j := 0 to 3
+		i := j*128
+		IF k[j*4]
+			dst[i+31:i]	 := v[i+63:i+32]
+		ELSE
+			dst[i+31:i]	 := src[i+31:i]
+		FI
+		IF k[j*4+1]
+			dst[i+63:i+32]  := v[i+63:i+32]
+		ELSE
+			dst[i+63:i+32]  := src[i+63:i+32]
+		FI
+		IF k[j*4+2]
+			dst[i+95:i+64]  := v[i+63:i+32]
+		ELSE
+			dst[i+95:i+64]  := src[i+95:i+64]
+		FI
+		IF k[j*4+3]
+			dst[i+127:i+96] := v[i+63:i+32]
+		ELSE
+			dst[i+127:i+96] := src[i+127:i+96]
+		FI
+	ENDFOR
+_MM_SWIZ_REG_CCCC:
+	FOR j := 0 to 3
+		i := j*128
+		IF k[j*4]
+			dst[i+31:i]	 := v[i+95:i+64]
+		ELSE
+			dst[i+31:i]	 := src[i+31:i]
+		FI
+		IF k[j*4+1]
+			dst[i+63:i+32]  := v[i+95:i+64]
+		ELSE
+			dst[i+63:i+32]  := src[i+63:i+32]
+		FI
+		IF k[j*4+2]
+			dst[i+95:i+64]  := v[i+95:i+64]
+		ELSE
+			dst[i+95:i+64]  := src[i+95:i+64]
+		FI
+		IF k[j*4+3]
+			dst[i+127:i+96] := v[i+95:i+64]
+		ELSE
+			dst[i+127:i+96] := src[i+127:i+96]
+		FI
+	ENDFOR
+_MM_SWIZ_REG_DDDD:
+	FOR j := 0 to 3
+		i := j*128
+		IF k[j*4]
+			dst[i+31:i]	 := v[i+127:i+96]
+		ELSE
+			dst[i+31:i]	 := src[i+31:i]
+		FI
+		IF k[j*4+1]
+			dst[i+63:i+32]  := v[i+127:i+96]
+		ELSE
+			dst[i+63:i+32]  := src[i+63:i+32]
+		FI
+		IF k[j*4+2]
+			dst[i+95:i+64]  := v[i+127:i+96]
+		ELSE
+			dst[i+95:i+64]  := src[i+95:i+64]
+		FI
+		IF k[j*4+3]
+			dst[i+127:i+96] := v[i+127:i+96]
+		ELSE
+			dst[i+127:i+96] := src[i+127:i+96]
+		FI
+	ENDFOR
+_MM_SWIZ_REG_DACB:
+	FOR j := 0 to 3
+		i := j*128
+		IF k[j*4]
+			dst[i+31:i]	 := v[i+63:i+32]
+		ELSE
+			dst[i+31:i]	 := src[i+31:i]
+		FI
+		IF k[j*4+1]
+			dst[i+63:i+32]  := v[i+95:i+64]
+		ELSE
+			dst[i+63:i+32]  := src[i+63:i+32]
+		FI
+		IF k[j*4+2]
+			dst[i+95:i+64]  := v[i+31:i]
+		ELSE
+			dst[i+95:i+64]  := src[i+95:i+64]
+		FI
+		IF k[j*4+3]
+			dst[i+127:i+96] := v[i+127:i+96]
+		ELSE
+			dst[i+127:i+96] := src[i+127:i+96]
+		FI
+	ENDFOR
+ESAC
+dst[MAX:512] := 0
+	</operation>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="KNC" rettype="__m512i" name="_mm512_mask_swizzle_epi64" sequence="true">
+	<type>Integer</type>
+	<CPUID>KNCNI</CPUID>
+	<category>Swizzle</category>
+	<parameter varname="src" type="__m512i"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="v" type="__m512i"/>
+	<parameter varname="s" type="_MM_SWIZZLE_ENUM"/>
+	<description>Performs a swizzle transformation of each of the two groups of packed 4x64-bit integer elements in "v" using swizzle parameter "s", storing the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+	<operation>
+CASE s OF
+_MM_SWIZ_REG_NONE:
+	dst[511:0] := v[511:0]
+_MM_SWIZ_REG_DCBA:
+	dst[511:0] := v[511:0]
+_MM_SWIZ_REG_CDAB:
+	FOR j := 0 to 3
+		i := j*128
+		IF k[j*2]
+			dst[i+63:i]	 := v[i+127:i+64]
+		ELSE
+			dst[i+63:i]	 := src[i+63:i]
+		FI
+		IF k[j*2+1]
+			dst[i+127:i+64] := v[i+63:i]
+		ELSE
+			dst[i+127:i+64] := src[i+127:i+64]
+		FI
+	ENDFOR
+_MM_SWIZ_REG_BADC:
+	FOR j := 0 to 1
+		i := j*256
+		IF k[j*4]
+			dst[i+63:i]	  := v[i+191:i+128]
+		ELSE
+			dst[i+63:i]	  := src[i+63:i]
+		FI
+		IF k[j*4+1]
+			dst[i+127:i+64]  := v[i+255:i+192]
+		ELSE
+			dst[i+127:i+64]  := src[i+127:i+64]
+		FI
+		IF k[j*4+2]
+			dst[i+191:i+128] := v[i+63:i]
+		ELSE
+			dst[i+191:i+128] := src[i+191:i+128]
+		FI
+		IF k[j*4+3]
+			dst[i+255:i+192] := v[i+127:i+64]
+		ELSE
+			dst[i+255:i+192] := src[i+255:i+192]
+		FI
+	ENDFOR
+_MM_SWIZ_REG_AAAA:
+	FOR j := 0 to 1
+		i := j*256
+		IF k[j*4]
+			dst[i+63:i]	  := v[i+63:i]
+		ELSE
+			dst[i+63:i]	  := src[i+63:i]
+		FI
+		IF k[j*4+1]
+			dst[i+127:i+64]  := v[i+63:i]
+		ELSE
+			dst[i+127:i+64]  := src[i+127:i+64]
+		FI
+		IF k[j*4+2]
+			dst[i+191:i+128] := v[i+63:i]
+		ELSE
+			dst[i+191:i+128] := src[i+191:i+128]
+		FI
+		IF k[j*4+3]
+			dst[i+255:i+192] := v[i+63:i]
+		ELSE
+			dst[i+255:i+192] := src[i+255:i+192]
+		FI
+	ENDFOR
+_MM_SWIZ_REG_BBBB:
+	FOR j := 0 to 1
+		i := j*256
+		IF k[j*4]
+			dst[i+63:i]	  := v[i+127:i+64]
+		ELSE
+			dst[i+63:i]	  := src[i+63:i]
+		FI
+		IF k[j*4+1]
+			dst[i+127:i+64]  := v[i+127:i+64]
+		ELSE
+			dst[i+127:i+64]  := src[i+127:i+64]
+		FI
+		IF k[j*4+2]
+			dst[i+191:i+128] := v[i+127:i+64]
+		ELSE
+			dst[i+191:i+128] := src[i+191:i+128]
+		FI
+		IF k[j*4+3]
+			dst[i+255:i+192] := v[i+127:i+64]
+		ELSE
+			dst[i+255:i+192] := src[i+255:i+192]
+		FI
+	ENDFOR
+_MM_SWIZ_REG_CCCC:
+	FOR j := 0 to 1
+		i := j*256
+		IF k[j*4]
+			dst[i+63:i]	  := v[i+191:i+128]
+		ELSE
+			dst[i+63:i]	  := src[i+63:i]
+		FI
+		IF k[j*4+1]
+			dst[i+127:i+64]  := v[i+191:i+128]
+		ELSE
+			dst[i+127:i+64]  := src[i+127:i+64]
+		FI
+		IF k[j*4+2]
+			dst[i+191:i+128] := v[i+191:i+128]
+		ELSE
+			dst[i+191:i+128] := src[i+191:i+128]
+		FI
+		IF k[j*4+3]
+			dst[i+255:i+192] := v[i+191:i+128]
+		ELSE
+			dst[i+255:i+192] := src[i+255:i+192]
+		FI
+	ENDFOR
+_MM_SWIZ_REG_DDDD:
+	FOR j := 0 to 1
+		i := j*256
+		IF k[j*4]
+			dst[i+63:i]	  := v[i+255:i+192]
+		ELSE
+			dst[i+63:i]	  := src[i+63:i]
+		FI
+		IF k[j*4+1]
+			dst[i+127:i+64]  := v[i+255:i+192]
+		ELSE
+			dst[i+127:i+64]  := src[i+127:i+64]
+		FI
+		IF k[j*4+2]
+			dst[i+191:i+128] := v[i+255:i+192]
+		ELSE
+			dst[i+191:i+128] := src[i+191:i+128]
+		FI
+		IF k[j*4+3]
+			dst[i+255:i+192] := v[i+255:i+192]
+		ELSE
+			dst[i+255:i+192] := src[i+255:i+192]
+		FI
+	ENDFOR
+_MM_SWIZ_REG_DACB:
+	FOR j := 0 to 1
+		i := j*256
+		IF k[j*4]
+			dst[i+63:i]	  := v[i+127:i+64]
+		ELSE
+			dst[i+63:i]	  := src[i+63:i]
+		FI
+		IF k[j*4+1]
+			dst[i+127:i+64]  := v[i+191:i+128]
+		ELSE
+			dst[i+127:i+64]  := src[i+127:i+64]
+		FI
+		IF k[j*4+2]
+			dst[i+191:i+128] := v[i+63:i]
+		ELSE
+			dst[i+191:i+128] := src[i+191:i+128]
+		FI
+		IF k[j*4+3]
+			dst[i+255:i+192] := v[i+255:i+192]
+		ELSE
+			dst[i+255:i+192] := src[i+255:i+192]
+		FI
+	ENDFOR
+ESAC
+dst[MAX:512] := 0
+	</operation>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="KNC" rettype="void" name="_mm512_extstore_ps">
+	<type>Floating Point</type>
+	<CPUID>KNCNI</CPUID>
+	<category>Store</category>
+	<parameter varname="mt" type="void *"/>
+	<parameter varname="v" type="__m512"/>
+	<parameter varname="conv" type="_MM_DOWNCONV_PS_ENUM"/>
+	<parameter varname="hint" type="int"/>
+	<description>Downconverts packed single-precision (32-bit) floating-point elements stored in "v" to a smaller type depending on "conv" and stores them in memory location "mt". "hint" indicates to the processor whether the data is non-temporal.</description>
+	<operation>
+addr := MEM[mt]		
+FOR j := 0 to 15
+	i := j*32
+	CASE conv OF
+	_MM_DOWNCONV_PS_NONE:
+		addr[i+31:i] := v[i+31:i]
+	_MM_DOWNCONV_PS_FLOAT16:
+		n := j*16
+		addr[n+15:n] := Float32ToFloat16(v[i+31:i])
+	_MM_DOWNCONV_PS_UINT8:
+		n := j*8
+		addr[n+7:n] := Float32ToUInt8(v[i+31:i])
+	_MM_DOWNCONV_PS_SINT8:
+		n := j*8
+		addr[n+7:n] := Float32ToSInt8(v[i+31:i])
+	_MM_DOWNCONV_PS_UINT16:
+		n := j*16
+		addr[n+15:n] := Float32ToUInt16(v[i+31:i])
+	_MM_DOWNCONV_PS_SINT16:
+		n := j*16
+		addr[n+15:n] := Float32ToSInt16(v[i+31:i])
+	ESAC
+ENDFOR
+	</operation>
+	<instruction name="vmovaps" form="m512 {k}, zmm" xed=""/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="KNC" rettype="void" name="_mm512_extstore_epi32">
+	<type>Integer</type>
+	<CPUID>KNCNI</CPUID>
+	<category>Store</category>
+	<parameter varname="mt" type="void *"/>
+	<parameter varname="v" type="__m512i"/>
+	<parameter varname="conv" type="_MM_DOWNCONV_EPI32_ENUM"/>
+	<parameter varname="hint" type="int"/>
+	<description>Downconverts packed 32-bit integer elements stored in "v" to a smaller type depending on "conv" and stores them in memory location "mt". "hint" indicates to the processor whether the data is non-temporal.</description>
+	<operation>
+addr := MEM[mt]
+FOR j := 0 to 15
+	i := j*32
+	CASE conv OF
+	_MM_DOWNCONV_EPI32_NONE:
+		addr[i+31:i] := v[i+31:i]
+	_MM_DOWNCONV_EPI32_UINT8:
+		n := j*8
+		addr[n+7:n] := Int32ToUInt8(v[i+31:i])
+	_MM_DOWNCONV_EPI32_SINT8:
+		n := j*8
+		addr[n+7:n] := Int32ToSInt8(v[i+31:i])
+	_MM_DOWNCONV_EPI32_UINT16:
+		n := j*16
+		addr[n+15:n] := Int32ToUInt16(v[i+31:i])
+	_MM_DOWNCONV_EPI32_SINT16:
+		n := j*16
+		addr[n+15:n] := Int32ToSInt16(v[i+31:i])
+	ESAC
+ENDFOR
+	</operation>
+	<instruction name="vmovdqa32" form="m512 {k}, zmm" xed=""/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="KNC" rettype="void" name="_mm512_extstore_pd">
+	<type>Floating Point</type>
+	<CPUID>KNCNI</CPUID>
+	<category>Store</category>
+	<parameter varname="mt" type="void *"/>
+	<parameter varname="v" type="__m512d"/>
+	<parameter varname="conv" type="_MM_DOWNCONV_PD_ENUM"/>
+	<parameter varname="hint" type="int"/>
+	<description>Downconverts packed double-precision (64-bit) floating-point elements stored in "v" to a smaller type depending on "conv" and stores them in memory location "mt". "hint" indicates to the processor whether the data is non-temporal.</description>
+	<operation>
+addr := MEM[mt]
+FOR j := 0 to 7
+	i := j*64
+	CASE conv OF
+	_MM_DOWNCONV_PD_NONE:
+		addr[i+63:i] := v[i+63:i]
+	ESAC
+ENDFOR
+	</operation>
+	<instruction name="vmovapd" form="m512 {k}, zmm" xed=""/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="KNC" rettype="void" name="_mm512_extstore_epi64">
+	<type>Integer</type>
+	<CPUID>KNCNI</CPUID>
+	<category>Store</category>
+	<parameter varname="mt" type="void *"/>
+	<parameter varname="v" type="__m512i"/>
+	<parameter varname="conv" type="_MM_DOWNCONV_EPI64_ENUM"/>
+	<parameter varname="hint" type="int"/>
+	<description>Downconverts packed 64-bit integer elements stored in "v" to a smaller type depending on "conv" and stores them in memory location "mt". "hint" indicates to the processor whether the data is non-temporal.</description>
+	<operation>
+addr := MEM[mt]
+FOR j := 0 to 7
+	i := j*64
+	CASE conv OF
+	_MM_DOWNCONV_EPI64_NONE: addr[i+63:i] := v[i+63:i]
+	ESAC
+ENDFOR
+	</operation>
+	<instruction name="vmovdqa64" form="m512 {k}, zmm" xed=""/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="KNC" rettype="void" name="_mm512_mask_extstore_ps">
+	<type>Floating Point</type>
+	<CPUID>KNCNI</CPUID>
+	<category>Store</category>
+	<parameter varname="mt" type="void *"/>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="v" type="__m512"/>
+	<parameter varname="conv" type="_MM_DOWNCONV_PS_ENUM"/>
+	<parameter varname="hint" type="int"/>
+	<description>Downconverts packed single-precision (32-bit) floating-point elements stored in "v" to a smaller type depending on "conv" and stores them in memory location "mt" using writemask "k" (elements are not written to memory when the corresponding mask bit is not set). "hint" indicates to the processor whether the data is non-temporal.</description>
+	<operation>
+FOR j := 0 to 15
+	i := j*32
+	IF k[j]
+		CASE conv OF
+		_MM_DOWNCONV_PS_NONE:
+			mt[i+31:i] := v[i+31:i]
+		_MM_DOWNCONV_PS_FLOAT16:
+			n := j*16
+			mt[n+15:n] := Float32ToFloat16(v[i+31:i])
+		_MM_DOWNCONV_PS_UINT8:
+			n := j*8
+			mt[n+7:n] := Float32ToUInt8(v[i+31:i])
+		_MM_DOWNCONV_PS_SINT8:
+			n := j*8
+			mt[n+7:n] := Float32ToSInt8(v[i+31:i])
+		_MM_DOWNCONV_PS_UINT16:
+			n := j*16
+			mt[n+15:n] := Float32ToUInt16(v[i+31:i])
+		_MM_DOWNCONV_PS_SINT16:
+			n := j*16
+			mt[n+15:n] := Float32ToSInt16(v[i+31:i])
+		ESAC
+	 FI
+ENDFOR
+	</operation>
+	<instruction name="vmovaps" form="m512 {k}, zmm" xed=""/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="KNC" rettype="void" name="_mm512_mask_extstore_pd">
+	<type>Floating Point</type>
+	<CPUID>KNCNI</CPUID>
+	<category>Store</category>
+	<parameter varname="mt" type="void *"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="v" type="__m512d"/>
+	<parameter varname="conv" type="_MM_DOWNCONV_PD_ENUM"/>
+	<parameter varname="hint" type="int"/>
+	<description>Downconverts packed double-precision (64-bit) floating-point elements stored in "v" to a smaller type depending on "conv" and stores them in memory location "mt" using writemask "k" (elements in "mt" are unaltered when the corresponding mask bit is not set). "hint" indicates to the processor whether the data is non-temporal.</description>
+	<operation>
+addr := MEM[mt]		
+FOR j := 0 to 7
+	i := j*64
+	CASE conv OF
+	_MM_DOWNCONV_PD_NONE:
+		IF k[j]
+			addr[i+63:i] := v[i+63:i]
+		FI
+	ESAC
+ENDFOR
+	</operation>
+	<instruction name="vmovapd" form="m512 {k}, zmm" xed=""/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="KNC" rettype="void" name="_mm512_mask_extstore_epi32">
+	<type>Integer</type>
+	<CPUID>KNCNI</CPUID>
+	<category>Store</category>
+	<parameter varname="mt" type="void *"/>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="v" type="__m512i"/>
+	<parameter varname="conv" type="_MM_DOWNCONV_EPI32_ENUM"/>
+	<parameter varname="hint" type="int"/>
+	<description>Downconverts packed 32-bit integer elements stored in "v" to a smaller type depending on "conv" and stores them in memory location "mt" using writemask "k" (elements in "mt" are unaltered when the corresponding mask bit is not set). "hint" indicates to the processor whether the data is non-temporal.</description>
+	<operation>
+addr := MEM[mt]
+FOR j := 0 to 15
+	i := j*32
+	IF k[j]
+		CASE conv OF
+		_MM_DOWNCONV_EPI32_NONE:
+			addr[i+31:i] := v[i+31:i]
+		_MM_DOWNCONV_EPI32_UINT8:
+			n := j*8
+			addr[n+7:n] := Int32ToUInt8(v[i+31:i])
+		_MM_DOWNCONV_EPI32_SINT8:
+			n := j*8
+			addr[n+7:n] := Int32ToSInt8(v[i+31:i])
+		_MM_DOWNCONV_EPI32_UINT16:
+			n := j*16
+			addr[n+15:n] := Int32ToUInt16(v[i+31:i])
+		_MM_DOWNCONV_EPI32_SINT16:
+			n := j*16
+			addr[n+15:n] := Int32ToSInt16(v[i+31:i])
+		ESAC
+	FI
+ENDFOR
+	</operation>
+	<instruction name="vmovdqa32" form="m512 {k}, zmm" xed=""/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="KNC" rettype="void" name="_mm512_mask_extstore_epi64">
+	<type>Integer</type>
+	<CPUID>KNCNI</CPUID>
+	<category>Store</category>
+	<parameter varname="mt" type="void *"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="v" type="__m512i"/>
+	<parameter varname="conv" type="_MM_DOWNCONV_EPI64_ENUM"/>
+	<parameter varname="hint" type="int"/>
+	<description>Downconverts packed 64-bit integer elements stored in "v" to a smaller type depending on "conv" and stores them in memory location "mt" using writemask "k" (elements in "mt" are unaltered when the corresponding mask bit is not set). "hint" indicates to the processor whether the data is non-temporal.</description>
+	<operation>
+addr := MEM[mt]
+FOR j := 0 to 7
+	i := j*64
+	IF k[j]
+		CASE conv OF
+		_MM_DOWNCONV_EPI64_NONE: addr[i+63:i] := v[i+63:i]
+		ESAC
+	FI
+ENDFOR
+	</operation>
+	<instruction name="vmovdqa64" form="m512 {k}, zmm" xed=""/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="KNC" rettype="void" name="_mm512_storenr_ps">
+	<type>Floating Point</type>
+	<CPUID>KNCNI</CPUID>
+	<category>Store</category>
+	<parameter varname="mt" type="void *"/>
+	<parameter varname="v" type="__m512"/>
+	<description>Stores packed single-precision (32-bit) floating-point elements from "v" to memory address "mt" with a no-read hint to the processor.</description>
+	<operation>
+addr := MEM[mt]
+FOR j := 0 to 15
+	i := j*32
+	addr[i+31:i] := v[i+31:i]
+ENDFOR
+	</operation>
+	<instruction name="vmovnraps" form="m512 {k}, zmm" xed=""/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="KNC" rettype="void" name="_mm512_storenr_pd">
+	<type>Floating Point</type>
+	<CPUID>KNCNI</CPUID>
+	<category>Store</category>
+	<parameter varname="mt" type="void *"/>
+	<parameter varname="v" type="__m512d"/>
+	<description>Stores packed double-precision (64-bit) floating-point elements from "v" to memory address "mt" with a no-read hint to the processor.</description>
+	<operation>
+addr := MEM[mt]
+FOR j := 0 to 7
+	i := j*64
+	addr[i+63:i] := v[i+63:i]
+ENDFOR
+	</operation>
+	<instruction name="vmovnrapd" form="m512 {k}, zmm" xed=""/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="KNC" rettype="void" name="_mm512_storenrngo_ps">
+	<type>Floating Point</type>
+	<CPUID>KNCNI</CPUID>
+	<category>Store</category>
+	<parameter varname="mt" type="void *"/>
+	<parameter varname="v" type="__m512"/>
+	<description>Stores packed single-precision (32-bit) floating-point elements from "v" to memory address "mt" with a no-read hint and using a weakly-ordered memory consistency model (stores performed with this function are not globally ordered, and subsequent stores from the same thread can be observed before them).</description>
+	<operation>
+addr := MEM[mt]
+FOR j := 0 to 15
+	i := j*32
+	addr[i+31:i] := v[i+31:i]
+ENDFOR
+	</operation>
+	<instruction name="vmovnrngoaps" form="m512 {k}, zmm" xed=""/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="KNC" rettype="void" name="_mm512_storenrngo_pd">
+	<type>Floating Point</type>
+	<CPUID>KNCNI</CPUID>
+	<category>Store</category>
+	<parameter varname="mt" type="void *"/>
+	<parameter varname="v" type="__m512d"/>
+	<description>Stores packed double-precision (64-bit) floating-point elements from "v" to memory address "mt" with a no-read hint and using a weakly-ordered memory consistency model (stores performed with this function are not globally ordered, and subsequent stores from the same thread can be observed before them).</description>
+	<operation>
+addr := MEM[mt]
+FOR j := 0 to 7
+	i := j*64
+	addr[i+63:i] := v[i+63:i]
+ENDFOR
+	</operation>
+	<instruction name="vmovnrngoapd" form="m512 {k}, zmm" xed=""/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="KNC" rettype="__m512i" name="_mm512_adc_epi32">
+	<type>Integer</type>
+	<CPUID>KNCNI</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="v2" type="__m512i"/>
+	<parameter varname="k2" type="__mmask16"/>
+	<parameter varname="v3" type="__m512i"/>
+	<parameter varname="k2_res" type="__mmask16 *"/>
+	<description>Performs element-by-element addition of packed 32-bit integers in "v2" and "v3" and the corresponding bit in "k2", storing the result of the addition in "dst" and the result of the carry in "k2_res".</description>
+	<operation>
+FOR j := 0 to 15
+	i := j*32
+	k2_res[j]   := Carry(v2[i+31:i] + v3[i+31:i] + k2[j])
+	dst[i+31:i] := v2[i+31:i] + v3[i+31:i] + k2[j]
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name="vpadcd" form="zmm {k}, k, zmm" xed=""/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="KNC" rettype="__m512i" name="_mm512_mask_adc_epi32">
+	<type>Integer</type>
+	<CPUID>KNCNI</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="v2" type="__m512i"/>
+	<parameter varname="k1" type="__mmask16"/>
+	<parameter varname="k2" type="__mmask16"/>
+	<parameter varname="v3" type="__m512i"/>
+	<parameter varname="k2_res" type="__mmask16 *"/>
+	<description>Performs element-by-element addition of packed 32-bit integers in "v2" and "v3" and the corresponding bit in "k2", storing the result of the addition in "dst" and the result of the carry in "k2_res" using writemask "k1" (elements are copied from "v2" when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 15
+	i := j*32
+	IF k1[j]
+		k2_res[j]   := Carry(v2[i+31:i] + v3[i+31:i] + k2[j])
+		dst[i+31:i] := v2[i+31:i] + v3[i+31:i] + k2[j]
+	ELSE
+		dst[i+31:i] := v2[i+31:i]
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name="vpadcd" form="zmm {k}, k, zmm" xed=""/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="KNC" rettype="__m512d" name="_mm512_addn_pd">
+	<type>Floating Point</type>
+	<CPUID>KNCNI</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="v2" type="__m512d"/>
+	<parameter varname="v3" type="__m512d"/>
+	<description>Performs element-by-element addition between packed double-precision (64-bit) floating-point elements in "v2" and "v3" and negates their sum, storing the results in "dst".</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*64
+	dst[i+63:i] := -(v2[i+63:i] + v3[i+63:i])
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name="vaddnpd" form="zmm {k}, zmm, zmm" xed=""/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="KNC" rettype="__m512d" name="_mm512_mask_addn_pd">
+	<type>Floating Point</type>
+	<CPUID>KNCNI</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="src" type="__m512d"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="v2" type="__m512d"/>
+	<parameter varname="v3" type="__m512d"/>
+	<description>Performs element-by-element addition between packed double-precision (64-bit) floating-point elements in "v2" and "v3" and negates their sum, storing the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*64
+	IF k[j]
+		dst[i+63:i] := -(v2[i+63:i] + v3[i+63:i])
+	ELSE
+		dst[i+63:i] := src[i+63:i]
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name="vaddnpd" form="zmm {k}, zmm, zmm" xed=""/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="KNC" rettype="__m512" name="_mm512_addn_ps">
+	<type>Floating Point</type>
+	<CPUID>KNCNI</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="v2" type="__m512"/>
+	<parameter varname="v3" type="__m512"/>
+	<description>Performs element-by-element addition between packed single-precision (32-bit) floating-point elements in "v2" and "v3" and negates their sum, storing the results in "dst".</description>
+	<operation>
+FOR j := 0 to 15
+	i := j*32
+	dst[i+31:i] := -(v2[i+31:i] + v3[i+31:i])
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name="vaddnps" form="zmm {k}, zmm, zmm" xed=""/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="KNC" rettype="__m512" name="_mm512_mask_addn_ps">
+	<type>Floating Point</type>
+	<CPUID>KNCNI</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="src" type="__m512"/>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="v2" type="__m512"/>
+	<parameter varname="v3" type="__m512"/>
+	<description>Performs element-by-element addition between packed single-precision (32-bit) floating-point elements in "v2" and "v3" and negates their sum, storing the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 15
+	i := j*32
+	IF k[j]
+		dst[i+31:i] := -(v2[i+31:i] + v3[i+31:i])
+	ELSE
+		dst[i+31:i] := src[i+31:i]
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name="vaddnps" form="zmm {k}, zmm, zmm" xed=""/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="KNC" rettype="__m512d" name="_mm512_addn_round_pd">
+	<type>Floating Point</type>
+	<CPUID>KNCNI</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="v2" type="__m512d"/>
+	<parameter varname="v3" type="__m512d"/>
+	<parameter varname="rounding" type="int"/>
+	<description>Performs element by element addition between packed double-precision (64-bit) floating-point elements in "v2" and "v3" and negates the sum, storing the result in "dst".
+	[round_note]
+	</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*64
+	dst[i+63:i] := -(v2[i+63:i] + v3[i+63:i])
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name="vaddnpd" form="zmm {k}, zmm, zmm" xed=""/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="KNC" rettype="__m512d" name="_mm512_mask_addn_round_pd">
+	<type>Floating Point</type>
+	<CPUID>KNCNI</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="src" type="__m512d"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="v2" type="__m512d"/>
+	<parameter varname="v3" type="__m512d"/>
+	<parameter varname="rounding" type="int"/>
+	<description>Performs element by element addition between packed double-precision (64-bit) floating-point elements in "v2" and "v3" and negates the sum, storing the result in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).
+	[round_note]
+	</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*64
+	IF k[j]
+		dst[i+63:i] := -(v2[i+63:i] + v3[i+63:i])
+	ELSE
+		dst[i+63:i] := src[i+63:i]
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name="vaddnpd" form="zmm {k}, zmm, zmm" xed=""/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="KNC" rettype="__m512" name="_mm512_addn_round_ps">
+	<type>Floating Point</type>
+	<CPUID>KNCNI</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="v2" type="__m512"/>
+	<parameter varname="v3" type="__m512"/>
+	<parameter varname="rounding" type="int"/>
+	<description>Performs element by element addition between packed single-precision (32-bit) floating-point elements in "v2" and "v3" and negates the sum, storing the result in "dst".
+	[round_note]
+	</description>
+	<operation>
+FOR j := 0 to 15
+	i := j*32
+	dst[i+31:i] := -(v2[i+31:i] + v3[i+31:i])
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name="vaddnps" form="zmm {k}, zmm, zmm" xed=""/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="KNC" rettype="__m512" name="_mm512_mask_addn_round_ps">
+	<type>Floating Point</type>
+	<CPUID>KNCNI</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="src" type="__m512"/>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="v2" type="__m512"/>
+	<parameter varname="v3" type="__m512"/>
+	<parameter varname="rounding" type="int"/>
+	<description>Performs element by element addition between packed single-precision (32-bit) floating-point elements in "v2" and "v3" and negates the sum, storing the result in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).
+	[round_note]
+	</description>
+	<operation>
+FOR j := 0 to 15
+	i := j*32
+	IF k[j]
+		dst[i+31:i] := -(v2[i+31:i] + v3[i+31:i])
+	ELSE
+		dst[i+31:i] := src[i+31:i]
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name="vaddnps" form="zmm {k}, zmm, zmm" xed=""/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="KNC" rettype="__m512d" name="_mm512_subr_pd">
+	<type>Floating Point</type>
+	<CPUID>KNCNI</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="v2" type="__m512d"/>
+	<parameter varname="v3" type="__m512d"/>
+	<description>Performs element-by-element subtraction of packed double-precision (64-bit) floating-point elements in "v2" from "v3" storing the results in "dst".</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*64
+	dst[i+63:i] := v3[i+63:i] - v2[i+63:i]
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name="vsubrpd" form="zmm {k}, zmm, zmm" xed=""/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="KNC" rettype="__m512d" name="_mm512_mask_subr_pd">
+	<type>Floating Point</type>
+	<CPUID>KNCNI</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="src" type="__m512d"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="v2" type="__m512d"/>
+	<parameter varname="v3" type="__m512d"/>
+	<description>Performs element-by-element subtraction of packed double-precision (64-bit) floating-point elements in "v2" from "v3" storing the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*64
+	IF k[j]
+		dst[i+63:i] := v3[i+63:i] - v2[i+63:i]
+	ELSE
+		dst[i+63:i] := src[i+63:i]
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name="vsubrpd" form="zmm {k}, zmm, zmm" xed=""/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="KNC" rettype="__m512" name="_mm512_subr_ps">
+	<type>Floating Point</type>
+	<CPUID>KNCNI</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="v2" type="__m512"/>
+	<parameter varname="v3" type="__m512"/>
+	<description>Performs element-by-element subtraction of packed single-precision (32-bit) floating-point elements in "v2" from "v3" storing the results in "dst".</description>
+	<operation>
+FOR j := 0 to 15
+	i := j*32
+	dst[i+31:i] := v3[i+31:i] - v2[i+31:i]
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name="vsubrps" form="zmm {k}, zmm, zmm" xed=""/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="KNC" rettype="__m512" name="_mm512_mask_subr_ps">
+	<type>Floating Point</type>
+	<CPUID>KNCNI</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="src" type="__m512"/>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="v2" type="__m512"/>
+	<parameter varname="v3" type="__m512"/>
+	<description>Performs element-by-element subtraction of packed single-precision (32-bit) floating-point elements in "v2" from "v3" storing the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 15
+	i := j*32
+	IF k[j]
+		dst[i+31:i] := v3[i+31:i] - v2[i+31:i]
+	ELSE
+		dst[i+31:i] := src[i+31:i]
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name="vsubrps" form="zmm {k}, zmm, zmm" xed=""/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="KNC" rettype="__m512d" name="_mm512_subr_round_pd">
+	<type>Floating Point</type>
+	<CPUID>KNCNI</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="v2" type="__m512d"/>
+	<parameter varname="v3" type="__m512d"/>
+	<parameter varname="rounding" type="int"/>
+	<description>Performs element-by-element subtraction of packed double-precision (64-bit) floating-point elements in "v2" from "v3" storing the results in "dst".
+	[round_note]
+	</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*64
+	dst[i+63:i] := v3[i+63:i] - v2[i+63:i]
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name="vsubrpd" form="zmm {k}, zmm, zmm" xed=""/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="KNC" rettype="__m512d" name="_mm512_mask_subr_round_pd">
+	<type>Floating Point</type>
+	<CPUID>KNCNI</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="src" type="__m512d"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="v2" type="__m512d"/>
+	<parameter varname="v3" type="__m512d"/>
+	<parameter varname="rounding" type="int"/>
+	<description>Performs element-by-element subtraction of packed double-precision (64-bit) floating-point elements in "v2" from "v3" storing the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).
+	[round_note]
+	</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*64
+	IF k[j]
+		dst[i+63:i] := v3[i+63:i] - v2[i+63:i]
+	ELSE
+		dst[i+63:i] := src[i+63:i]
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name="vsubrpd" form="zmm {k}, zmm, zmm" xed=""/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="KNC" rettype="__m512" name="_mm512_subr_round_ps">
+	<type>Floating Point</type>
+	<CPUID>KNCNI</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="v2" type="__m512"/>
+	<parameter varname="v3" type="__m512"/>
+	<parameter varname="rounding" type="int"/>
+	<description>Performs element-by-element subtraction of packed single-precision (32-bit) floating-point elements in "v2" from "v3" storing the results in "dst".
+	[round_note]
+	</description>
+	<operation>
+FOR j := 0 to 15
+	i := j*32
+	dst[i+31:i] := v3[i+31:i] - v2[i+31:i]
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name="vsubrps" form="zmm {k}, zmm, zmm" xed=""/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="KNC" rettype="__m512" name="_mm512_mask_subr_round_ps">
+	<type>Floating Point</type>
+	<CPUID>KNCNI</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="src" type="__m512"/>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="v2" type="__m512"/>
+	<parameter varname="v3" type="__m512"/>
+	<parameter varname="rounding" type="int"/>
+	<description>Performs element-by-element subtraction of packed single-precision (32-bit) floating-point elements in "v2" from "v3" storing the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).
+	[round_note]
+	</description>
+	<operation>
+FOR j := 0 to 15
+	i := j*32
+	IF k[j]
+		dst[i+31:i] := v3[i+31:i] - v2[i+31:i]
+	ELSE
+		dst[i+31:i] := src[i+31:i]
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name="vsubrps" form="zmm {k}, zmm, zmm" xed=""/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="KNC" rettype="__m512i" name="_mm512_subr_epi32">
+	<type>Integer</type>
+	<CPUID>KNCNI</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="v2" type="__m512i"/>
+	<parameter varname="v3" type="__m512i"/>
+	<description>Performs element-by-element subtraction of packed 32-bit integer elements in "v2" from "v3" storing the results in "dst".</description>
+	<operation>
+FOR j := 0 to 15
+	i := j*32
+	dst[i+31:i] := v3[i+31:i] - v2[i+31:i]
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name="vpsubrd" form="zmm {k}, zmm, zmm" xed=""/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="KNC" rettype="__m512i" name="_mm512_mask_subr_epi32">
+	<type>Integer</type>
+	<CPUID>KNCNI</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="src" type="__m512i"/>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="v2" type="__m512i"/>
+	<parameter varname="v3" type="__m512i"/>
+	<description>Performs element-by-element subtraction of packed 32-bit integer elements in "v2" from "v3" storing the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 15
+	i := j*32
+	IF k[j]
+		dst[i+31:i] := v3[i+31:i] - v2[i+31:i]
+	ELSE
+		dst[i+31:i] := src[i+31:i]
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name="vpsubrd" form="zmm {k}, zmm, zmm" xed=""/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="KNC" rettype="__m512i" name="_mm512_addsetc_epi32">
+	<type>Integer</type>
+	<CPUID>KNCNI</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="v2" type="__m512i"/>
+	<parameter varname="v3" type="__m512i"/>
+	<parameter varname="k2_res" type="__mmask16 *"/>
+	<description>Performs element-by-element addition of packed 32-bit integer elements in "v2" and "v3", storing the resultant carry in "k2_res" (carry flag) and the addition results in "dst".</description>
+	<operation>
+FOR j := 0 to 15
+	i := j*32
+	dst[i+31:i] := v2[i+31:i] + v3[i+31:i]
+	k2_res[j] := Carry(v2[i+31:i] + v3[i+31:i])
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name="vpaddsetcd" form="zmm {k}, k, zmm" xed=""/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="KNC" rettype="__m512i" name="_mm512_mask_addsetc_epi32">
+	<type>Integer</type>
+	<CPUID>KNCNI</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="v2" type="__m512i"/>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="k_old" type="__mmask16"/>
+	<parameter varname="v3" type="__m512i"/>
+	<parameter varname="k2_res" type="__mmask16 *"/>
+	<description>Performs element-by-element addition of packed 32-bit integer elements in "v2" and "v3", storing the resultant carry in "k2_res" (carry flag) and the addition results in "dst" using writemask "k" (elements are copied from "v2" and "k_old" when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 15
+	i := j*32
+	IF k[j]
+		dst[i+31:i] := v2[i+31:i] + v3[i+31:i]
+	ELSE
+		dst[i+31:i] := v2[i+31:i]
+		k2_res[j] := k_old[j]
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name="vpaddsetcd" form="zmm {k}, k, zmm" xed=""/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="KNC" rettype="__m512i" name="_mm512_addsets_epi32">
+	<type>Integer</type>
+	<CPUID>KNCNI</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="v2" type="__m512i"/>
+	<parameter varname="v3" type="__m512i"/>
+	<parameter varname="sign" type="__mmask16 *"/>
+	<description>Performs an element-by-element addition of packed 32-bit integer elements in "v2" and "v3", storing the results in "dst" and the sign of the sum in "sign" (sign flag).</description>
+	<operation>
+FOR j := 0 to 15
+	i := j*32
+	dst[i+31:i] := v2[i+31:i] + v3[i+31:i]
+	sign[j] := v2[i+31:i] &amp; v3[i+31:i] &amp; 0x80000000
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name="vpaddsetsd" form="zmm {k}, zmm, zmm" xed=""/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="KNC" rettype="__m512i" name="_mm512_mask_addsets_epi32">
+	<type>Integer</type>
+	<CPUID>KNCNI</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="src" type="__m512i"/>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="v2" type="__m512i"/>
+	<parameter varname="v3" type="__m512i"/>
+	<parameter varname="sign" type="__mmask16 *"/>
+	<description>Performs an element-by-element addition of packed 32-bit integer elements in "v2" and "v3", storing the results in "dst" and the sign of the sum in "sign" (sign flag). Results are stored using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 15
+	i := j*32
+	IF k[j]
+		dst[i+31:i] := v2[i+31:i] + v3[i+31:i]
+		sign[j] := v2[i+31:i] &amp; v3[i+31:i] &amp; 0x80000000
+	ELSE
+		dst[i+31:i] := src[i+31:i]
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name="vpaddsetsd" form="zmm {k}, zmm, zmm" xed=""/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="KNC" rettype="__m512" name="_mm512_addsets_ps">
+	<type>Floating Point</type>
+	<CPUID>KNCNI</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="v2" type="__m512"/>
+	<parameter varname="v3" type="__m512"/>
+	<parameter varname="sign" type="__mmask16 *"/>
+	<description>Performs an element-by-element addition of packed single-precision (32-bit) floating-point elements in "v2" and "v3", storing the results in "dst" and the sign of the sum in "sign" (sign flag).</description>
+	<operation>
+FOR j := 0 to 15
+	i := j*32
+	dst[i+31:i] := v2[i+31:i] + v3[i+31:i]
+	sign[j] := v2[i+31:i] &amp; v3[i+31:i] &amp; 0x80000000
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name="vaddsetsps" form="zmm {k}, zmm, zmm" xed=""/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="KNC" rettype="__m512" name="_mm512_mask_addsets_ps">
+	<type>Floating Point</type>
+	<CPUID>KNCNI</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="src" type="__m512"/>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="v2" type="__m512"/>
+	<parameter varname="v3" type="__m512"/>
+	<parameter varname="sign" type="__mmask16 *"/>
+	<description>Performs an element-by-element addition of packed single-precision (32-bit) floating-point elements in "v2" and "v3", storing the results in "dst" and the sign of the sum in "sign" (sign flag). Results are stored using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 15
+	i := j*32
+	IF k[j]
+		dst[i+31:i] := v2[i+31:i] + v3[i+31:i]
+		sign[j] := v2[i+31:i] &amp; v3[i+31:i] &amp; 0x80000000
+	ELSE
+		dst[i+31:i] := src[i+31:i]
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name="vaddsetsps" form="zmm {k}, zmm, zmm" xed=""/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="KNC" rettype="__m512" name="_mm512_addsets_round_ps">
+	<type>Floating Point</type>
+	<CPUID>KNCNI</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="v2" type="__m512"/>
+	<parameter varname="v3" type="__m512"/>
+	<parameter varname="sign" type="__mmask16 *"/>
+	<parameter varname="rounding" type="int"/>
+	<description>Performs an element-by-element addition of packed single-precision (32-bit) floating-point elements in "v2" and "v3", storing the results in "dst" and the sign of the sum in "sign" (sign flag).
+	[round_note]
+	</description>
+	<operation>
+FOR j := 0 to 15
+	i := j*32
+	dst[i+31:i] := v2[i+31:i] + v3[i+31:i]
+	sign[j] := v2[i+31:i] &amp; v3[i+31:i] &amp; 0x80000000
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name="vaddsetsps" form="zmm {k}, zmm, zmm" xed=""/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="KNC" rettype="__m512" name="_mm512_mask_addsets_round_ps">
+	<type>Floating Point</type>
+	<CPUID>KNCNI</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="src" type="__m512"/>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="v2" type="__m512"/>
+	<parameter varname="v3" type="__m512"/>
+	<parameter varname="sign" type="__mmask16 *"/>
+	<parameter varname="rounding" type="int"/>
+	<description>Performs an element-by-element addition of packed single-precision (32-bit) floating-point elements in "v2" and "v3", storing the results in "dst" and the sign of the sum in "sign" (sign flag). Results are stored using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).
+	[round_note]
+	</description>
+	<operation>
+FOR j := 0 to 15
+	i := j*32
+	IF k[j]
+		dst[i+31:i] := v2[i+31:i] + v3[i+31:i]
+		sign[j] := v2[i+31:i] &amp; v3[i+31:i] &amp; 0x80000000
+	ELSE
+		dst[i+31:i] := src[i+31:i]
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name="vaddsetsps" form="zmm {k}, zmm, zmm" xed=""/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="KNC" rettype="__m512i" name="_mm512_subsetb_epi32">
+	<type>Integer</type>
+	<CPUID>KNCNI</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="v2" type="__m512i"/>
+	<parameter varname="v3" type="__m512i"/>
+	<parameter varname="borrow" type="__mmask16 *"/>
+	<description>Performs element-by-element subtraction of packed 32-bit integer elements in "v3" from "v2", storing the results in "dst" and the nth borrow bit in the nth position of "borrow" (borrow flag).</description>
+	<operation>
+FOR j := 0 to 15
+	i := j*32
+	dst[i+31:i] := v2[i+31:i] - v3[i+31:i]
+	borrow[j] := Borrow(v2[i+31:i] - v3[i+31:i])
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name="vpsubsetbd" form="zmm {k}, k, zmm" xed=""/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="KNC" rettype="__m512i" name="_mm512_mask_subsetb_epi32">
+	<type>Integer</type>
+	<CPUID>KNCNI</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="v2" type="__m512i"/>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="k_old" type="__mmask16"/>
+	<parameter varname="v3" type="__m512i"/>
+	<parameter varname="borrow" type="__mmask16 *"/>
+	<description>Performs element-by-element subtraction of packed 32-bit integer elements in "v3" from "v2", storing the results in "dst" and the nth borrow bit in the nth position of "borrow" (borrow flag). Results are stored using writemask "k" (elements are copied from "v2" and "k_old" when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 15
+	i := j*32
+	IF k[j]
+		dst[i+31:i] := v2[i+31:i] - v3[i+31:i]
+		borrow[j] := Borrow(v2[i+31:i] - v3[i+31:i])
+	ELSE
+		dst[i+31:i] := v2[i+31:i]
+		borrow[j] := k_old[j]
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name="vpsubsetbd" form="zmm {k}, k, zmm" xed=""/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="KNC" rettype="__m512i" name="_mm512_subrsetb_epi32">
+	<type>Integer</type>
+	<CPUID>KNCNI</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="v2" type="__m512i"/>
+	<parameter varname="v3" type="__m512i"/>
+	<parameter varname="borrow" type="__mmask16 *"/>
+	<description>Performs element-by-element subtraction of packed 32-bit integer elements in "v2" from "v3", storing the results in "dst" and "v2". The borrowed value from the subtraction difference for the nth element is written to the nth bit of "borrow" (borrow flag).</description>
+	<operation>
+FOR j := 0 to 15
+	i := j*32
+	dst[i+31:i] := v3[i+31:i] - v2[i+31:i]
+	borrow[j] := Borrow(v3[i+31:i] - v2[i+31:i])
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name="vpsubrsetbd" form="zmm {k}, k, zmm" xed=""/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="KNC" rettype="__m512i" name="_mm512_mask_subrsetb_epi32">
+	<type>Integer</type>
+	<CPUID>KNCNI</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="v2" type="__m512i"/>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="k_old" type="__mmask16"/>
+	<parameter varname="v3" type="__m512i"/>
+	<parameter varname="borrow" type="__mmask16 *"/>
+	<description>Performs element-by-element subtraction of packed 32-bit integer elements in "v2" from "v3", storing the results in "dst" and "v2". The borrowed value from the subtraction difference for the nth element is written to the nth bit of "borrow" (borrow flag). Results are written using writemask "k" (bits of "borrow" are copied from "k_old" when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 15
+	i := j*32
+	IF k[j]
+		diff := v3[i+31:i] - v2[i+31:i]
+		borrow[j] := Borrow(v3[i+31:i] - v2[i+31:i])
+		dst[i+31:i] := diff
+		v2[i+31:i] := diff
+	ELSE
+		borrow[j] := k_old[j]
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name="vpsubrsetbd" form="zmm {k}, k, zmm" xed=""/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="KNC" rettype="__m512i" name="_mm512_sbb_epi32">
+	<type>Integer</type>
+	<CPUID>KNCNI</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="v2" type="__m512i"/>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="v3" type="__m512i"/>
+	<parameter varname="borrow" type="__mmask16 *"/>
+	<description>Performs element-by-element three-input subtraction of packed 32-bit integer elements of "v3" as well as the corresponding bit from "k" from "v2". The borrowed value from the subtraction difference for the nth element is written to the nth bit of "borrow" (borrow flag). Results are stored in "dst".</description>
+	<operation>
+FOR j := 0 to 15
+	i := j*32
+	dst[i+31:i] := v2[i+31:i] - v3[i+31:i] - k[j]
+	borrow[j] := Borrow(v2[i+31:i] - v3[i+31:i] - k[j])
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name="vpsbbd" form="zmm {k}, k, zmm" xed=""/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="KNC" rettype="__m512i" name="_mm512_mask_sbb_epi32">
+	<type>Integer</type>
+	<CPUID>KNCNI</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="v2" type="__m512i"/>
+	<parameter varname="k1" type="__mmask16"/>
+	<parameter varname="k2" type="__mmask16"/>
+	<parameter varname="v3" type="__m512i"/>
+	<parameter varname="borrow" type="__mmask16 *"/>
+	<description>Performs element-by-element three-input subtraction of packed 32-bit integer elements of "v3" as well as the corresponding bit from "k2" from "v2". The borrowed value from the subtraction difference for the nth element is written to the nth bit of "borrow" (borrow flag). Results are stored in "dst" using writemask "k1" (elements are copied from "v2" when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 15
+	i := j*32
+	IF k1[j]
+		dst[i+31:i] := v2[i+31:i] - v3[i+31:i] - k2[j]
+		borrow[j] := Borrow(v2[i+31:i] - v3[i+31:i] - k2[j])
+	ELSE
+		dst[i+31:i] := v2[i+31:i]
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name="vpsbbd" form="zmm {k}, k, zmm" xed=""/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="KNC" rettype="__m512i" name="_mm512_sbbr_epi32">
+	<type>Integer</type>
+	<CPUID>KNCNI</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="v2" type="__m512i"/>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="v3" type="__m512i"/>
+	<parameter varname="borrow" type="__mmask16 *"/>
+	<description>Performs element-by-element three-input subtraction of packed 32-bit integer elements of "v2" as well as the corresponding bit from "k" from "v3". The borrowed value from the subtraction difference for the nth element is written to the nth bit of "borrow" (borrow flag). Results are stored in "dst".</description>
+	<operation>
+FOR j := 0 to 15
+	i := j*32
+	dst[i+31:i] := v3[i+31:i] - v2[i+31:i] - k[j]
+	borrow[j] := Borrow(v3[i+31:i] - v2[i+31:i] - k[j])
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name="vpsbbrd" form="zmm {k}, k, zmm" xed=""/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="KNC" rettype="__m512i" name="_mm512_mask_sbbr_epi32">
+	<type>Integer</type>
+	<CPUID>KNCNI</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="v2" type="__m512i"/>
+	<parameter varname="k1" type="__mmask16"/>
+	<parameter varname="k2" type="__mmask16"/>
+	<parameter varname="v3" type="__m512i"/>
+	<parameter varname="borrow" type="__mmask16 *"/>
+	<description>Performs element-by-element three-input subtraction of packed 32-bit integer elements of "v2" as well as the corresponding bit from "k2" from "v3". The borrowed value from the subtraction difference for the nth element is written to the nth bit of "borrow" (borrow flag). Results are stored in "dst" using writemask "k1" (elements are copied from "v2" when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 15
+	i := j*32
+	IF k1[j]
+		dst[i+31:i] := v3[i+31:i] - v2[i+31:i] - k2[j]
+		borrow[j] := Borrow(v3[i+31:i] - v2[i+31:i] - k2[j])
+	ELSE
+		dst[i+31:i] := v2[i+31:i]
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name="vpsbbrd" form="zmm {k}, k, zmm" xed=""/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" rettype="__m512i" name="_mm512_mask_and_epi32">
+	<type>Integer</type>
+	<CPUID>AVX512F/KNCNI</CPUID>
+	<category>Logical</category>
+	<parameter varname="src" type="__m512i"/>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="v2" type="__m512i"/>
+	<parameter varname="v3" type="__m512i"/>
+	<description>Performs element-by-element bitwise AND between packed 32-bit integer elements of "v2" and "v3", storing the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 15
+	i := j*32
+	IF k[j]
+		dst[i+31:i] := v2[i+31:i] &amp; v3[i+31:i]
+	ELSE
+		dst[i+31:i] := src[i+31:i]
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name="vpandd" form="zmm {k}, zmm, zmm" xed=""/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="KNC" rettype="__m512" name="_mm512_cvt_roundpd_pslo">
+	<type>Floating Point</type>
+	<CPUID>KNCNI</CPUID>
+	<category>Convert</category>
+	<parameter varname="v2" type="__m512d"/>
+	<parameter varname="rounding" type="int"/>
+	<description>Performs element-by-element conversion of packed double-precision (64-bit) floating-point elements in "v2" to packed single-precision (32-bit) floating-point elements, storing the results in "dst". Results are written to the lower half of "dst", and the upper half locations are set to '0'.
+	[round_note]
+	</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*64
+	k := j*32
+	dst[k+31:k] := Float64ToFloat32(v2[i+63:i])
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name="vcvtpd2ps" form="zmm {k}, zmm" xed=""/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="KNC" rettype="__m512" name="_mm512_mask_cvt_roundpd_pslo">
+	<type>Floating Point</type>
+	<CPUID>KNCNI</CPUID>
+	<category>Convert</category>
+	<parameter varname="src" type="__m512"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="v2" type="__m512d"/>
+	<parameter varname="rounding" type="int"/>
+	<description>Performs element-by-element conversion of packed double-precision (64-bit) floating-point elements in "v2" to packed single-precision (32-bit) floating-point elements, storing the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). Results are written to the lower half of "dst", and the upper half locations are set to '0'.
+	[round_note]
+	</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*64
+	l := j*32
+	IF k[j]
+		dst[l+31:l] := Float64ToFloat32(v2[i+63:i])
+	ELSE
+		dst[l+31:l] := src[l+31:l]
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name="vcvtpd2ps" form="zmm {k}, zmm" xed=""/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="KNC" rettype="__m512i" name="_mm512_cvtfxpnt_roundpd_epu32lo">
+	<type>Floating Point</type>
+	<type>Integer</type>
+	<CPUID>KNCNI</CPUID>
+	<category>Convert</category>
+	<parameter varname="v2" type="__m512d"/>
+	<parameter varname="rounding" type="int"/>
+	<description>Performs element-by-element conversion of packed double-precision (64-bit) floating-point elements in "v2" to packed 32-bit unsigned integer elements, storing the results in "dst". Results are written to the lower half of "dst", and the upper half locations are set to '0'.
+	[round_note]
+	</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*64
+	k := j*32
+	dst[k+31:k] := Float64ToUInt32(v2[i+63:i])
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name="vcvtfxpntpd2udq" form="zmm {k}, zmm, imm" xed=""/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="KNC" rettype="__m512i" name="_mm512_mask_cvtfxpnt_roundpd_epu32lo">
+	<type>Floating Point</type>
+	<type>Integer</type>
+	<CPUID>KNCNI</CPUID>
+	<category>Convert</category>
+	<parameter varname="src" type="__m512i"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="v2" type="__m512d"/>
+	<parameter varname="rounding" type="int"/>
+	<description>Performs element-by-element conversion of packed double-precision (64-bit) floating-point elements in "v2" to packed 32-bit unsigned integer elements, storing the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). Results are written to the lower half of "dst", and the upper half locations are set to '0'.
+	[round_note]
+	</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*64
+	l := j*32
+	IF k[j]
+		dst[l+31:l] := Float64ToUInt32(v2[i+63:i])
+	ELSE
+		dst[l+31:l] := src[l+31:l]
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name="vcvtfxpntpd2udq" form="zmm {k}, zmm, imm" xed=""/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" rettype="__m512d" name="_mm512_cvtpslo_pd">
+	<type>Floating Point</type>
+	<CPUID>AVX512F/KNCNI</CPUID>
+	<category>Convert</category>
+	<parameter varname="v2" type="__m512"/>
+	<description>Performs element-by-element conversion of the lower half of packed single-precision (32-bit) floating-point elements in "v2" to packed double-precision (64-bit) floating-point elements, storing the results in "dst".</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*32
+	k := j*64
+	dst[k+63:k] := Float32ToFloat64(v2[i+31:i])
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name="vcvtps2pd" form="zmm {k}, zmm" xed=""/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" rettype="__m512d" name="_mm512_mask_cvtpslo_pd">
+	<type>Floating Point</type>
+	<CPUID>AVX512F/KNCNI</CPUID>
+	<category>Convert</category>
+	<parameter varname="src" type="__m512d"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="v2" type="__m512"/>
+	<description>Performs element-by-element conversion of the lower half of packed single-precision (32-bit) floating-point elements in "v2" to packed double-precision (64-bit) floating-point elements, storing the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*32
+	l := j*64
+	IF k[j]
+		dst[l+63:l] := Float32ToFloat64(v2[i+31:i])
+	ELSE
+		dst[l+63:l] := src[l+63:l]
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name="vcvtps2pd" form="zmm {k}, zmm" xed=""/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="KNC" rettype="__m512i" name="_mm512_cvtfxpnt_round_adjustps_epi32">
+	<type>Floating Point</type>
+	<type>Integer</type>
+	<CPUID>KNCNI</CPUID>
+	<category>Convert</category>
+	<parameter varname="v2" type="__m512"/>
+	<parameter varname="rounding" type="int"/>
+	<parameter varname="expadj" type="_MM_EXP_ADJ_ENUM"/>
+	<description>Performs element-by-element conversion of packed single-precision (32-bit) floating-point elements in "v2" to packed 32-bit integer elements and performs an optional exponent adjust using "expadj", storing the results in "dst".
+	[round_note]
+	</description>
+	<operation>
+FOR j := 0 to 15
+	i := j*32
+	dst[i+31:i] := Float32ToInt32(v2[i+31:i])
+	CASE expadj OF
+	_MM_EXPADJ_NONE: dst[i+31:i] = dst[i+31:i] * 2**0
+	_MM_EXPADJ_4:	 dst[i+31:i] = dst[i+31:i] * 2**4
+	_MM_EXPADJ_5:	 dst[i+31:i] = dst[i+31:i] * 2**5
+	_MM_EXPADJ_8:	 dst[i+31:i] = dst[i+31:i] * 2**8
+	_MM_EXPADJ_16:   dst[i+31:i] = dst[i+31:i] * 2**16
+	_MM_EXPADJ_24:   dst[i+31:i] = dst[i+31:i] * 2**24
+	_MM_EXPADJ_31:   dst[i+31:i] = dst[i+31:i] * 2**31
+	_MM_EXPADJ_32:   dst[i+31:i] = dst[i+31:i] * 2**32
+	ESAC
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name="vcvtfxpntps2dq" form="zmm {k}, zmm, imm" xed=""/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="KNC" rettype="__m512i" name="_mm512_cvtfxpnt_round_adjustps_epu32">
+	<type>Floating Point</type>
+	<type>Integer</type>
+	<CPUID>KNCNI</CPUID>
+	<category>Convert</category>
+	<parameter varname="v2" type="__m512"/>
+	<parameter varname="rounding" type="int"/>
+	<parameter varname="expadj" type="_MM_EXP_ADJ_ENUM"/>
+	<description>Performs element-by-element conversion of packed single-precision (32-bit) floating-point elements in "v2" to packed 32-bit unsigned integer elements and performs an optional exponent adjust using "expadj", storing the results in "dst".
+	[round_note]
+	</description>
+	<operation>
+FOR j := 0 to 15
+	i := j*32
+	dst[i+31:i] := Float32ToUInt32(v2[i+31:i])
+	CASE expadj OF
+	_MM_EXPADJ_NONE: dst[i+31:i] = dst[i+31:i] * 2**0
+	_MM_EXPADJ_4:	 dst[i+31:i] = dst[i+31:i] * 2**4
+	_MM_EXPADJ_5:	 dst[i+31:i] = dst[i+31:i] * 2**5
+	_MM_EXPADJ_8:	 dst[i+31:i] = dst[i+31:i] * 2**8
+	_MM_EXPADJ_16:   dst[i+31:i] = dst[i+31:i] * 2**16
+	_MM_EXPADJ_24:   dst[i+31:i] = dst[i+31:i] * 2**24
+	_MM_EXPADJ_31:   dst[i+31:i] = dst[i+31:i] * 2**31
+	_MM_EXPADJ_32:   dst[i+31:i] = dst[i+31:i] * 2**32
+	ESAC
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name="vcvtfxpntps2udq" form="zmm {k}, zmm, imm" xed=""/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" rettype="__m512d" name="_mm512_cvtepi32lo_pd">
+	<type>Floating Point</type>
+	<type>Integer</type>
+	<CPUID>AVX512F/KNCNI</CPUID>
+	<category>Convert</category>
+	<parameter varname="v2" type="__m512i"/>
+	<description>Performs element-by-element conversion of the lower half of packed 32-bit integer elements in "v2" to packed double-precision (64-bit) floating-point elements, storing the results in "dst".</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*32
+	l := j*64
+	dst[l+63:l] := Int32ToFloat64(v2[i+31:i])
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name="vcvtdq2pd" form="zmm {k}, zmm" xed=""/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" rettype="__m512d" name="_mm512_mask_cvtepi32lo_pd">
+	<type>Floating Point</type>
+	<type>Integer</type>
+	<CPUID>AVX512F/KNCNI</CPUID>
+	<category>Convert</category>
+	<parameter varname="src" type="__m512d"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="v2" type="__m512i"/>
+	<description>Performs element-by-element conversion of the lower half of packed 32-bit integer elements in "v2" to packed double-precision (64-bit) floating-point elements, storing the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*32
+	n := j*64
+	IF k[j]
+		dst[n+63:n] := Int32ToFloat64(v2[i+31:i])
+	ELSE
+		dst[n+63:n] := src[n+63:n]
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name="vcvtdq2pd" form="zmm {k}, zmm" xed=""/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" rettype="__m512d" name="_mm512_cvtepu32lo_pd">
+	<type>Floating Point</type>
+	<type>Integer</type>
+	<CPUID>AVX512F/KNCNI</CPUID>
+	<category>Convert</category>
+	<parameter varname="v2" type="__m512i"/>
+	<description>Performs element-by-element conversion of the lower half of packed 32-bit unsigned integer elements in "v2" to packed double-precision (64-bit) floating-point elements, storing the results in "dst".</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*32
+	k := j*64
+	dst[k+63:k] := UInt32ToFloat64(v2[i+31:i])
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name="vcvtudq2pd" form="zmm {k}, zmm" xed=""/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" rettype="__m512d" name="_mm512_mask_cvtepu32lo_pd">
+	<type>Floating Point</type>
+	<type>Integer</type>
+	<CPUID>AVX512F/KNCNI</CPUID>
+	<category>Convert</category>
+	<parameter varname="src" type="__m512d"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="v2" type="__m512i"/>
+	<description>Performs element-by-element conversion of the lower half of packed 32-bit unsigned integer elements in "v2" to packed double-precision (64-bit) floating-point elements, storing the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*32
+	l := j*64
+	IF k[j]
+		dst[l+63:l] := UInt32ToFloat64(v2[i+31:i])
+	ELSE
+		dst[l+63:l] := src[l+63:l]
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name="vcvtudq2pd" form="zmm {k}, zmm" xed=""/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="KNC" rettype="__m512" name="_mm512_cvtfxpnt_round_adjustepu32_ps">
+	<type>Floating Point</type>
+	<type>Integer</type>
+	<CPUID>KNCNI</CPUID>
+	<category>Convert</category>
+	<parameter varname="v2" type="__m512i"/>
+	<parameter varname="rounding" type="int"/>
+	<parameter varname="expadj" type="_MM_EXP_ADJ_ENUM"/>
+	<description>Performs element-by-element conversion of packed 32-bit unsigned integer elements in "v2" to packed single-precision (32-bit) floating-point elements and performs an optional exponent adjust using "expadj", storing the results in "dst".
+	[round_note]
+	</description>
+	<operation>
+FOR j := 0 to 15
+	i := j*32
+	dst[i+31:i] := UInt32ToFloat32(v2[i+31:i])
+	CASE expadj OF
+	_MM_EXPADJ_NONE: dst[i+31:i] = dst[i+31:i] * 2**0
+	_MM_EXPADJ_4:	 dst[i+31:i] = dst[i+31:i] * 2**4
+	_MM_EXPADJ_5:	 dst[i+31:i] = dst[i+31:i] * 2**5
+	_MM_EXPADJ_8:	 dst[i+31:i] = dst[i+31:i] * 2**8
+	_MM_EXPADJ_16:   dst[i+31:i] = dst[i+31:i] * 2**16
+	_MM_EXPADJ_24:   dst[i+31:i] = dst[i+31:i] * 2**24
+	_MM_EXPADJ_31:   dst[i+31:i] = dst[i+31:i] * 2**31
+	_MM_EXPADJ_32:   dst[i+31:i] = dst[i+31:i] * 2**32
+	ESAC
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name="vcvtfxpntudq2ps" form="zmm {k}, zmm, imm" xed=""/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="KNC" rettype="__m512" name="_mm512_mask_cvtfxpnt_round_adjustepu32_ps">
+	<type>Floating Point</type>
+	<type>Integer</type>
+	<CPUID>KNCNI</CPUID>
+	<category>Convert</category>
+	<parameter varname="src" type="__m512"/>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="v2" type="__m512i"/>
+	<parameter varname="rounding" type="int"/>
+	<parameter varname="expadj" type="_MM_EXP_ADJ_ENUM"/>
+	<description>Performs element-by-element conversion of packed 32-bit unsigned integer elements in "v2" to packed single-precision (32-bit) floating-point elements and performs an optional exponent adjust using "expadj", storing the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).
+	[round_note]
+	</description>
+	<operation>
+FOR j := 0 to 15
+	i := j*32
+	IF k[j]
+		dst[i+31:i] := UInt32ToFloat32(v2[i+31:i])
+		CASE expadj OF
+		_MM_EXPADJ_NONE: dst[i+31:i] = dst[i+31:i] * 2**0
+		_MM_EXPADJ_4:	 dst[i+31:i] = dst[i+31:i] * 2**4
+		_MM_EXPADJ_5:	 dst[i+31:i] = dst[i+31:i] * 2**5
+		_MM_EXPADJ_8:	 dst[i+31:i] = dst[i+31:i] * 2**8
+		_MM_EXPADJ_16:   dst[i+31:i] = dst[i+31:i] * 2**16
+		_MM_EXPADJ_24:   dst[i+31:i] = dst[i+31:i] * 2**24
+		_MM_EXPADJ_31:   dst[i+31:i] = dst[i+31:i] * 2**31
+		_MM_EXPADJ_32:   dst[i+31:i] = dst[i+31:i] * 2**32
+		ESAC
+	ELSE
+		dst[i+31:i] := src[i+31:i]
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name="vcvtfxpntudq2ps" form="zmm {k}, zmm, imm" xed=""/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="KNC" rettype="__m512" name="_mm512_exp223_ps">
+	<type>Floating Point</type>
+	<CPUID>KNCNI</CPUID>
+	<category>Elementary Math Functions</category>
+	<parameter varname="v2" type="__m512i"/>
+	<description>Approximates the base-2 exponent of the packed single-precision (32-bit) floating-point elements in "v2" with eight bits for sign and magnitude and 24 bits for the fractional part. Results are stored in "dst".</description>
+	<operation>
+FOR j := 0 to 15
+	i := j*32
+	dst[i+31:i] := exp223(v2[i+31:i])
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name="vexp223ps" form="zmm {k}, zmm" xed=""/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="KNC" rettype="__m512" name="_mm512_mask_exp223_ps">
+	<type>Floating Point</type>
+	<CPUID>KNCNI</CPUID>
+	<category>Elementary Math Functions</category>
+	<parameter varname="src" type="__m512"/>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="v2" type="__m512i"/>
+	<description>Approximates the base-2 exponent of the packed single-precision (32-bit) floating-point elements in "v2" with eight bits for sign and magnitude and 24 bits for the fractional part. Results are stored in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 15
+	i := j*32
+	IF k[j]
+		dst[i+31:i] := exp223(v2[i+31:i])
+	ELSE
+		dst[i+31:i] := src[i+31:i]
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name="vexp223ps" form="zmm {k}, zmm" xed=""/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="KNC" rettype="__m512d" name="_mm512_fixupnan_pd">
+	<type>Floating Point</type>
+	<CPUID>KNCNI</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="v1" type="__m512d"/>
+	<parameter varname="v2" type="__m512d"/>
+	<parameter varname="v3" type="__m512i"/>
+	<description>Fixes up NaN's from packed double-precision (64-bit) floating-point elements in "v1" and "v2", storing the results in "dst" and storing the quietized NaN's from "v1" in "v3".</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*64
+	dst[i+63:i] := FixupNaNs(v1[i+63:i], v2[i+63:i])
+	v3[i+63:i] := QuietizeNaNs(v1[i+63:i])
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name="vfixupnanpd" form="zmm {k}, zmm, zmm" xed=""/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="KNC" rettype="__m512d" name="_mm512_mask_fixupnan_pd">
+	<type>Floating Point</type>
+	<CPUID>KNCNI</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="v1" type="__m512d"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="v2" type="__m512d"/>
+	<parameter varname="v3" type="__m512i"/>
+	<description>Fixes up NaN's from packed double-precision (64-bit) floating-point elements in "v1" and "v2", storing the results in "dst" using writemask "k" (only elements whose corresponding mask bit is set are used in the computation). Quietized NaN's from "v1" are stored in "v3".</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*64
+	IF k[j]
+		dst[i+63:i] := FixupNaNs(v1[i+63:i], v2[i+63:i])
+		v3[i+63:i] := QuietizeNaNs(v1[i+63:i])
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name="vfixupnanpd" form="zmm {k}, zmm, zmm" xed=""/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="KNC" rettype="__m512" name="_mm512_fixupnan_ps">
+	<type>Floating Point</type>
+	<CPUID>KNCNI</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="v1" type="__m512"/>
+	<parameter varname="v2" type="__m512"/>
+	<parameter varname="v3" type="__m512i"/>
+	<description>Fixes up NaN's from packed single-precision (32-bit) floating-point elements in "v1" and "v2", storing the results in "dst" and storing the quietized NaN's from "v1" in "v3".</description>
+	<operation>
+FOR j := 0 to 15
+	i := j*32
+	dst[i+31:i] := FixupNaNs(v1[i+31:i], v2[i+31:i])
+	v3[i+31:i] := QuietizeNaNs(v1[i+31:i])
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name="vfixupnanps" form="zmm {k}, zmm, zmm" xed=""/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="KNC" rettype="__m512" name="_mm512_mask_fixupnan_ps">
+	<type>Floating Point</type>
+	<CPUID>KNCNI</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="v1" type="__m512"/>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="v2" type="__m512"/>
+	<parameter varname="v3" type="__m512i"/>
+	<description>Fixes up NaN's from packed single-precision (32-bit) floating-point elements in "v1" and "v2", storing the results in "dst" using writemask "k" (only elements whose corresponding mask bit is set are used in the computation). Quietized NaN's from "v1" are stored in "v3".</description>
+	<operation>
+FOR j := 0 to 15
+	i := j*32
+	IF k[j]
+		dst[i+31:i] := FixupNaNs(v1[i+31:i], v2[i+31:i])
+		v3[i+31:i] := QuietizeNaNs(v1[i+31:i])
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name="vfixupnanps" form="zmm {k}, zmm, zmm" xed=""/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" rettype="__m512i" name="_mm512_i32extgather_epi32">
+	<type>Integer</type>
+	<CPUID>AVX512F/KNCNI</CPUID>
+	<category>Load</category>
+	<parameter varname="index" type="__m512i"/>
+	<parameter varname="mv" type="void const *"/>
+	<parameter varname="conv" type="_MM_UPCONV_EPI32_ENUM"/>
+	<parameter varname="scale" type="int"/>
+	<parameter varname="hint" type="int"/>
+	<description>Up-converts 16 memory locations starting at location "mv" at packed 32-bit integer indices stored in "index" scaled by "scale" using "conv" to 32-bit integer elements and stores them in "dst". AVX512 supports _MM_UPCONV_EPI32_NONE.</description>
+	<operation>
+FOR j := 0 to 15
+	addr := MEM[mv + index[j] * scale]
+	i := j*32
+	CASE conv OF
+	_MM_UPCONV_EPI32_NONE:
+		dst[i+31:i] := addr[i+31:i]
+	_MM_UPCONV_EPI32_UINT8:
+		n := j*8
+		dst[i+31:i] := UInt8ToUInt32(addr[n+7:n])
+	_MM_UPCONV_EPI32_SINT8:
+		n := j*8
+		dst[i+31:i] := Int8ToInt32(addr[n+7:n])
+	_MM_UPCONV_EPI32_UINT16:
+		n := j*16
+		dst[i+31:i] := UInt16ToUInt32(addr[n+15:n])
+	_MM_UPCONV_EPI32_SINT16:
+		n := j*16
+		dst[i+31:i] := Int16ToInt32(addr[n+15:n])
+	ESAC
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name="vpgatherdd" form="zmm {k}, m512" xed=""/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" rettype="__m512i" name="_mm512_mask_i32extgather_epi32">
+	<type>Integer</type>
+	<CPUID>AVX512F/KNCNI</CPUID>
+	<category>Load</category>
+	<parameter varname="src" type="__m512i"/>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="index" type="__m512i"/>
+	<parameter varname="mv" type="void const *"/>
+	<parameter varname="conv" type="_MM_UPCONV_EPI32_ENUM"/>
+	<parameter varname="scale" type="int"/>
+	<parameter varname="hint" type="int"/>
+	<description>Up-converts 16 single-precision (32-bit) memory locations starting at location "mv" at packed 32-bit integer indices stored in "index" scaled by "scale" using "conv" to 32-bit integer elements and stores them in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). AVX512 supports _MM_UPCONV_EPI32_NONE.</description>
+	<operation>
+FOR j := 0 to 15
+	addr := MEM[mv + index[j] * scale]
+	i := j*32
+	IF k[j]
+		CASE conv OF
+		_MM_UPCONV_EPI32_NONE:
+			dst[i+31:i] := addr[i+31:i]
+		_MM_UPCONV_EPI32_UINT8:
+			n := j*8
+			dst[i+31:i] := UInt8ToUInt32(addr[n+7:n])
+		_MM_UPCONV_EPI32_SINT8:
+			n := j*8
+			dst[i+31:i] := Int8ToInt32(addr[n+7:n])
+		_MM_UPCONV_EPI32_UINT16:
+			n := j*16
+			dst[i+31:i] := UInt16ToUInt32(addr[n+15:n])
+		_MM_UPCONV_EPI32_SINT16:
+			n := j*16
+			dst[i+31:i] := Int16ToInt32(addr[n+15:n])
+		ESAC
+	ELSE
+		dst[i+31:i] := src[i+31:i]
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name="vpgatherdd" form="zmm {k}, m512" xed=""/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" rettype="__m512i" name="_mm512_i32loextgather_epi64">
+	<type>Integer</type>
+	<CPUID>AVX512F/KNCNI</CPUID>
+	<category>Load</category>
+	<parameter varname="index" type="__m512i"/>
+	<parameter varname="mv" type="void const *"/>
+	<parameter varname="conv" type="_MM_UPCONV_EPI64_ENUM"/>
+	<parameter varname="scale" type="int"/>
+	<parameter varname="hint" type="int"/>
+	<description>Up-converts 8 double-precision (64-bit) memory locations starting at location "mv" at packed 32-bit integer indices stored in the lower half of "index" scaled by "scale" using "conv" to 64-bit integer elements and stores them in "dst".</description>
+	<operation>
+FOR j := 0 to 7
+	addr := MEM[mv + index[j] * scale]
+	i := j*64
+	CASE conv OF
+	_MM_UPCONV_EPI64_NONE: dst[i+63:i] := addr[i+63:i]
+	ESAC
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name="vpgatherdq" form="zmm {k}, m512" xed=""/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" rettype="__m512i" name="_mm512_mask_i32loextgather_epi64">
+	<type>Integer</type>
+	<CPUID>AVX512F/KNCNI</CPUID>
+	<category>Load</category>
+	<parameter varname="src" type="__m512i"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="index" type="__m512i"/>
+	<parameter varname="mv" type="void const *"/>
+	<parameter varname="conv" type="_MM_UPCONV_EPI64_ENUM"/>
+	<parameter varname="scale" type="int"/>
+	<parameter varname="hint" type="int"/>
+	<description>Up-converts 8 double-precision (64-bit) memory locations starting at location "mv" at packed 32-bit integer indices stored in the lower half of "index" scaled by "scale" using "conv" to 64-bit integer elements and stores them in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 7
+	addr := MEM[mv + index[j] * scale]
+	i := j*64
+	IF k[j]
+		CASE conv OF
+		_MM_UPCONV_EPI64_NONE: dst[i+63:i] := addr[i+63:i]
+		ESAC
+	ELSE
+		dst[i+63:i] := src[i+63:i]
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name="vpgatherdq" form="zmm {k}, m512" xed=""/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" rettype="__m512" name="_mm512_i32extgather_ps">
+	<type>Floating Point</type>
+	<CPUID>AVX512F/KNCNI</CPUID>
+	<category>Load</category>
+	<parameter varname="index" type="__m512i"/>
+	<parameter varname="mv" type="void const *"/>
+	<parameter varname="conv" type="_MM_UPCONV_PS_ENUM"/>
+	<parameter varname="scale" type="int"/>
+	<parameter varname="hint" type="int"/>
+	<description>Up-converts 16 memory locations starting at location "mv" at packed 32-bit integer indices stored in "index" scaled by "scale" using "conv" to single-precision (32-bit) floating-point elements and stores them in "dst".</description>
+	<operation>
+FOR j := 0 to 15
+	addr := MEM[mv + index[j] * scale]
+	i := j*32
+	CASE conv OF
+	_MM_UPCONV_PS_NONE:
+		dst[i+31:i] := addr[i+31:i]
+	_MM_UPCONV_PS_FLOAT16:
+		n := j*16
+		dst[i+31:i] := Float16ToFloat32(addr[n+15:n])
+	_MM_UPCONV_PS_UINT8:
+		n := j*8
+		dst[i+31:i] := UInt8ToFloat32(addr[n+7:n])
+	_MM_UPCONV_PS_SINT8:
+		n := j*8
+		dst[i+31:i] := SInt8ToFloat32(addr[n+7:n])
+	_MM_UPCONV_PS_UINT16:
+		n := j*16
+		dst[i+31:i] := UInt16ToFloat32(addr[n+15:n])
+	_MM_UPCONV_PS_SINT16:
+		n := j*16
+		dst[i+31:i] := SInt16ToFloat32(addr[n+15:n])
+	ESAC
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name="vgatherdps" form="zmm {k}, m512" xed=""/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" rettype="__m512" name="_mm512_mask_i32extgather_ps">
+	<type>Floating Point</type>
+	<CPUID>AVX512F/KNCNI</CPUID>
+	<category>Load</category>
+	<parameter varname="src" type="__m512"/>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="index" type="__m512i"/>
+	<parameter varname="mv" type="void const *"/>
+	<parameter varname="conv" type="_MM_UPCONV_PS_ENUM"/>
+	<parameter varname="scale" type="int"/>
+	<parameter varname="hint" type="int"/>
+	<description>Up-converts 16 single-precision (32-bit) memory locations starting at location "mv" at packed 32-bit integer indices stored in "index" scaled by "scale" using "conv" to single-precision (32-bit) floating-point elements and stores them in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 15
+	addr := MEM[mv + index[j] * scale]
+	i := j*32
+	IF k[j]
+		CASE conv OF
+		_MM_UPCONV_PS_NONE:
+			dst[i+31:i] := addr[i+31:i]
+		_MM_UPCONV_PS_FLOAT16:
+			n := j*16
+			dst[i+31:i] := Float16ToFloat32(addr[n+15:n])
+		_MM_UPCONV_PS_UINT8:
+			n := j*8
+			dst[i+31:i] := UInt8ToFloat32(addr[n+7:n])
+		_MM_UPCONV_PS_SINT8:
+			n := j*8
+			dst[i+31:i] := SInt8ToFloat32(addr[n+7:n])
+		_MM_UPCONV_PS_UINT16:
+			n := j*16
+			dst[i+31:i] := UInt16ToFloat32(addr[n+15:n])
+		_MM_UPCONV_PS_SINT16:
+			n := j*16
+			dst[i+31:i] := SInt16ToFloat32(addr[n+15:n])
+		ESAC
+	ELSE
+		dst[i+31:i] := src[i+31:i]
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name="vgatherdps" form="zmm {k}, m512" xed=""/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" rettype="__m512d" name="_mm512_i32loextgather_pd">
+	<type>Floating Point</type>
+	<CPUID>AVX512F/KNCNI</CPUID>
+	<category>Load</category>
+	<parameter varname="index" type="__m512i"/>
+	<parameter varname="mv" type="void const *"/>
+	<parameter varname="conv" type="_MM_UPCONV_PD_ENUM"/>
+	<parameter varname="scale" type="int"/>
+	<parameter varname="hint" type="int"/>
+	<description>Up-converts 8 double-precision (64-bit) floating-point elements in memory locations starting at location "mv" at packed 32-bit integer indices stored in the lower half of "index" scaled by "scale" using "conv" to 64-bit floating-point elements and stores them in "dst".</description>
+	<operation>
+FOR j := 0 to 7
+	addr := MEM[mv + index[j] * scale]
+	i := j*64
+	CASE conv OF
+	_MM_UPCONV_PD_NONE: dst[i+63:i] := addr[i+63:i]
+	ESAC
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name="vgatherdpd" form="zmm {k}, m512" xed=""/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" rettype="__m512d" name="_mm512_mask_i32loextgather_pd">
+	<type>Floating Point</type>
+	<CPUID>AVX512F/KNCNI</CPUID>
+	<category>Load</category>
+	<parameter varname="src" type="__m512d"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="index" type="__m512i"/>
+	<parameter varname="mv" type="void const *"/>
+	<parameter varname="conv" type="_MM_UPCONV_PD_ENUM"/>
+	<parameter varname="scale" type="int"/>
+	<parameter varname="hint" type="int"/>
+	<description>Up-converts 8 double-precision (64-bit) floating-point elements in memory locations starting at location "mv" at packed 32-bit integer indices stored in the lower half of "index" scaled by "scale" using "conv" to 64-bit floating-point elements and stores them in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 7
+	addr := MEM[mv + index[j] * scale]
+	i := j*64
+	IF k[j]
+		CASE conv OF
+		_MM_UPCONV_PD_NONE: dst[i+63:i] := addr[i+63:i]
+		ESAC
+	ELSE
+		dst[i+63:i] := src[i+63:i]
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name="vgatherdpd" form="zmm {k}, m512" xed=""/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" rettype="void" name="_mm512_prefetch_i32extgather_ps">
+	<type>Floating Point</type>
+	<CPUID>AVX512PF/KNCNI</CPUID>
+	<category>Load</category>
+	<parameter varname="index" type="__m512i"/>
+	<parameter varname="mv" type="void const *"/>
+	<parameter varname="conv" type="_MM_UPCONV_PS_ENUM"/>
+	<parameter varname="scale" type="int"/>
+	<parameter varname="hint" type="int"/>
+	<description>Prefetches a set of 16 single-precision (32-bit) memory locations pointed by base address "mv" and 32-bit integer index vector "index" with scale "scale" to L1 or L2 level of cache depending on the value of "hint". The "hint" parameter may be 1 (_MM_HINT_T0) for prefetching to L1 cache, or 2 (_MM_HINT_T1) for prefetching to L2 cache.
+The "conv" parameter specifies the granularity used by compilers to better encode the instruction. It should be the same as the "conv" parameter specified for the subsequent gather intrinsic.</description>
+	<operation>
+FOR j := 0 to 15
+	addr := MEM[mv + index[j] * scale]
+	i := j*32
+	CASE hint OF
+	_MM_HINT_T0: PrefetchL1WithT0Hint(addr[i+31:i])
+	_MM_HINT_T1: PrefetchL2WithT1Hint(addr[i+31:i])
+	ESAC
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name="vgatherpf0dps" form="m512 {k}" xed=""/>
+	<instruction name="vgatherpf1dps" form="m512 {k}" xed=""/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" rettype="void" name="_mm512_mask_prefetch_i32extgather_ps">
+	<type>Floating Point</type>
+	<CPUID>AVX512PF/KNCNI</CPUID>
+	<category>Load</category>
+	<parameter varname="index" type="__m512i"/>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="mv" type="void const *"/>
+	<parameter varname="conv" type="_MM_UPCONV_PS_ENUM"/>
+	<parameter varname="scale" type="int"/>
+	<parameter varname="hint" type="int"/>
+	<description>Prefetches a set of 16 single-precision (32-bit) memory locations pointed by base address "mv" and 32-bit integer index vector "index" with scale "scale" to L1 or L2 level of cache depending on the value of "hint". Gathered elements are merged in cache using writemask "k" (elements are brought into cache only when their corresponding mask bits are set). The "hint" parameter may be 1 (_MM_HINT_T0) for prefetching to L1 cache, or 2 (_MM_HINT_T1) for prefetching to L2 cache.
+The "conv" parameter specifies the granularity used by compilers to better encode the instruction. It should be the same as the "conv" parameter specified for the subsequent gather intrinsic.</description>
+	<operation>
+FOR j := 0 to 15
+	addr := MEM[mv + index[j] * scale]
+	i := j*32
+	IF k[j] THEN
+		CASE hint OF
+		_MM_HINT_T0: PrefetchL1WithT0Hint(addr[i+31:i])
+		_MM_HINT_T1: PrefetchL2WithT1Hint(addr[i+31:i])
+		ESAC
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name="vgatherpf0dps" form="m512 {k}" xed=""/>
+	<instruction name="vgatherpf1dps" form="m512 {k}" xed=""/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" rettype="void" name="_mm512_i32extscatter_ps">
+	<type>Floating Point</type>
+	<CPUID>AVX512F/KNCNI</CPUID>
+	<category>Store</category>
+	<parameter varname="mv" type="void *"/>
+	<parameter varname="index" type="__m512i"/>
+	<parameter varname="v1" type="__m512"/>
+	<parameter varname="conv" type="_MM_DOWNCONV_PS_ENUM"/>
+	<parameter varname="scale" type="int"/>
+	<parameter varname="hint" type="int"/>
+	<description>Down-converts 16 packed single-precision (32-bit) floating-point elements in "v1" and stores them in memory locations starting at location "mv" at packed 32-bit integer indices stored in "index" scaled by "scale" using "conv".</description>
+	<operation>
+FOR j := 0 to 15
+	addr := MEM[mv + index[j] * scale]
+	n := j*32
+	CASE conv OF
+	_MM_DOWNCONV_PS_NONE:
+		i := j*32
+		addr[i+31:i] := v1[n+31:n]
+	_MM_DOWNCONV_PS_FLOAT16:
+		i := j*16
+		addr[i+15:i] := Float32ToFloat16(v1[n+31:n])
+	_MM_DOWNCONV_PS_UINT8:
+		i := j*8
+		addr[i+7:i] := Float32ToUInt8(v1[n+31:n])
+	_MM_DOWNCONV_PS_SINT8:
+		i := j*8
+		addr[i+7:i] := Float32ToSInt8(v1[n+31:n])
+	_MM_DOWNCONV_PS_UINT16:
+		i := j*16
+		addr[i+15:i] := Float32ToUInt16(v1[n+31:n])
+	_MM_DOWNCONV_PS_SINT16:
+		i := j*16
+		addr[i+15:i] := Float32ToSInt16(v1[n+31:n])
+	ESAC
+ENDFOR
+	</operation>
+	<instruction name="vscatterdps" form="m512 {k}, zmm" xed=""/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" rettype="void" name="_mm512_mask_i32extscatter_ps">
+	<type>Floating Point</type>
+	<CPUID>AVX512F/KNCNI</CPUID>
+	<category>Store</category>
+	<parameter varname="mv" type="void *"/>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="index" type="__m512i"/>
+	<parameter varname="v1" type="__m512"/>
+	<parameter varname="conv" type="_MM_DOWNCONV_PS_ENUM"/>
+	<parameter varname="scale" type="int"/>
+	<parameter varname="hint" type="int"/>
+	<description>Down-converts 16 packed single-precision (32-bit) floating-point elements in "v1" according to "conv" and stores them in memory locations starting at location "mv" at packed 32-bit integer indices stored in "index" scaled by "scale" using writemask "k" (elements are written only when the corresponding mask bit is set).</description>
+	<operation>
+FOR j := 0 to 15
+	n := j*32
+	IF k[j]
+		addr := MEM[mv + index[j] * scale]
+		CASE conv OF
+		_MM_DOWNCONV_PS_NONE:
+			i := j*32
+			addr[i+31:i] := v1[n+31:n]
+		_MM_DOWNCONV_PS_FLOAT16:
+			i := j*16
+			addr[i+15:i] := Float32ToFloat16(v1[n+31:n])
+		_MM_DOWNCONV_PS_UINT8:
+			i := j*8
+			addr[i+7:i] := Float32ToUInt8(v1[n+31:n])
+		_MM_DOWNCONV_PS_SINT8:
+			i := j*8
+			addr[i+7:i] := Float32ToSInt8(v1[n+31:n])
+		_MM_DOWNCONV_PS_UINT16:
+			i := j*16
+			addr[i+15:i] := Float32ToUInt16(v1[n+31:n])
+		_MM_DOWNCONV_PS_SINT16:
+			i := j*16
+			addr[i+15:i] := Float32ToSInt16(v1[n+31:n])
+		ESAC
+	FI
+ENDFOR
+	</operation>
+	<instruction name="vscatterdps" form="m512 {k}, zmm" xed=""/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" rettype="void" name="_mm512_i32loextscatter_pd">
+	<type>Floating Point</type>
+	<CPUID>AVX512F/KNCNI</CPUID>
+	<category>Store</category>
+	<parameter varname="mv" type="void *"/>
+	<parameter varname="index" type="__m512i"/>
+	<parameter varname="v1" type="__m512d"/>
+	<parameter varname="conv" type="_MM_DOWNCONV_PD_ENUM"/>
+	<parameter varname="scale" type="int"/>
+	<parameter varname="hint" type="int"/>
+	<description>Down-converts 8 packed double-precision (64-bit) floating-point elements in "v1" and stores them in memory locations starting at location "mv" at packed 32-bit integer indices stored in "index" scaled by "scale" using "conv".</description>
+	<operation>
+FOR j := 0 to 7
+	addr := MEM[mv + index[j] * scale]
+	i := j*64
+	CASE conv OF
+	_MM_DOWNCONV_PD_NONE: addr[i+63:i] := v1[i+63:i]
+	ESAC
+ENDFOR
+	</operation>
+	<instruction name="vscatterdpd" form="m512 {k}, zmm" xed=""/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" rettype="void" name="_mm512_mask_i32loextscatter_pd">
+	<type>Floating Point</type>
+	<CPUID>AVX512F/KNCNI</CPUID>
+	<category>Store</category>
+	<parameter varname="mv" type="void *"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="index" type="__m512i"/>
+	<parameter varname="v1" type="__m512d"/>
+	<parameter varname="conv" type="_MM_DOWNCONV_PD_ENUM"/>
+	<parameter varname="scale" type="int"/>
+	<parameter varname="hint" type="int"/>
+	<description>Down-converts 8 packed double-precision (64-bit) floating-point elements in "v1" and stores them in memory locations starting at location "mv" at packed 32-bit integer indices stored in "index" scaled by "scale" using "conv". Only those elements whose corresponding mask bit is set in writemask "k" are written to memory.</description>
+	<operation>
+FOR j := 0 to 7
+	IF k[j]
+		addr := MEM[mv + index[j] * scale]
+		i := j*64
+		CASE conv OF
+		_MM_DOWNCONV_PD_NONE: addr[i+63:i] := v1[i+63:i]
+		ESAC
+	FI
+ENDFOR
+	</operation>
+	<instruction name="vscatterdpd" form="m512 {k}, zmm" xed=""/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" rettype="void" name="_mm512_i32loextscatter_epi64">
+	<type>Integer</type>
+	<CPUID>AVX512F/KNCNI</CPUID>
+	<category>Store</category>
+	<parameter varname="mv" type="void *"/>
+	<parameter varname="index" type="__m512i"/>
+	<parameter varname="v1" type="__m512i"/>
+	<parameter varname="conv" type="_MM_DOWNCONV_EPI64_ENUM"/>
+	<parameter varname="scale" type="int"/>
+	<parameter varname="hint" type="int"/>
+	<description>Down-converts 8 packed 64-bit integer elements in "v1" and stores them in memory locations starting at location "mv" at packed 32-bit integer indices stored in "index" scaled by "scale" using "conv".</description>
+	<operation>
+FOR j := 0 to 7
+	addr := MEM[mv + index[j] * scale]
+	i := j*64
+	CASE conv OF
+	_MM_DOWNCONV_EPI64_NONE: addr[i+63:i] := v1[i+63:i]
+	ESAC
+ENDFOR
+	</operation>
+	<instruction name="vpscatterdq" form="m512 {k}, zmm" xed=""/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" rettype="void" name="_mm512_mask_i32loextscatter_epi64">
+	<type>Integer</type>
+	<CPUID>AVX512F/KNCNI</CPUID>
+	<category>Store</category>
+	<parameter varname="mv" type="void *"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="index" type="__m512i"/>
+	<parameter varname="v1" type="__m512i"/>
+	<parameter varname="conv" type="_MM_DOWNCONV_EPI64_ENUM"/>
+	<parameter varname="scale" type="int"/>
+	<parameter varname="hint" type="int"/>
+	<description>Down-converts 8 packed 64-bit integer elements in "v1" and stores them in memory locations starting at location "mv" at packed 32-bit integer indices stored in "index" scaled by "scale" using "conv". Only those elements whose corresponding mask bit is set in writemask "k" are written to memory.</description>
+	<operation>
+FOR j := 0 to 7
+	IF k[j]
+		addr := MEM[mv + index[j] * scale]
+		i := j*64
+		CASE conv OF
+		_MM_DOWNCONV_EPI64_NONE: addr[i+63:i] := v1[i+63:i]
+		ESAC
+	FI
+ENDFOR
+	</operation>
+	<instruction name="vpscatterdq" form="m512 {k}, zmm" xed=""/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" rettype="void" name="_mm512_prefetch_i32extscatter_ps">
+	<type>Floating Point</type>
+	<CPUID>AVX512PF/KNCNI</CPUID>
+	<category>Store</category>
+	<parameter varname="mv" type="void *"/>
+	<parameter varname="index" type="__m512i"/>
+	<parameter varname="conv" type="_MM_UPCONV_PS_ENUM"/>
+	<parameter varname="scale" type="int"/>
+	<parameter varname="hint" type="int"/>
+	<description>Prefetches a set of 16 single-precision (32-bit) memory locations pointed by base address "mv" and 32-bit integer index vector "index" with scale "scale" to L1 or L2 level of cache depending on the value of "hint", with a request for exclusive ownership. The "hint" parameter may be one of the following: _MM_HINT_T0 = 1 for prefetching to L1 cache, _MM_HINT_T1 = 2 for prefetching to L2 cache, _MM_HINT_T2 = 3 for prefetching to L2 cache non-temporal, _MM_HINT_NTA = 0 for prefetching to L1 cache non-temporal. The "conv" parameter specifies the granularity used by compilers to better encode the instruction. It should be the same as the "conv" parameter specified for the subsequent scatter intrinsic.</description>
+	<operation>
+cachev := 0
+FOR j := 0 to 15
+	i := j*32
+	addr := MEM[mv + index[j] * scale]
+	CASE hint OF
+	_MM_HINT_T0: PrefetchL1WithT0Hint(addr[i+31:i])
+	_MM_HINT_T1: PrefetchL2WithT1Hint(addr[i+31:i])
+	_MM_HINT_T2: PrefetchL2WithT1HintNonTemporal(addr[i+31:i])
+	_MM_HINT_NTA: PrefetchL1WithT0HintNonTemporal(addr[i+31:i])
+	ESAC
+ENDFOR
+	</operation>
+	<instruction name="vscatterpf0dps" form="m512 {k}" xed=""/>
+	<instruction name="vscatterpf1dps" form="m512 {k}" xed=""/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" rettype="void" name="_mm512_mask_prefetch_i32extscatter_ps">
+	<type>Floating Point</type>
+	<CPUID>AVX512PF/KNCNI</CPUID>
+	<category>Store</category>
+	<parameter varname="mv" type="void *"/>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="index" type="__m512i"/>
+	<parameter varname="conv" type="_MM_UPCONV_PS_ENUM"/>
+	<parameter varname="scale" type="int"/>
+	<parameter varname="hint" type="int"/>
+	<description>Prefetches a set of 16 single-precision (32-bit) memory locations pointed by base address "mv" and 32-bit integer index vector "index" with scale "scale" to L1 or L2 level of cache depending on the value of "hint". The "hint" parameter may be 1 (_MM_HINT_T0) for prefetching to L1 cache, or 2 (_MM_HINT_T1) for prefetching to L2 cache.
+The "conv" parameter specifies the granularity used by compilers to better encode the instruction. It should be the same as the "conv" parameter specified for the subsequent scatter intrinsic. Only those elements whose corresponding mask bit in "k" is set are loaded into cache.</description>
+	<operation>
+cachev := 0
+FOR j := 0 to 15
+	i := j*32
+	IF k[j]
+		addr := MEM[mv + index[j] * scale]
+		CASE hint OF
+		_MM_HINT_T0: PrefetchL1WithT0Hint(addr[i+31:i])
+		_MM_HINT_T1: PrefetchL2WithT1Hint(addr[i+31:i])
+		ESAC
+	FI
+ENDFOR
+	</operation>
+	<instruction name="vscatterpf0dps" form="m512 {k}" xed=""/>
+	<instruction name="vscatterpf1dps" form="m512 {k}" xed=""/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="KNC" rettype="__m512i" name="_mm512_extloadunpackhi_epi32">
+	<type>Integer</type>
+	<CPUID>KNCNI</CPUID>
+	<category>Load</category>
+	<parameter varname="src" type="__m512i"/>
+	<parameter varname="mt" type="void const *"/>
+	<parameter varname="conv" type="_MM_UPCONV_EPI32_ENUM"/>
+	<parameter varname="hint" type="int"/>
+	<description>Loads the high-64-byte-aligned portion of the byte/word/doubleword stream starting at element-aligned address mt-64, up-converted depending on the value of "conv", and expanded into packed 32-bit integers in "dst". The initial values of "dst" are copied from "src". Only those converted doublewords that occur at or after the first 64-byte-aligned address following (mt-64) are loaded. Elements in the resulting vector that do not map to those doublewords are taken from "src". "hint" indicates to the processor whether the loaded data is non-temporal.</description>
+	<operation>
+UPCONVERT(address, offset, convertTo) {
+	CASE conv OF
+	_MM_UPCONV_EPI32_NONE:   RETURN MEM[addr + 4*offset]
+	_MM_UPCONV_EPI32_UINT8:  RETURN UInt8ToInt32(MEM[addr + offset])
+	_MM_UPCONV_EPI32_SINT8:  RETURN SInt8ToInt32(MEM[addr + offset])
+	_MM_UPCONV_EPI32_UINT16: RETURN UInt16ToInt32(MEM[addr + 2*offset])
+	_MM_UPCONV_EPI32_SINT16: RETURN SInt16ToInt32(MEM[addr + 2*offset])
+	ESAC
+}
+
+UPCONVERTSIZE(convertTo) {
+	CASE conv OF
+	_MM_UPCONV_EPI32_NONE:   RETURN 4
+	_MM_UPCONV_EPI32_UINT8:  RETURN 1
+	_MM_UPCONV_EPI32_SINT8:  RETURN 1
+	_MM_UPCONV_EPI32_UINT16: RETURN 2
+	_MM_UPCONV_EPI32_SINT16: RETURN 2
+	ESAC
+}
+
+dst[511:0] := src[511:0]
+loadOffset := 0
+foundNext64BytesBoundary := false
+upSize := UPCONVERTSIZE(conv)
+addr = mt-64
+FOR j := 0 to 15
+	IF foundNext64BytesBoundary == false
+		IF (addr + (loadOffset + 1)*upSize) % 64 == 0
+			foundNext64BytesBoundary := true
+		FI
+	ELSE
+		i := j*32
+		dst[i+31:i] := UPCONVERT(addr, loadOffset, conv)
+	FI
+	loadOffset := loadOffset + 1
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name="vloadunpackhd" form="zmm {k}, m512" xed=""/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="KNC" rettype="__m512i" name="_mm512_mask_extloadunpackhi_epi32">
+	<type>Integer</type>
+	<CPUID>KNCNI</CPUID>
+	<category>Load</category>
+	<parameter varname="src" type="__m512i"/>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="mt" type="void const *"/>
+	<parameter varname="conv" type="_MM_UPCONV_EPI32_ENUM"/>
+	<parameter varname="hint" type="int"/>
+	<description>Loads the high-64-byte-aligned portion of the byte/word/doubleword stream starting at element-aligned address mt-64, up-converted depending on the value of "conv", and expanded into packed 32-bit integers in "dst". The initial values of "dst" are copied from "src". Only those converted doublewords that occur at or after the first 64-byte-aligned address following (mt-64) are loaded. Elements in the resulting vector that do not map to those doublewords are taken from "src". "hint" indicates to the processor whether the loaded data is non-temporal. Elements are copied to "dst" according to element selector "k" (elements are skipped when the corresponding mask bit is not set).</description>
+	<operation>
+UPCONVERT(address, offset, convertTo) {
+	CASE conv OF
+	_MM_UPCONV_EPI32_NONE:   RETURN MEM[addr + 4*offset]
+	_MM_UPCONV_EPI32_UINT8:  RETURN UInt8ToInt32(MEM[addr + offset])
+	_MM_UPCONV_EPI32_SINT8:  RETURN SInt8ToInt32(MEM[addr + offset])
+	_MM_UPCONV_EPI32_UINT16: RETURN UInt16ToInt32(MEM[addr + 2*offset])
+	_MM_UPCONV_EPI32_SINT16: RETURN SInt16ToInt32(MEM[addr + 2*offset])
+	ESAC
+}
+
+UPCONVERTSIZE(convertTo) {
+	CASE conv OF
+	_MM_UPCONV_EPI32_NONE:   RETURN 4
+	_MM_UPCONV_EPI32_UINT8:  RETURN 1
+	_MM_UPCONV_EPI32_SINT8:  RETURN 1
+	_MM_UPCONV_EPI32_UINT16: RETURN 2
+	_MM_UPCONV_EPI32_SINT16: RETURN 2
+	ESAC
+}
+
+dst[511:0] := src[511:0]
+loadOffset := 0
+foundNext64BytesBoundary := false
+upSize := UPCONVERTSIZE(conv)
+addr = mt-64
+FOR j := 0 to 15
+	IF k[j]
+		IF foundNext64BytesBoundary == false
+			IF (addr + (loadOffset + 1)*upSize) % 64 == 0
+				foundNext64BytesBoundary := true
+			FI
+		ELSE
+			i := j*32
+			dst[i+31:i] := UPCONVERT(addr, loadOffset, conv)
+		FI
+		loadOffset := loadOffset + 1
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name="vloadunpackhd" form="zmm {k}, m512" xed=""/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="KNC" rettype="__m512i" name="_mm512_extloadunpacklo_epi32">
+	<type>Integer</type>
+	<CPUID>KNCNI</CPUID>
+	<category>Load</category>
+	<parameter varname="src" type="__m512i"/>
+	<parameter varname="mt" type="void const *"/>
+	<parameter varname="conv" type="_MM_UPCONV_EPI32_ENUM"/>
+	<parameter varname="hint" type="int"/>
+	<description>Loads the low-64-byte-aligned portion of the byte/word/doubleword stream starting at element-aligned address mt, up-converted depending on the value of "conv", and expanded into packed 32-bit integers in "dst". The initial values of "dst" are copied from "src". Only those converted doublewords that occur before first 64-byte-aligned address following "mt" are loaded. Elements in the resulting vector that do not map to those doublewords are taken from "src". "hint" indicates to the processor whether the loaded data is non-temporal.</description>
+	<operation>
+UPCONVERT(address, offset, convertTo) {
+	CASE conv OF
+	_MM_UPCONV_EPI32_NONE:   RETURN MEM[addr + 4*offset]
+	_MM_UPCONV_EPI32_UINT8:  RETURN UInt8ToInt32(MEM[addr + offset])
+	_MM_UPCONV_EPI32_SINT8:  RETURN SInt8ToInt32(MEM[addr + offset])
+	_MM_UPCONV_EPI32_UINT16: RETURN UInt16ToInt32(MEM[addr + 2*offset])
+	_MM_UPCONV_EPI32_SINT16: RETURN SInt16ToInt32(MEM[addr + 2*offset])
+	ESAC
+}
+
+UPCONVERTSIZE(convertTo) {
+	CASE conv OF
+	_MM_UPCONV_EPI32_NONE:   RETURN 4
+	_MM_UPCONV_EPI32_UINT8:  RETURN 1
+	_MM_UPCONV_EPI32_SINT8:  RETURN 1
+	_MM_UPCONV_EPI32_UINT16: RETURN 2
+	_MM_UPCONV_EPI32_SINT16: RETURN 2
+	ESAC
+}
+
+dst[511:0] := src[511:0]
+loadOffset := 0
+upSize := UPCONVERTSIZE(conv)
+addr = mt
+FOR j := 0 to 15
+	i := j*32
+	dst[i+31:i] := UPCONVERT(addr, loadOffset, conv)
+	loadOffset := loadOffset + 1
+	IF (mt + loadOffset * upSize) % 64 == 0
+		break
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name="vloadunpackld" form="zmm {k}, m512" xed=""/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="KNC" rettype="__m512i" name="_mm512_mask_extloadunpacklo_epi32">
+	<type>Integer</type>
+	<CPUID>KNCNI</CPUID>
+	<category>Load</category>
+	<parameter varname="src" type="__m512i"/>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="mt" type="void const *"/>
+	<parameter varname="conv" type="_MM_UPCONV_EPI32_ENUM"/>
+	<parameter varname="hint" type="int"/>
+	<description>Loads the low-64-byte-aligned portion of the byte/word/doubleword stream starting at element-aligned address mt, up-converted depending on the value of "conv", and expanded into packed 32-bit integers in "dst". The initial values of "dst" are copied from "src". Only those converted doublewords that occur before first 64-byte-aligned address following "mt" are loaded. Elements in the resulting vector that do not map to those doublewords are taken from "src". "hint" indicates to the processor whether the loaded data is non-temporal. Elements are copied to "dst" according to element selector "k" (elements are skipped when the corresponding mask bit is not set).</description>
+	<operation>
+UPCONVERT(address, offset, convertTo) {
+	CASE conv OF
+	_MM_UPCONV_EPI32_NONE:   RETURN MEM[addr + 4*offset]
+	_MM_UPCONV_EPI32_UINT8:  RETURN UInt8ToInt32(MEM[addr + offset])
+	_MM_UPCONV_EPI32_SINT8:  RETURN SInt8ToInt32(MEM[addr + offset])
+	_MM_UPCONV_EPI32_UINT16: RETURN UInt16ToInt32(MEM[addr + 2*offset])
+	_MM_UPCONV_EPI32_SINT16: RETURN SInt16ToInt32(MEM[addr + 2*offset])
+	ESAC
+}
+
+UPCONVERTSIZE(convertTo) {
+	CASE conv OF
+	_MM_UPCONV_EPI32_NONE:   RETURN 4
+	_MM_UPCONV_EPI32_UINT8:  RETURN 1
+	_MM_UPCONV_EPI32_SINT8:  RETURN 1
+	_MM_UPCONV_EPI32_UINT16: RETURN 2
+	_MM_UPCONV_EPI32_SINT16: RETURN 2
+	ESAC
+}
+
+dst[511:0] := src[511:0]
+loadOffset := 0
+upSize := UPCONVERTSIZE(conv)
+addr = mt
+FOR j := 0 to 15
+	IF k[j]
+		i := j*32
+		dst[i+31:i] := UPCONVERT(addr, loadOffset, conv)
+		loadOffset := loadOffset + 1
+		IF (mt + loadOffset * upSize) % 64 == 0
+			break
+		FI
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name="vloadunpackld" form="zmm {k}, m512" xed=""/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="KNC" rettype="__m512i" name="_mm512_extloadunpackhi_epi64">
+	<type>Integer</type>
+	<CPUID>KNCNI</CPUID>
+	<category>Load</category>
+	<parameter varname="src" type="__m512i"/>
+	<parameter varname="mt" type="void const *"/>
+	<parameter varname="conv" type="_MM_UPCONV_EPI64_ENUM"/>
+	<parameter varname="hint" type="int"/>
+	<description>Loads the high-64-byte-aligned portion of the quadword stream starting at element-aligned address mt-64, up-converted depending on the value of "conv", and expanded into packed 64-bit integers in "dst". The initial values of "dst" are copied from "src". Only those converted quadwords that occur at or after the first 64-byte-aligned address following (mt-64) are loaded. Elements in the resulting vector that do not map to those quadwords are taken from "src". "hint" indicates to the processor whether the loaded data is non-temporal.</description>
+	<operation>
+UPCONVERT(address, offset, convertTo) {
+	CASE conv OF
+	_MM_UPCONV_EPI64_NONE:   RETURN MEM[addr + 8*offset]
+	ESAC
+}
+
+UPCONVERTSIZE(convertTo) {
+	CASE conv OF
+	_MM_UPCONV_EPI64_NONE:   RETURN 8
+	ESAC
+}
+
+dst[511:0] := src[511:0]
+loadOffset := 0
+foundNext64BytesBoundary := false
+upSize := UPCONVERTSIZE(conv)
+addr = mt-64
+FOR j := 0 to 7
+	IF foundNext64BytesBoundary == false
+		IF (addr + (loadOffset + 1)*upSize) % 64 == 0
+			foundNext64BytesBoundary := true
+		FI
+	ELSE
+		i := j*64
+		dst[i+63:i] := UPCONVERT(addr, loadOffset, conv)
+	FI
+	loadOffset := loadOffset + 1
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name="vloadunpackhq" form="zmm {k}, m512" xed=""/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="KNC" rettype="__m512i" name="_mm512_mask_extloadunpackhi_epi64">
+	<type>Integer</type>
+	<CPUID>KNCNI</CPUID>
+	<category>Load</category>
+	<parameter varname="src" type="__m512i"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="mt" type="void const *"/>
+	<parameter varname="conv" type="_MM_UPCONV_EPI64_ENUM"/>
+	<parameter varname="hint" type="int"/>
+	<description>Loads the high-64-byte-aligned portion of the quadword stream starting at element-aligned address mt-64, up-converted depending on the value of "conv", and expanded into packed 64-bit integers in "dst". The initial values of "dst" are copied from "src". Only those converted quadwords that occur at or after the first 64-byte-aligned address following (mt-64) are loaded. Elements in the resulting vector that do not map to those quadwords are taken from "src". "hint" indicates to the processor whether the loaded data is non-temporal. Elements are copied to "dst" according to element selector "k" (elements are skipped when the corresponding mask bit is not set).</description>
+	<operation>
+UPCONVERT(address, offset, convertTo) {
+	CASE conv OF
+	_MM_UPCONV_EPI64_NONE:   RETURN MEM[addr + 8*offset]
+	ESAC
+}
+
+UPCONVERTSIZE(convertTo) {
+	CASE conv OF
+	_MM_UPCONV_EPI64_NONE:   RETURN 8
+	ESAC
+}
+
+dst[511:0] := src[511:0]
+loadOffset := 0
+foundNext64BytesBoundary := false
+upSize := UPCONVERTSIZE(conv)
+addr = mt-64
+FOR j := 0 to 7
+	IF k[j]
+		IF foundNext64BytesBoundary == false
+			IF (addr + (loadOffset + 1)*upSize) % 64 == 0
+				foundNext64BytesBoundary := true
+			FI
+		ELSE
+			i := j*64
+			dst[i+63:i] := UPCONVERT(addr, loadOffset, conv)
+		FI
+		loadOffset := loadOffset + 1
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name="vloadunpackhq" form="zmm {k}, m512" xed=""/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="KNC" rettype="__m512i" name="_mm512_extloadunpacklo_epi64">
+	<type>Integer</type>
+	<CPUID>KNCNI</CPUID>
+	<category>Load</category>
+	<parameter varname="src" type="__m512i"/>
+	<parameter varname="mt" type="void const *"/>
+	<parameter varname="conv" type="_MM_UPCONV_EPI64_ENUM"/>
+	<parameter varname="hint" type="int"/>
+	<description>Loads the low-64-byte-aligned portion of the quadword stream starting at element-aligned address mt, up-converted depending on the value of "conv", and expanded into packed 64-bit integers in "dst". The initial values of "dst" are copied from "src". Only those converted quadwords that occur before first 64-byte-aligned address following "mt" are loaded. Elements in the resulting vector that do not map to those quadwords are taken from "src". "hint" indicates to the processor whether the loaded data is non-temporal.</description>
+	<operation>
+UPCONVERT(address, offset, convertTo) {
+	CASE conv OF
+	_MM_UPCONV_EPI64_NONE:   RETURN MEM[addr + 8*offset]
+	ESAC
+}
+
+UPCONVERTSIZE(convertTo) {
+	CASE conv OF
+	_MM_UPCONV_EPI64_NONE:   RETURN 8
+	ESAC
+}
+
+dst[511:0] := src[511:0]
+loadOffset := 0
+upSize := UPCONVERTSIZE(conv)
+addr = mt
+FOR j := 0 to 7
+	i := j*64
+	dst[i+63:i] := UPCONVERT(addr, loadOffset, conv)
+	loadOffset := loadOffset + 1
+	IF (addr + loadOffset*upSize) % 64 == 0
+		BREAK
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name="vloadunpacklq" form="zmm {k}, m512" xed=""/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="KNC" rettype="__m512i" name="_mm512_mask_extloadunpacklo_epi64">
+	<type>Integer</type>
+	<CPUID>KNCNI</CPUID>
+	<category>Load</category>
+	<parameter varname="src" type="__m512i"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="mt" type="void const *"/>
+	<parameter varname="conv" type="_MM_UPCONV_EPI64_ENUM"/>
+	<parameter varname="hint" type="int"/>
+	<description>Loads the low-64-byte-aligned portion of the quadword stream starting at element-aligned address mt, up-converted depending on the value of "conv", and expanded into packed 64-bit integers in "dst". The initial values of "dst" are copied from "src". Only those converted quadwords that occur before first 64-byte-aligned address following "mt" are loaded. Elements in the resulting vector that do not map to those quadwords are taken from "src". "hint" indicates to the processor whether the loaded data is non-temporal. Elements are copied to "dst" according to element selector "k" (elements are skipped when the corresponding mask bit is not set).</description>
+	<operation>
+UPCONVERT(address, offset, convertTo) {
+	CASE conv OF
+	_MM_UPCONV_EPI64_NONE:   RETURN MEM[addr + 8*offset]
+	ESAC
+}
+
+UPCONVERTSIZE(convertTo) {
+	CASE conv OF
+	_MM_UPCONV_EPI64_NONE:   RETURN 8
+	ESAC
+}
+
+dst[511:0] := src[511:0]
+loadOffset := 0
+upSize := UPCONVERTSIZE(conv)
+addr = mt
+FOR j := 0 to 7
+	IF k[j]
+		i := j*64
+		dst[i+63:i] := UPCONVERT(addr, loadOffset, conv)
+		loadOffset := loadOffset + 1
+		IF (addr + loadOffset*upSize) % 64 == 0
+			BREAK
+		FI
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name="vloadunpacklq" form="zmm {k}, m512" xed=""/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="KNC" rettype="__m512" name="_mm512_extloadunpackhi_ps">
+	<type>Floating Point</type>
+	<CPUID>KNCNI</CPUID>
+	<category>Load</category>
+	<parameter varname="src" type="__m512"/>
+	<parameter varname="mt" type="void const *"/>
+	<parameter varname="conv" type="_MM_UPCONV_PS_ENUM"/>
+	<parameter varname="hint" type="int"/>
+	<description>Loads the high-64-byte-aligned portion of the byte/word/doubleword stream starting at element-aligned address mt-64, up-converted depending on the value of "conv", and expanded into packed single-precision (32-bit) floating-point elements in "dst". The initial values of "dst" are copied from "src". Only those converted doublewords that occur at or after the first 64-byte-aligned address following (mt-64) are loaded. Elements in the resulting vector that do not map to those doublewords are taken from "src". "hint" indicates to the processor whether the loaded data is non-temporal.</description>
+	<operation>
+UPCONVERT(address, offset, convertTo) {
+	CASE conv OF
+	_MM_UPCONV_PS_NONE:	   RETURN MEM[addr + 4*offset]
+	_MM_UPCONV_PS_FLOAT16: RETURN Float16ToFloat32(MEM[addr + 2*offset])
+	_MM_UPCONV_PS_UINT8:   RETURN UInt8ToFloat32(MEM[addr + offset])
+	_MM_UPCONV_PS_SINT8:   RETURN SInt8ToFloat32(MEM[addr + offset])
+	_MM_UPCONV_PS_UINT16:  RETURN UInt16ToFloat32(MEM[addr + 2*offset])
+	_MM_UPCONV_PS_SINT16:  RETURN SInt16ToFloat32(MEM[addr + 2*offset])
+	ESAC
+}
+
+UPCONVERTSIZE(convertTo) {
+	CASE conv OF
+	_MM_UPCONV_PS_NONE:	   RETURN 4
+	_MM_UPCONV_PS_FLOAT16: RETURN 2
+	_MM_UPCONV_PS_UINT8:   RETURN 1
+	_MM_UPCONV_PS_SINT8:   RETURN 1
+	_MM_UPCONV_PS_UINT16:  RETURN 2
+	_MM_UPCONV_PS_SINT16:  RETURN 2
+	ESAC
+}
+
+dst[511:0] := src[511:0]
+loadOffset := 0
+foundNext64BytesBoundary := false
+upSize := UPCONVERTSIZE(conv)
+addr = mt-64
+FOR j := 0 to 15
+	IF foundNext64BytesBoundary == false
+		IF (addr + (loadOffset + 1)*upSize) % 64 == 0
+			foundNext64BytesBoundary := true
+		FI
+	ELSE
+		i := j*32
+		dst[i+31:i] := UPCONVERT(addr, loadOffset, conv)
+	FI
+	loadOffset := loadOffset + 1
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name="vloadunpackhps" form="zmm {k}, m512" xed=""/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="KNC" rettype="__m512" name="_mm512_mask_extloadunpackhi_ps">
+	<type>Floating Point</type>
+	<CPUID>KNCNI</CPUID>
+	<category>Load</category>
+	<parameter varname="src" type="__m512"/>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="mt" type="void const *"/>
+	<parameter varname="conv" type="_MM_UPCONV_PS_ENUM"/>
+	<parameter varname="hint" type="int"/>
+	<description>Loads the high-64-byte-aligned portion of the byte/word/doubleword stream starting at element-aligned address mt-64, up-converted depending on the value of "conv", and expanded into packed single-precision (32-bit) floating-point elements in "dst". The initial values of "dst" are copied from "src". Only those converted doublewords that occur at or after the first 64-byte-aligned address following (mt-64) are loaded. Elements in the resulting vector that do not map to those doublewords are taken from "src". "hint" indicates to the processor whether the loaded data is non-temporal. Elements are copied to "dst" according to element selector "k" (elements are skipped when the corresponding mask bit is not set).</description>
+	<operation>
+UPCONVERT(address, offset, convertTo) {
+	CASE conv OF
+	_MM_UPCONV_PS_NONE:	   RETURN MEM[addr + 4*offset]
+	_MM_UPCONV_PS_FLOAT16: RETURN Float16ToFloat32(MEM[addr + 4*offset])
+	_MM_UPCONV_PS_UINT8:   RETURN UInt8ToFloat32(MEM[addr + offset])
+	_MM_UPCONV_PS_SINT8:   RETURN SInt8ToFloat32(MEM[addr + offset])
+	_MM_UPCONV_PS_UINT16:  RETURN UInt16ToFloat32(MEM[addr + 2*offset])
+	_MM_UPCONV_PS_SINT16:  RETURN SInt16ToFloat32(MEM[addr + 2*offset])
+	ESAC
+}
+
+UPCONVERTSIZE(convertTo) {
+	CASE conv OF
+	_MM_UPCONV_PS_NONE:	   RETURN 4
+	_MM_UPCONV_PS_FLOAT16: RETURN 2
+	_MM_UPCONV_PS_UINT8:   RETURN 1
+	_MM_UPCONV_PS_SINT8:   RETURN 1
+	_MM_UPCONV_PS_UINT16:  RETURN 2
+	_MM_UPCONV_PS_SINT16:  RETURN 2
+	ESAC
+}
+
+dst[511:0] := src[511:0]
+loadOffset := 0
+foundNext64BytesBoundary := false
+upSize := UPCONVERTSIZE(conv)
+addr = mt-64
+FOR j := 0 to 15
+	IF k[j]
+		IF foundNext64BytesBoundary == false
+			IF (addr + (loadOffset + 1)*upSize) % 64 == 0
+				foundNext64BytesBoundary := true
+			FI
+		ELSE
+			i := j*32
+			dst[i+31:i] := UPCONVERT(addr, loadOffset, conv)
+		FI
+		loadOffset := loadOffset + 1
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name="vloadunpackhps" form="zmm {k}, m512" xed=""/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="KNC" rettype="__m512" name="_mm512_extloadunpacklo_ps">
+	<type>Floating Point</type>
+	<CPUID>KNCNI</CPUID>
+	<category>Load</category>
+	<parameter varname="src" type="__m512"/>
+	<parameter varname="mt" type="void const *"/>
+	<parameter varname="conv" type="_MM_UPCONV_PS_ENUM"/>
+	<parameter varname="hint" type="int"/>
+	<description>Loads the low-64-byte-aligned portion of the byte/word/doubleword stream starting at element-aligned address mt, up-converted depending on the value of "conv", and expanded into packed single-precision (32-bit) floating-point elements in "dst". The initial values of "dst" are copied from "src". Only those converted doublewords that occur before the first 64-byte-aligned address following "mt" are loaded. Elements in the resulting vector that do not map to those doublewords are taken from "src". "hint" indicates to the processor whether the loaded data is non-temporal.</description>
+	<operation>
+UPCONVERT(address, offset, convertTo) {
+	CASE conv OF
+	_MM_UPCONV_PS_NONE:	   RETURN MEM[addr + 4*offset]
+	_MM_UPCONV_PS_FLOAT16: RETURN Float16ToFloat32(MEM[addr + 4*offset])
+	_MM_UPCONV_PS_UINT8:   RETURN UInt8ToFloat32(MEM[addr + offset])
+	_MM_UPCONV_PS_SINT8:   RETURN SInt8ToFloat32(MEM[addr + offset])
+	_MM_UPCONV_PS_UINT16:  RETURN UInt16ToFloat32(MEM[addr + 2*offset])
+	_MM_UPCONV_PS_SINT16:  RETURN SInt16ToFloat32(MEM[addr + 2*offset])
+	ESAC
+}
+
+UPCONVERTSIZE(convertTo) {
+	CASE conv OF
+	_MM_UPCONV_PS_NONE:	   RETURN 4
+	_MM_UPCONV_PS_FLOAT16: RETURN 2
+	_MM_UPCONV_PS_UINT8:   RETURN 1
+	_MM_UPCONV_PS_SINT8:   RETURN 1
+	_MM_UPCONV_PS_UINT16:  RETURN 2
+	_MM_UPCONV_PS_SINT16:  RETURN 2
+	ESAC
+}
+
+dst[511:0] := src[511:0]
+loadOffset := 0
+upSize := UPCONVERTSIZE(conv)
+addr = mt
+FOR j := 0 to 15
+	i := j*32
+	dst[i+31:i] := UPCONVERT(addr, loadOffset, conv)
+	loadOffset := loadOffset + 1
+	IF (mt + loadOffset * upSize) % 64 == 0
+		break
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name="vloadunpacklps" form="zmm {k}, m512" xed=""/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="KNC" rettype="__m512" name="_mm512_mask_extloadunpacklo_ps">
+	<type>Floating Point</type>
+	<CPUID>KNCNI</CPUID>
+	<category>Load</category>
+	<parameter varname="src" type="__m512"/>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="mt" type="void const *"/>
+	<parameter varname="conv" type="_MM_UPCONV_PS_ENUM"/>
+	<parameter varname="hint" type="int"/>
+	<description>Loads the low-64-byte-aligned portion of the byte/word/doubleword stream starting at element-aligned address mt, up-converted depending on the value of "conv", and expanded into packed single-precision (32-bit) floating-point elements in "dst". The initial values of "dst" are copied from "src". Only those converted doublewords that occur before the first 64-byte-aligned address following "mt" are loaded. Elements in the resulting vector that do not map to those doublewords are taken from "src". "hint" indicates to the processor whether the loaded data is non-temporal. Elements are copied to "dst" according to element selector "k" (elements are skipped when the corresponding mask bit is not set).</description>
+	<operation>
+UPCONVERT(address, offset, convertTo) {
+	CASE conv OF
+	_MM_UPCONV_PS_NONE:    RETURN MEM[addr + 4*offset]
+	_MM_UPCONV_PS_FLOAT16: RETURN Float16ToFloat32(MEM[addr + 4*offset])
+	_MM_UPCONV_PS_UINT8:   RETURN UInt8ToFloat32(MEM[addr + offset])
+	_MM_UPCONV_PS_SINT8:   RETURN SInt8ToFloat32(MEM[addr + offset])
+	_MM_UPCONV_PS_UINT16:  RETURN UInt16ToFloat32(MEM[addr + 2*offset])
+	_MM_UPCONV_PS_SINT16:  RETURN SInt16ToFloat32(MEM[addr + 2*offset])
+	ESAC
+}
+
+UPCONVERTSIZE(convertTo) {
+	CASE conv OF
+	_MM_UPCONV_PS_NONE:	   RETURN 4
+	_MM_UPCONV_PS_FLOAT16: RETURN 2
+	_MM_UPCONV_PS_UINT8:   RETURN 1
+	_MM_UPCONV_PS_SINT8:   RETURN 1
+	_MM_UPCONV_PS_UINT16:  RETURN 2
+	_MM_UPCONV_PS_SINT16:  RETURN 2
+	ESAC
+}
+
+dst[511:0] := src[511:0]
+loadOffset := 0
+upSize := UPCONVERTSIZE(conv)
+addr = mt
+FOR j := 0 to 15
+	IF k[j]
+		i := j*32
+		dst[i+31:i] := UPCONVERT(addr, loadOffset, conv)
+		loadOffset := loadOffset + 1
+		IF (mt + loadOffset * upSize) % 64 == 0
+			break
+		FI
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name="vloadunpacklps" form="zmm {k}, m512" xed=""/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="KNC" rettype="__m512d" name="_mm512_extloadunpackhi_pd">
+	<type>Floating Point</type>
+	<CPUID>KNCNI</CPUID>
+	<category>Load</category>
+	<parameter varname="src" type="__m512d"/>
+	<parameter varname="mt" type="void const *"/>
+	<parameter varname="conv" type="_MM_UPCONV_PD_ENUM"/>
+	<parameter varname="hint" type="int"/>
+	<description>Loads the high-64-byte-aligned portion of the quadword stream starting at element-aligned address mt-64, up-converted depending on the value of "conv", and expanded into packed double-precision (64-bit) floating-point values in "dst". The initial values of "dst" are copied from "src". Only those converted quadwords that occur at or after the first 64-byte-aligned address following (mt-64) are loaded. Elements in the resulting vector that do not map to those quadwords are taken from "src". "hint" indicates to the processor whether the loaded data is non-temporal.</description>
+	<operation>
+UPCONVERT(address, offset, convertTo) {
+	CASE conv OF
+	_MM_UPCONV_PD_NONE: RETURN MEM[addr + 8*offset]
+	ESAC
+}
+
+UPCONVERTSIZE(convertTo) {
+	CASE conv OF
+	_MM_UPCONV_PD_NONE: RETURN 8
+	ESAC
+}
+
+dst[511:0] := src[511:0]
+loadOffset := 0
+foundNext64BytesBoundary := false
+upSize := UPCONVERTSIZE(conv)
+addr = mt-64
+FOR j := 0 to 7
+	IF foundNext64BytesBoundary == false
+		IF (addr + (loadOffset + 1)*upSize) % 64 == 0
+			foundNext64BytesBoundary := true
+		FI
+	ELSE
+		i := j*64
+		dst[i+63:i] := UPCONVERT(addr, loadOffset, conv)
+	FI
+	loadOffset := loadOffset + 1
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name="vloadunpackhpd" form="zmm {k}, m512" xed=""/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="KNC" rettype="__m512d" name="_mm512_mask_extloadunpackhi_pd">
+	<type>Floating Point</type>
+	<CPUID>KNCNI</CPUID>
+	<category>Load</category>
+	<parameter varname="src" type="__m512d"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="mt" type="void const *"/>
+	<parameter varname="conv" type="_MM_UPCONV_PD_ENUM"/>
+	<parameter varname="hint" type="int"/>
+	<description>Loads the high-64-byte-aligned portion of the quadword stream starting at element-aligned address mt-64, up-converted depending on the value of "conv", and expanded into packed double-precision (64-bit) floating-point values in "dst". The initial values of "dst" are copied from "src". Only those converted quadwords that occur at or after the first 64-byte-aligned address following (mt-64) are loaded. Elements in the resulting vector that do not map to those quadwords are taken from "src". "hint" indicates to the processor whether the loaded data is non-temporal. Elements are copied to "dst" according to element selector "k" (elements are skipped when the corresponding mask bit is not set).</description>
+	<operation>
+UPCONVERT(address, offset, convertTo) {
+	CASE conv OF
+	_MM_UPCONV_PD_NONE: RETURN MEM[addr + 8*offset]
+	ESAC
+}
+
+UPCONVERTSIZE(convertTo) {
+	CASE conv OF
+	_MM_UPCONV_PD_NONE: RETURN 8
+	ESAC
+}
+
+dst[511:0] := src[511:0]
+loadOffset := 0
+foundNext64BytesBoundary := false
+upSize := UPCONVERTSIZE(conv)
+addr = mt-64
+FOR j := 0 to 7
+	IF k[j]
+		IF foundNext64BytesBoundary == false
+			IF (addr + (loadOffset + 1)*upSize) % 64 == 0
+				foundNext64BytesBoundary := true
+			FI
+		ELSE
+			i := j*64
+			dst[i+63:i] := UPCONVERT(addr, loadOffset, conv)
+		FI
+		loadOffset := loadOffset + 1
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name="vloadunpackhpd" form="zmm {k}, m512" xed=""/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="KNC" rettype="__m512d" name="_mm512_extloadunpacklo_pd">
+	<type>Floating Point</type>
+	<CPUID>KNCNI</CPUID>
+	<category>Load</category>
+	<parameter varname="src" type="__m512d"/>
+	<parameter varname="mt" type="void const *"/>
+	<parameter varname="conv" type="_MM_UPCONV_PD_ENUM"/>
+	<parameter varname="hint" type="int"/>
+	<description>Loads the low-64-byte-aligned portion of the quadword stream starting at element-aligned address mt, up-converted depending on the value of "conv", and expanded into packed double-precision (64-bit) floating-point elements in "dst". The initial values of "dst" are copied from "src". Only those converted quadwords that occur before the first 64-byte-aligned address following "mt" are loaded. Elements in the resulting vector that do not map to those quadwords are taken from "src". "hint" indicates to the processor whether the loaded data is non-temporal.</description>
+	<operation>
+UPCONVERT(address, offset, convertTo) {
+	CASE conv OF
+	_MM_UPCONV_PD_NONE:	RETURN MEM[addr + 8*offset]
+	ESAC
+}
+
+UPCONVERTSIZE(convertTo) {
+	CASE conv OF
+	_MM_UPCONV_PD_NONE:	RETURN 8
+	ESAC
+}
+
+dst[511:0] := src[511:0]
+loadOffset := 0
+upSize := UPCONVERTSIZE(conv)
+addr = mt
+FOR j := 0 to 7
+	i := j*64
+	dst[i+63:i] := UPCONVERT(addr, loadOffset, conv)
+	loadOffset := loadOffset + 1
+	IF (mt + loadOffset * upSize) % 64 == 0
+		break
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name="vloadunpacklpd" form="zmm {k}, m512" xed=""/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="KNC" rettype="__m512d" name="_mm512_mask_extloadunpacklo_pd">
+	<type>Floating Point</type>
+	<CPUID>KNCNI</CPUID>
+	<category>Load</category>
+	<parameter varname="src" type="__m512d"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="mt" type="void const *"/>
+	<parameter varname="conv" type="_MM_UPCONV_PD_ENUM"/>
+	<parameter varname="hint" type="int"/>
+	<description>Loads the low-64-byte-aligned portion of the quadword stream starting at element-aligned address mt, up-converted depending on the value of "conv", and expanded into packed double-precision (64-bit) floating-point elements in "dst". The initial values of "dst" are copied from "src". Only those converted quadwords that occur before the first 64-byte-aligned address following "mt" are loaded. Elements in the resulting vector that do not map to those quadwords are taken from "src". "hint" indicates to the processor whether the loaded data is non-temporal. Elements are copied to "dst" according to element selector "k" (elements are skipped when the corresponding mask bit is not set).</description>
+	<operation>
+UPCONVERT(address, offset, convertTo) {
+	CASE conv OF
+	_MM_UPCONV_PD_NONE:	RETURN MEM[addr + 8*offset]
+	ESAC
+}
+
+UPCONVERTSIZE(convertTo) {
+	CASE conv OF
+	_MM_UPCONV_PD_NONE:	RETURN 8
+	ESAC
+}
+
+dst[511:0] := src[511:0]
+loadOffset := 0
+upSize := UPCONVERTSIZE(conv)
+addr = mt
+FOR j := 0 to 7
+	IF k[j]
+		i := j*64
+		dst[i+63:i] := UPCONVERT(addr, loadOffset, conv)
+		loadOffset := loadOffset + 1
+		IF (mt + loadOffset * upSize) % 64 == 0
+			break
+		FI
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name="vloadunpacklpd" form="zmm {k}, m512" xed=""/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="KNC" rettype="void" name="_mm512_extpackstorehi_epi32">
+	<type>Integer</type>
+	<CPUID>KNCNI</CPUID>
+	<category>Store</category>
+	<parameter varname="mt" type="void *"/>
+	<parameter varname="v1" type="__m512i"/>
+	<parameter varname="conv" type="_MM_DOWNCONV_EPI32_ENUM"/>
+	<parameter varname="hint" type="int"/>
+	<description>Down-converts and stores packed 32-bit integer elements of "v1" into a byte/word/doubleword stream according to "conv" at a logically mapped starting address (mt-64), storing the high-64-byte elements of that stream (those elements of the stream that map at or after the first 64-byte-aligned address following (mt-64)). "hint" indicates to the processor whether the data is non-temporal.</description>
+	<operation>
+DOWNCONVERT(element, convertTo) {
+	CASE convertTo OF
+	_MM_DOWNCONV_EPI32_NONE:   RETURN element[i+31:i]
+	_MM_DOWNCONV_EPI32_UINT8:  RETURN UInt32ToUInt8(element[i+31:i])
+	_MM_DOWNCONV_EPI32_SINT8:  RETURN SInt32ToSInt8(element[i+31:i])
+	_MM_DOWNCONV_EPI32_UINT16: RETURN UInt32ToUInt16(element[i+31:i])
+	_MM_DOWNCONV_EPI32_SINT16: RETURN SInt32ToSInt16(element[i+31:i])
+	ESAC
+}
+
+DOWNCONVERTSIZE(convertTo) {
+	CASE convertTo OF
+	_MM_DOWNCONV_EPI32_NONE:   RETURN 4
+	_MM_DOWNCONV_EPI32_UINT8:  RETURN 1
+	_MM_DOWNCONV_EPI32_SINT8:  RETURN 1
+	_MM_DOWNCONV_EPI32_UINT16: RETURN 2
+	_MM_DOWNCONV_EPI32_SINT16: RETURN 2
+	ESAC
+}
+
+storeOffset := 0
+foundNext64BytesBoundary := false
+downSize := DOWNCONVERTSIZE(conv)
+addr = mt-64
+FOR j := 0 to 15
+	IF foundNext64BytesBoundary == false
+		IF ((addr + (storeOffset + 1)*downSize) % 64) == 0
+			foundNext64BytesBoundary = true
+		FI
+	ELSE
+		i := j*32
+		tmp := DOWNCONVERT(v1[i+31:i], conv)
+		storeAddr := addr + storeOffset * downSize
+		CASE downSize OF
+		4: MEM[storeAddr] := tmp[31:0]
+		2: MEM[storeAddr] := tmp[15:0]
+		1: MEM[storeAddr] := tmp[7:0]
+		ESAC
+	FI
+	storeOffset := storeOffset + 1
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name="vpackstorehd" form="m512 {k}, zmm" xed=""/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="KNC" rettype="void" name="_mm512_mask_extpackstorehi_epi32">
+	<type>Integer</type>
+	<CPUID>KNCNI</CPUID>
+	<category>Store</category>
+	<parameter varname="mt" type="void *"/>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="v1" type="__m512i"/>
+	<parameter varname="conv" type="_MM_DOWNCONV_EPI32_ENUM"/>
+	<parameter varname="hint" type="int"/>
+	<description>Down-converts and stores packed 32-bit integer elements of "v1" into a byte/word/doubleword stream according to "conv" at a logically mapped starting address (mt-64), storing the high-64-byte elements of that stream (those elements of the stream that map at or after the first 64-byte-aligned address following (mt-64)). "hint" indicates to the processor whether the data is non-temporal. Elements are stored to memory according to element selector "k" (elements are skipped when the corresponding mask bit is not set).</description>
+	<operation>
+DOWNCONVERT(element, convertTo) {
+	CASE convertTo OF
+	_MM_DOWNCONV_EPI32_NONE:   RETURN element[i+31:i]
+	_MM_DOWNCONV_EPI32_UINT8:  RETURN UInt32ToUInt8(element[i+31:i])
+	_MM_DOWNCONV_EPI32_SINT8:  RETURN SInt32ToSInt8(element[i+31:i])
+	_MM_DOWNCONV_EPI32_UINT16: RETURN UInt32ToUInt16(element[i+31:i])
+	_MM_DOWNCONV_EPI32_SINT16: RETURN SInt32ToSInt16(element[i+31:i])
+	ESAC
+}
+
+DOWNCONVERTSIZE(convertTo) {
+	CASE convertTo OF
+	_MM_DOWNCONV_EPI32_NONE:   RETURN 4
+	_MM_DOWNCONV_EPI32_UINT8:  RETURN 1
+	_MM_DOWNCONV_EPI32_SINT8:  RETURN 1
+	_MM_DOWNCONV_EPI32_UINT16: RETURN 2
+	_MM_DOWNCONV_EPI32_SINT16: RETURN 2
+	ESAC
+}
+
+storeOffset := 0
+foundNext64BytesBoundary := false
+downSize := DOWNCONVERTSIZE(conv)
+addr = mt-64
+FOR j := 0 to 15
+	IF k[j]
+		IF foundNext64BytesBoundary == false
+			IF ((addr + (storeOffset + 1)*downSize) % 64) == 0
+				foundNext64BytesBoundary = true
+			FI
+		ELSE
+			i := j*32
+			tmp := DOWNCONVERT(v1[i+31:i], conv)
+			storeAddr := addr + storeOffset * downSize
+			CASE downSize OF
+			4: MEM[storeAddr] := tmp[31:0]
+			2: MEM[storeAddr] := tmp[15:0]
+			1: MEM[storeAddr] := tmp[7:0]
+			ESAC
+		FI
+		storeOffset := storeOffset + 1
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name="vpackstorehd" form="m512 {k}, zmm" xed=""/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="KNC" rettype="void" name="_mm512_extpackstorelo_epi32">
+	<type>Integer</type>
+	<CPUID>KNCNI</CPUID>
+	<category>Store</category>
+	<parameter varname="mt" type="void *"/>
+	<parameter varname="v1" type="__m512i"/>
+	<parameter varname="conv" type="_MM_DOWNCONV_EPI32_ENUM"/>
+	<parameter varname="hint" type="int"/>
+	<description>Down-converts and stores packed 32-bit integer elements of "v1" into a byte/word/doubleword stream according to "conv" at a logically mapped starting address "mt", storing the low-64-byte elements of that stream (those elements of the stream that map before the first 64-byte-aligned address following "mt"). "hint" indicates to the processor whether the data is non-temporal.</description>
+	<operation>
+DOWNCONVERT(element, convertTo) {
+	CASE convertTo OF
+	_MM_DOWNCONV_EPI32_NONE:   RETURN element[i+31:i]
+	_MM_DOWNCONV_EPI32_UINT8:  RETURN UInt32ToUInt8(element[i+31:i])
+	_MM_DOWNCONV_EPI32_SINT8:  RETURN SInt32ToSInt8(element[i+31:i])
+	_MM_DOWNCONV_EPI32_UINT16: RETURN UInt32ToUInt16(element[i+31:i])
+	_MM_DOWNCONV_EPI32_SINT16: RETURN SInt32ToSInt16(element[i+31:i])
+	ESAC
+}
+
+DOWNCONVERTSIZE(convertTo) {
+	CASE convertTo OF
+	_MM_DOWNCONV_EPI32_NONE:   RETURN 4
+	_MM_DOWNCONV_EPI32_UINT8:  RETURN 1
+	_MM_DOWNCONV_EPI32_SINT8:  RETURN 1
+	_MM_DOWNCONV_EPI32_UINT16: RETURN 2
+	_MM_DOWNCONV_EPI32_SINT16: RETURN 2
+	ESAC
+}
+
+storeOffset := 0
+downSize := DOWNCONVERTSIZE(conv)
+addr = mt
+FOR j := 0 to 15
+	i := j*32
+	tmp := DOWNCONVERT(v1[i+31:i], conv)
+	storeAddr := addr + storeOffset * downSize
+	CASE downSize OF
+	4: MEM[storeAddr] := tmp[31:0]
+	2: MEM[storeAddr] := tmp[15:0]
+	1: MEM[storeAddr] := tmp[7:0]
+	ESAC
+	storeOffset := storeOffset + 1
+	IF ((addr + storeOffset * downSize) % 64) == 0
+		BREAK
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name="vpackstoreld" form="m512 {k}, zmm" xed=""/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="KNC" rettype="void" name="_mm512_mask_extpackstorelo_epi32">
+	<type>Integer</type>
+	<CPUID>KNCNI</CPUID>
+	<category>Store</category>
+	<parameter varname="mt" type="void *"/>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="v1" type="__m512i"/>
+	<parameter varname="conv" type="_MM_DOWNCONV_EPI32_ENUM"/>
+	<parameter varname="hint" type="int"/>
+	<description>Down-converts and stores packed 32-bit integer elements of "v1" into a byte/word/doubleword stream according to "conv" at a logically mapped starting address "mt", storing the low-64-byte elements of that stream (those elements of the stream that map before the first 64-byte-aligned address following "mt"). "hint" indicates to the processor whether the data is non-temporal. Elements are written to memory according to element selector "k" (elements are skipped when the corresponding mask bit is not set).</description>
+	<operation>
+DOWNCONVERT(element, convertTo) {
+	CASE convertTo OF
+	_MM_DOWNCONV_EPI32_NONE:   RETURN element[i+31:i]
+	_MM_DOWNCONV_EPI32_UINT8:  RETURN UInt32ToUInt8(element[i+31:i])
+	_MM_DOWNCONV_EPI32_SINT8:  RETURN SInt32ToSInt8(element[i+31:i])
+	_MM_DOWNCONV_EPI32_UINT16: RETURN UInt32ToUInt16(element[i+31:i])
+	_MM_DOWNCONV_EPI32_SINT16: RETURN SInt32ToSInt16(element[i+31:i])
+	ESAC
+}
+
+DOWNCONVERTSIZE(convertTo) {
+	CASE convertTo OF
+	_MM_DOWNCONV_EPI32_NONE:   RETURN 4
+	_MM_DOWNCONV_EPI32_UINT8:  RETURN 1
+	_MM_DOWNCONV_EPI32_SINT8:  RETURN 1
+	_MM_DOWNCONV_EPI32_UINT16: RETURN 2
+	_MM_DOWNCONV_EPI32_SINT16: RETURN 2
+	ESAC
+}
+
+storeOffset := 0
+downSize := DOWNCONVERTSIZE(conv)
+addr = mt
+FOR j := 0 to 15
+	IF k[j]
+		i := j*32
+		tmp := DOWNCONVERT(v1[i+31:i], conv)
+		storeAddr := addr + storeOffset * downSize
+		CASE downSize OF
+		4: MEM[storeAddr] := tmp[31:0]
+		2: MEM[storeAddr] := tmp[15:0]
+		1: MEM[storeAddr] := tmp[7:0]
+		ESAC
+		storeOffset := storeOffset + 1
+		IF ((addr + storeOffset * downSize) % 64) == 0
+			BREAK
+		FI
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name="vpackstoreld" form="m512 {k}, zmm" xed=""/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="KNC" rettype="void" name="_mm512_extpackstorehi_epi64">
+	<type>Integer</type>
+	<CPUID>KNCNI</CPUID>
+	<category>Store</category>
+	<parameter varname="mt" type="void *"/>
+	<parameter varname="v1" type="__m512i"/>
+	<parameter varname="conv" type="_MM_DOWNCONV_EPI64_ENUM"/>
+	<parameter varname="hint" type="int"/>
+	<description>Down-converts and stores packed 64-bit integer elements of "v1" into a quadword stream according to "conv" at a logically mapped starting address (mt-64), storing the high-64-byte elements of that stream (those elements of the stream that map at or after the first 64-byte-aligned address following (mt-64)). "hint" indicates to the processor whether the data is non-temporal.</description>
+	<operation>
+DOWNCONVERT(element, convertTo) {
+	CASE convertTo OF
+	_MM_DOWNCONV_EPI64_NONE: RETURN element[i+63:i]
+	ESAC
+}
+
+DOWNCONVERTSIZE(convertTo) {
+	CASE convertTo OF
+	_MM_DOWNCONV_EPI64_NONE: RETURN 8
+	ESAC
+}
+
+storeOffset := 0
+foundNext64BytesBoundary := false
+downSize := DOWNCONVERTSIZE(conv)
+addr = mt-64
+FOR j := 0 to 7
+	IF foundNext64BytesBoundary == false
+		IF ((addr + (storeOffset + 1)*downSize) % 64) == 0
+			foundNext64BytesBoundary = true
+		FI
+	ELSE
+		i := j*64
+		tmp := DOWNCONVERT(v1[i+63:i], conv)
+		storeAddr := addr + storeOffset * downSize
+		CASE downSize OF
+		8: MEM[storeAddr] := tmp[63:0]
+		ESAC
+	FI
+	storeOffset := storeOffset + 1
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name="vpackstorehq" form="m512 {k}, zmm" xed=""/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="KNC" rettype="void" name="_mm512_mask_extpackstorehi_epi64">
+	<type>Integer</type>
+	<CPUID>KNCNI</CPUID>
+	<category>Store</category>
+	<parameter varname="mt" type="void *"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="v1" type="__m512i"/>
+	<parameter varname="conv" type="_MM_DOWNCONV_EPI64_ENUM"/>
+	<parameter varname="hint" type="int"/>
+	<description>Down-converts and stores packed 64-bit integer elements of "v1" into a quadword stream according to "conv" at a logically mapped starting address (mt-64), storing the high-64-byte elements of that stream (those elements of the stream that map at or after the first 64-byte-aligned address following (mt-64)). "hint" indicates to the processor whether the data is non-temporal. Elements are stored to memory according to element selector "k" (elements are skipped when the corresponding mask bit is not set).</description>
+	<operation>
+DOWNCONVERT(element, convertTo) {
+	CASE convertTo OF
+	_MM_DOWNCONV_EPI64_NONE: RETURN element[i+63:i]
+	ESAC
+}
+
+DOWNCONVERTSIZE(convertTo) {
+	CASE convertTo OF
+	_MM_DOWNCONV_EPI64_NONE: RETURN 8
+	ESAC
+}
+
+storeOffset := 0
+foundNext64BytesBoundary := false
+downSize := DOWNCONVERTSIZE(conv)
+addr = mt-64
+FOR j := 0 to 7
+	IF k[j]
+		IF foundNext64BytesBoundary == false
+			IF ((addr + (storeOffset + 1)*downSize) % 64) == 0
+				foundNext64BytesBoundary = true
+			FI
+		ELSE
+			i := j*64
+			tmp := DOWNCONVERT(v1[i+63:i], conv)
+			storeAddr := addr + storeOffset * downSize
+			CASE downSize OF
+			8: MEM[storeAddr] := tmp[63:0]
+			ESAC
+		FI
+		storeOffset := storeOffset + 1
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name="vpackstorehq" form="m512 {k}, zmm" xed=""/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="KNC" rettype="void" name="_mm512_extpackstorelo_epi64">
+	<type>Integer</type>
+	<CPUID>KNCNI</CPUID>
+	<category>Store</category>
+	<parameter varname="mt" type="void *"/>
+	<parameter varname="v1" type="__m512i"/>
+	<parameter varname="conv" type="_MM_DOWNCONV_EPI64_ENUM"/>
+	<parameter varname="hint" type="int"/>
+	<description>Down-converts and stores packed 64-bit integer elements of "v1" into a quadword stream according to "conv" at a logically mapped starting address "mt", storing the low-64-byte elements of that stream (those elements of the stream that map before the first 64-byte-aligned address following "mt"). "hint" indicates to the processor whether the data is non-temporal.</description>
+	<operation>
+DOWNCONVERT(element, convertTo) {
+	CASE convertTo OF
+	_MM_DOWNCONV_EPI64_NONE: RETURN element[i+63:i]
+	ESAC
+}
+
+DOWNCONVERTSIZE(convertTo) {
+	CASE convertTo OF
+	_MM_DOWNCONV_EPI64_NONE: RETURN 8
+	ESAC
+}
+
+storeOffset := 0
+downSize := DOWNCONVERTSIZE(conv)
+addr = mt
+FOR j := 0 to 7
+	i := j*64
+	tmp := DOWNCONVERT(v1[i+63:i], conv)
+	storeAddr := addr + storeOffset * downSize
+	CASE downSize OF
+	8: MEM[storeAddr] := tmp[63:0]
+	ESAC
+	storeOffset := storeOffset + 1
+	IF ((addr + storeOffset * downSize) % 64) == 0
+		BREAK
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name="vpackstorelq" form="m512 {k}, zmm" xed=""/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="KNC" rettype="void" name="_mm512_mask_extpackstorelo_epi64">
+	<type>Integer</type>
+	<CPUID>KNCNI</CPUID>
+	<category>Store</category>
+	<parameter varname="mt" type="void *"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="v1" type="__m512i"/>
+	<parameter varname="conv" type="_MM_DOWNCONV_EPI64_ENUM"/>
+	<parameter varname="hint" type="int"/>
+	<description>Down-converts and stores packed 64-bit integer elements of "v1" into a quadword stream according to "conv" at a logically mapped starting address "mt", storing the low-64-byte elements of that stream (those elements of the stream that map before the first 64-byte-aligned address following "mt"). "hint" indicates to the processor whether the data is non-temporal. Elements are stored to memory according to element selector "k" (elements are skipped when the corresponding mask bit is not set).</description>
+	<operation>
+DOWNCONVERT(element, convertTo) {
+	CASE convertTo OF
+	_MM_DOWNCONV_EPI64_NONE: RETURN element[i+63:i]
+	ESAC
+}
+
+DOWNCONVERTSIZE(convertTo) {
+	CASE convertTo OF
+	_MM_DOWNCONV_EPI64_NONE: RETURN 8
+	ESAC
+}
+
+storeOffset := 0
+downSize := DOWNCONVERTSIZE(conv)
+addr = mt
+FOR j := 0 to 7
+	IF k[j]
+		i := j*64
+		tmp := DOWNCONVERT(v1[i+63:i], conv)
+		storeAddr := addr + storeOffset * downSize
+		CASE downSize OF
+		8: MEM[storeAddr] := tmp[63:0]
+		ESAC
+		storeOffset := storeOffset + 1
+		IF ((addr + storeOffset * downSize) % 64) == 0
+			BREAK
+		FI
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name="vpackstorelq" form="m512 {k}, zmm" xed=""/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="KNC" rettype="void" name="_mm512_extpackstorehi_ps">
+	<type>Floating Point</type>
+	<CPUID>KNCNI</CPUID>
+	<category>Store</category>
+	<parameter varname="mt" type="void *"/>
+	<parameter varname="v1" type="__m512"/>
+	<parameter varname="conv" type="_MM_DOWNCONV_PS_ENUM"/>
+	<parameter varname="hint" type="int"/>
+	<description>Down-converts and stores packed single-precision (32-bit) floating-point elements of "v1" into a byte/word/doubleword stream according to "conv" at a logically mapped starting address (mt-64), storing the high-64-byte elements of that stream (those elements of the stream that map at or after the first 64-byte-aligned address following (mt-64)). "hint" indicates to the processor whether the data is non-temporal.</description>
+	<operation>
+DOWNCONVERT(element, convertTo) {
+	CASE convertTo OF
+	_MM_DOWNCONV_PS_NONE:    RETURN element[i+31:i]
+	_MM_DOWNCONV_PS_FLOAT16: RETURN Float32ToFloat16(element[i+31:i])
+	_MM_DOWNCONV_PS_UINT8:   RETURN UInt32ToUInt8(element[i+31:i])
+	_MM_DOWNCONV_PS_SINT8:   RETURN SInt32ToSInt8(element[i+31:i])
+	_MM_DOWNCONV_PS_UINT16:  RETURN UInt32ToUInt16(element[i+31:i])
+	_MM_DOWNCONV_PS_SINT16:  RETURN SInt32ToSInt16(element[i+31:i])
+	ESAC
+}
+
+DOWNCONVERTSIZE(convertTo) {
+	CASE convertTo OF
+	_MM_DOWNCONV_PS_NONE:    RETURN 4
+	_MM_DOWNCONV_PS_FLOAT16: RETURN 2
+	_MM_DOWNCONV_PS_UINT8:   RETURN 1
+	_MM_DOWNCONV_PS_SINT8:   RETURN 1
+	_MM_DOWNCONV_PS_UINT16:  RETURN 2
+	_MM_DOWNCONV_PS_SINT16:  RETURN 2
+	ESAC
+}
+
+storeOffset := 0
+foundNext64BytesBoundary := false
+downSize := DOWNCONVERTSIZE(conv)
+addr = mt-64
+FOR j := 0 to 15
+	IF foundNext64BytesBoundary == false
+		IF ((addr + (storeOffset + 1)*downSize) % 64) == 0
+			foundNext64BytesBoundary = true
+		FI
+	ELSE
+		i := j*32
+		tmp := DOWNCONVERT(v1[i+31:i], conv)
+		storeAddr := addr + storeOffset * downSize
+		CASE downSize OF
+		4: MEM[storeAddr] := tmp[31:0]
+		2: MEM[storeAddr] := tmp[15:0]
+		1: MEM[storeAddr] := tmp[7:0]
+		ESAC
+	FI
+	storeOffset := storeOffset + 1
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name="vpackstorehps" form="m512 {k}, zmm" xed=""/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="KNC" rettype="void" name="_mm512_mask_extpackstorehi_ps">
+	<type>Floating Point</type>
+	<CPUID>KNCNI</CPUID>
+	<category>Store</category>
+	<parameter varname="mt" type="void *"/>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="v1" type="__m512"/>
+	<parameter varname="conv" type="_MM_DOWNCONV_PS_ENUM"/>
+	<parameter varname="hint" type="int"/>
+	<description>Down-converts and stores packed single-precision (32-bit) floating-point elements of "v1" into a byte/word/doubleword stream according to "conv" at a logically mapped starting address (mt-64), storing the high-64-byte elements of that stream (those elements of the stream that map at or after the first 64-byte-aligned address following (mt-64)). "hint" indicates to the processor whether the data is non-temporal. Elements are stored to memory according to element selector "k" (elements are skipped when the corresponding mask bit is not set).</description>
+	<operation>
+DOWNCONVERT(element, convertTo) {
+	CASE converTo OF
+	_MM_UPCONV_PS_NONE:	   RETURN element[i+31:i]
+	_MM_UPCONV_PS_FLOAT16: RETURN Float32ToFloat16(element[i+31:i])
+	_MM_UPCONV_PS_UINT8:   RETURN UInt32ToUInt8(element[i+31:i])
+	_MM_UPCONV_PS_SINT8:   RETURN SInt32ToSInt8(element[i+31:i])
+	_MM_UPCONV_PS_UINT16:  RETURN UInt32ToUInt16(element[i+31:i])
+	_MM_UPCONV_PS_SINT16:  RETURN SInt32ToSInt16(element[i+31:i])
+	ESAC
+}
+
+DOWNCONVERTSIZE(convertTo) {
+	CASE convertTo OF
+	_MM_UPCONV_PS_NONE:    RETURN 4
+	_MM_UPCONV_PS_FLOAT16: RETURN 2
+	_MM_UPCONV_PS_UINT8:   RETURN 1
+	_MM_UPCONV_PS_SINT8:   RETURN 1
+	_MM_UPCONV_PS_UINT16:  RETURN 2
+	_MM_UPCONV_PS_SINT16:  RETURN 2
+	ESAC
+}
+
+storeOffset := 0
+foundNext64BytesBoundary := false
+downSize := DOWNCONVERTSIZE(conv)
+addr = mt-64
+FOR j := 0 to 15
+	IF k[j]
+		IF foundNext64BytesBoundary == false
+			IF ((addr + (storeOffset + 1)*downSize) % 64) == 0
+				foundNext64BytesBoundary = true
+			FI
+		ELSE
+			i := j*32
+			tmp := DOWNCONVERT(v1[i+31:i], conv)
+			storeAddr := addr + storeOffset * downSize
+			CASE downSize OF
+			4: MEM[storeAddr] := tmp[31:0]
+			2: MEM[storeAddr] := tmp[15:0]
+			1: MEM[storeAddr] := tmp[7:0]
+			ESAC
+		FI
+		storeOffset := storeOffset + 1
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name="vpackstorehps" form="m512 {k}, zmm" xed=""/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="KNC" rettype="void" name="_mm512_extpackstorelo_ps">
+	<type>Floating Point</type>
+	<CPUID>KNCNI</CPUID>
+	<category>Store</category>
+	<parameter varname="mt" type="void *"/>
+	<parameter varname="v1" type="__m512"/>
+	<parameter varname="conv" type="_MM_DOWNCONV_PS_ENUM"/>
+	<parameter varname="hint" type="int"/>
+	<description>Down-converts and stores packed single-precision (32-bit) floating-point elements of "v1" into a byte/word/doubleword stream according to "conv" at a logically mapped starting address "mt", storing the low-64-byte elements of that stream (those elements of the stream that map before the first 64-byte-aligned address following "mt"). "hint" indicates to the processor whether the data is non-temporal.</description>
+	<operation>
+DOWNCONVERT(element, convertTo) {
+	CASE convertTo OF
+	_MM_UPCONV_PS_NONE:	   RETURN element[i+31:i]
+	_MM_UPCONV_PS_FLOAT16: RETURN Float32ToFloat16(element[i+31:i])
+	_MM_UPCONV_PS_UINT8:   RETURN UInt32ToUInt8(element[i+31:i])
+	_MM_UPCONV_PS_SINT8:   RETURN SInt32ToSInt8(element[i+31:i])
+	_MM_UPCONV_PS_UINT16:  RETURN UInt32ToUInt16(element[i+31:i])
+	_MM_UPCONV_PS_SINT16:  RETURN SInt32ToSInt16(element[i+31:i])
+	ESAC
+}
+
+DOWNCONVERTSIZE(convertTo) {
+	CASE convertTo OF
+	_MM_UPCONV_PS_NONE:	   RETURN 4
+	_MM_UPCONV_PS_FLOAT16: RETURN 2
+	_MM_UPCONV_PS_UINT8:   RETURN 1
+	_MM_UPCONV_PS_SINT8:   RETURN 1
+	_MM_UPCONV_PS_UINT16:  RETURN 2
+	_MM_UPCONV_PS_SINT16:  RETURN 2
+	ESAC
+}
+
+storeOffset := 0
+downSize := DOWNCONVERTSIZE(conv)
+addr = mt
+FOR j := 0 to 15
+	i := j*32
+	tmp := DOWNCONVERT(v1[i+31:i], conv)
+	storeAddr := addr + storeOffset * downSize
+	CASE downSize OF
+	4: MEM[storeAddr] := tmp[31:0]
+	2: MEM[storeAddr] := tmp[15:0]
+	1: MEM[storeAddr] := tmp[7:0]
+	ESAC
+	storeOffset := storeOffset + 1
+	IF ((addr + storeOffset * downSize) % 64) == 0
+		BREAK
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name="vpackstorelps" form="m512 {k}, zmm" xed=""/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="KNC" rettype="void" name="_mm512_mask_extpackstorelo_ps">
+	<type>Floating Point</type>
+	<CPUID>KNCNI</CPUID>
+	<category>Store</category>
+	<parameter varname="mt" type="void *"/>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="v1" type="__m512"/>
+	<parameter varname="conv" type="_MM_DOWNCONV_PS_ENUM"/>
+	<parameter varname="hint" type="int"/>
+	<description>Down-converts and stores packed single-precision (32-bit) floating-point elements of "v1" into a byte/word/doubleword stream according to "conv" at a logically mapped starting address "mt", storing the low-64-byte elements of that stream (those elements of the stream that map before the first 64-byte-aligned address following "mt"). "hint" indicates to the processor whether the data is non-temporal. Elements are stored to memory according to element selector "k" (elements are skipped when the corresponding mask bit is not set).</description>
+	<operation>
+DOWNCONVERT(element, convertTo) {
+	CASE convertTo OF
+	_MM_UPCONV_PS_NONE:	   RETURN element[i+31:i]
+	_MM_UPCONV_PS_FLOAT16: RETURN Float32ToFloat16(element[i+31:i])
+	_MM_UPCONV_PS_UINT8:   RETURN UInt32ToUInt8(element[i+31:i])
+	_MM_UPCONV_PS_SINT8:   RETURN SInt32ToSInt8(element[i+31:i])
+	_MM_UPCONV_PS_UINT16:  RETURN UInt32ToUInt16(element[i+31:i])
+	_MM_UPCONV_PS_SINT16:  RETURN SInt32ToSInt16(element[i+31:i])
+	ESAC
+}
+
+DOWNCONVERTSIZE(convertTo) {
+	CASE convertTo OF
+	_MM_UPCONV_PS_NONE:    RETURN 4
+	_MM_UPCONV_PS_FLOAT16: RETURN 2
+	_MM_UPCONV_PS_UINT8:   RETURN 1
+	_MM_UPCONV_PS_SINT8:   RETURN 1
+	_MM_UPCONV_PS_UINT16:  RETURN 2
+	_MM_UPCONV_PS_SINT16:  RETURN 2
+	ESAC
+}
+
+storeOffset := 0
+downSize := DOWNCONVERTSIZE(conv)
+addr = mt
+FOR j := 0 to 15
+	IF k[j]
+		i := j*32
+		tmp := DOWNCONVERT(v1[i+31:i], conv)
+		storeAddr := addr + storeOffset * downSize
+		CASE downSize OF
+		4: MEM[storeAddr] := tmp[31:0]
+		2: MEM[storeAddr] := tmp[15:0]
+		1: MEM[storeAddr] := tmp[7:0]
+		ESAC
+		storeOffset := storeOffset + 1
+		IF ((addr + storeOffset * downSize) % 64) == 0
+			BREAK
+		FI
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name="vpackstorelps" form="m512 {k}, zmm" xed=""/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="KNC" rettype="void" name="_mm512_extpackstorehi_pd">
+	<type>Floating Point</type>
+	<CPUID>KNCNI</CPUID>
+	<category>Store</category>
+	<parameter varname="mt" type="void *"/>
+	<parameter varname="v1" type="__m512d"/>
+	<parameter varname="conv" type="_MM_DOWNCONV_PD_ENUM"/>
+	<parameter varname="hint" type="int"/>
+	<description>Down-converts and stores packed double-precision (64-bit) floating-point elements of "v1" into a quadword stream according to "conv" at a logically mapped starting address (mt-64), storing the high-64-byte elements of that stream (those elements of the stream that map at or after the first 64-byte-aligned address following (mt-64)). "hint" indicates to the processor whether the data is non-temporal.</description>
+	<operation>
+DOWNCONVERT(element, convertTo) {
+	CASE convertTo OF
+	_MM_UPCONV_PD_NONE: RETURN element[i+63:i]
+	ESAC
+}
+
+DOWNCONVERTSIZE(convertTo) {
+	CASE convertTo OF
+	_MM_UPCONV_PD_NONE: RETURN 8
+	ESAC
+}
+
+storeOffset := 0
+foundNext64BytesBoundary := false
+downSize := DOWNCONVERTSIZE(conv)
+addr = mt-64
+FOR j := 0 to 7
+	IF foundNext64BytesBoundary == false
+		IF ((addr + (storeOffset + 1)*downSize) % 64) == 0
+			foundNext64BytesBoundary = true
+		FI
+	ELSE
+		i := j*64
+		tmp := DOWNCONVERT(v1[i+63:i], conv)
+		storeAddr := addr + storeOffset * downSize
+		CASE downSize OF
+		8: MEM[storeAddr] := tmp[63:0]
+		ESAC
+	FI
+	storeOffset := storeOffset + 1
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name="vpackstorehpd" form="m512 {k}, zmm" xed=""/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="KNC" rettype="void" name="_mm512_mask_extpackstorehi_pd">
+	<type>Floating Point</type>
+	<CPUID>KNCNI</CPUID>
+	<category>Store</category>
+	<parameter varname="mt" type="void *"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="v1" type="__m512d"/>
+	<parameter varname="conv" type="_MM_DOWNCONV_PD_ENUM"/>
+	<parameter varname="hint" type="int"/>
+	<description>Down-converts and stores packed double-precision (64-bit) floating-point elements of "v1" into a quadword stream according to "conv" at a logically mapped starting address (mt-64), storing the high-64-byte elements of that stream (those elements of the stream that map at or after the first 64-byte-aligned address following (mt-64)). "hint" indicates to the processor whether the data is non-temporal. Elements are stored to memory according to element selector "k" (elements are skipped when the corresponding mask bit is not set).</description>
+	<operation>
+DOWNCONVERT(element, convertTo) {
+	CASE convertTo OF
+	_MM_UPCONV_PD_NONE: RETURN element[i+63:i]
+	ESAC
+}
+
+DOWNCONVERTSIZE(convertTo) {
+	CASE convertTo OF
+	_MM_UPCONV_PD_NONE: RETURN 8
+	ESAC
+}
+
+storeOffset := 0
+foundNext64BytesBoundary := false
+downSize := DOWNCONVERTSIZE(conv)
+addr = mt-64
+FOR j := 0 to 7
+	IF k[j]
+		IF foundNext64BytesBoundary == false
+			IF ((addr + (storeOffset + 1)*downSize) % 64) == 0
+				foundNext64BytesBoundary = true
+			FI
+		ELSE
+			i := j*64
+			tmp := DOWNCONVERT(v1[i+63:i], conv)
+			storeAddr := addr + storeOffset * downSize
+			CASE downSize OF
+			8: MEM[storeAddr] := tmp[63:0]
+			ESAC
+		FI
+		storeOffset := storeOffset + 1
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name="vpackstorehpd" form="m512 {k}, zmm" xed=""/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="KNC" rettype="void" name="_mm512_extpackstorelo_pd">
+	<type>Floating Point</type>
+	<CPUID>KNCNI</CPUID>
+	<category>Store</category>
+	<parameter varname="mt" type="void *"/>
+	<parameter varname="v1" type="__m512d"/>
+	<parameter varname="conv" type="_MM_DOWNCONV_PD_ENUM"/>
+	<parameter varname="hint" type="int"/>
+	<description>Down-converts and stores packed double-precision (64-bit) floating-point elements of "v1" into a quadword stream according to "conv" at a logically mapped starting address "mt", storing the low-64-byte elements of that stream (those elements of the stream that map before the first 64-byte-aligned address following "mt"). "hint" indicates to the processor whether the data is non-temporal.</description>
+	<operation>
+DOWNCONVERT(element, convertTo) {
+	CASE convertTo OF
+	_MM_UPCONV_PD_NONE: RETURN element[i+63:i]
+	ESAC
+}
+
+DOWNCONVERTSIZE(convertTo) {
+	CASE convertTo OF
+	_MM_UPCONV_PD_NONE: RETURN 8
+	ESAC
+}
+
+storeOffset := 0
+downSize := DOWNCONVERTSIZE(conv)
+addr = mt
+FOR j := 0 to 7
+	i := j*64
+	tmp := DOWNCONVERT(v1[i+63:i], conv)
+	storeAddr := addr + storeOffset * downSize
+	CASE downSize OF
+	8: MEM[storeAddr] := tmp[63:0]
+	ESAC
+	storeOffset := storeOffset + 1
+	IF ((addr + storeOffset * downSize) % 64) == 0
+		BREAK
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name="vpackstorelpd" form="m512 {k}, zmm" xed=""/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="KNC" rettype="void" name="_mm512_mask_extpackstorelo_pd">
+	<type>Floating Point</type>
+	<CPUID>KNCNI</CPUID>
+	<category>Store</category>
+	<parameter varname="mt" type="void *"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="v1" type="__m512d"/>
+	<parameter varname="conv" type="_MM_DOWNCONV_PD_ENUM"/>
+	<parameter varname="hint" type="int"/>
+	<description>Down-converts and stores packed double-precision (64-bit) floating-point elements of "v1" into a quadword stream according to "conv" at a logically mapped starting address "mt", storing the low-64-byte elements of that stream (those elements of the stream that map before the first 64-byte-aligned address following "mt"). "hint" indicates to the processor whether the data is non-temporal. Elements are stored to memory according to element selector "k" (elements are skipped when the corresponding mask bit is not set).</description>
+	<operation>
+DOWNCONVERT(element, convertTo) {
+	CASE convertTo OF
+	_MM_UPCONV_PD_NONE: RETURN element[i+63:i]
+	ESAC
+}
+
+DOWNCONVERTSIZE(convertTo) {
+	CASE convertTo OF
+	_MM_UPCONV_PD_NONE: RETURN 8
+	ESAC
+}
+
+storeOffset := 0
+downSize := DOWNCONVERTSIZE(conv)
+addr = mt
+FOR j := 0 to 7
+	IF k[j]
+		i := j*64
+		tmp := DOWNCONVERT(v1[i+63:i], conv)
+		storeAddr := addr + storeOffset * downSize
+		CASE downSize OF
+		8: MEM[storeAddr] := tmp[63:0]
+		ESAC
+		storeOffset := storeOffset + 1
+		IF ((addr + storeOffset * downSize) % 64) == 0
+			BREAK
+		FI
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name="vpackstorelpd" form="m512 {k}, zmm" xed=""/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" rettype="__m512" name="_mm512_cvtpd_pslo">
+	<type>Floating Point</type>
+	<CPUID>AVX512F/KNCNI</CPUID>
+	<category>Convert</category>
+	<parameter varname="v2" type="__m512d"/>
+	<description>Performs an element-by-element conversion of packed double-precision (64-bit) floating-point elements in "v2" to single-precision (32-bit) floating-point elements and stores them in "dst". The elements are stored in the lower half of the results vector, while the remaining upper half locations are set to 0.</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*64
+	k := j*32
+	dst[k+31:k] := Float64ToFloat32(v2[i+63:i])
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name="vcvtpd2ps" form="zmm {k}, zmm" xed=""/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" rettype="__m512" name="_mm512_mask_cvtpd_pslo">
+	<type>Floating Point</type>
+	<CPUID>AVX512F/KNCNI</CPUID>
+	<category>Convert</category>
+	<parameter varname="src" type="__m512"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="v2" type="__m512d"/>
+	<description>Performs an element-by-element conversion of packed double-precision (64-bit) floating-point elements in "v2" to single-precision (32-bit) floating-point elements and stores them in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). The elements are stored in the lower half of the results vector, while the remaining upper half locations are set to 0.</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*64
+	l := j*32
+	IF k[j]
+		dst[l+31:l] := Float64ToFloat32(v2[i+63:i])
+	ELSE
+		dst[l+31:l] := src[l+31:l]
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name="vcvtpd2ps" form="zmm {k}, zmm" xed=""/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" rettype="__m512i" name="_mm512_i32logather_epi64">
+	<type>Integer</type>
+	<CPUID>AVX512F/KNCNI</CPUID>
+	<category>Load</category>
+	<parameter varname="index" type="__m512i"/>
+	<parameter varname="mv" type="void const*"/>
+	<parameter varname="scale" type="int"/>
+	<description>Loads 8 64-bit integer elements from memory starting at location "mv" at packed 32-bit integer indices stored in the lower half of "index" scaled by "scale" and stores them in "dst".</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*64
+	addr := MEM[mv + index[j] * scale]
+	dst[i+63:i] := addr[i+63:i]
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name="vpgatherdq" form="zmm {k}, m512" xed=""/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" rettype="__m512i" name="_mm512_mask_i32logather_epi64">
+	<type>Integer</type>
+	<CPUID>AVX512F/KNCNI</CPUID>
+	<category>Load</category>
+	<parameter varname="src" type="__m512i"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="index" type="__m512i"/>
+	<parameter varname="mv" type="void const*"/>
+	<parameter varname="scale" type="int"/>
+	<description>Loads 8 64-bit integer elements from memory starting at location "mv" at packed 32-bit integer indices stored in the lower half of "index" scaled by "scale" and stores them in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*64
+	IF k[j]
+		addr := MEM[mv + index[j] * scale]
+		dst[i+63:i] := addr[i+63:i]
+	ELSE
+		dst[i+63:i] := src[i+63:i]
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name="vpgatherdq" form="zmm {k}, m512" xed=""/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" rettype="__m512d" name="_mm512_i32logather_pd">
+	<type>Floating Point</type>
+	<CPUID>AVX512F/KNCNI</CPUID>
+	<category>Load</category>
+	<parameter varname="index" type="__m512i"/>
+	<parameter varname="mv" type="void const*"/>
+	<parameter varname="scale" type="int"/>
+	<description>Loads 8 double-precision (64-bit) floating-point elements stored at memory locations starting at location "mv" at packed 32-bit integer indices stored in the lower half of "index" scaled by "scale" and stores them in "dst".</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*64
+	addr := MEM[mv + index[j] * scale]
+	dst[i+63:i] := addr[i+63:i]
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name="vgatherdpd" form="zmm {k}, m512" xed=""/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" rettype="__m512d" name="_mm512_mask_i32logather_pd">
+	<type>Floating Point</type>
+	<CPUID>AVX512F/KNCNI</CPUID>
+	<category>Load</category>
+	<parameter varname="src" type="__m512d"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="index" type="__m512i"/>
+	<parameter varname="mv" type="void const*"/>
+	<parameter varname="scale" type="int"/>
+	<description>Loads 8 double-precision (64-bit) floating-point elements from memory starting at location "mv" at packed 32-bit integer indices stored in the lower half of "index" scaled by "scale" into "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*64
+	IF k[j]
+		addr := MEM[mv + index[j] * scale]
+		dst[i+63:i] := addr[i+63:i]
+	ELSE
+		dst[i+63:i] := src[i+63:i]
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name="vgatherdpd" form="zmm {k}, m512" xed=""/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" rettype="void" name="_mm512_prefetch_i32gather_ps">
+	<type>Floating Point</type>
+	<CPUID>AVX512PF/KNCNI</CPUID>
+	<category>Load</category>
+	<parameter varname="index" type="__m512i"/>
+	<parameter varname="mv" type="void const*"/>
+	<parameter varname="scale" type="int"/>
+	<parameter varname="hint" type="int"/>
+	<description>Prefetches 16 single-precision (32-bit) floating-point elements in memory starting at location "mv" at packed 32-bit integer indices stored in "index" scaled by "scale". The "hint" parameter may be 1 (_MM_HINT_T0) for prefetching to L1 cache, or 2 (_MM_HINT_T1) for prefetching to L2 cache.</description>
+	<operation>
+cachev := 0
+FOR j := 0 to 15
+	i := j*32
+	addr := MEM[mv + index[j] * scale]
+	cachev[i+31:i] := addr[i+31:i]
+ENDFOR
+	</operation>
+	<instruction name="vgatherpf0dps" form="m512 {k}" xed=""/>
+	<instruction name="vgatherpf1dps" form="m512 {k}" xed=""/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" rettype="void" name="_mm512_i32loscatter_pd">
+	<type>Floating Point</type>
+	<CPUID>AVX512F/KNCNI</CPUID>
+	<category>Store</category>
+	<parameter varname="mv" type="void*"/>
+	<parameter varname="index" type="__m512i"/>
+	<parameter varname="v1" type="__m512d"/>
+	<parameter varname="scale" type="int"/>
+	<description>Stores 8 packed double-precision (64-bit) floating-point elements in "v1" to memory locations starting at location "mv" at packed 32-bit integer indices stored in "index" scaled by "scale".</description>
+	<operation>
+FOR j := 0 to 7
+	addr := MEM[mv + index[j] * scale]
+	i := j*64
+	addr[i+63:i] := v1[i+63:i]
+ENDFOR
+	</operation>
+	<instruction name="vscatterdpd" form="m512 {k}, zmm" xed=""/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" rettype="void" name="_mm512_mask_i32loscatter_pd">
+	<type>Floating Point</type>
+	<CPUID>AVX512F/KNCNI</CPUID>
+	<category>Store</category>
+	<parameter varname="mv" type="void*"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="index" type="__m512i"/>
+	<parameter varname="v1" type="__m512d"/>
+	<parameter varname="scale" type="int"/>
+	<description>Stores 8 packed double-precision (64-bit) floating-point elements in "v1" to memory locations starting at location "mv" at packed 32-bit integer indices stored in "index" scaled by "scale". Only those elements whose corresponding mask bit is set in writemask "k" are written to memory.</description>
+	<operation>
+FOR j := 0 to 7
+	IF k[j]
+		addr := MEM[mv + index[j] * scale]
+		i := j*64
+		addr[i+63:i] := v1[i+63:i]
+	FI
+ENDFOR
+	</operation>
+	<instruction name="vscatterdpd" form="m512 {k}, zmm" xed=""/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="KNC" rettype="void" name="_mm512_i32loscatter_epi64">
+	<type>Integer</type>
+	<CPUID>KNCNI</CPUID>
+	<category>Store</category>
+	<parameter varname="mv" type="void*"/>
+	<parameter varname="index" type="__m512i"/>
+	<parameter varname="v1" type="__m512i"/>
+	<parameter varname="scale" type="int"/>
+	<description>Stores 8 packed 64-bit integer elements located in "v1" to memory locations starting at location "mv" at packed 32-bit integer indices stored in "index" scaled by "scale".</description>
+	<operation>
+FOR j := 0 to 7
+	addr := MEM[mv + index[j] * scale]
+	i := j*64
+	addr[i+63:i] := v1[i+63:i]
+ENDFOR
+	</operation>
+	<instruction name="vpscatterdq" form="m512 {k}, zmm" xed=""/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="KNC" rettype="void" name="_mm512_mask_i32loscatter_epi64">
+	<type>Integer</type>
+	<CPUID>KNCNI</CPUID>
+	<category>Store</category>
+	<parameter varname="mv" type="void*"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="index" type="__m512i"/>
+	<parameter varname="v1" type="__m512i"/>
+	<parameter varname="scale" type="int"/>
+	<description>Stores 8 packed 64-bit integer elements located in "v1" to memory locations starting at location "mv" at packed 32-bit integer indices stored in "index" scaled by "scale" using writemask "k" (elements whose corresponding mask bit is not set are not written to memory).</description>
+	<operation>
+FOR j := 0 to 7
+	IF k[j]
+		addr := MEM[mv + index[j] * scale]
+		addr[i+63:i] := v1[i+63:i]
+	FI
+ENDFOR
+	</operation>
+	<instruction name="vpscatterdq" form="m512 {k}, zmm" xed=""/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" rettype="void" name="_mm512_prefetch_i32scatter_ps">
+	<type>Floating Point</type>
+	<CPUID>AVX512PF/KNCNI</CPUID>
+	<category>Store</category>
+	<parameter varname="mv" type="void*"/>
+	<parameter varname="index" type="__m512i"/>
+	<parameter varname="scale" type="int"/>
+	<parameter varname="hint" type="int"/>
+	<description>Prefetches 16 single-precision (32-bit) floating-point elements in memory starting at location "mv" at packed 32-bit integer indices stored in "index" scaled by "scale". The "hint" parameter may be 1 (_MM_HINT_T0) for prefetching to L1 cache, or 2 (_MM_HINT_T1) for prefetching to L2 cache.</description>
+	<operation>
+FOR j := 0 to 15
+	i := j*32
+	addr := MEM[mv + index[j] * scale]
+	CASE hint OF
+	_MM_HINT_T0: PrefetchL1WithT0Hint(addr[i+31:i])
+	_MM_HINT_T1: PrefetchL2WithT1Hint(addr[i+31:i])
+	_MM_HINT_T2: PrefetchL2WithT1HintNonTemporal(addr[i+31:i])
+	_MM_HINT_NTA: PrefetchL1WithT0HintNonTemporal(addr[i+31:i])
+	ESAC
+ENDFOR
+	</operation>
+	<instruction name="vscatterpf0dps" form="m512 {k}" xed=""/>
+	<instruction name="vscatterpf1dps" form="m512 {k}" xed=""/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" rettype="void" name="_mm512_mask_prefetch_i32scatter_ps">
+	<type>Floating Point</type>
+	<CPUID>AVX512PF/KNCNI</CPUID>
+	<category>Store</category>
+	<parameter varname="mv" type="void*"/>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="index" type="__m512i"/>
+	<parameter varname="scale" type="int"/>
+	<parameter varname="hint" type="int"/>
+	<description>Prefetches 16 single-precision (32-bit) floating-point elements in memory starting at location "mv" at packed 32-bit integer indices stored in "index" scaled by "scale". The "hint" parameter may be 1 (_MM_HINT_T0) for prefetching to L1 cache, or 2 (_MM_HINT_T1) for prefetching to L2 cache. Only those elements whose corresponding mask bit in "k" is set are loaded into the desired cache.</description>
+	<operation>
+FOR j := 0 to 15
+	i := j*32
+	IF k[j]
+		addr := MEM[mv + index[j] * scale]
+		CASE hint OF
+		_MM_HINT_T0: PrefetchL1WithT0Hint(addr[i+31:i])
+		_MM_HINT_T1: PrefetchL2WithT1Hint(addr[i+31:i])
+		_MM_HINT_T2: PrefetchL2WithT1HintNonTemporal(addr[i+31:i])
+		_MM_HINT_NTA: PrefetchL1WithT0HintNonTemporal(addr[i+31:i])
+	FI
+ENDFOR
+	</operation>
+	<instruction name="vscatterpf0dps" form="m512 {k}" xed=""/>
+	<instruction name="vscatterpf1dps" form="m512 {k}" xed=""/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="KNC" rettype="__m512i" name="_mm512_loadunpackhi_epi32">
+	<type>Integer</type>
+	<CPUID>KNCNI</CPUID>
+	<category>Load</category>
+	<parameter varname="src" type="__m512i"/>
+	<parameter varname="mt" type="void const*"/>
+	<description>Loads the high-64-byte-aligned portion of the byte/word/doubleword stream starting at element-aligned address mt-64 and expands them into packed 32-bit integers in "dst". The initial values of "dst" are copied from "src". Only those converted doublewords that occur at or after the first 64-byte-aligned address following (mt-64) are loaded. Elements in the resulting vector that do not map to those doublewords are taken from "src".</description>
+	<operation>
+dst[511:0] := src[511:0]
+loadOffset := 0
+foundNext64BytesBoundary := false
+addr = mt-64
+FOR j := 0 to 15
+	IF foundNext64BytesBoundary == false
+		IF ((addr + (loadOffset + 1)*4) % 64) == 0
+			foundNext64BytesBoundary := true
+		FI
+	ELSE
+		i := j*32
+		tmp := MEM[addr + loadOffset*4]
+		dst[i+31:i] := tmp[i+31:i]
+	FI
+	loadOffset := loadOffset + 1
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name="vloadunpackhd" form="zmm {k}, m512" xed=""/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="KNC" rettype="__m512i" name="_mm512_mask_loadunpackhi_epi32">
+	<type>Integer</type>
+	<CPUID>KNCNI</CPUID>
+	<category>Load</category>
+	<parameter varname="src" type="__m512i"/>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="mt" type="void const *"/>
+	<description>Loads the high-64-byte-aligned portion of the byte/word/doubleword stream starting at element-aligned address mt-64 and expands them into packed 32-bit integers in "dst". The initial values of "dst" are copied from "src". Only those converted doublewords that occur at or after the first 64-byte-aligned address following (mt-64) are loaded. Elements in the resulting vector that do not map to those doublewords are taken from "src". Elements are loaded from memory according to element selector "k" (elements are skipped when the corresponding mask bit is not set).</description>
+	<operation>
+dst[511:0] := src[511:0]
+loadOffset := 0
+foundNext64BytesBoundary := false
+addr = mt-64
+FOR j := 0 to 15
+	IF k[j]
+		IF foundNext64BytesBoundary == false
+			IF ((addr + (loadOffset + 1)*4) % 64) == 0
+				foundNext64BytesBoundary := true
+			FI
+		ELSE
+			i := j*32
+			tmp := MEM[addr + loadOffset*4]
+			dst[i+31:i] := tmp[i+31:i]
+		FI
+		loadOffset := loadOffset + 1
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name="vloadunpackhd" form="zmm {k}, m512" xed=""/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="KNC" rettype="__m512i" name="_mm512_loadunpacklo_epi32">
+	<type>Integer</type>
+	<CPUID>KNCNI</CPUID>
+	<category>Load</category>
+	<parameter varname="src" type="__m512i"/>
+	<parameter varname="mt" type="void const*"/>
+	<description>Loads the low-64-byte-aligned portion of the byte/word/doubleword stream starting at element-aligned address mt and expands them into packed 32-bit integers in "dst". The initial values of "dst" are copied from "src". Only those converted doublewords that occur before the first 64-byte-aligned address following "mt" are loaded. Elements in the resulting vector that do not map to those doublewords are taken from "src".</description>
+	<operation>
+dst[511:0] := src[511:0]
+loadOffset := 0
+addr = mt
+FOR j := 0 to 15
+	i := j*32
+	tmp := MEM[addr + loadOffset*4]
+	dst[i+31:i] := tmp[i+31:i]
+	loadOffset := loadOffset + 1
+	IF (mt + loadOffset * 4) % 64 == 0
+		break
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name="vloadunpackld" form="zmm {k}, m512" xed=""/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="KNC" rettype="__m512i" name="_mm512_mask_loadunpacklo_epi32">
+	<type>Integer</type>
+	<CPUID>KNCNI</CPUID>
+	<category>Load</category>
+	<parameter varname="src" type="__m512i"/>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="mt" type="void const*"/>
+	<description>Loads the low-64-byte-aligned portion of the byte/word/doubleword stream starting at element-aligned address mt and expands them into packed 32-bit integers in "dst". The initial values of "dst" are copied from "src". Only those converted doublewords that occur before the first 64-byte-aligned address following "mt" are loaded. Elements in the resulting vector that do not map to those doublewords are taken from "src". Elements are loaded from memory according to element selector "k" (elements are skipped when the corresponding mask bit is not set).</description>
+	<operation>
+dst[511:0] := src[511:0]
+loadOffset := 0
+addr = mt
+FOR j := 0 to 15
+	i := j*32
+	IF k[j]
+		tmp := MEM[addr + loadOffset*4]
+		dst[i+31:i] := tmp[i+31:i]
+		loadOffset := loadOffset + 1
+		IF (mt + loadOffset * 4) % 64 == 0
+			break
+		FI
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name="vloadunpackld" form="zmm {k}, m512" xed=""/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="KNC" rettype="__m512i" name="_mm512_loadunpackhi_epi64">
+	<type>Integer</type>
+	<CPUID>KNCNI</CPUID>
+	<category>Load</category>
+	<parameter varname="src" type="__m512i"/>
+	<parameter varname="mt" type="void const*"/>
+	<description>Loads the high-64-byte-aligned portion of the quadword stream starting at element-aligned address mt-64 and expands them into packed 64-bit integers in "dst". The initial values of "dst" are copied from "src". Only those converted quadwords that occur at or after the first 64-byte-aligned address following (mt-64) are loaded. Elements in the resulting vector that do not map to those quadwords are taken from "src".</description>
+	<operation>
+dst[511:0] := src[511:0]
+loadOffset := 0
+foundNext64BytesBoundary := false
+addr = mt-64
+FOR j := 0 to 7
+	IF foundNext64BytesBoundary == false
+		IF ((addr + (loadOffset + 1)*8) % 64) == 0
+			foundNext64BytesBoundary := true
+		FI
+	ELSE
+		i := j*64
+		tmp := MEM[addr + loadOffset*8]
+		dst[i+63:i] := tmp[i+63:i]
+	FI
+	loadOffset := loadOffset + 1
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name="vloadunpackhq" form="zmm {k}, m512" xed=""/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="KNC" rettype="__m512i" name="_mm512_mask_loadunpackhi_epi64">
+	<type>Integer</type>
+	<CPUID>KNCNI</CPUID>
+	<category>Load</category>
+	<parameter varname="src" type="__m512i"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="mt" type="void const*"/>
+	<description>Loads the high-64-byte-aligned portion of the quadword stream starting at element-aligned address mt-64 and expands them into packed 64-bit integers in "dst". The initial values of "dst" are copied from "src". Only those converted quadwords that occur at or after the first 64-byte-aligned address following (mt-64) are loaded. Elements in the resulting vector that do not map to those quadwords are taken from "src". Elements are loaded from memory according to element selector "k" (elements are skipped when the corresponding mask bit is not set).</description>
+	<operation>
+dst[511:0] := src[511:0]
+loadOffset := 0
+foundNext64BytesBoundary := false
+addr = mt-64
+FOR j := 0 to 7
+	IF k[j]
+		IF foundNext64BytesBoundary == false
+			IF ((addr + (loadOffset + 1)*8) % 64) == 0
+				foundNext64BytesBoundary := true
+			FI
+		ELSE
+			i := j*64
+			tmp := MEM[addr + loadOffset*8]
+			dst[i+63:i] := tmp[i+63:i]
+		FI
+		loadOffset := loadOffset + 1
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name="vloadunpackhq" form="zmm {k}, m512" xed=""/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="KNC" rettype="__m512i" name="_mm512_loadunpacklo_epi64">
+	<type>Integer</type>
+	<CPUID>KNCNI</CPUID>
+	<category>Load</category>
+	<parameter varname="src" type="__m512i"/>
+	<parameter varname="mt" type="void const*"/>
+	<description>Loads the low-64-byte-aligned portion of the quadword stream starting at element-aligned address mt and expands them into packed 64-bit integers in "dst". The initial values of "dst" are copied from "src". Only those converted quadwords that occur before the first 64-byte-aligned address following "mt" are loaded. Elements in the resulting vector that do not map to those quadwords are taken from "src".</description>
+	<operation>
+dst[511:0] := src[511:0]
+loadOffset := 0
+addr = mt
+FOR j := 0 to 7
+	i := j*64
+	tmp := MEM[addr + loadOffset*8]
+	dst[i+63:i] := tmp[i+63:i]
+	loadOffset := loadOffset + 1
+	IF ((addr + loadOffset*8) % 64) == 0
+		break
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name="vloadunpacklq" form="zmm {k}, m512" xed=""/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="KNC" rettype="__m512i" name="_mm512_mask_loadunpacklo_epi64">
+	<type>Integer</type>
+	<CPUID>KNCNI</CPUID>
+	<category>Load</category>
+	<parameter varname="src" type="__m512i"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="mt" type="void const*"/>
+	<description>Loads the low-64-byte-aligned portion of the quadword stream starting at element-aligned address mt and expands them into packed 64-bit integers in "dst". The initial values of "dst" are copied from "src". Only those converted quadwords that occur before the first 64-byte-aligned address following "mt" are loaded. Elements in the resulting vector that do not map to those quadwords are taken from "src". Elements are loaded from memory according to element selector "k" (elements are skipped when the corresponding mask bit is not set).</description>
+	<operation>
+dst[511:0] := src[511:0]
+loadOffset := 0
+addr = mt
+FOR j := 0 to 7
+	i := j*64
+	IF k[j]
+		tmp := MEM[addr + loadOffset*8]
+		dst[i+63:i] := tmp[i+63:i]
+		loadOffset := loadOffset + 1
+		IF ((addr + loadOffset*8) % 64) == 0
+			break
+		FI
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name="vloadunpacklq" form="zmm {k}, m512" xed=""/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="KNC" rettype="__m512" name="_mm512_loadunpackhi_ps">
+	<type>Floating Point</type>
+	<CPUID>KNCNI</CPUID>
+	<category>Load</category>
+	<parameter varname="src" type="__m512"/>
+	<parameter varname="mt" type="void const*"/>
+	<description>Loads the high-64-byte-aligned portion of the byte/word/doubleword stream starting at element-aligned address mt-64 and expands them into packed single-precision (32-bit) floating-point elements in "dst". The initial values of "dst" are copied from "src". Only those converted doublewords that occur at or after the first 64-byte-aligned address following (mt-64) are loaded. Elements in the resulting vector that do not map to those doublewords are taken from "src".</description>
+	<operation>
+dst[511:0] := src[511:0]
+loadOffset := 0
+foundNext64BytesBoundary := false
+addr = mt-64
+FOR j := 0 to 15
+	IF foundNext64BytesBoundary == false
+		IF ((addr + (loadOffset + 1)*4) % 64) == 0
+			foundNext64BytesBoundary := true
+		FI
+	ELSE
+		i := j*32
+		tmp := MEM[addr + loadOffset*4]
+		dst[i+31:i] := tmp[i+31:i]
+	FI
+	loadOffset := loadOffset + 1
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name="vloadunpackhps" form="zmm {k}, m512" xed=""/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="KNC" rettype="__m512" name="_mm512_mask_loadunpackhi_ps">
+	<type>Floating Point</type>
+	<CPUID>KNCNI</CPUID>
+	<category>Load</category>
+	<parameter varname="src" type="__m512"/>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="mt" type="void const*"/>
+	<description>Loads the high-64-byte-aligned portion of the doubleword stream starting at element-aligned address mt-64 and expands them into packed single-precision (32-bit) floating-point elements in "dst". The initial values of "dst" are copied from "src". Only those converted doublewords that occur at or after the first 64-byte-aligned address following (mt-64) are loaded. Elements in the resulting vector that do not map to those doublewords are taken from "src". Elements are loaded from memory according to element selector "k" (elements are skipped when the corresponding mask bit is not set).</description>
+	<operation>
+dst[511:0] := src[511:0]
+loadOffset := 0
+foundNext64BytesBoundary := false
+addr = mt-64
+FOR j := 0 to 15
+	IF k[j]
+		IF foundNext64BytesBoundary == false
+			IF ((addr + (loadOffset + 1)*4) % 64) == 0
+				foundNext64BytesBoundary := true
+			FI
+		ELSE
+			i := j*32
+			tmp := MEM[addr + loadOffset*4]
+			dst[i+31:i] := tmp[i+31:i]
+		FI
+		loadOffset := loadOffset + 1
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name="vloadunpackhps" form="zmm {k}, m512" xed=""/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="KNC" rettype="__m512" name="_mm512_loadunpacklo_ps">
+	<type>Floating Point</type>
+	<CPUID>KNCNI</CPUID>
+	<category>Load</category>
+	<parameter varname="src" type="__m512"/>
+	<parameter varname="mt" type="void const*"/>
+	<description>Loads the low-64-byte-aligned portion of the doubleword stream starting at element-aligned address mt and expands them into packed single-precision (32-bit) floating-point elements in "dst". The initial values of "dst" are copied from "src". Only those converted doublewords that occur before the first 64-byte-aligned address following "mt" are loaded. Elements in the resulting vector that do not map to those doublewords are taken from "src".</description>
+	<operation>
+dst[511:0] := src[511:0]
+loadOffset := 0
+addr = mt
+FOR j := 0 to 15
+	i := j*32
+	tmp := MEM[addr + loadOffset*4]
+	dst[i+31:i] := tmp[i+31:i]
+	loadOffset := loadOffset + 1
+	IF (mt + loadOffset * 4) % 64 == 0
+		BREAK
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name="vloadunpacklps" form="zmm {k}, m512" xed=""/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="KNC" rettype="__m512" name="_mm512_mask_loadunpacklo_ps">
+	<type>Floating Point</type>
+	<CPUID>KNCNI</CPUID>
+	<category>Load</category>
+	<parameter varname="src" type="__m512"/>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="mt" type="void const*"/>
+	<description>Loads the low-64-byte-aligned portion of the doubleword stream starting at element-aligned address mt and expands them into packed single-precision (32-bit) floating-point elements in "dst". The initial values of "dst" are copied from "src". Only those converted doublewords that occur before the first 64-byte-aligned address following "mt" are loaded. Elements in the resulting vector that do not map to those doublewords are taken from "src". Elements are loaded from memory according to element selector "k" (elements are skipped when the corresponding mask bit is not set).</description>
+	<operation>
+dst[511:0] := src[511:0]
+loadOffset := 0
+addr = mt
+FOR j := 0 to 15
+	i := j*32
+	IF k[j]
+		tmp := MEM[addr + loadOffset*4]
+		dst[i+31:i] := tmp[i+31:i]
+		loadOffset := loadOffset + 1
+		IF (mt + loadOffset * 4) % 64 == 0
+			break
+		FI
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name="vloadunpacklps" form="zmm {k}, m512" xed=""/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="KNC" rettype="__m512d" name="_mm512_loadunpackhi_pd">
+	<type>Floating Point</type>
+	<CPUID>KNCNI</CPUID>
+	<category>Load</category>
+	<parameter varname="src" type="__m512d"/>
+	<parameter varname="mt" type="void const*"/>
+	<description>Loads the high-64-byte-aligned portion of the quadword stream starting at element-aligned address mt-64 and expands them into packed double-precision (64-bit) floating-point values in "dst". The initial values of "dst" are copied from "src". Only those converted quadwords that occur at or after the first 64-byte-aligned address following (mt-64) are loaded. Elements in the resulting vector that do not map to those quadwords are taken from "src".</description>
+	<operation>
+dst[511:0] := src[511:0]
+loadOffset := 0
+foundNext64BytesBoundary := false
+addr = mt-64
+FOR j := 0 to 7
+	IF foundNext64BytesBoundary == false
+		IF (addr + (loadOffset + 1)*8) % 64 == 0
+			foundNext64BytesBoundary := true
+		FI
+	ELSE
+		i := j*64
+		tmp := MEM[addr + loadOffset*8]
+		dst[i+63:i] := tmp[i+63:i]
+	FI
+	loadOffset := loadOffset + 1
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name="vloadunpackhpd" form="zmm {k}, m512" xed=""/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="KNC" rettype="__m512d" name="_mm512_mask_loadunpackhi_pd">
+	<type>Floating Point</type>
+	<CPUID>KNCNI</CPUID>
+	<category>Load</category>
+	<parameter varname="src" type="__m512d"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="mt" type="void const*"/>
+	<description>Loads the high-64-byte-aligned portion of the quadword stream starting at element-aligned address mt-64 and expands them into packed double-precision (64-bit) floating-point values in "dst". The initial values of "dst" are copied from "src". Only those converted quadwords that occur at or after the first 64-byte-aligned address following (mt-64) are loaded. Elements in the resulting vector that do not map to those quadwords are taken from "src". Elements are loaded from memory according to element selector "k" (elements are skipped when the corresponding mask bit is not set).</description>
+	<operation>
+dst[511:0] := src[511:0]
+loadOffset := 0
+foundNext64BytesBoundary := false
+addr = mt-64
+FOR j := 0 to 7
+	IF k[j]
+		IF foundNext64BytesBoundary == false
+			IF (addr + (loadOffset + 1)*8) % 64 == 0
+				foundNext64BytesBoundary := true
+			FI
+		ELSE
+			i := j*64
+			tmp := MEM[addr + loadOffset*8]
+			dst[i+63:i] := tmp[i+63:i]
+		FI
+		loadOffset := loadOffset + 1
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name="vloadunpackhpd" form="zmm {k}, m512" xed=""/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="KNC" rettype="__m512d" name="_mm512_loadunpacklo_pd">
+	<type>Floating Point</type>
+	<CPUID>KNCNI</CPUID>
+	<category>Load</category>
+	<parameter varname="src" type="__m512d"/>
+	<parameter varname="mt" type="void const*"/>
+	<description>Loads the low-64-byte-aligned portion of the quadword stream starting at element-aligned address mt and expands them into packed double-precision (64-bit) floating-point elements in "dst". The initial values of "dst" are copied from "src". Only those converted quadwords that occur before the first 64-byte-aligned address following "mt" are loaded. Elements in the resulting vector that do not map to those quadwords are taken from "src".</description>
+	<operation>
+dst[511:0] := src[511:0]
+loadOffset := 0
+addr = mt
+FOR j := 0 to 7
+	i := j*64
+	tmp := MEM[addr + loadOffset*8]
+	dst[i+63:i] := tmp[i+63:i]
+	loadOffset := loadOffset + 1
+	IF ((addr + 8*loadOffset) % 64) == 0
+		BREAK
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name="vloadunpacklpd" form="zmm {k}, m512" xed=""/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="KNC" rettype="__m512d" name="_mm512_mask_loadunpacklo_pd">
+	<type>Floating Point</type>
+	<CPUID>KNCNI</CPUID>
+	<category>Load</category>
+	<parameter varname="src" type="__m512d"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="mt" type="void const*"/>
+	<description>Loads the low-64-byte-aligned portion of the quadword stream starting at element-aligned address mt and expands them into packed double-precision (64-bit) floating-point values in "dst". The initial values of "dst" are copied from "src". Only those converted quadwords that occur before the first 64-byte-aligned address following "mt" are loaded. Elements in the resulting vector that do not map to those quadwords are taken from "src". Elements are loaded from memory according to element selector "k" (elements are skipped when the corresponding mask bit is not set).</description>
+	<operation>
+dst[511:0] := src[511:0]
+loadOffset := 0
+addr = mt
+FOR j := 0 to 7
+	i := j*64
+	IF k[j]
+		tmp := MEM[addr + loadOffset*8]
+		dst[i+63:i] := tmp[i+63:i]
+		loadOffset := loadOffset + 1
+		IF ((addr + 8*loadOffset) % 64) == 0
+			BREAK
+		FI
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name="vloadunpacklpd" form="zmm {k}, m512" xed=""/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="KNC" rettype="void" name="_mm512_packstorehi_epi32">
+	<type>Integer</type>
+	<CPUID>KNCNI</CPUID>
+	<category>Store</category>
+	<parameter varname="mt" type="void*"/>
+	<parameter varname="v1" type="__m512i"/>
+	<description>Stores packed 32-bit integer elements of "v1" into a doubleword stream at a logically mapped starting address (mt-64), storing the high-64-byte elements of that stream (those elements of the stream that map at or after the first 64-byte-aligned address following (mt-64)).</description>
+	<operation>
+storeOffset := 0
+foundNext64BytesBoundary := false
+addr = mt-64
+FOR j := 0 to 15
+	IF foundNext64BytesBoundary == false
+		IF ((addr + (storeOffset + 1)*4) % 64) == 0
+			foundNext64BytesBoundary = true
+		FI
+	ELSE
+		i := j*32
+		MEM[addr + storeOffset*4] := v1[i+31:i]
+	FI
+	storeOffset := storeOffset + 1
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name="vpackstorehd" form="m512 {k}, zmm" xed=""/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="KNC" rettype="void" name="_mm512_mask_packstorehi_epi32">
+	<type>Integer</type>
+	<CPUID>KNCNI</CPUID>
+	<category>Store</category>
+	<parameter varname="mt" type="void*"/>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="v1" type="__m512i"/>
+	<description>Stores packed 32-bit integer elements of "v1" into a doubleword stream at a logically mapped starting address (mt-64), storing the high-64-byte elements of that stream (those elements of the stream that map at or after the first 64-byte-aligned address following (mt-64)). Elements are stored to memory according to element selector "k" (elements are skipped when the corresponding mask bit is not set).</description>
+	<operation>
+storeOffset := 0
+foundNext64BytesBoundary := false
+addr = mt-64
+FOR j := 0 to 15
+	IF k[j]
+		IF foundNext64BytesBoundary == false
+			IF ((addr + (storeOffset + 1)*4) % 64) == 0
+				foundNext64BytesBoundary = true
+			FI
+		ELSE
+			i := j*32
+			MEM[addr + storeOffset*4] := v1[i+31:i]
+		FI
+		storeOffset := storeOffset + 1
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name="vpackstorehd" form="m512 {k}, zmm" xed=""/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="KNC" rettype="void" name="_mm512_packstorelo_epi32">
+	<type>Integer</type>
+	<CPUID>KNCNI</CPUID>
+	<category>Store</category>
+	<parameter varname="mt" type="void*"/>
+	<parameter varname="v1" type="__m512i"/>
+	<description>Stores packed 32-bit integer elements of "v1" into a doubleword stream at a logically mapped starting address "mt", storing the low-64-byte elements of that stream (those elements of the stream that map before the first 64-byte-aligned address following "mt").</description>
+	<operation>
+storeOffset := 0
+addr = mt
+FOR j := 0 to 15
+	i := j*32
+	MEM[addr + storeOffset*4] := v1[i+31:i]
+	storeOffset := storeOffset + 1
+	IF ((addr + storeOffset*4) % 64) == 0
+		BREAK
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name="vpackstoreld" form="m512 {k}, zmm" xed=""/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="KNC" rettype="void" name="_mm512_mask_packstorelo_epi32">
+	<type>Integer</type>
+	<CPUID>KNCNI</CPUID>
+	<category>Store</category>
+	<parameter varname="mt" type="void*"/>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="v1" type="__m512i"/>
+	<description>Stores packed 32-bit integer elements of "v1" into a doubleword stream at a logically mapped starting address "mt", storing the low-64-byte elements of that stream (those elements of the stream that map before the first 64-byte-aligned address following "mt"). Elements are stored to memory according to element selector "k" (elements are skipped when the corresponding mask bit is not set).</description>
+	<operation>
+storeOffset := 0
+addr = mt
+FOR j := 0 to 15
+	IF k[j]
+		i := j*32
+		MEM[addr + storeOffset*4] := v1[i+31:i]
+		storeOffset := storeOffset + 1
+		IF ((addr + storeOffset*4) % 64) == 0
+			BREAK
+		FI
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name="vpackstoreld" form="m512 {k}, zmm" xed=""/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="KNC" rettype="void" name="_mm512_packstorehi_epi64">
+	<type>Integer</type>
+	<CPUID>KNCNI</CPUID>
+	<category>Store</category>
+	<parameter varname="mt" type="void*"/>
+	<parameter varname="v1" type="__m512i"/>
+	<description>Stores packed 64-bit integer elements of "v1" into a quadword stream at a logically mapped starting address (mt-64), storing the high-64-byte elements of that stream (those elements of the stream that map at or after the first 64-byte-aligned address following (mt-64)).</description>
+	<operation>
+storeOffset := 0
+foundNext64BytesBoundary := false
+addr = mt-64
+FOR j := 0 to 7
+	IF foundNext64BytesBoundary == false
+		IF ((addr + (storeOffset + 1)*8) % 64) == 0
+			foundNext64BytesBoundary = true
+		FI
+	ELSE
+		i := j*64
+		MEM[addr + storeOffset*8] := v1[i+63:i]
+	FI
+	storeOffset := storeOffset + 1
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name="vpackstorehq" form="m512 {k}, zmm" xed=""/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="KNC" rettype="void" name="_mm512_mask_packstorehi_epi64">
+	<type>Integer</type>
+	<CPUID>KNCNI</CPUID>
+	<category>Store</category>
+	<parameter varname="mt" type="void*"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="v1" type="__m512i"/>
+	<description>Stores packed 64-bit integer elements of "v1" into a quadword stream at a logically mapped starting address (mt-64), storing the high-64-byte elements of that stream (those elements of the stream that map at or after the first 64-byte-aligned address following (mt-64)). Elements are stored to memory according to element selector "k" (elements are skipped when the corresponding mask bit is not set).</description>
+	<operation>
+storeOffset := 0
+foundNext64BytesBoundary := false
+addr = mt-64
+FOR j := 0 to 7
+	IF k[j]
+		IF foundNext64BytesBoundary == false
+			IF ((addr + (storeOffset + 1)*8) % 64) == 0
+				foundNext64BytesBoundary = true
+			FI
+		ELSE
+			i := j*64
+			MEM[addr + storeOffset*8] := v1[i+63:i]
+		FI
+		storeOffset := storeOffset + 1
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name="vpackstorehq" form="m512 {k}, zmm" xed=""/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="KNC" rettype="void" name="_mm512_packstorelo_epi64">
+	<type>Integer</type>
+	<CPUID>KNCNI</CPUID>
+	<category>Store</category>
+	<parameter varname="mt" type="void*"/>
+	<parameter varname="v1" type="__m512i"/>
+	<description>Stores packed 64-bit integer elements of "v1" into a quadword stream at a logically mapped starting address "mt", storing the low-64-byte elements of that stream (those elements of the stream that map before the first 64-byte-aligned address following "mt").</description>
+	<operation>
+storeOffset := 0
+addr = mt
+FOR j := 0 to 7
+	i := j*64
+	MEM[addr + storeOffset*8] := v1[i+63:i]
+	storeOffset := storeOffset + 1
+	IF ((addr + storeOffset*8) % 64) == 0
+		BREAK
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name="vpackstorelq" form="m512 {k}, zmm" xed=""/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="KNC" rettype="void" name="_mm512_mask_packstorelo_epi64">
+	<type>Integer</type>
+	<CPUID>KNCNI</CPUID>
+	<category>Store</category>
+	<parameter varname="mt" type="void*"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="v1" type="__m512i"/>
+	<description>Stores packed 64-bit integer elements of "v1" into a quadword stream at a logically mapped starting address "mt", storing the low-64-byte elements of that stream (those elements of the stream that map before the first 64-byte-aligned address following "mt"). Elements are stored to memory according to element selector "k" (elements are skipped when the corresponding mask bit is not set).</description>
+	<operation>
+storeOffset := 0
+addr = mt
+FOR j := 0 to 7
+	IF k[j]
+		i := j*64
+		MEM[addr + storeOffset*8] := v1[i+63:i]
+		storeOffset := storeOffset + 1
+		IF ((addr + storeOffset*8) % 64) == 0
+			BREAK
+		FI
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name="vpackstorelq" form="m512 {k}, zmm" xed=""/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="KNC" rettype="void" name="_mm512_packstorehi_ps">
+	<type>Floating Point</type>
+	<CPUID>KNCNI</CPUID>
+	<category>Store</category>
+	<parameter varname="mt" type="void*"/>
+	<parameter varname="v1" type="__m512"/>
+	<description>Stores packed single-precision (32-bit) floating-point elements of "v1" into a doubleword stream at a logically mapped starting address (mt-64), storing the high-64-byte elements of that stream (those elements of the stream that map at or after the first 64-byte-aligned address following (mt-64)).</description>
+	<operation>
+storeOffset := 0
+foundNext64BytesBoundary := false
+addr = mt-64
+FOR j := 0 to 15
+	IF foundNext64BytesBoundary == false
+		IF ((addr + (storeOffset + 1)*4) % 64) == 0
+			foundNext64BytesBoundary = true
+		FI
+	ELSE
+		i := j*32
+		MEM[addr + storeOffset*4] := v1[i+31:i]
+	FI
+	storeOffset := storeOffset + 1
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name="vpackstorehps" form="m512 {k}, zmm" xed=""/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="KNC" rettype="void" name="_mm512_mask_packstorehi_ps">
+	<type>Floating Point</type>
+	<CPUID>KNCNI</CPUID>
+	<category>Store</category>
+	<parameter varname="mt" type="void*"/>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="v1" type="__m512"/>
+	<description>Stores packed single-precision (32-bit) floating-point elements of "v1" into a doubleword stream at a logically mapped starting address (mt-64), storing the high-64-byte elements of that stream (those elements of the stream that map at or after the first 64-byte-aligned address following (mt-64)). Elements are stored to memory according to element selector "k" (elements are skipped when the corresponding mask bit is not set).</description>
+	<operation>
+storeOffset := 0
+foundNext64BytesBoundary := false
+addr = mt-64
+FOR j := 0 to 15
+	IF k[j]
+		IF foundNext64BytesBoundary == false
+			IF ((addr + (storeOffset + 1)*4) % 64) == 0
+				foundNext64BytesBoundary = true
+			FI
+		ELSE
+			i := j*32
+			MEM[addr + storeOffset*4] := v1[i+31:i]
+		FI
+		storeOffset := storeOffset + 1
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name="vpackstorehps" form="m512 {k}, zmm" xed=""/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="KNC" rettype="void" name="_mm512_packstorelo_ps">
+	<type>Floating Point</type>
+	<CPUID>KNCNI</CPUID>
+	<category>Store</category>
+	<parameter varname="mt" type="void*"/>
+	<parameter varname="v1" type="__m512"/>
+	<description>Stores packed single-precision (32-bit) floating-point elements of "v1" into a doubleword stream at a logically mapped starting address "mt", storing the low-64-byte elements of that stream (those elements of the stream that map before the first 64-byte-aligned address following "mt").</description>
+	<operation>
+storeOffset := 0
+addr = mt
+FOR j := 0 to 15
+	i := j*32
+	MEM[addr + storeOffset*4] := v1[i+31:i]
+	storeOffset := storeOffset + 1
+	IF ((addr + storeOffset*4) % 64) == 0
+		BREAK
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name="vpackstorelps" form="m512 {k}, zmm" xed=""/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="KNC" rettype="void" name="_mm512_mask_packstorelo_ps">
+	<type>Floating Point</type>
+	<CPUID>KNCNI</CPUID>
+	<category>Store</category>
+	<parameter varname="mt" type="void*"/>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="v1" type="__m512"/>
+	<description>Stores packed single-precision (32-bit) floating-point elements of "v1" into a doubleword stream at a logically mapped starting address "mt", storing the low-64-byte elements of that stream (those elements of the stream that map before the first 64-byte-aligned address following "mt"). Elements are stored to memory according to element selector "k" (elements are skipped when the corresponding mask bit is not set).</description>
+	<operation>
+storeOffset := 0
+addr = mt
+FOR j := 0 to 15
+	IF k[j]
+		i := j*32
+		MEM[addr + storeOffset*4] := v1[i+31:i]
+		storeOffset := storeOffset + 1
+		IF ((addr + storeOffset*4) % 64) == 0
+			BREAK
+		FI
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name="vpackstorelps" form="m512 {k}, zmm" xed=""/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="KNC" rettype="void" name="_mm512_packstorehi_pd">
+	<type>Floating Point</type>
+	<CPUID>KNCNI</CPUID>
+	<category>Store</category>
+	<parameter varname="mt" type="void*"/>
+	<parameter varname="v1" type="__m512d"/>
+	<description>Stores packed double-precision (64-bit) floating-point elements of "v1" into a quadword stream at a logically mapped starting address (mt-64), storing the high-64-byte elements of that stream (those elements of the stream that map at or after the first 64-byte-aligned address following (mt-64)).</description>
+	<operation>
+storeOffset := 0
+foundNext64BytesBoundary := false
+addr = mt-64
+FOR j := 0 to 7
+	IF foundNext64BytesBoundary == false
+		IF ((addr + (storeOffset + 1)*8) % 64) == 0
+			foundNext64BytesBoundary = true
+		FI
+	ELSE
+		i := j*64
+		MEM[addr + storeOffset*8] := v1[i+63:i]
+	FI
+	storeOffset := storeOffset + 1
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name="vpackstorehpd" form="m512 {k}, zmm" xed=""/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="KNC" rettype="void" name="_mm512_mask_packstorehi_pd">
+	<type>Floating Point</type>
+	<CPUID>KNCNI</CPUID>
+	<category>Store</category>
+	<parameter varname="mt" type="void*"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="v1" type="__m512d"/>
+	<description>Stores packed double-precision (64-bit) floating-point elements of "v1" into a quadword stream at a logically mapped starting address (mt-64), storing the high-64-byte elements of that stream (those elements of the stream that map at or after the first 64-byte-aligned address following (mt-64)). Elements are stored to memory according to element selector "k" (elements are skipped when the corresponding mask bit is not set).</description>
+	<operation>
+storeOffset := 0
+foundNext64BytesBoundary := false
+addr = mt-64
+FOR j := 0 to 7
+	IF k[j]
+		IF foundNext64BytesBoundary == false
+			IF ((addr + (storeOffset + 1)*8) % 64) == 0
+				foundNext64BytesBoundary = true
+			FI
+		ELSE
+			i := j*64
+			MEM[addr + storeOffset*8] := v1[i+63:i]
+		FI
+		storeOffset := storeOffset + 1
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name="vpackstorehpd" form="m512 {k}, zmm" xed=""/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="KNC" rettype="void" name="_mm512_packstorelo_pd">
+	<type>Floating Point</type>
+	<CPUID>KNCNI</CPUID>
+	<category>Store</category>
+	<parameter varname="mt" type="void*"/>
+	<parameter varname="v1" type="__m512d"/>
+	<description>Stores packed double-precision (64-bit) floating-point elements of "v1" into a quadword stream at a logically mapped starting address "mt", storing the low-64-byte elements of that stream (those elements of the stream that map before the first 64-byte-aligned address following "mt").</description>
+	<operation>
+storeOffset := 0
+addr = mt
+FOR j := 0 to 7
+	i := j*64
+	MEM[addr + storeOffset*8] := v1[i+63:i]
+	storeOffset := storeOffset + 1
+	IF ((addr + storeOffset*8) % 64) == 0
+		BREAK
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name="vpackstorelpd" form="m512 {k}, zmm" xed=""/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="KNC" rettype="void" name="_mm512_mask_packstorelo_pd">
+	<type>Floating Point</type>
+	<CPUID>KNCNI</CPUID>
+	<category>Store</category>
+	<parameter varname="mt" type="void*"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="v1" type="__m512d"/>
+	<description>Stores packed double-precision (64-bit) floating-point elements of "v1" into a quadword stream at a logically mapped starting address "mt", storing the low-64-byte elements of that stream (those elements of the stream that map before the first 64-byte-aligned address following "mt"). Elements are stored to memory according to element selector "k" (elements are skipped when the corresponding mask bit is not set).</description>
+	<operation>
+storeOffset := 0
+addr = mt
+FOR j := 0 to 7
+	IF k[j]
+		i := j*64
+		MEM[addr + storeOffset*8] := v1[i+63:i]
+		storeOffset := storeOffset + 1
+		IF ((addr + storeOffset*8) % 64) == 0
+			BREAK
+		FI
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name="vpackstorelpd" form="m512 {k}, zmm" xed=""/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="KNC" rettype="unsigned int" name="_mm_countbits_32">
+	<CPUID>KNCNI</CPUID>
+	<category>Bit Manipulation</category>
+	<parameter varname="r1" type="unsigned int"/>
+	<description>Counts the number of set bits in 32-bit unsigned integer "r1", returning the results in "dst".</description>
+	<operation>
+dst[31:0] := PopCount(r1[31:0])
+	</operation>
+	<instruction name="popcnt" form="r32, r32" xed=""/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="KNC" rettype="unsigned __int64" name="_mm_countbits_64">
+	<CPUID>KNCNI</CPUID>
+	<category>Bit Manipulation</category>
+	<parameter varname="r1" type="unsigned __int64"/>
+	<description>Counts the number of set bits in 64-bit unsigned integer "r1", returning the results in "dst".</description>
+	<operation>
+dst[63:0] := PopCount(r1[63:0])
+	</operation>
+	<instruction name="popcnt" form="r64, r64" xed=""/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="KNC" rettype="__mmask16" name="_mm512_kmovlhb">
+	<type>Mask</type>
+	<CPUID>KNCNI</CPUID>
+	<category>Mask</category>
+	<parameter varname="k1" type="__mmask16"/>
+	<parameter varname="k2" type="__mmask16"/>
+	<description>Inserts the low byte of mask "k2" into the high byte of "dst", and copies the low byte of "k1" to the low byte of "dst".</description>
+	<operation>
+dst[7:0] := k1[7:0]
+dst[15:8] := k2[7:0]
+	</operation>
+	<instruction name="kmerge2l1l" form="k, k" xed=""/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="KNC" rettype="__m512i" name="_mm512_cvtfxpnt_roundpd_epi32lo">
+	<type>Floating Point</type>
+	<type>Integer</type>
+	<CPUID>KNCNI</CPUID>
+	<category>Convert</category>
+	<parameter varname="v2" type="__m512d"/>
+	<parameter varname="rounding" type="int"/>
+	<description>Performs an element-by-element conversion of elements in packed double-precision (64-bit) floating-point vector "v2" to 32-bit integer elements, storing them in the lower half of "dst". The elements in the upper half of "dst" are set to 0.
+	[round_note]
+	</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*64
+	k := j*32
+	dst[k+31:k] := Float64ToInt32(v2[i+63:i])
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name="vcvtfxpntpd2dq" form="zmm {k}, m512, imm" xed=""/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="KNC" rettype="__m512i" name="_mm512_mask_cvtfxpnt_roundpd_epi32lo">
+	<type>Floating Point</type>
+	<type>Integer</type>
+	<CPUID>KNCNI</CPUID>
+	<category>Convert</category>
+	<parameter varname="src" type="__m512i"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="v2" type="__m512d"/>
+	<parameter varname="rounding" type="int"/>
+	<description>Performs an element-by-element conversion of elements in packed double-precision (64-bit) floating-point vector "v2" to 32-bit integer elements, storing them in the lower half of "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). The elements in the upper half of "dst" are set to 0.
+	[round_note]
+	</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*64
+	l := j*32
+	IF k[j]
+		dst[l+31:l] := Float64ToInt32(v2[i+63:i])
+	ELSE
+		dst[l+31:l] := src[l+31:l]
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name="vcvtfxpntpd2dq" form="zmm {k}, m512, imm" xed=""/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="KNC" rettype="__m512" name="_mm512_cvtfxpnt_round_adjustepi32_ps">
+	<type>Floating Point</type>
+	<type>Integer</type>
+	<CPUID>KNCNI</CPUID>
+	<category>Convert</category>
+	<parameter varname="v2" type="__m512i"/>
+	<parameter varname="rounding" type="int"/>
+	<parameter varname="expadj" type="_MM_EXP_ADJ_ENUM"/>
+	<description>Performs element-by-element conversion of packed 32-bit integer elements in "v2" to packed single-precision (32-bit) floating-point elements and performing an optional exponent adjust using "expadj", storing the results in "dst".
+	[round_note]
+	</description>
+	<operation>
+FOR j := 0 to 15
+	i := j*32
+	dst[i+31:i] := Int32ToFloat32(v2[i+31:i])
+	CASE expadj OF
+	_MM_EXPADJ_NONE: dst[i+31:i] = dst[i+31:i] * 2**0
+	_MM_EXPADJ_4:	dst[i+31:i] = dst[i+31:i] * 2**4
+	_MM_EXPADJ_5:	dst[i+31:i] = dst[i+31:i] * 2**5
+	_MM_EXPADJ_8:	dst[i+31:i] = dst[i+31:i] * 2**8
+	_MM_EXPADJ_16:   dst[i+31:i] = dst[i+31:i] * 2**16
+	_MM_EXPADJ_24:   dst[i+31:i] = dst[i+31:i] * 2**24
+	_MM_EXPADJ_31:   dst[i+31:i] = dst[i+31:i] * 2**31
+	_MM_EXPADJ_32:   dst[i+31:i] = dst[i+31:i] * 2**32
+	ESAC
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name="vcvtfxpntdq2ps" form="zmm {k}, m512, imm" xed=""/>
+	<header>immintrin.h</header>
+</intrinsic>
+
+<intrinsic tech="AVX-512/KNC" rettype="__m512" name="_mm512_abs_ps">
+	<type>Floating Point</type>
+	<CPUID>AVX512F/KNCNI</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="v2" type="__m512"/>
+	<description>Finds the absolute value of each packed single-precision (32-bit) floating-point element in "v2", storing the results in "dst".</description>
+	<operation>
+FOR j := 0 to 15
+	i := j*32
+	dst[i+31:i] := ABS(v2[i+31:i])
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name="vpandd" form="zmm {k}, zmm, m512" xed=""/>
+	<header>immintrin.h</header>
+</intrinsic>
+
+<intrinsic tech="AVX-512/KNC" rettype="__m512" name="_mm512_mask_abs_ps">
+	<type>Floating Point</type>
+	<CPUID>AVX512F/KNCNI</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="src" type="__m512"/>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="v2" type="__m512"/>
+	<description>Finds the absolute value of each packed single-precision (32-bit) floating-point element in "v2", storing the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 15
+	i := j*32
+	IF k[j]
+		dst[i+31:i] := ABS(v2[i+31:i])
+	ELSE
+		dst[i+31:i] := src[i+31:i]
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name="vpandd" form="zmm {k}, zmm, m512" xed=""/>
+	<header>immintrin.h</header>
+</intrinsic>
+
+<intrinsic tech="AVX-512/KNC" rettype="__m512d" name="_mm512_abs_pd">
+	<type>Floating Point</type>
+	<CPUID>AVX512F/KNCNI</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="v2" type="__m512d"/>
+	<description>Finds the absolute value of each packed double-precision (64-bit) floating-point element in "v2", storing the results in "dst".</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*64
+	dst[i+63:i] := ABS(v2[i+63:i])
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name="vpandq" form="zmm {k}, zmm, m512" xed=""/>
+	<header>immintrin.h</header>
+</intrinsic>
+
+<intrinsic tech="AVX-512/KNC" rettype="__m512d" name="_mm512_mask_abs_pd">
+	<type>Floating Point</type>
+	<CPUID>AVX512F/KNCNI</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="src" type="__m512d"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="v2" type="__m512d"/>
+	<description>Finds the absolute value of each packed double-precision (64-bit) floating-point element in "v2", storing the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*64
+	IF k[j]
+		dst[i+63:i] := ABS(v2[i+63:i])
+	ELSE
+		dst[i+63:i] := src[i+63:i]
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name="vpandq" form="zmm {k}, zmm, m512" xed=""/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="KNC" rettype="__m512" name="_mm512_log2ae23_ps">
+	<type>Floating Point</type>
+	<CPUID>KNCNI</CPUID>
+	<category>Elementary Math Functions</category>
+	<parameter varname="a" type="__m512"/>
+	<description>Compute the base-2 logarithm of packed single-precision (32-bit) floating-point elements in "a" with absolute error of 2^(-23) and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 15
+	i := j*32
+	dst[i+31:i] := Log2ae23(a[i+31:i])
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name="vlog2ps" form="zmm {k}, m512" xed=""/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="KNC" rettype="__m512" name="_mm512_mask_log2ae23_ps">
+	<type>Floating Point</type>
+	<CPUID>KNCNI</CPUID>
+	<category>Elementary Math Functions</category>
+	<parameter varname="src" type="__m512"/>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="a" type="__m512"/>
+	<description>Compute the base-2 logarithm of packed single-precision (32-bit) floating-point elements in "a" with absolute error of 2^(-23) and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 15
+	i := j*32
+	IF k[j]
+		dst[i+31:i] := Log2ae23(a[i+31:i])
+	ELSE
+		dst[i+31:i] := src[i+31:i]
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name="vlog2ps" form="zmm {k}, m512" xed=""/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="KNC" rettype="__m512i" name="_mm512_fmadd_epi32">
+	<type>Integer</type>
+	<CPUID>KNCNI</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="a" type="__m512i"/>
+	<parameter varname="b" type="__m512i"/>
+	<parameter varname="c" type="__m512i"/>
+	<description>Multiply packed 32-bit integer elements in "a" and "b", add the intermediate result to packed elements in "c" and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 15
+	i := j*32
+	dst[i+31:i] := (a[i+31:i] * b[i+31:i]) + c[i+31:i]
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name="vpmadd231d" form="zmm {k}, zmm, zmm" xed=""/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="KNC" rettype="__m512i" name="_mm512_mask_fmadd_epi32">
+	<type>Integer</type>
+	<CPUID>KNCNI</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="a" type="__m512i"/>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="b" type="__m512i"/>
+	<parameter varname="c" type="__m512i"/>
+	<description>Multiply packed 32-bit integer elements in "a" and "b", add the intermediate result to packed elements in "c" and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 15
+	i := j*32
+	IF k[j]
+		dst[i+31:i] := (a[i+31:i] * b[i+31:i]) + c[i+31:i]
+	ELSE
+		dst[i+31:i] := a[i+31:i]
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name="vpmadd231d" form="zmm {k}, zmm, zmm" xed=""/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="KNC" rettype="__m512i" name="_mm512_mask3_fmadd_epi32">
+	<type>Integer</type>
+	<CPUID>KNCNI</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="a" type="__m512i"/>
+	<parameter varname="b" type="__m512i"/>
+	<parameter varname="c" type="__m512i"/>
+	<parameter varname="k" type="__mmask16"/>
+	<description>Multiply packed 32-bit integer elements in "a" and "b", add the intermediate result to packed elements in "c" and store the results in "dst" using writemask "k" (elements are copied from "c" when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 15
+	i := j*32
+	IF k[j]
+		dst[i+31:i] := (a[i+31:i] * b[i+31:i]) + c[i+31:i]
+	ELSE
+		dst[i+31:i] := c[i+31:i]
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name="vpmadd231d" form="zmm {k}, zmm, zmm" xed=""/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="KNC" rettype="__m512i" name="_mm512_fmadd233_epi32">
+	<type>Integer</type>
+	<CPUID>KNCNI</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="a" type="__m512i"/>
+	<parameter varname="b" type="__m512i"/>
+	<description>Multiply packed 32-bit integer elements in each 4-element set of "a" by element 1 of the corresponding 4-element set from "b", add the intermediate result to element 0 of the corresponding 4-element set from "b", and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 15
+	i := j*32
+	base := (j &amp; ~0x3) * 32
+	scale[31:0] := b[base+63:base+32]
+	bias[31:0]  := b[base+31:base]
+	dst[i+31:i] := (a[i+31:i] * scale[31:0]) + bias[31:0]
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name="vpmadd233d" form="zmm {k}, zmm, zmm" xed=""/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="KNC" rettype="__m512i" name="_mm512_mask_fmadd233_epi32">
+	<type>Integer</type>
+	<CPUID>KNCNI</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="src" type="__m512i"/>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="a" type="__m512i"/>
+	<parameter varname="b" type="__m512i"/>
+	<description>Multiply packed 32-bit integer elements in each 4-element set of "a" by element 1 of the corresponding 4-element set from "b", add the intermediate result to element 0 of the corresponding 4-element set from "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 15
+	i := j*32
+	IF k[j]
+		base := (j &amp; ~0x3) * 32
+		scale[31:0] := b[base+63:base+32]
+		bias[31:0]  := b[base+31:base]
+		dst[i+31:i] := (a[i+31:i] * scale[31:0]) + bias[31:0]
+	ELSE
+		dst[i+31:i] := src[i+31:i]
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name="vpmadd233d" form="zmm {k}, zmm, zmm" xed=""/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="KNC" rettype="__m512" name="_mm512_fmadd233_round_ps">
+	<type>Floating Point</type>
+	<CPUID>KNCNI</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="a" type="__m512"/>
+	<parameter varname="b" type="__m512"/>
+	<parameter varname="rounding" type="int"/>
+	<description>Multiply packed single-precision (32-bit) floating-point elements in each 4-element set of "a" by element 1 of the corresponding 4-element set from "b", add the intermediate result to element 0 of the corresponding 4-element set from "b", and store the results in "dst".
+	[round_note]
+	</description>
+	<operation>
+FOR j := 0 to 15
+	i := j*32
+	base := (j &amp; ~0x3) * 32
+	scale[31:0] := b[base+63:base+32]
+	bias[31:0]  := b[base+31:base]
+	dst[i+31:i] := (a[i+31:i] * scale[31:0]) + bias[31:0]
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name="vfmadd233ps" form="zmm {k}, zmm, zmm" xed=""/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="KNC" rettype="__m512" name="_mm512_mask_fmadd233_round_ps">
+	<type>Floating Point</type>
+	<CPUID>KNCNI</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="src" type="__m512"/>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="a" type="__m512"/>
+	<parameter varname="b" type="__m512"/>
+	<parameter varname="rounding" type="int"/>
+	<description>Multiply packed single-precision (32-bit) floating-point elements in each 4-element set of "a" by element 1 of the corresponding 4-element set from "b", add the intermediate result to element 0 of the corresponding 4-element set from "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).
+	[round_note]
+	</description>
+	<operation>
+FOR j := 0 to 15
+	i := j*32
+	IF k[j]
+		base := (j &amp; ~0x3) * 32
+		scale[31:0] := b[base+63:base+32]
+		bias[31:0]  := b[base+31:base]
+		dst[i+31:i] := (a[i+31:i] * scale[31:0]) + bias[31:0]
+	ELSE
+		dst[i+31:i] := src[i+31:i]
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name="vfmadd233ps" form="zmm {k}, zmm, zmm" xed=""/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="KNC" rettype="__m512" name="_mm512_maxabs_ps">
+	<type>Floating Point</type>
+	<CPUID>KNCNI</CPUID>
+	<category>Special Math Functions</category>
+	<parameter varname="a" type="__m512"/>
+	<parameter varname="b" type="__m512"/>
+	<description>Determines the maximum of the absolute elements of each pair of corresponding elements of packed single-precision (32-bit) floating-point elements in "a" and "b", storing the results in "dst".</description>
+	<operation>
+FOR j := 0 to 15
+	i := j*32
+	dst[i+31:i] := FpMax(Abs(a[i+31:i]), Abs(b[i+31:i]))
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name="vgmaxabsps" form="zmm {k}, zmm, m512" xed=""/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="KNC" rettype="__m512" name="_mm512_mask_maxabs_ps">
+	<type>Floating Point</type>
+	<CPUID>KNCNI</CPUID>
+	<category>Special Math Functions</category>
+	<parameter varname="src" type="__m512"/>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="a" type="__m512"/>
+	<parameter varname="b" type="__m512"/>
+	<description>Determines the maximum of the absolute elements of each pair of corresponding elements of packed single-precision (32-bit) floating-point elements in "a" and "b", storing the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 15
+	i := j*32
+	IF k[j]
+		dst[i+31:i] := FpMax(Abs(a[i+31:i]), Abs(b[i+31:i]))
+	ELSE
+		dst[i+31:i] := src[i+31:i]
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name="vgmaxabsps" form="zmm {k}, zmm, m512" xed=""/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="KNC" rettype="__m512" name="_mm512_gmax_ps">
+	<type>Floating Point</type>
+	<CPUID>KNCNI</CPUID>
+	<category>Special Math Functions</category>
+	<parameter varname="a" type="__m512"/>
+	<parameter varname="b" type="__m512"/>
+	<description>Determines the maximum of each pair of corresponding elements in packed single-precision (32-bit) floating-point elements in "a" and "b", storing the results in "dst".</description>
+	<operation>
+FOR j := 0 to 15
+	i := j*32
+	dst[i+31:i] := FpMax(a[i+31:i], b[i+31:i])
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name="vgmaxps" form="zmm {k}, zmm, m512" xed=""/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="KNC" rettype="__m512" name="_mm512_mask_gmax_ps">
+	<type>Floating Point</type>
+	<CPUID>KNCNI</CPUID>
+	<category>Special Math Functions</category>
+	<parameter varname="src" type="__m512"/>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="a" type="__m512"/>
+	<parameter varname="b" type="__m512"/>
+	<description>Determines the maximum of each pair of corresponding elements of packed single-precision (32-bit) floating-point elements in "a" and "b", storing the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 15
+	i := j*32
+	IF k[j]
+		dst[i+31:i] := FpMax(a[i+31:i], b[i+31:i])
+	ELSE
+		dst[i+31:i] := src[i+31:i]
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name="vgmaxps" form="zmm {k}, zmm, m512" xed=""/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="KNC" rettype="__m512" name="_mm512_gmaxabs_ps">
+	<type>Floating Point</type>
+	<CPUID>KNCNI</CPUID>
+	<category>Special Math Functions</category>
+	<parameter varname="a" type="__m512"/>
+	<parameter varname="b" type="__m512"/>
+	<description>Determines the maximum of the absolute elements of each pair of corresponding elements of packed single-precision (32-bit) floating-point elements in "a" and "b", storing the results in "dst".</description>
+	<operation>
+FOR j := 0 to 15
+	i := j*32
+	dst[i+31:i] := FpMax(Abs(a[i+31:i]), Abs(b[i+31:i]))
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name="vgmaxabsps" form="zmm {k}, zmm, m512" xed=""/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="KNC" rettype="__m512" name="_mm512_mask_gmaxabs_ps">
+	<type>Floating Point</type>
+	<CPUID>KNCNI</CPUID>
+	<category>Special Math Functions</category>
+	<parameter varname="src" type="__m512"/>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="a" type="__m512"/>
+	<parameter varname="b" type="__m512"/>
+	<description>Determines the maximum of the absolute elements of each pair of corresponding elements of packed single-precision (32-bit) floating-point elements in "a" and "b", storing the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 15
+	i := j*32
+	IF k[j]
+		dst[i+31:i] := FpMax(Abs(a[i+31:i]), Abs(b[i+31:i]))
+	ELSE
+		dst[i+31:i] := src[i+31:i]
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name="vgmaxabsps" form="zmm {k}, zmm, m512" xed=""/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="KNC" rettype="__m512d" name="_mm512_gmax_pd">
+	<type>Floating Point</type>
+	<CPUID>KNCNI</CPUID>
+	<category>Special Math Functions</category>
+	<parameter varname="a" type="__m512d"/>
+	<parameter varname="b" type="__m512d"/>
+	<description>Determines the maximum of each pair of corresponding elements in packed double-precision (64-bit) floating-point elements in "a" and "b", storing the results in "dst".</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*64
+	dst[i+63:i] := FpMax(a[i+63:i], b[i+63:i])
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name="vgmaxpd" form="zmm {k}, zmm, m512" xed=""/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="KNC" rettype="__m512d" name="_mm512_mask_gmax_pd">
+	<type>Floating Point</type>
+	<CPUID>KNCNI</CPUID>
+	<category>Special Math Functions</category>
+	<parameter varname="src" type="__m512d"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m512d"/>
+	<parameter varname="b" type="__m512d"/>
+	<description>Determines the maximum of each pair of corresponding elements of packed double-precision (64-bit) floating-point elements in "a" and "b", storing the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*64
+	IF k[j]
+		dst[i+63:i] := FpMax(a[i+63:i], b[i+63:i])
+	ELSE
+		dst[i+63:i] := src[i+63:i]
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name="vgmaxpd" form="zmm {k}, zmm, m512" xed=""/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="KNC" rettype="__m512" name="_mm512_gmin_ps">
+	<type>Floating Point</type>
+	<CPUID>KNCNI</CPUID>
+	<category>Special Math Functions</category>
+	<parameter varname="a" type="__m512"/>
+	<parameter varname="b" type="__m512"/>
+	<description>Determines the minimum of each pair of corresponding elements in packed single-precision (32-bit) floating-point elements in "a" and "b", storing the results in "dst".</description>
+	<operation>
+FOR j := 0 to 15
+	i := j*32
+	dst[i+31:i] := FpMin(a[i+31:i], b[i+31:i])
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name="vgminps" form="zmm {k}, zmm, m512" xed=""/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="KNC" rettype="__m512" name="_mm512_mask_gmin_ps">
+	<type>Floating Point</type>
+	<CPUID>KNCNI</CPUID>
+	<category>Special Math Functions</category>
+	<parameter varname="src" type="__m512"/>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="a" type="__m512"/>
+	<parameter varname="b" type="__m512"/>
+	<description>Determines the minimum of each pair of corresponding elements of packed single-precision (32-bit) floating-point elements in "a" and "b", storing the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 15
+	i := j*32
+	IF k[j]
+		dst[i+31:i] := FpMin(a[i+31:i], b[i+31:i])
+	ELSE
+		dst[i+31:i] := src[i+31:i]
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name="vgminps" form="zmm {k}, zmm, m512" xed=""/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="KNC" rettype="__m512d" name="_mm512_gmin_pd">
+	<type>Floating Point</type>
+	<CPUID>KNCNI</CPUID>
+	<category>Special Math Functions</category>
+	<parameter varname="a" type="__m512d"/>
+	<parameter varname="b" type="__m512d"/>
+	<description>Determines the minimum of each pair of corresponding elements in packed double-precision (64-bit) floating-point elements in "a" and "b", storing the results in "dst".</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*64
+	dst[i+63:i] := FpMin(a[i+63:i], b[i+63:i])
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name="vgminpd" form="zmm {k}, zmm, m512" xed=""/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="KNC" rettype="__m512d" name="_mm512_mask_gmin_pd">
+	<type>Floating Point</type>
+	<CPUID>KNCNI</CPUID>
+	<category>Special Math Functions</category>
+	<parameter varname="src" type="__m512d"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m512d"/>
+	<parameter varname="b" type="__m512d"/>
+	<description>Determines the minimum of each pair of corresponding elements of packed double-precision (64-bit) floating-point elements in "a" and "b", storing the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*64
+	IF k[j]
+		dst[i+63:i] := FpMin(a[i+63:i], b[i+63:i])
+	ELSE
+		dst[i+63:i] := src[i+63:i]
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name="vgminpd" form="zmm {k}, zmm, m512" xed=""/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="KNC" rettype="__m512i" name="_mm512_mulhi_epi32">
+	<type>Integer</type>
+	<CPUID>KNCNI</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="a" type="__m512i"/>
+	<parameter varname="b" type="__m512i"/>
+	<description>Performs element-by-element multiplication between packed 32-bit integer elements in "a" and "b" and stores the high 32 bits of each result into "dst".</description>
+	<operation>
+FOR j := 0 to 15
+	i := j*32
+	dst[i+31:i] := (a[i+31:i] * b[i+31:i]) &gt;&gt; 32
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name="vpmulhd" form="zmm {k}, zmm, m512" xed=""/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="KNC" rettype="__m512i" name="_mm512_mask_mulhi_epi32">
+	<type>Integer</type>
+	<CPUID>KNCNI</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="src" type="__m512i"/>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="a" type="__m512i"/>
+	<parameter varname="b" type="__m512i"/>
+	<description>Performs element-by-element multiplication between packed 32-bit integer elements in "a" and "b" and stores the high 32 bits of each result into "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 15
+	i := j*32
+	IF k[j]
+		dst[i+31:i] := (a[i+31:i] * b[i+31:i]) &gt;&gt; 32
+	ELSE
+		dst[i+31:i] := src[i+31:i]
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name="vpmulhd" form="zmm {k}, zmm, m512" xed=""/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="KNC" rettype="__m512i" name="_mm512_mulhi_epu32">
+	<type>Integer</type>
+	<CPUID>KNCNI</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="a" type="__m512i"/>
+	<parameter varname="b" type="__m512i"/>
+	<description>Performs element-by-element multiplication between packed unsigned 32-bit integer elements in "a" and "b" and stores the high 32 bits of each result into "dst".</description>
+	<operation>
+FOR j := 0 to 15
+	i := j*32
+	dst[i+31:i] := (a[i+31:i] * b[i+31:i]) &gt;&gt; 32
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name="vpmulhud" form="zmm {k}, zmm, m512" xed=""/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="KNC" rettype="__m512i" name="_mm512_mask_mulhi_epu32">
+	<type>Integer</type>
+	<CPUID>KNCNI</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="src" type="__m512i"/>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="a" type="__m512i"/>
+	<parameter varname="b" type="__m512i"/>
+	<description>Performs element-by-element multiplication between packed unsigned 32-bit integer elements in "a" and "b" and stores the high 32 bits of each result into "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 15
+	i := j*32
+	IF k[j]
+		dst[i+31:i] := (a[i+31:i] * b[i+31:i]) &gt;&gt; 32
+	ELSE
+		dst[i+31:i] := src[i+31:i]
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name="vpmulhud" form="zmm {k}, zmm, m512" xed=""/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="KNC" rettype="__m512i" name="_mm512_permute4f128_epi32">
+	<type>Integer</type>
+	<CPUID>KNCNI</CPUID>
+	<category>Swizzle</category>
+	<parameter varname="a" type="__m512i"/>
+	<parameter varname="imm8" type="_MM_PERM_ENUM"/>
+	<description>Permutes 128-bit blocks of the packed 32-bit integer vector "a" using constant "imm8". The results are stored in "dst".</description>
+	<operation>
+SELECT4(src, control) {
+	CASE control[1:0] OF
+	0: tmp[127:0] := src[127:0]
+	1: tmp[127:0] := src[255:128]
+	2: tmp[127:0] := src[383:256]
+	3: tmp[127:0] := src[511:384]
+	ESAC
+	RETURN tmp[127:0]
+}
+
+FOR j := 0 to 3
+	i := j*128
+	n := j*2
+	dst[i+127:i] := SELECT4(a[511:0], imm8[n+1:n])
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name="vpermf32x4" form="zmm {k}, m512, imm" xed=""/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="KNC" rettype="__m512i" name="_mm512_mask_permute4f128_epi32">
+	<type>Integer</type>
+	<CPUID>KNCNI</CPUID>
+	<category>Swizzle</category>
+	<parameter varname="src" type="__m512i"/>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="a" type="__m512i"/>
+	<parameter varname="imm8" type="_MM_PERM_ENUM"/>
+	<description>Permutes 128-bit blocks of the packed 32-bit integer vector "a" using constant "imm8". The results are stored in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+	<operation>
+SELECT4(src, control) {
+	CASE control[1:0] OF
+	0: tmp[127:0] := src[127:0]
+	1: tmp[127:0] := src[255:128]
+	2: tmp[127:0] := src[383:256]
+	3: tmp[127:0] := src[511:384]
+	ESAC
+	RETURN tmp[127:0]
+}
+
+tmp[511:0] := 0
+FOR j := 0 to 3
+	i := j*128
+	n := j*2
+	tmp[i+127:i] := SELECT4(a[511:0], imm8[n+1:n])
+ENDFOR
+FOR j := 0 to 15
+	IF k[j]
+		dst[j*32+31:j*32] := tmp[j*32+31:j*32]
+	ELSE
+		dst[j*32+31:j*32] := src[j*32+31:j*32]
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name="vpermf32x4" form="zmm {k}, m512, imm" xed=""/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="KNC" rettype="__m512" name="_mm512_rcp23_ps">
+	<type>Floating Point</type>
+	<CPUID>KNCNI</CPUID>
+	<category>Elementary Math Functions</category>
+	<parameter varname="a" type="__m512"/>
+	<description>Approximates the reciprocals of packed single-precision (32-bit) floating-point elements in "a" to 23 bits of precision, storing the results in "dst".</description>
+	<operation>
+FOR j := 0 to 15
+	i := j*32
+	dst[i+31:i] := APPROXIMATE(1.0/a[i+31:i])
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name="vrcp23ps" form="zmm {k}, m512" xed=""/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="KNC" rettype="__m512" name="_mm512_mask_rcp23_ps">
+	<type>Floating Point</type>
+	<CPUID>KNCNI</CPUID>
+	<category>Elementary Math Functions</category>
+	<parameter varname="src" type="__m512"/>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="a" type="__m512"/>
+	<description>Approximates the reciprocals of packed single-precision (32-bit) floating-point elements in "a" to 23 bits of precision, storing the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 15
+	i := j*32
+	IF k[j]
+		dst[i+31:i] := APPROXIMATE(1.0/a[i+31:i])
+	ELSE
+		dst[i+31:i] := src[i+31:i]
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name="vrcp23ps" form="zmm {k}, m512" xed=""/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="KNC" rettype="__m512" name="_mm512_round_ps">
+	<type>Floating Point</type>
+	<CPUID>KNCNI</CPUID>
+	<category>Convert</category>
+	<parameter varname="a" type="__m512"/>
+	<parameter varname="rounding" type="int"/>
+	<parameter varname="expadj" type="_MM_EXP_ADJ_ENUM"/>
+	<description>Round the packed single-precision (32-bit) floating-point elements in "a" to the nearest integer value using "expadj" and in the direction of "rounding", and store the results as packed single-precision floating-point elements in "dst".
+	[round_note]
+	</description>
+	<operation>
+FOR j := 0 to 15
+	i := j*32
+	dst[i+31:i] := ROUND(a[i+31:i])
+	CASE expadj OF
+	_MM_EXPADJ_NONE: dst[i+31:i] = dst[i+31:i] * 2**0
+	_MM_EXPADJ_4:	dst[i+31:i] = dst[i+31:i] * 2**4
+	_MM_EXPADJ_5:	dst[i+31:i] = dst[i+31:i] * 2**5
+	_MM_EXPADJ_8:	dst[i+31:i] = dst[i+31:i] * 2**8
+	_MM_EXPADJ_16:   dst[i+31:i] = dst[i+31:i] * 2**16
+	_MM_EXPADJ_24:   dst[i+31:i] = dst[i+31:i] * 2**24
+	_MM_EXPADJ_31:   dst[i+31:i] = dst[i+31:i] * 2**31
+	_MM_EXPADJ_32:   dst[i+31:i] = dst[i+31:i] * 2**32
+	ESAC
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name="vroundps" form="zmm {k}, m512, imm" xed=""/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="KNC" rettype="__m512" name="_mm512_mask_round_ps">
+	<type>Floating Point</type>
+	<CPUID>KNCNI</CPUID>
+	<category>Convert</category>
+	<parameter varname="src" type="__m512"/>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="a" type="__m512"/>
+	<parameter varname="rounding" type="int"/>
+	<parameter varname="expadj" type="_MM_EXP_ADJ_ENUM"/>
+	<description>Round the packed single-precision (32-bit) floating-point elements in "a" to the nearest integer value using "expadj" and in the direction of "rounding", and store the results as packed single-precision floating-point elements in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).
+	[round_note]
+	</description>
+	<operation>
+FOR j := 0 to 15
+	i := j*32
+	IF k[j]
+		dst[i+31:i] := ROUND(a[i+31:i])
+		CASE expadj OF
+		_MM_EXPADJ_NONE: dst[i+31:i] = dst[i+31:i] * 2**0
+		_MM_EXPADJ_4:	dst[i+31:i] = dst[i+31:i] * 2**4
+		_MM_EXPADJ_5:	dst[i+31:i] = dst[i+31:i] * 2**5
+		_MM_EXPADJ_8:	dst[i+31:i] = dst[i+31:i] * 2**8
+		_MM_EXPADJ_16:   dst[i+31:i] = dst[i+31:i] * 2**16
+		_MM_EXPADJ_24:   dst[i+31:i] = dst[i+31:i] * 2**24
+		_MM_EXPADJ_31:   dst[i+31:i] = dst[i+31:i] * 2**31
+		_MM_EXPADJ_32:   dst[i+31:i] = dst[i+31:i] * 2**32
+		ESAC
+	ELSE
+		dst[i+31:i] := src[i+31:i]
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name="vroundps" form="zmm {k}, m512, imm" xed=""/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="KNC" rettype="__m512" name="_mm512_roundfxpnt_adjust_ps">
+	<type>Floating Point</type>
+	<CPUID>KNCNI</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="a" type="__m512"/>
+	<parameter varname="rounding" type="int"/>
+	<parameter varname="expadj" type="_MM_EXP_ADJ_ENUM"/>
+	<description>Performs element-by-element rounding of packed single-precision (32-bit) floating-point elements in "a" using "expadj" and in the direction of "rounding" and stores results in "dst".
+	[round_note]
+	</description>
+	<operation>
+FOR j := 0 to 15
+	i := j*32
+	dst[i+31:i] := ROUND(a[i+31:i])
+	CASE expadj OF
+	_MM_EXPADJ_NONE: dst[i+31:i] = dst[i+31:i] * 2**0
+	_MM_EXPADJ_4:	dst[i+31:i] = dst[i+31:i] * 2**4
+	_MM_EXPADJ_5:	dst[i+31:i] = dst[i+31:i] * 2**5
+	_MM_EXPADJ_8:	dst[i+31:i] = dst[i+31:i] * 2**8
+	_MM_EXPADJ_16:   dst[i+31:i] = dst[i+31:i] * 2**16
+	_MM_EXPADJ_24:   dst[i+31:i] = dst[i+31:i] * 2**24
+	_MM_EXPADJ_31:   dst[i+31:i] = dst[i+31:i] * 2**31
+	_MM_EXPADJ_32:   dst[i+31:i] = dst[i+31:i] * 2**32
+	ESAC
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name="vrndfxpntps" form="zmm {k}, m512, imm" xed=""/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="KNC" rettype="__m512" name="_mm512_mask_roundfxpnt_adjust_ps">
+	<type>Floating Point</type>
+	<CPUID>KNCNI</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="src" type="__m512"/>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="a" type="__m512"/>
+	<parameter varname="rounding" type="int"/>
+	<parameter varname="expadj" type="_MM_EXP_ADJ_ENUM"/>
+	<description>Performs element-by-element rounding of packed single-precision (32-bit) floating-point elements in "a" using "expadj" and in the direction of "rounding" and stores results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).
+	[round_note]
+	</description>
+	<operation>
+FOR j := 0 to 15
+	i := j*32
+	IF k[j]
+		dst[i+31:i] := ROUND(a[i+31:i])
+		CASE expadj OF
+		_MM_EXPADJ_NONE: dst[i+31:i] = dst[i+31:i] * 2**0
+		_MM_EXPADJ_4:	dst[i+31:i] = dst[i+31:i] * 2**4
+		_MM_EXPADJ_5:	dst[i+31:i] = dst[i+31:i] * 2**5
+		_MM_EXPADJ_8:	dst[i+31:i] = dst[i+31:i] * 2**8
+		_MM_EXPADJ_16:   dst[i+31:i] = dst[i+31:i] * 2**16
+		_MM_EXPADJ_24:   dst[i+31:i] = dst[i+31:i] * 2**24
+		_MM_EXPADJ_31:   dst[i+31:i] = dst[i+31:i] * 2**31
+		_MM_EXPADJ_32:   dst[i+31:i] = dst[i+31:i] * 2**32
+		ESAC
+	ELSE
+		dst[i+31:i] := src[i+31:i]
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name="vrndfxpntps" form="zmm {k}, m512, imm" xed=""/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="KNC" rettype="__m512d" name="_mm512_roundfxpnt_adjust_pd">
+	<type>Floating Point</type>
+	<CPUID>KNCNI</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="a" type="__m512d"/>
+	<parameter varname="rounding" type="int"/>
+	<parameter varname="expadj" type="_MM_EXP_ADJ_ENUM"/>
+	<description>Performs element-by-element rounding of packed double-precision (64-bit) floating-point elements in "a" using "expadj" and in the direction of "rounding" and stores results in "dst".
+	[round_note]
+	</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*64
+	dst[i+63:i] := ROUND(a[i+63:i])
+	CASE expadj OF
+	_MM_EXPADJ_NONE: dst[i+63:i] = dst[i+63:i] * 2**0
+	_MM_EXPADJ_4:	dst[i+63:i] = dst[i+63:i] * 2**4
+	_MM_EXPADJ_5:	dst[i+63:i] = dst[i+63:i] * 2**5
+	_MM_EXPADJ_8:	dst[i+63:i] = dst[i+63:i] * 2**8
+	_MM_EXPADJ_16:   dst[i+63:i] = dst[i+63:i] * 2**16
+	_MM_EXPADJ_24:   dst[i+63:i] = dst[i+63:i] * 2**24
+	_MM_EXPADJ_31:   dst[i+63:i] = dst[i+63:i] * 2**31
+	_MM_EXPADJ_32:   dst[i+63:i] = dst[i+63:i] * 2**32
+	ESAC
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name="vrndfxpntpd" form="zmm {k}, m512, imm" xed=""/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="KNC" rettype="__m512d" name="_mm512_mask_roundfxpnt_adjust_pd">
+	<type>Floating Point</type>
+	<CPUID>KNCNI</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="src" type="__m512d"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m512d"/>
+	<parameter varname="rounding" type="int"/>
+	<parameter varname="expadj" type="_MM_EXP_ADJ_ENUM"/>
+	<description>Performs element-by-element rounding of packed double-precision (64-bit) floating-point elements in "a" using "expadj" and in the direction of "rounding" and stores results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).
+	[round_note]
+	</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*64
+	IF k[j]
+		dst[i+63:i] := ROUND(a[i+63:i])
+		CASE expadj OF
+		_MM_EXPADJ_NONE: dst[i+63:i] = dst[i+63:i] * 2**0
+		_MM_EXPADJ_4:	dst[i+63:i] = dst[i+63:i] * 2**4
+		_MM_EXPADJ_5:	dst[i+63:i] = dst[i+63:i] * 2**5
+		_MM_EXPADJ_8:	dst[i+63:i] = dst[i+63:i] * 2**8
+		_MM_EXPADJ_16:   dst[i+63:i] = dst[i+63:i] * 2**16
+		_MM_EXPADJ_24:   dst[i+63:i] = dst[i+63:i] * 2**24
+		_MM_EXPADJ_31:   dst[i+63:i] = dst[i+63:i] * 2**31
+		_MM_EXPADJ_32:   dst[i+63:i] = dst[i+63:i] * 2**32
+		ESAC
+	ELSE
+		dst[i+63:i] := src[i+63:i]
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name="vrndfxpntpd" form="zmm {k}, m512, imm" xed=""/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="KNC" rettype="__m512" name="_mm512_rsqrt23_ps">
+	<type>Floating Point</type>
+	<CPUID>KNCNI</CPUID>
+	<category>Elementary Math Functions</category>
+	<parameter varname="a" type="__m512"/>
+	<description>Calculates the reciprocal square root of packed single-precision (32-bit) floating-point elements in "a" to 23 bits of accuracy and stores the result in "dst".</description>
+	<operation>
+FOR j := 0 to 15
+	i := j*32
+	dst[i+31:i] := Sqrt(1.0 / a[i+31:i])
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name="vrsqrt23ps" form="zmm {k}, m512" xed=""/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="KNC" rettype="__m512" name="_mm512_mask_rsqrt23_ps">
+	<type>Floating Point</type>
+	<CPUID>KNCNI</CPUID>
+	<category>Elementary Math Functions</category>
+	<parameter varname="src" type="__m512"/>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="a" type="__m512"/>
+	<description>Calculates the reciprocal square root of packed single-precision (32-bit) floating-point elements in "a" to 23 bits of accuracy and stores the result in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 15
+	i := j*32
+	IF k[j]
+		dst[i+31:i] := Sqrt(1.0 / a[i+31:i])
+	ELSE
+		dst[i+31:i] := src[i+31:i]
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name="vrsqrt23ps" form="zmm {k}, m512" xed=""/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="KNC" rettype="__m512" name="_mm512_scale_ps">
+	<type>Floating Point</type>
+	<CPUID>KNCNI</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="a" type="__m512"/>
+	<parameter varname="b" type="__m512i"/>
+	<description>Scales each single-precision (32-bit) floating-point element in "a" by multiplying it by 2**exponent, where the exponent is the corresponding 32-bit integer element in "b", storing results in "dst".</description>
+	<operation>
+FOR j := 0 to 15
+	i := j*32
+	dst[i+31:i] := a[i+31:i] * Pow(2, b[i+31:i])
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name="vscaleps" form="zmm {k}, zmm, m512" xed=""/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="KNC" rettype="__m512" name="_mm512_mask_scale_ps">
+	<type>Floating Point</type>
+	<CPUID>KNCNI</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="src" type="__m512"/>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="a" type="__m512"/>
+	<parameter varname="b" type="__m512i"/>
+	<description>Scales each single-precision (32-bit) floating-point element in "a" by multiplying it by 2**exponent, where the exponent is the corresponding 32-bit integer element in "b", storing results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 15
+	i := j*32
+	IF k[j]
+		dst[i+31:i] := a[i+31:i] * Pow(2, b[i+31:i])
+	ELSE
+		dst[i+31:i] := src[i+31:i]
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name="vscaleps" form="zmm {k}, zmm, m512" xed=""/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="KNC" rettype="__m512" name="_mm512_scale_round_ps" sequence="true">
+	<type>Floating Point</type>
+	<CPUID>KNCNI</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="a" type="__m512"/>
+	<parameter varname="b" type="__m512i"/>
+	<parameter varname="rounding" type="int"/>
+	<description>Scales each single-precision (32-bit) floating-point element in "a" by multiplying it by 2**exponent, where the exponent is the corresponding 32-bit integer element in "b", storing results in "dst". Intermediate elements are rounded using "rounding".
+	[round_note]
+	</description>
+	<operation>
+FOR j := 0 to 15
+	i := j*32
+	dst[i+31:i] := a[i+31:i] * Pow(2, b[i+31:i])
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name="vscaleps" form="zmm {k}, zmm, m512" xed=""/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="KNC" rettype="__m512" name="_mm512_mask_scale_round_ps">
+	<type>Floating Point</type>
+	<CPUID>KNCNI</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="src" type="__m512"/>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="a" type="__m512"/>
+	<parameter varname="b" type="__m512i"/>
+	<parameter varname="rounding" type="int"/>
+	<description>Scales each single-precision (32-bit) floating-point element in "a" by multiplying it by 2**exp, where the exp is the corresponding 32-bit integer element in "b", storing results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). Results are rounded using constant "rounding".
+	[round_note]
+	</description>
+	<operation>
+FOR j := 0 to 15
+	i := j*32
+	IF k[j]
+		dst[i+31:i] := a[i+31:i] * Pow(2, b[i+31:i])
+	ELSE
+		dst[i+31:i] := src[i+31:i]
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name="vscaleps" form="zmm {k}, zmm, m512" xed=""/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="SVML" rettype="__m512d" name="_mm512_acos_pd" sequence="true">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Trigonometry</category>
+	<parameter varname="a" type="__m512d"/>
+	<description>Compute the inverse cosine of packed double-precision (64-bit) floating-point elements in "a" expressed in radians, and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*64
+	dst[i+63:i] := ACOS(a[i+63:i])
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="SVML" rettype="__m512d" name="_mm512_mask_acos_pd" sequence="true">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Trigonometry</category>
+	<parameter varname="src" type="__m512d"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m512d"/>
+	<description>Compute the inverse cosine of packed double-precision (64-bit) floating-point elements in "a" expressed in radians, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*64
+	IF k[j]
+		dst[i+63:i] := ACOS(a[i+63:i])
+	ELSE
+		dst[i+63:i] := src[i+63:i]
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="SVML" rettype="__m512" name="_mm512_acos_ps" sequence="true">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Trigonometry</category>
+	<parameter varname="a" type="__m512"/>
+	<description>Compute the inverse cosine of packed single-precision (32-bit) floating-point elements in "a" expressed in radians, and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 15
+	i := j*32
+	dst[i+31:i] := ACOS(a[i+31:i])
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="SVML" rettype="__m512" name="_mm512_mask_acos_ps" sequence="true">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Trigonometry</category>
+	<parameter varname="src" type="__m512"/>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="a" type="__m512"/>
+	<description>Compute the inverse cosine of packed single-precision (32-bit) floating-point elements in "a" expressed in radians, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 15
+	i := j*32
+	IF k[j]
+		dst[i+31:i] := ACOS(a[i+31:i])
+	ELSE
+		dst[i+31:i] := src[i+31:i]
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="SVML" rettype="__m512d" name="_mm512_acosh_pd" sequence="true">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Trigonometry</category>
+	<parameter varname="a" type="__m512d"/>
+	<description>Compute the inverse hyperbolic cosine of packed double-precision (64-bit) floating-point elements in "a" expressed in radians, and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*64
+	dst[i+63:i] := ACOSH(a[i+63:i])
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="SVML" rettype="__m512d" name="_mm512_mask_acosh_pd" sequence="true">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Trigonometry</category>
+	<parameter varname="src" type="__m512d"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m512d"/>
+	<description>Compute the inverse hyperbolic cosine of packed double-precision (64-bit) floating-point elements in "a" expressed in radians, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*64
+	IF k[j]
+		dst[i+63:i] := ACOSH(a[i+63:i])
+	ELSE
+		dst[i+63:i] := src[i+63:i]
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="SVML" rettype="__m512" name="_mm512_acosh_ps" sequence="true">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Trigonometry</category>
+	<parameter varname="a" type="__m512"/>
+	<description>Compute the inverse hyperbolic cosine of packed single-precision (32-bit) floating-point elements in "a" expressed in radians, and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 15
+	i := j*32
+	dst[i+31:i] := ACOSH(a[i+31:i])
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="SVML" rettype="__m512" name="_mm512_mask_acosh_ps" sequence="true">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Trigonometry</category>
+	<parameter varname="src" type="__m512"/>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="a" type="__m512"/>
+	<description>Compute the inverse hyperbolic cosine of packed single-precision (32-bit) floating-point elements in "a" expressed in radians, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 15
+	i := j*32
+	IF k[j]
+		dst[i+31:i] := ACOSH(a[i+31:i])
+	ELSE
+		dst[i+31:i] := src[i+31:i]
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="SVML" rettype="__m512d" name="_mm512_asin_pd" sequence="true">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Trigonometry</category>
+	<parameter varname="a" type="__m512d"/>
+	<description>Compute the inverse sine of packed double-precision (64-bit) floating-point elements in "a" expressed in radians, and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*64
+	dst[i+63:i] := ASIN(a[i+63:i])
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="SVML" rettype="__m512d" name="_mm512_mask_asin_pd" sequence="true">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Trigonometry</category>
+	<parameter varname="src" type="__m512d"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m512d"/>
+	<description>Compute the inverse sine of packed double-precision (64-bit) floating-point elements in "a" expressed in radians, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*64
+	IF k[j]
+		dst[i+63:i] := ASIN(a[i+63:i])
+	ELSE
+		dst[i+63:i] := src[i+63:i]
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="SVML" rettype="__m512" name="_mm512_asin_ps" sequence="true">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Trigonometry</category>
+	<parameter varname="a" type="__m512"/>
+	<description>Compute the inverse sine of packed single-precision (32-bit) floating-point elements in "a" expressed in radians, and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 15
+	i := j*32
+	dst[i+31:i] := ASIN(a[i+31:i])
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="SVML" rettype="__m512" name="_mm512_mask_asin_ps" sequence="true">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Trigonometry</category>
+	<parameter varname="src" type="__m512"/>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="a" type="__m512"/>
+	<description>Compute the inverse sine of packed single-precision (32-bit) floating-point elements in "a" expressed in radians, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 15
+	i := j*32
+	IF k[j]
+		dst[i+31:i] := ASIN(a[i+31:i])
+	ELSE
+		dst[i+31:i] := src[i+31:i]
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="SVML" rettype="__m512d" name="_mm512_asinh_pd" sequence="true">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Trigonometry</category>
+	<parameter varname="a" type="__m512d"/>
+	<description>Compute the inverse hyperbolic sine of packed double-precision (64-bit) floating-point elements in "a" expressed in radians, and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*64
+	dst[i+63:i] := ASINH(a[i+63:i])
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="SVML" rettype="__m512d" name="_mm512_mask_asinh_pd" sequence="true">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Trigonometry</category>
+	<parameter varname="src" type="__m512d"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m512d"/>
+	<description>Compute the inverse hyperbolic sine of packed double-precision (64-bit) floating-point elements in "a" expressed in radians, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*64
+	IF k[j]
+		dst[i+63:i] := ASINH(a[i+63:i])
+	ELSE
+		dst[i+63:i] := src[i+63:i]
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="SVML" rettype="__m512" name="_mm512_asinh_ps" sequence="true">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Trigonometry</category>
+	<parameter varname="a" type="__m512"/>
+	<description>Compute the inverse hyperbolic sine of packed single-precision (32-bit) floating-point elements in "a" expressed in radians, and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 15
+	i := j*32
+	dst[i+31:i] := ASINH(a[i+31:i])
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="SVML" rettype="__m512" name="_mm512_mask_asinh_ps" sequence="true">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Trigonometry</category>
+	<parameter varname="src" type="__m512"/>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="a" type="__m512"/>
+	<description>Compute the inverse hyperbolic sine of packed single-precision (32-bit) floating-point elements in "a" expressed in radians, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 15
+	i := j*32
+	IF k[j]
+		dst[i+31:i] := ASINH(a[i+31:i])
+	ELSE
+		dst[i+31:i] := src[i+31:i]
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="SVML" rettype="__m512d" name="_mm512_atan2_pd" sequence="true">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Trigonometry</category>
+	<parameter varname="a" type="__m512d"/>
+	<parameter varname="b" type="__m512d"/>
+	<description>Compute the inverse tangent of packed double-precision (64-bit) floating-point elements in "a" divided by packed elements in "b", and store the results in "dst" expressed in radians.</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*64
+	dst[i+63:i] := ATAN(a[i+63:i] / b[i+63:i])
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="SVML" rettype="__m512d" name="_mm512_mask_atan2_pd" sequence="true">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Trigonometry</category>
+	<parameter varname="src" type="__m512d"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m512d"/>
+	<parameter varname="b" type="__m512d"/>
+	<description>Compute the inverse tangent of packed double-precision (64-bit) floating-point elements in "a" divided by packed elements in "b", and store the results in "dst" expressed in radians using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*64
+	IF k[j]
+		dst[i+63:i] := ATAN(a[i+63:i] / b[i+63:i])
+	ELSE
+		dst[i+63:i] := src[i+63:i]
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="SVML" rettype="__m512" name="_mm512_atan2_ps" sequence="true">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Trigonometry</category>
+	<parameter varname="a" type="__m512"/>
+	<parameter varname="b" type="__m512"/>
+	<description>Compute the inverse tangent of packed single-precision (32-bit) floating-point elements in "a" divided by packed elements in "b", and store the results in "dst" expressed in radians.</description>
+	<operation>
+FOR j := 0 to 15
+	i := j*32
+	dst[i+31:i] := ATAN(a[i+31:i] / b[i+31:i])
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="SVML" rettype="__m512" name="_mm512_mask_atan2_ps" sequence="true">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Trigonometry</category>
+	<parameter varname="src" type="__m512"/>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="a" type="__m512"/>
+	<parameter varname="b" type="__m512"/>
+	<description>Compute the inverse tangent of packed single-precision (32-bit) floating-point elements in "a" divided by packed elements in "b", and store the results in "dst" expressed in radians using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 15
+	i := j*32
+	IF k[j]
+		dst[i+31:i] := ATAN(a[i+31:i] / b[i+31:i])
+	ELSE
+		dst[i+31:i] := src[i+31:i]
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="SVML" rettype="__m512d" name="_mm512_atan_pd" sequence="true">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Trigonometry</category>
+	<parameter varname="a" type="__m512d"/>
+	<description>Compute the inverse tangent of packed double-precision (64-bit) floating-point elements in "a" and store the results in "dst" expressed in radians.</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*64
+	dst[i+63:i] := ATAN(a[i+63:i])
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="SVML" rettype="__m512d" name="_mm512_mask_atan_pd" sequence="true">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Trigonometry</category>
+	<parameter varname="src" type="__m512d"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m512d"/>
+	<description>Compute the inverse tangent of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst" expressed in radians using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*64
+	IF k[j]
+		dst[i+63:i] := ATAN(a[i+63:i])
+	ELSE
+		dst[i+63:i] := src[i+63:i]
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="SVML" rettype="__m512" name="_mm512_atan_ps" sequence="true">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Trigonometry</category>
+	<parameter varname="a" type="__m512"/>
+	<description>Compute the inverse tangent of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst" expressed in radians.</description>
+	<operation>
+FOR j := 0 to 15
+	i := j*32
+	dst[i+31:i] := ATAN(a[i+31:i])
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="SVML" rettype="__m512" name="_mm512_mask_atan_ps" sequence="true">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Trigonometry</category>
+	<parameter varname="src" type="__m512"/>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="a" type="__m512"/>
+	<description>Compute the inverse tangent of packed single-precision (32-bit) floating-point elements in "a" expressed in radians, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 15
+	i := j*32
+	IF k[j]
+		dst[i+31:i] := ATAN(a[i+31:i])
+	ELSE
+		dst[i+31:i] := src[i+31:i]
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="SVML" rettype="__m512d" name="_mm512_atanh_pd" sequence="true">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Trigonometry</category>
+	<parameter varname="a" type="__m512d"/>
+	<description>Compute the inverse hyperbolic tangent of packed double-precision (64-bit) floating-point elements in "a" and store the results in "dst" expressed in radians.</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*64
+	dst[i+63:i] := ATANH(a[i+63:i])
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="SVML" rettype="__m512d" name="_mm512_mask_atanh_pd" sequence="true">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Trigonometry</category>
+	<parameter varname="src" type="__m512d"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m512d"/>
+	<description>Compute the inverse hyperbolic tangent of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst" expressed in radians using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*64
+	IF k[j]
+		dst[i+63:i] := ATANH(a[i+63:i])
+	ELSE
+		dst[i+63:i] := src[i+63:i]
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="SVML" rettype="__m512" name="_mm512_atanh_ps" sequence="true">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Trigonometry</category>
+	<parameter varname="a" type="__m512"/>
+	<description>Compute the inverse hyperbolic tangent of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst" expressed in radians.</description>
+	<operation>
+FOR j := 0 to 15
+	i := j*32
+	dst[i+31:i] := ATANH(a[i+31:i])
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="SVML" rettype="__m512" name="_mm512_mask_atanh_ps" sequence="true">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Trigonometry</category>
+	<parameter varname="src" type="__m512"/>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="a" type="__m512"/>
+	<description>Compute the inverse hyperbolic tangent of packed single-precision (32-bit) floating-point elements in "a" expressed in radians, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 15
+	i := j*32
+	IF k[j]
+		dst[i+31:i] := ATANH(a[i+31:i])
+	ELSE
+		dst[i+31:i] := src[i+31:i]
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="SVML" rettype="__m512d" name="_mm512_cbrt_pd" sequence="true">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Elementary Math Functions</category>
+	<parameter varname="a" type="__m512d"/>
+	<description>Compute the cube root of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*64
+	dst[i+63:i] := CubeRoot(a[i+63:i])
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="SVML" rettype="__m512d" name="_mm512_mask_cbrt_pd" sequence="true">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Elementary Math Functions</category>
+	<parameter varname="src" type="__m512d"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m512d"/>
+	<description>Compute the cube root of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*64
+	IF k[j]
+		dst[i+63:i] := CubeRoot(a[i+63:i])
+	ELSE
+		dst[i+63:i] := src[i+63:i]
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="SVML" rettype="__m512" name="_mm512_cbrt_ps" sequence="true">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Elementary Math Functions</category>
+	<parameter varname="a" type="__m512"/>
+	<description>Compute the cube root of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 15
+	i := j*32
+	dst[i+31:i] := CubeRoot(a[i+31:i])
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="SVML" rettype="__m512" name="_mm512_mask_cbrt_ps" sequence="true">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Elementary Math Functions</category>
+	<parameter varname="src" type="__m512"/>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="a" type="__m512"/>
+	<description>Compute the cube root of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 15
+	i := j*32
+	IF k[j]
+		dst[i+31:i] := CubeRoot(a[i+31:i])
+	ELSE
+		dst[i+31:i] := src[i+31:i]
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="SVML" rettype="__m512d" name="_mm512_cdfnorm_pd" sequence="true">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Probability/Statistics</category>
+	<parameter varname="a" type="__m512d"/>
+	<description>Compute the cumulative distribution function of packed double-precision (64-bit) floating-point elements in "a" using the normal distribution, and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*64
+	dst[i+63:i] := CDFNormal(a[i+63:i])
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="SVML" rettype="__m512d" name="_mm512_mask_cdfnorm_pd" sequence="true">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Probability/Statistics</category>
+	<parameter varname="src" type="__m512d"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m512d"/>
+	<description>Compute the cumulative distribution function of packed double-precision (64-bit) floating-point elements in "a" using the normal distribution, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*64
+	IF k[j]
+		dst[i+63:i] := CDFNormal(a[i+63:i])
+	ELSE
+		dst[i+63:i] := src[i+63:i]
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="SVML" rettype="__m512" name="_mm512_cdfnorm_ps" sequence="true">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Probability/Statistics</category>
+	<parameter varname="a" type="__m512"/>
+	<description>Compute the cumulative distribution function of packed single-precision (32-bit) floating-point elements in "a" using the normal distribution, and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 15
+	i := j*32
+	dst[i+31:i] := CDFNormal(a[i+31:i])
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="SVML" rettype="__m512" name="_mm512_mask_cdfnorm_ps" sequence="true">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Probability/Statistics</category>
+	<parameter varname="src" type="__m512"/>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="a" type="__m512"/>
+	<description>Compute the cumulative distribution function of packed single-precision (32-bit) floating-point elements in "a" using the normal distribution, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 15
+	i := j*32
+	IF k[j]
+		dst[i+31:i] := CDFNormal(a[i+31:i])
+	ELSE
+		dst[i+31:i] := src[i+31:i]
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="SVML" rettype="__m512d" name="_mm512_cdfnorminv_pd" sequence="true">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Probability/Statistics</category>
+	<parameter varname="a" type="__m512d"/>
+	<description>Compute the inverse cumulative distribution function of packed double-precision (64-bit) floating-point elements in "a" using the normal distribution, and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*64
+	dst[i+63:i] := InverseCDFNormal(a[i+63:i])
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="SVML" rettype="__m512d" name="_mm512_mask_cdfnorminv_pd" sequence="true">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Probability/Statistics</category>
+	<parameter varname="src" type="__m512d"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m512d"/>
+	<description>Compute the inverse cumulative distribution function of packed double-precision (64-bit) floating-point elements in "a" using the normal distribution, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*64
+	IF k[j]
+		dst[i+63:i] := InverseCDFNormal(a[i+63:i])
+	ELSE
+		dst[i+63:i] := src[i+63:i]
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="SVML" rettype="__m512" name="_mm512_cdfnorminv_ps" sequence="true">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Probability/Statistics</category>
+	<parameter varname="a" type="__m512"/>
+	<description>Compute the inverse cumulative distribution function of packed single-precision (32-bit) floating-point elements in "a" using the normal distribution, and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 15
+	i := j*32
+	dst[i+31:i] := InverseCDFNormal(a[i+31:i])
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="SVML" rettype="__m512" name="_mm512_mask_cdfnorminv_ps" sequence="true">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Probability/Statistics</category>
+	<parameter varname="src" type="__m512"/>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="a" type="__m512"/>
+	<description>Compute the inverse cumulative distribution function of packed single-precision (32-bit) floating-point elements in "a" using the normal distribution, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 15
+	i := j*32
+	IF k[j]
+		dst[i+31:i] := InverseCDFNormal(a[i+31:i])
+	ELSE
+		dst[i+31:i] := src[i+31:i]
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="SVML" rettype="__m512d" name="_mm512_ceil_pd" sequence="true">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Special Math Functions</category>
+	<parameter varname="a" type="__m512d"/>
+	<description>Round the packed double-precision (64-bit) floating-point elements in "a" up to an integer value, and store the results as packed double-precision floating-point elements in "dst".</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*64
+	dst[i+63:i] := CEIL(a[i+63:i])
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="SVML" rettype="__m512d" name="_mm512_mask_ceil_pd" sequence="true">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Special Math Functions</category>
+	<parameter varname="src" type="__m512d"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m512d"/>
+	<description>Round the packed double-precision (64-bit) floating-point elements in "a" up to an integer value, and store the results as packed double-precision floating-point elements in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*64
+	IF k[j]
+		dst[i+63:i] := CEIL(a[i+63:i])
+	ELSE
+		dst[i+63:i] := src[i+63:i]
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="SVML" rettype="__m512" name="_mm512_ceil_ps" sequence="true">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Special Math Functions</category>
+	<parameter varname="a" type="__m512"/>
+	<description>Round the packed single-precision (32-bit) floating-point elements in "a" up to an integer value, and store the results as packed single-precision floating-point elements in "dst".</description>
+	<operation>
+FOR j := 0 to 15
+	i := j*32
+	dst[i+31:i] := CEIL(a[i+31:i])
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="SVML" rettype="__m512" name="_mm512_mask_ceil_ps" sequence="true">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Special Math Functions</category>
+	<parameter varname="src" type="__m512"/>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="a" type="__m512"/>
+	<description>Round the packed single-precision (32-bit) floating-point elements in "a" up to an integer value, and store the results as packed single-precision floating-point elements in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 15
+	i := j*32
+	IF k[j]
+		dst[i+31:i] := CEIL(a[i+31:i])
+	ELSE
+		dst[i+31:i] := src[i+31:i]
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="SVML" rettype="__m512d" name="_mm512_cos_pd" sequence="true">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Trigonometry</category>
+	<parameter varname="a" type="__m512d"/>
+	<description>Compute the cosine of packed double-precision (64-bit) floating-point elements in "a" expressed in radians, and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*64
+	dst[i+63:i] := COS(a[i+63:i])
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="SVML" rettype="__m512d" name="_mm512_mask_cos_pd" sequence="true">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Trigonometry</category>
+	<parameter varname="src" type="__m512d"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m512d"/>
+	<description>Compute the cosine of packed double-precision (64-bit) floating-point elements in "a" expressed in radians, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*64
+	IF k[j]
+		dst[i+63:i] := COS(a[i+63:i])
+	ELSE
+		dst[i+63:i] := src[i+63:i]
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="SVML" rettype="__m512" name="_mm512_cos_ps" sequence="true">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Trigonometry</category>
+	<parameter varname="a" type="__m512"/>
+	<description>Compute the cosine of packed single-precision (32-bit) floating-point elements in "a" expressed in radians, and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 15
+	i := j*32
+	dst[i+31:i] := COS(a[i+31:i])
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="SVML" rettype="__m512" name="_mm512_mask_cos_ps" sequence="true">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Trigonometry</category>
+	<parameter varname="src" type="__m512"/>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="a" type="__m512"/>
+	<description>Compute the cosine of packed single-precision (32-bit) floating-point elements in "a" expressed in radians, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 15
+	i := j*32
+	IF k[j]
+		dst[i+31:i] := COS(a[i+31:i])
+	ELSE
+		dst[i+31:i] := src[i+31:i]
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="SVML" rettype="__m512d" name="_mm512_cosd_pd" sequence="true">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Trigonometry</category>
+	<parameter varname="a" type="__m512d"/>
+	<description>Compute the cosine of packed double-precision (64-bit) floating-point elements in "a" expressed in degrees, and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*64
+	dst[i+63:i] := COSD(a[i+63:i])
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="SVML" rettype="__m512d" name="_mm512_mask_cosd_pd" sequence="true">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Trigonometry</category>
+	<parameter varname="src" type="__m512d"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m512d"/>
+	<description>Compute the cosine of packed double-precision (64-bit) floating-point elements in "a" expressed in degrees, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*64
+	IF k[j]
+		dst[i+63:i] := COSD(a[i+63:i])
+	ELSE
+		dst[i+63:i] := src[i+63:i]
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="SVML" rettype="__m512" name="_mm512_cosd_ps" sequence="true">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Trigonometry</category>
+	<parameter varname="a" type="__m512"/>
+	<description>Compute the cosine of packed single-precision (32-bit) floating-point elements in "a" expressed in degrees, and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 15
+	i := j*32
+	dst[i+31:i] := COSD(a[i+31:i])
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="SVML" rettype="__m512" name="_mm512_mask_cosd_ps" sequence="true">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Trigonometry</category>
+	<parameter varname="src" type="__m512"/>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="a" type="__m512"/>
+	<description>Compute the cosine of packed single-precision (32-bit) floating-point elements in "a" expressed in degrees, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 15
+	i := j*32
+	IF k[j]
+		dst[i+31:i] := COSD(a[i+31:i])
+	ELSE
+		dst[i+31:i] := src[i+31:i]
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="SVML" rettype="__m512d" name="_mm512_cosh_pd" sequence="true">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Trigonometry</category>
+	<parameter varname="a" type="__m512d"/>
+	<description>Compute the hyperbolic cosine of packed double-precision (64-bit) floating-point elements in "a" expressed in radians, and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*64
+	dst[i+63:i] := COSH(a[i+63:i])
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="SVML" rettype="__m512d" name="_mm512_mask_cosh_pd" sequence="true">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Trigonometry</category>
+	<parameter varname="src" type="__m512d"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m512d"/>
+	<description>Compute the hyperbolic cosine of packed double-precision (64-bit) floating-point elements in "a" expressed in radians, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*64
+	IF k[j]
+		dst[i+63:i] := COSH(a[i+63:i])
+	ELSE
+		dst[i+63:i] := src[i+63:i]
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="SVML" rettype="__m512" name="_mm512_cosh_ps" sequence="true">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Trigonometry</category>
+	<parameter varname="a" type="__m512"/>
+	<description>Compute the hyperbolic cosine of packed single-precision (32-bit) floating-point elements in "a" expressed in radians, and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 15
+	i := j*32
+	dst[i+31:i] := COSH(a[i+31:i])
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="SVML" rettype="__m512" name="_mm512_mask_cosh_ps" sequence="true">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Trigonometry</category>
+	<parameter varname="src" type="__m512"/>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="a" type="__m512"/>
+	<description>Compute the hyperbolic cosine of packed single-precision (32-bit) floating-point elements in "a" expressed in radians, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 15
+	i := j*32
+	IF k[j]
+		dst[i+31:i] := COSH(a[i+31:i])
+	ELSE
+		dst[i+31:i] := src[i+31:i]
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="SVML" rettype="__m512d" name="_mm512_erf_pd" sequence="true">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Probability/Statistics</category>
+	<parameter varname="a" type="__m512d"/>
+	<description>Compute the error function of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*64
+	dst[i+63:i] := ERF(a[i+63:i])
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="SVML" rettype="__m512d" name="_mm512_mask_erf_pd" sequence="true">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Probability/Statistics</category>
+	<parameter varname="src" type="__m512d"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m512d"/>
+	<description>Compute the error function of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*64
+	IF k[j]
+		dst[i+63:i] := ERF(a[i+63:i])
+	ELSE
+		dst[i+63:i] := src[i+63:i]
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="SVML" rettype="__m512d" name="_mm512_erfc_pd" sequence="true">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Probability/Statistics</category>
+	<parameter varname="a" type="__m512d"/>
+	<description>Compute the complementary error function of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*64
+	dst[i+63:i] := 1.0 - ERF(a[i+63:i])
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="SVML" rettype="__m512d" name="_mm512_mask_erfc_pd" sequence="true">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Probability/Statistics</category>
+	<parameter varname="src" type="__m512d"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m512d"/>
+	<description>Compute the complementary error function of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*64
+	IF k[j]
+		dst[i+63:i] := 1.0 - ERF(a[i+63:i])
+	ELSE
+		dst[i+63:i] := src[i+63:i]
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="SVML" rettype="__m512" name="_mm512_erf_ps" sequence="true">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Probability/Statistics</category>
+	<parameter varname="a" type="__m512"/>
+	<description>Compute the error function of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 15
+	i := j*32
+	dst[i+31:i] := ERF(a[i+31:i])
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="SVML" rettype="__m512" name="_mm512_mask_erf_ps" sequence="true">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Probability/Statistics</category>
+	<parameter varname="src" type="__m512"/>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="a" type="__m512"/>
+	<description>Compute the error function of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 15
+	i := j*32
+	IF k[j]
+		dst[i+31:i] := ERF(a[i+31:i])
+	ELSE
+		dst[i+31:i] := src[i+31:i]
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="SVML" rettype="__m512" name="_mm512_erfc_ps" sequence="true">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Probability/Statistics</category>
+	<parameter varname="a" type="__m512"/>
+	<description>Compute the complementary error function of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 15
+	i := j*32
+	dst[i+31:i] := 1.0 - ERF(a[i+31:i])
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="SVML" rettype="__m512" name="_mm512_mask_erfc_ps" sequence="true">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Probability/Statistics</category>
+	<parameter varname="src" type="__m512"/>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="a" type="__m512"/>
+	<description>Compute the complementary error function of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 15
+	i := j*32
+	IF k[j]
+		dst[i+31:i] := 1.0 - ERF(a[i+31:i])
+	ELSE
+		dst[i+31:i] := src[i+31:i]
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="SVML" rettype="__m512d" name="_mm512_erfinv_pd" sequence="true">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Probability/Statistics</category>
+	<parameter varname="a" type="__m512d"/>
+	<description>Compute the inverse error function of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*64
+	dst[i+63:i] := InverseERF(a[i+63:i])
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="SVML" rettype="__m512d" name="_mm512_mask_erfinv_pd" sequence="true">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Probability/Statistics</category>
+	<parameter varname="src" type="__m512d"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m512d"/>
+	<description>Compute the inverse error function of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*64
+	IF k[j]
+		dst[i+63:i] := InverseERF(a[i+63:i])
+	ELSE
+		dst[i+63:i] := src[i+63:i]
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="SVML" rettype="__m512" name="_mm512_erfinv_ps" sequence="true">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Probability/Statistics</category>
+	<parameter varname="a" type="__m512"/>
+	<description>Compute the inverse error function of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 15
+	i := j*32
+	dst[i+31:i] := InverseERF(a[i+31:i])
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="SVML" rettype="__m512" name="_mm512_mask_erfinv_ps" sequence="true">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Probability/Statistics</category>
+	<parameter varname="src" type="__m512"/>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="a" type="__m512"/>
+	<description>Compute the inverse error function of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 15
+	i := j*32
+	IF k[j]
+		dst[i+31:i] := InverseERF(a[i+31:i])
+	ELSE
+		dst[i+31:i] := src[i+31:i]
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="SVML" rettype="__m512d" name="_mm512_erfcinv_pd" sequence="true">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Probability/Statistics</category>
+	<parameter varname="a" type="__m512d"/>
+	<description>Compute the inverse complementary error function of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*64
+	dst[i+63:i] := InverseERF(1.0 - a[i+63:i])
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="SVML" rettype="__m512d" name="_mm512_mask_erfcinv_pd" sequence="true">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Probability/Statistics</category>
+	<parameter varname="src" type="__m512d"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m512d"/>
+	<description>Compute the inverse complementary error function of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*64
+	IF k[j]
+		dst[i+63:i] := InverseERF(1.0 - a[i+63:i])
+	ELSE
+		dst[i+63:i] := src[i+63:i]
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="SVML" rettype="__m512" name="_mm512_erfcinv_ps" sequence="true">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Probability/Statistics</category>
+	<parameter varname="a" type="__m512"/>
+	<description>Compute the inverse complementary error function of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 15
+	i := j*32
+	dst[i+31:i] := InverseERF(1.0 - a[i+31:i])
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="SVML" rettype="__m512" name="_mm512_mask_erfcinv_ps" sequence="true">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Probability/Statistics</category>
+	<parameter varname="src" type="__m512"/>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="a" type="__m512"/>
+	<description>Compute the inverse complementary error function of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 15
+	i := j*32
+	IF k[j]
+		dst[i+31:i] := InverseERF(1.0 - a[i+31:i])
+	ELSE
+		dst[i+31:i] := src[i+31:i]
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="SVML" rettype="__m512d" name="_mm512_exp10_pd" sequence="true">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Elementary Math Functions</category>
+	<parameter varname="a" type="__m512d"/>
+	<description>Compute the exponential value of 10 raised to the power of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*64
+	dst[i+63:i] := 10^(a[i+63:i])
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="SVML" rettype="__m512d" name="_mm512_mask_exp10_pd" sequence="true">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Elementary Math Functions</category>
+	<parameter varname="src" type="__m512d"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m512d"/>
+	<description>Compute the exponential value of 10 raised to the power of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*64
+	IF k[j]
+		dst[i+63:i] := 10^(a[i+63:i])
+	ELSE
+		dst[i+63:i] := src[i+63:i]
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="SVML" rettype="__m512" name="_mm512_exp10_ps" sequence="true">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Elementary Math Functions</category>
+	<parameter varname="a" type="__m512"/>
+	<description>Compute the exponential value of 10 raised to the power of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 15
+	i := j*32
+	dst[i+31:i] := 10^(a[i+31:i])
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="SVML" rettype="__m512" name="_mm512_mask_exp10_ps" sequence="true">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Elementary Math Functions</category>
+	<parameter varname="src" type="__m512"/>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="a" type="__m512"/>
+	<description>Compute the exponential value of 10 raised to the power of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 15
+	i := j*32
+	IF k[j]
+		dst[i+31:i] := 10^(a[i+31:i])
+	ELSE
+		dst[i+31:i] := src[i+31:i]
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="SVML" rettype="__m512d" name="_mm512_exp2_pd" sequence="true">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Elementary Math Functions</category>
+	<parameter varname="a" type="__m512d"/>
+	<description>Compute the exponential value of 2 raised to the power of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*64
+	dst[i+63:i] := 2^(a[i+63:i])
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="SVML" rettype="__m512d" name="_mm512_mask_exp2_pd" sequence="true">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Elementary Math Functions</category>
+	<parameter varname="src" type="__m512d"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m512d"/>
+	<description>Compute the exponential value of 2 raised to the power of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*64
+	IF k[j]
+		dst[i+63:i] := 2^(a[i+63:i])
+	ELSE
+		dst[i+63:i] := src[i+63:i]
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="SVML" rettype="__m512" name="_mm512_exp2_ps" sequence="true">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Elementary Math Functions</category>
+	<parameter varname="a" type="__m512"/>
+	<description>Compute the exponential value of 2 raised to the power of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 15
+	i := j*32
+	dst[i+31:i] := 2^(a[i+31:i])
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="SVML" rettype="__m512" name="_mm512_mask_exp2_ps" sequence="true">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Elementary Math Functions</category>
+	<parameter varname="src" type="__m512"/>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="a" type="__m512"/>
+	<description>Compute the exponential value of 2 raised to the power of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 15
+	i := j*32
+	IF k[j]
+		dst[i+31:i] := 2^(a[i+31:i])
+	ELSE
+		dst[i+31:i] := src[i+31:i]
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="SVML" rettype="__m512d" name="_mm512_exp_pd" sequence="true">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Elementary Math Functions</category>
+	<parameter varname="a" type="__m512d"/>
+	<description>Compute the exponential value of "e" raised to the power of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*64
+	dst[i+63:i] := e^(a[i+63:i])
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="SVML" rettype="__m512d" name="_mm512_mask_exp_pd" sequence="true">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Elementary Math Functions</category>
+	<parameter varname="src" type="__m512d"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m512d"/>
+	<description>Compute the exponential value of "e" raised to the power of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*64
+	IF k[j]
+		dst[i+63:i] := e^(a[i+63:i])
+	ELSE
+		dst[i+63:i] := src[i+63:i]
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="SVML" rettype="__m512" name="_mm512_exp_ps" sequence="true">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Elementary Math Functions</category>
+	<parameter varname="a" type="__m512"/>
+	<description>Compute the exponential value of "e" raised to the power of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 15
+	i := j*32
+	dst[i+31:i] := e^(a[i+31:i])
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="SVML" rettype="__m512" name="_mm512_mask_exp_ps" sequence="true">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Elementary Math Functions</category>
+	<parameter varname="src" type="__m512"/>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="a" type="__m512"/>
+	<description>Compute the exponential value of "e" raised to the power of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 15
+	i := j*32
+	IF k[j]
+		dst[i+31:i] := e^(a[i+31:i])
+	ELSE
+		dst[i+31:i] := src[i+31:i]
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="SVML" rettype="__m512d" name="_mm512_expm1_pd" sequence="true">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Elementary Math Functions</category>
+	<parameter varname="a" type="__m512d"/>
+	<description>Compute the exponential value of "e" raised to the power of packed double-precision (64-bit) floating-point elements in "a", subtract one from each element, and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*64
+	dst[i+63:i] := e^(a[i+63:i]) - 1.0
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="SVML" rettype="__m512d" name="_mm512_mask_expm1_pd" sequence="true">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Elementary Math Functions</category>
+	<parameter varname="src" type="__m512d"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m512d"/>
+	<description>Compute the exponential value of "e" raised to the power of packed double-precision (64-bit) floating-point elements in "a", subtract one from each element, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*64
+	IF k[j]
+		dst[i+63:i] := e^(a[i+63:i]) - 1.0
+	ELSE
+		dst[i+63:i] := src[i+63:i]
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="SVML" rettype="__m512" name="_mm512_expm1_ps" sequence="true">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Elementary Math Functions</category>
+	<parameter varname="a" type="__m512"/>
+	<description>Compute the exponential value of "e" raised to the power of packed single-precision (32-bit) floating-point elements in "a", subtract one from each element, and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 15
+	i := j*32
+	dst[i+31:i] := e^(a[i+31:i]) - 1.0
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="SVML" rettype="__m512" name="_mm512_mask_expm1_ps" sequence="true">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Elementary Math Functions</category>
+	<parameter varname="src" type="__m512"/>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="a" type="__m512"/>
+	<description>Compute the exponential value of "e" raised to the power of packed single-precision (32-bit) floating-point elements in "a", subtract one from each element, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 15
+	i := j*32
+	IF k[j]
+		dst[i+31:i] := e^(a[i+31:i]) - 1.0
+	ELSE
+		dst[i+31:i] := src[i+31:i]
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="SVML" rettype="__m512d" name="_mm512_floor_pd" sequence="true">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Special Math Functions</category>
+	<parameter varname="a" type="__m512d"/>
+	<description>Round the packed double-precision (64-bit) floating-point elements in "a" down to an integer value, and store the results as packed double-precision floating-point elements in "dst".</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*64
+	dst[i+63:i] := FLOOR(a[i+63:i])
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="SVML" rettype="__m512d" name="_mm512_mask_floor_pd" sequence="true">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Special Math Functions</category>
+	<parameter varname="src" type="__m512d"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m512d"/>
+	<description>Round the packed double-precision (64-bit) floating-point elements in "a" down to an integer value, and store the results as packed double-precision floating-point elements in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*64
+	IF k[j]
+		dst[i+63:i] := FLOOR(a[i+63:i])
+	ELSE
+		dst[i+63:i] := src[i+63:i]
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="SVML" rettype="__m512" name="_mm512_floor_ps" sequence="true">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Special Math Functions</category>
+	<parameter varname="a" type="__m512"/>
+	<description>Round the packed single-precision (32-bit) floating-point elements in "a" down to an integer value, and store the results as packed single-precision floating-point elements in "dst".</description>
+	<operation>
+FOR j := 0 to 15
+	i := j*32
+	dst[i+31:i] := FLOOR(a[i+31:i])
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="SVML" rettype="__m512" name="_mm512_mask_floor_ps" sequence="true">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Special Math Functions</category>
+	<parameter varname="src" type="__m512"/>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="a" type="__m512"/>
+	<description>Round the packed single-precision (32-bit) floating-point elements in "a" down to an integer value, and store the results as packed single-precision floating-point elements in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 15
+	i := j*32
+	IF k[j]
+		dst[i+31:i] := FLOOR(a[i+31:i])
+	ELSE
+		dst[i+31:i] := src[i+31:i]
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="SVML" rettype="__m512d" name="_mm512_hypot_pd" sequence="true">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Elementary Math Functions</category>
+	<parameter varname="a" type="__m512d"/>
+	<parameter varname="b" type="__m512d"/>
+	<description>Compute the length of the hypotenuse of a right triangle, with the lengths of the other two sides of the triangle stored as packed double-precision (64-bit) floating-point elements in "a" and "b", and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*64
+	dst[i+63:i] := SQRT(a[i+63:i]^2 + b[i+63:i]^2)
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="SVML" rettype="__m512d" name="_mm512_mask_hypot_pd" sequence="true">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Elementary Math Functions</category>
+	<parameter varname="src" type="__m512d"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m512d"/>
+	<parameter varname="b" type="__m512d"/>
+	<description>Compute the length of the hypotenuse of a right triangle, with the lengths of the other two sides of the triangle stored as packed double-precision (64-bit) floating-point elements in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*64
+	IF k[j]
+		dst[i+63:i] := SQRT(a[i+63:i]^2 + b[i+63:i]^2)
+	ELSE
+		dst[i+63:i] := src[i+63:i]
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="SVML" rettype="__m512" name="_mm512_hypot_ps" sequence="true">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Elementary Math Functions</category>
+	<parameter varname="a" type="__m512"/>
+	<parameter varname="b" type="__m512"/>
+	<description>Compute the length of the hypotenuse of a right triangle, with the lengths of the other two sides of the triangle stored as packed single-precision (32-bit) floating-point elements in "a" and "b", and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 15
+	i := j*32
+	dst[i+31:i] := SQRT(a[i+31:i]^2 + b[i+31:i]^2)
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="SVML" rettype="__m512" name="_mm512_mask_hypot_ps" sequence="true">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Elementary Math Functions</category>
+	<parameter varname="src" type="__m512"/>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="a" type="__m512"/>
+	<parameter varname="b" type="__m512"/>
+	<description>Compute the length of the hypotenuse of a right triangle, with the lengths of the other two sides of the triangle stored as packed single-precision (32-bit) floating-point elements in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 15
+	i := j*32
+	IF k[j]
+		dst[i+31:i] := SQRT(a[i+31:i]^2 + b[i+31:i]^2)
+	ELSE
+		dst[i+31:i] := src[i+31:i]
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="SVML" rettype="__m512i" name="_mm512_div_epi32" sequence="true">
+	<type>Integer</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="a" type="__m512i"/>
+	<parameter varname="b" type="__m512i"/>
+	<description>Divide packed 32-bit integers in "a" by packed elements in "b", and store the truncated results in "dst".</description>
+	<operation>
+FOR j := 0 to 15
+	i := 32*j
+	dst[i+31:i] := TRUNCATE(a[i+31:i] / b[i+31:i])
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="SVML" rettype="__m512i" name="_mm512_mask_div_epi32" sequence="true">
+	<type>Integer</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="src" type="__m512i"/>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="a" type="__m512i"/>
+	<parameter varname="b" type="__m512i"/>
+	<description>Divide packed 32-bit integers in "a" by packed elements in "b", and store the truncated results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 15
+	i := 32*j
+	IF k[j]
+		dst[i+31:i] := TRUNCATE(a[i+31:i] / b[i+31:i])
+	ELSE
+		dst[i+31:i] := src[i+31:i]
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="SVML" rettype="__m512i" name="_mm512_div_epi8" sequence="true">
+	<type>Integer</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="a" type="__m512i"/>
+	<parameter varname="b" type="__m512i"/>
+	<description>Divide packed 8-bit integers in "a" by packed elements in "b", and store the truncated results in "dst".</description>
+	<operation>
+FOR j := 0 to 63
+	i := 8*j
+	dst[i+7:i] := TRUNCATE(a[i+7:i] / b[i+7:i])
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="SVML" rettype="__m512i" name="_mm512_div_epi16" sequence="true">
+	<type>Integer</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="a" type="__m512i"/>
+	<parameter varname="b" type="__m512i"/>
+	<description>Divide packed 16-bit integers in "a" by packed elements in "b", and store the truncated results in "dst".</description>
+	<operation>
+FOR j := 0 to 31
+	i := 16*j
+	dst[i+15:i] := TRUNCATE(a[i+15:i] / b[i+15:i])
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="SVML" rettype="__m512i" name="_mm512_div_epi64" sequence="true">
+	<type>Integer</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="a" type="__m512i"/>
+	<parameter varname="b" type="__m512i"/>
+	<description>Divide packed 64-bit integers in "a" by packed elements in "b", and store the truncated results in "dst".</description>
+	<operation>
+FOR j := 0 to 7
+	i := 64*j
+	dst[i+63:i] := TRUNCATE(a[i+63:i] / b[i+63:i])
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="SVML" rettype="__m512d" name="_mm512_invsqrt_pd" sequence="true">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Elementary Math Functions</category>
+	<parameter varname="a" type="__m512d"/>
+	<description>Compute the inverse square root of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*64
+	dst[i+63:i] := InvSQRT(a[i+63:i])
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="SVML" rettype="__m512d" name="_mm512_mask_invsqrt_pd" sequence="true">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Elementary Math Functions</category>
+	<parameter varname="src" type="__m512d"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m512d"/>
+	<description>Compute the inverse square root of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*64
+	IF k[j]
+		dst[i+63:i] := InvSQRT(a[i+63:i])
+	ELSE
+		dst[i+63:i] := src[i+63:i]
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="SVML" rettype="__m512" name="_mm512_invsqrt_ps" sequence="true">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Elementary Math Functions</category>
+	<parameter varname="a" type="__m512"/>
+	<description>Compute the inverse square root of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 15
+	i := j*32
+	dst[i+31:i] := InvSQRT(a[i+31:i])
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="SVML" rettype="__m512" name="_mm512_mask_invsqrt_ps" sequence="true">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Elementary Math Functions</category>
+	<parameter varname="src" type="__m512"/>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="a" type="__m512"/>
+	<description>Compute the inverse square root of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 15
+	i := j*32
+	IF k[j]
+		dst[i+31:i] := InvSQRT(a[i+31:i])
+	ELSE
+		dst[i+31:i] := src[i+31:i]
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="SVML" rettype="__m512i" name="_mm512_rem_epi32" sequence="true">
+	<type>Integer</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="a" type="__m512i"/>
+	<parameter varname="b" type="__m512i"/>
+	<description>Divide packed 32-bit integers in "a" by packed elements in "b", and store the remainders as packed 32-bit integers in "dst".</description>
+	<operation>
+FOR j := 0 to 15
+	i := 32*j
+	dst[i+31:i] := REMAINDER(a[i+31:i] / b[i+31:i])
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="SVML" rettype="__m512i" name="_mm512_mask_rem_epi32" sequence="true">
+	<type>Integer</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="src" type="__m512i"/>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="a" type="__m512i"/>
+	<parameter varname="b" type="__m512i"/>
+	<description>Divide packed 32-bit integers in "a" by packed elements in "b", and store the remainders as packed 32-bit integers in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 15
+	i := 32*j
+	IF k[j]
+		dst[i+31:i] := REMAINDER(a[i+31:i] / b[i+31:i])
+	ELSE
+		dst[i+31:i] := src[i+31:i]
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="SVML" rettype="__m512i" name="_mm512_rem_epi8" sequence="true">
+	<type>Integer</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="a" type="__m512i"/>
+	<parameter varname="b" type="__m512i"/>
+	<description>Divide packed 8-bit integers in "a" by packed elements in "b", and store the remainders as packed 8-bit integers in "dst".</description>
+	<operation>
+FOR j := 0 to 63
+	i := 8*j
+	dst[i+7:i] := REMAINDER(a[i+7:i] / b[i+7:i])
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="SVML" rettype="__m512i" name="_mm512_rem_epi16" sequence="true">
+	<type>Integer</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="a" type="__m512i"/>
+	<parameter varname="b" type="__m512i"/>
+	<description>Divide packed 16-bit integers in "a" by packed elements in "b", and store the remainders as packed 16-bit integers in "dst".</description>
+	<operation>
+FOR j := 0 to 31
+	i := 16*j
+	dst[i+15:i] := REMAINDER(a[i+15:i] / b[i+15:i])
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="SVML" rettype="__m512i" name="_mm512_rem_epi64" sequence="true">
+	<type>Integer</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="a" type="__m512i"/>
+	<parameter varname="b" type="__m512i"/>
+	<description>Divide packed 64-bit integers in "a" by packed elements in "b", and store the remainders as packed 64-bit integers in "dst".</description>
+	<operation>
+FOR j := 0 to 7
+	i := 64*j
+	dst[i+63:i] := REMAINDER(a[i+63:i] / b[i+63:i])
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="SVML" rettype="__m512d" name="_mm512_log10_pd" sequence="true">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Elementary Math Functions</category>
+	<parameter varname="a" type="__m512d"/>
+	<description>Compute the base-10 logarithm of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*64
+	dst[i+63:i] := log10(a[i+63:i])
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="SVML" rettype="__m512d" name="_mm512_mask_log10_pd" sequence="true">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Elementary Math Functions</category>
+	<parameter varname="src" type="__m512d"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m512d"/>
+	<description>Compute the base-10 logarithm of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*64
+	IF k[j]
+		dst[i+63:i] := log10(a[i+63:i])
+	ELSE
+		dst[i+63:i] := src[i+63:i]
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="SVML" rettype="__m512" name="_mm512_log10_ps" sequence="true">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Elementary Math Functions</category>
+	<parameter varname="a" type="__m512"/>
+	<description>Compute the base-10 logarithm of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 15
+	i := j*32
+	dst[i+31:i] := log10(a[i+31:i])
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="SVML" rettype="__m512" name="_mm512_mask_log10_ps" sequence="true">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Elementary Math Functions</category>
+	<parameter varname="src" type="__m512"/>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="a" type="__m512"/>
+	<description>Compute the base-10 logarithm of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 15
+	i := j*32
+	IF k[j]
+		dst[i+31:i] := log10(a[i+31:i])
+	ELSE
+		dst[i+31:i] := src[i+31:i]
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="SVML" rettype="__m512d" name="_mm512_log1p_pd" sequence="true">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Elementary Math Functions</category>
+	<parameter varname="a" type="__m512d"/>
+	<description>Compute the natural logarithm of one plus packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*64
+	dst[i+63:i] := ln(1.0 + a[i+63:i])
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="SVML" rettype="__m512d" name="_mm512_mask_log1p_pd" sequence="true">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Elementary Math Functions</category>
+	<parameter varname="src" type="__m512d"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m512d"/>
+	<description>Compute the natural logarithm of one plus packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*64
+	IF k[j]
+		dst[i+63:i] := ln(1.0 + a[i+63:i])
+	ELSE
+		dst[i+63:i] := src[i+63:i]
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="SVML" rettype="__m512" name="_mm512_log1p_ps" sequence="true">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Elementary Math Functions</category>
+	<parameter varname="a" type="__m512"/>
+	<description>Compute the natural logarithm of one plus packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 15
+	i := j*32
+	dst[i+31:i] := ln(1.0 + a[i+31:i])
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="SVML" rettype="__m512" name="_mm512_mask_log1p_ps" sequence="true">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Elementary Math Functions</category>
+	<parameter varname="src" type="__m512"/>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="a" type="__m512"/>
+	<description>Compute the natural logarithm of one plus packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 15
+	i := j*32
+	IF k[j]
+		dst[i+31:i] := ln(1.0 + a[i+31:i])
+	ELSE
+		dst[i+31:i] := src[i+31:i]
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="SVML" rettype="__m512d" name="_mm512_log2_pd" sequence="true">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Elementary Math Functions</category>
+	<parameter varname="a" type="__m512d"/>
+	<description>Compute the base-2 logarithm of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*64
+	dst[i+63:i] := log2(a[i+63:i])
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="SVML" rettype="__m512d" name="_mm512_mask_log2_pd" sequence="true">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Elementary Math Functions</category>
+	<parameter varname="src" type="__m512d"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m512d"/>
+	<description>Compute the base-2 logarithm of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*64
+	IF k[j]
+		dst[i+63:i] := log2(a[i+63:i])
+	ELSE
+		dst[i+63:i] := src[i+63:i]
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="KNC" rettype="__m512" name="_mm512_log2_ps">
+	<type>Floating Point</type>
+	<CPUID>KNCNI</CPUID>
+	<category>Elementary Math Functions</category>
+	<parameter varname="a" type="__m512"/>
+	<description>Compute the base-2 logarithm of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 15
+	i := j*32
+	dst[i+31:i] := log2(a[i+31:i])
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<header>immintrin.h</header>
+	<instruction name="vlog2ps" form="zmm {k}, zmm" xed=""/>
+</intrinsic>
+<intrinsic tech="KNC" rettype="__m512" name="_mm512_mask_log2_ps">
+	<type>Floating Point</type>
+	<CPUID>KNCNI</CPUID>
+	<category>Elementary Math Functions</category>
+	<parameter varname="src" type="__m512"/>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="a" type="__m512"/>
+	<description>Compute the base-2 logarithm of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 15
+	i := j*32
+	IF k[j]
+		dst[i+31:i] := log2(a[i+31:i])
+	ELSE
+		dst[i+31:i] := src[i+31:i]
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name="vlog2ps" form="zmm {k}, zmm" xed=""/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="SVML" rettype="__m512d" name="_mm512_log_pd" sequence="true">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Elementary Math Functions</category>
+	<parameter varname="a" type="__m512d"/>
+	<description>Compute the natural logarithm of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*64
+	dst[i+63:i] := ln(a[i+63:i])
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="SVML" rettype="__m512d" name="_mm512_mask_log_pd" sequence="true">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Elementary Math Functions</category>
+	<parameter varname="src" type="__m512d"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m512d"/>
+	<description>Compute the natural logarithm of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*64
+	IF k[j]
+		dst[i+63:i] := ln(a[i+63:i])
+	ELSE
+		dst[i+63:i] := src[i+63:i]
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="SVML" rettype="__m512" name="_mm512_log_ps" sequence="true">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Elementary Math Functions</category>
+	<parameter varname="a" type="__m512"/>
+	<description>Compute the natural logarithm of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 15
+	i := j*32
+	dst[i+31:i] := ln(a[i+31:i])
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="SVML" rettype="__m512" name="_mm512_mask_log_ps" sequence="true">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Elementary Math Functions</category>
+	<parameter varname="src" type="__m512"/>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="a" type="__m512"/>
+	<description>Compute the natural logarithm of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 15
+	i := j*32
+	IF k[j]
+		dst[i+31:i] := ln(a[i+31:i])
+	ELSE
+		dst[i+31:i] := src[i+31:i]
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="SVML" rettype="__m512d" name="_mm512_logb_pd" sequence="true">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Elementary Math Functions</category>
+	<parameter varname="a" type="__m512d"/>
+	<description>Convert the exponent of each packed double-precision (64-bit) floating-point element in "a" to a double-precision floating-point number representing the integer exponent, and store the results in "dst". This intrinsic essentially calculates "floor(log2(x))" for each element.</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*64
+	dst[i+63:i] := ConvertExpFP64(a[i+63:i])
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="SVML" rettype="__m512d" name="_mm512_mask_logb_pd" sequence="true">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Elementary Math Functions</category>
+	<parameter varname="src" type="__m512d"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m512d"/>
+	<description>Convert the exponent of each packed double-precision (64-bit) floating-point element in "a" to a double-precision floating-point number representing the integer exponent, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). This intrinsic essentially calculates "floor(log2(x))" for each element.</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*64
+	IF k[j]
+		dst[i+63:i] := ConvertExpFP64(a[i+63:i])
+	ELSE
+		dst[i+63:i] := src[i+63:i]
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="SVML" rettype="__m512" name="_mm512_logb_ps" sequence="true">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Elementary Math Functions</category>
+	<parameter varname="a" type="__m512"/>
+	<description>Convert the exponent of each packed single-precision (32-bit) floating-point element in "a" to a single-precision floating-point number representing the integer exponent, and store the results in "dst". This intrinsic essentially calculates "floor(log2(x))" for each element.</description>
+	<operation>
+FOR j := 0 to 15
+	i := j*32
+	dst[i+31:i] := ConvertExpFP32(a[i+31:i])
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="SVML" rettype="__m512" name="_mm512_mask_logb_ps" sequence="true">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Elementary Math Functions</category>
+	<parameter varname="src" type="__m512"/>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="a" type="__m512"/>
+	<description>Convert the exponent of each packed single-precision (32-bit) floating-point element in "a" to a single-precision floating-point number representing the integer exponent, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). This intrinsic essentially calculates "floor(log2(x))" for each element.</description>
+	<operation>
+FOR j := 0 to 15
+	i := j*32
+	IF k[j]
+		dst[i+31:i] := ConvertExpFP32(a[i+31:i])
+	ELSE
+		dst[i+31:i] := src[i+31:i]
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="SVML" rettype="__m512d" name="_mm512_nearbyint_pd" sequence="true">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Special Math Functions</category>
+	<parameter varname="a" type="__m512d"/>
+	<description>Rounds each packed double-precision (64-bit) floating-point element in "a" to the nearest integer value and stores the results as packed double-precision floating-point elements in "dst".</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*64
+	dst[i+63:i] := NearbyInt(a[i+63:i])
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="SVML" rettype="__m512d" name="_mm512_mask_nearbyint_pd" sequence="true">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Special Math Functions</category>
+	<parameter varname="src" type="__m512d"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m512d"/>
+	<description>Rounds each packed double-precision (64-bit) floating-point element in "a" to the nearest integer value and stores the results as packed double-precision floating-point elements in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*64
+	IF k[j]
+		dst[i+63:i] := NearbyInt(a[i+63:i])
+	ELSE
+		dst[i+63:i] := src[i+63:i]
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="SVML" rettype="__m512" name="_mm512_nearbyint_ps" sequence="true">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Special Math Functions</category>
+	<parameter varname="a" type="__m512"/>
+	<description>Rounds each packed single-precision (32-bit) floating-point element in "a" to the nearest integer value and stores the results as packed single-precision floating-point elements in "dst".</description>
+	<operation>
+FOR j := 0 to 15
+	i := j*32
+	dst[i+31:i] := NearbyInt(a[i+31:i])
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="SVML" rettype="__m512" name="_mm512_mask_nearbyint_ps" sequence="true">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Special Math Functions</category>
+	<parameter varname="src" type="__m512"/>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="a" type="__m512"/>
+	<description>Rounds each packed single-precision (32-bit) floating-point element in "a" to the nearest integer value and stores the results as packed single-precision floating-point elements in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 15
+	i := j*32
+	IF k[j]
+		dst[i+31:i] := NearbyInt(a[i+31:i])
+	ELSE
+		dst[i+31:i] := src[i+31:i]
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="SVML" rettype="__m512d" name="_mm512_pow_pd" sequence="true">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Elementary Math Functions</category>
+	<parameter varname="a" type="__m512d"/>
+	<parameter varname="b" type="__m512d"/>
+	<description>Compute the exponential value of packed double-precision (64-bit) floating-point elements in "a" raised by packed elements in "b", and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*64
+	dst[i+63:i] := (a[i+63:i])^(b[i+63:i])
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="SVML" rettype="__m512d" name="_mm512_mask_pow_pd" sequence="true">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Elementary Math Functions</category>
+	<parameter varname="src" type="__m512d"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m512d"/>
+	<parameter varname="b" type="__m512d"/>
+	<description>Compute the exponential value of packed double-precision (64-bit) floating-point elements in "a" raised by packed elements in "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*64
+	IF k[j]
+		dst[i+63:i] := (a[i+63:i])^(b[i+63:i])
+	ELSE
+		dst[i+63:i] := src[i+63:i]
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="SVML" rettype="__m512" name="_mm512_pow_ps" sequence="true">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Elementary Math Functions</category>
+	<parameter varname="a" type="__m512"/>
+	<parameter varname="b" type="__m512"/>
+	<description>Compute the exponential value of packed single-precision (32-bit) floating-point elements in "a" raised by packed elements in "b", and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 15
+	i := j*32
+	dst[i+31:i] := (a[i+31:i])^(b[i+31:i])
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="SVML" rettype="__m512" name="_mm512_mask_pow_ps" sequence="true">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Elementary Math Functions</category>
+	<parameter varname="src" type="__m512"/>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="a" type="__m512"/>
+	<parameter varname="b" type="__m512"/>
+	<description>Compute the exponential value of packed single-precision (32-bit) floating-point elements in "a" raised by packed elements in "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 15
+	i := j*32
+	IF k[j]
+		dst[i+31:i] := (a[i+31:i])^(b[i+31:i])
+	ELSE
+		dst[i+31:i] := src[i+31:i]
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="SVML" rettype="__m512d" name="_mm512_recip_pd" sequence="true">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Elementary Math Functions</category>
+	<parameter varname="a" type="__m512d"/>
+	<description>Computes the reciprocal of packed double-precision (64-bit) floating-point elements in "a", storing the results in "dst".</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*64
+	dst[i+63:i] := (1 / a[i+63:i])
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="SVML" rettype="__m512d" name="_mm512_mask_recip_pd" sequence="true">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Elementary Math Functions</category>
+	<parameter varname="src" type="__m512d"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m512d"/>
+	<description>Computes the reciprocal of packed double-precision (64-bit) floating-point elements in "a", storing the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*64
+	IF k[j]
+		dst[i+63:i] := (1 / a[i+63:i])
+	ELSE
+		dst[i+63:i] := src[i+63:i]
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="SVML" rettype="__m512" name="_mm512_recip_ps" sequence="true">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Elementary Math Functions</category>
+	<parameter varname="a" type="__m512"/>
+	<description>Computes the reciprocal of packed single-precision (32-bit) floating-point elements in "a", storing the results in "dst".</description>
+	<operation>
+FOR j := 0 to 15
+	i := j*32
+	dst[i+31:i] := (1 / a[i+31:i])
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="SVML" rettype="__m512" name="_mm512_mask_recip_ps" sequence="true">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Elementary Math Functions</category>
+	<parameter varname="src" type="__m512"/>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="a" type="__m512"/>
+	<description>Computes the reciprocal of packed single-precision (32-bit) floating-point elements in "a", storing the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 15
+	i := j*32
+	IF k[j]
+		dst[i+31:i] := (1 / a[i+31:i])
+	ELSE
+		dst[i+31:i] := src[i+31:i]
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="SVML" rettype="__m512d" name="_mm512_rint_pd" sequence="true">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Special Math Functions</category>
+	<parameter varname="a" type="__m512d"/>
+	<description>Rounds the packed double-precision (64-bit) floating-point elements in "a" to the nearest even integer value and stores the results in "dst".</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*64
+	dst[i+63:i] := RoundToNearestEven(a[i+63:i])
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="SVML" rettype="__m512d" name="_mm512_mask_rint_pd" sequence="true">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Special Math Functions</category>
+	<parameter varname="src" type="__m512d"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m512d"/>
+	<description>Rounds the packed double-precision (64-bit) floating-point elements in "a" to the nearest even integer value and stores the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*64
+	IF k[j]
+		dst[i+63:i] := RoundToNearestEven(a[i+63:i])
+	ELSE
+		dst[i+63:i] := src[i+63:i]
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="SVML" rettype="__m512" name="_mm512_rint_ps" sequence="true">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Special Math Functions</category>
+	<parameter varname="a" type="__m512"/>
+	<description>Rounds the packed single-precision (32-bit) floating-point elements in "a" to the nearest even integer value and stores the results in "dst".</description>
+	<operation>
+FOR j := 0 to 15
+	i := j*32
+	dst[i+31:i] := RoundToNearestEven(a[i+31:i])
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="SVML" rettype="__m512" name="_mm512_mask_rint_ps" sequence="true">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Special Math Functions</category>
+	<parameter varname="src" type="__m512"/>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="a" type="__m512"/>
+	<description>Rounds the packed single-precision (32-bit) floating-point elements in "a" to the nearest even integer value and stores the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 15
+	i := j*32
+	IF k[j]
+		dst[i+31:i] := RoundToNearestEven(a[i+31:i])
+	ELSE
+		dst[i+31:i] := src[i+31:i]
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="SVML" rettype="__m512d" name="_mm512_svml_round_pd" sequence="true">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Special Math Functions</category>
+	<parameter varname="a" type="__m512d"/>
+	<description>Round the packed double-precision (64-bit) floating-point elements in "a" to the nearest integer value, and store the results as packed double-precision floating-point elements in "dst".</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*64
+	dst[i+63:i] := ROUND(a[i+63:i])
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="SVML" rettype="__m512d" name="_mm512_mask_svml_round_pd" sequence="true">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Special Math Functions</category>
+	<parameter varname="src" type="__m512d"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m512d"/>
+	<description>Round the packed double-precision (64-bit) floating-point elements in "a" to the nearest integer value, and store the results as packed double-precision floating-point elements in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).
+	[round_note]
+	</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*64
+	IF k[j]
+		dst[i+63:i] := ROUND(a[i+63:i])
+	ELSE
+		dst[i+63:i] := src[i+63:i] 
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="SVML" rettype="__m512d" name="_mm512_sin_pd" sequence="true">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Trigonometry</category>
+	<parameter varname="a" type="__m512d"/>
+	<description>Compute the sine of packed double-precision (64-bit) floating-point elements in "a" expressed in radians, and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*64
+	dst[i+63:i] := SIN(a[i+63:i])
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="SVML" rettype="__m512d" name="_mm512_mask_sin_pd" sequence="true">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Trigonometry</category>
+	<parameter varname="src" type="__m512d"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m512d"/>
+	<description>Compute the sine of packed double-precision (64-bit) floating-point elements in "a" expressed in radians, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*64
+	IF k[j]
+		dst[i+63:i] := SIN(a[i+63:i])
+	ELSE
+		dst[i+63:i] := src[i+63:i]
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="SVML" rettype="__m512" name="_mm512_sin_ps" sequence="true">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Trigonometry</category>
+	<parameter varname="a" type="__m512"/>
+	<description>Compute the sine of packed single-precision (32-bit) floating-point elements in "a" expressed in radians, and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 15
+	i := j*32
+	dst[i+31:i] := SIN(a[i+31:i])
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="SVML" rettype="__m512" name="_mm512_mask_sin_ps" sequence="true">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Trigonometry</category>
+	<parameter varname="src" type="__m512"/>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="a" type="__m512"/>
+	<description>Compute the sine of packed single-precision (32-bit) floating-point elements in "a" expressed in radians, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 15
+	i := j*32
+	IF k[j]
+		dst[i+31:i] := SIN(a[i+31:i])
+	ELSE
+		dst[i+31:i] := src[i+31:i]
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="SVML" rettype="__m512d" name="_mm512_sinh_pd" sequence="true">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Trigonometry</category>
+	<parameter varname="a" type="__m512d"/>
+	<description>Compute the hyperbolic sine of packed double-precision (64-bit) floating-point elements in "a" expressed in radians, and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*64
+	dst[i+63:i] := SINH(a[i+63:i])
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="SVML" rettype="__m512d" name="_mm512_mask_sinh_pd" sequence="true">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Trigonometry</category>
+	<parameter varname="src" type="__m512d"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m512d"/>
+	<description>Compute the hyperbolic sine of packed double-precision (64-bit) floating-point elements in "a" expressed in radians, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*64
+	IF k[j]
+		dst[i+63:i] := SINH(a[i+63:i])
+	ELSE
+		dst[i+63:i] := src[i+63:i]
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="SVML" rettype="__m512" name="_mm512_sinh_ps" sequence="true">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Trigonometry</category>
+	<parameter varname="a" type="__m512"/>
+	<description>Compute the hyperbolic sine of packed single-precision (32-bit) floating-point elements in "a" expressed in radians, and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 15
+	i := j*32
+	dst[i+31:i] := SINH(a[i+31:i])
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="SVML" rettype="__m512" name="_mm512_mask_sinh_ps" sequence="true">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Trigonometry</category>
+	<parameter varname="src" type="__m512"/>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="a" type="__m512"/>
+	<description>Compute the hyperbolic sine of packed single-precision (32-bit) floating-point elements in "a" expressed in radians, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 15
+	i := j*32
+	IF k[j]
+		dst[i+31:i] := SINH(a[i+31:i])
+	ELSE
+		dst[i+31:i] := src[i+31:i]
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="SVML" rettype="__m512d" name="_mm512_sind_pd" sequence="true">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Trigonometry</category>
+	<parameter varname="a" type="__m512d"/>
+	<description>Compute the sine of packed double-precision (64-bit) floating-point elements in "a" expressed in degrees, and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*64
+	dst[i+63:i] := SIND(a[i+63:i])
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="SVML" rettype="__m512d" name="_mm512_mask_sind_pd" sequence="true">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Trigonometry</category>
+	<parameter varname="src" type="__m512d"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m512d"/>
+	<description>Compute the sine of packed double-precision (64-bit) floating-point elements in "a" expressed in degrees, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*64
+	IF k[j]
+		dst[i+63:i] := SIND(a[i+63:i])
+	ELSE
+		dst[i+63:i] := src[i+63:i]
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="SVML" rettype="__m512" name="_mm512_sind_ps" sequence="true">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Trigonometry</category>
+	<parameter varname="a" type="__m512"/>
+	<description>Compute the sine of packed single-precision (32-bit) floating-point elements in "a" expressed in degrees, and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 15
+	i := j*32
+	dst[i+31:i] := SIND(a[i+31:i])
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="SVML" rettype="__m512" name="_mm512_mask_sind_ps" sequence="true">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Trigonometry</category>
+	<parameter varname="src" type="__m512"/>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="a" type="__m512"/>
+	<description>Compute the sine of packed single-precision (32-bit) floating-point elements in "a" expressed in degrees, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 15
+	i := j*32
+	IF k[j]
+		dst[i+31:i] := SIND(a[i+31:i])
+	ELSE
+		dst[i+31:i] := src[i+31:i]
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="SVML" rettype="__m512d" name="_mm512_tan_pd" sequence="true">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Trigonometry</category>
+	<parameter varname="a" type="__m512d"/>
+	<description>Compute the tangent of packed double-precision (64-bit) floating-point elements in "a" expressed in radians, and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*64
+	dst[i+63:i] := TAN(a[i+63:i])
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="SVML" rettype="__m512d" name="_mm512_mask_tan_pd" sequence="true">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Trigonometry</category>
+	<parameter varname="src" type="__m512d"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m512d"/>
+	<description>Compute the tangent of packed double-precision (64-bit) floating-point elements in "a" expressed in radians, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*64
+	IF k[j]
+		dst[i+63:i] := TAN(a[i+63:i])
+	ELSE
+		dst[i+63:i] := src[i+63:i]
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="SVML" rettype="__m512" name="_mm512_tan_ps" sequence="true">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Trigonometry</category>
+	<parameter varname="a" type="__m512"/>
+	<description>Compute the tangent of packed single-precision (32-bit) floating-point elements in "a" expressed in radians, and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 15
+	i := j*32
+	dst[i+31:i] := TAN(a[i+31:i])
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="SVML" rettype="__m512" name="_mm512_mask_tan_ps" sequence="true">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Trigonometry</category>
+	<parameter varname="src" type="__m512"/>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="a" type="__m512"/>
+	<description>Compute the tangent of packed single-precision (32-bit) floating-point elements in "a" expressed in radians, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 15
+	i := j*32
+	IF k[j]
+		dst[i+31:i] := TAN(a[i+31:i])
+	ELSE
+		dst[i+31:i] := src[i+31:i]
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="SVML" rettype="__m512d" name="_mm512_tand_pd" sequence="true">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Trigonometry</category>
+	<parameter varname="a" type="__m512d"/>
+	<description>Compute the tangent of packed double-precision (64-bit) floating-point elements in "a" expressed in degrees, and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*64
+	dst[i+63:i] := TAND(a[i+63:i])
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="SVML" rettype="__m512d" name="_mm512_mask_tand_pd" sequence="true">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Trigonometry</category>
+	<parameter varname="src" type="__m512d"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m512d"/>
+	<description>Compute the tangent of packed double-precision (64-bit) floating-point elements in "a" expressed in degrees, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*64
+	IF k[j]
+		dst[i+63:i] := TAND(a[i+63:i])
+	ELSE
+		dst[i+63:i] := src[i+63:i]
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="SVML" rettype="__m512" name="_mm512_tand_ps" sequence="true">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Trigonometry</category>
+	<parameter varname="a" type="__m512"/>
+	<description>Compute the tangent of packed single-precision (32-bit) floating-point elements in "a" expressed in degrees, and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 15
+	i := j*32
+	dst[i+31:i] := TAND(a[i+31:i])
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="SVML" rettype="__m512" name="_mm512_mask_tand_ps" sequence="true">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Trigonometry</category>
+	<parameter varname="src" type="__m512"/>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="a" type="__m512"/>
+	<description>Compute the tangent of packed single-precision (32-bit) floating-point elements in "a" expressed in degrees, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 15
+	i := j*32
+	IF k[j]
+		dst[i+31:i] := TAND(a[i+31:i])
+	ELSE
+		dst[i+31:i] := src[i+31:i]
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="SVML" rettype="__m512d" name="_mm512_tanh_pd" sequence="true">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Trigonometry</category>
+	<parameter varname="a" type="__m512d"/>
+	<description>Compute the hyperbolic tangent of packed double-precision (64-bit) floating-point elements in "a" expressed in radians, and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*64
+	dst[i+63:i] := TANH(a[i+63:i])
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="SVML" rettype="__m512d" name="_mm512_mask_tanh_pd" sequence="true">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Trigonometry</category>
+	<parameter varname="src" type="__m512d"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m512d"/>
+	<description>Compute the hyperbolic tangent of packed double-precision (64-bit) floating-point elements in "a" expressed in radians, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*64
+	IF k[j]
+		dst[i+63:i] := TANH(a[i+63:i])
+	ELSE
+		dst[i+63:i] := src[i+63:i]
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="SVML" rettype="__m512" name="_mm512_tanh_ps" sequence="true">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Trigonometry</category>
+	<parameter varname="a" type="__m512"/>
+	<description>Compute the hyperbolic tangent of packed single-precision (32-bit) floating-point elements in "a" expressed in radians, and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 15
+	i := j*32
+	dst[i+31:i] := TANH(a[i+31:i])
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="SVML" rettype="__m512" name="_mm512_mask_tanh_ps" sequence="true">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Trigonometry</category>
+	<parameter varname="src" type="__m512"/>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="a" type="__m512"/>
+	<description>Compute the hyperbolic tangent of packed single-precision (32-bit) floating-point elements in "a" expressed in radians, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 15
+	i := j*32
+	IF k[j]
+		dst[i+31:i] := TANH(a[i+31:i])
+	ELSE
+		dst[i+31:i] := src[i+31:i]
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="SVML" rettype="__m512d" name="_mm512_trunc_pd" sequence="true">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Special Math Functions</category>
+	<parameter varname="a" type="__m512d"/>
+	<description>Truncate the packed double-precision (64-bit) floating-point elements in "a", and store the results as packed double-precision floating-point elements in "dst".</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*64
+	dst[i+63:i] := TRUNCATE(a[i+63:i])
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="SVML" rettype="__m512d" name="_mm512_mask_trunc_pd" sequence="true">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Special Math Functions</category>
+	<parameter varname="src" type="__m512d"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m512d"/>
+	<description>Truncate the packed double-precision (64-bit) floating-point elements in "a", and store the results as packed double-precision floating-point elements in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*64
+	IF k[j]
+		dst[i+63:i] := TRUNCATE(a[i+63:i])
+	ELSE
+		dst[i+63:i] := src[i+63:i]
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="SVML" rettype="__m512" name="_mm512_trunc_ps" sequence="true">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Special Math Functions</category>
+	<parameter varname="a" type="__m512"/>
+	<description>Truncate the packed single-precision (32-bit) floating-point elements in "a", and store the results as packed single-precision floating-point elements in "dst".</description>
+	<operation>
+FOR j := 0 to 15
+	i := j*32
+	dst[i+31:i] := TRUNCATE(a[i+31:i])
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="SVML" rettype="__m512" name="_mm512_mask_trunc_ps" sequence="true">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Special Math Functions</category>
+	<parameter varname="src" type="__m512"/>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="a" type="__m512"/>
+	<description>Truncate the packed single-precision (32-bit) floating-point elements in "a", and store the results as packed single-precision floating-point elements in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 15
+	i := j*32
+	IF k[j]
+		dst[i+31:i] := TRUNCATE(a[i+31:i])
+	ELSE
+		dst[i+31:i] := src[i+31:i]
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="SVML" rettype="__m512i" name="_mm512_div_epu32" sequence="true">
+	<type>Integer</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="a" type="__m512i"/>
+	<parameter varname="b" type="__m512i"/>
+	<description>Divide packed unsigned 32-bit integers in "a" by packed elements in "b", and store the truncated results in "dst".</description>
+	<operation>
+FOR j := 0 to 15
+	i := 32*j
+	dst[i+31:i] := TRUNCATE(a[i+31:i] / b[i+31:i])
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="SVML" rettype="__m512i" name="_mm512_mask_div_epu32" sequence="true">
+	<type>Integer</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="src" type="__m512i"/>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="a" type="__m512i"/>
+	<parameter varname="b" type="__m512i"/>
+	<description>Divide packed unsigned 32-bit integers in "a" by packed elements in "b", and store the truncated results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 15
+	i := 32*j
+	IF k[j]
+		dst[i+31:i] := TRUNCATE(a[i+31:i] / b[i+31:i])
+	ELSE
+		dst[i+31:i] := src[i+31:i]
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="SVML" rettype="__m512i" name="_mm512_div_epu8" sequence="true">
+	<type>Integer</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="a" type="__m512i"/>
+	<parameter varname="b" type="__m512i"/>
+	<description>Divide packed unsigned 8-bit integers in "a" by packed elements in "b", and store the truncated results in "dst".</description>
+	<operation>
+FOR j := 0 to 63
+	i := 8*j
+	dst[i+7:i] := TRUNCATE(a[i+7:i] / b[i+7:i])
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="SVML" rettype="__m512i" name="_mm512_div_epu16" sequence="true">
+	<type>Integer</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="a" type="__m512i"/>
+	<parameter varname="b" type="__m512i"/>
+	<description>Divide packed unsigned 16-bit integers in "a" by packed elements in "b", and store the truncated results in "dst".</description>
+	<operation>
+FOR j := 0 to 31
+	i := 16*j
+	dst[i+15:i] := TRUNCATE(a[i+15:i] / b[i+15:i])
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="SVML" rettype="__m512i" name="_mm512_div_epu64" sequence="true">
+	<type>Integer</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="a" type="__m512i"/>
+	<parameter varname="b" type="__m512i"/>
+	<description>Divide packed unsigned 64-bit integers in "a" by packed elements in "b", and store the truncated results in "dst".</description>
+	<operation>
+FOR j := 0 to 7
+	i := 64*j
+	dst[i+63:i] := TRUNCATE(a[i+63:i] / b[i+63:i])
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="SVML" rettype="__m512i" name="_mm512_rem_epu32" sequence="true">
+	<type>Integer</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="a" type="__m512i"/>
+	<parameter varname="b" type="__m512i"/>
+	<description>Divide packed unsigned 32-bit integers in "a" by packed elements in "b", and store the remainders as packed unsigned 32-bit integers in "dst".</description>
+	<operation>
+FOR j := 0 to 15
+	i := 32*j
+	dst[i+31:i] := REMAINDER(a[i+31:i] / b[i+31:i])
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="SVML" rettype="__m512i" name="_mm512_mask_rem_epu32" sequence="true">
+	<type>Integer</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="src" type="__m512i"/>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="a" type="__m512i"/>
+	<parameter varname="b" type="__m512i"/>
+	<description>Divide packed unsigned 32-bit integers in "a" by packed elements in "b", and store the remainders as packed unsigned 32-bit integers in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 15
+	i := 32*j
+	IF k[j]
+		dst[i+31:i] := REMAINDER(a[i+31:i] / b[i+31:i])
+	ELSE
+		dst[i+31:i] := src[i+31:i]
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="SVML" rettype="__m512i" name="_mm512_rem_epu8" sequence="true">
+	<type>Integer</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="a" type="__m512i"/>
+	<parameter varname="b" type="__m512i"/>
+	<description>Divide packed unsigned 8-bit integers in "a" by packed elements in "b", and store the remainders as packed unsigned 8-bit integers in "dst".</description>
+	<operation>
+FOR j := 0 to 63
+	i := 8*j
+	dst[i+7:i] := REMAINDER(a[i+7:i] / b[i+7:i])
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="SVML" rettype="__m512i" name="_mm512_rem_epu16" sequence="true">
+	<type>Integer</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="a" type="__m512i"/>
+	<parameter varname="b" type="__m512i"/>
+	<description>Divide packed unsigned 16-bit integers in "a" by packed elements in "b", and store the remainders as packed unsigned 16-bit integers in "dst".</description>
+	<operation>
+FOR j := 0 to 31
+	i := 16*j
+	dst[i+15:i] := REMAINDER(a[i+15:i] / b[i+15:i])
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="SVML" rettype="__m512i" name="_mm512_rem_epu64" sequence="true">
+	<type>Integer</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="a" type="__m512i"/>
+	<parameter varname="b" type="__m512i"/>
+	<description>Divide packed unsigned 64-bit integers in "a" by packed elements in "b", and store the remainders as packed unsigned 64-bit integers in "dst".</description>
+	<operation>
+FOR j := 0 to 7
+	i := 64*j
+	dst[i+63:i] := REMAINDER(a[i+63:i] / b[i+63:i])
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="KNC" rettype="float" name="_mm512_reduce_gmin_ps" sequence="true">
+	<type>Floating Point</type>
+	<CPUID>KNCNI</CPUID>
+	<category>Special Math Functions</category>
+	<parameter varname="a" type="__m512"/>
+	<description>Determines the minimum element of the packed single-precision (32-bit) floating-point elements stored in "a" and stores the result in "dst".</description>
+	<operation>
+min = a[31:0]
+FOR j := 1 to 15
+	i := j*32
+	min = FpMin(min, a[i+31:i])
+ENDFOR
+dst := min
+	</operation>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="KNC" rettype="float" name="_mm512_mask_reduce_gmin_ps" sequence="true">
+	<type>Floating Point</type>
+	<CPUID>KNCNI</CPUID>
+	<category>Special Math Functions</category>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="a" type="__m512"/>
+	<description>Determines the minimum element of the packed single-precision (32-bit) floating-point elements stored in "a" and stores the result in "dst". Bitmask "k" is used to exclude certain elements (elements are ignored when the corresponding mask bit is not set).</description>
+	<operation>
+min = a[31:0]
+FOR j := 1 to 15
+	i := j*32
+	IF k[j]
+		min = FpMin(min, a[i+31:i])
+	FI
+ENDFOR
+dst := min
+	</operation>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="KNC" rettype="double" name="_mm512_reduce_gmin_pd" sequence="true">
+	<type>Floating Point</type>
+	<CPUID>KNCNI</CPUID>
+	<category>Special Math Functions</category>
+	<parameter varname="a" type="__m512d"/>
+	<description>Determines the minimum element of the packed double-precision (64-bit) floating-point elements stored in "a" and stores the result in "dst".</description>
+	<operation>
+min = a[63:0]
+FOR j := 1 to 7
+	i := j*64
+	min = FpMin(min, a[i+63:i])
+ENDFOR
+dst := min
+	</operation>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="KNC" rettype="double" name="_mm512_mask_reduce_gmin_pd" sequence="true">
+	<type>Floating Point</type>
+	<CPUID>KNCNI</CPUID>
+	<category>Special Math Functions</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m512d"/>
+	<description>Determines the minimum element of the packed double-precision (64-bit) floating-point elements stored in "a" and stores the result in "dst". Bitmask "k" is used to exclude certain elements (elements are ignored when the corresponding mask bit is not set).</description>
+	<operation>
+min = a[63:0]
+FOR j := 1 to 7
+	i := j*64
+	IF k[j]
+		min = FpMin(min, a[i+63:i])
+	FI
+ENDFOR
+dst := min
+	</operation>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="KNC" rettype="float" name="_mm512_reduce_gmax_ps" sequence="true">
+	<type>Floating Point</type>
+	<CPUID>KNCNI</CPUID>
+	<category>Special Math Functions</category>
+	<parameter varname="a" type="__m512"/>
+	<description>Determines the maximum element of the packed single-precision (32-bit) floating-point elements stored in "a" and stores the result in "dst".</description>
+	<operation>
+max = a[31:0]
+FOR j := 1 to 15
+	i := j*32
+	max = FpMax(max, a[i+31:i])
+ENDFOR
+dst := max
+	</operation>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="KNC" rettype="float" name="_mm512_mask_reduce_gmax_ps" sequence="true">
+	<type>Floating Point</type>
+	<CPUID>KNCNI</CPUID>
+	<category>Special Math Functions</category>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="a" type="__m512"/>
+	<description>Determines the maximum element of the packed single-precision (32-bit) floating-point elements stored in "a" and stores the result in "dst". Bitmask "k" is used to exclude certain elements (elements are ignored when the corresponding mask bit is not set).</description>
+	<operation>
+max = a[31:0]
+FOR j := 1 to 15
+	i := j*32
+	IF k[j]
+		max = FpMax(max, a[i+31:i])
+	FI
+ENDFOR
+dst := max
+	</operation>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="KNC" rettype="double" name="_mm512_reduce_gmax_pd" sequence="true">
+	<type>Floating Point</type>
+	<CPUID>KNCNI</CPUID>
+	<category>Special Math Functions</category>
+	<parameter varname="a" type="__m512d"/>
+	<description>Determines the maximum element of the packed double-precision (64-bit) floating-point elements stored in "a" and stores the result in "dst".</description>
+	<operation>
+max = a[63:0]
+FOR j := 1 to 7
+	i := j*64
+	max = FpMax(max, a[i+63:i])
+ENDFOR
+dst := max
+	</operation>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="KNC" rettype="double" name="_mm512_mask_reduce_gmax_pd" sequence="true">
+	<type>Floating Point</type>
+	<CPUID>KNCNI</CPUID>
+	<category>Special Math Functions</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m512d"/>
+	<description>Determines the maximum element of the packed double-precision (64-bit) floating-point elements stored in "a" and stores the result in "dst". Bitmask "k" is used to exclude certain elements (elements are ignored when the corresponding mask bit is not set).</description>
+	<operation>
+max = a[63:0]
+FOR j := 1 to 7
+	i := j*64
+	IF k[j]
+		max = FpMax(max, a[i+63:i])
+	FI
+ENDFOR
+dst := max
+	</operation>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="KNC" rettype="int" name="_mm_tzcnti_32">
+	<CPUID>KNCNI</CPUID>
+	<category>Bit Manipulation</category>
+	<parameter varname="a" type="int"/>
+	<parameter varname="x" type="unsigned int"/>
+	<description>Counts the number of trailing bits in unsigned 32-bit integer "x" starting at bit "a" storing the result in "dst".</description>
+	<operation>
+count := 0
+FOR j := a to 31
+	IF NOT(x[j] = 1)
+		count := count + 1
+	FI
+ENDFOR
+dst := count
+	</operation>
+	<instruction name="tzcnti" form="r32, r32" xed=""/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="KNC" rettype="__int64" name="_mm_tzcnti_64">
+	<CPUID>KNCNI</CPUID>
+	<category>Bit Manipulation</category>
+	<parameter varname="a" type="__int64"/>
+	<parameter varname="x" type="unsigned __int64"/>
+	<description>Counts the number of trailing bits in unsigned 64-bit integer "x" starting at bit "a" storing the result in "dst".</description>
+	<operation>
+count := 0
+FOR j := a to 63
+	IF NOT(x[j] = 1)
+		count := count + 1
+	FI
+ENDFOR
+dst := count
+	</operation>
+	<instruction name="tzcnti" form="r64, r64" xed=""/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="KNC" rettype="void" name="_mm_delay_32">
+	<CPUID>KNCNI</CPUID>
+	<category>General Support</category>
+	<parameter varname="r1" type="unsigned int"/>
+	<description>Stalls a thread without blocking other threads for 32-bit unsigned integer "r1" clock cycles.</description>
+	<operation>
+BlockThread(r1)
+	</operation>
+	<instruction name="delay" form="r32" xed=""/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="KNC" rettype="void" name="_mm_delay_64">
+	<CPUID>KNCNI</CPUID>
+	<category>General Support</category>
+	<parameter varname="r1" type="unsigned __int64"/>
+	<description>Stalls a thread without blocking other threads for 64-bit unsigned integer "r1" clock cycles.</description>
+	<operation>
+BlockThread(r1)
+	</operation>
+	<instruction name="delay" form="r64" xed=""/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="KNC" rettype="void" name="_mm_spflt_32">
+	<CPUID>KNCNI</CPUID>
+	<category>General Support</category>
+	<parameter varname="r1" type="unsigned int"/>
+	<description>Set performance monitoring filtering mask to 32-bit unsigned integer "r1".</description>
+	<operation>
+SetPerfMonMask(r1[31:0])
+	</operation>
+	<instruction name="spflt" form="r" xed=""/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="KNC" rettype="void" name="_mm_spflt_64">
+	<CPUID>KNCNI</CPUID>
+	<category>General Support</category>
+	<parameter varname="r1" type="unsigned __int64"/>
+	<description>Set performance monitoring filtering mask to 64-bit unsigned integer "r1".</description>
+	<operation>
+SetPerfMonMask(r1[63:0])
+	</operation>
+	<instruction name="spflt" form="r" xed=""/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="KNC" rettype="void" name="_mm_clevict">
+	<CPUID>KNCNI</CPUID>
+	<category>General Support</category>
+	<parameter varname="ptr" type="const void *"/>
+	<parameter varname="level" type="int"/>
+	<description>Evicts the cache line containing the address "ptr" from cache level "level" (can be either 0 or 1).</description>
+	<operation>
+CacheLineEvict(ptr, level)
+	</operation>
+	<instruction name="clevict0" form="m" xed=""/>
+	<instruction name="clevict1" form="m" xed=""/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="KNC" rettype="__mmask16" name="_mm512_kandnr">
+	<type>Mask</type>
+	<CPUID>KNCNI</CPUID>
+	<category>Mask</category>
+	<parameter varname="k1" type="__mmask16"/>
+	<parameter varname="k2" type="__mmask16"/>
+	<description>Performs a bitwise AND operation between NOT of "k2" and "k1", storing the result in "dst".</description>
+	<operation>
+dst[15:0] := NOT(k2[15:0]) &amp; k1[15:0]
+	</operation>
+	<instruction name="kandnr" form="k, k" xed=""/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="KNC" rettype="__mmask16" name="_mm512_kswapb">
+	<type>Mask</type>
+	<CPUID>KNCNI</CPUID>
+	<category>Mask</category>
+	<parameter varname="k1" type="__mmask16"/>
+	<parameter varname="k2" type="__mmask16"/>
+	<description>Moves high byte from "k2" to low byte of "k1", and moves low byte of "k2" to high byte of "k1".</description>
+	<operation>
+tmp[7:0] := k2[15:8]
+k2[15:8] := k1[7:0]
+k1[7:0]  := tmp[7:0]
+
+tmp[7:0] := k2[7:0]
+k2[7:0]  := k1[15:8]
+k1[15:8] := tmp[7:0]
+	</operation>
+	<instruction name="kmerge2l1h" form="k, k" xed=""/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="KNC" rettype="int" name="_mm512_kortestz">
+	<type>Mask</type>
+	<CPUID>KNCNI</CPUID>
+	<category>Mask</category>
+	<parameter varname="k1" type="__mmask16"/>
+	<parameter varname="k2" type="__mmask16"/>
+	<description>Performs bitwise OR between "k1" and "k2", storing the result in "dst". ZF flag is set if "dst" is 0.</description>
+	<operation>
+dst[15:0] := k1[15:0] | k2[15:0]
+IF dst = 0
+	SetZF()
+FI
+	</operation>
+	<instruction name="kortest" form="k, k" xed=""/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="KNC" rettype="int" name="_mm512_kortestc">
+	<type>Mask</type>
+	<CPUID>KNCNI</CPUID>
+	<category>Mask</category>
+	<parameter varname="k1" type="__mmask16"/>
+	<parameter varname="k2" type="__mmask16"/>
+	<description>Performs bitwise OR between "k1" and "k2", storing the result in "dst". CF flag is set if "dst" consists of all 1's.</description>
+	<operation>
+dst[15:0] := k1[15:0] | k2[15:0]
+IF PopCount(dst[15:0]) = 16
+	SetCF()
+FI
+	</operation>
+	<instruction name="kortest" form="k, k" xed=""/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="int" name="_mm512_kortestz">
+	<type>Mask</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Mask</category>
+	<parameter varname="k1" type="__mmask16"/>
+	<parameter varname="k2" type="__mmask16"/>
+	<description>Performs bitwise OR between "k1" and "k2", storing the result in "dst". ZF flag is set if "dst" is 0.</description>
+	<operation>
+dst[15:0] := k1[15:0] | k2[15:0]
+IF dst = 0
+	SetZF()
+FI
+	</operation>
+	<instruction name="kortestw" form="k, k" xed=""/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="int" name="_mm512_kortestc">
+	<type>Mask</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Mask</category>
+	<parameter varname="k1" type="__mmask16"/>
+	<parameter varname="k2" type="__mmask16"/>
+	<description>Performs bitwise OR between "k1" and "k2", storing the result in "dst". CF flag is set if "dst" consists of all 1's.</description>
+	<operation>
+dst[15:0] := k1[15:0] | k2[15:0]
+IF PopCount(dst[15:0]) = 16
+	SetCF()
+FI
+	</operation>
+	<instruction name="kortestw" form="k, k" xed=""/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="KNC" rettype="int" name="_mm512_mask2int">
+	<CPUID>KNCNI</CPUID>
+	<category>Mask</category>
+	<parameter varname="k1" type="__mmask16"/>
+	<description>Converts bit mask "k1" into an integer value, storing the results in "dst".</description>
+	<operation>
+dst := ZeroExtend(k1)
+	</operation>
+	<header>immintrin.h</header>
+	<instruction name="kmov" form="r32, k" xed=""/>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="int" name="_mm512_mask2int">
+	<CPUID>AVX512F</CPUID>
+	<category>Mask</category>
+	<parameter varname="k1" type="__mmask16"/>
+	<description>Converts bit mask "k1" into an integer value, storing the results in "dst".</description>
+	<operation>
+dst := ZeroExtend(k1)
+	</operation>
+	<header>immintrin.h</header>
+	<instruction name="kmovw" form="r32, k" xed=""/>
+</intrinsic>
+<intrinsic tech="KNC" rettype="__mmask16" name="_mm512_int2mask">
+	<CPUID>KNCNI</CPUID>
+	<category>Mask</category>
+	<parameter varname="mask" type="int"/>
+	<description>Converts integer "mask" into bitmask, storing the result in "dst".</description>
+	<operation>
+dst := mask[15:0]
+	</operation>
+	<header>immintrin.h</header>
+	<instruction name="kmov" form="k, r32" xed=""/>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__mmask16" name="_mm512_int2mask">
+	<CPUID>AVX512F</CPUID>
+	<category>Mask</category>
+	<parameter varname="mask" type="int"/>
+	<description>Converts integer "mask" into bitmask, storing the result in "dst".</description>
+	<operation>
+dst := mask[15:0]
+	</operation>
+	<header>immintrin.h</header>
+	<instruction name="kmovw" form="k, r32" xed=""/>
+</intrinsic>
+<intrinsic tech="KNC" rettype="__int64" name="_mm512_kconcathi_64">
+	<type>Mask</type>
+	<CPUID>KNCNI</CPUID>
+	<category>Mask</category>
+	<parameter varname="k1" type="__mmask16"/>
+	<parameter varname="k2" type="__mmask16"/>
+	<description>Packs masks "k1" and "k2" into the high 32 bits of "dst". The rest of "dst" is set to 0.</description>
+	<operation>
+dst[63:48] := k1[15:0]
+dst[47:32] := k2[15:0]
+dst[31:0]  := 0
+	</operation>
+	<instruction name="kconcath" form="r, k, k" xed=""/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="KNC" rettype="__int64" name="_mm512_kconcatlo_64">
+	<type>Mask</type>
+	<CPUID>KNCNI</CPUID>
+	<category>Mask</category>
+	<parameter varname="k1" type="__mmask16"/>
+	<parameter varname="k2" type="__mmask16"/>
+	<description>Packs masks "k1" and "k2" into the low 32 bits of "dst". The rest of "dst" is set to 0.</description>
+	<operation>
+dst[31:16] := k1[15:0]
+dst[15:0]  := k2[15:0]
+dst[63:32] := 0
+	</operation>
+	<instruction name="kconcatl" form="r, k, k" xed=""/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="KNC" rettype="__mmask16" name="_mm512_kextract_64">
+	<type>Mask</type>
+	<CPUID>KNCNI</CPUID>
+	<category>Mask</category>
+	<parameter varname="a" type="__int64"/>
+	<parameter varname="b" type="const int"/>
+	<description>Extracts 16-bit value "b" from 64-bit integer "a", storing the result in "dst".</description>
+	<operation>
+CASE b of
+0: dst[15:0] := a[63:48]
+1: dst[15:0] := a[47:32]
+2: dst[15:0] := a[31:16]
+3: dst[15:0] := a[15:0]
+ESAC
+dst[MAX:16] := 0
+	</operation>
+	<instruction name="kextract" form="k, r, imm" xed=""/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="KNC" rettype="__m512" name="_mm512_fmadd233_ps">
+	<type>Floating Point</type>
+	<CPUID>KNCNI</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="a" type="__m512"/>
+	<parameter varname="b" type="__m512"/>
+	<description>Multiply packed single-precision (32-bit) floating-point elements in each 4-element set of "a" by element 1 of the corresponding 4-element set from "b", add the intermediate result to element 0 of the corresponding 4-element set from "b", and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 15
+	i := j*32
+	base := (j &amp; ~0x3) * 32
+	scale[31:0] := b[base+63:base+32]
+	bias[31:0]  := b[base+31:base]
+	dst[i+31:i] := (a[i+31:i] * scale[31:0]) + bias[31:0]
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name="vfmadd233ps" form="zmm {k}, zmm, m512" xed=""/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="KNC" rettype="__m512" name="_mm512_mask_fmadd233_ps">
+	<type>Floating Point</type>
+	<CPUID>KNCNI</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="src" type="__m512"/>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="a" type="__m512"/>
+	<parameter varname="b" type="__m512"/>
+	<description>Multiply packed single-precision (32-bit) floating-point elements in each 4-element set of "a" by element 1 of the corresponding 4-element set from "b", add the intermediate result to element 0 of the corresponding 4-element set from "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 15
+	i := j*32
+	IF k[j]
+		base := (j &amp; ~0x3) * 32
+		scale[31:0] := b[base+63:base+32]
+		bias[31:0]  := b[base+31:base]
+		dst[i+31:i] := (a[i+31:i] * scale[31:0]) + bias[31:0]
+	ELSE
+		dst[i+31:i] := src[i+31:i]
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name="vfmadd233ps" form="zmm {k}, zmm, m512" xed=""/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="KNC" rettype="__m512i" name="_mm512_i64extgather_epi32lo" sequence="true">
+	<type>Integer</type>
+	<CPUID>KNCNI</CPUID>
+	<category>Load</category>
+	<parameter varname="index" type="__m512i"/>
+	<parameter varname="mv" type="void const*"/>
+	<parameter varname="conv" type="_MM_UPCONV_EPI32_ENUM"/>
+	<parameter varname="scale" type="int"/>
+	<parameter varname="hint" type="int"/>
+	<description>Up-converts 8 single-precision (32-bit) memory locations starting at location "mv" at packed 64-bit integer indices stored in "index" scaled by "scale" using "conv" to 32-bit integer elements and stores them in "dst". "hint" indicates to the processor whether the data is non-temporal.</description>
+	<operation>
+FOR j := 0 to 7
+	addr := MEM[mv + index[j] * scale]
+	i := j*32
+	CASE conv OF
+	_MM_UPCONV_EPI32_NONE:
+		dst[i+31:i] := addr[i+31:i]
+	_MM_UPCONV_EPI32_UINT8:
+		n := j*8
+		dst[i+31:i] := UInt8ToInt32(addr[n+7:n])
+	_MM_UPCONV_EPI32_SINT8:
+		n := j*8
+		dst[i+31:i] := SInt8ToInt32(addr[n+7:n])
+	_MM_UPCONV_EPI32_UINT16:
+		n := j*16
+		dst[i+31:i] := UInt16ToInt32(addr[n+15:n])
+	_MM_UPCONV_EPI32_SINT16:
+		n := j*16
+		dst[i+31:i] := SInt16ToInt32(addr[n+15:n])
+	ESAC
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="KNC" rettype="__m512i" name="_mm512_mask_i64extgather_epi32lo" sequence="true">
+	<type>Integer</type>
+	<CPUID>KNCNI</CPUID>
+	<category>Load</category>
+	<parameter varname="src" type="__m512i"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="index" type="__m512i"/>
+	<parameter varname="mv" type="void const*"/>
+	<parameter varname="conv" type="_MM_UPCONV_EPI32_ENUM"/>
+	<parameter varname="scale" type="int"/>
+	<parameter varname="hint" type="int"/>
+	<description>Up-converts 8 single-precision (32-bit) memory locations starting at location "mv" at packed 64-bit integer indices stored in "index" scaled by "scale" using "conv" to 32-bit integer elements and stores them in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). "hint" indicates to the processor whether the data is non-temporal.</description>
+	<operation>
+FOR j := 0 to 7
+	addr := MEM[mv + index[j] * scale]
+	i := j*32
+	IF k[j]
+		CASE conv OF
+		_MM_UPCONV_EPI32_NONE:
+			dst[i+31:i] := addr[i+31:i]
+		_MM_UPCONV_EPI32_UINT8:
+			n := j*8
+			dst[i+31:i] := UInt8ToInt32(addr[n+7:n])
+		_MM_UPCONV_EPI32_SINT8:
+			n := j*8
+			dst[i+31:i] := SInt8ToInt32(addr[n+7:n])
+		_MM_UPCONV_EPI32_UINT16:
+			n := j*16
+			dst[i+31:i] := UInt16ToInt32(addr[n+15:n])
+		_MM_UPCONV_EPI32_SINT16:
+			n := j*16
+			dst[i+31:i] := SInt16ToInt32(addr[n+15:n])
+		ESAC
+	ELSE
+		dst[i+31:i] := src[i+31:i]
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="KNC" rettype="__m512i" name="_mm512_i64extgather_epi64" sequence="true">
+	<type>Integer</type>
+	<CPUID>KNCNI</CPUID>
+	<category>Load</category>
+	<parameter varname="index" type="__m512i"/>
+	<parameter varname="mv" type="void const*"/>
+	<parameter varname="conv" type="_MM_UPCONV_EPI64_ENUM"/>
+	<parameter varname="scale" type="int"/>
+	<parameter varname="hint" type="int"/>
+	<description>Up-converts 8 double-precision (64-bit) memory locations starting at location "mv" at packed 64-bit integer indices stored in "index" scaled by "scale" using "conv" to 64-bit integer elements and stores them in "dst". "hint" indicates to the processor whether the load is non-temporal.</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*64
+	addr := MEM[mv + index[j] * scale]
+	CASE conv OF
+	_MM_UPCONV_EPI64_NONE: dst[i+63:i] := addr[i+63:i]
+	ESAC
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="KNC" rettype="__m512i" name="_mm512_mask_i64extgather_epi64" sequence="true">
+	<type>Integer</type>
+	<CPUID>KNCNI</CPUID>
+	<category>Load</category>
+	<parameter varname="src" type="__m512i"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="index" type="__m512i"/>
+	<parameter varname="mv" type="void const*"/>
+	<parameter varname="conv" type="_MM_UPCONV_EPI64_ENUM"/>
+	<parameter varname="scale" type="int"/>
+	<parameter varname="hint" type="int"/>
+	<description>Up-converts 8 double-precision (64-bit) memory locations starting at location "mv" at packed 64-bit integer indices stored in "index" scaled by "scale" using "conv" to 64-bit integer elements and stores them in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). "hint" indicates to the processor whether the load is non-temporal.</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*64
+	IF k[j]
+		addr := MEM[mv + index[j] * scale]
+		CASE conv OF
+		_MM_UPCONV_EPI64_NONE: dst[i+63:i] := addr[i+63:i]
+		ESAC
+	ELSE
+		dst[i+63:i] := src[i+63:i]
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="KNC" rettype="__m512" name="_mm512_i64extgather_pslo" sequence="true">
+	<type>Floating Point</type>
+	<CPUID>KNCNI</CPUID>
+	<category>Load</category>
+	<parameter varname="index" type="__m512i"/>
+	<parameter varname="mv" type="void const *"/>
+	<parameter varname="conv" type="_MM_UPCONV_PS_ENUM"/>
+	<parameter varname="scale" type="int"/>
+	<parameter varname="hint" type="int"/>
+	<description>Up-converts 8 memory locations starting at location "mv" at packed 64-bit integer indices stored in "index" scaled by "scale" using "conv" to single-precision (32-bit) floating-point elements and stores them in the lower half of "dst". "hint" indicates to the processor whether the load is non-temporal.</description>
+	<operation>
+FOR j := 0 to 7
+	addr := MEM[mv + index[j] * scale]
+	i := j*32
+	CASE conv OF
+	_MM_UPCONV_PS_NONE:
+		dst[i+31:i] := addr[i+31:i]
+	_MM_UPCONV_PS_FLOAT16:
+		n := j*16
+		dst[i+31:i] := Float16ToFloat32(addr[n+15:n])
+	_MM_UPCONV_PS_UINT8:
+		n := j*8
+		dst[i+31:i] := UInt8ToFloat32(addr[n+7:n])
+	_MM_UPCONV_PS_SINT8:
+		n := j*8
+		dst[i+31:i] := SInt8ToFloat32(addr[n+7:n])
+	_MM_UPCONV_PS_UINT16:
+		n := j*16
+		dst[i+31:i] := UInt16ToFloat32(addr[n+15:n])
+	_MM_UPCONV_PS_SINT16:
+		n := j*16
+		dst[i+31:i] := SInt16ToFloat32(addr[n+15:n])
+	ESAC
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="KNC" rettype="__m512" name="_mm512_mask_i64extgather_pslo" sequence="true">
+	<type>Floating Point</type>
+	<CPUID>KNCNI</CPUID>
+	<category>Load</category>
+	<parameter varname="src" type="__m512"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="index" type="__m512i"/>
+	<parameter varname="mv" type="void const *"/>
+	<parameter varname="conv" type="_MM_UPCONV_PS_ENUM"/>
+	<parameter varname="scale" type="int"/>
+	<parameter varname="hint" type="int"/>
+	<description>Up-converts 8 memory locations starting at location "mv" at packed 64-bit integer indices stored in "index" scaled by "scale" using "conv" to single-precision (32-bit) floating-point elements and stores them in the lower half of "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). "hint" indicates to the processor whether the load is non-temporal.</description>
+	<operation>
+FOR j := 0 to 7
+	addr := MEM[mv + index[j] * scale]
+	i := j*32
+	IF k[j]
+		CASE conv OF
+		_MM_UPCONV_PS_NONE:
+			dst[i+31:i] := addr[i+31:i]
+		_MM_UPCONV_PS_FLOAT16:
+			n := j*16
+			dst[i+31:i] := Float16ToFloat32(addr[n+15:n])
+		_MM_UPCONV_PS_UINT8:
+			n := j*8
+			dst[i+31:i] := UInt8ToFloat32(addr[n+7:n])
+		_MM_UPCONV_PS_SINT8:
+			n := j*8
+			dst[i+31:i] := SInt8ToFloat32(addr[n+7:n])
+		_MM_UPCONV_PS_UINT16:
+			n := j*16
+			dst[i+31:i] := UInt16ToFloat32(addr[n+15:n])
+		_MM_UPCONV_PS_SINT16:
+			n := j*16
+			dst[i+31:i] := SInt16ToFloat32(addr[n+15:n])
+		ESAC
+	ELSE
+		dst[i+31:i] := src[i+31:i]
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="KNC" rettype="__m512d" name="_mm512_i64extgather_pd" sequence="true">
+	<type>Floating Point</type>
+	<CPUID>KNCNI</CPUID>
+	<category>Load</category>
+	<parameter varname="index" type="__m512i"/>
+	<parameter varname="mv" type="void const *"/>
+	<parameter varname="conv" type="_MM_UPCONV_PD_ENUM"/>
+	<parameter varname="scale" type="int"/>
+	<parameter varname="hint" type="int"/>
+	<description>Up-converts 8 double-precision (64-bit) floating-point elements stored in memory starting at location "mv" at packed 64-bit integer indices stored in "index" scaled by "scale" using "conv" to 64-bit floating-point elements and stores them in "dst". "hint" indicates to the processor whether the data is non-temporal.</description>
+	<operation>
+FOR j := 0 to 7
+	addr := MEM[mv + index[j] * scale]
+	i := j*64
+	CASE conv OF
+	_MM_UPCONV_PD_NONE: dst[i+63:i] := addr[i+63:i]
+	ESAC
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="KNC" rettype="__m512d" name="_mm512_mask_i64extgather_pd" sequence="true">
+	<type>Floating Point</type>
+	<CPUID>KNCNI</CPUID>
+	<category>Load</category>
+	<parameter varname="src" type="__m512d"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="index" type="__m512i"/>
+	<parameter varname="mv" type="void const *"/>
+	<parameter varname="conv" type="_MM_UPCONV_PD_ENUM"/>
+	<parameter varname="scale" type="int"/>
+	<parameter varname="hint" type="int"/>
+	<description>Up-converts 8 double-precision (64-bit) floating-point elements stored in memory starting at location "mv" at packed 64-bit integer indices stored in "index" scaled by "scale" using "conv" to 64-bit floating-point elements and stores them in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). "hint" indicates to the processor whether the data is non-temporal.</description>
+	<operation>
+FOR j := 0 to 7
+	addr := MEM[mv + index[j] * scale]
+	i := j*64
+	IF k[j]
+		CASE conv OF
+		_MM_UPCONV_PD_NONE: dst[i+63:i] := addr[i+63:i]
+		ESAC
+	ELSE
+		dst[i+63:i] := src[i+63:i]
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" rettype="void" name="_mm512_i32extscatter_epi32">
+	<type>Integer</type>
+	<CPUID>AVX512F/KNCNI</CPUID>
+	<category>Store</category>
+	<parameter varname="mv" type="void *"/>
+	<parameter varname="index" type="__m512i"/>
+	<parameter varname="v1" type="__m512i"/>
+	<parameter varname="conv" type="_MM_DOWNCONV_EPI32_ENUM"/>
+	<parameter varname="scale" type="int"/>
+	<parameter varname="hint" type="int"/>
+	<description>Down-converts 16 packed 32-bit integer elements in "v1" using "conv" and stores them in memory locations starting at location "mv" at packed 32-bit integer indices stored in "index" scaled by "scale". "hint" indicates to the processor whether the data is non-temporal. AVX512 supports _MM_DOWNCONV_EPI32_NONE.</description>
+	<operation>
+FOR j := 0 to 15
+	addr := MEM[mv + index[j] * scale]
+	i := j*32
+	CASE conv OF
+	_MM_DOWNCONV_EPI32_NONE:
+		addr[i+31:i] := v1[i+31:i]
+	_MM_DOWNCONV_EPI32_UINT8:
+		n := j*8
+		addr[n+7:n] := UInt32ToUInt8(v1[i+31:i])
+	_MM_DOWNCONV_EPI32_SINT8:
+		n := j*8
+		addr[n+7:n] := SInt32ToSInt8(v1[i+31:i])
+	_MM_DOWNCONV_EPI32_UINT16:
+		n := j*16
+		addr[n+15:n] := UInt32ToUInt16(v1[i+31:i])
+	_MM_DOWNCONV_EPI32_SINT16:
+		n := j*16
+		addr[n+15:n] := SInt32ToSInt16(v1[i+31:i])
+	ESAC
+ENDFOR
+	</operation>
+	<instruction name="vpscatterdd" form="m512 {k}, zmm" xed=""/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512/KNC" rettype="void" name="_mm512_mask_i32extscatter_epi32">
+	<type>Integer</type>
+	<CPUID>AVX512F/KNCNI</CPUID>
+	<category>Store</category>
+	<parameter varname="mv" type="void *"/>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="index" type="__m512i"/>
+	<parameter varname="v1" type="__m512i"/>
+	<parameter varname="conv" type="_MM_DOWNCONV_EPI32_ENUM"/>
+	<parameter varname="scale" type="int"/>
+	<parameter varname="hint" type="int"/>
+	<description>Down-converts 16 packed 32-bit integer elements in "v1" using "conv" and stores them in memory locations starting at location "mv" at packed 32-bit integer indices stored in "index" scaled by "scale". Elements are written using writemask "k" (elements are only written when the corresponding mask bit is set; otherwise, elements are left unchanged in memory). "hint" indicates to the processor whether the data is non-temporal. AVX512 supports _MM_DOWNCONV_EPI32_NONE.</description>
+	<operation>
+FOR j := 0 to 15
+	addr := MEM[mv + index[j] * scale]
+	i := j*32
+	IF k[j]
+		CASE conv OF
+		_MM_DOWNCONV_EPI32_NONE:
+			addr[i+31:i] := v1[i+31:i]
+		_MM_DOWNCONV_EPI32_UINT8:
+			n := j*8
+			addr[n+7:n] := UInt32ToUInt8(v1[i+31:i])
+		_MM_DOWNCONV_EPI32_SINT8:
+			n := j*8
+			addr[n+7:n] := SInt32ToSInt8(v1[i+31:i])
+		_MM_DOWNCONV_EPI32_UINT16:
+			n := j*16
+			addr[n+15:n] := UInt32ToUInt16(v1[i+31:i])
+		_MM_DOWNCONV_EPI32_SINT16:
+			n := j*16
+			addr[n+15:n] := SInt32ToSInt16(v1[i+31:i])
+		ESAC
+	FI 
+ENDFOR
+	</operation>
+	<instruction name="vpscatterdd" form="m512 {k}, zmm" xed=""/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="KNC" rettype="void" name="_mm512_i64extscatter_pslo" sequence="true">
+	<type>Floating Point</type>
+	<CPUID>KNCNI</CPUID>
+	<category>Store</category>
+	<parameter varname="mv" type="void *"/>
+	<parameter varname="index" type="__m512i"/>
+	<parameter varname="v1" type="__m512"/>
+	<parameter varname="conv" type="_MM_DOWNCONV_PS_ENUM"/>
+	<parameter varname="scale" type="int"/>
+	<parameter varname="hint" type="int"/>
+	<description>Down-converts 8 packed single-precision (32-bit) floating-point elements in "v1" using "conv" and stores them in memory locations starting at location "mv" at packed 64-bit integer indices stored in "index" scaled by "scale". "hint" indicates to the processor whether the data is non-temporal.</description>
+	<operation>
+FOR j := 0 to 7
+	addr := MEM[mv + index[j] * scale]
+	i := j*32
+	CASE conv OF
+	_MM_DOWNCONV_PS_NONE:
+		addr[i+31:i] := v1[i+31:i]
+	_MM_DOWNCONV_PS_FLOAT16:
+		n := j*16
+		addr[n+15:n] := Float32ToFloat16(v1[i+31:i])
+	_MM_DOWNCONV_PS_UINT8:
+		n := j*8
+		addr[n+7:n] := Float32ToUInt8(v1[i+31:i])
+	_MM_DOWNCONV_PS_SINT8:
+		n := j*8
+		addr[n+7:n] := Float32ToSInt8(v1[i+31:i])
+	_MM_DOWNCONV_PS_UINT16:
+		n := j*16
+		addr[n+15:n] := Float32ToUInt16(v1[i+31:i])
+	_MM_DOWNCONV_PS_SINT16:
+		n := j*16
+		addr[n+15:n] := Float32ToSInt16(v1[i+31:i])
+	ESAC
+ENDFOR
+	</operation>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="KNC" rettype="void" name="_mm512_mask_i64extscatter_pslo" sequence="true">
+	<type>Floating Point</type>
+	<CPUID>KNCNI</CPUID>
+	<category>Store</category>
+	<parameter varname="mv" type="void *"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="index" type="__m512i"/>
+	<parameter varname="v1" type="__m512"/>
+	<parameter varname="conv" type="_MM_DOWNCONV_PS_ENUM"/>
+	<parameter varname="scale" type="int"/>
+	<parameter varname="hint" type="int"/>
+	<description>Down-converts 8 packed single-precision (32-bit) floating-point elements in "v1" using "conv" and stores them in memory locations starting at location "mv" at packed 64-bit integer indices stored in "index" scaled by "scale". Elements are only written when the corresponding mask bit is set in "k"; otherwise, elements are unchanged in memory. "hint" indicates to the processor whether the data is non-temporal.</description>
+	<operation>
+FOR j := 0 to 7
+	addr := MEM[mv + index[j] * scale]
+	i := j*32
+	IF k[j]
+		CASE conv OF
+		_MM_DOWNCONV_PS_NONE:
+			addr[i+31:i] := v1[i+31:i]
+		_MM_DOWNCONV_PS_FLOAT16:
+			n := j*16
+			addr[n+15:n] := Float32ToFloat16(v1[i+31:i])
+		_MM_DOWNCONV_PS_UINT8:
+			n := j*8
+			addr[n+7:n] := Float32ToUInt8(v1[i+31:i])
+		_MM_DOWNCONV_PS_SINT8:
+			n := j*8
+			addr[n+7:n] := Float32ToSInt8(v1[i+31:i])
+		_MM_DOWNCONV_PS_UINT16:
+			n := j*16
+			addr[n+15:n] := Float32ToUInt16(v1[i+31:i])
+		_MM_DOWNCONV_PS_SINT16:
+			n := j*16
+			addr[n+15:n] := Float32ToSInt16(v1[i+31:i])
+		ESAC
+	FI
+ENDFOR
+	</operation>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="KNC" rettype="void" name="_mm512_i64extscatter_pd" sequence="true">
+	<type>Floating Point</type>
+	<CPUID>KNCNI</CPUID>
+	<category>Store</category>
+	<parameter varname="mv" type="void *"/>
+	<parameter varname="index" type="__m512i"/>
+	<parameter varname="v1" type="__m512d"/>
+	<parameter varname="conv" type="_MM_DOWNCONV_PD_ENUM"/>
+	<parameter varname="scale" type="int"/>
+	<parameter varname="hint" type="int"/>
+	<description>Down-converts 8 packed double-precision (64-bit) floating-point elements in "v1" using "conv" and stores them in memory locations starting at location "mv" at packed 64-bit integer indices stored in "index" scaled by "scale". "hint" indicates to the processor whether the data is non-temporal.</description>
+	<operation>
+FOR j := 0 to 7
+	addr := MEM[mv + index[j] * scale]
+	i := j*64
+	CASE conv OF
+	_MM_DOWNCONV_PD_NONE:
+		addr[i+63:i] := v1[i+63:i]
+	ESAC
+ENDFOR
+	</operation>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="KNC" rettype="void" name="_mm512_mask_i64extscatter_pd" sequence="true">
+	<type>Floating Point</type>
+	<CPUID>KNCNI</CPUID>
+	<category>Store</category>
+	<parameter varname="mv" type="void *"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="index" type="__m512i"/>
+	<parameter varname="v1" type="__m512d"/>
+	<parameter varname="conv" type="_MM_DOWNCONV_PD_ENUM"/>
+	<parameter varname="scale" type="int"/>
+	<parameter varname="hint" type="int"/>
+	<description>Down-converts 8 packed double-precision (64-bit) floating-point elements in "v1" using "conv" and stores them in memory locations starting at location "mv" at packed 64-bit integer indices stored in "index" scaled by "scale". Elements are written to memory using writemask "k" (elements are not stored to memory when the corresponding mask bit is not set; the memory location is left unchanged). "hint" indicates to the processor whether the data is non-temporal.</description>
+	<operation>
+FOR j := 0 to 7
+	addr := MEM[mv + index[j] * scale]
+	i := j*64
+	IF k[j]
+		CASE conv OF
+		_MM_DOWNCONV_PD_NONE:
+			addr[i+63:i] := v1[i+63:i]
+		ESAC
+	FI
+ENDFOR
+	</operation>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="KNC" rettype="void" name="_mm512_i64extscatter_epi32lo" sequence="true">
+	<type>Integer</type>
+	<CPUID>KNCNI</CPUID>
+	<category>Store</category>
+	<parameter varname="mv" type="void *"/>
+	<parameter varname="index" type="__m512i"/>
+	<parameter varname="v1" type="__m512i"/>
+	<parameter varname="conv" type="_MM_DOWNCONV_EPI32_ENUM"/>
+	<parameter varname="scale" type="int"/>
+	<parameter varname="hint" type="int"/>
+	<description>Down-converts the low 8 packed 32-bit integer elements in "v1" using "conv" and stores them in memory locations starting at location "mv" at packed 64-bit integer indices stored in "index" scaled by "scale". "hint" indicates to the processor whether the data is non-temporal.</description>
+	<operation>
+FOR j := 0 to 7
+	addr := MEM[mv + index[j] * scale]
+	i := j*32
+	CASE conv OF
+	_MM_DOWNCONV_EPI32_NONE:
+		addr[i+31:i] := v1[i+31:i]
+	_MM_DOWNCONV_EPI32_UINT8:
+		n := j*8
+		addr[n+7:n] := UInt32ToUInt8(v1[i+31:i])
+	_MM_DOWNCONV_EPI32_SINT8:
+		n := j*8
+		addr[n+7:n] := SInt32ToSInt8(v1[i+31:i])
+	_MM_DOWNCONV_EPI32_UINT16:
+		n := j*16
+		addr[n+15:n] := UInt32ToUInt16(v1[i+31:i])
+	_MM_DOWNCONV_EPI32_SINT16:
+		n := j*16
+		addr[n+15:n] := SInt32ToSInt16(v1[i+31:i])
+	ESAC
+ENDFOR
+	</operation>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="KNC" rettype="void" name="_mm512_mask_i64extscatter_epi32lo" sequence="true">
+	<type>Integer</type>
+	<CPUID>KNCNI</CPUID>
+	<category>Store</category>
+	<parameter varname="mv" type="void *"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="index" type="__m512i"/>
+	<parameter varname="v1" type="__m512i"/>
+	<parameter varname="conv" type="_MM_DOWNCONV_EPI32_ENUM"/>
+	<parameter varname="scale" type="int"/>
+	<parameter varname="hint" type="int"/>
+	<description>Down-converts the low 8 packed 32-bit integer elements in "v1" using "conv" and stores them in memory locations starting at location "mv" at packed 64-bit integer indices stored in "index" scaled by "scale". Elements are written to memory using writemask "k" (elements are only written when the corresponding mask bit is set; otherwise, the memory location is left unchanged). "hint" indicates to the processor whether the data is non-temporal.</description>
+	<operation>
+FOR j := 0 to 7
+	addr := MEM[mv + index[j] * scale]
+	i := j*32
+	IF k[j]
+		CASE conv OF
+		_MM_DOWNCONV_EPI32_NONE:
+			addr[i+31:i] := v1[i+31:i]
+		_MM_DOWNCONV_EPI32_UINT8:
+			n := j*8
+			addr[n+7:n] := UInt32ToUInt8(v1[i+31:i])
+		_MM_DOWNCONV_EPI32_SINT8:
+			n := j*8
+			addr[n+7:n] := SInt32ToSInt8(v1[i+31:i])
+		_MM_DOWNCONV_EPI32_UINT16:
+			n := j*16
+			addr[n+15:n] := UInt32ToUInt16(v1[i+31:i])
+		_MM_DOWNCONV_EPI32_SINT16:
+			n := j*16
+			addr[n+15:n] := SInt32ToSInt16(v1[i+31:i])
+		ESAC
+	FI
+ENDFOR
+	</operation>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="KNC" rettype="void" name="_mm512_i64extscatter_epi64" sequence="true">
+	<type>Integer</type>
+	<CPUID>KNCNI</CPUID>
+	<category>Store</category>
+	<parameter varname="mv" type="void *"/>
+	<parameter varname="index" type="__m512i"/>
+	<parameter varname="v1" type="__m512i"/>
+	<parameter varname="conv" type="_MM_DOWNCONV_EPI64_ENUM"/>
+	<parameter varname="scale" type="int"/>
+	<parameter varname="hint" type="int"/>
+	<description>Down-converts 8 packed 64-bit integer elements in "v1" using "conv" and stores them in memory locations starting at location "mv" at packed 64-bit integer indices stored in "index" scaled by "scale". "hint" indicates to the processor whether the data is non-temporal.</description>
+	<operation>
+FOR j := 0 to 7
+	addr := MEM[mv + index[j] * scale]
+	i := j*64
+	CASE conv OF
+	_MM_DOWNCONV_EPI64_NONE: addr[i+63:i] := v1[i+63:i]
+	ESAC
+ENDFOR
+	</operation>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="KNC" rettype="void" name="_mm512_mask_i64extscatter_epi64" sequence="true">
+	<type>Integer</type>
+	<CPUID>KNCNI</CPUID>
+	<category>Store</category>
+	<parameter varname="mv" type="void *"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="index" type="__m512i"/>
+	<parameter varname="v1" type="__m512i"/>
+	<parameter varname="conv" type="_MM_DOWNCONV_EPI64_ENUM"/>
+	<parameter varname="scale" type="int"/>
+	<parameter varname="hint" type="int"/>
+	<description>Down-converts 8 packed 64-bit integer elements in "v1" using "conv" and stores them in memory locations starting at location "mv" at packed 64-bit integer indices stored in "index" scaled by "scale". Only those elements whose corresponding mask bit is set in writemask "k" are written to memory.</description>
+	<operation>
+FOR j := 0 to 7
+	IF k[j]
+		addr := MEM[mv + index[j] * scale]
+		i := j*64
+		CASE conv OF
+		_MM_DOWNCONV_EPI64_NONE: addr[i+63:i] := v1[i+63:i]
+		ESAC
+	FI
+ENDFOR
+	</operation>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512i" name="_mm512_mullox_epi64" sequence="true">
+	<type>Integer</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="a" type="__m512i"/>
+	<parameter varname="b" type="__m512i"/>
+	<description>Multiplies elements in packed 64-bit integer vectors "a" and "b" together, storing the lower 64 bits of the result in "dst".</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*64
+	dst[i+63:i] := a[i+63:i] * b[i+63:i]
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512i" name="_mm512_mask_mullox_epi64" sequence="true">
+	<type>Integer</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="src" type="__m512i"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m512i"/>
+	<parameter varname="b" type="__m512i"/>
+	<description>Multiplies elements in packed 64-bit integer vectors "a" and "b" together, storing the lower 64 bits of the result in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*64
+	IF k[j]
+		dst[i+63:i] := a[i+63:i] * b[i+63:i]
+	ELSE
+		dst[i+63:i] := src[i+63:i]
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="KNC" rettype="__m512" name="_mm512_permute4f128_ps">
+	<type>Floating Point</type>
+	<CPUID>KNCNI</CPUID>
+	<category>Swizzle</category>
+	<parameter varname="a" type="__m512"/>
+	<parameter varname="imm8" type="_MM_PERM_ENUM"/>
+	<description>Permutes 128-bit blocks of the packed single-precision (32-bit) floating-point elements in "a" using constant "imm8". The results are stored in "dst".</description>
+	<operation>
+SELECT4(src, control) {
+	CASE control[1:0] OF
+	0: tmp[127:0] := src[127:0]
+	1: tmp[127:0] := src[255:128]
+	2: tmp[127:0] := src[383:256]
+	3: tmp[127:0] := src[511:384]
+	ESAC
+	RETURN tmp[127:0]
+}
+
+FOR j := 0 to 3
+	i := j*128
+	n := j*2
+	dst[i+127:i] := SELECT4(a[511:0], imm8[n+1:n])
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name="vpermf32x4" form="zmm {k}, m512, imm" xed=""/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="KNC" rettype="__m512" name="_mm512_mask_permute4f128_ps">
+	<type>Floating Point</type>
+	<CPUID>KNCNI</CPUID>
+	<category>Swizzle</category>
+	<parameter varname="src" type="__m512"/>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="a" type="__m512"/>
+	<parameter varname="imm8" type="_MM_PERM_ENUM"/>
+	<description>Permutes 128-bit blocks of the packed single-precision (32-bit) floating-point elements in "a" using constant "imm8". The results are stored in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+	<operation>
+SELECT4(src, control) {
+	CASE control[1:0] OF
+	0: tmp[127:0] := src[127:0]
+	1: tmp[127:0] := src[255:128]
+	2: tmp[127:0] := src[383:256]
+	3: tmp[127:0] := src[511:384]
+	ESAC
+	RETURN tmp[127:0]
+}
+
+tmp[511:0] := 0
+FOR j := 0 to 3
+	i := j*128
+	n := j*2
+	tmp[i+127:i] := SELECT4(a[511:0], imm8[n+1:n])
+ENDFOR
+FOR j := 0 to 15
+	IF k[j]
+		dst[i+31:i] := tmp[i+31:i]
+	ELSE
+		dst[i+31:i] := src[i+31:i]
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name="vpermf32x4" form="zmm {k}, m512, imm" xed=""/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="SVML" rettype="__m512d" name="_mm512_sincos_pd" sequence="true">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Trigonometry</category>
+	<parameter varname="cos_res" type="__m512d *"/>
+	<parameter varname="a" type="__m512d"/>
+	<description>Computes the sine and cosine of the packed double-precision (64-bit) floating-point elements in "a" and stores the results of the sine computation in "dst" and the results of the cosine computation in "cos_res".</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*64
+	dst[i+63:i] := SIN(a[i+63:i])
+	cos_res[i+63:i] := COS(a[i+63:i])
+ENDFOR
+dst[MAX:512] := 0
+cos_res[MAX:512] := 0
+	</operation>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="SVML" rettype="__m512d" name="_mm512_mask_sincos_pd" sequence="true">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Trigonometry</category>
+	<parameter varname="cos_res" type="__m512d *"/>
+	<parameter varname="sin_src" type="__m512d"/>
+	<parameter varname="cos_src" type="__m512d"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m512d"/>
+	<description>Computes the sine and cosine of the packed double-precision (64-bit) floating-point elements in "a" and stores the results of the sine computation in "dst" and the results of the cosine computation in "cos_res". Elements are written to their respective locations using writemask "k" (elements are copied from "sin_src" or "cos_src" when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*64
+	IF k[j]
+		dst[i+63:i] := SIN(a[i+63:i])
+		cos_res[i+63:i] := COS(a[i+63:i])
+	ELSE
+		dst[i+63:i] := sin_src[i+63:i]
+		cos_res[i+63:i] := cos_src[i+63:i]
+	FI
+ENDFOR
+dst[MAX:512] := 0
+cos_res[MAX:512] := 0
+	</operation>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="SVML" rettype="__m512" name="_mm512_sincos_ps" sequence="true">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Trigonometry</category>
+	<parameter varname="cos_res" type="__m512 *"/>
+	<parameter varname="a" type="__m512"/>
+	<description>Computes the sine and cosine of the packed single-precision (32-bit) floating-point elements in "a" and stores the results of the sine computation in "dst" and the results of the cosine computation in "cos_res".</description>
+	<operation>
+FOR j := 0 to 15
+	i := j*32
+	dst[i+31:i] := SIN(a[i+31:i])
+	cos_res[i+31:i] := COS(a[i+31:i])
+ENDFOR
+dst[MAX:512] := 0
+cos_res[MAX:512] := 0
+	</operation>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="SVML" rettype="__m512" name="_mm512_mask_sincos_ps" sequence="true">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Trigonometry</category>
+	<parameter varname="cos_res" type="__m512 *"/>
+	<parameter varname="sin_src" type="__m512"/>
+	<parameter varname="cos_src" type="__m512"/>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="a" type="__m512"/>
+	<description>Computes the sine and cosine of the packed single-precision (32-bit) floating-point elements in "a" and stores the results of the sine computation in "dst" and the results of the cosine computation in "cos_res". Elements are written to their respective locations using writemask "k" (elements are copied from "sin_src" or "cos_src" when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 15
+	i := j*32
+	IF k[j]
+		dst[i+31:i] := SIN(a[i+31:i])
+		cos_res[i+31:i] := COS(a[i+31:i])
+	ELSE
+		dst[i+31:i] := sin_src[i+31:i]
+		cos_res[i+31:i] := cos_src[i+31:i]
+	FI
+ENDFOR
+dst[MAX:512] := 0
+cos_res[MAX:512] := 0
+	</operation>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="KNC" rettype="__m512i" name="_mm512_i64gather_epi32lo" sequence="true">
+	<type>Integer</type>
+	<CPUID>KNCNI</CPUID>
+	<category>Load</category>
+	<parameter varname="index" type="__m512i"/>
+	<parameter varname="mv" type="void const *"/>
+	<parameter varname="scale" type="int"/>
+	<description>Loads 8 32-bit integer memory locations starting at location "mv" at packed 64-bit integer indices stored in "index" scaled by "scale" to "dst".</description>
+	<operation>
+FOR j := 0 to 7
+	addr := MEM[mv + index[j] * scale]
+	i := j*32
+	dst[i+31:i] := addr[i+31:i]
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="KNC" rettype="__m512i" name="_mm512_mask_i64gather_epi32lo" sequence="true">
+	<type>Integer</type>
+	<CPUID>KNCNI</CPUID>
+	<category>Load</category>
+	<parameter varname="src" type="__m512i"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="index" type="__m512i"/>
+	<parameter varname="mv" type="void const *"/>
+	<parameter varname="scale" type="int"/>
+	<description>Loads 8 32-bit integer memory locations starting at location "mv" at packed 64-bit integer indices stored in "index" scaled by "scale" to "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*32
+	IF k[j]
+		addr := MEM[mv + index[j] * scale]
+		dst[i+31:i] := addr[i+31:i]
+	ELSE
+		dst[i+31:i] := src[i+31:i]
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="KNC" rettype="__m512" name="_mm512_i64gather_pslo" sequence="true">
+	<type>Floating Point</type>
+	<CPUID>KNCNI</CPUID>
+	<category>Load</category>
+	<parameter varname="index" type="__m512i"/>
+	<parameter varname="mv" type="void const *"/>
+	<parameter varname="scale" type="int"/>
+	<description>Loads 8 single-precision (32-bit) floating-point memory locations starting at location "mv" at packed 64-bit integer indices stored in "index" scaled by "scale" to "dst".</description>
+	<operation>
+FOR j := 0 to 7
+	addr := MEM[mv + index[j] * scale]
+	i := j*32
+	dst[i+31:i] := addr[i+31:i]
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="KNC" rettype="__m512" name="_mm512_mask_i64gather_pslo" sequence="true">
+	<type>Floating Point</type>
+	<CPUID>KNCNI</CPUID>
+	<category>Load</category>
+	<parameter varname="src" type="__m512"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="index" type="__m512i"/>
+	<parameter varname="mv" type="void const *"/>
+	<parameter varname="scale" type="int"/>
+	<description>Loads 8 single-precision (32-bit) floating-point memory locations starting at location "mv" at packed 64-bit integer indices stored in "index" scaled by "scale" to "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*32
+	IF k[j]
+		addr := MEM[mv + index[j] * scale]
+		dst[i+31:i] := addr[i+31:i]
+	ELSE
+		dst[i+31:i] := src[i+31:i]
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="KNC" rettype="void" name="_mm512_i64scatter_pslo" sequence="true">
+	<type>Floating Point</type>
+	<CPUID>KNCNI</CPUID>
+	<category>Store</category>
+	<parameter varname="mv" type="void *"/>
+	<parameter varname="index" type="__m512i"/>
+	<parameter varname="v" type="__m512"/>
+	<parameter varname="scale" type="int"/>
+	<description>Stores 8 packed single-precision (32-bit) floating-point elements in "v" in memory locations starting at location "mv" at packed 64-bit integer indices stored in "index" scaled by "scale".</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*32
+	addr := MEM[mv + index[j] * scale]
+	addr[i+31:i] := v[i+31:i]
+ENDFOR
+	</operation>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="KNC" rettype="void" name="_mm512_mask_i64scatter_pslo" sequence="true">
+	<type>Floating Point</type>
+	<CPUID>KNCNI</CPUID>
+	<category>Store</category>
+	<parameter varname="mv" type="void *"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="index" type="__m512i"/>
+	<parameter varname="v1" type="__m512"/>
+	<parameter varname="scale" type="int"/>
+	<description>Stores 8 packed single-precision (32-bit) floating-point elements in "v1" in memory locations starting at location "mv" at packed 64-bit integer indices stored in "index" scaled by "scale" using writemask "k" (elements are only written to memory when the corresponding mask bit is set).</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*32
+	IF k[j]
+		addr := MEM[mv + index[j] * scale]
+		addr[i+31:i] := v1[i+31:i]
+	FI	
+ENDFOR
+	</operation>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="KNC" rettype="void" name="_mm512_i64scatter_epi32lo" sequence="true">
+	<type>Integer</type>
+	<CPUID>KNCNI</CPUID>
+	<category>Store</category>
+	<parameter varname="mv" type="void *"/>
+	<parameter varname="index" type="__m512i"/>
+	<parameter varname="v1" type="__m512i"/>
+	<parameter varname="scale" type="int"/>
+	<description>Stores 8 packed 32-bit integer elements in "v1" in memory locations starting at location "mv" at packed 64-bit integer indices stored in "index" scaled by "scale".</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*32
+	addr := MEM[mv + index[j] * scale]
+	addr[i+31:i] := v1[i+31:i]
+ENDFOR
+	</operation>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="KNC" rettype="void" name="_mm512_mask_i64scatter_epi32lo" sequence="true">
+	<type>Integer</type>
+	<CPUID>KNCNI</CPUID>
+	<category>Store</category>
+	<parameter varname="mv" type="void *"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="index" type="__m512i"/>
+	<parameter varname="v1" type="__m512i"/>
+	<parameter varname="scale" type="int"/>
+	<description>Stores 8 packed 32-bit integer elements in "v1" in memory locations starting at location "mv" at packed 64-bit integer indices stored in "index" scaled by "scale" using writemask "k" (elements are only written to memory when the corresponding mask bit is set).</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*32
+	IF k[j]
+		addr := MEM[mv + index[j] * scale]
+		addr[i+31:i] := v1[i+31:i]
+	FI	
+ENDFOR
+	</operation>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="KNC" rettype="__mmask16" name="_mm512_kmerge2l1h">
+	<type>Mask</type>
+	<CPUID>KNCNI</CPUID>
+	<category>Mask</category>
+	<parameter varname="k1" type="__mmask16"/>
+	<parameter varname="k2" type="__mmask16"/>
+	<description>Move the high element from "k1" to the low element of "k1", and insert the low element of "k2" into the high element of "k1".</description>
+	<operation>
+tmp[7:0] := k1[15:8]
+k1[15:8] := k2[7:0]
+k1[7:0]  := tmp[7:0]
+	</operation>
+	<instruction name="kmerge2l1h" form="k, k" xed=""/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="KNC" rettype="__mmask16" name="_mm512_kmerge2l1l">
+	<type>Mask</type>
+	<CPUID>KNCNI</CPUID>
+	<category>Mask</category>
+	<parameter varname="k1" type="__mmask16"/>
+	<parameter varname="k2" type="__mmask16"/>
+	<description>Insert the low element of "k2" into the high element of "k1".</description>
+	<operation>
+k1[15:8] := k2[7:0]
+	</operation>
+	<instruction name="kmerge2l1l" form="k, k" xed=""/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='AVX-512' vexEq='TRUE' rettype='float' name='_mm512_cvtss_f32'>
+	<type>Floating Point</type>
+	<CPUID>AVX512</CPUID>
+	<category>Convert</category>
+	<parameter varname='a' type='__m512' />
+	<description>Copy the lower single-precision (32-bit) floating-point element of "a" to "dst".</description>
+	<operation>dst[31:0] := a[31:0]</operation>
+	<instruction name='movss' form='m32, xmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='AVX-512' vexEq='TRUE' rettype='double' name='_mm512_cvtsd_f64'>
+	<type>Floating Point</type>
+	<CPUID>AVX512</CPUID>
+	<category>Convert</category>
+	<parameter varname='a' type='__m512d'/>
+	<description>Copy the lower double-precision (64-bit) floating-point element of "a" to "dst".</description>
+	<operation>dst[63:0] := a[63:0]</operation>
+	<instruction name='movsd' form='m64, xmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='AVX-512' vexEq='TRUE' rettype='int' name='_mm512_cvtsi512_si32'>
+	<type>Integer</type>
+	<CPUID>AVX512</CPUID>
+	<category>Convert</category>
+	<parameter varname='a' type='__m512i'/>
+	<description>Copy the lower 32-bit integer in "a" to "dst".</description>
+	<operation>
+dst[31:0] := a[31:0]
+	</operation>
+	<instruction name='movd' form='r32, xmm'/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='AVX-512' vexEq='FALSE' rettype='int' name='_mm512_4dpwssd_epi32'>
+	<type>Integer</type>
+	<CPUID>AVX512_4VNNIW</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname='src' type='_m512i'/>
+	<parameter varname='a0' type='_m512i'/>
+	<parameter varname='a1' type='_m512i'/>
+	<parameter varname='a2' type='_m512i'/>
+	<parameter varname='a3' type='_m512i'/>
+	<parameter varname='b' type='_m128i *'/>
+	<description>Compute 4 sequential operand source-block dot-products of two signed 16-bit element operands with 32-bit element accumulation, and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 15
+	FOR m := 0 to 3
+		lim_base := m*32
+		i := j*32
+		tl := b[lim_base+15:lim_base]
+		tu := b[lim_base+31:lim_base+16]
+		lword := a{m}[i+15:i] * tl
+		uword := a{m}[i+31:i+16] * tu
+		dst[i+31:i] := src[i+31:i] + lword + uword
+	ENDFOR
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vp4dpwssd' form='zmm {k}, zmm+3, m128' xed=''/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='AVX-512' vexEq='FALSE' rettype='int' name='_mm512_mask_4dpwssd_epi32'>
+	<type>Integer</type>
+	<CPUID>AVX512_4VNNIW</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname='src' type='_m512i'/>
+	<parameter varname='k' type='_mmask16'/>
+	<parameter varname='a0' type='_m512i'/>
+	<parameter varname='a1' type='_m512i'/>
+	<parameter varname='a2' type='_m512i'/>
+	<parameter varname='a3' type='_m512i'/>
+	<parameter varname='b' type='_m128i *'/>
+	<description>Compute 4 sequential operand source-block dot-products of two signed 16-bit element operands with 32-bit element accumulation with mask, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 15
+	IF k[j]
+		FOR m := 0 to 3
+			lim_base := m*32
+			i := j*32
+			tl := b[lim_base+15:lim_base]
+			tu := b[lim_base+31:lim_base+16]
+			lword := a{m}[i+15:i] * tl
+			uword := a{m}[i+31:i+16] * tu
+			dst[i+31:i] := src[i+31:i] + lword + uword
+		ENDFOR
+	ELSE
+		dst[i+31:i] := src[i+31:i]
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vp4dpwssd' form='zmm {k}, zmm+3, m128' xed=''/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='AVX-512' vexEq='FALSE' rettype='int' name='_mm512_maskz_4dpwssd_epi32'>
+	<type>Integer</type>
+	<CPUID>AVX512_4VNNIW</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname='k' type='_mmask16'/>
+	<parameter varname='src' type='_m512i'/>
+	<parameter varname='a0' type='_m512i'/>
+	<parameter varname='a1' type='_m512i'/>
+	<parameter varname='a2' type='_m512i'/>
+	<parameter varname='a3' type='_m512i'/>
+	<parameter varname='b' type='_m128i *'/>
+	<description>Compute 4 sequential operand source-block dot-products of two signed 16-bit element operands with 32-bit element accumulation with mask, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 15
+	IF k[j]
+		FOR m := 0 to 3
+			lim_base := m*32
+			i := j*32
+			tl := b[lim_base+15:lim_base]
+			tu := b[lim_base+31:lim_base+16]
+			lword := a{m}[i+15:i] * tl
+			uword := a{m}[i+31:i+16] * tu
+			dst[i+31:i] := src[i+31:i] + lword + uword
+		ENDFOR
+	ELSE
+		dst[i+31:i] := 0
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vp4dpwssd' form='zmm {k}, zmm+3, m128' xed=''/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='AVX-512' vexEq='FALSE' rettype='int' name='_mm512_4dpwssds_epi32'>
+	<type>Integer</type>
+	<CPUID>AVX512_4VNNIW</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname='src' type='_m512i'/>
+	<parameter varname='a0' type='_m512i'/>
+	<parameter varname='a1' type='_m512i'/>
+	<parameter varname='a2' type='_m512i'/>
+	<parameter varname='a3' type='_m512i'/>
+	<parameter varname='b' type='_m128i *'/>
+	<description>Compute 4 sequential operand source-block dot-products of two signed 16-bit element operands with 32-bit element accumulation and signed saturation, and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 15
+	FOR m := 0 to 3
+		lim_base := m*32
+		i := j*32
+		tl := b[lim_base+15:lim_base]
+		tu := b[lim_base+31:lim_base+16]
+		lword := a{m}[i+15:i] * tl
+		uword := a{m}[i+31:i+16] * tu
+		dst[i+31:i] := SIGNED_DWORD_SATURATE(src[i+31:i] + lword + uword)
+	ENDFOR
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vp4dpwssds' form='zmm {k}, zmm+3, m128' xed=''/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='AVX-512' vexEq='FALSE' rettype='int' name='_mm512_mask_4dpwssds_epi32'>
+	<type>Integer</type>
+	<CPUID>AVX512_4VNNIW</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname='src' type='_m512i'/>
+	<parameter varname='k' type='_mmask16'/>
+	<parameter varname='a0' type='_m512i'/>
+	<parameter varname='a1' type='_m512i'/>
+	<parameter varname='a2' type='_m512i'/>
+	<parameter varname='a3' type='_m512i'/>
+	<parameter varname='b' type='_m128i *'/>
+	<description>Compute 4 sequential operand source-block dot-products of two signed 16-bit element operands with 32-bit element accumulation with mask and signed saturation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 15
+	IF k[j]
+		FOR m := 0 to 3
+			lim_base := m*32
+			i := j*32
+			tl := b[lim_base+15:lim_base]
+			tu := b[lim_base+31:lim_base+16]
+			lword := a{m}[i+15:i] * tl
+			uword := a{m}[i+31:i+16] * tu
+			dst[i+31:i] := SIGNED_DWORD_SATURATE(src[i+31:i] + lword + uword)
+		ENDFOR
+	ELSE
+		dst[i+31:i] := src[i+31:i]
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vp4dpwssds' form='zmm {k}, zmm+3, m128' xed=''/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='AVX-512' vexEq='FALSE' rettype='int' name='_mm512_maskz_4dpwssds_epi32'>
+	<type>Integer</type>
+	<CPUID>AVX512_4VNNIW</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname='src' type='_m512i'/>
+	<parameter varname='k' type='_mmask16'/>
+	<parameter varname='a0' type='_m512i'/>
+	<parameter varname='a1' type='_m512i'/>
+	<parameter varname='a2' type='_m512i'/>
+	<parameter varname='a3' type='_m512i'/>
+	<parameter varname='b' type='_m128i *'/>
+	<description>Compute 4 sequential operand source-block dot-products of two signed 16-bit element operands with 32-bit element accumulation with mask and signed saturation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 15
+	IF k[j]
+		FOR m := 0 to 3
+			lim_base := m*32
+			i := j*32
+			tl := b[lim_base+15:lim_base]
+			tu := b[lim_base+31:lim_base+16]
+			lword := a{m}[i+15:i] * tl
+			uword := a{m}[i+31:i+16] * tu
+			dst[i+31:i] := SIGNED_DWORD_SATURATE(src[i+31:i] + lword + uword)
+		ENDFOR
+	ELSE
+		dst[i+31:i] := 0
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vp4dpwssds' form='zmm {k}, zmm+3, m128' xed=''/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='AVX-512' vexEq='FALSE' rettype='__m512' name='_mm512_4fmadd_ps'>
+	<type>Floating Point</type>
+	<CPUID>AVX512_4FMAPS</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname='a' type='_m512'/>
+	<parameter varname='b0' type='_m512i'/>
+	<parameter varname='b1' type='_m512i'/>
+	<parameter varname='b2' type='_m512i'/>
+	<parameter varname='b3' type='_m512i'/>
+	<parameter varname='c' type='_m128i *'/>
+	<description>Multiply packed single-precision (32-bit) floating-point elements specified in 4 consecutive operands "b0" through "b3" by the 4 corresponding packed elements in "c", accumulating with the corresponding elements in "a". Store the results in "dst".</description>
+	<operation>
+dst := a
+FOR m := 0 to 3
+	FOR j := 0 to 15
+		i = j*32
+		n = m*32
+		dst[i+31:i] := RoundFPControl_MXCSR(dst[i+31:i] + b{m}[i+31:i] * c[n+31:n])
+	ENDFOR
+ENDFOR
+dst[MAX:512] := 0 
+	</operation>
+	<instruction name='v4fmaddps' form='zmm {k}, zmm+3, m128' xed=''/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='AVX-512' vexEq='FALSE' rettype='__m512' name='_mm512_mask_4fmadd_ps'>
+	<type>Floating Point</type>
+	<CPUID>AVX512_4FMAPS</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname='a' type='_m512'/>
+	<parameter varname='k' type='_mmask16'/>
+	<parameter varname='b0' type='_m512i'/>
+	<parameter varname='b1' type='_m512i'/>
+	<parameter varname='b2' type='_m512i'/>
+	<parameter varname='b3' type='_m512i'/>
+	<parameter varname='c' type='_m128i *'/>
+	<description>Multiply packed single-precision (32-bit) floating-point elements specified in 4 consecutive operands "b0" through "b3" by the 4 corresponding packed elements in "c", accumulating with the corresponding elements in "a". Store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set).</description>
+	<operation>
+dst := a
+FOR m := 0 to 3
+	FOR j := 0 to 15
+		i = j*32
+		n = m*32
+		IF k[j]
+			dst[i+31:i] := RoundFPControl_MXCSR(dst[i+31:i] + b{m}[i+31:i] * c[n+31:n])
+		FI
+	ENDFOR
+ENDFOR
+dst[MAX:512] := 0 
+	</operation>
+	<instruction name='v4fmaddps' form='zmm {k}, zmm+3, m128' xed=''/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='AVX-512' vexEq='FALSE' rettype='__m512' name='_mm512_maskz_4fmadd_ps'>
+	<type>Floating Point</type>
+	<CPUID>AVX512_4FMAPS</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname='a' type='_m512'/>
+	<parameter varname='k' type='_mmask16'/>
+	<parameter varname='b0' type='_m512i'/>
+	<parameter varname='b1' type='_m512i'/>
+	<parameter varname='b2' type='_m512i'/>
+	<parameter varname='b3' type='_m512i'/>
+	<parameter varname='c' type='_m128i *'/>
+	<description>Multiply packed single-precision (32-bit) floating-point elements specified in 4 consecutive operands "b0" through "b3" by the 4 corresponding packed elements in "c", accumulating with the corresponding elements in "a". Store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+	<operation>
+dst := a
+FOR m := 0 to 3
+	FOR j := 0 to 15
+		i = j*32
+		n = m*32
+		IF k[j]
+			dst[i+31:i] := RoundFPControl_MXCSR(dst[i+31:i] + b{m}[i+31:i] * c[n+31:n])
+		ELSE
+			dst[i+31:i] := 0
+		FI
+	ENDFOR
+ENDFOR
+dst[MAX:512] := 0 
+	</operation>
+	<instruction name='v4fmaddps' form='zmm {k}, zmm+3, m128' xed=''/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='AVX-512' vexEq='FALSE' rettype='__m512' name='_mm512_4fnmadd_ps'>
+	<type>Floating Point</type>
+	<CPUID>AVX512_4FMAPS</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname='a' type='_m512'/>
+	<parameter varname='b0' type='_m512i'/>
+	<parameter varname='b1' type='_m512i'/>
+	<parameter varname='b2' type='_m512i'/>
+	<parameter varname='b3' type='_m512i'/>
+	<parameter varname='c' type='_m128i *'/>
+	<description>Multiply packed single-precision (32-bit) floating-point elements specified in 4 consecutive operands "b0" through "b3" by the 4 corresponding packed elements in "c", accumulating the negated intermediate result with the corresponding elements in "a". Store the results in "dst".</description>
+	<operation>
+dst := a
+FOR m := 0 to 3
+	FOR j := 0 to 15
+		i = j*32
+		n = m*32
+		dst[i+31:i] := RoundFPControl_MXCSR(dst[i+31:i] - b{m}[i+31:i] * c[n+31:n])
+	ENDFOR
+ENDFOR
+dst[MAX:512] := 0 
+	</operation>
+	<instruction name='v4fnmaddps' form='zmm {k}, zmm+3, m128' xed=''/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='AVX-512' vexEq='FALSE' rettype='__m512' name='_mm512_mask_4fnmadd_ps'>
+	<type>Floating Point</type>
+	<CPUID>AVX512_4FMAPS</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname='a' type='_m512'/>
+	<parameter varname='k' type='_mmask16'/>
+	<parameter varname='b0' type='_m512i'/>
+	<parameter varname='b1' type='_m512i'/>
+	<parameter varname='b2' type='_m512i'/>
+	<parameter varname='b3' type='_m512i'/>
+	<parameter varname='c' type='_m128i *'/>
+	<description>Multiply packed single-precision (32-bit) floating-point elements specified in 4 consecutive operands "b0" through "b3" by the 4 corresponding packed elements in "c", accumulating the negated intermediate result with the corresponding elements in "a". Store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set).</description>
+	<operation>
+dst := a
+FOR m := 0 to 3
+	FOR j := 0 to 15
+		i = j*32
+		n = m*32
+		IF k[j]
+			dst[i+31:i] := RoundFPControl_MXCSR(dst[i+31:i] - b{m}[i+31:i] * c[n+31:n])
+		FI
+	ENDFOR
+ENDFOR
+dst[MAX:512] := 0 
+	</operation>
+	<instruction name='v4fnmaddps' form='zmm {k}, zmm+3, m128' xed=''/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='AVX-512' vexEq='FALSE' rettype='__m512' name='_mm512_maskz_4fnmadd_ps'>
+	<type>Floating Point</type>
+	<CPUID>AVX512_4FMAPS</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname='a' type='_m512'/>
+	<parameter varname='k' type='_mmask16'/>
+	<parameter varname='b0' type='_m512i'/>
+	<parameter varname='b1' type='_m512i'/>
+	<parameter varname='b2' type='_m512i'/>
+	<parameter varname='b3' type='_m512i'/>
+	<parameter varname='c' type='_m128i *'/>
+	<description>Multiply packed single-precision (32-bit) floating-point elements specified in 4 consecutive operands "b0" through "b3" by the 4 corresponding packed elements in "c", accumulating the negated intermediate result with the corresponding elements in "a". Store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+	<operation>
+dst := a
+FOR m := 0 to 3
+	FOR j := 0 to 15
+		i = j*32
+		n = m*32
+		IF k[j]
+			dst[i+31:i] := RoundFPControl_MXCSR(dst[i+31:i] - b{m}[i+31:i] * c[n+31:n])
+		ELSE
+			dst[i+31:i] := 0
+		FI
+	ENDFOR
+ENDFOR
+dst[MAX:512] := 0 
+	</operation>
+	<instruction name='v4fnmaddps' form='zmm {k}, zmm+3, m128' xed=''/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='AVX-512' vexEq='FALSE' rettype='__m128' name='_mm_4fmadd_ss'>
+	<type>Floating Point</type>
+	<CPUID>AVX512_4FMAPS</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="a" type="__m128"/>
+	<parameter varname="b0" type="__m128"/>
+	<parameter varname="b1" type="__m128"/>
+	<parameter varname="b2" type="__m128"/>
+	<parameter varname="b3" type="__m128"/>
+	<parameter varname="c" type="__m128 *"/>
+	<description>Multiply the lower single-precision (32-bit) floating-point elements specified in 4 consecutive operands "b0" through "b3" by corresponding element in "c", accumulating  with the lower element in "a". Store the result in the lower element of "dst".</description>
+	<operation>
+dst := a
+FOR j := 0 to 3
+	i := j*32
+	dst[31:0] := RoundFPControl_MXCSR(dst[31:0] + b{j}[31:0] * c[i+31:i])
+ENDFOR
+dst[MAX:32] := 0
+	</operation>
+	<instruction name='v4fmaddss' form='xmm {k}, xmm, m128' xed=''/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='AVX-512' vexEq='FALSE' rettype='__m128' name='_mm_mask_4fmadd_ss'>
+	<type>Floating Point</type>
+	<CPUID>AVX512_4FMAPS</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="a" type="__m128"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="b0" type="__m128"/>
+	<parameter varname="b1" type="__m128"/>
+	<parameter varname="b2" type="__m128"/>
+	<parameter varname="b3" type="__m128"/>
+	<parameter varname="c" type="__m128 *"/>
+	<description>Multiply the lower single-precision (32-bit) floating-point elements specified in 4 consecutive operands "b0" through "b3" by corresponding element in "c", accumulating  with the lower element in "a". Store the result in the lower element of "dst" using writemask "k" (the element is copied from "a" when mask bit 0 is not set).</description>
+	<operation>
+dst := a
+IF k[0]
+	FOR j := 0 to 3
+		i := j*32
+		dst[31:0] := RoundFPControl_MXCSR(dst[31:0] + b{j}[31:0] * c[i+31:i])
+	ENDFOR
+FI
+dst[MAX:32] := 0
+	</operation>
+	<instruction name='v4fmaddss' form='xmm {k}, xmm, m128' xed=''/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='AVX-512' vexEq='FALSE' rettype='__m128' name='_mm_maskz_4fmadd_ss'>
+	<type>Floating Point</type>
+	<CPUID>AVX512_4FMAPS</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="a" type="__m128"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="b0" type="__m128"/>
+	<parameter varname="b1" type="__m128"/>
+	<parameter varname="b2" type="__m128"/>
+	<parameter varname="b3" type="__m128"/>
+	<parameter varname="c" type="__m128 *"/>
+	<description>Multiply the lower single-precision (32-bit) floating-point elements specified in 4 consecutive operands "b0" through "b3" by corresponding element in "c", accumulating  with the lower element in "a". Store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set).</description>
+	<operation>
+dst := a
+IF k[0]
+	FOR j := 0 to 3
+		i := j*32
+		dst[31:0] := RoundFPControl_MXCSR(dst[31:0] + b{j}[31:0] * c[i+31:i])
+	ENDFOR
+ELSE
+	dst[31:0] := 0
+FI
+dst[MAX:32] := 0
+	</operation>
+	<instruction name='v4fmaddss' form='xmm {k}, xmm, m128' xed=''/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='AVX-512' vexEq='FALSE' rettype='__m128' name='_mm_4fnmadd_ss'>
+	<type>Floating Point</type>
+	<CPUID>AVX512_4FMAPS</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="a" type="__m128"/>
+	<parameter varname="b0" type="__m128"/>
+	<parameter varname="b1" type="__m128"/>
+	<parameter varname="b2" type="__m128"/>
+	<parameter varname="b3" type="__m128"/>
+	<parameter varname="c" type="__m128 *"/>
+	<description>Multiply the lower single-precision (32-bit) floating-point elements specified in 4 consecutive operands "b0" through "b3" by corresponding element in "c", accumulating  the negated intermediate result with the lower element in "a". Store the result in the lower element of "dst".</description>
+	<operation>
+dst := a
+FOR j := 0 to 3
+	i := j*32
+	dst[31:0] := RoundFPControl_MXCSR(dst[31:0] - b{j}[31:0] * c[i+31:i])
+ENDFOR
+dst[MAX:32] := 0
+	</operation>
+	<instruction name='v4fnmaddss' form='xmm {k}, xmm, m128' xed=''/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='AVX-512' vexEq='FALSE' rettype='__m128' name='_mm_mask_4fnmadd_ss'>
+	<type>Floating Point</type>
+	<CPUID>AVX512_4FMAPS</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="a" type="__m128"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="b0" type="__m128"/>
+	<parameter varname="b1" type="__m128"/>
+	<parameter varname="b2" type="__m128"/>
+	<parameter varname="b3" type="__m128"/>
+	<parameter varname="c" type="__m128 *"/>
+	<description>Multiply the lower single-precision (32-bit) floating-point elements specified in 4 consecutive operands "b0" through "b3" by corresponding element in "c", accumulating  the negated intermediate result with the lower element in "a". Store the result in the lower element of "dst" using writemask "k" (the element is copied from "a" when mask bit 0 is not set).</description>
+	<operation>
+dst := a
+IF k[0]
+	FOR j := 0 to 3
+		i := j*32
+		dst[31:0] := RoundFPControl_MXCSR(dst[31:0] - b{j}[31:0] * c[i+31:i])
+	ENDFOR
+FI
+dst[MAX:32] := 0
+	</operation>
+	<instruction name='v4fnmaddss' form='xmm {k}, xmm, m128' xed=''/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='AVX-512' vexEq='FALSE' rettype='__m128' name='_mm_maskz_4fnmadd_ss'>
+	<type>Floating Point</type>
+	<CPUID>AVX512_4FMAPS</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="a" type="__m128"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="b0" type="__m128"/>
+	<parameter varname="b1" type="__m128"/>
+	<parameter varname="b2" type="__m128"/>
+	<parameter varname="b3" type="__m128"/>
+	<parameter varname="c" type="__m128 *"/>
+	<description>Multiply the lower single-precision (32-bit) floating-point elements specified in 4 consecutive operands "b0" through "b3" by corresponding element in "c", accumulating  the negated intermediate result with the lower element in "a". Store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set).</description>
+	<operation>
+dst := a
+IF k[0]
+	FOR j := 0 to 3
+		i := j*32
+		dst[31:0] := RoundFPControl_MXCSR(dst[31:0] - b{j}[31:0] * c[i+31:i])
+	ENDFOR
+ELSE
+	dst[31:0] := 0
+FI
+dst[MAX:32] := 0
+	</operation>
+	<instruction name='v4fnmaddss' form='xmm {k}, xmm, m128' xed=''/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='AVX-512' vexEq='FALSE' rettype='__m512i' name='_mm512_popcnt_epi32'>
+	<type>Integer</type>
+	<CPUID>AVX512VPOPCNTDQ</CPUID>
+	<category>Bit Manipulation</category>
+	<parameter varname="a" type="__m512i"/>
+	<description>Count the number of logical 1 bits in packed 32-bit integers in "a", and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 15
+	i := j*32
+	dst[i+31:i] := POPCNT(a[i+31:i])
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vpopcntd' form='zmm {k}, zmm' xed=''/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='AVX-512' vexEq='FALSE' rettype='__m512i' name='_mm512_mask_popcnt_epi32'>
+	<type>Integer</type>
+	<CPUID>AVX512VPOPCNTDQ</CPUID>
+	<category>Bit Manipulation</category>
+	<parameter varname="src" type="__m512i"/>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="a" type="__m512i"/>
+	<description>Count the number of logical 1 bits in packed 32-bit integers in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 15
+	i := j*32
+	IF k[j]
+		dst[i+31:i] := POPCNT(a[i+31:i])
+	ELSE
+		dst[i+31:i] := src[i+31:i]
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vpopcntd' form='zmm {k}, zmm' xed=''/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='AVX-512' vexEq='FALSE' rettype='__m512i' name='_mm512_maskz_popcnt_epi32'>
+	<type>Integer</type>
+	<CPUID>AVX512VPOPCNTDQ</CPUID>
+	<category>Bit Manipulation</category>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="a" type="__m512i"/>
+	<description>Count the number of logical 1 bits in packed 32-bit integers in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 15
+	i := j*32
+	IF k[j]
+		dst[i+31:i] := POPCNT(a[i+31:i])
+	ELSE
+		dst[i+31:i] := 0
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vpopcntd' form='zmm {k}, zmm' xed=''/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='AVX-512' vexEq='FALSE' rettype='__m512i' name='_mm512_popcnt_epi64'>
+	<type>Integer</type>
+	<CPUID>AVX512VPOPCNTDQ</CPUID>
+	<category>Bit Manipulation</category>
+	<parameter varname="a" type="__m512i"/>
+	<description>Count the number of logical 1 bits in packed 64-bit integers in "a", and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*64
+	dst[i+63:i] := POPCNT(a[i+63:i])
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vpopcntq' form='zmm {k}, zmm' xed=''/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='AVX-512' vexEq='FALSE' rettype='__m512i' name='_mm512_mask_popcnt_epi64'>
+	<type>Integer</type>
+	<CPUID>AVX512VPOPCNTDQ</CPUID>
+	<category>Bit Manipulation</category>
+	<parameter varname="src" type="__m512i"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m512i"/>
+	<description>Count the number of logical 1 bits in packed 64-bit integers in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*64
+	IF k[j]
+		dst[i+63:i] := POPCNT(a[i+63:i])
+	ELSE
+		dst[i+63:i] := src[i+63:i]
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vpopcntq' form='zmm {k}, zmm' xed=''/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech='AVX-512' vexEq='FALSE' rettype='__m512i' name='_mm512_maskz_popcnt_epi64'>
+	<type>Integer</type>
+	<CPUID>AVX512VPOPCNTDQ</CPUID>
+	<category>Bit Manipulation</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m512i"/>
+	<description>Count the number of logical 1 bits in packed 64-bit integers in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*64
+	IF k[j]
+		dst[i+63:i] := POPCNT(a[i+63:i])
+	ELSE
+		dst[i+63:i] := 0
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name='vpopcntq' form='zmm {k}, zmm' xed=''/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__mmask64" name="_mm512_kunpackd">
+	<type>Mask</type>
+	<CPUID>AVX512BW</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="a" type="__mmask64"/>
+	<parameter varname="b" type="__mmask64"/>
+	<description>Unpack and interleave 32 bits from masks "a" and "b", and store the 64-bit result in "k".</description>
+	<operation>
+k[31:0] := a[31:0]
+k[63:32] := b[31:0]
+k[MAX:64] := 0
+	</operation>
+	<instruction name="kunpckdq"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__mmask32" name="_mm512_kunpackw">
+	<type>Mask</type>
+	<CPUID>AVX512BW</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="a" type="__mmask32"/>
+	<parameter varname="b" type="__mmask32"/>
+	<description>Unpack and interleave 16 bits from masks "a" and "b", and store the 32-bit result in "k".</description>
+	<operation>
+k[15:0] := a[15:0]
+k[31:16] := b[15:0]
+k[MAX:32] := 0
+	</operation>
+	<instruction name="kunpckwd"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256d" name="_mm256_mask_add_pd">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<CPUID>AVX512VL</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="src" type="__m256d"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m256d"/>
+	<parameter varname="b" type="__m256d"/>
+	<description>Add packed double-precision (64-bit) floating-point elements in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 3
+	i := j*64
+	IF k[j]
+		dst[i+63:i] := a[i+63:i] + b[i+63:i]
+	ELSE
+		dst[i+63:i] := src[i+63:i]
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vaddpd"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256d" name="_mm256_maskz_add_pd">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<CPUID>AVX512VL</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m256d"/>
+	<parameter varname="b" type="__m256d"/>
+	<description>Add packed double-precision (64-bit) floating-point elements in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 3
+	i := j*64
+	IF k[j]
+		dst[i+63:i] := a[i+63:i] + b[i+63:i]
+	ELSE
+		dst[i+63:i] := 0
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vaddpd"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128d" name="_mm_mask_add_pd">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<CPUID>AVX512VL</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="src" type="__m128d"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128d"/>
+	<parameter varname="b" type="__m128d"/>
+	<description>Add packed double-precision (64-bit) floating-point elements in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 1
+	i := j*64
+	IF k[j]
+		dst[i+63:i] := a[i+63:i] + b[i+63:i]
+	ELSE
+		dst[i+63:i] := src[i+63:i]
+	FI
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vaddpd"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128d" name="_mm_maskz_add_pd">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<CPUID>AVX512VL</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128d"/>
+	<parameter varname="b" type="__m128d"/>
+	<description>Add packed double-precision (64-bit) floating-point elements in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 1
+	i := j*64
+	IF k[j]
+		dst[i+63:i] := a[i+63:i] + b[i+63:i]
+	ELSE
+		dst[i+63:i] := 0
+	FI
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vaddpd"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256" name="_mm256_mask_add_ps">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<CPUID>AVX512VL</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="src" type="__m256"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m256"/>
+	<parameter varname="b" type="__m256"/>
+	<description>Add packed single-precision (32-bit) floating-point elements in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 7
+	i := j*32
+	IF k[j]
+		dst[i+31:i] := a[i+31:i] + b[i+31:i]
+	ELSE
+		dst[i+31:i] := src[i+31:i]
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vaddps"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256" name="_mm256_maskz_add_ps">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<CPUID>AVX512VL</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m256"/>
+	<parameter varname="b" type="__m256"/>
+	<description>Add packed single-precision (32-bit) floating-point elements in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*32
+	IF k[j]
+		dst[i+31:i] := a[i+31:i] + b[i+31:i]
+	ELSE
+		dst[i+31:i] := 0
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vaddps"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128" name="_mm_mask_add_ps">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<CPUID>AVX512VL</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="src" type="__m128"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128"/>
+	<parameter varname="b" type="__m128"/>
+	<description>Add packed single-precision (32-bit) floating-point elements in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 3
+	i := j*32
+	IF k[j]
+		dst[i+31:i] := a[i+31:i] + b[i+31:i]
+	ELSE
+		dst[i+31:i] := src[i+31:i]
+	FI
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vaddps"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128" name="_mm_maskz_add_ps">
+	<type>Floating Point</type>
+	<CPUID>AVX512F</CPUID>
+	<CPUID>AVX512VL</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128"/>
+	<parameter varname="b" type="__m128"/>
+	<description>Add packed single-precision (32-bit) floating-point elements in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 3
+	i := j*32
+	IF k[j]
+		dst[i+31:i] := a[i+31:i] + b[i+31:i]
+	ELSE
+		dst[i+31:i] := 0
+	FI
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vaddps"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256i" name="_mm256_alignr_epi32">
+	<type>Integer</type>
+	<CPUID>AVX512F</CPUID>
+	<CPUID>AVX512VL</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="a" type="__m256i"/>
+	<parameter varname="b" type="__m256i"/>
+	<parameter varname="count" type="const int"/>
+	<description>Concatenate "a" and "b" into a 64-byte intermediate result, shift the result right by "count" 32-bit elements, and store the low 32 bytes (8 elements) in "dst".</description>
+	<operation>
+temp[511:256] := a[255:0]
+temp[255:0] := b[255:0]
+temp[511:0] := temp[511:0] &gt;&gt; (32*count)
+dst[255:0] := temp[255:0]
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="valignd"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256i" name="_mm256_mask_alignr_epi32">
+	<type>Integer</type>
+	<CPUID>AVX512F</CPUID>
+	<CPUID>AVX512VL</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="src" type="__m256i"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m256i"/>
+	<parameter varname="b" type="__m256i"/>
+	<parameter varname="count" type="const int"/>
+	<description>Concatenate "a" and "b" into a 64-byte intermediate result, shift the result right by "count" 32-bit elements, and store the low 32 bytes (8 elements) in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). </description>
+	<operation>
+temp[511:256] := a[255:0]
+temp[255:0] := b[255:0]
+temp[511:0] := temp[511:0] &gt;&gt; (32*count)
+FOR j := 0 to 7
+	i := j*32
+	IF k[j]
+		dst[i+31:i] := temp[i+31:i]
+	ELSE
+		dst[i+31:i] := src[i+31:i]
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="valignd"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256i" name="_mm256_maskz_alignr_epi32">
+	<type>Integer</type>
+	<CPUID>AVX512F</CPUID>
+	<CPUID>AVX512VL</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m256i"/>
+	<parameter varname="b" type="__m256i"/>
+	<parameter varname="count" type="const int"/>
+	<description>Concatenate "a" and "b" into a 64-byte intermediate result, shift the result right by "count" 32-bit elements, and store the low 32 bytes (8 elements) in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+	<operation>
+temp[511:256] := a[255:0]
+temp[255:0] := b[255:0]
+temp[511:0] := temp[511:0] &gt;&gt; (32*count)
+FOR j := 0 to 7
+	i := j*32
+	IF k[j]
+		dst[i+31:i] := temp[i+31:i]
+	ELSE
+		dst[i+31:i] := 0
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="valignd"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm_alignr_epi32">
+	<type>Integer</type>
+	<CPUID>AVX512F</CPUID>
+	<CPUID>AVX512VL</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="a" type="__m128i"/>
+	<parameter varname="b" type="__m128i"/>
+	<parameter varname="count" type="const int"/>
+	<description>Concatenate "a" and "b" into a 32-byte intermediate result, shift the result right by "count" 32-bit elements, and store the low 16 bytes (4 elements) in "dst".</description>
+	<operation>
+temp[255:128] := a[127:0]
+temp[127:0] := b[127:0]
+temp[255:0] := temp[255:0] &gt;&gt; (32*count)
+dst[127:0] := temp[127:0]
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="valignd"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm_mask_alignr_epi32">
+	<type>Integer</type>
+	<CPUID>AVX512F</CPUID>
+	<CPUID>AVX512VL</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="src" type="__m128i"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128i"/>
+	<parameter varname="b" type="__m128i"/>
+	<parameter varname="count" type="const int"/>
+	<description>Concatenate "a" and "b" into a 32-byte intermediate result, shift the result right by "count" 32-bit elements, and store the low 16 bytes (4 elements) in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). </description>
+	<operation>
+temp[255:128] := a[127:0]
+temp[127:0] := b[127:0]
+temp[255:0] := temp[255:0] &gt;&gt; (32*count)
+FOR j := 0 to 3
+	i := j*32
+	IF k[j]
+		dst[i+31:i] := temp[i+31:i]
+	ELSE
+		dst[i+31:i] := src[i+31:i]
+	FI
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="valignd"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm_maskz_alignr_epi32">
+	<type>Integer</type>
+	<CPUID>AVX512F</CPUID>
+	<CPUID>AVX512VL</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128i"/>
+	<parameter varname="b" type="__m128i"/>
+	<parameter varname="count" type="const int"/>
+	<description>Concatenate "a" and "b" into a 32-byte intermediate result, shift the result right by "count" 32-bit elements, and store the low 16 bytes (4 elements) in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+	<operation>
+temp[255:128] := a[127:0]
+temp[127:0] := b[127:0]
+temp[255:0] := temp[255:0] &gt;&gt; (32*count)
+FOR j := 0 to 3
+	i := j*32
+	IF k[j]
+		dst[i+31:i] := temp[i+31:i]
+	ELSE
+		dst[i+31:i] := 0
+	FI
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="valignd"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256i" name="_mm256_alignr_epi64">
+	<type>Integer</type>
+	<CPUID>AVX512F</CPUID>
+	<CPUID>AVX512VL</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="a" type="__m256i"/>
+	<parameter varname="b" type="__m256i"/>
+	<parameter varname="count" type="const int"/>
+	<description>Concatenate "a" and "b" into a 64-byte intermediate result, shift the result right by "count" 64-bit elements, and store the low 32 bytes (4 elements) in "dst".</description>
+	<operation>
+temp[511:256] := a[255:0]
+temp[255:0] := b[255:0]
+temp[511:0] := temp[511:0] &gt;&gt; (64*count)
+dst[255:0] := temp[255:0]
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="valignq"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256i" name="_mm256_mask_alignr_epi64">
+	<type>Integer</type>
+	<CPUID>AVX512F</CPUID>
+	<CPUID>AVX512VL</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="src" type="__m256i"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m256i"/>
+	<parameter varname="b" type="__m256i"/>
+	<parameter varname="count" type="const int"/>
+	<description>Concatenate "a" and "b" into a 64-byte intermediate result, shift the result right by "count" 64-bit elements, and store the low 32 bytes (4 elements) in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). </description>
+	<operation>
+temp[511:256] := a[255:0]
+temp[255:0] := b[255:0]
+temp[511:0] := temp[511:0] &gt;&gt; (64*count)
+FOR j := 0 to 3
+	i := j*64
+	IF k[j]
+		dst[i+63:i] := temp[i+63:i]
+	ELSE
+		dst[i+63:i] := src[i+63:i]
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="valignq"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256i" name="_mm256_maskz_alignr_epi64">
+	<type>Integer</type>
+	<CPUID>AVX512F</CPUID>
+	<CPUID>AVX512VL</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m256i"/>
+	<parameter varname="b" type="__m256i"/>
+	<parameter varname="count" type="const int"/>
+	<description>Concatenate "a" and "b" into a 64-byte intermediate result, shift the result right by "count" 64-bit elements, and store the low 32 bytes (4 elements) in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+	<operation>
+temp[511:256] := a[255:0]
+temp[255:0] := b[255:0]
+temp[511:0] := temp[511:0] &gt;&gt; (64*count)
+FOR j := 0 to 3
+	i := j*64
+	IF k[j]
+		dst[i+63:i] := temp[i+63:i]
+	ELSE
+		dst[i+63:i] := 0
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="valignq"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm_alignr_epi64">
+	<type>Integer</type>
+	<CPUID>AVX512F</CPUID>
+	<CPUID>AVX512VL</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="a" type="__m128i"/>
+	<parameter varname="b" type="__m128i"/>
+	<parameter varname="count" type="const int"/>
+	<description>Concatenate "a" and "b" into a 32-byte intermediate result, shift the result right by "count" 64-bit elements, and store the low 16 bytes (2 elements) in "dst".</description>
+	<operation>
+temp[255:128] := a[127:0]
+temp[127:0] := b[127:0]
+temp[255:0] := temp[255:0] &gt;&gt; (64*count)
+dst[127:0] := temp[127:0]
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="valignq"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm_mask_alignr_epi64">
+	<type>Integer</type>
+	<CPUID>AVX512F</CPUID>
+	<CPUID>AVX512VL</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="src" type="__m128i"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128i"/>
+	<parameter varname="b" type="__m128i"/>
+	<parameter varname="count" type="const int"/>
+	<description>Concatenate "a" and "b" into a 32-byte intermediate result, shift the result right by "count" 64-bit elements, and store the low 16 bytes (2 elements) in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). </description>
+	<operation>
+temp[255:128] := a[127:0]
+temp[127:0] := b[127:0]
+temp[255:0] := temp[255:0] &gt;&gt; (64*count)
+FOR j := 0 to 1
+	i := j*64
+	IF k[j]
+		dst[i+63:i] := temp[i+63:i]
+	ELSE
+		dst[i+63:i] := src[i+63:i]
+	FI
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="valignq"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm_maskz_alignr_epi64">
+	<type>Integer</type>
+	<CPUID>AVX512F</CPUID>
+	<CPUID>AVX512VL</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128i"/>
+	<parameter varname="b" type="__m128i"/>
+	<parameter varname="count" type="const int"/>
+	<description>Concatenate "a" and "b" into a 32-byte intermediate result, shift the result right by "count" 64-bit elements, and store the low 16 bytes (2 elements) in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+	<operation>
+temp[255:128] := a[127:0]
+temp[127:0] := b[127:0]
+temp[255:0] := temp[255:0] &gt;&gt; (64*count)
+FOR j := 0 to 1
+	i := j*64
+	IF k[j]
+		dst[i+63:i] := temp[i+63:i]
+	ELSE
+		dst[i+63:i] := 0
+	FI
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="valignq"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256d" name="_mm256_mask_andnot_pd">
+	<type>Floating Point</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512DQ</CPUID>
+	<category>Logical</category>
+	<parameter varname="src" type="__m256d"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m256d"/>
+	<parameter varname="b" type="__m256d"/>
+	<description>Compute the bitwise NOT of packed double-precision (64-bit) floating-point elements in "a" and then AND with "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 3
+	i := j*64
+	IF k[j]
+		dst[i+63:i] := ((NOT a[i+63:i]) AND b[i+63:i])
+	ELSE
+		dst[i+63:i] := src[i+63:i]
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vandnpd"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256d" name="_mm256_maskz_andnot_pd">
+	<type>Floating Point</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512DQ</CPUID>
+	<category>Logical</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m256d"/>
+	<parameter varname="b" type="__m256d"/>
+	<description>Compute the bitwise NOT of packed double-precision (64-bit) floating-point elements in "a" and then AND with "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 3
+	i := j*64
+	IF k[j]
+		dst[i+63:i] := ((NOT a[i+63:i]) AND b[i+63:i])
+	ELSE
+		dst[i+63:i] := 0
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vandnpd"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512d" name="_mm512_andnot_pd">
+	<type>Floating Point</type>
+	<CPUID>AVX512DQ</CPUID>
+	<category>Logical</category>
+	<parameter varname="a" type="__m512d"/>
+	<parameter varname="b" type="__m512d"/>
+	<description>Compute the bitwise NOT of packed double-precision (64-bit) floating-point elements in "a" and then AND with "b", and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*64
+	dst[i+63:i] := ((NOT a[i+63:i]) AND b[i+63:i])
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name="vandnpd"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512d" name="_mm512_mask_andnot_pd">
+	<type>Floating Point</type>
+	<CPUID>AVX512DQ</CPUID>
+	<category>Logical</category>
+	<parameter varname="src" type="__m512d"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m512d"/>
+	<parameter varname="b" type="__m512d"/>
+	<description>Compute the bitwise NOT of packed double-precision (64-bit) floating-point elements in "a" and then AND with "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 7
+	i := j*64
+	IF k[j]
+		dst[i+63:i] := ((NOT a[i+63:i]) AND b[i+63:i])
+	ELSE
+		dst[i+63:i] := src[i+63:i]
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name="vandnpd"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512d" name="_mm512_maskz_andnot_pd">
+	<type>Floating Point</type>
+	<CPUID>AVX512DQ</CPUID>
+	<category>Logical</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m512d"/>
+	<parameter varname="b" type="__m512d"/>
+	<description>Compute the bitwise NOT of packed double-precision (64-bit) floating-point elements in "a" and then AND with "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*64
+	IF k[j]
+		dst[i+63:i] := ((NOT a[i+63:i]) AND b[i+63:i])
+	ELSE
+		dst[i+63:i] := 0
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name="vandnpd"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128d" name="_mm_mask_andnot_pd">
+	<type>Floating Point</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512DQ</CPUID>
+	<category>Logical</category>
+	<parameter varname="src" type="__m128d"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128d"/>
+	<parameter varname="b" type="__m128d"/>
+	<description>Compute the bitwise NOT of packed double-precision (64-bit) floating-point elements in "a" and then AND with "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 1
+	i := j*64
+	IF k[j]
+		dst[i+63:i] := ((NOT a[i+63:i]) AND b[i+63:i])
+	ELSE
+		dst[i+63:i] := src[i+63:i]
+	FI
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vandnpd"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128d" name="_mm_maskz_andnot_pd">
+	<type>Floating Point</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512DQ</CPUID>
+	<category>Logical</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128d"/>
+	<parameter varname="b" type="__m128d"/>
+	<description>Compute the bitwise NOT of packed double-precision (64-bit) floating-point elements in "a" and then AND with "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 1
+	i := j*64
+	IF k[j]
+		dst[i+63:i] := ((NOT a[i+63:i]) AND b[i+63:i])
+	ELSE
+		dst[i+63:i] := 0
+	FI
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vandnpd"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256" name="_mm256_mask_andnot_ps">
+	<type>Floating Point</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512DQ</CPUID>
+	<category>Logical</category>
+	<parameter varname="src" type="__m256"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m256"/>
+	<parameter varname="b" type="__m256"/>
+	<description>Compute the bitwise NOT of packed single-precision (32-bit) floating-point elements in "a" and then AND with "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 7
+	i := j*32
+	IF k[j]
+		dst[i+31:i] := ((NOT a[i+31:i]) AND b[i+31:i])
+	ELSE
+		dst[i+31:i] := src[i+31:i]
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vandnps"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256" name="_mm256_maskz_andnot_ps">
+	<type>Floating Point</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512DQ</CPUID>
+	<category>Logical</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m256"/>
+	<parameter varname="b" type="__m256"/>
+	<description>Compute the bitwise NOT of packed single-precision (32-bit) floating-point elements in "a" and then AND with "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*32
+	IF k[j]
+		dst[i+31:i] := ((NOT a[i+31:i]) AND b[i+31:i])
+	ELSE
+		dst[i+31:i] := 0
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vandnps"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512" name="_mm512_andnot_ps">
+	<type>Floating Point</type>
+	<CPUID>AVX512DQ</CPUID>
+	<category>Logical</category>
+	<parameter varname="a" type="__m512"/>
+	<parameter varname="b" type="__m512"/>
+	<description>Compute the bitwise NOT of packed single-precision (32-bit) floating-point elements in "a" and then AND with "b", and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 15
+	i := j*32
+	dst[i+31:i] := ((NOT a[i+31:i]) AND b[i+31:i])
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name="vandnps"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512" name="_mm512_mask_andnot_ps">
+	<type>Floating Point</type>
+	<CPUID>AVX512DQ</CPUID>
+	<category>Logical</category>
+	<parameter varname="src" type="__m512"/>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="a" type="__m512"/>
+	<parameter varname="b" type="__m512"/>
+	<description>Compute the bitwise NOT of packed single-precision (32-bit) floating-point elements in "a" and then AND with "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 15
+	i := j*32
+	IF k[j]
+		dst[i+31:i] := ((NOT a[i+31:i]) AND b[i+31:i])
+	ELSE
+		dst[i+31:i] := src[i+31:i]
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name="vandnps"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512" name="_mm512_maskz_andnot_ps">
+	<type>Floating Point</type>
+	<CPUID>AVX512DQ</CPUID>
+	<category>Logical</category>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="a" type="__m512"/>
+	<parameter varname="b" type="__m512"/>
+	<description>Compute the bitwise NOT of packed single-precision (32-bit) floating-point elements in "a" and then AND with "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 15
+	i := j*32
+	IF k[j]
+		dst[i+31:i] := ((NOT a[i+31:i]) AND b[i+31:i])
+	ELSE
+		dst[i+31:i] := 0
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name="vandnps"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128" name="_mm_mask_andnot_ps">
+	<type>Floating Point</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512DQ</CPUID>
+	<category>Logical</category>
+	<parameter varname="src" type="__m128"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128"/>
+	<parameter varname="b" type="__m128"/>
+	<description>Compute the bitwise NOT of packed single-precision (32-bit) floating-point elements in "a" and then AND with "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 3
+	i := j*32
+	IF k[j]
+		dst[i+31:i] := ((NOT a[i+31:i]) AND b[i+31:i])
+	ELSE
+		dst[i+31:i] := src[i+31:i]
+	FI
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vandnps"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128" name="_mm_maskz_andnot_ps">
+	<type>Floating Point</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512DQ</CPUID>
+	<category>Logical</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128"/>
+	<parameter varname="b" type="__m128"/>
+	<description>Compute the bitwise NOT of packed single-precision (32-bit) floating-point elements in "a" and then AND with "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 3
+	i := j*32
+	IF k[j]
+		dst[i+31:i] := ((NOT a[i+31:i]) AND b[i+31:i])
+	ELSE
+		dst[i+31:i] := 0
+	FI
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vandnps"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256d" name="_mm256_mask_and_pd">
+	<type>Floating Point</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512DQ</CPUID>
+	<category>Logical</category>
+	<parameter varname="src" type="__m256d"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m256d"/>
+	<parameter varname="b" type="__m256d"/>
+	<description>Compute the bitwise AND of packed double-precision (64-bit) floating-point elements in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 3
+	i := j*64
+	IF k[j]
+		dst[i+63:i] := (a[i+63:i] AND b[i+63:i])
+	ELSE
+		dst[i+63:i] := src[i+63:i]
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vandpd"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256d" name="_mm256_maskz_and_pd">
+	<type>Floating Point</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512DQ</CPUID>
+	<category>Logical</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m256d"/>
+	<parameter varname="b" type="__m256d"/>
+	<description>Compute the bitwise AND of packed double-precision (64-bit) floating-point elements in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 3
+	i := j*64
+	IF k[j]
+		dst[i+63:i] := (a[i+63:i] AND b[i+63:i])
+	ELSE
+		dst[i+63:i] := 0 
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vandpd"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512d" name="_mm512_and_pd">
+	<type>Floating Point</type>
+	<CPUID>AVX512DQ</CPUID>
+	<category>Logical</category>
+	<parameter varname="a" type="__m512d"/>
+	<parameter varname="b" type="__m512d"/>
+	<description>Compute the bitwise AND of packed double-precision (64-bit) floating-point elements in "a" and "b", and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*64
+	dst[i+63:i] := (a[i+63:i] AND b[i+63:i])
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name="vandpd"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512d" name="_mm512_mask_and_pd">
+	<type>Floating Point</type>
+	<CPUID>AVX512DQ</CPUID>
+	<category>Logical</category>
+	<parameter varname="src" type="__m512d"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m512d"/>
+	<parameter varname="b" type="__m512d"/>
+	<description>Compute the bitwise AND of packed double-precision (64-bit) floating-point elements in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 7
+	i := j*64
+	IF k[j]
+		dst[i+63:i] := (a[i+63:i] AND b[i+63:i])
+	ELSE
+		dst[i+63:i] := src[i+63:i]
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name="vandpd"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512d" name="_mm512_maskz_and_pd">
+	<type>Floating Point</type>
+	<CPUID>AVX512DQ</CPUID>
+	<category>Logical</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m512d"/>
+	<parameter varname="b" type="__m512d"/>
+	<description>Compute the bitwise AND of packed double-precision (64-bit) floating-point elements in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*64
+	IF k[j]
+		dst[i+63:i] := (a[i+63:i] AND b[i+63:i])
+	ELSE
+		dst[i+63:i] := 0
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name="vandpd"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128d" name="_mm_mask_and_pd">
+	<type>Floating Point</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512DQ</CPUID>
+	<category>Logical</category>
+	<parameter varname="src" type="__m128d"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128d"/>
+	<parameter varname="b" type="__m128d"/>
+	<description>Compute the bitwise AND of packed double-precision (64-bit) floating-point elements in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 1
+	i := j*64
+	IF k[j]
+		dst[i+63:i] := (a[i+63:i] AND b[i+63:i])
+	ELSE
+		dst[i+63:i] := src[i+63:i]
+	FI
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vandpd"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128d" name="_mm_maskz_and_pd">
+	<type>Floating Point</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512DQ</CPUID>
+	<category>Logical</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128d"/>
+	<parameter varname="b" type="__m128d"/>
+	<description>Compute the bitwise AND of packed double-precision (64-bit) floating-point elements in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 1
+	i := j*64
+	IF k[j]
+		dst[i+63:i] := (a[i+63:i] AND b[i+63:i])
+	ELSE
+		dst[i+63:i] := 0
+	FI
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vandpd"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256" name="_mm256_mask_and_ps">
+	<type>Floating Point</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512DQ</CPUID>
+	<category>Logical</category>
+	<parameter varname="src" type="__m256"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m256"/>
+	<parameter varname="b" type="__m256"/>
+	<description>Compute the bitwise AND of packed single-precision (32-bit) floating-point elements in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 7
+	i := j*32
+	IF k[j]
+		dst[i+31:i] := (a[i+31:i] AND b[i+31:i])
+	ELSE
+		dst[i+31:i] := src[i+31:i]
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vandps"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256" name="_mm256_maskz_and_ps">
+	<type>Floating Point</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512DQ</CPUID>
+	<category>Logical</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m256"/>
+	<parameter varname="b" type="__m256"/>
+	<description>Compute the bitwise AND of packed single-precision (32-bit) floating-point elements in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*32
+	IF k[j]
+		dst[i+31:i] := (a[i+31:i] AND b[i+31:i])
+	ELSE
+		dst[i+31:i] := 0
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vandps"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512" name="_mm512_and_ps">
+	<type>Floating Point</type>
+	<CPUID>AVX512DQ</CPUID>
+	<category>Logical</category>
+	<parameter varname="a" type="__m512"/>
+	<parameter varname="b" type="__m512"/>
+	<description>Compute the bitwise AND of packed single-precision (32-bit) floating-point elements in "a" and "b", and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 15
+	i := j*32
+	dst[i+31:i] := (a[i+31:i] AND b[i+31:i])
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name="vandps"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512" name="_mm512_mask_and_ps">
+	<type>Floating Point</type>
+	<CPUID>AVX512DQ</CPUID>
+	<category>Logical</category>
+	<parameter varname="src" type="__m512"/>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="a" type="__m512"/>
+	<parameter varname="b" type="__m512"/>
+	<description>Compute the bitwise AND of packed single-precision (32-bit) floating-point elements in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 15
+	i := j*32
+	IF k[j]
+		dst[i+31:i] := (a[i+31:i] AND b[i+31:i])
+	ELSE
+		dst[i+31:i] := src[i+31:i]
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name="vandps"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512" name="_mm512_maskz_and_ps">
+	<type>Floating Point</type>
+	<CPUID>AVX512DQ</CPUID>
+	<category>Logical</category>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="a" type="__m512"/>
+	<parameter varname="b" type="__m512"/>
+	<description>Compute the bitwise AND of packed single-precision (32-bit) floating-point elements in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 15
+	i := j*32
+	IF k[j]
+		dst[i+31:i] := (a[i+31:i] AND b[i+31:i])
+	ELSE
+		dst[i+31:i] := 0
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name="vandps"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128" name="_mm_mask_and_ps">
+	<type>Floating Point</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512DQ</CPUID>
+	<category>Logical</category>
+	<parameter varname="src" type="__m128"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128"/>
+	<parameter varname="b" type="__m128"/>
+	<description>Compute the bitwise AND of packed single-precision (32-bit) floating-point elements in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 3
+	i := j*32
+	IF k[j]
+		dst[i+31:i] := (a[i+31:i] AND b[i+31:i])
+	ELSE
+		dst[i+31:i] := src[i+31:i]
+	FI
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vandps"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128" name="_mm_maskz_and_ps">
+	<type>Floating Point</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512DQ</CPUID>
+	<category>Logical</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128"/>
+	<parameter varname="b" type="__m128"/>
+	<description>Compute the bitwise AND of packed single-precision (32-bit) floating-point elements in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 3
+	i := j*32
+	IF k[j]
+		dst[i+31:i] := (a[i+31:i] AND b[i+31:i])
+	ELSE
+		dst[i+31:i] := 0
+	FI
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vandps"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256d" name="_mm256_mask_blend_pd">
+	<type>Floating Point</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m256d"/>
+	<parameter varname="b" type="__m256d"/>
+	<description>Blend packed double-precision (64-bit) floating-point elements from "a" and "b" using control mask "k", and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 3
+	i := j*64
+	IF k[j]
+		dst[i+63:i] := b[i+63:i]
+	ELSE
+		dst[i+63:i] := a[i+63:i]
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vblendmpd"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128d" name="_mm_mask_blend_pd">
+	<type>Floating Point</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128d"/>
+	<parameter varname="b" type="__m128d"/>
+	<description>Blend packed double-precision (64-bit) floating-point elements from "a" and "b" using control mask "k", and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 1
+	i := j*64
+	IF k[j]
+		dst[i+63:i] := b[i+63:i]
+	ELSE
+		dst[i+63:i] := a[i+63:i]
+	FI
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vblendmpd"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256" name="_mm256_mask_blend_ps">
+	<type>Floating Point</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m256"/>
+	<parameter varname="b" type="__m256"/>
+	<description>Blend packed single-precision (32-bit) floating-point elements from "a" and "b" using control mask "k", and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*32
+	IF k[j]
+		dst[i+31:i] := b[i+31:i]
+	ELSE
+		dst[i+31:i] := a[i+31:i]
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vblendmps"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128" name="_mm_mask_blend_ps">
+	<type>Floating Point</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128"/>
+	<parameter varname="b" type="__m128"/>
+	<description>Blend packed single-precision (32-bit) floating-point elements from "a" and "b" using control mask "k", and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 3
+	i := j*32
+	IF k[j]
+		dst[i+31:i] := b[i+31:i]
+	ELSE
+		dst[i+31:i] := a[i+31:i]
+	FI
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vblendmps"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256" name="_mm256_broadcast_f32x2">
+	<type>Floating Point</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512DQ</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="a" type="__m128"/>
+	<description>Broadcast the lower 2 packed single-precision (32-bit) floating-point elements from "a" to all elements of "dst".</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*32
+	n := (j mod 2)*32
+	dst[i+31:i] := a[n+31:n]
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vbroadcastf32x2"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256" name="_mm256_mask_broadcast_f32x2">
+	<type>Floating Point</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512DQ</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="src" type="__m256"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128"/>
+	<description>Broadcast the lower 2 packed single-precision (32-bit) floating-point elements from "a" to all elements of "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 7
+	i := j*32
+	n := (j mod 2)*32
+	IF k[j]
+		dst[i+31:i] := a[n+31:n]
+	ELSE
+		dst[i+31:i] := src[i+31:i]
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vbroadcastf32x2"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256" name="_mm256_maskz_broadcast_f32x2">
+	<type>Floating Point</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512DQ</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128"/>
+	<description>Broadcast the lower 2 packed single-precision (32-bit) floating-point elements from "a" to all elements of "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*32
+	n := (j mod 2)*32
+	IF k[j]
+		dst[i+31:i] := a[n+31:n]
+	ELSE
+		dst[i+31:i] := 0
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vbroadcastf32x2"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512" name="_mm512_broadcast_f32x2">
+	<type>Floating Point</type>
+	<CPUID>AVX512DQ</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="a" type="__m128"/>
+	<description>Broadcast the lower 2 packed single-precision (32-bit) floating-point elements from "a" to all elements of "dst".</description>
+	<operation>
+FOR j := 0 to 15
+	i := j*32
+	n := (j mod 2)*32
+	dst[i+31:i] := a[n+31:n]
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name="vbroadcastf32x2"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512" name="_mm512_mask_broadcast_f32x2">
+	<type>Floating Point</type>
+	<CPUID>AVX512DQ</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="src" type="__m512"/>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="a" type="__m128"/>
+	<description>Broadcast the lower 2 packed single-precision (32-bit) floating-point elements from "a" to all elements of "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 15
+	i := j*32
+	n := (j mod 2)*32
+	IF k[j]
+		dst[i+31:i] := a[n+31:n]
+	ELSE
+		dst[i+31:i] := src[i+31:i]
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name="vbroadcastf32x2"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512" name="_mm512_maskz_broadcast_f32x2">
+	<type>Floating Point</type>
+	<CPUID>AVX512DQ</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="a" type="__m128"/>
+	<description>Broadcast the lower 2 packed single-precision (32-bit) floating-point elements from "a" to all elements of "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 15
+	i := j*32
+	n := (j mod 2)*32
+	IF k[j]
+		dst[i+31:i] := a[n+31:n]
+	ELSE
+		dst[i+31:i] := 0
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name="vbroadcastf32x2"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256" name="_mm256_broadcast_f32x4">
+	<type>Floating Point</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="a" type="__m128"/>
+	<description>Broadcast the 4 packed single-precision (32-bit) floating-point elements from "a" to all elements of "dst".</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*32
+	n := (j mod 4)*32
+	dst[i+31:i] := a[n+31:n]
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vbroadcastf32x4"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256" name="_mm256_mask_broadcast_f32x4">
+	<type>Floating Point</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="src" type="__m256"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128"/>
+	<description>Broadcast the 4 packed single-precision (32-bit) floating-point elements from "a" to all elements of "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 7
+	i := j*32
+	n := (j mod 4)*32
+	IF k[j]
+		dst[i+31:i] := a[n+31:n]
+	ELSE
+		dst[i+31:i] := src[i+31:i]
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vbroadcastf32x4"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256" name="_mm256_maskz_broadcast_f32x4">
+	<type>Floating Point</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128"/>
+	<description>Broadcast the 4 packed single-precision (32-bit) floating-point elements from "a" to all elements of "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*32
+	n := (j mod 4)*32
+	IF k[j]
+		dst[i+31:i] := a[n+31:n]
+	ELSE
+		dst[i+31:i] := 0
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vbroadcastf32x4"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512" name="_mm512_broadcast_f32x8">
+	<type>Floating Point</type>
+	<CPUID>AVX512DQ</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="a" type="__m256"/>
+	<description>Broadcast the 8 packed single-precision (32-bit) floating-point elements from "a" to all elements of "dst".</description>
+	<operation>
+FOR j := 0 to 15
+	i := j*32
+	n := (j mod 8)*32
+	dst[i+31:i] := a[n+31:n]
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name="vbroadcastf32x8"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512" name="_mm512_mask_broadcast_f32x8">
+	<type>Floating Point</type>
+	<CPUID>AVX512DQ</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="src" type="__m512"/>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="a" type="__m256"/>
+	<description>Broadcast the 8 packed single-precision (32-bit) floating-point elements from "a" to all elements of "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 15
+	i := j*32
+	n := (j mod 8)*32
+	IF k[j]
+		dst[i+31:i] := a[n+31:n]
+	ELSE
+		dst[i+31:i] := src[i+31:i]
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name="vbroadcastf32x8"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512" name="_mm512_maskz_broadcast_f32x8">
+	<type>Floating Point</type>
+	<CPUID>AVX512DQ</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="a" type="__m256"/>
+	<description>Broadcast the 8 packed single-precision (32-bit) floating-point elements from "a" to all elements of "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 15
+	i := j*32
+	n := (j mod 8)*32
+	IF k[j]
+		dst[i+31:i] := a[n+31:n]
+	ELSE
+		dst[i+31:i] := 0
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name="vbroadcastf32x8"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256d" name="_mm256_broadcast_f64x2">
+	<type>Floating Point</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512DQ</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="a" type="__m128d"/>
+	<description>Broadcast the 2 packed double-precision (64-bit) floating-point elements from "a" to all elements of "dst".</description>
+	<operation>
+FOR j := 0 to 3
+	i := j*64
+	n := (j mod 2)*64
+	dst[i+63:i] := a[n+63:n]
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vbroadcastf64x2"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256d" name="_mm256_mask_broadcast_f64x2">
+	<type>Floating Point</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512DQ</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="src" type="__m256d"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128d"/>
+	<description>Broadcast the 2 packed double-precision (64-bit) floating-point elements from "a" to all elements of "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 3
+	i := j*64
+	n := (j mod 2)*64
+	IF k[j]
+		dst[i+63:i] := a[n+63:n]
+	ELSE
+		dst[i+63:i] := src[i+63:i]
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vbroadcastf64x2"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256d" name="_mm256_maskz_broadcast_f64x2">
+	<type>Floating Point</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512DQ</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128d"/>
+	<description>Broadcast the 2 packed double-precision (64-bit) floating-point elements from "a" to all elements of "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 3
+	i := j*64
+	n := (j mod 2)*64
+	IF k[j]
+		dst[i+63:i] := a[n+63:n]
+	ELSE
+		dst[i+63:i] := 0
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vbroadcastf64x2"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512d" name="_mm512_broadcast_f64x2">
+	<type>Floating Point</type>
+	<CPUID>AVX512DQ</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="a" type="__m128d"/>
+	<description>Broadcast the 2 packed double-precision (64-bit) floating-point elements from "a" to all elements of "dst".</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*64
+	n := (j mod 2)*64
+	dst[i+63:i] := a[n+63:n]
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name="vbroadcastf64x2"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512d" name="_mm512_mask_broadcast_f64x2">
+	<type>Floating Point</type>
+	<CPUID>AVX512DQ</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="src" type="__m512d"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128d"/>
+	<description>Broadcast the 2 packed double-precision (64-bit) floating-point elements from "a" to all elements of "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 7
+	i := j*64
+	n := (j mod 2)*64
+	IF k[j]
+		dst[i+63:i] := a[n+63:n]
+	ELSE
+		dst[i+63:i] := src[i+63:i]
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name="vbroadcastf64x2"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512d" name="_mm512_maskz_broadcast_f64x2">
+	<type>Floating Point</type>
+	<CPUID>AVX512DQ</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128d"/>
+	<description>Broadcast the 2 packed double-precision (64-bit) floating-point elements from "a" to all elements of "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*64
+	n := (j mod 2)*64
+	IF k[j]
+		dst[i+63:i] := a[n+63:n]
+	ELSE
+		dst[i+63:i] := 0
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name="vbroadcastf64x2"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256i" name="_mm256_broadcast_i32x2">
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512DQ</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="a" type="__m128i"/>
+	<description>Broadcast the lower 2 packed 32-bit integers from "a" to all elements of "dst".</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*32
+	n := (j mod 2)*32
+	dst[i+31:i] := a[n+31:n]
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vbroadcasti32x2"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256i" name="_mm256_mask_broadcast_i32x2">
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512DQ</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="src" type="__m256i"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128i"/>
+	<description>Broadcast the lower 2 packed 32-bit integers from "a" to all elements of "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 7
+	i := j*32
+	n := (j mod 2)*32
+	IF k[j]
+		dst[i+31:i] := a[n+31:n]
+	ELSE
+		dst[i+31:i] := src[i+31:i]
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vbroadcasti32x2"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256i" name="_mm256_maskz_broadcast_i32x2">
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512DQ</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128i"/>
+	<description>Broadcast the lower 2 packed 32-bit integers from "a" to all elements of "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*32
+	n := (j mod 2)*32
+	IF k[j]
+		dst[i+31:i] := a[n+31:n]
+	ELSE
+		dst[i+31:i] := 0
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vbroadcasti32x2"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512i" name="_mm512_broadcast_i32x2">
+	<CPUID>AVX512DQ</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="a" type="__m128i"/>
+	<description>Broadcast the lower 2 packed 32-bit integers from "a" to all elements of "dst".</description>
+	<operation>
+FOR j := 0 to 15
+	i := j*32
+	n := (j mod 2)*32
+	dst[i+31:i] := a[n+31:n]
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name="vbroadcasti32x2"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512i" name="_mm512_mask_broadcast_i32x2">
+	<CPUID>AVX512DQ</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="src" type="__m512i"/>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="a" type="__m128i"/>
+	<description>Broadcast the lower 2 packed 32-bit integers from "a" to all elements of "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 15
+	i := j*32
+	n := (j mod 2)*32
+	IF k[j]
+		dst[i+31:i] := a[n+31:n]
+	ELSE
+		dst[i+31:i] := src[i+31:i]
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name="vbroadcasti32x2"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512i" name="_mm512_maskz_broadcast_i32x2">
+	<CPUID>AVX512DQ</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="a" type="__m128i"/>
+	<description>Broadcast the lower 2 packed 32-bit integers from "a" to all elements of "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 15
+	i := j*32
+	n := (j mod 2)*32
+	IF k[j]
+		dst[i+31:i] := a[n+31:n]
+	ELSE
+		dst[i+31:i] := 0
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name="vbroadcasti32x2"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm_broadcast_i32x2">
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512DQ</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="a" type="__m128i"/>
+	<description>Broadcast the lower 2 packed 32-bit integers from "a" to all elements of "dst".</description>
+	<operation>
+FOR j := 0 to 3
+	i := j*32
+	n := (j mod 2)*32
+	dst[i+31:i] := a[n+31:n]
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vbroadcasti32x2"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm_mask_broadcast_i32x2">
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512DQ</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="src" type="__m128i"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128i"/>
+	<description>Broadcast the lower 2 packed 32-bit integers from "a" to all elements of "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 3
+	i := j*32
+	n := (j mod 2)*32
+	IF k[j]
+		dst[i+31:i] := a[n+31:n]
+	ELSE
+		dst[i+31:i] := src[i+31:i]
+	FI
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vbroadcasti32x2"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm_maskz_broadcast_i32x2">
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512DQ</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128i"/>
+	<description>Broadcast the lower 2 packed 32-bit integers from "a" to all elements of "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 3
+	i := j*32
+	n := (j mod 2)*32
+	IF k[j]
+		dst[i+31:i] := a[n+31:n]
+	ELSE
+		dst[i+31:i] := 0
+	FI
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vbroadcasti32x2"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256i" name="_mm256_broadcast_i32x4">
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="a" type="__m128i"/>
+	<description>Broadcast the 4 packed 32-bit integers from "a" to all elements of "dst".</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*32
+	n := (j mod 4)*32
+	dst[i+31:i] := a[n+31:n]
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vbroadcasti32x4"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256i" name="_mm256_mask_broadcast_i32x4">
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="src" type="__m256i"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128i"/>
+	<description>Broadcast the 4 packed 32-bit integers from "a" to all elements of "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 7
+	i := j*32
+	n := (j mod 4)*32
+	IF k[j]
+		dst[i+31:i] := a[n+31:n]
+	ELSE
+		dst[i+31:i] := src[i+31:i]
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vbroadcasti32x4"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256i" name="_mm256_maskz_broadcast_i32x4">
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128i"/>
+	<description>Broadcast the 4 packed 32-bit integers from "a" to all elements of "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*32
+	n := (j mod 4)*32
+	IF k[j]
+		dst[i+31:i] := a[n+31:n]
+	ELSE
+		dst[i+31:i] := 0
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vbroadcasti32x4"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512i" name="_mm512_broadcast_i32x8">
+	<CPUID>AVX512DQ</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="a" type="__m256i"/>
+	<description>Broadcast the 8 packed 32-bit integers from "a" to all elements of "dst".</description>
+	<operation>
+FOR j := 0 to 15
+	i := j*32
+	n := (j mod 8)*32
+	dst[i+31:i] := a[n+31:n]
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name="vbroadcasti32x8"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512i" name="_mm512_mask_broadcast_i32x8">
+	<CPUID>AVX512DQ</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="src" type="__m512i"/>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="a" type="__m256i"/>
+	<description>Broadcast the 8 packed 32-bit integers from "a" to all elements of "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 15
+	i := j*32
+	n := (j mod 8)*32
+	IF k[j]
+		dst[i+31:i] := a[n+31:n]
+	ELSE
+		dst[i+31:i] := src[i+31:i]
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name="vbroadcasti32x8"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512i" name="_mm512_maskz_broadcast_i32x8">
+	<CPUID>AVX512DQ</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="a" type="__m256i"/>
+	<description>Broadcast the 8 packed 32-bit integers from "a" to all elements of "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 15
+	i := j*32
+	n := (j mod 8)*32
+	IF k[j]
+		dst[i+31:i] := a[n+31:n]
+	ELSE
+		dst[i+31:i] := 0
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name="vbroadcasti32x8"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256i" name="_mm256_broadcast_i64x2">
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512DQ</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="a" type="__m128i"/>
+	<description>Broadcast the 2 packed 64-bit integers from "a" to all elements of "dst".</description>
+	<operation>
+FOR j := 0 to 3
+	i := j*64
+	n := (j mod 2)*64
+	dst[i+63:i] := a[n+63:n]
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vbroadcasti64x2"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256i" name="_mm256_mask_broadcast_i64x2">
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512DQ</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="src" type="__m256i"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128i"/>
+	<description>Broadcast the 2 packed 64-bit integers from "a" to all elements of "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 3
+	i := j*64
+	n := (j mod 2)*64
+	IF k[j]
+		dst[i+63:i] := a[n+63:n]
+	ELSE
+		dst[i+63:i] := src[i+63:i]
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vbroadcasti64x2"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256i" name="_mm256_maskz_broadcast_i64x2">
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512DQ</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128i"/>
+	<description>Broadcast the 2 packed 64-bit integers from "a" to all elements of "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 3
+	i := j*64
+	n := (j mod 2)*64
+	IF k[j]
+		dst[i+63:i] := a[n+63:n]
+	ELSE
+		dst[i+63:i] := 0
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vbroadcasti64x2"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512i" name="_mm512_broadcast_i64x2">
+	<CPUID>AVX512DQ</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="a" type="__m128i"/>
+	<description>Broadcast the 2 packed 64-bit integers from "a" to all elements of "dst".</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*64
+	n := (j mod 2)*64
+	dst[i+63:i] := a[n+63:n]
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name="vbroadcasti64x2"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512i" name="_mm512_mask_broadcast_i64x2">
+	<CPUID>AVX512DQ</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="src" type="__m512i"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128i"/>
+	<description>Broadcast the 2 packed 64-bit integers from "a" to all elements of "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 7
+	i := j*64
+	n := (j mod 2)*64
+	IF k[j]
+		dst[i+63:i] := a[n+63:n]
+	ELSE
+		dst[i+63:i] := src[i+63:i]
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name="vbroadcasti64x2"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512i" name="_mm512_maskz_broadcast_i64x2">
+	<CPUID>AVX512DQ</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128i"/>
+	<description>Broadcast the 2 packed 64-bit integers from "a" to all elements of "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*64
+	n := (j mod 2)*64
+	IF k[j]
+		dst[i+63:i] := a[n+63:n]
+	ELSE
+		dst[i+63:i] := 0
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name="vbroadcasti64x2"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256d" name="_mm256_mask_broadcastsd_pd">
+	<type>Floating Point</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="src" type="__m256d"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128d"/>
+	<description>Broadcast the low double-precision (64-bit) floating-point element from "a" to all elements of "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 3
+	i := j*64
+	IF k[j]
+		dst[i+63:i] := a[63:0]
+	ELSE
+		dst[i+63:i] := src[i+63:i]
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vbroadcastsd"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256d" name="_mm256_maskz_broadcastsd_pd">
+	<type>Floating Point</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128d"/>
+	<description>Broadcast the low double-precision (64-bit) floating-point element from "a" to all elements of "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 3
+	i := j*64
+	IF k[j]
+		dst[i+63:i] := a[63:0]
+	ELSE
+		dst[i+63:i] := 0
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vbroadcastsd"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256" name="_mm256_mask_broadcastss_ps">
+	<type>Floating Point</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="src" type="__m256"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128"/>
+	<description>Broadcast the low single-precision (32-bit) floating-point element from "a" to all elements of "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 7
+	i := j*32
+	IF k[j]
+		dst[i+31:i] := a[31:0]
+	ELSE
+		dst[i+31:i] := src[i+31:i]
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vbroadcastss"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256" name="_mm256_maskz_broadcastss_ps">
+	<type>Floating Point</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128"/>
+	<description>Broadcast the low single-precision (32-bit) floating-point element from "a" to all elements of "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*32
+	IF k[j]
+		dst[i+31:i] := a[31:0]
+	ELSE
+		dst[i+31:i] := 0
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vbroadcastss"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128" name="_mm_mask_broadcastss_ps">
+	<type>Floating Point</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="src" type="__m128"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128"/>
+	<description>Broadcast the low single-precision (32-bit) floating-point element from "a" to all elements of "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 3
+	i := j*32
+	IF k[j]
+		dst[i+31:i] := a[31:0]
+	ELSE
+		dst[i+31:i] := src[i+31:i]
+	FI
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vbroadcastss"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128" name="_mm_maskz_broadcastss_ps">
+	<type>Floating Point</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128"/>
+	<description>Broadcast the low single-precision (32-bit) floating-point element from "a" to all elements of "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 3
+	i := j*32
+	IF k[j]
+		dst[i+31:i] := a[31:0]
+	ELSE
+		dst[i+31:i] := 0
+	FI
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vbroadcastss"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__mmask8" name="_mm256_cmp_pd_mask">
+	<type>Floating Point</type>
+	<type>Mask</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Compare</category>
+	<parameter varname="a" type="__m256d"/>
+	<parameter varname="b" type="__m256d"/>
+	<parameter varname="imm8" type="const int"/>
+	<description>Compare packed double-precision (64-bit) floating-point elements in "a" and "b" based on the comparison operand specified by "imm8", and store the results in mask vector "k".</description>
+	<operation>
+CASE (imm8[7:0]) OF
+0: OP := _CMP_EQ_OQ
+1: OP := _CMP_LT_OS
+2: OP := _CMP_LE_OS
+3: OP := _CMP_UNORD_Q 
+4: OP := _CMP_NEQ_UQ
+5: OP := _CMP_NLT_US
+6: OP := _CMP_NLE_US
+7: OP := _CMP_ORD_Q
+8: OP := _CMP_EQ_UQ
+9: OP := _CMP_NGE_US
+10: OP := _CMP_NGT_US
+11: OP := _CMP_FALSE_OQ
+12: OP := _CMP_NEQ_OQ
+13: OP := _CMP_GE_OS
+14: OP := _CMP_GT_OS
+15: OP := _CMP_TRUE_UQ
+16: OP := _CMP_EQ_OS
+17: OP := _CMP_LT_OQ
+18: OP := _CMP_LE_OQ
+19: OP := _CMP_UNORD_S
+20: OP := _CMP_NEQ_US
+21: OP := _CMP_NLT_UQ
+22: OP := _CMP_NLE_UQ
+23: OP := _CMP_ORD_S
+24: OP := _CMP_EQ_US
+25: OP := _CMP_NGE_UQ 
+26: OP := _CMP_NGT_UQ 
+27: OP := _CMP_FALSE_OS 
+28: OP := _CMP_NEQ_OS 
+29: OP := _CMP_GE_OQ
+30: OP := _CMP_GT_OQ
+31: OP := _CMP_TRUE_US
+ESAC
+FOR j := 0 to 3
+	i := j*64
+	k[j] := (a[i+63:i] OP b[i+63:i]) ? 1 : 0
+ENDFOR
+k[MAX:4] := 0
+	</operation>
+	<instruction name="vcmppd"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__mmask8" name="_mm256_mask_cmp_pd_mask">
+	<type>Floating Point</type>
+	<type>Mask</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Compare</category>
+	<parameter varname="k1" type="__mmask8"/>
+	<parameter varname="a" type="__m256d"/>
+	<parameter varname="b" type="__m256d"/>
+	<parameter varname="imm8" type="const int"/>
+	<description>Compare packed double-precision (64-bit) floating-point elements in "a" and "b" based on the comparison operand specified by "imm8", and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). </description>
+	<operation>
+CASE (imm8[7:0]) OF
+0: OP := _CMP_EQ_OQ
+1: OP := _CMP_LT_OS
+2: OP := _CMP_LE_OS
+3: OP := _CMP_UNORD_Q 
+4: OP := _CMP_NEQ_UQ
+5: OP := _CMP_NLT_US
+6: OP := _CMP_NLE_US
+7: OP := _CMP_ORD_Q
+8: OP := _CMP_EQ_UQ
+9: OP := _CMP_NGE_US
+10: OP := _CMP_NGT_US
+11: OP := _CMP_FALSE_OQ
+12: OP := _CMP_NEQ_OQ
+13: OP := _CMP_GE_OS
+14: OP := _CMP_GT_OS
+15: OP := _CMP_TRUE_UQ
+16: OP := _CMP_EQ_OS
+17: OP := _CMP_LT_OQ
+18: OP := _CMP_LE_OQ
+19: OP := _CMP_UNORD_S
+20: OP := _CMP_NEQ_US
+21: OP := _CMP_NLT_UQ
+22: OP := _CMP_NLE_UQ
+23: OP := _CMP_ORD_S
+24: OP := _CMP_EQ_US
+25: OP := _CMP_NGE_UQ 
+26: OP := _CMP_NGT_UQ 
+27: OP := _CMP_FALSE_OS 
+28: OP := _CMP_NEQ_OS 
+29: OP := _CMP_GE_OQ
+30: OP := _CMP_GT_OQ
+31: OP := _CMP_TRUE_US
+ESAC
+FOR j := 0 to 3
+	i := j*64
+	IF k1[j]
+		k[j] := ( a[i+63:i] OP b[i+63:i] ) ? 1 : 0
+	ELSE 
+		k[j] := 0
+	FI
+ENDFOR
+k[MAX:4] := 0
+	</operation>
+	<instruction name="vcmppd"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__mmask8" name="_mm_cmp_pd_mask">
+	<type>Floating Point</type>
+	<type>Mask</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Compare</category>
+	<parameter varname="a" type="__m128d"/>
+	<parameter varname="b" type="__m128d"/>
+	<parameter varname="imm8" type="const int"/>
+	<description>Compare packed double-precision (64-bit) floating-point elements in "a" and "b" based on the comparison operand specified by "imm8", and store the results in mask vector "k".</description>
+	<operation>
+CASE (imm8[7:0]) OF
+0: OP := _CMP_EQ_OQ
+1: OP := _CMP_LT_OS
+2: OP := _CMP_LE_OS
+3: OP := _CMP_UNORD_Q 
+4: OP := _CMP_NEQ_UQ
+5: OP := _CMP_NLT_US
+6: OP := _CMP_NLE_US
+7: OP := _CMP_ORD_Q
+8: OP := _CMP_EQ_UQ
+9: OP := _CMP_NGE_US
+10: OP := _CMP_NGT_US
+11: OP := _CMP_FALSE_OQ
+12: OP := _CMP_NEQ_OQ
+13: OP := _CMP_GE_OS
+14: OP := _CMP_GT_OS
+15: OP := _CMP_TRUE_UQ
+16: OP := _CMP_EQ_OS
+17: OP := _CMP_LT_OQ
+18: OP := _CMP_LE_OQ
+19: OP := _CMP_UNORD_S
+20: OP := _CMP_NEQ_US
+21: OP := _CMP_NLT_UQ
+22: OP := _CMP_NLE_UQ
+23: OP := _CMP_ORD_S
+24: OP := _CMP_EQ_US
+25: OP := _CMP_NGE_UQ 
+26: OP := _CMP_NGT_UQ 
+27: OP := _CMP_FALSE_OS 
+28: OP := _CMP_NEQ_OS 
+29: OP := _CMP_GE_OQ
+30: OP := _CMP_GT_OQ
+31: OP := _CMP_TRUE_US
+ESAC
+FOR j := 0 to 1
+	i := j*64
+	k[j] := ( a[i+63:i] OP b[i+63:i] ) ? 1 : 0
+ENDFOR
+k[MAX:2] := 0
+	</operation>
+	<instruction name="vcmppd"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__mmask8" name="_mm_mask_cmp_pd_mask">
+	<type>Floating Point</type>
+	<type>Mask</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Compare</category>
+	<parameter varname="k1" type="__mmask8"/>
+	<parameter varname="a" type="__m128d"/>
+	<parameter varname="b" type="__m128d"/>
+	<parameter varname="imm8" type="const int"/>
+	<description>Compare packed double-precision (64-bit) floating-point elements in "a" and "b" based on the comparison operand specified by "imm8", and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). </description>
+	<operation>
+CASE (imm8[7:0]) OF
+0: OP := _CMP_EQ_OQ
+1: OP := _CMP_LT_OS
+2: OP := _CMP_LE_OS
+3: OP := _CMP_UNORD_Q 
+4: OP := _CMP_NEQ_UQ
+5: OP := _CMP_NLT_US
+6: OP := _CMP_NLE_US
+7: OP := _CMP_ORD_Q
+8: OP := _CMP_EQ_UQ
+9: OP := _CMP_NGE_US
+10: OP := _CMP_NGT_US
+11: OP := _CMP_FALSE_OQ
+12: OP := _CMP_NEQ_OQ
+13: OP := _CMP_GE_OS
+14: OP := _CMP_GT_OS
+15: OP := _CMP_TRUE_UQ
+16: OP := _CMP_EQ_OS
+17: OP := _CMP_LT_OQ
+18: OP := _CMP_LE_OQ
+19: OP := _CMP_UNORD_S
+20: OP := _CMP_NEQ_US
+21: OP := _CMP_NLT_UQ
+22: OP := _CMP_NLE_UQ
+23: OP := _CMP_ORD_S
+24: OP := _CMP_EQ_US
+25: OP := _CMP_NGE_UQ 
+26: OP := _CMP_NGT_UQ 
+27: OP := _CMP_FALSE_OS 
+28: OP := _CMP_NEQ_OS 
+29: OP := _CMP_GE_OQ
+30: OP := _CMP_GT_OQ
+31: OP := _CMP_TRUE_US
+ESAC
+FOR j := 0 to 1
+	i := j*64
+	IF k1[j]
+		k[j] := ( a[i+63:i] OP b[i+63:i] ) ? 1 : 0
+	ELSE 
+		k[j] := 0
+	FI
+ENDFOR
+k[MAX:2] := 0
+	</operation>
+	<instruction name="vcmppd"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__mmask8" name="_mm256_cmp_ps_mask">
+	<type>Floating Point</type>
+	<type>Mask</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Compare</category>
+	<parameter varname="a" type="__m256"/>
+	<parameter varname="b" type="__m256"/>
+	<parameter varname="imm8" type="const int"/>
+	<description>Compare packed single-precision (32-bit) floating-point elements in "a" and "b" based on the comparison operand specified by "imm8", and store the results in mask vector "k".</description>
+	<operation>
+CASE (imm8[7:0]) OF
+0: OP := _CMP_EQ_OQ
+1: OP := _CMP_LT_OS
+2: OP := _CMP_LE_OS
+3: OP := _CMP_UNORD_Q 
+4: OP := _CMP_NEQ_UQ
+5: OP := _CMP_NLT_US
+6: OP := _CMP_NLE_US
+7: OP := _CMP_ORD_Q
+8: OP := _CMP_EQ_UQ
+9: OP := _CMP_NGE_US
+10: OP := _CMP_NGT_US
+11: OP := _CMP_FALSE_OQ
+12: OP := _CMP_NEQ_OQ
+13: OP := _CMP_GE_OS
+14: OP := _CMP_GT_OS
+15: OP := _CMP_TRUE_UQ
+16: OP := _CMP_EQ_OS
+17: OP := _CMP_LT_OQ
+18: OP := _CMP_LE_OQ
+19: OP := _CMP_UNORD_S
+20: OP := _CMP_NEQ_US
+21: OP := _CMP_NLT_UQ
+22: OP := _CMP_NLE_UQ
+23: OP := _CMP_ORD_S
+24: OP := _CMP_EQ_US
+25: OP := _CMP_NGE_UQ 
+26: OP := _CMP_NGT_UQ 
+27: OP := _CMP_FALSE_OS 
+28: OP := _CMP_NEQ_OS 
+29: OP := _CMP_GE_OQ
+30: OP := _CMP_GT_OQ
+31: OP := _CMP_TRUE_US
+ESAC
+FOR j := 0 to 7
+	i := j*32
+	k[j] := (a[i+31:i] OP b[i+31:i]) ? 1 : 0
+ENDFOR
+k[MAX:8] := 0
+	</operation>
+	<instruction name="vcmpps"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__mmask8" name="_mm256_mask_cmp_ps_mask">
+	<type>Floating Point</type>
+	<type>Mask</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Compare</category>
+	<parameter varname="k1" type="__mmask8"/>
+	<parameter varname="a" type="__m256"/>
+	<parameter varname="b" type="__m256"/>
+	<parameter varname="imm8" type="const int"/>
+	<description>Compare packed single-precision (32-bit) floating-point elements in "a" and "b" based on the comparison operand specified by "imm8", and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). </description>
+	<operation>
+CASE (imm8[7:0]) OF
+0: OP := _CMP_EQ_OQ
+1: OP := _CMP_LT_OS
+2: OP := _CMP_LE_OS
+3: OP := _CMP_UNORD_Q 
+4: OP := _CMP_NEQ_UQ
+5: OP := _CMP_NLT_US
+6: OP := _CMP_NLE_US
+7: OP := _CMP_ORD_Q
+8: OP := _CMP_EQ_UQ
+9: OP := _CMP_NGE_US
+10: OP := _CMP_NGT_US
+11: OP := _CMP_FALSE_OQ
+12: OP := _CMP_NEQ_OQ
+13: OP := _CMP_GE_OS
+14: OP := _CMP_GT_OS
+15: OP := _CMP_TRUE_UQ
+16: OP := _CMP_EQ_OS
+17: OP := _CMP_LT_OQ
+18: OP := _CMP_LE_OQ
+19: OP := _CMP_UNORD_S
+20: OP := _CMP_NEQ_US
+21: OP := _CMP_NLT_UQ
+22: OP := _CMP_NLE_UQ
+23: OP := _CMP_ORD_S
+24: OP := _CMP_EQ_US
+25: OP := _CMP_NGE_UQ 
+26: OP := _CMP_NGT_UQ 
+27: OP := _CMP_FALSE_OS 
+28: OP := _CMP_NEQ_OS 
+29: OP := _CMP_GE_OQ
+30: OP := _CMP_GT_OQ
+31: OP := _CMP_TRUE_US
+ESAC
+FOR j := 0 to 7
+	i := j*32
+	IF k1[j]
+		k[j] := ( a[i+31:i] OP b[i+31:i] ) ? 1 : 0
+	ELSE 
+		k[j] := 0
+	FI
+ENDFOR
+k[MAX:8] := 0
+	</operation>
+	<instruction name="vcmpps"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__mmask8" name="_mm_cmp_ps_mask">
+	<type>Floating Point</type>
+	<type>Mask</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Compare</category>
+	<parameter varname="a" type="__m128"/>
+	<parameter varname="b" type="__m128"/>
+	<parameter varname="imm8" type="const int"/>
+	<description>Compare packed single-precision (32-bit) floating-point elements in "a" and "b" based on the comparison operand specified by "imm8", and store the results in mask vector "k".</description>
+	<operation>
+CASE (imm8[7:0]) OF
+0: OP := _CMP_EQ_OQ
+1: OP := _CMP_LT_OS
+2: OP := _CMP_LE_OS
+3: OP := _CMP_UNORD_Q 
+4: OP := _CMP_NEQ_UQ
+5: OP := _CMP_NLT_US
+6: OP := _CMP_NLE_US
+7: OP := _CMP_ORD_Q
+8: OP := _CMP_EQ_UQ
+9: OP := _CMP_NGE_US
+10: OP := _CMP_NGT_US
+11: OP := _CMP_FALSE_OQ
+12: OP := _CMP_NEQ_OQ
+13: OP := _CMP_GE_OS
+14: OP := _CMP_GT_OS
+15: OP := _CMP_TRUE_UQ
+16: OP := _CMP_EQ_OS
+17: OP := _CMP_LT_OQ
+18: OP := _CMP_LE_OQ
+19: OP := _CMP_UNORD_S
+20: OP := _CMP_NEQ_US
+21: OP := _CMP_NLT_UQ
+22: OP := _CMP_NLE_UQ
+23: OP := _CMP_ORD_S
+24: OP := _CMP_EQ_US
+25: OP := _CMP_NGE_UQ 
+26: OP := _CMP_NGT_UQ 
+27: OP := _CMP_FALSE_OS 
+28: OP := _CMP_NEQ_OS 
+29: OP := _CMP_GE_OQ
+30: OP := _CMP_GT_OQ
+31: OP := _CMP_TRUE_US
+ESAC
+FOR j := 0 to 3
+	i := j*32
+	k[j] := ( a[i+31:i] OP b[i+31:i] ) ? 1 : 0
+ENDFOR
+k[MAX:4] := 0
+	</operation>
+	<instruction name="vcmpps"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__mmask8" name="_mm_mask_cmp_ps_mask">
+	<type>Floating Point</type>
+	<type>Mask</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Compare</category>
+	<parameter varname="k1" type="__mmask8"/>
+	<parameter varname="a" type="__m128"/>
+	<parameter varname="b" type="__m128"/>
+	<parameter varname="imm8" type="const int"/>
+	<description>Compare packed single-precision (32-bit) floating-point elements in "a" and "b" based on the comparison operand specified by "imm8", and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). </description>
+	<operation>
+CASE (imm8[7:0]) OF
+0: OP := _CMP_EQ_OQ
+1: OP := _CMP_LT_OS
+2: OP := _CMP_LE_OS
+3: OP := _CMP_UNORD_Q 
+4: OP := _CMP_NEQ_UQ
+5: OP := _CMP_NLT_US
+6: OP := _CMP_NLE_US
+7: OP := _CMP_ORD_Q
+8: OP := _CMP_EQ_UQ
+9: OP := _CMP_NGE_US
+10: OP := _CMP_NGT_US
+11: OP := _CMP_FALSE_OQ
+12: OP := _CMP_NEQ_OQ
+13: OP := _CMP_GE_OS
+14: OP := _CMP_GT_OS
+15: OP := _CMP_TRUE_UQ
+16: OP := _CMP_EQ_OS
+17: OP := _CMP_LT_OQ
+18: OP := _CMP_LE_OQ
+19: OP := _CMP_UNORD_S
+20: OP := _CMP_NEQ_US
+21: OP := _CMP_NLT_UQ
+22: OP := _CMP_NLE_UQ
+23: OP := _CMP_ORD_S
+24: OP := _CMP_EQ_US
+25: OP := _CMP_NGE_UQ 
+26: OP := _CMP_NGT_UQ 
+27: OP := _CMP_FALSE_OS 
+28: OP := _CMP_NEQ_OS 
+29: OP := _CMP_GE_OQ
+30: OP := _CMP_GT_OQ
+31: OP := _CMP_TRUE_US
+ESAC
+FOR j := 0 to 3
+	i := j*32
+	IF k1[j]
+		k[j] := ( a[i+31:i] OP b[i+31:i] ) ? 1 : 0
+	ELSE 
+		k[j] := 0
+	FI
+ENDFOR
+k[MAX:4] := 0
+	</operation>
+	<instruction name="vcmpps"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256d" name="_mm256_mask_compress_pd">
+	<type>Floating Point</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="src" type="__m256d"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m256d"/>
+	<description>Contiguously store the active double-precision (64-bit) floating-point elements in "a" (those with their respective bit set in writemask "k") to "dst", and pass through the remaining elements from "src".</description>
+	<operation>
+size := 64
+m := 0
+FOR j := 0 to 3
+	i := j*64
+	IF k[j]
+		dst[m+size-1:m] := a[i+63:i]
+		m := m + size
+	FI
+ENDFOR
+dst[255:m] := src[255:m]
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vcompresspd"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="void" name="_mm256_mask_compressstoreu_pd">
+	<type>Floating Point</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Store</category>
+	<parameter varname="base_addr" type="void*"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m256d"/>
+	<description>Contiguously store the active double-precision (64-bit) floating-point elements in "a" (those with their respective bit set in writemask "k") to unaligned memory at "base_addr".</description>
+	<operation>
+size := 64
+m := base_addr
+FOR j := 0 to 3
+	i := j*64
+	IF k[j]
+		MEM[m+size-1:m] := a[i+63:i]
+		m := m + size
+	FI
+ENDFOR
+	</operation>
+	<instruction name="vcompresspd"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256d" name="_mm256_maskz_compress_pd">
+	<type>Floating Point</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m256d"/>
+	<description>Contiguously store the active double-precision (64-bit) floating-point elements in "a" (those with their respective bit set in zeromask "k") to "dst", and set the remaining elements to zero.</description>
+	<operation>
+size := 64
+m := 0
+FOR j := 0 to 3
+	i := j*64
+	IF k[j]
+		dst[m+size-1:m] := a[i+63:i]
+		m := m + size
+	FI
+ENDFOR
+dst[255:m] := 0
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vcompresspd"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128d" name="_mm_mask_compress_pd">
+	<type>Floating Point</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="src" type="__m128d"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128d"/>
+	<description>Contiguously store the active double-precision (64-bit) floating-point elements in "a" (those with their respective bit set in writemask "k") to "dst", and pass through the remaining elements from "src".</description>
+	<operation>
+size := 64
+m := 0
+FOR j := 0 to 1
+	i := j*64
+	IF k[j]
+		dst[m+size-1:m] := a[i+63:i]
+		m := m + size
+	FI
+ENDFOR
+dst[127:m] := src[127:m]
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vcompresspd"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="void" name="_mm_mask_compressstoreu_pd">
+	<type>Floating Point</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Store</category>
+	<parameter varname="base_addr" type="void*"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128d"/>
+	<description>Contiguously store the active double-precision (64-bit) floating-point elements in "a" (those with their respective bit set in writemask "k") to unaligned memory at "base_addr".</description>
+	<operation>
+size := 64
+m := base_addr
+FOR j := 0 to 1
+	i := j*64
+	IF k[j]
+		MEM[m+size-1:m] := a[i+63:i]
+		m := m + size
+	FI
+ENDFOR
+	</operation>
+	<instruction name="vcompresspd"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128d" name="_mm_maskz_compress_pd">
+	<type>Floating Point</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128d"/>
+	<description>Contiguously store the active double-precision (64-bit) floating-point elements in "a" (those with their respective bit set in zeromask "k") to "dst", and set the remaining elements to zero.</description>
+	<operation>
+size := 64
+m := 0
+FOR j := 0 to 1
+	i := j*64
+	IF k[j]
+		dst[m+size-1:m] := a[i+63:i]
+		m := m + size
+	FI
+ENDFOR
+dst[127:m] := 0
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vcompresspd"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256" name="_mm256_mask_compress_ps">
+	<type>Floating Point</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="src" type="__m256"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m256"/>
+	<description>Contiguously store the active single-precision (32-bit) floating-point elements in "a" (those with their respective bit set in writemask "k") to "dst", and pass through the remaining elements from "src".</description>
+	<operation>
+size := 32
+m := 0
+FOR j := 0 to 7
+	i := j*32
+	IF k[j]
+		dst[m+size-1:m] := a[i+31:i]
+		m := m + size
+	FI
+ENDFOR
+dst[255:m] := src[255:m]
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vcompressps"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="void" name="_mm256_mask_compressstoreu_ps">
+	<type>Floating Point</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Store</category>
+	<parameter varname="base_addr" type="void*"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m256"/>
+	<description>Contiguously store the active single-precision (32-bit) floating-point elements in "a" (those with their respective bit set in writemask "k") to unaligned memory at "base_addr".</description>
+	<operation>
+size := 32
+m := base_addr
+FOR j := 0 to 7
+	i := j*32
+	IF k[j]
+		MEM[m+size-1:m] := a[i+31:i]
+		m := m + size
+	FI
+ENDFOR
+	</operation>
+	<instruction name="vcompressps"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256" name="_mm256_maskz_compress_ps">
+	<type>Floating Point</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m256"/>
+	<description>Contiguously store the active single-precision (32-bit) floating-point elements in "a" (those with their respective bit set in zeromask "k") to "dst", and set the remaining elements to zero.</description>
+	<operation>
+size := 32
+m := 0
+FOR j := 0 to 7
+	i := j*32
+	IF k[j]
+		dst[m+size-1:m] := a[i+31:i]
+		m := m + size
+	FI
+ENDFOR
+dst[255:m] := 0
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vcompressps"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128" name="_mm_mask_compress_ps">
+	<type>Floating Point</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="src" type="__m128"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128"/>
+	<description>Contiguously store the active single-precision (32-bit) floating-point elements in "a" (those with their respective bit set in writemask "k") to "dst", and pass through the remaining elements from "src".</description>
+	<operation>
+size := 32
+m := 0
+FOR j := 0 to 3
+	i := j*32
+	IF k[j]
+		dst[m+size-1:m] := a[i+31:i]
+		m := m + size
+	FI
+ENDFOR
+dst[127:m] := src[127:m]
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vcompressps"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="void" name="_mm_mask_compressstoreu_ps">
+	<type>Floating Point</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Store</category>
+	<parameter varname="base_addr" type="void*"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128"/>
+	<description>Contiguously store the active single-precision (32-bit) floating-point elements in "a" (those with their respective bit set in writemask "k") to unaligned memory at "base_addr".</description>
+	<operation>
+size := 32
+m := base_addr
+FOR j := 0 to 3
+	i := j*32
+	IF k[j]
+		MEM[m+size-1:m] := a[i+31:i]
+		m := m + size
+	FI
+ENDFOR
+	</operation>
+	<instruction name="vcompressps"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128" name="_mm_maskz_compress_ps">
+	<type>Floating Point</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128"/>
+	<description>Contiguously store the active single-precision (32-bit) floating-point elements in "a" (those with their respective bit set in zeromask "k") to "dst", and set the remaining elements to zero.</description>
+	<operation>
+size := 32
+m := 0
+FOR j := 0 to 3
+	i := j*32
+	IF k[j]
+		dst[m+size-1:m] := a[i+31:i]
+		m := m + size
+	FI
+ENDFOR
+dst[127:m] := 0
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vcompressps"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256d" name="_mm256_mask_cvtepi32_pd">
+	<type>Floating Point</type>
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Convert</category>
+	<parameter varname="src" type="__m256d"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128i"/>
+	<description>Convert packed 32-bit integers in "a" to packed double-precision (64-bit) floating-point elements, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 3
+	i := j*32
+	m := j*64
+	IF k[j]
+		dst[m+63:m] := Convert_Int32_To_FP64(a[i+31:i])
+	ELSE
+		dst[m+63:m] := src[m+63:m]
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vcvtdq2pd"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256d" name="_mm256_maskz_cvtepi32_pd">
+	<type>Floating Point</type>
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Convert</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128i"/>
+	<description>Convert packed 32-bit integers in "a" to packed double-precision (64-bit) floating-point elements, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 3
+	i := j*32
+	m := j*64
+	IF k[j]
+		dst[m+63:m] := Convert_Int32_To_FP64(a[i+31:i])
+	ELSE
+		dst[m+63:m] := 0
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vcvtdq2pd"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128d" name="_mm_mask_cvtepi32_pd">
+	<type>Floating Point</type>
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Convert</category>
+	<parameter varname="src" type="__m128d"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128i"/>
+	<description>Convert packed 32-bit integers in "a" to packed double-precision (64-bit) floating-point elements, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 1
+	i := j*32
+	m := j*64
+	IF k[j]
+		dst[m+63:m] := Convert_Int32_To_FP64(a[i+31:i])
+	ELSE
+		dst[m+63:m] := src[m+63:m]
+	FI
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vcvtdq2pd"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128d" name="_mm_maskz_cvtepi32_pd">
+	<type>Floating Point</type>
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Convert</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128i"/>
+	<description>Convert packed 32-bit integers in "a" to packed double-precision (64-bit) floating-point elements, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 1
+	i := j*32
+	m := j*64
+	IF k[j]
+		dst[m+63:m] := Convert_Int32_To_FP64(a[i+31:i])
+	ELSE
+		dst[m+63:m] := 0
+	FI
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vcvtdq2pd"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256" name="_mm256_mask_cvtepi32_ps">
+	<type>Floating Point</type>
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Convert</category>
+	<parameter varname="src" type="__m256"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m256i"/>
+	<description>Convert packed 32-bit integers in "a" to packed single-precision (32-bit) floating-point elements, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 7
+	i := j*32
+	IF k[j]
+		dst[i+31:i] := Convert_Int32_To_FP32(a[i+31:i])
+	ELSE
+		dst[i+31:i] := src[i+31:i]
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vcvtdq2ps"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256" name="_mm256_maskz_cvtepi32_ps">
+	<type>Floating Point</type>
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Convert</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m256i"/>
+	<description>Convert packed 32-bit integers in "a" to packed single-precision (32-bit) floating-point elements, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 7
+	i := 32*j
+	IF k[j]
+		dst[i+31:i] := Convert_Int32_To_FP32(a[i+31:i])
+	ELSE
+		dst[i+31:i] := 0
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vcvtdq2ps"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128" name="_mm_mask_cvtepi32_ps">
+	<type>Floating Point</type>
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Convert</category>
+	<parameter varname="src" type="__m128"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128i"/>
+	<description>Convert packed 32-bit integers in "a" to packed single-precision (32-bit) floating-point elements, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 3
+	i := j*32
+	IF k[j]
+		dst[i+31:i] := Convert_Int32_To_FP32(a[i+31:i])
+	ELSE
+		dst[i+31:i] := src[i+31:i]
+	FI
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vcvtdq2ps"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128" name="_mm_maskz_cvtepi32_ps">
+	<type>Floating Point</type>
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Convert</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128i"/>
+	<description>Convert packed 32-bit integers in "a" to packed single-precision (32-bit) floating-point elements, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 3
+	i := 32*j
+	IF k[j]
+		dst[i+31:i] := Convert_Int32_To_FP32(a[i+31:i])
+	ELSE
+		dst[i+31:i] := 0
+	FI
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vcvtdq2ps"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm256_mask_cvtpd_epi32">
+	<type>Floating Point</type>
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Convert</category>
+	<parameter varname="src" type="__m128i"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m256d"/>
+	<description>Convert packed double-precision (64-bit) floating-point elements in "a" to packed 32-bit integers, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 3
+	i := j*32
+	l := j*64
+	IF k[j]
+		dst[i+31:i] := Convert_FP64_To_Int32(a[l+63:l])
+	ELSE
+		dst[i+31:i] := src[i+31:i]
+	FI
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vcvtpd2dq"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm256_maskz_cvtpd_epi32">
+	<type>Floating Point</type>
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Convert</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m256d"/>
+	<description>Convert packed double-precision (64-bit) floating-point elements in "a" to packed 32-bit integers, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 3
+	i := 32*j
+	l := 64*j
+	IF k[j]
+		dst[i+31:i] := Convert_FP64_To_Int32(a[l+63:l])
+	ELSE
+		dst[i+31:i] := 0
+	FI
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vcvtpd2dq"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm_mask_cvtpd_epi32">
+	<type>Floating Point</type>
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Convert</category>
+	<parameter varname="src" type="__m128i"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128d"/>
+	<description>Convert packed double-precision (64-bit) floating-point elements in "a" to packed 32-bit integers, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 1
+	i := j*32
+	l := j*64
+	IF k[j]
+		dst[i+31:i] := Convert_FP64_To_Int32(a[l+63:l])
+	ELSE
+		dst[i+31:i] := src[i+31:i]
+	FI
+ENDFOR
+dst[MAX:64] := 0
+	</operation>
+	<instruction name="vcvtpd2dq"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm_maskz_cvtpd_epi32">
+	<type>Floating Point</type>
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Convert</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128d"/>
+	<description>Convert packed double-precision (64-bit) floating-point elements in "a" to packed 32-bit integers, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 1
+	i := 32*j
+	l := 64*j
+	IF k[j]
+		dst[i+31:i] := Convert_FP64_To_Int32(a[l+63:l])
+	ELSE
+		dst[i+31:i] := 0
+	FI
+ENDFOR
+dst[MAX:64] := 0
+	</operation>
+	<instruction name="vcvtpd2dq"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128" name="_mm256_mask_cvtpd_ps">
+	<type>Floating Point</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Convert</category>
+	<parameter varname="src" type="__m128"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m256d"/>
+	<description>Convert packed double-precision (64-bit) floating-point elements in "a" to packed single-precision (32-bit) floating-point elements, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 3
+	i := 32*j
+	l := 64*j
+	IF k[j]
+		dst[i+31:i] := Convert_FP64_To_FP32(a[l+63:l])
+	ELSE
+		dst[i+31:i] := src[i+31:i]
+	FI
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vcvtpd2ps"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128" name="_mm256_maskz_cvtpd_ps">
+	<type>Floating Point</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Convert</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m256d"/>
+	<description>Convert packed double-precision (64-bit) floating-point elements in "a" to packed single-precision (32-bit) floating-point elements, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 3
+	i := j*32
+	l := j*64
+	IF k[j]
+		dst[i+31:i] := Convert_FP64_To_FP32(a[l+63:l])
+	ELSE
+		dst[i+31:i] := 0
+	FI
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vcvtpd2ps"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128" name="_mm_mask_cvtpd_ps">
+	<type>Floating Point</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Convert</category>
+	<parameter varname="src" type="__m128"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128d"/>
+	<description>Convert packed double-precision (64-bit) floating-point elements in "a" to packed single-precision (32-bit) floating-point elements, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 1
+	i := 32*j
+	l := 64*j
+	IF k[j]
+		dst[i+31:i] := Convert_FP64_To_FP32(a[l+63:l])
+	ELSE
+		dst[i+31:i] := src[i+31:i]
+	FI
+ENDFOR
+dst[MAX:64] := 0
+	</operation>
+	<instruction name="vcvtpd2ps"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128" name="_mm_maskz_cvtpd_ps">
+	<type>Floating Point</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Convert</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128d"/>
+	<description>Convert packed double-precision (64-bit) floating-point elements in "a" to packed single-precision (32-bit) floating-point elements, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 1
+	i := j*32
+	l := j*64
+	IF k[j]
+		dst[i+31:i] := Convert_FP64_To_FP32(a[l+63:l])
+	ELSE
+		dst[i+31:i] := 0
+	FI
+ENDFOR
+dst[MAX:64] := 0
+	</operation>
+	<instruction name="vcvtpd2ps"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256i" name="_mm256_cvtpd_epi64">
+	<type>Floating Point</type>
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512DQ</CPUID>
+	<category>Convert</category>
+	<parameter varname="a" type="__m256d"/>
+	<description>Convert packed double-precision (64-bit) floating-point elements in "a" to packed 64-bit integers, and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 3
+	i := j*64
+	dst[i+63:i] := Convert_FP64_To_Int64(a[i+63:i])
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vcvtpd2qq"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256i" name="_mm256_mask_cvtpd_epi64">
+	<type>Floating Point</type>
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512DQ</CPUID>
+	<category>Convert</category>
+	<parameter varname="src" type="__m256i"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m256d"/>
+	<description>Convert packed double-precision (64-bit) floating-point elements in "a" to packed 64-bit integers, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 3
+	i := j*64
+	IF k[j]
+		dst[i+63:i] := Convert_FP64_To_Int64(a[i+63:i])
+	ELSE
+		dst[i+63:i] := src[i+63:i]
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vcvtpd2qq"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256i" name="_mm256_maskz_cvtpd_epi64">
+	<type>Floating Point</type>
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512DQ</CPUID>
+	<category>Convert</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m256d"/>
+	<description>Convert packed double-precision (64-bit) floating-point elements in "a" to packed 64-bit integers, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 3
+	i := j*64
+	IF k[j]
+		dst[i+63:i] := Convert_FP64_To_Int64(a[i+63:i])
+	ELSE
+		dst[i+63:i] := 0
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vcvtpd2qq"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512i" name="_mm512_cvt_roundpd_epi64">
+	<type>Floating Point</type>
+	<type>Integer</type>
+	<CPUID>AVX512DQ</CPUID>
+	<category>Convert</category>
+	<parameter varname="a" type="__m512d"/>
+	<parameter varname="rounding" type="int"/>
+	<description>Convert packed double-precision (64-bit) floating-point elements in "a" to packed 64-bit integers, and store the results in "dst". 
+	[round_note]
+	</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*64
+	dst[i+63:i] := Convert_FP64_To_Int64(a[i+63:i])
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name="vcvtpd2qq"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512i" name="_mm512_cvtpd_epi64">
+	<type>Floating Point</type>
+	<type>Integer</type>
+	<CPUID>AVX512DQ</CPUID>
+	<category>Convert</category>
+	<parameter varname="a" type="__m512d"/>
+	<description>Convert packed double-precision (64-bit) floating-point elements in "a" to packed 64-bit integers, and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*64
+	dst[i+63:i] := Convert_FP64_To_Int64(a[i+63:i])
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name="vcvtpd2qq"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512i" name="_mm512_mask_cvt_roundpd_epi64">
+	<type>Floating Point</type>
+	<type>Integer</type>
+	<CPUID>AVX512DQ</CPUID>
+	<category>Convert</category>
+	<parameter varname="src" type="__m512i"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m512d"/>
+	<parameter varname="rounding" type="int"/>
+	<description>Convert packed double-precision (64-bit) floating-point elements in "a" to packed 64-bit integers, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). 
+	[round_note]
+	</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*64
+	IF k[j]
+		dst[i+63:i] := Convert_FP64_To_Int64(a[i+63:i])
+	ELSE
+		dst[i+63:i] := src[i+63:i]
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name="vcvtpd2qq"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512i" name="_mm512_mask_cvtpd_epi64">
+	<type>Floating Point</type>
+	<type>Integer</type>
+	<CPUID>AVX512DQ</CPUID>
+	<category>Convert</category>
+	<parameter varname="src" type="__m512i"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m512d"/>
+	<description>Convert packed double-precision (64-bit) floating-point elements in "a" to packed 64-bit integers, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 7
+	i := j*64
+	IF k[j]
+		dst[i+63:i] := Convert_FP64_To_Int64(a[i+63:i])
+	ELSE
+		dst[i+63:i] := src[i+63:i]
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name="vcvtpd2qq"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512i" name="_mm512_maskz_cvt_roundpd_epi64">
+	<type>Floating Point</type>
+	<type>Integer</type>
+	<CPUID>AVX512DQ</CPUID>
+	<category>Convert</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m512d"/>
+	<parameter varname="rounding" type="int"/>
+	<description>Convert packed double-precision (64-bit) floating-point elements in "a" to packed 64-bit integers, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).
+	[round_note]
+	</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*64
+	IF k[j]
+		dst[i+63:i] := Convert_FP64_To_Int64(a[i+63:i])
+	ELSE
+		dst[i+63:i] := 0
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name="vcvtpd2qq"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512i" name="_mm512_maskz_cvtpd_epi64">
+	<type>Floating Point</type>
+	<type>Integer</type>
+	<CPUID>AVX512DQ</CPUID>
+	<category>Convert</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m512d"/>
+	<description>Convert packed double-precision (64-bit) floating-point elements in "a" to packed 64-bit integers, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*64
+	IF k[j]
+		dst[i+63:i] := Convert_FP64_To_Int64(a[i+63:i])
+	ELSE
+		dst[i+63:i] := 0
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name="vcvtpd2qq"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm_cvtpd_epi64">
+	<type>Floating Point</type>
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512DQ</CPUID>
+	<category>Convert</category>
+	<parameter varname="a" type="__m128d"/>
+	<description>Convert packed double-precision (64-bit) floating-point elements in "a" to packed 64-bit integers, and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 1
+	i := j*64
+	dst[i+63:i] := Convert_FP64_To_Int64(a[i+63:i])
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vcvtpd2qq"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm_mask_cvtpd_epi64">
+	<type>Floating Point</type>
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512DQ</CPUID>
+	<category>Convert</category>
+	<parameter varname="src" type="__m128i"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128d"/>
+	<description>Convert packed double-precision (64-bit) floating-point elements in "a" to packed 64-bit integers, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 1
+	i := j*64
+	IF k[j]
+		dst[i+63:i] := Convert_FP64_To_Int64(a[i+63:i])
+	ELSE
+		dst[i+63:i] := src[i+63:i]
+	FI
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vcvtpd2qq"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm_maskz_cvtpd_epi64">
+	<type>Floating Point</type>
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512DQ</CPUID>
+	<category>Convert</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128d"/>
+	<description>Convert packed double-precision (64-bit) floating-point elements in "a" to packed 64-bit integers, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 1
+	i := j*64
+	IF k[j]
+		dst[i+63:i] := Convert_FP64_To_Int64(a[i+63:i])
+	ELSE
+		dst[i+63:i] := 0
+	FI
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vcvtpd2qq"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm256_cvtpd_epu32">
+	<type>Floating Point</type>
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Convert</category>
+	<parameter varname="a" type="__m256d"/>
+	<description>Convert packed double-precision (64-bit) floating-point elements in "a" to packed unsigned 32-bit integers, and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 3
+	i := 32*j
+	k := 64*j
+	dst[i+31:i] := Convert_FP64_To_UnsignedInt32(a[k+63:k])
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vcvtpd2udq"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm256_mask_cvtpd_epu32">
+	<type>Floating Point</type>
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Convert</category>
+	<parameter varname="src" type="__m128i"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m256d"/>
+	<description>Convert packed double-precision (64-bit) floating-point elements in "a" to packed unsigned 32-bit integers, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 3
+	i := j*32
+	l := j*64
+	IF k[j]
+		dst[i+31:i] := Convert_FP64_To_UnsignedInt32(a[l+63:l])
+	ELSE
+		dst[i+31:i] := src[i+31:i]
+	FI
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vcvtpd2udq"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm256_maskz_cvtpd_epu32">
+	<type>Floating Point</type>
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Convert</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m256d"/>
+	<description>Convert packed double-precision (64-bit) floating-point elements in "a" to packed unsigned 32-bit integers, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 3
+	i := 32*j
+	l := 64*j
+	IF k[j]
+		dst[i+31:i] := Convert_FP64_To_UnsignedInt32(a[l+63:l])
+	ELSE
+		dst[i+31:i] := 0
+	FI
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vcvtpd2udq"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm_cvtpd_epu32">
+	<type>Floating Point</type>
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Convert</category>
+	<parameter varname="a" type="__m128d"/>
+	<description>Convert packed double-precision (64-bit) floating-point elements in "a" to packed unsigned 32-bit integers, and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 1
+	i := 32*j
+	k := 64*j
+	dst[i+31:i] := Convert_FP64_To_UnsignedInt32(a[k+63:k])
+ENDFOR
+dst[MAX:64] := 0
+	</operation>
+	<instruction name="vcvtpd2udq"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm_mask_cvtpd_epu32">
+	<type>Floating Point</type>
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Convert</category>
+	<parameter varname="src" type="__m128i"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128d"/>
+	<description>Convert packed double-precision (64-bit) floating-point elements in "a" to packed unsigned 32-bit integers, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 1
+	i := j*32
+	l := j*64
+	IF k[j]
+		dst[i+31:i] := Convert_FP64_To_UnsignedInt32(a[l+63:l])
+	ELSE
+		dst[i+31:i] := src[i+31:i]
+	FI
+ENDFOR
+dst[MAX:64] := 0
+	</operation>
+	<instruction name="vcvtpd2udq"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm_maskz_cvtpd_epu32">
+	<type>Floating Point</type>
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Convert</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128d"/>
+	<description>Convert packed double-precision (64-bit) floating-point elements in "a" to packed unsigned 32-bit integers, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 1
+	i := 32*j
+	l := 64*j
+	IF k[j]
+		dst[i+31:i] := Convert_FP64_To_UnsignedInt32(a[l+63:l])
+	ELSE
+		dst[i+31:i] := 0
+	FI
+ENDFOR
+dst[MAX:64] := 0
+	</operation>
+	<instruction name="vcvtpd2udq"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256i" name="_mm256_cvtpd_epu64">
+	<type>Floating Point</type>
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512DQ</CPUID>
+	<category>Convert</category>
+	<parameter varname="a" type="__m256d"/>
+	<description>Convert packed double-precision (64-bit) floating-point elements in "a" to packed unsigned 64-bit integers, and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 3
+	i := j*64
+	dst[i+63:i] := Convert_FP64_To_UnsignedInt64(a[i+63:i])
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vcvtpd2uqq"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256i" name="_mm256_mask_cvtpd_epu64">
+	<type>Floating Point</type>
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512DQ</CPUID>
+	<category>Convert</category>
+	<parameter varname="src" type="__m256i"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m256d"/>
+	<description>Convert packed double-precision (64-bit) floating-point elements in "a" to packed unsigned 64-bit integers, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 3
+	i := j*64
+	IF k[j]
+		dst[i+63:i] := Convert_FP64_To_UnsignedInt64(a[i+63:i])
+	ELSE
+		dst[i+63:i] := src[i+63:i]
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vcvtpd2uqq"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256i" name="_mm256_maskz_cvtpd_epu64">
+	<type>Floating Point</type>
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512DQ</CPUID>
+	<category>Convert</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m256d"/>
+	<description>Convert packed double-precision (64-bit) floating-point elements in "a" to packed unsigned 64-bit integers, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 3
+	i := j*64
+	IF k[j]
+		dst[i+63:i] := Convert_FP64_To_UnsignedInt64(a[i+63:i])
+	ELSE
+		dst[i+63:i] := 0
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vcvtpd2uqq"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512i" name="_mm512_cvt_roundpd_epu64">
+	<type>Floating Point</type>
+	<type>Integer</type>
+	<CPUID>AVX512DQ</CPUID>
+	<category>Convert</category>
+	<parameter varname="a" type="__m512d"/>
+	<parameter varname="rounding" type="int"/>
+	<description>Convert packed double-precision (64-bit) floating-point elements in "a" to packed unsigned 64-bit integers, and store the results in "dst". 
+	[round_note]
+	</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*64
+	dst[i+63:i] := Convert_FP64_To_UnsignedInt64(a[i+63:i])
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name="vcvtpd2uqq"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512i" name="_mm512_cvtpd_epu64">
+	<type>Floating Point</type>
+	<type>Integer</type>
+	<CPUID>AVX512DQ</CPUID>
+	<category>Convert</category>
+	<parameter varname="a" type="__m512d"/>
+	<description>Convert packed double-precision (64-bit) floating-point elements in "a" to packed unsigned 64-bit integers, and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*64
+	dst[i+63:i] := Convert_FP64_To_UnsignedInt64(a[i+63:i])
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name="vcvtpd2uqq"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512i" name="_mm512_mask_cvt_roundpd_epu64">
+	<type>Floating Point</type>
+	<type>Integer</type>
+	<CPUID>AVX512DQ</CPUID>
+	<category>Convert</category>
+	<parameter varname="src" type="__m512i"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m512d"/>
+	<parameter varname="rounding" type="int"/>
+	<description>Convert packed double-precision (64-bit) floating-point elements in "a" to packed unsigned 64-bit integers, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). 
+	[round_note]
+	</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*64
+	IF k[j]
+		dst[i+63:i] := Convert_FP64_To_UnsignedInt64(a[i+63:i])
+	ELSE
+		dst[i+63:i] := src[i+63:i]
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name="vcvtpd2uqq"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512i" name="_mm512_mask_cvtpd_epu64">
+	<type>Floating Point</type>
+	<type>Integer</type>
+	<CPUID>AVX512DQ</CPUID>
+	<category>Convert</category>
+	<parameter varname="src" type="__m512i"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m512d"/>
+	<description>Convert packed double-precision (64-bit) floating-point elements in "a" to packed unsigned 64-bit integers, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 7
+	i := j*64
+	IF k[j]
+		dst[i+63:i] := Convert_FP64_To_UnsignedInt64(a[i+63:i])
+	ELSE
+		dst[i+63:i] := src[i+63:i]
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name="vcvtpd2uqq"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512i" name="_mm512_maskz_cvt_roundpd_epu64">
+	<type>Floating Point</type>
+	<type>Integer</type>
+	<CPUID>AVX512DQ</CPUID>
+	<category>Convert</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m512d"/>
+	<parameter varname="rounding" type="int"/>
+	<description>Convert packed double-precision (64-bit) floating-point elements in "a" to packed unsigned 64-bit integers, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). 
+	[round_note]
+	</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*64
+	IF k[j]
+		dst[i+63:i] := Convert_FP64_To_UnsignedInt64(a[i+63:i])
+	ELSE
+		dst[i+63:i] := 0
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name="vcvtpd2uqq"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512i" name="_mm512_maskz_cvtpd_epu64">
+	<type>Floating Point</type>
+	<type>Integer</type>
+	<CPUID>AVX512DQ</CPUID>
+	<category>Convert</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m512d"/>
+	<description>Convert packed double-precision (64-bit) floating-point elements in "a" to packed unsigned 64-bit integers, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*64
+	IF k[j]
+		dst[i+63:i] := Convert_FP64_To_UnsignedInt64(a[i+63:i])
+	ELSE
+		dst[i+63:i] := 0
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name="vcvtpd2uqq"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm_cvtpd_epu64">
+	<type>Floating Point</type>
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512DQ</CPUID>
+	<category>Convert</category>
+	<parameter varname="a" type="__m128d"/>
+	<description>Convert packed double-precision (64-bit) floating-point elements in "a" to packed unsigned 64-bit integers, and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 1
+	i := j*64
+	dst[i+63:i] := Convert_FP64_To_UnsignedInt64(a[i+63:i])
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vcvtpd2uqq"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm_mask_cvtpd_epu64">
+	<type>Floating Point</type>
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512DQ</CPUID>
+	<category>Convert</category>
+	<parameter varname="src" type="__m128i"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128d"/>
+	<description>Convert packed double-precision (64-bit) floating-point elements in "a" to packed unsigned 64-bit integers, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 1
+	i := j*64
+	IF k[j]
+		dst[i+63:i] := Convert_FP64_To_UnsignedInt64(a[i+63:i])
+	ELSE
+		dst[i+63:i] := src[i+63:i]
+	FI
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vcvtpd2uqq"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm_maskz_cvtpd_epu64">
+	<type>Floating Point</type>
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512DQ</CPUID>
+	<category>Convert</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128d"/>
+	<description>Convert packed double-precision (64-bit) floating-point elements in "a" to packed unsigned 64-bit integers, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 1
+	i := j*64
+	IF k[j]
+		dst[i+63:i] := Convert_FP64_To_UnsignedInt64(a[i+63:i])
+	ELSE
+		dst[i+63:i] := 0
+	FI
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vcvtpd2uqq"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256" name="_mm256_mask_cvtph_ps">
+	<type>Floating Point</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Convert</category>
+	<parameter varname="src" type="__m256"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128i"/>
+	<description>Convert packed half-precision (16-bit) floating-point elements in "a" to packed single-precision (32-bit) floating-point elements, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 7
+	i := j*32
+	m := j*16
+	IF k[j]
+		dst[i+31:i] := Convert_FP16_To_FP32(a[m+15:m])
+	ELSE
+		dst[i+31:i] := src[i+31:i]
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vcvtph2ps"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256" name="_mm256_maskz_cvtph_ps">
+	<type>Floating Point</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Convert</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128i"/>
+	<description>Convert packed half-precision (16-bit) floating-point elements in "a" to packed single-precision (32-bit) floating-point elements, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 7
+	i := j*32
+	m := j*16
+	IF k[j]
+		dst[i+31:i] := Convert_FP16_To_FP32(a[m+15:m])
+	ELSE
+		dst[i+31:i] := 0
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vcvtph2ps"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128" name="_mm_mask_cvtph_ps">
+	<type>Floating Point</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Convert</category>
+	<parameter varname="src" type="__m128"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128i"/>
+	<description>Convert packed half-precision (16-bit) floating-point elements in "a" to packed single-precision (32-bit) floating-point elements, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 3
+	i := j*32
+	m := j*16
+	IF k[j]
+		dst[i+31:i] := Convert_FP16_To_FP32(a[m+15:m])
+	ELSE
+		dst[i+31:i] := src[i+31:i]
+	FI
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vcvtph2ps"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128" name="_mm_maskz_cvtph_ps">
+	<type>Floating Point</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Convert</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128i"/>
+	<description>Convert packed half-precision (16-bit) floating-point elements in "a" to packed single-precision (32-bit) floating-point elements, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 3
+	i := j*32
+	m := j*16
+	IF k[j]
+		dst[i+31:i] := Convert_FP16_To_FP32(a[m+15:m])
+	ELSE
+		dst[i+31:i] := 0
+	FI
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vcvtph2ps"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256i" name="_mm256_mask_cvtps_epi32">
+	<type>Floating Point</type>
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Convert</category>
+	<parameter varname="src" type="__m256i"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m256"/>
+	<description>Convert packed single-precision (32-bit) floating-point elements in "a" to packed 32-bit integers, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 7
+	i := j*32
+	IF k[j]
+		dst[i+31:i] := Convert_FP32_To_Int32(a[i+31:i])
+	ELSE
+		dst[i+31:i] := src[i+31:i]
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vcvtps2dq"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256i" name="_mm256_maskz_cvtps_epi32">
+	<type>Floating Point</type>
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Convert</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m256"/>
+	<description>Convert packed single-precision (32-bit) floating-point elements in "a" to packed 32-bit integers, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 7
+	i := 32*j
+	IF k[j]
+		dst[i+31:i] := Convert_FP32_To_Int32(a[i+31:i])
+	ELSE
+		dst[i+31:i] := 0
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vcvtps2dq"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm_mask_cvtps_epi32">
+	<type>Floating Point</type>
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Convert</category>
+	<parameter varname="src" type="__m128i"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128"/>
+	<description>Convert packed single-precision (32-bit) floating-point elements in "a" to packed 32-bit integers, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 3
+	i := j*32
+	IF k[j]
+		dst[i+31:i] := Convert_FP32_To_Int32(a[i+31:i])
+	ELSE
+		dst[i+31:i] := src[i+31:i]
+	FI
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vcvtps2dq"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm_maskz_cvtps_epi32">
+	<type>Floating Point</type>
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Convert</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128"/>
+	<description>Convert packed single-precision (32-bit) floating-point elements in "a" to packed 32-bit integers, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 3
+	i := 32*j
+	IF k[j]
+		dst[i+31:i] := Convert_FP32_To_Int32(a[i+31:i])
+	ELSE
+		dst[i+31:i] := 0
+	FI
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vcvtps2dq"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm256_mask_cvt_roundps_ph">
+	<type>Floating Point</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Convert</category>
+	<parameter varname="src" type="__m128i"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m256"/>
+	<parameter varname="rounding" type="int"/>
+	<description>Convert packed single-precision (32-bit) floating-point elements in "a" to packed half-precision (16-bit) floating-point elements, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). 
+	[round_note]
+	</description>
+	<operation>
+FOR j := 0 to 7
+	i := 16*j
+	l := 32*j
+	IF k[j]
+		dst[i+15:i] := Convert_FP32_To_FP16FP(a[l+31:l])
+	ELSE
+		dst[i+15:i] := src[i+15:i]
+	FI
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vcvtps2ph"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm256_mask_cvtps_ph">
+	<type>Floating Point</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Convert</category>
+	<parameter varname="src" type="__m128i"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m256"/>
+	<parameter varname="rounding" type="int"/>
+	<description>Convert packed single-precision (32-bit) floating-point elements in "a" to packed half-precision (16-bit) floating-point elements, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). 
+	[round_note]
+	</description>
+	<operation>
+FOR j := 0 to 7
+	i := 16*j
+	l := 32*j
+	IF k[j]
+		dst[i+15:i] := Convert_FP32_To_FP16FP(a[l+31:l])
+	ELSE
+		dst[i+15:i] := src[i+15:i]
+	FI
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vcvtps2ph"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm256_maskz_cvt_roundps_ph">
+	<type>Floating Point</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Convert</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m256"/>
+	<parameter varname="rounding" type="int"/>
+	<description>Convert packed single-precision (32-bit) floating-point elements in "a" to packed half-precision (16-bit) floating-point elements, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).
+	[round_note]
+	</description>
+	<operation>
+FOR j := 0 to 7
+	i := 16*j
+	l := 32*j
+	IF k[j]
+		dst[i+15:i] := Convert_FP32_To_FP16FP(a[l+31:l])
+	ELSE
+		dst[i+15:i] := 0
+	FI
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vcvtps2ph"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm256_maskz_cvtps_ph">
+	<type>Floating Point</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Convert</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m256"/>
+	<parameter varname="rounding" type="int"/>
+	<description>Convert packed single-precision (32-bit) floating-point elements in "a" to packed half-precision (16-bit) floating-point elements, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).
+	[round_note]
+	</description>
+	<operation>
+FOR j := 0 to 7
+	i := 16*j
+	l := 32*j
+	IF k[j]
+		dst[i+15:i] := Convert_FP32_To_FP16FP(a[l+31:l])
+	ELSE
+		dst[i+15:i] := 0
+	FI
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vcvtps2ph"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm_mask_cvt_roundps_ph">
+	<type>Floating Point</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Convert</category>
+	<parameter varname="src" type="__m128i"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128"/>
+	<parameter varname="rounding" type="int"/>
+	<description>Convert packed single-precision (32-bit) floating-point elements in "a" to packed half-precision (16-bit) floating-point elements, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). 
+	[round_note]
+	</description>
+	<operation>
+FOR j := 0 to 3
+	i := 16*j
+	l := 32*j
+	IF k[j]
+		dst[i+15:i] := Convert_FP32_To_FP16FP(a[l+31:l])
+	ELSE
+		dst[i+15:i] := src[i+15:i]
+	FI
+ENDFOR
+dst[MAX:64] := 0
+	</operation>
+	<instruction name="vcvtps2ph"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm_mask_cvtps_ph">
+	<type>Floating Point</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Convert</category>
+	<parameter varname="src" type="__m128i"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128"/>
+	<parameter varname="rounding" type="int"/>
+	<description>Convert packed single-precision (32-bit) floating-point elements in "a" to packed half-precision (16-bit) floating-point elements, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). 
+	[round_note]
+	</description>
+	<operation>
+FOR j := 0 to 3
+	i := 16*j
+	l := 32*j
+	IF k[j]
+		dst[i+15:i] := Convert_FP32_To_FP16FP(a[l+31:l])
+	ELSE
+		dst[i+15:i] := src[i+15:i]
+	FI
+ENDFOR
+dst[MAX:64] := 0
+	</operation>
+	<instruction name="vcvtps2ph"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm_maskz_cvt_roundps_ph">
+	<type>Floating Point</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Convert</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128"/>
+	<parameter varname="rounding" type="int"/>
+	<description>Convert packed single-precision (32-bit) floating-point elements in "a" to packed half-precision (16-bit) floating-point elements, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).
+	[round_note]
+	</description>
+	<operation>
+FOR j := 0 to 3
+	i := 16*j
+	l := 32*j
+	IF k[j]
+		dst[i+15:i] := Convert_FP32_To_FP16FP(a[l+31:l])
+	ELSE
+		dst[i+15:i] := 0
+	FI
+ENDFOR
+dst[MAX:64] := 0
+	</operation>
+	<instruction name="vcvtps2ph"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm_maskz_cvtps_ph">
+	<type>Floating Point</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Convert</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128"/>
+	<parameter varname="rounding" type="int"/>
+	<description>Convert packed single-precision (32-bit) floating-point elements in "a" to packed half-precision (16-bit) floating-point elements, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).
+	[round_note]
+	</description>
+	<operation>
+FOR j := 0 to 3
+	i := 16*j
+	l := 32*j
+	IF k[j]
+		dst[i+15:i] := Convert_FP32_To_FP16FP(a[l+31:l])
+	ELSE
+		dst[i+15:i] := 0
+	FI
+ENDFOR
+dst[MAX:64] := 0
+	</operation>
+	<instruction name="vcvtps2ph"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256i" name="_mm256_cvtps_epi64">
+	<type>Floating Point</type>
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512DQ</CPUID>
+	<category>Convert</category>
+	<parameter varname="a" type="__m128"/>
+	<description>Convert packed single-precision (32-bit) floating-point elements in "a" to packed 64-bit integers, and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 3
+	i := j*64
+	l := j*32
+	dst[i+63:i] := Convert_FP32_To_Int64(a[l+31:l])
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vcvtps2qq"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256i" name="_mm256_mask_cvtps_epi64">
+	<type>Floating Point</type>
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512DQ</CPUID>
+	<category>Convert</category>
+	<parameter varname="src" type="__m256i"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128"/>
+	<description>Convert packed single-precision (32-bit) floating-point elements in "a" to packed 64-bit integers, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 3
+	i := j*64
+	l := j*32
+	IF k[j]
+		dst[i+63:i] := Convert_FP32_To_Int64(a[l+31:l])
+	ELSE
+		dst[i+63:i] := src[i+63:i]
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vcvtps2qq"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256i" name="_mm256_maskz_cvtps_epi64">
+	<type>Floating Point</type>
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512DQ</CPUID>
+	<category>Convert</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128"/>
+	<description>Convert packed single-precision (32-bit) floating-point elements in "a" to packed 64-bit integers, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 3
+	i := j*64
+	l := j*32
+	IF k[j]
+		dst[i+63:i] := Convert_FP32_To_Int64(a[l+31:l])
+	ELSE
+		dst[i+63:i] := 0
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vcvtps2qq"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512i" name="_mm512_cvt_roundps_epi64">
+	<type>Floating Point</type>
+	<type>Integer</type>
+	<CPUID>AVX512DQ</CPUID>
+	<category>Convert</category>
+	<parameter varname="a" type="__m256"/>
+	<parameter varname="rounding" type="int"/>
+	<description>Convert packed single-precision (32-bit) floating-point elements in "a" to packed 64-bit integers, and store the results in "dst". 
+	[round_note]
+	</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*64
+	l := j*32
+	dst[i+63:i] := Convert_FP32_To_Int64(a[l+31:l])
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name="vcvtps2qq"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512i" name="_mm512_cvtps_epi64">
+	<type>Floating Point</type>
+	<type>Integer</type>
+	<CPUID>AVX512DQ</CPUID>
+	<category>Convert</category>
+	<parameter varname="a" type="__m256"/>
+	<description>Convert packed single-precision (32-bit) floating-point elements in "a" to packed 64-bit integers, and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*64
+	l := j*32
+	dst[i+63:i] := Convert_FP32_To_Int64(a[l+31:l])
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name="vcvtps2qq"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512i" name="_mm512_mask_cvt_roundps_epi64">
+	<type>Floating Point</type>
+	<type>Integer</type>
+	<CPUID>AVX512DQ</CPUID>
+	<category>Convert</category>
+	<parameter varname="src" type="__m512i"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m256"/>
+	<parameter varname="rounding" type="int"/>
+	<description>Convert packed single-precision (32-bit) floating-point elements in "a" to packed 64-bit integers, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). 
+	 [round_note]
+	 </description>
+	<operation>
+FOR j := 0 to 7
+	i := j*64
+	l := j*32
+	IF k[j]
+		dst[i+63:i] := Convert_FP32_To_Int64(a[l+31:l])
+	ELSE
+		dst[i+63:i] := src[i+63:i]
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name="vcvtps2qq"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512i" name="_mm512_mask_cvtps_epi64">
+	<type>Floating Point</type>
+	<type>Integer</type>
+	<CPUID>AVX512DQ</CPUID>
+	<category>Convert</category>
+	<parameter varname="src" type="__m512i"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m256"/>
+	<description>Convert packed single-precision (32-bit) floating-point elements in "a" to packed 64-bit integers, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 7
+	i := j*64
+	l := j*32
+	IF k[j]
+		dst[i+63:i] := Convert_FP32_To_Int64(a[l+31:l])
+	ELSE
+		dst[i+63:i] := src[i+63:i]
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name="vcvtps2qq"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512i" name="_mm512_maskz_cvt_roundps_epi64">
+	<type>Floating Point</type>
+	<type>Integer</type>
+	<CPUID>AVX512DQ</CPUID>
+	<category>Convert</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m256"/>
+	<parameter varname="rounding" type="int"/>
+	<description>Convert packed single-precision (32-bit) floating-point elements in "a" to packed 64-bit integers, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). 
+	[round_note]
+	</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*64
+	l := j*32
+	IF k[j]
+		dst[i+63:i] := Convert_FP32_To_Int64(a[l+31:l])
+	ELSE
+		dst[i+63:i] := 0
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name="vcvtps2qq"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512i" name="_mm512_maskz_cvtps_epi64">
+	<type>Floating Point</type>
+	<type>Integer</type>
+	<CPUID>AVX512DQ</CPUID>
+	<category>Convert</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m256"/>
+	<description>Convert packed single-precision (32-bit) floating-point elements in "a" to packed 64-bit integers, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 7
+	i := j*64
+	l := j*32
+	IF k[j]
+		dst[i+63:i] := Convert_FP32_To_Int64(a[l+31:l])
+	ELSE
+		dst[i+63:i] := 0
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name="vcvtps2qq"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm_cvtps_epi64">
+	<type>Floating Point</type>
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512DQ</CPUID>
+	<category>Convert</category>
+	<parameter varname="a" type="__m128"/>
+	<description>Convert packed single-precision (32-bit) floating-point elements in "a" to packed 64-bit integers, and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 1
+	i := j*64
+	l := j*32
+	dst[i+63:i] := Convert_FP32_To_Int64(a[l+31:l])
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vcvtps2qq"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm_mask_cvtps_epi64">
+	<type>Floating Point</type>
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512DQ</CPUID>
+	<category>Convert</category>
+	<parameter varname="src" type="__m128i"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128"/>
+	<description>Convert packed single-precision (32-bit) floating-point elements in "a" to packed 64-bit integers, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 1
+	i := j*64
+	l := j*32
+	IF k[j]
+		dst[i+63:i] := Convert_FP32_To_Int64(a[l+31:l])
+	ELSE
+		dst[i+63:i] := src[i+63:i]
+	FI
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vcvtps2qq"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm_maskz_cvtps_epi64">
+	<type>Floating Point</type>
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512DQ</CPUID>
+	<category>Convert</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128"/>
+	<description>Convert packed single-precision (32-bit) floating-point elements in "a" to packed 64-bit integers, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 1
+	i := j*64
+	l := j*32
+	IF k[j]
+		dst[i+63:i] := Convert_FP32_To_Int64(a[l+31:l])
+	ELSE
+		dst[i+63:i] := 0
+	FI
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vcvtps2qq"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256i" name="_mm256_cvtps_epu32">
+	<type>Floating Point</type>
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Convert</category>
+	<parameter varname="a" type="__m256"/>
+	<description>Convert packed single-precision (32-bit) floating-point elements in "a" to packed unsigned 32-bit integers, and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 7
+	i := 32*j
+	dst[i+31:i] := Convert_FP32_To_UnsignedInt32(a[i+31:i])
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vcvtps2udq"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256i" name="_mm256_mask_cvtps_epu32">
+	<type>Floating Point</type>
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Convert</category>
+	<parameter varname="src" type="__m256i"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m256"/>
+	<description>Convert packed single-precision (32-bit) floating-point elements in "a" to packed unsigned 32-bit integers, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 7
+	i := 32*j
+	IF k[j]
+		dst[i+31:i] := Convert_FP32_To_UnsignedInt32(a[i+31:i])
+	ELSE
+		dst[i+31:i] := src[i+31:i]
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vcvtps2udq"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256i" name="_mm256_maskz_cvtps_epu32">
+	<type>Floating Point</type>
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Convert</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m256"/>
+	<description>Convert packed single-precision (32-bit) floating-point elements in "a" to packed unsigned 32-bit integers, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 7
+	i := 32*j
+	IF k[j]
+		dst[i+31:i] := Convert_FP32_To_UnsignedInt32(a[i+31:i])
+	ELSE
+		dst[i+31:i] := 0
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vcvtps2udq"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm_cvtps_epu32">
+	<type>Floating Point</type>
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Convert</category>
+	<parameter varname="a" type="__m128"/>
+	<description>Convert packed single-precision (32-bit) floating-point elements in "a" to packed unsigned 32-bit integers, and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 3
+	i := 32*j
+	dst[i+31:i] := Convert_FP32_To_UnsignedInt32(a[i+31:i])
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vcvtps2udq"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm_mask_cvtps_epu32">
+	<type>Floating Point</type>
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Convert</category>
+	<parameter varname="src" type="__m128i"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128"/>
+	<description>Convert packed single-precision (32-bit) floating-point elements in "a" to packed unsigned 32-bit integers, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 3
+	i := 32*j
+	IF k[j]
+		dst[i+31:i] := Convert_FP32_To_UnsignedInt32(a[i+31:i])
+	ELSE
+		dst[i+31:i] := src[i+31:i]
+	FI
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vcvtps2udq"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm_maskz_cvtps_epu32">
+	<type>Floating Point</type>
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Convert</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128"/>
+	<description>Convert packed single-precision (32-bit) floating-point elements in "a" to packed unsigned 32-bit integers, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 3
+	i := 32*j
+	IF k[j]
+		dst[i+31:i] := Convert_FP32_To_UnsignedInt32(a[i+31:i])
+	ELSE
+		dst[i+31:i] := 0
+	FI
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vcvtps2udq"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256i" name="_mm256_cvtps_epu64">
+	<type>Floating Point</type>
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512DQ</CPUID>
+	<category>Convert</category>
+	<parameter varname="a" type="__m128"/>
+	<description>Convert packed single-precision (32-bit) floating-point elements in "a" to packed unsigned 64-bit integers, and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 3
+	i := j*64
+	l := j*32
+	dst[i+63:i] := Convert_FP32_To_UnsignedInt64(a[l+31:l])
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vcvtps2uqq"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256i" name="_mm256_mask_cvtps_epu64">
+	<type>Floating Point</type>
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512DQ</CPUID>
+	<category>Convert</category>
+	<parameter varname="src" type="__m256i"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128"/>
+	<description>Convert packed single-precision (32-bit) floating-point elements in "a" to packed unsigned 64-bit integers, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). 
+	</description>
+	<operation>
+FOR j := 0 to 3
+	i := j*64
+	l := j*32
+	IF k[j]
+		dst[i+63:i] := Convert_FP32_To_UnsignedInt64(a[l+31:l])
+	ELSE
+		dst[i+63:i] := src[i+63:i]
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vcvtps2uqq"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256i" name="_mm256_maskz_cvtps_epu64">
+	<type>Floating Point</type>
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512DQ</CPUID>
+	<category>Convert</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128"/>
+	<description>Convert packed single-precision (32-bit) floating-point elements in "a" to packed unsigned 64-bit integers, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 3
+	i := j*64
+	l := j*32
+	IF k[j]
+		dst[i+63:i] := Convert_FP32_To_UnsignedInt64(a[l+31:l])
+	ELSE
+		dst[i+63:i] := 0
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vcvtps2uqq"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512i" name="_mm512_cvt_roundps_epu64">
+	<type>Floating Point</type>
+	<type>Integer</type>
+	<CPUID>AVX512DQ</CPUID>
+	<category>Convert</category>
+	<parameter varname="a" type="__m256"/>
+	<parameter varname="rounding" type="int"/>
+	<description>Convert packed single-precision (32-bit) floating-point elements in "a" to packed unsigned 64-bit integers, and store the results in "dst". 
+	[round_note]
+	</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*64
+	l := j*32
+	dst[i+63:i] := Convert_FP32_To_UnsignedInt64(a[l+31:l])
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name="vcvtps2uqq"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512i" name="_mm512_cvtps_epu64">
+	<type>Floating Point</type>
+	<type>Integer</type>
+	<CPUID>AVX512DQ</CPUID>
+	<category>Convert</category>
+	<parameter varname="a" type="__m256"/>
+	<description>Convert packed single-precision (32-bit) floating-point elements in "a" to packed unsigned 64-bit integers, and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*64
+	l := j*32
+	dst[i+63:i] := Convert_FP32_To_UnsignedInt64(a[l+31:l])
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name="vcvtps2uqq"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512i" name="_mm512_mask_cvt_roundps_epu64">
+	<type>Floating Point</type>
+	<type>Integer</type>
+	<CPUID>AVX512DQ</CPUID>
+	<category>Convert</category>
+	<parameter varname="src" type="__m512i"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m256"/>
+	<parameter varname="rounding" type="int"/>
+	<description>Convert packed single-precision (32-bit) floating-point elements in "a" to packed unsigned 64-bit integers, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). 
+	[round_note]
+	</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*64
+	l := j*32
+	IF k[j]
+		dst[i+63:i] := Convert_FP32_To_UnsignedInt64(a[l+31:l])
+	ELSE
+		dst[i+63:i] := src[i+63:i]
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name="vcvtps2uqq"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512i" name="_mm512_mask_cvtps_epu64">
+	<type>Floating Point</type>
+	<type>Integer</type>
+	<CPUID>AVX512DQ</CPUID>
+	<category>Convert</category>
+	<parameter varname="src" type="__m512i"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m256"/>
+	<description>Convert packed single-precision (32-bit) floating-point elements in "a" to packed unsigned 64-bit integers, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). 
+	</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*64
+	l := j*32
+	IF k[j]
+		dst[i+63:i] := Convert_FP32_To_UnsignedInt64(a[l+31:l])
+	ELSE
+		dst[i+63:i] := src[i+63:i]
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name="vcvtps2uqq"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512i" name="_mm512_maskz_cvt_roundps_epu64">
+	<type>Floating Point</type>
+	<type>Integer</type>
+	<CPUID>AVX512DQ</CPUID>
+	<category>Convert</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m256"/>
+	<parameter varname="rounding" type="int"/>
+	<description>Convert packed single-precision (32-bit) floating-point elements in "a" to packed unsigned 64-bit integers, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).
+	[round_note]
+	</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*64
+	l := j*32
+	IF k[j]
+		dst[i+63:i] := Convert_FP32_To_UnsignedInt64(a[l+31:l])
+	ELSE
+		dst[i+63:i] := 0
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name="vcvtps2uqq"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512i" name="_mm512_maskz_cvtps_epu64">
+	<type>Floating Point</type>
+	<type>Integer</type>
+	<CPUID>AVX512DQ</CPUID>
+	<category>Convert</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m256"/>
+	<description>Convert packed single-precision (32-bit) floating-point elements in "a" to packed unsigned 64-bit integers, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*64
+	l := j*32
+	IF k[j]
+		dst[i+63:i] := Convert_FP32_To_UnsignedInt64(a[l+31:l])
+	ELSE
+		dst[i+63:i] := 0
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name="vcvtps2uqq"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm_cvtps_epu64">
+	<type>Floating Point</type>
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512DQ</CPUID>
+	<category>Convert</category>
+	<parameter varname="a" type="__m128"/>
+	<description>Convert packed single-precision (32-bit) floating-point elements in "a" to packed unsigned 64-bit integers, and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 1
+	i := j*64
+	l := j*32
+	dst[i+63:i] := Convert_FP32_To_UnsignedInt64(a[l+31:l])
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vcvtps2uqq"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm_mask_cvtps_epu64">
+	<type>Floating Point</type>
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512DQ</CPUID>
+	<category>Convert</category>
+	<parameter varname="src" type="__m128i"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128"/>
+	<description>Convert packed single-precision (32-bit) floating-point elements in "a" to packed unsigned 64-bit integers, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). 
+	</description>
+	<operation>
+FOR j := 0 to 1
+	i := j*64
+	l := j*32
+	IF k[j]
+		dst[i+63:i] := Convert_FP32_To_UnsignedInt64(a[l+31:l])
+	ELSE
+		dst[i+63:i] := src[i+63:i]
+	FI
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vcvtps2uqq"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm_maskz_cvtps_epu64">
+	<type>Floating Point</type>
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512DQ</CPUID>
+	<category>Convert</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128"/>
+	<description>Convert packed single-precision (32-bit) floating-point elements in "a" to packed unsigned 64-bit integers, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 1
+	i := j*64
+	l := j*32
+	IF k[j]
+		dst[i+63:i] := Convert_FP32_To_UnsignedInt64(a[l+31:l])
+	ELSE
+		dst[i+63:i] := 0
+	FI
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vcvtps2uqq"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256d" name="_mm256_cvtepi64_pd">
+	<type>Floating Point</type>
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512DQ</CPUID>
+	<category>Convert</category>
+	<parameter varname="a" type="__m256i"/>
+	<description>Convert packed 64-bit integers in "a" to packed double-precision (64-bit) floating-point elements, and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 3
+	i := j*64
+	dst[i+63:i] := Convert_Int64_To_FP64(a[i+63:i])
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vcvtqq2pd"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256d" name="_mm256_mask_cvtepi64_pd">
+	<type>Floating Point</type>
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512DQ</CPUID>
+	<category>Convert</category>
+	<parameter varname="src" type="__m256d"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m256i"/>
+	<description>Convert packed 64-bit integers in "a" to packed double-precision (64-bit) floating-point elements, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 3
+	i := j*64
+	IF k[j]
+		dst[i+63:i] := Convert_Int64_To_FP64(a[i+63:i])
+	ELSE
+		dst[i+63:i] := src[i+63:i]
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vcvtqq2pd"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256d" name="_mm256_maskz_cvtepi64_pd">
+	<type>Floating Point</type>
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512DQ</CPUID>
+	<category>Convert</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m256i"/>
+	<description>Convert packed 64-bit integers in "a" to packed double-precision (64-bit) floating-point elements, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 3
+	i := j*64
+	IF k[j]
+		dst[i+63:i] := Convert_Int64_To_FP64(a[i+63:i])
+	ELSE
+		dst[i+63:i] := 0
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vcvtqq2pd"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512d" name="_mm512_cvt_roundepi64_pd">
+	<type>Floating Point</type>
+	<type>Integer</type>
+	<CPUID>AVX512DQ</CPUID>
+	<category>Convert</category>
+	<parameter varname="a" type="__m512i"/>
+	<parameter varname="rounding" type="int"/>
+	<description>Convert packed 64-bit integers in "a" to packed double-precision (64-bit) floating-point elements, and store the results in "dst". 
+	[round_note]
+	</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*64
+	dst[i+63:i] := Convert_Int64_To_FP64(a[i+63:i])
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name="vcvtqq2pd"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512d" name="_mm512_cvtepi64_pd">
+	<type>Floating Point</type>
+	<type>Integer</type>
+	<CPUID>AVX512DQ</CPUID>
+	<category>Convert</category>
+	<parameter varname="a" type="__m512i"/>
+	<description>Convert packed 64-bit integers in "a" to packed double-precision (64-bit) floating-point elements, and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*64
+	dst[i+63:i] := Convert_Int64_To_FP64(a[i+63:i])
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name="vcvtqq2pd"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512d" name="_mm512_mask_cvt_roundepi64_pd">
+	<type>Floating Point</type>
+	<type>Integer</type>
+	<CPUID>AVX512DQ</CPUID>
+	<category>Convert</category>
+	<parameter varname="src" type="__m512d"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m512i"/>
+	<parameter varname="rounding" type="int"/>
+	<description>Convert packed 64-bit integers in "a" to packed double-precision (64-bit) floating-point elements, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). 
+	[round_note]
+	</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*64
+	IF k[j]
+		dst[i+63:i] := Convert_Int64_To_FP64(a[i+63:i])
+	ELSE
+		dst[i+63:i] := src[i+63:i]
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name="vcvtqq2pd"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512d" name="_mm512_mask_cvtepi64_pd">
+	<type>Floating Point</type>
+	<type>Integer</type>
+	<CPUID>AVX512DQ</CPUID>
+	<category>Convert</category>
+	<parameter varname="src" type="__m512d"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m512i"/>
+	<description>Convert packed 64-bit integers in "a" to packed double-precision (64-bit) floating-point elements, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 7
+	i := j*64
+	IF k[j]
+		dst[i+63:i] := Convert_Int64_To_FP64(a[i+63:i])
+	ELSE
+		dst[i+63:i] := src[i+63:i]
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name="vcvtqq2pd"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512d" name="_mm512_maskz_cvt_roundepi64_pd">
+	<type>Floating Point</type>
+	<type>Integer</type>
+	<CPUID>AVX512DQ</CPUID>
+	<category>Convert</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m512i"/>
+	<parameter varname="rounding" type="int"/>
+	<description>Convert packed 64-bit integers in "a" to packed double-precision (64-bit) floating-point elements, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).
+	[round_note]
+	</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*64
+	IF k[j]
+		dst[i+63:i] := Convert_Int64_To_FP64(a[i+63:i])
+	ELSE
+		dst[i+63:i] := 0
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name="vcvtqq2pd"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512d" name="_mm512_maskz_cvtepi64_pd">
+	<type>Floating Point</type>
+	<type>Integer</type>
+	<CPUID>AVX512DQ</CPUID>
+	<category>Convert</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m512i"/>
+	<description>Convert packed 64-bit integers in "a" to packed double-precision (64-bit) floating-point elements, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*64
+	IF k[j]
+		dst[i+63:i] := Convert_Int64_To_FP64(a[i+63:i])
+	ELSE
+		dst[i+63:i] := 0
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name="vcvtqq2pd"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128d" name="_mm_cvtepi64_pd">
+	<type>Floating Point</type>
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512DQ</CPUID>
+	<category>Convert</category>
+	<parameter varname="a" type="__m128i"/>
+	<description>Convert packed 64-bit integers in "a" to packed double-precision (64-bit) floating-point elements, and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 1
+	i := j*64
+	dst[i+63:i] := Convert_Int64_To_FP64(a[i+63:i])
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vcvtqq2pd"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128d" name="_mm_mask_cvtepi64_pd">
+	<type>Floating Point</type>
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512DQ</CPUID>
+	<category>Convert</category>
+	<parameter varname="src" type="__m128d"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128i"/>
+	<description>Convert packed 64-bit integers in "a" to packed double-precision (64-bit) floating-point elements, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 1
+	i := j*64
+	IF k[j]
+		dst[i+63:i] := Convert_Int64_To_FP64(a[i+63:i])
+	ELSE
+		dst[i+63:i] := src[i+63:i]
+	FI
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vcvtqq2pd"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128d" name="_mm_maskz_cvtepi64_pd">
+	<type>Floating Point</type>
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512DQ</CPUID>
+	<category>Convert</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128i"/>
+	<description>Convert packed 64-bit integers in "a" to packed double-precision (64-bit) floating-point elements, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 1
+	i := j*64
+	IF k[j]
+		dst[i+63:i] := Convert_Int64_To_FP64(a[i+63:i])
+	ELSE
+		dst[i+63:i] := 0
+	FI
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vcvtqq2pd"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128" name="_mm256_cvtepi64_ps">
+	<type>Floating Point</type>
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512DQ</CPUID>
+	<category>Convert</category>
+	<parameter varname="a" type="__m256i"/>
+	<description>Convert packed 64-bit integers in "a" to packed single-precision (32-bit) floating-point elements, and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 3
+	i := j*64
+	l := j*32
+	dst[l+31:l] := Convert_Int64_To_FP32(a[i+63:i])
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vcvtqq2ps"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128" name="_mm256_mask_cvtepi64_ps">
+	<type>Floating Point</type>
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512DQ</CPUID>
+	<category>Convert</category>
+	<parameter varname="src" type="__m128"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m256i"/>
+	<description>Convert packed 64-bit integers in "a" to packed single-precision (32-bit) floating-point elements, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 3
+	i := j*64
+	l := j*32
+	IF k[j]
+		dst[l+31:l] := Convert_Int64_To_FP32(a[i+63:i])
+	ELSE
+		dst[l+31:l] := src[l+31:l]
+	FI
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vcvtqq2ps"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128" name="_mm256_maskz_cvtepi64_ps">
+	<type>Floating Point</type>
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512DQ</CPUID>
+	<category>Convert</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m256i"/>
+	<description>Convert packed 64-bit integers in "a" to packed single-precision (32-bit) floating-point elements, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 3
+	i := j*64
+	l := j*32
+	IF k[j]
+		dst[l+31:l] := Convert_Int64_To_FP32(a[i+63:i])
+	ELSE
+		dst[l+31:l] := 0
+	FI
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vcvtqq2ps"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256" name="_mm512_cvt_roundepi64_ps">
+	<type>Floating Point</type>
+	<type>Integer</type>
+	<CPUID>AVX512DQ</CPUID>
+	<category>Convert</category>
+	<parameter varname="a" type="__m512i"/>
+	<parameter varname="rounding" type="int"/>
+	<description>Convert packed 64-bit integers in "a" to packed single-precision (32-bit) floating-point elements, and store the results in "dst".
+	[round_note]
+	</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*64
+	l := j*32
+	dst[l+31:l] := Convert_Int64_To_FP32(a[i+63:i])
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vcvtqq2ps"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256" name="_mm512_cvtepi64_ps">
+	<type>Floating Point</type>
+	<type>Integer</type>
+	<CPUID>AVX512DQ</CPUID>
+	<category>Convert</category>
+	<parameter varname="a" type="__m512i"/>
+	<description>Convert packed 64-bit integers in "a" to packed single-precision (32-bit) floating-point elements, and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*64
+	l := j*32
+	dst[l+31:l] := Convert_Int64_To_FP32(a[i+63:i])
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vcvtqq2ps"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256" name="_mm512_mask_cvt_roundepi64_ps">
+	<type>Floating Point</type>
+	<type>Integer</type>
+	<CPUID>AVX512DQ</CPUID>
+	<category>Convert</category>
+	<parameter varname="src" type="__m256"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m512i"/>
+	<parameter varname="rounding" type="int"/>
+	<description>Convert packed 64-bit integers in "a" to packed single-precision (32-bit) floating-point elements, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). 
+	[round_note]
+	</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*64
+	l := j*32
+	IF k[j]
+		dst[l+31:l] := Convert_Int64_To_FP32(a[i+63:i])
+	ELSE
+		dst[l+31:l] := src[l+31:l]
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vcvtqq2ps"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256" name="_mm512_mask_cvtepi64_ps">
+	<type>Floating Point</type>
+	<type>Integer</type>
+	<CPUID>AVX512DQ</CPUID>
+	<category>Convert</category>
+	<parameter varname="src" type="__m256"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m512i"/>
+	<description>Convert packed 64-bit integers in "a" to packed single-precision (32-bit) floating-point elements, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 7
+	i := j*64
+	l := j*32
+	IF k[j]
+		dst[l+31:l] := Convert_Int64_To_FP32(a[i+63:i])
+	ELSE
+		dst[l+31:l] := src[l+31:l]
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vcvtqq2ps"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256" name="_mm512_maskz_cvt_roundepi64_ps">
+	<type>Floating Point</type>
+	<type>Integer</type>
+	<CPUID>AVX512DQ</CPUID>
+	<category>Convert</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m512i"/>
+	<parameter varname="rounding" type="int"/>
+	<description>Convert packed 64-bit integers in "a" to packed single-precision (32-bit) floating-point elements, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).
+	[round_note]
+	</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*64
+	l := j*32
+	IF k[j]
+		dst[l+31:l] := Convert_Int64_To_FP32(a[i+63:i])
+	ELSE
+		dst[l+31:l] := 0
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vcvtqq2ps"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256" name="_mm512_maskz_cvtepi64_ps">
+	<type>Floating Point</type>
+	<type>Integer</type>
+	<CPUID>AVX512DQ</CPUID>
+	<category>Convert</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m512i"/>
+	<description>Convert packed 64-bit integers in "a" to packed single-precision (32-bit) floating-point elements, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*64
+	l := j*32
+	IF k[j]
+		dst[l+31:l] := Convert_Int64_To_FP32(a[i+63:i])
+	ELSE
+		dst[l+31:l] := 0
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vcvtqq2ps"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128" name="_mm_cvtepi64_ps">
+	<type>Floating Point</type>
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512DQ</CPUID>
+	<category>Convert</category>
+	<parameter varname="a" type="__m128i"/>
+	<description>Convert packed 64-bit integers in "a" to packed single-precision (32-bit) floating-point elements, and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 1
+	i := j*64
+	l := j*32
+	dst[l+31:l] := Convert_Int64_To_FP32(a[i+63:i])
+ENDFOR
+dst[MAX:64] := 0
+	</operation>
+	<instruction name="vcvtqq2ps"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128" name="_mm_mask_cvtepi64_ps">
+	<type>Floating Point</type>
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512DQ</CPUID>
+	<category>Convert</category>
+	<parameter varname="src" type="__m128"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128i"/>
+	<description>Convert packed 64-bit integers in "a" to packed single-precision (32-bit) floating-point elements, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 1
+	i := j*64
+	l := j*32
+	IF k[j]
+		dst[l+31:l] := Convert_Int64_To_FP32(a[i+63:i])
+	ELSE
+		dst[l+31:l] := src[l+31:l]
+	FI
+ENDFOR
+dst[MAX:64] := 0
+	</operation>
+	<instruction name="vcvtqq2ps"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128" name="_mm_maskz_cvtepi64_ps">
+	<type>Floating Point</type>
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512DQ</CPUID>
+	<category>Convert</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128i"/>
+	<description>Convert packed 64-bit integers in "a" to packed single-precision (32-bit) floating-point elements, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 1
+	i := j*64
+	l := j*32
+	IF k[j]
+		dst[l+31:l] := Convert_Int64_To_FP32(a[i+63:i])
+	ELSE
+		dst[l+31:l] := 0
+	FI
+ENDFOR
+dst[MAX:64] := 0
+	</operation>
+	<instruction name="vcvtqq2ps"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm256_mask_cvttpd_epi32">
+	<type>Floating Point</type>
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Convert</category>
+	<parameter varname="src" type="__m128i"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m256d"/>
+	<description>Convert packed double-precision (64-bit) floating-point elements in "a" to packed 32-bit integers with truncation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 3
+	i := 32*j
+	l := 64*j
+	IF k[j]
+		dst[i+31:i] := Convert_FP64_To_Int32_Truncate(a[l+63:l])
+	ELSE
+		dst[i+31:i] := src[i+31:i]
+	FI
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vcvttpd2dq"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm256_maskz_cvttpd_epi32">
+	<type>Floating Point</type>
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Convert</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m256d"/>
+	<description>Convert packed double-precision (64-bit) floating-point elements in "a" to packed 32-bit integers with truncation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 3
+	i := 32*j
+	l := 64*j
+	IF k[j]
+		dst[i+31:i] := Convert_FP64_To_Int32_Truncate(a[l+63:l])
+	ELSE
+		dst[i+31:i] := 0
+	FI
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vcvttpd2dq"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm_mask_cvttpd_epi32">
+	<type>Floating Point</type>
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Convert</category>
+	<parameter varname="src" type="__m128i"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128d"/>
+	<description>Convert packed double-precision (64-bit) floating-point elements in "a" to packed 32-bit integers with truncation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 1
+	i := 32*j
+	l := 64*j
+	IF k[j]
+		dst[i+31:i] := Convert_FP64_To_Int32_Truncate(a[l+63:l])
+	ELSE
+		dst[i+31:i] := src[i+31:i]
+	FI
+ENDFOR
+dst[MAX:64] := 0
+	</operation>
+	<instruction name="vcvttpd2dq"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm_maskz_cvttpd_epi32">
+	<type>Floating Point</type>
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Convert</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128d"/>
+	<description>Convert packed double-precision (64-bit) floating-point elements in "a" to packed 32-bit integers with truncation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 1
+	i := 32*j
+	l := 64*j
+	IF k[j]
+		dst[i+31:i] := Convert_FP64_To_Int32_Truncate(a[l+63:l])
+	ELSE
+		dst[i+31:i] := 0
+	FI
+ENDFOR
+dst[MAX:64] := 0
+	</operation>
+	<instruction name="vcvttpd2dq"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256i" name="_mm256_cvttpd_epi64">
+	<type>Floating Point</type>
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512DQ</CPUID>
+	<category>Convert</category>
+	<parameter varname="a" type="__m256d"/>
+	<description>Convert packed double-precision (64-bit) floating-point elements in "a" to packed 64-bit integers with truncation, and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 3
+	i := j*64
+	dst[i+63:i] := Convert_FP64_To_Int64_Truncate(a[i+63:i])
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vcvttpd2qq"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256i" name="_mm256_mask_cvttpd_epi64">
+	<type>Floating Point</type>
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512DQ</CPUID>
+	<category>Convert</category>
+	<parameter varname="src" type="__m256i"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m256d"/>
+	<description>Convert packed double-precision (64-bit) floating-point elements in "a" to packed 64-bit integers with truncation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 3
+	i := j*64
+	IF k[j]
+		dst[i+63:i] := Convert_FP64_To_Int64_Truncate(a[i+63:i])
+	ELSE
+		dst[i+63:i] := src[i+63:i]
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vcvttpd2qq"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256i" name="_mm256_maskz_cvttpd_epi64">
+	<type>Floating Point</type>
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512DQ</CPUID>
+	<category>Convert</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m256d"/>
+	<description>Convert packed double-precision (64-bit) floating-point elements in "a" to packed 64-bit integers with truncation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).
+	</description>
+	<operation>
+FOR j := 0 to 3
+	i := j*64
+	IF k[j]
+		dst[i+63:i] := Convert_FP64_To_Int64_Truncate(a[i+63:i])
+	ELSE
+		dst[i+63:i] := 0
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vcvttpd2qq"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512i" name="_mm512_cvtt_roundpd_epi64">
+	<type>Floating Point</type>
+	<type>Integer</type>
+	<CPUID>AVX512DQ</CPUID>
+	<category>Convert</category>
+	<parameter varname="a" type="__m512d"/>
+	<parameter varname="sae" type="int"/>
+	<description>Convert packed double-precision (64-bit) floating-point elements in "a" to packed 64-bit integers with truncation, and store the results in "dst". Pass __MM_FROUND_NO_EXC to "sae" to suppress all exceptions.</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*64
+	dst[i+63:i] := Convert_FP64_To_Int64_Truncate(a[i+63:i])
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name="vcvttpd2qq"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512i" name="_mm512_cvttpd_epi64">
+	<type>Floating Point</type>
+	<type>Integer</type>
+	<CPUID>AVX512DQ</CPUID>
+	<category>Convert</category>
+	<parameter varname="a" type="__m512d"/>
+	<description>Convert packed double-precision (64-bit) floating-point elements in "a" to packed 64-bit integers with truncation, and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*64
+	dst[i+63:i] := Convert_FP64_To_Int64_Truncate(a[i+63:i])
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name="vcvttpd2qq"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512i" name="_mm512_mask_cvtt_roundpd_epi64">
+	<type>Floating Point</type>
+	<type>Integer</type>
+	<CPUID>AVX512DQ</CPUID>
+	<category>Convert</category>
+	<parameter varname="src" type="__m512i"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m512d"/>
+	<parameter varname="sae" type="int"/>
+	<description>Convert packed double-precision (64-bit) floating-point elements in "a" to packed 64-bit integers with truncation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). 
+	Pass __MM_FROUND_NO_EXC to "sae" to suppress all exceptions.</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*64
+	IF k[j]
+		dst[i+63:i] := Convert_FP64_To_Int64_Truncate(a[i+63:i])
+	ELSE
+		dst[i+63:i] := src[i+63:i]
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name="vcvttpd2qq"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512i" name="_mm512_mask_cvttpd_epi64">
+	<type>Floating Point</type>
+	<type>Integer</type>
+	<CPUID>AVX512DQ</CPUID>
+	<category>Convert</category>
+	<parameter varname="src" type="__m512i"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m512d"/>
+	<description>Convert packed double-precision (64-bit) floating-point elements in "a" to packed 64-bit integers with truncation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 7
+	i := j*64
+	IF k[j]
+		dst[i+63:i] := Convert_FP64_To_Int64_Truncate(a[i+63:i])
+	ELSE
+		dst[i+63:i] := src[i+63:i]
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name="vcvttpd2qq"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512i" name="_mm512_maskz_cvtt_roundpd_epi64">
+	<type>Floating Point</type>
+	<type>Integer</type>
+	<CPUID>AVX512DQ</CPUID>
+	<category>Convert</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m512d"/>
+	<parameter varname="sae" type="int"/>
+	<description>Convert packed double-precision (64-bit) floating-point elements in "a" to packed 64-bit integers with truncation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). Pass __MM_FROUND_NO_EXC to "sae" to suppress all exceptions.
+	</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*64
+	IF k[j]
+		dst[i+63:i] := Convert_FP64_To_Int64_Truncate(a[i+63:i])
+	ELSE
+		dst[i+63:i] := 0
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name="vcvttpd2qq"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512i" name="_mm512_maskz_cvttpd_epi64">
+	<type>Floating Point</type>
+	<type>Integer</type>
+	<CPUID>AVX512DQ</CPUID>
+	<category>Convert</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m512d"/>
+	<description>Convert packed double-precision (64-bit) floating-point elements in "a" to packed 64-bit integers with truncation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).
+	</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*64
+	IF k[j]
+		dst[i+63:i] := Convert_FP64_To_Int64_Truncate(a[i+63:i])
+	ELSE
+		dst[i+63:i] := 0
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name="vcvttpd2qq"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm_cvttpd_epi64">
+	<type>Floating Point</type>
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512DQ</CPUID>
+	<category>Convert</category>
+	<parameter varname="a" type="__m128d"/>
+	<description>Convert packed double-precision (64-bit) floating-point elements in "a" to packed 64-bit integers with truncation, and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 1
+	i := j*64
+	dst[i+63:i] := Convert_FP64_To_Int64_Truncate(a[i+63:i])
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vcvttpd2qq"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm_mask_cvttpd_epi64">
+	<type>Floating Point</type>
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512DQ</CPUID>
+	<category>Convert</category>
+	<parameter varname="src" type="__m128i"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128d"/>
+	<description>Convert packed double-precision (64-bit) floating-point elements in "a" to packed 64-bit integers with truncation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 1
+	i := j*64
+	IF k[j]
+		dst[i+63:i] := Convert_FP64_To_Int64_Truncate(a[i+63:i])
+	ELSE
+		dst[i+63:i] := src[i+63:i]
+	FI
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vcvttpd2qq"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm_maskz_cvttpd_epi64">
+	<type>Floating Point</type>
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512DQ</CPUID>
+	<category>Convert</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128d"/>
+	<description>Convert packed double-precision (64-bit) floating-point elements in "a" to packed 64-bit integers with truncation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).
+	</description>
+	<operation>
+FOR j := 0 to 1
+	i := j*64
+	IF k[j]
+		dst[i+63:i] := Convert_FP64_To_Int64_Truncate(a[i+63:i])
+	ELSE
+		dst[i+63:i] := 0
+	FI
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vcvttpd2qq"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm256_cvttpd_epu32">
+	<type>Floating Point</type>
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Convert</category>
+	<parameter varname="a" type="__m256d"/>
+	<description>Convert packed double-precision (64-bit) floating-point elements in "a" to packed unsigned 32-bit integers with truncation, and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 3
+	i := 32*j
+	k := 64*j
+	dst[i+31:i] := Convert_FP64_To_UnsignedInt32_Truncate(a[k+63:k])
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vcvttpd2udq"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm256_mask_cvttpd_epu32">
+	<type>Floating Point</type>
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Convert</category>
+	<parameter varname="src" type="__m128i"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m256d"/>
+	<description>Convert packed double-precision (64-bit) floating-point elements in "a" to packed unsigned 32-bit integers with truncation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 3
+	i := 32*j
+	l := 64*j
+	IF k[j]
+		dst[i+31:i] := Convert_FP64_To_UnsignedInt32_Truncate(a[l+63:l])
+	ELSE
+		dst[i+31:i] := src[i+31:i]
+	FI
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vcvttpd2udq"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm256_maskz_cvttpd_epu32">
+	<type>Floating Point</type>
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Convert</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m256d"/>
+	<description>Convert packed double-precision (64-bit) floating-point elements in "a" to packed unsigned 32-bit integers with truncation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 3
+	i := 32*j
+	l := 64*j
+	IF k[j]
+		dst[i+31:i] := Convert_FP64_To_UnsignedInt32_Truncate(a[l+63:l])
+	ELSE
+		dst[i+31:i] := 0
+	FI
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vcvttpd2udq"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm_cvttpd_epu32">
+	<type>Floating Point</type>
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Convert</category>
+	<parameter varname="a" type="__m128d"/>
+	<description>Convert packed double-precision (64-bit) floating-point elements in "a" to packed unsigned 32-bit integers with truncation, and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 1
+	i := 32*j
+	k := 64*j
+	dst[i+31:i] := Convert_FP64_To_UnsignedInt32_Truncate(a[k+63:k])
+ENDFOR
+dst[MAX:64] := 0
+	</operation>
+	<instruction name="vcvttpd2udq"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm_mask_cvttpd_epu32">
+	<type>Floating Point</type>
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Convert</category>
+	<parameter varname="src" type="__m128i"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128d"/>
+	<description>Convert packed double-precision (64-bit) floating-point elements in "a" to packed unsigned 32-bit integers with truncation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 1
+	i := 32*j
+	l := 64*j
+	IF k[j]
+		dst[i+31:i] := Convert_FP64_To_UnsignedInt32_Truncate(a[l+63:l])
+	ELSE
+		dst[i+31:i] := src[i+31:i]
+	FI
+ENDFOR
+dst[MAX:64] := 0
+	</operation>
+	<instruction name="vcvttpd2udq"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm_maskz_cvttpd_epu32">
+	<type>Floating Point</type>
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Convert</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128d"/>
+	<description>Convert packed double-precision (64-bit) floating-point elements in "a" to packed unsigned 32-bit integers with truncation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 1
+	i := 32*j
+	l := 64*j
+	IF k[j]
+		dst[i+31:i] := Convert_FP64_To_UnsignedInt32_Truncate(a[l+63:l])
+	ELSE
+		dst[i+31:i] := 0
+	FI
+ENDFOR
+dst[MAX:64] := 0
+	</operation>
+	<instruction name="vcvttpd2udq"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256i" name="_mm256_cvttpd_epu64">
+	<type>Floating Point</type>
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512DQ</CPUID>
+	<category>Convert</category>
+	<parameter varname="a" type="__m256d"/>
+	<description>Convert packed double-precision (64-bit) floating-point elements in "a" to packed unsigned 64-bit integers with truncation, and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 3
+	i := j*64
+	dst[i+63:i] := Convert_FP64_To_UnsignedInt64_Truncate(a[i+63:i])
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vcvttpd2uqq"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256i" name="_mm256_mask_cvttpd_epu64">
+	<type>Floating Point</type>
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512DQ</CPUID>
+	<category>Convert</category>
+	<parameter varname="src" type="__m256i"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m256d"/>
+	<description>Convert packed double-precision (64-bit) floating-point elements in "a" to packed unsigned 64-bit integers with truncation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 3
+	i := j*64
+	IF k[j]
+		dst[i+63:i] := Convert_FP64_To_UnsignedInt64_Truncate(a[i+63:i])
+	ELSE
+		dst[i+63:i] := src[i+63:i]
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vcvttpd2uqq"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256i" name="_mm256_maskz_cvttpd_epu64">
+	<type>Floating Point</type>
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512DQ</CPUID>
+	<category>Convert</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m256d"/>
+	<description>Convert packed double-precision (64-bit) floating-point elements in "a" to packed unsigned 64-bit integers with truncation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).
+	</description>
+	<operation>
+FOR j := 0 to 3
+	i := j*64
+	IF k[j]
+		dst[i+63:i] := Convert_FP64_To_UnsignedInt64_Truncate(a[i+63:i])
+	ELSE
+		dst[i+63:i] := 0
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vcvttpd2uqq"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512i" name="_mm512_cvtt_roundpd_epu64">
+	<type>Floating Point</type>
+	<type>Integer</type>
+	<CPUID>AVX512DQ</CPUID>
+	<category>Convert</category>
+	<parameter varname="a" type="__m512d"/>
+	<parameter varname="sae" type="int"/>
+	<description>Convert packed double-precision (64-bit) floating-point elements in "a" to packed unsigned 64-bit integers with truncation, and store the results in "dst". Pass __MM_FROUND_NO_EXC to "sae" to suppress all exceptions.</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*64
+	dst[i+63:i] := Convert_FP64_To_UnsignedInt64_Truncate(a[i+63:i])
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name="vcvttpd2uqq"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512i" name="_mm512_cvttpd_epu64">
+	<type>Floating Point</type>
+	<type>Integer</type>
+	<CPUID>AVX512DQ</CPUID>
+	<category>Convert</category>
+	<parameter varname="a" type="__m512d"/>
+	<description>Convert packed double-precision (64-bit) floating-point elements in "a" to packed unsigned 64-bit integers with truncation, and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*64
+	dst[i+63:i] := Convert_FP64_To_UnsignedInt64_Truncate(a[i+63:i])
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name="vcvttpd2uqq"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512i" name="_mm512_mask_cvtt_roundpd_epu64">
+	<type>Floating Point</type>
+	<type>Integer</type>
+	<CPUID>AVX512DQ</CPUID>
+	<category>Convert</category>
+	<parameter varname="src" type="__m512i"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m512d"/>
+	<parameter varname="sae" type="int"/>
+	<description>Convert packed double-precision (64-bit) floating-point elements in "a" to packed unsigned 64-bit integers with truncation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). 
+	Pass __MM_FROUND_NO_EXC to "sae" to suppress all exceptions.</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*64
+	IF k[j]
+		dst[i+63:i] := Convert_FP64_To_UnsignedInt64_Truncate(a[i+63:i])
+	ELSE
+		dst[i+63:i] := src[i+63:i]
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name="vcvttpd2uqq"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512i" name="_mm512_mask_cvttpd_epu64">
+	<type>Floating Point</type>
+	<type>Integer</type>
+	<CPUID>AVX512DQ</CPUID>
+	<category>Convert</category>
+	<parameter varname="src" type="__m512i"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m512d"/>
+	<description>Convert packed double-precision (64-bit) floating-point elements in "a" to packed unsigned 64-bit integers with truncation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 7
+	i := j*64
+	IF k[j]
+		dst[i+63:i] := Convert_FP64_To_UnsignedInt64_Truncate(a[i+63:i])
+	ELSE
+		dst[i+63:i] := src[i+63:i]
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name="vcvttpd2uqq"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512i" name="_mm512_maskz_cvtt_roundpd_epu64">
+	<type>Floating Point</type>
+	<type>Integer</type>
+	<CPUID>AVX512DQ</CPUID>
+	<category>Convert</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m512d"/>
+	<parameter varname="sae" type="int"/>
+	<description>Convert packed double-precision (64-bit) floating-point elements in "a" to packed unsigned 64-bit integers with truncation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). Pass __MM_FROUND_NO_EXC to "sae" to suppress all exceptions.
+	</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*64
+	IF k[j]
+		dst[i+63:i] := Convert_FP64_To_UnsignedInt64_Truncate(a[i+63:i])
+	ELSE
+		dst[i+63:i] := 0
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name="vcvttpd2uqq"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512i" name="_mm512_maskz_cvttpd_epu64">
+	<type>Floating Point</type>
+	<type>Integer</type>
+	<CPUID>AVX512DQ</CPUID>
+	<category>Convert</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m512d"/>
+	<description>Convert packed double-precision (64-bit) floating-point elements in "a" to packed unsigned 64-bit integers with truncation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).
+	</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*64
+	IF k[j]
+		dst[i+63:i] := Convert_FP64_To_UnsignedInt64_Truncate(a[i+63:i])
+	ELSE
+		dst[i+63:i] := 0
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name="vcvttpd2uqq"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm_cvttpd_epu64">
+	<type>Floating Point</type>
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512DQ</CPUID>
+	<category>Convert</category>
+	<parameter varname="a" type="__m128d"/>
+	<description>Convert packed double-precision (64-bit) floating-point elements in "a" to packed unsigned 64-bit integers with truncation, and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 1
+	i := j*64
+	dst[i+63:i] := Convert_FP64_To_UnsignedInt64_Truncate(a[i+63:i])
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vcvttpd2uqq"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm_mask_cvttpd_epu64">
+	<type>Floating Point</type>
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512DQ</CPUID>
+	<category>Convert</category>
+	<parameter varname="src" type="__m128i"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128d"/>
+	<description>Convert packed double-precision (64-bit) floating-point elements in "a" to packed unsigned 64-bit integers with truncation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 1
+	i := j*64
+	IF k[j]
+		dst[i+63:i] := Convert_FP64_To_UnsignedInt64_Truncate(a[i+63:i])
+	ELSE
+		dst[i+63:i] := src[i+63:i]
+	FI
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vcvttpd2uqq"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm_maskz_cvttpd_epu64">
+	<type>Floating Point</type>
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512DQ</CPUID>
+	<category>Convert</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128d"/>
+	<description>Convert packed double-precision (64-bit) floating-point elements in "a" to packed unsigned 64-bit integers with truncation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).
+	</description>
+	<operation>
+FOR j := 0 to 1
+	i := j*64
+	IF k[j]
+		dst[i+63:i] := Convert_FP64_To_UnsignedInt64_Truncate(a[i+63:i])
+	ELSE
+		dst[i+63:i] := 0
+	FI
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vcvttpd2uqq"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256i" name="_mm256_mask_cvttps_epi32">
+	<type>Floating Point</type>
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Convert</category>
+	<parameter varname="src" type="__m256i"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m256"/>
+	<description>Convert packed single-precision (32-bit) floating-point elements in "a" to packed 32-bit integers with truncation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 7
+	i := 32*j
+	IF k[j]
+		dst[i+31:i] := Convert_FP32_To_Int32_Truncate(a[i+31:i])
+	ELSE
+		dst[i+31:i] := src[i+31:i]
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vcvttps2dq"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256i" name="_mm256_maskz_cvttps_epi32">
+	<type>Floating Point</type>
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Convert</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m256"/>
+	<description>Convert packed single-precision (32-bit) floating-point elements in "a" to packed 32-bit integers with truncation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 7
+	i := 32*j
+	IF k[j]
+		dst[i+31:i] := Convert_FP32_To_Int32_Truncate(a[i+31:i])
+	ELSE
+		dst[i+31:i] := 0
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vcvttps2dq"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm_mask_cvttps_epi32">
+	<type>Floating Point</type>
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Convert</category>
+	<parameter varname="src" type="__m128i"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128"/>
+	<description>Convert packed single-precision (32-bit) floating-point elements in "a" to packed 32-bit integers with truncation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 3
+	i := 32*j
+	IF k[j]
+		dst[i+31:i] := Convert_FP32_To_Int32_Truncate(a[i+31:i])
+	ELSE
+		dst[i+31:i] := src[i+31:i]
+	FI
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vcvttps2dq"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm_maskz_cvttps_epi32">
+	<type>Floating Point</type>
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Convert</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128"/>
+	<description>Convert packed single-precision (32-bit) floating-point elements in "a" to packed 32-bit integers with truncation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 3
+	i := 32*j
+	IF k[j]
+		dst[i+31:i] := Convert_FP32_To_Int32_Truncate(a[i+31:i])
+	ELSE
+		dst[i+31:i] := 0
+	FI
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vcvttps2dq"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256i" name="_mm256_cvttps_epi64">
+	<type>Floating Point</type>
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512DQ</CPUID>
+	<category>Convert</category>
+	<parameter varname="a" type="__m128"/>
+	<description>Convert packed single-precision (32-bit) floating-point elements in "a" to packed 64-bit integers with truncation, and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 3
+	i := j*64
+	l := j*32
+	dst[i+63:i] := Convert_FP32_To_Int64_Truncate(a[l+31:l])
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vcvttps2qq"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256i" name="_mm256_mask_cvttps_epi64">
+	<type>Floating Point</type>
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512DQ</CPUID>
+	<category>Convert</category>
+	<parameter varname="src" type="__m256i"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128"/>
+	<description>Convert packed single-precision (32-bit) floating-point elements in "a" to packed 64-bit integers with truncation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 3
+	i := j*64
+	l := j*32
+	IF k[j]
+		dst[i+63:i] := Convert_FP32_To_Int64_Truncate(a[l+31:l])
+	ELSE
+		dst[i+63:i] := src[i+63:i]
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vcvttps2qq"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256i" name="_mm256_maskz_cvttps_epi64">
+	<type>Floating Point</type>
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512DQ</CPUID>
+	<category>Convert</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128"/>
+	<description>Convert packed single-precision (32-bit) floating-point elements in "a" to packed 64-bit integers with truncation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 3
+	i := j*64
+	l := j*32
+	IF k[j]
+		dst[i+63:i] := Convert_FP32_To_Int64_Truncate(a[l+31:l])
+	ELSE
+		dst[i+63:i] := 0
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vcvttps2qq"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512i" name="_mm512_cvtt_roundps_epi64">
+	<type>Floating Point</type>
+	<type>Integer</type>
+	<CPUID>AVX512DQ</CPUID>
+	<category>Convert</category>
+	<parameter varname="a" type="__m256"/>
+	<parameter varname="sae" type="int"/>
+	<description>Convert packed single-precision (32-bit) floating-point elements in "a" to packed 64-bit integers with truncation, and store the results in "dst". Pass __MM_FROUND_NO_EXC to "sae" to suppress all exceptions.</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*64
+	l := j*32
+	dst[i+63:i] := Convert_FP32_To_Int64_Truncate(a[l+31:l])
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name="vcvttps2qq"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512i" name="_mm512_cvttps_epi64">
+	<type>Floating Point</type>
+	<type>Integer</type>
+	<CPUID>AVX512DQ</CPUID>
+	<category>Convert</category>
+	<parameter varname="a" type="__m256"/>
+	<description>Convert packed single-precision (32-bit) floating-point elements in "a" to packed 64-bit integers with truncation, and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*64
+	l := j*32
+	dst[i+63:i] := Convert_FP32_To_Int64_Truncate(a[l+31:l])
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name="vcvttps2qq"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512i" name="_mm512_mask_cvtt_roundps_epi64">
+	<type>Floating Point</type>
+	<type>Integer</type>
+	<CPUID>AVX512DQ</CPUID>
+	<category>Convert</category>
+	<parameter varname="src" type="__m512i"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m256"/>
+	<parameter varname="sae" type="int"/>
+	<description>Convert packed single-precision (32-bit) floating-point elements in "a" to packed 64-bit integers with truncation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). 
+	Pass __MM_FROUND_NO_EXC to "sae" to suppress all exceptions.
+	</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*64
+	l := j*32
+	IF k[j]
+		dst[i+63:i] := Convert_FP32_To_Int64_Truncate(a[l+31:l])
+	ELSE
+		dst[i+63:i] := src[i+63:i]
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name="vcvttps2qq"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512i" name="_mm512_mask_cvttps_epi64">
+	<type>Floating Point</type>
+	<type>Integer</type>
+	<CPUID>AVX512DQ</CPUID>
+	<category>Convert</category>
+	<parameter varname="src" type="__m512i"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m256"/>
+	<description>Convert packed single-precision (32-bit) floating-point elements in "a" to packed 64-bit integers with truncation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 7
+	i := j*64
+	l := j*32
+	IF k[j]
+		dst[i+63:i] := Convert_FP32_To_Int64_Truncate(a[l+31:l])
+	ELSE
+		dst[i+63:i] := src[i+63:i]
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name="vcvttps2qq"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512i" name="_mm512_maskz_cvtt_roundps_epi64">
+	<type>Floating Point</type>
+	<type>Integer</type>
+	<CPUID>AVX512DQ</CPUID>
+	<category>Convert</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m256"/>
+	<parameter varname="sae" type="int"/>
+	<description>Convert packed single-precision (32-bit) floating-point elements in "a" to packed 64-bit integers with truncation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). Pass __MM_FROUND_NO_EXC to "sae" to suppress all exceptions.
+	</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*64
+	l := j*32
+	IF k[j]
+		dst[i+63:i] := Convert_FP32_To_Int64_Truncate(a[l+31:l])
+	ELSE
+		dst[i+63:i] := 0
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name="vcvttps2qq"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512i" name="_mm512_maskz_cvttps_epi64">
+	<type>Floating Point</type>
+	<type>Integer</type>
+	<CPUID>AVX512DQ</CPUID>
+	<category>Convert</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m256"/>
+	<description>Convert packed single-precision (32-bit) floating-point elements in "a" to packed 64-bit integers with truncation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*64
+	l := j*32
+	IF k[j]
+		dst[i+63:i] := Convert_FP32_To_Int64_Truncate(a[l+31:l])
+	ELSE
+		dst[i+63:i] := 0
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name="vcvttps2qq"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm_cvttps_epi64">
+	<type>Floating Point</type>
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512DQ</CPUID>
+	<category>Convert</category>
+	<parameter varname="a" type="__m128"/>
+	<description>Convert packed single-precision (32-bit) floating-point elements in "a" to packed 64-bit integers with truncation, and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 1
+	i := j*64
+	l := j*32
+	dst[i+63:i] := Convert_FP32_To_Int64_Truncate(a[l+31:l])
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vcvttps2qq"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm_mask_cvttps_epi64">
+	<type>Floating Point</type>
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512DQ</CPUID>
+	<category>Convert</category>
+	<parameter varname="src" type="__m128i"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128"/>
+	<description>Convert packed single-precision (32-bit) floating-point elements in "a" to packed 64-bit integers with truncation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 1
+	i := j*64
+	l := j*32
+	IF k[j]
+		dst[i+63:i] := Convert_FP32_To_Int64_Truncate(a[l+31:l])
+	ELSE
+		dst[i+63:i] := src[i+63:i]
+	FI
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vcvttps2qq"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm_maskz_cvttps_epi64">
+	<type>Floating Point</type>
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512DQ</CPUID>
+	<category>Convert</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128"/>
+	<description>Convert packed single-precision (32-bit) floating-point elements in "a" to packed 64-bit integers with truncation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 1
+	i := j*64
+	l := j*32
+	IF k[j]
+		dst[i+63:i] := Convert_FP32_To_Int64_Truncate(a[l+31:l])
+	ELSE
+		dst[i+63:i] := 0
+	FI
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vcvttps2qq"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256i" name="_mm256_cvttps_epu32">
+	<type>Floating Point</type>
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Convert</category>
+	<parameter varname="a" type="__m256"/>
+	<description>Convert packed single-precision (32-bit) floating-point elements in "a" to packed unsigned 32-bit integers with truncation, and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 7
+	i := 32*j
+	dst[i+31:i] := Convert_FP32_To_UnsignedInt32_Truncate(a[i+31:i])
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vcvttps2udq"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256i" name="_mm256_mask_cvttps_epu32">
+	<type>Floating Point</type>
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Convert</category>
+	<parameter varname="src" type="__m256i"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m256"/>
+	<description>Convert packed single-precision (32-bit) floating-point elements in "a" to packed unsigned 32-bit integers with truncation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 7
+	i := 32*j
+	IF k[j]
+		dst[i+31:i] := Convert_FP32_To_UnsignedInt32_Truncate(a[i+31:i])
+	ELSE
+		dst[i+31:i] := src[i+31:i]
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vcvttps2udq"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256i" name="_mm256_maskz_cvttps_epu32">
+	<type>Floating Point</type>
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Convert</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m256"/>
+	<description>Convert packed single-precision (32-bit) floating-point elements in "a" to packed unsigned 32-bit integers with truncation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 7
+	i := 32*j
+	IF k[j]
+		dst[i+31:i] := Convert_FP32_To_UnsignedInt32_Truncate(a[i+31:i])
+	ELSE
+		dst[i+31:i] := 0
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vcvttps2udq"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm_cvttps_epu32">
+	<type>Floating Point</type>
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Convert</category>
+	<parameter varname="a" type="__m128"/>
+	<description>Convert packed single-precision (32-bit) floating-point elements in "a" to packed unsigned 32-bit integers with truncation, and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 3
+	i := 32*j
+	dst[i+31:i] := Convert_FP32_To_UnsignedInt32_Truncate(a[i+31:i])
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vcvttps2udq"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm_mask_cvttps_epu32">
+	<type>Floating Point</type>
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Convert</category>
+	<parameter varname="src" type="__m128i"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128"/>
+	<description>Convert packed single-precision (32-bit) floating-point elements in "a" to packed unsigned 32-bit integers with truncation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 3
+	i := 32*j
+	IF k[j]
+		dst[i+31:i] := Convert_FP32_To_UnsignedInt32_Truncate(a[i+31:i])
+	ELSE
+		dst[i+31:i] := src[i+31:i]
+	FI
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vcvttps2udq"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm_maskz_cvttps_epu32">
+	<type>Floating Point</type>
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Convert</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128"/>
+	<description>Convert packed single-precision (32-bit) floating-point elements in "a" to packed unsigned 32-bit integers with truncation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 3
+	i := 32*j
+	IF k[j]
+		dst[i+31:i] := Convert_FP32_To_UnsignedInt32_Truncate(a[i+31:i])
+	ELSE
+		dst[i+31:i] := 0
+	FI
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vcvttps2udq"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256i" name="_mm256_cvttps_epu64">
+	<type>Floating Point</type>
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512DQ</CPUID>
+	<category>Convert</category>
+	<parameter varname="a" type="__m128"/>
+	<description>Convert packed single-precision (32-bit) floating-point elements in "a" to packed unsigned 64-bit integers with truncation, and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 3
+	i := j*64
+	l := j*32
+	dst[i+63:i] := Convert_FP32_To_UnsignedInt64_Truncate(a[l+31:l])
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vcvttps2uqq"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256i" name="_mm256_mask_cvttps_epu64">
+	<type>Floating Point</type>
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512DQ</CPUID>
+	<category>Convert</category>
+	<parameter varname="src" type="__m256i"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128"/>
+	<description>Convert packed single-precision (32-bit) floating-point elements in "a" to packed unsigned 64-bit integers with truncation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 3
+	i := j*64
+	l := j*32
+	IF k[j]
+		dst[i+63:i] := Convert_FP32_To_UnsignedInt64_Truncate(a[l+31:l])
+	ELSE
+		dst[i+63:i] := src[i+63:i]
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vcvttps2uqq"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256i" name="_mm256_maskz_cvttps_epu64">
+	<type>Floating Point</type>
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512DQ</CPUID>
+	<category>Convert</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128"/>
+	<description>Convert packed single-precision (32-bit) floating-point elements in "a" to packed unsigned 64-bit integers with truncation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).
+	</description>
+	<operation>
+FOR j := 0 to 3
+	i := j*64
+	l := j*32
+	IF k[j]
+		dst[i+63:i] := Convert_FP32_To_UnsignedInt64_Truncate(a[l+31:l])
+	ELSE
+		dst[i+63:i] := 0
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vcvttps2uqq"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512i" name="_mm512_cvtt_roundps_epu64">
+	<type>Floating Point</type>
+	<type>Integer</type>
+	<CPUID>AVX512DQ</CPUID>
+	<category>Convert</category>
+	<parameter varname="a" type="__m256"/>
+	<parameter varname="sae" type="int"/>
+	<description>Convert packed single-precision (32-bit) floating-point elements in "a" to packed unsigned 64-bit integers with truncation, and store the results in "dst". Pass __MM_FROUND_NO_EXC to "sae" to suppress all exceptions.</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*64
+	l := j*32
+	dst[i+63:i] := Convert_FP32_To_UnsignedInt64_Truncate(a[l+31:l])
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name="vcvttps2uqq"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512i" name="_mm512_cvttps_epu64">
+	<type>Floating Point</type>
+	<type>Integer</type>
+	<CPUID>AVX512DQ</CPUID>
+	<category>Convert</category>
+	<parameter varname="a" type="__m256"/>
+	<description>Convert packed single-precision (32-bit) floating-point elements in "a" to packed unsigned 64-bit integers with truncation, and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*64
+	l := j*32
+	dst[i+63:i] := Convert_FP32_To_UnsignedInt64_Truncate(a[l+31:l])
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name="vcvttps2uqq"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512i" name="_mm512_mask_cvtt_roundps_epu64">
+	<type>Floating Point</type>
+	<type>Integer</type>
+	<CPUID>AVX512DQ</CPUID>
+	<category>Convert</category>
+	<parameter varname="src" type="__m512i"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m256"/>
+	<parameter varname="sae" type="int"/>
+	<description>Convert packed single-precision (32-bit) floating-point elements in "a" to packed unsigned 64-bit integers with truncation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). 
+	Pass __MM_FROUND_NO_EXC to "sae" to suppress all exceptions.
+	</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*64
+	l := j*32
+	IF k[j]
+		dst[i+63:i] := Convert_FP32_To_UnsignedInt64_Truncate(a[l+31:l])
+	ELSE
+		dst[i+63:i] := src[i+63:i]
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name="vcvttps2uqq"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512i" name="_mm512_mask_cvttps_epu64">
+	<type>Floating Point</type>
+	<type>Integer</type>
+	<CPUID>AVX512DQ</CPUID>
+	<category>Convert</category>
+	<parameter varname="src" type="__m512i"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m256"/>
+	<description>Convert packed single-precision (32-bit) floating-point elements in "a" to packed unsigned 64-bit integers with truncation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 7
+	i := j*64
+	l := j*32
+	IF k[j]
+		dst[i+63:i] := Convert_FP32_To_UnsignedInt64_Truncate(a[l+31:l])
+	ELSE
+		dst[i+63:i] := src[i+63:i]
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name="vcvttps2uqq"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512i" name="_mm512_maskz_cvtt_roundps_epu64">
+	<type>Floating Point</type>
+	<type>Integer</type>
+	<CPUID>AVX512DQ</CPUID>
+	<category>Convert</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m256"/>
+	<parameter varname="sae" type="int"/>
+	<description>Convert packed single-precision (32-bit) floating-point elements in "a" to packed unsigned 64-bit integers with truncation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). Pass __MM_FROUND_NO_EXC to "sae" to suppress all exceptions.
+	</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*64
+	l := j*32
+	IF k[j]
+		dst[i+63:i] := Convert_FP32_To_UnsignedInt64_Truncate(a[l+31:l])
+	ELSE
+		dst[i+63:i] := 0
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name="vcvttps2uqq"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512i" name="_mm512_maskz_cvttps_epu64">
+	<type>Floating Point</type>
+	<type>Integer</type>
+	<CPUID>AVX512DQ</CPUID>
+	<category>Convert</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m256"/>
+	<description>Convert packed single-precision (32-bit) floating-point elements in "a" to packed unsigned 64-bit integers with truncation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).
+	</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*64
+	l := j*32
+	IF k[j]
+		dst[i+63:i] := Convert_FP32_To_UnsignedInt64_Truncate(a[l+31:l])
+	ELSE
+		dst[i+63:i] := 0
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name="vcvttps2uqq"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm_cvttps_epu64">
+	<type>Floating Point</type>
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512DQ</CPUID>
+	<category>Convert</category>
+	<parameter varname="a" type="__m128"/>
+	<description>Convert packed single-precision (32-bit) floating-point elements in "a" to packed unsigned 64-bit integers with truncation, and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 1
+	i := j*64
+	l := j*32
+	dst[i+63:i] := Convert_FP32_To_UnsignedInt64_Truncate(a[l+31:l])
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vcvttps2uqq"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm_mask_cvttps_epu64">
+	<type>Floating Point</type>
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512DQ</CPUID>
+	<category>Convert</category>
+	<parameter varname="src" type="__m128i"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128"/>
+	<description>Convert packed single-precision (32-bit) floating-point elements in "a" to packed unsigned 64-bit integers with truncation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 1
+	i := j*64
+	l := j*32
+	IF k[j]
+		dst[i+63:i] := Convert_FP32_To_UnsignedInt64_Truncate(a[l+31:l])
+	ELSE
+		dst[i+63:i] := src[i+63:i]
+	FI
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vcvttps2uqq"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm_maskz_cvttps_epu64">
+	<type>Floating Point</type>
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512DQ</CPUID>
+	<category>Convert</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128"/>
+	<description>Convert packed single-precision (32-bit) floating-point elements in "a" to packed unsigned 64-bit integers with truncation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).
+	</description>
+	<operation>
+FOR j := 0 to 1
+	i := j*64
+	l := j*32
+	IF k[j]
+		dst[i+63:i] := Convert_FP32_To_UnsignedInt64_Truncate(a[l+31:l])
+	ELSE
+		dst[i+63:i] := 0
+	FI
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vcvttps2uqq"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256d" name="_mm256_cvtepu32_pd">
+	<type>Floating Point</type>
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Convert</category>
+	<parameter varname="a" type="__m128i"/>
+	<description>Convert packed unsigned 32-bit integers in "a" to packed double-precision (64-bit) floating-point elements, and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 3
+	i := j*64
+	l := j*32
+	dst[i+63:i] := ConvertUnsignedIntegerTo_FP64(a[l+31:l])
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vcvtudq2pd"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256d" name="_mm256_mask_cvtepu32_pd">
+	<type>Floating Point</type>
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Convert</category>
+	<parameter varname="src" type="__m256d"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128i"/>
+	<description>Convert packed unsigned 32-bit integers in "a" to packed double-precision (64-bit) floating-point elements, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). 
+	</description>
+	<operation>
+FOR j := 0 to 3
+	i := j*64
+	l := j*32
+	IF k[j]
+		dst[i+63:i] := ConvertUnsignedIntegerTo_FP64(a[l+31:l])
+	ELSE
+		dst[i+63:i] := src[i+63:i]
+	FI	
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vcvtudq2pd"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256d" name="_mm256_maskz_cvtepu32_pd">
+	<type>Floating Point</type>
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Convert</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128i"/>
+	<description>Convert packed unsigned 32-bit integers in "a" to packed double-precision (64-bit) floating-point elements, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 3
+	i := j*64
+	l := j*32
+	IF k[j]
+		dst[i+63:i] := ConvertUnsignedIntegerTo_FP64(a[l+31:l])
+	ELSE
+		dst[i+63:i] := 0
+	FI	
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vcvtudq2pd"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128d" name="_mm_cvtepu32_pd">
+	<type>Floating Point</type>
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Convert</category>
+	<parameter varname="a" type="__m128i"/>
+	<description>Convert packed unsigned 32-bit integers in "a" to packed double-precision (64-bit) floating-point elements, and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 1
+	i := j*64
+	l := j*32
+	dst[i+63:i] := ConvertUnsignedIntegerTo_FP64(a[l+31:l])
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vcvtudq2pd"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128d" name="_mm_mask_cvtepu32_pd">
+	<type>Floating Point</type>
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Convert</category>
+	<parameter varname="src" type="__m128d"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128i"/>
+	<description>Convert packed unsigned 32-bit integers in "a" to packed double-precision (64-bit) floating-point elements, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). 
+	</description>
+	<operation>
+FOR j := 0 to 1
+	i := j*64
+	l := j*32
+	IF k[j]
+		dst[i+63:i] := ConvertUnsignedIntegerTo_FP64(a[l+31:l])
+	ELSE
+		dst[i+63:i] := src[i+63:i]
+	FI	
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vcvtudq2pd"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128d" name="_mm_maskz_cvtepu32_pd">
+	<type>Floating Point</type>
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Convert</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128i"/>
+	<description>Convert packed unsigned 32-bit integers in "a" to packed double-precision (64-bit) floating-point elements, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 1
+	i := j*64
+	l := j*32
+	IF k[j]
+		dst[i+63:i] := ConvertUnsignedIntegerTo_FP64(a[l+31:l])
+	ELSE
+		dst[i+63:i] := 0
+	FI	
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vcvtudq2pd"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256d" name="_mm256_cvtepu64_pd">
+	<type>Floating Point</type>
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512DQ</CPUID>
+	<category>Convert</category>
+	<parameter varname="a" type="__m256i"/>
+	<description>Convert packed unsigned 64-bit integers in "a" to packed double-precision (64-bit) floating-point elements, and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 3
+	i := j*64
+	dst[i+63:i] := ConvertUnsignedInt64_To_FP64(a[i+63:i])
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vcvtuqq2pd"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256d" name="_mm256_mask_cvtepu64_pd">
+	<type>Floating Point</type>
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512DQ</CPUID>
+	<category>Convert</category>
+	<parameter varname="src" type="__m256d"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m256i"/>
+	<description>Convert packed unsigned 64-bit integers in "a" to packed double-precision (64-bit) floating-point elements, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). 
+	</description>
+	<operation>
+FOR j := 0 to 3
+	i := j*64
+	IF k[j]
+		dst[i+63:i] := ConvertUnsignedInt64_To_FP64(a[i+63:i])
+	ELSE
+		dst[i+63:i] := src[i+63:i]
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vcvtuqq2pd"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256d" name="_mm256_maskz_cvtepu64_pd">
+	<type>Floating Point</type>
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512DQ</CPUID>
+	<category>Convert</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m256i"/>
+	<description>Convert packed unsigned 64-bit integers in "a" to packed double-precision (64-bit) floating-point elements, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 3
+	i := j*64
+	IF k[j]
+		dst[i+63:i] := ConvertUnsignedInt64_To_FP64(a[i+63:i])
+	ELSE
+		dst[i+63:i] := 0
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vcvtuqq2pd"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512d" name="_mm512_cvt_roundepu64_pd">
+	<type>Floating Point</type>
+	<type>Integer</type>
+	<CPUID>AVX512DQ</CPUID>
+	<category>Convert</category>
+	<parameter varname="a" type="__m512i"/>
+	<parameter varname="rounding" type="int"/>
+	<description>Convert packed unsigned 64-bit integers in "a" to packed double-precision (64-bit) floating-point elements, and store the results in "dst".
+	[round_note]
+	</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*64
+	dst[i+63:i] := ConvertUnsignedInt64_To_FP64(a[i+63:i])
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name="vcvtuqq2pd"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512d" name="_mm512_cvtepu64_pd">
+	<type>Floating Point</type>
+	<type>Integer</type>
+	<CPUID>AVX512DQ</CPUID>
+	<category>Convert</category>
+	<parameter varname="a" type="__m512i"/>
+	<description>Convert packed unsigned 64-bit integers in "a" to packed double-precision (64-bit) floating-point elements, and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*64
+	dst[i+63:i] := ConvertUnsignedInt64_To_FP64(a[i+63:i])
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name="vcvtuqq2pd"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512d" name="_mm512_mask_cvt_roundepu64_pd">
+	<type>Floating Point</type>
+	<type>Integer</type>
+	<CPUID>AVX512DQ</CPUID>
+	<category>Convert</category>
+	<parameter varname="src" type="__m512d"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m512i"/>
+	<parameter varname="rounding" type="int"/>
+	<description>Convert packed unsigned 64-bit integers in "a" to packed double-precision (64-bit) floating-point elements, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). 
+	[round_note]
+	</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*64
+	IF k[j]
+		dst[i+63:i] := ConvertUnsignedInt64_To_FP64(a[i+63:i])
+	ELSE
+		dst[i+63:i] := src[i+63:i]
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name="vcvtuqq2pd"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512d" name="_mm512_mask_cvtepu64_pd">
+	<type>Floating Point</type>
+	<type>Integer</type>
+	<CPUID>AVX512DQ</CPUID>
+	<category>Convert</category>
+	<parameter varname="src" type="__m512d"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m512i"/>
+	<description>Convert packed unsigned 64-bit integers in "a" to packed double-precision (64-bit) floating-point elements, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). 
+	</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*64
+	IF k[j]
+		dst[i+63:i] := ConvertUnsignedInt64_To_FP64(a[i+63:i])
+	ELSE
+		dst[i+63:i] := src[i+63:i]
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name="vcvtuqq2pd"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512d" name="_mm512_maskz_cvt_roundepu64_pd">
+	<type>Floating Point</type>
+	<type>Integer</type>
+	<CPUID>AVX512DQ</CPUID>
+	<category>Convert</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m512i"/>
+	<parameter varname="rounding" type="int"/>
+	<description>Convert packed unsigned 64-bit integers in "a" to packed double-precision (64-bit) floating-point elements, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).
+	[round_note]
+	</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*64
+	IF k[j]
+		dst[i+63:i] := ConvertUnsignedInt64_To_FP64(a[i+63:i])
+	ELSE
+		dst[i+63:i] := 0
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name="vcvtuqq2pd"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512d" name="_mm512_maskz_cvtepu64_pd">
+	<type>Floating Point</type>
+	<type>Integer</type>
+	<CPUID>AVX512DQ</CPUID>
+	<category>Convert</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m512i"/>
+	<description>Convert packed unsigned 64-bit integers in "a" to packed double-precision (64-bit) floating-point elements, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*64
+	IF k[j]
+		dst[i+63:i] := ConvertUnsignedInt64_To_FP64(a[i+63:i])
+	ELSE
+		dst[i+63:i] := 0
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name="vcvtuqq2pd"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128d" name="_mm_cvtepu64_pd">
+	<type>Floating Point</type>
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512DQ</CPUID>
+	<category>Convert</category>
+	<parameter varname="a" type="__m128i"/>
+	<description>Convert packed unsigned 64-bit integers in "a" to packed double-precision (64-bit) floating-point elements, and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 1
+	i := j*64
+	dst[i+63:i] := ConvertUnsignedInt64_To_FP64(a[i+63:i])
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vcvtuqq2pd"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128d" name="_mm_mask_cvtepu64_pd">
+	<type>Floating Point</type>
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512DQ</CPUID>
+	<category>Convert</category>
+	<parameter varname="src" type="__m128d"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128i"/>
+	<description>Convert packed unsigned 64-bit integers in "a" to packed double-precision (64-bit) floating-point elements, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). 
+	</description>
+	<operation>
+FOR j := 0 to 1
+	i := j*64
+	IF k[j]
+		dst[i+63:i] := ConvertUnsignedInt64_To_FP64(a[i+63:i])
+	ELSE
+		dst[i+63:i] := src[i+63:i]
+	FI
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vcvtuqq2pd"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128d" name="_mm_maskz_cvtepu64_pd">
+	<type>Floating Point</type>
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512DQ</CPUID>
+	<category>Convert</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128i"/>
+	<description>Convert packed unsigned 64-bit integers in "a" to packed double-precision (64-bit) floating-point elements, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 1
+	i := j*64
+	IF k[j]
+		dst[i+63:i] := ConvertUnsignedInt64_To_FP64(a[i+63:i])
+	ELSE
+		dst[i+63:i] := 0
+	FI
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vcvtuqq2pd"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128" name="_mm256_cvtepu64_ps">
+	<type>Floating Point</type>
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512DQ</CPUID>
+	<category>Convert</category>
+	<parameter varname="a" type="__m256i"/>
+	<description>Convert packed unsigned 64-bit integers in "a" to packed single-precision (32-bit) floating-point elements, and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 3
+	i := j*64
+	l := j*32
+	dst[l+31:l] := ConvertUnsignedInt64_To_FP32(a[i+63:i])
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vcvtuqq2ps"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128" name="_mm256_mask_cvtepu64_ps">
+	<type>Floating Point</type>
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512DQ</CPUID>
+	<category>Convert</category>
+	<parameter varname="src" type="__m128"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m256i"/>
+	<description>Convert packed unsigned 64-bit integers in "a" to packed single-precision (32-bit) floating-point elements, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). 
+	</description>
+	<operation>
+FOR j := 0 to 3
+	i := j*64
+	l := j*32
+	IF k[j]
+		dst[l+31:l] := ConvertUnsignedInt64_To_FP32(a[i+63:i])
+	ELSE
+		dst[l+31:l] := src[l+31:l]
+	FI
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vcvtuqq2ps"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128" name="_mm256_maskz_cvtepu64_ps">
+	<type>Floating Point</type>
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512DQ</CPUID>
+	<category>Convert</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m256i"/>
+	<description>Convert packed unsigned 64-bit integers in "a" to packed single-precision (32-bit) floating-point elements, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 3
+	i := j*64
+	l := j*32
+	IF k[j]
+		dst[l+31:l] := ConvertUnsignedInt64_To_FP32(a[i+63:i])
+	ELSE
+		dst[l+31:l] := 0
+	FI
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vcvtuqq2ps"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256" name="_mm512_cvt_roundepu64_ps">
+	<type>Floating Point</type>
+	<type>Integer</type>
+	<CPUID>AVX512DQ</CPUID>
+	<category>Convert</category>
+	<parameter varname="a" type="__m512i"/>
+	<parameter varname="rounding" type="int"/>
+	<description>Convert packed unsigned 64-bit integers in "a" to packed single-precision (32-bit) floating-point elements, and store the results in "dst".
+	[round_note]
+	</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*64
+	l := j*32
+	dst[l+31:l] := ConvertUnsignedInt64_To_FP32(a[i+63:i])
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vcvtuqq2ps"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256" name="_mm512_cvtepu64_ps">
+	<type>Floating Point</type>
+	<type>Integer</type>
+	<CPUID>AVX512DQ</CPUID>
+	<category>Convert</category>
+	<parameter varname="a" type="__m512i"/>
+	<description>Convert packed unsigned 64-bit integers in "a" to packed single-precision (32-bit) floating-point elements, and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*64
+	l := j*32
+	dst[l+31:l] := ConvertUnsignedInt64_To_FP32(a[i+63:i])
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vcvtuqq2ps"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256" name="_mm512_mask_cvt_roundepu64_ps">
+	<type>Floating Point</type>
+	<type>Integer</type>
+	<CPUID>AVX512DQ</CPUID>
+	<category>Convert</category>
+	<parameter varname="src" type="__m256"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m512i"/>
+	<parameter varname="rounding" type="int"/>
+	<description>Convert packed unsigned 64-bit integers in "a" to packed single-precision (32-bit) floating-point elements, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). 
+	[round_note]
+	</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*64
+	l := j*32
+	IF k[j]
+		dst[l+31:l] := ConvertUnsignedInt64_To_FP32(a[i+63:i])
+	ELSE
+		dst[l+31:l] := src[l+31:l]
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vcvtuqq2ps"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256" name="_mm512_mask_cvtepu64_ps">
+	<type>Floating Point</type>
+	<type>Integer</type>
+	<CPUID>AVX512DQ</CPUID>
+	<category>Convert</category>
+	<parameter varname="src" type="__m256"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m512i"/>
+	<description>Convert packed unsigned 64-bit integers in "a" to packed single-precision (32-bit) floating-point elements, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). 
+	</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*64
+	l := j*32
+	IF k[j]
+		dst[l+31:l] := ConvertUnsignedInt64_To_FP32(a[i+63:i])
+	ELSE
+		dst[l+31:l] := src[l+31:l]
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vcvtuqq2ps"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256" name="_mm512_maskz_cvt_roundepu64_ps">
+	<type>Floating Point</type>
+	<type>Integer</type>
+	<CPUID>AVX512DQ</CPUID>
+	<category>Convert</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m512i"/>
+	<parameter varname="rounding" type="int"/>
+	<description>Convert packed unsigned 64-bit integers in "a" to packed single-precision (32-bit) floating-point elements, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).
+	[round_note]
+	</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*64
+	l := j*32
+	IF k[j]
+		dst[l+31:l] := ConvertUnsignedInt64_To_FP32(a[i+63:i])
+	ELSE
+		dst[l+31:l] := 0
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vcvtuqq2ps"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256" name="_mm512_maskz_cvtepu64_ps">
+	<type>Floating Point</type>
+	<type>Integer</type>
+	<CPUID>AVX512DQ</CPUID>
+	<category>Convert</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m512i"/>
+	<description>Convert packed unsigned 64-bit integers in "a" to packed single-precision (32-bit) floating-point elements, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*64
+	l := j*32
+	IF k[j]
+		dst[l+31:l] := ConvertUnsignedInt64_To_FP32(a[i+63:i])
+	ELSE
+		dst[l+31:l] := 0
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vcvtuqq2ps"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128" name="_mm_cvtepu64_ps">
+	<type>Floating Point</type>
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512DQ</CPUID>
+	<category>Convert</category>
+	<parameter varname="a" type="__m128i"/>
+	<description>Convert packed unsigned 64-bit integers in "a" to packed single-precision (32-bit) floating-point elements, and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 1
+	i := j*64
+	l := j*32
+	dst[l+31:l] := ConvertUnsignedInt64_To_FP32(a[i+63:i])
+ENDFOR
+dst[MAX:64] := 0
+	</operation>
+	<instruction name="vcvtuqq2ps"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128" name="_mm_mask_cvtepu64_ps">
+	<type>Floating Point</type>
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512DQ</CPUID>
+	<category>Convert</category>
+	<parameter varname="src" type="__m128"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128i"/>
+	<description>Convert packed unsigned 64-bit integers in "a" to packed single-precision (32-bit) floating-point elements, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). 
+	</description>
+	<operation>
+FOR j := 0 to 1
+	i := j*64
+	l := j*32
+	IF k[j]
+		dst[l+31:l] := ConvertUnsignedInt64_To_FP32(a[i+63:i])
+	ELSE
+		dst[l+31:l] := src[l+31:l]
+	FI
+ENDFOR
+dst[MAX:64] := 0
+	</operation>
+	<instruction name="vcvtuqq2ps"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128" name="_mm_maskz_cvtepu64_ps">
+	<type>Floating Point</type>
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512DQ</CPUID>
+	<category>Convert</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128i"/>
+	<description>Convert packed unsigned 64-bit integers in "a" to packed single-precision (32-bit) floating-point elements, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 1
+	i := j*64
+	l := j*32
+	IF k[j]
+		dst[l+31:l] := ConvertUnsignedInt64_To_FP32(a[i+63:i])
+	ELSE
+		dst[l+31:l] := 0
+	FI
+ENDFOR
+dst[MAX:64] := 0
+	</operation>
+	<instruction name="vcvtuqq2ps"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256i" name="_mm256_dbsad_epu8">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512BW</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="a" type="__m256i"/>
+	<parameter varname="b" type="__m256i"/>
+	<parameter varname="imm8" type="int"/>
+	<description>Compute the sum of absolute differences (SADs) of quadruplets of unsigned 8-bit integers in "a" compared to those in "b", and store the 16-bit results in "dst".
+	Four SADs are performed on four 8-bit quadruplets for each 64-bit lane. The first two SADs use the lower 8-bit quadruplet of the lane from "a", and the last two SADs use the upper 8-bit quadruplet of the lane from "a". Quadruplets from "b" are selected from within 128-bit lanes according to the control in "imm8", and each SAD in each 64-bit lane uses the selected quadruplet at 8-bit offsets.
+	</description>
+	<operation>
+FOR j := 0 to 1
+	i := j*128
+	tmp[i+31:i] := select(b[i+127:i], imm8[1:0])
+	tmp[i+63:i+32] := select(b[i+127:i], imm8[3:2])
+	tmp[i+95:i+64] := select(b[i+127:i], imm8[5:4])
+	tmp[i+127:i+96] := select(b[i+127:i], imm8[7:6])
+ENDFOR
+
+FOR j := 0 to 3
+	i := j*64
+	dst[i+15:i] := ABS(a[i+7:i] - tmp[i+7:i]) + ABS(a[i+15:i+8] - tmp[i+15:i+8])
+				 + ABS(a[i+23:i+16] - tmp[i+23:i+16]) + ABS(a[i+31:i+24] - tmp[i+31:i+24])
+	
+	dst[i+31:i+16] := ABS(a[i+7:i] - tmp[i+15:i+8]) + ABS(a[i+15:i+8] - tmp[i+23:i+16])
+				 + ABS(a[i+23:i+16] - tmp[i+31:i+24]) + ABS(a[i+31:i+24] - tmp[i+39:i+32])
+	
+	dst[i+47:i+32] := ABS(a[i+39:i+32] - tmp[i+23:i+16]) + ABS(a[i+47:i+40] - tmp[i+31:i+24])
+				 + ABS(a[i+55:i+48] - tmp[i+39:i+32]) + ABS(a[i+63:i+56] - tmp[i+47:i+40])
+	
+	dst[i+63:i+48] := ABS(a[i+39:i+32] - tmp[i+31:i+24]) + ABS(a[i+47:i+40] - tmp[i+39:i+32])
+				 + ABS(a[i+55:i+48] - tmp[i+47:i+40]) + ABS(a[i+63:i+56] - tmp[i+55:i+48])
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vdbpsadbw"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256i" name="_mm256_mask_dbsad_epu8">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512BW</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="src" type="__m256i"/>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="a" type="__m256i"/>
+	<parameter varname="b" type="__m256i"/>
+	<parameter varname="imm8" type="int"/>
+	<description>Compute the sum of absolute differences (SADs) of quadruplets of unsigned 8-bit integers in "a" compared to those in "b", and store the 16-bit results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).
+	Four SADs are performed on four 8-bit quadruplets for each 64-bit lane. The first two SADs use the lower 8-bit quadruplet of the lane from "a", and the last two SADs use the upper 8-bit quadruplet of the lane from "a". Quadruplets from "b" are selected from within 128-bit lanes according to the control in "imm8", and each SAD in each 64-bit lane uses the selected quadruplet at 8-bit offsets.
+	</description>
+	<operation>
+FOR j := 0 to 1
+	i := j*128
+	tmp[i+31:i] := select(b[i+127:i], imm8[1:0])
+	tmp[i+63:i+32] := select(b[i+127:i], imm8[3:2])
+	tmp[i+95:i+64] := select(b[i+127:i], imm8[5:4])
+	tmp[i+127:i+96] := select(b[i+127:i], imm8[7:6])
+ENDFOR
+
+FOR j := 0 to 3
+	i := j*64
+	tmp_dst[i+15:i] := ABS(a[i+7:i] - tmp[i+7:i]) + ABS(a[i+15:i+8] - tmp[i+15:i+8])
+				 + ABS(a[i+23:i+16] - tmp[i+23:i+16]) + ABS(a[i+31:i+24] - tmp[i+31:i+24])
+	
+	tmp_dst[i+31:i+16] := ABS(a[i+7:i] - tmp[i+15:i+8]) + ABS(a[i+15:i+8] - tmp[i+23:i+16])
+				 + ABS(a[i+23:i+16] - tmp[i+31:i+24]) + ABS(a[i+31:i+24] - tmp[i+39:i+32])
+	
+	tmp_dst[i+47:i+32] := ABS(a[i+39:i+32] - tmp[i+23:i+16]) + ABS(a[i+47:i+40] - tmp[i+31:i+24])
+				 + ABS(a[i+55:i+48] - tmp[i+39:i+32]) + ABS(a[i+63:i+56] - tmp[i+47:i+40])
+	
+	tmp_dst[i+63:i+48] := ABS(a[i+39:i+32] - tmp[i+31:i+24]) + ABS(a[i+47:i+40] - tmp[i+39:i+32])
+				 + ABS(a[i+55:i+48] - tmp[i+47:i+40]) + ABS(a[i+63:i+56] - tmp[i+55:i+48])
+ENDFOR
+
+FOR j := 0 to 15
+	i := j*16
+	IF k[j]
+		dst[i+15:i] := tmp_dst[i+15:i]
+	ELSE
+		dst[i+15:i] := src[i+15:i]
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vdbpsadbw"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256i" name="_mm256_maskz_dbsad_epu8">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512BW</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="a" type="__m256i"/>
+	<parameter varname="b" type="__m256i"/>
+	<parameter varname="imm8" type="int"/>
+	<description>Compute the sum of absolute differences (SADs) of quadruplets of unsigned 8-bit integers in "a" compared to those in "b", and store the 16-bit results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).
+	Four SADs are performed on four 8-bit quadruplets for each 64-bit lane. The first two SADs use the lower 8-bit quadruplet of the lane from "a", and the last two SADs use the upper 8-bit quadruplet of the lane from "a". Quadruplets from "b" are selected from within 128-bit lanes according to the control in "imm8", and each SAD in each 64-bit lane uses the selected quadruplet at 8-bit offsets.
+	</description>
+	<operation>
+FOR j := 0 to 1
+	i := j*128
+	tmp[i+31:i] := select(b[i+127:i], imm8[1:0])
+	tmp[i+63:i+32] := select(b[i+127:i], imm8[3:2])
+	tmp[i+95:i+64] := select(b[i+127:i], imm8[5:4])
+	tmp[i+127:i+96] := select(b[i+127:i], imm8[7:6])
+ENDFOR
+
+FOR j := 0 to 3
+	i := j*64
+	tmp_dst[i+15:i] := ABS(a[i+7:i] - tmp[i+7:i]) + ABS(a[i+15:i+8] - tmp[i+15:i+8])
+				 + ABS(a[i+23:i+16] - tmp[i+23:i+16]) + ABS(a[i+31:i+24] - tmp[i+31:i+24])
+	
+	tmp_dst[i+31:i+16] := ABS(a[i+7:i] - tmp[i+15:i+8]) + ABS(a[i+15:i+8] - tmp[i+23:i+16])
+				 + ABS(a[i+23:i+16] - tmp[i+31:i+24]) + ABS(a[i+31:i+24] - tmp[i+39:i+32])
+	
+	tmp_dst[i+47:i+32] := ABS(a[i+39:i+32] - tmp[i+23:i+16]) + ABS(a[i+47:i+40] - tmp[i+31:i+24])
+				 + ABS(a[i+55:i+48] - tmp[i+39:i+32]) + ABS(a[i+63:i+56] - tmp[i+47:i+40])
+	
+	tmp_dst[i+63:i+48] := ABS(a[i+39:i+32] - tmp[i+31:i+24]) + ABS(a[i+47:i+40] - tmp[i+39:i+32])
+				 + ABS(a[i+55:i+48] - tmp[i+47:i+40]) + ABS(a[i+63:i+56] - tmp[i+55:i+48])
+ENDFOR
+
+FOR j := 0 to 15
+	i := j*16
+	IF k[j]
+		dst[i+15:i] := tmp_dst[i+15:i]
+	ELSE
+		dst[i+15:i] := 0
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vdbpsadbw"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512i" name="_mm512_dbsad_epu8">
+	<type>Integer</type>
+	<CPUID>AVX512BW</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="a" type="__m512i"/>
+	<parameter varname="b" type="__m512i"/>
+	<parameter varname="imm8" type="int"/>
+	<description>Compute the sum of absolute differences (SADs) of quadruplets of unsigned 8-bit integers in "a" compared to those in "b", and store the 16-bit results in "dst".
+	Four SADs are performed on four 8-bit quadruplets for each 64-bit lane. The first two SADs use the lower 8-bit quadruplet of the lane from "a", and the last two SADs use the upper 8-bit quadruplet of the lane from "a". Quadruplets from "b" are selected from within 128-bit lanes according to the control in "imm8", and each SAD in each 64-bit lane uses the selected quadruplet at 8-bit offsets.
+	</description>
+	<operation>
+FOR j := 0 to 3
+	i := j*128
+	tmp[i+31:i] := select(b[i+127:i], imm8[1:0])
+	tmp[i+63:i+32] := select(b[i+127:i], imm8[3:2])
+	tmp[i+95:i+64] := select(b[i+127:i], imm8[5:4])
+	tmp[i+127:i+96] := select(b[i+127:i], imm8[7:6])
+ENDFOR
+
+FOR j := 0 to 7
+	i := j*64
+	dst[i+15:i] := ABS(a[i+7:i] - tmp[i+7:i]) + ABS(a[i+15:i+8] - tmp[i+15:i+8])
+				 + ABS(a[i+23:i+16] - tmp[i+23:i+16]) + ABS(a[i+31:i+24] - tmp[i+31:i+24])
+	
+	dst[i+31:i+16] := ABS(a[i+7:i] - tmp[i+15:i+8]) + ABS(a[i+15:i+8] - tmp[i+23:i+16])
+				 + ABS(a[i+23:i+16] - tmp[i+31:i+24]) + ABS(a[i+31:i+24] - tmp[i+39:i+32])
+	
+	dst[i+47:i+32] := ABS(a[i+39:i+32] - tmp[i+23:i+16]) + ABS(a[i+47:i+40] - tmp[i+31:i+24])
+				 + ABS(a[i+55:i+48] - tmp[i+39:i+32]) + ABS(a[i+63:i+56] - tmp[i+47:i+40])
+	
+	dst[i+63:i+48] := ABS(a[i+39:i+32] - tmp[i+31:i+24]) + ABS(a[i+47:i+40] - tmp[i+39:i+32])
+				 + ABS(a[i+55:i+48] - tmp[i+47:i+40]) + ABS(a[i+63:i+56] - tmp[i+55:i+48])
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name="vdbpsadbw"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512i" name="_mm512_mask_dbsad_epu8">
+	<type>Integer</type>
+	<CPUID>AVX512BW</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="src" type="__m512i"/>
+	<parameter varname="k" type="__mmask32"/>
+	<parameter varname="a" type="__m512i"/>
+	<parameter varname="b" type="__m512i"/>
+	<parameter varname="imm8" type="int"/>
+	<description>Compute the sum of absolute differences (SADs) of quadruplets of unsigned 8-bit integers in "a" compared to those in "b", and store the 16-bit results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).
+	Four SADs are performed on four 8-bit quadruplets for each 64-bit lane. The first two SADs use the lower 8-bit quadruplet of the lane from "a", and the last two SADs use the upper 8-bit quadruplet of the lane from "a". Quadruplets from "b" are selected from within 128-bit lanes according to the control in "imm8", and each SAD in each 64-bit lane uses the selected quadruplet at 8-bit offsets.
+	</description>
+	<operation>
+FOR j := 0 to 3
+	i := j*128
+	tmp[i+31:i] := select(b[i+127:i], imm8[1:0])
+	tmp[i+63:i+32] := select(b[i+127:i], imm8[3:2])
+	tmp[i+95:i+64] := select(b[i+127:i], imm8[5:4])
+	tmp[i+127:i+96] := select(b[i+127:i], imm8[7:6])
+ENDFOR
+
+FOR j := 0 to 7
+	i := j*64
+	tmp_dst[i+15:i] := ABS(a[i+7:i] - tmp[i+7:i]) + ABS(a[i+15:i+8] - tmp[i+15:i+8])
+				 + ABS(a[i+23:i+16] - tmp[i+23:i+16]) + ABS(a[i+31:i+24] - tmp[i+31:i+24])
+	
+	tmp_dst[i+31:i+16] := ABS(a[i+7:i] - tmp[i+15:i+8]) + ABS(a[i+15:i+8] - tmp[i+23:i+16])
+				 + ABS(a[i+23:i+16] - tmp[i+31:i+24]) + ABS(a[i+31:i+24] - tmp[i+39:i+32])
+	
+	tmp_dst[i+47:i+32] := ABS(a[i+39:i+32] - tmp[i+23:i+16]) + ABS(a[i+47:i+40] - tmp[i+31:i+24])
+				 + ABS(a[i+55:i+48] - tmp[i+39:i+32]) + ABS(a[i+63:i+56] - tmp[i+47:i+40])
+	
+	tmp_dst[i+63:i+48] := ABS(a[i+39:i+32] - tmp[i+31:i+24]) + ABS(a[i+47:i+40] - tmp[i+39:i+32])
+				 + ABS(a[i+55:i+48] - tmp[i+47:i+40]) + ABS(a[i+63:i+56] - tmp[i+55:i+48])
+ENDFOR
+
+FOR j := 0 to 31
+	i := j*16
+	IF k[j]
+		dst[i+15:i] := tmp_dst[i+15:i]
+	ELSE
+		dst[i+15:i] := src[i+15:i]
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name="vdbpsadbw"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512i" name="_mm512_maskz_dbsad_epu8">
+	<type>Integer</type>
+	<CPUID>AVX512BW</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="k" type="__mmask32"/>
+	<parameter varname="a" type="__m512i"/>
+	<parameter varname="b" type="__m512i"/>
+	<parameter varname="imm8" type="int"/>
+	<description>Compute the sum of absolute differences (SADs) of quadruplets of unsigned 8-bit integers in "a" compared to those in "b", and store the 16-bit results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).
+	Four SADs are performed on four 8-bit quadruplets for each 64-bit lane. The first two SADs use the lower 8-bit quadruplet of the lane from "a", and the last two SADs use the upper 8-bit quadruplet of the lane from "a". Quadruplets from "b" are selected from within 128-bit lanes according to the control in "imm8", and each SAD in each 64-bit lane uses the selected quadruplet at 8-bit offsets.
+	</description>
+	<operation>
+FOR j := 0 to 3
+	i := j*128
+	tmp[i+31:i] := select(b[i+127:i], imm8[1:0])
+	tmp[i+63:i+32] := select(b[i+127:i], imm8[3:2])
+	tmp[i+95:i+64] := select(b[i+127:i], imm8[5:4])
+	tmp[i+127:i+96] := select(b[i+127:i], imm8[7:6])
+ENDFOR
+
+FOR j := 0 to 7
+	i := j*64
+	tmp_dst[i+15:i] := ABS(a[i+7:i] - tmp[i+7:i]) + ABS(a[i+15:i+8] - tmp[i+15:i+8])
+				 + ABS(a[i+23:i+16] - tmp[i+23:i+16]) + ABS(a[i+31:i+24] - tmp[i+31:i+24])
+	
+	tmp_dst[i+31:i+16] := ABS(a[i+7:i] - tmp[i+15:i+8]) + ABS(a[i+15:i+8] - tmp[i+23:i+16])
+				 + ABS(a[i+23:i+16] - tmp[i+31:i+24]) + ABS(a[i+31:i+24] - tmp[i+39:i+32])
+	
+	tmp_dst[i+47:i+32] := ABS(a[i+39:i+32] - tmp[i+23:i+16]) + ABS(a[i+47:i+40] - tmp[i+31:i+24])
+				 + ABS(a[i+55:i+48] - tmp[i+39:i+32]) + ABS(a[i+63:i+56] - tmp[i+47:i+40])
+	
+	tmp_dst[i+63:i+48] := ABS(a[i+39:i+32] - tmp[i+31:i+24]) + ABS(a[i+47:i+40] - tmp[i+39:i+32])
+				 + ABS(a[i+55:i+48] - tmp[i+47:i+40]) + ABS(a[i+63:i+56] - tmp[i+55:i+48])
+ENDFOR
+
+FOR j := 0 to 31
+	i := j*16
+	IF k[j]
+		dst[i+15:i] := tmp_dst[i+15:i]
+	ELSE
+		dst[i+15:i] := 0
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name="vdbpsadbw"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm_dbsad_epu8">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512BW</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="a" type="__m128i"/>
+	<parameter varname="b" type="__m128i"/>
+	<parameter varname="imm8" type="int"/>
+	<description>Compute the sum of absolute differences (SADs) of quadruplets of unsigned 8-bit integers in "a" compared to those in "b", and store the 16-bit results in "dst".
+	Four SADs are performed on four 8-bit quadruplets for each 64-bit lane. The first two SADs use the lower 8-bit quadruplet of the lane from "a", and the last two SADs use the upper 8-bit quadruplet of the lane from "a". Quadruplets from "b" are selected according to the control in "imm8", and each SAD in each 64-bit lane uses the selected quadruplet at 8-bit offsets.
+	</description>
+	<operation>
+tmp[31:0] := select(b[127:0], imm8[1:0])
+tmp[63:32] := select(b[127:0], imm8[3:2])
+tmp[95:64] := select(b[127:0], imm8[5:4])
+tmp[127:96] := select(b[127:0], imm8[7:6])
+
+FOR j := 0 to 1
+	i := j*64
+	dst[i+15:i] := ABS(a[i+7:i] - tmp[i+7:i]) + ABS(a[i+15:i+8] - tmp[i+15:i+8])
+				 + ABS(a[i+23:i+16] - tmp[i+23:i+16]) + ABS(a[i+31:i+24] - tmp[i+31:i+24])
+	
+	dst[i+31:i+16] := ABS(a[i+7:i] - tmp[i+15:i+8]) + ABS(a[i+15:i+8] - tmp[i+23:i+16])
+				 + ABS(a[i+23:i+16] - tmp[i+31:i+24]) + ABS(a[i+31:i+24] - tmp[i+39:i+32])
+	
+	dst[i+47:i+32] := ABS(a[i+39:i+32] - tmp[i+23:i+16]) + ABS(a[i+47:i+40] - tmp[i+31:i+24])
+				 + ABS(a[i+55:i+48] - tmp[i+39:i+32]) + ABS(a[i+63:i+56] - tmp[i+47:i+40])
+	
+	dst[i+63:i+48] := ABS(a[i+39:i+32] - tmp[i+31:i+24]) + ABS(a[i+47:i+40] - tmp[i+39:i+32])
+				 + ABS(a[i+55:i+48] - tmp[i+47:i+40]) + ABS(a[i+63:i+56] - tmp[i+55:i+48])
+ENDFOR
+
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vdbpsadbw"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm_mask_dbsad_epu8">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512BW</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="src" type="__m128i"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128i"/>
+	<parameter varname="b" type="__m128i"/>
+	<parameter varname="imm8" type="int"/>
+	<description>Compute the sum of absolute differences (SADs) of quadruplets of unsigned 8-bit integers in "a" compared to those in "b", and store the 16-bit results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).
+	Four SADs are performed on four 8-bit quadruplets for each 64-bit lane. The first two SADs use the lower 8-bit quadruplet of the lane from "a", and the last two SADs use the upper 8-bit quadruplet of the lane from "a". Quadruplets from "b" are selected according to the control in "imm8", and each SAD in each 64-bit lane uses the selected quadruplet at 8-bit offsets.
+	</description>
+	<operation>
+tmp[31:0] := select(b[127:0], imm8[1:0])
+tmp[63:32] := select(b[127:0], imm8[3:2])
+tmp[95:64] := select(b[127:0], imm8[5:4])
+tmp[127:96] := select(b[127:0], imm8[7:6])
+
+FOR j := 0 to 1
+	i := j*64
+	tmp_dst[i+15:i] := ABS(a[i+7:i] - tmp[i+7:i]) + ABS(a[i+15:i+8] - tmp[i+15:i+8])
+				 + ABS(a[i+23:i+16] - tmp[i+23:i+16]) + ABS(a[i+31:i+24] - tmp[i+31:i+24])
+	
+	tmp_dst[i+31:i+16] := ABS(a[i+7:i] - tmp[i+15:i+8]) + ABS(a[i+15:i+8] - tmp[i+23:i+16])
+				 + ABS(a[i+23:i+16] - tmp[i+31:i+24]) + ABS(a[i+31:i+24] - tmp[i+39:i+32])
+	
+	tmp_dst[i+47:i+32] := ABS(a[i+39:i+32] - tmp[i+23:i+16]) + ABS(a[i+47:i+40] - tmp[i+31:i+24])
+				 + ABS(a[i+55:i+48] - tmp[i+39:i+32]) + ABS(a[i+63:i+56] - tmp[i+47:i+40])
+	
+	tmp_dst[i+63:i+48] := ABS(a[i+39:i+32] - tmp[i+31:i+24]) + ABS(a[i+47:i+40] - tmp[i+39:i+32])
+				 + ABS(a[i+55:i+48] - tmp[i+47:i+40]) + ABS(a[i+63:i+56] - tmp[i+55:i+48])
+ENDFOR
+
+FOR j := 0 to 7
+	i := j*16
+	IF k[j]
+		dst[i+15:i] := tmp_dst[i+15:i]
+	ELSE
+		dst[i+15:i] := src[i+15:i]
+	FI
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vdbpsadbw"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm_maskz_dbsad_epu8">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512BW</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128i"/>
+	<parameter varname="b" type="__m128i"/>
+	<parameter varname="imm8" type="int"/>
+	<description>Compute the sum of absolute differences (SADs) of quadruplets of unsigned 8-bit integers in "a" compared to those in "b", and store the 16-bit results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).
+	Four SADs are performed on four 8-bit quadruplets for each 64-bit lane. The first two SADs use the lower 8-bit quadruplet of the lane from "a", and the last two SADs use the upper 8-bit quadruplet of the lane from "a". Quadruplets from "b" are selected according to the control in "imm8", and each SAD in each 64-bit lane uses the selected quadruplet at 8-bit offsets.
+	</description>
+	<operation>
+tmp[31:0] := select(b[127:0], imm8[1:0])
+tmp[63:32] := select(b[127:0], imm8[3:2])
+tmp[95:64] := select(b[127:0], imm8[5:4])
+tmp[127:96] := select(b[127:0], imm8[7:6])
+
+FOR j := 0 to 1
+	i := j*64
+	tmp_dst[i+15:i] := ABS(a[i+7:i] - tmp[i+7:i]) + ABS(a[i+15:i+8] - tmp[i+15:i+8])
+				 + ABS(a[i+23:i+16] - tmp[i+23:i+16]) + ABS(a[i+31:i+24] - tmp[i+31:i+24])
+	
+	tmp_dst[i+31:i+16] := ABS(a[i+7:i] - tmp[i+15:i+8]) + ABS(a[i+15:i+8] - tmp[i+23:i+16])
+				 + ABS(a[i+23:i+16] - tmp[i+31:i+24]) + ABS(a[i+31:i+24] - tmp[i+39:i+32])
+	
+	tmp_dst[i+47:i+32] := ABS(a[i+39:i+32] - tmp[i+23:i+16]) + ABS(a[i+47:i+40] - tmp[i+31:i+24])
+				 + ABS(a[i+55:i+48] - tmp[i+39:i+32]) + ABS(a[i+63:i+56] - tmp[i+47:i+40])
+	
+	tmp_dst[i+63:i+48] := ABS(a[i+39:i+32] - tmp[i+31:i+24]) + ABS(a[i+47:i+40] - tmp[i+39:i+32])
+				 + ABS(a[i+55:i+48] - tmp[i+47:i+40]) + ABS(a[i+63:i+56] - tmp[i+55:i+48])
+ENDFOR
+
+FOR j := 0 to 7
+	i := j*16
+	IF k[j]
+		dst[i+15:i] := tmp_dst[i+15:i]
+	ELSE
+		dst[i+15:i] := 0
+	FI
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vdbpsadbw"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256d" name="_mm256_mask_div_pd">
+	<type>Floating Point</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="src" type="__m256d"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m256d"/>
+	<parameter varname="b" type="__m256d"/>
+	<description>Divide packed double-precision (64-bit) floating-point elements in "a" by packed elements in "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 3
+	i := 64*j
+	IF k[j]
+		dst[i+63:i] := a[i+63:i] / b[i+63:i]
+	ELSE
+		dst[i+63:i] := src[i+63:i]
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vdivpd"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256d" name="_mm256_maskz_div_pd">
+	<type>Floating Point</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m256d"/>
+	<parameter varname="b" type="__m256d"/>
+	<description>Divide packed double-precision (64-bit) floating-point elements in "a" by packed elements in "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 3
+	i := 64*j
+	IF k[j]
+		dst[i+63:i] := a[i+63:i] / b[i+63:i]
+	ELSE
+		dst[i+63:i] := 0
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vdivpd"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128d" name="_mm_mask_div_pd">
+	<type>Floating Point</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="src" type="__m128d"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128d"/>
+	<parameter varname="b" type="__m128d"/>
+	<description>Divide packed double-precision (64-bit) floating-point elements in "a" by packed elements in "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 1
+	i := 64*j
+	IF k[j]
+		dst[i+63:i] := a[i+63:i] / b[i+63:i]
+	ELSE
+		dst[i+63:i] := src[i+63:i]
+	FI
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vdivpd"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128d" name="_mm_maskz_div_pd">
+	<type>Floating Point</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128d"/>
+	<parameter varname="b" type="__m128d"/>
+	<description>Divide packed double-precision (64-bit) floating-point elements in "a" by packed elements in "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 1
+	i := 64*j
+	IF k[j]
+		dst[i+63:i] := a[i+63:i] / b[i+63:i]
+	ELSE
+		dst[i+63:i] := 0
+	FI
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vdivpd"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256" name="_mm256_mask_div_ps">
+	<type>Floating Point</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="src" type="__m256"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m256"/>
+	<parameter varname="b" type="__m256"/>
+	<description>Divide packed single-precision (32-bit) floating-point elements in "a" by packed elements in "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 7
+	i := 32*j
+	IF k[j]
+		dst[i+31:i] := a[i+31:i] / b[i+31:i]
+	ELSE
+		dst[i+31:i] := src[i+31:i]
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vdivps"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256" name="_mm256_maskz_div_ps">
+	<type>Floating Point</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m256"/>
+	<parameter varname="b" type="__m256"/>
+	<description>Divide packed single-precision (32-bit) floating-point elements in "a" by packed elements in "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 7
+	i := 32*j
+	IF k[j]
+		dst[i+31:i] := a[i+31:i] / b[i+31:i]
+	ELSE
+		dst[i+31:i] := 0
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vdivps"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128" name="_mm_mask_div_ps">
+	<type>Floating Point</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="src" type="__m128"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128"/>
+	<parameter varname="b" type="__m128"/>
+	<description>Divide packed single-precision (32-bit) floating-point elements in "a" by packed elements in "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 3
+	i := 32*j
+	IF k[j]
+		dst[i+31:i] := a[i+31:i] / b[i+31:i]
+	ELSE
+		dst[i+31:i] := src[i+31:i]
+	FI
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vdivps"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128" name="_mm_maskz_div_ps">
+	<type>Floating Point</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128"/>
+	<parameter varname="b" type="__m128"/>
+	<description>Divide packed single-precision (32-bit) floating-point elements in "a" by packed elements in "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 3
+	i := 32*j
+	IF k[j]
+		dst[i+31:i] := a[i+31:i] / b[i+31:i]
+	ELSE
+		dst[i+31:i] := 0
+	FI
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vdivps"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256d" name="_mm256_mask_expand_pd">
+	<type>Floating Point</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="src" type="__m256d"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m256d"/>
+	<description>Load contiguous active double-precision (64-bit) floating-point elements from "a" (those with their respective bit set in mask "k"), and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+	<operation>
+m := 0
+FOR j := 0 to 3
+	i := j*64
+	IF k[j]
+		dst[i+63:i] := a[m+63:m]
+		m := m + 64
+	ELSE
+		dst[i+63:i] := src[i+63:i]
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vexpandpd"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256d" name="_mm256_mask_expandloadu_pd">
+	<type>Floating Point</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Load</category>
+	<parameter varname="src" type="__m256d"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="mem_addr" type="void const*"/>
+	<description>Load contiguous active double-precision (64-bit) floating-point elements from unaligned memory at "mem_addr" (those with their respective bit set in mask "k"), and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+	<operation>
+m := 0
+FOR j := 0 to 3
+	i := j*64
+	IF k[j]
+		dst[i+63:i] := MEM[mem_addr+m+63:mem_addr+m]
+		m := m + 64
+	ELSE
+		dst[i+63:i] := src[i+63:i]
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vexpandpd"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256d" name="_mm256_maskz_expand_pd">
+	<type>Floating Point</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m256d"/>
+	<description>Load contiguous active double-precision (64-bit) floating-point elements from "a" (those with their respective bit set in mask "k"), and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+	<operation>
+m := 0
+FOR j := 0 to 3
+	i := j*64
+	IF k[j]
+		dst[i+63:i] := a[m+63:m]
+		m := m + 64
+	ELSE
+		dst[i+63:i] := 0
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vexpandpd"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256d" name="_mm256_maskz_expandloadu_pd">
+	<type>Floating Point</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Load</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="mem_addr" type="void const*"/>
+	<description>Load contiguous active double-precision (64-bit) floating-point elements from unaligned memory at "mem_addr" (those with their respective bit set in mask "k"), and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+	<operation>
+m := 0
+FOR j := 0 to 3
+	i := j*64
+	IF k[j]
+		dst[i+63:i] := MEM[mem_addr+m+63:mem_addr+m]
+		m := m + 64
+	ELSE
+		dst[i+63:i] := 0
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vexpandpd"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128d" name="_mm_mask_expand_pd">
+	<type>Floating Point</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="src" type="__m128d"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128d"/>
+	<description>Load contiguous active double-precision (64-bit) floating-point elements from "a" (those with their respective bit set in mask "k"), and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+	<operation>
+m := 0
+FOR j := 0 to 1
+	i := j*64
+	IF k[j]
+		dst[i+63:i] := a[m+63:m]
+		m := m + 64
+	ELSE
+		dst[i+63:i] := src[i+63:i]
+	FI
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vexpandpd"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128d" name="_mm_mask_expandloadu_pd">
+	<type>Floating Point</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Load</category>
+	<parameter varname="src" type="__m128d"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="mem_addr" type="void const*"/>
+	<description>Load contiguous active double-precision (64-bit) floating-point elements from unaligned memory at "mem_addr" (those with their respective bit set in mask "k"), and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+	<operation>
+m := 0
+FOR j := 0 to 1
+	i := j*64
+	IF k[j]
+		dst[i+63:i] := MEM[mem_addr+m+63:mem_addr+m]
+		m := m + 64
+	ELSE
+		dst[i+63:i] := src[i+63:i]
+	FI
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vexpandpd"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128d" name="_mm_maskz_expand_pd">
+	<type>Floating Point</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128d"/>
+	<description>Load contiguous active double-precision (64-bit) floating-point elements from "a" (those with their respective bit set in mask "k"), and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+	<operation>
+m := 0
+FOR j := 0 to 1
+	i := j*64
+	IF k[j]
+		dst[i+63:i] := a[m+63:m]
+		m := m + 64
+	ELSE
+		dst[i+63:i] := 0
+	FI
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vexpandpd"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128d" name="_mm_maskz_expandloadu_pd">
+	<type>Floating Point</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Load</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="mem_addr" type="void const*"/>
+	<description>Load contiguous active double-precision (64-bit) floating-point elements from unaligned memory at "mem_addr" (those with their respective bit set in mask "k"), and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+	<operation>
+m := 0
+FOR j := 0 to 1
+	i := j*64
+	IF k[j]
+		dst[i+63:i] := MEM[mem_addr+m+63:mem_addr+m]
+		m := m + 64
+	ELSE
+		dst[i+63:i] := 0
+	FI
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vexpandpd"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256" name="_mm256_mask_expand_ps">
+	<type>Floating Point</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="src" type="__m256"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m256"/>
+	<description>Load contiguous active single-precision (32-bit) floating-point elements from "a" (those with their respective bit set in mask "k"), and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+	<operation>
+m := 0
+FOR j := 0 to 7
+	i := j*32
+	IF k[j]
+		dst[i+31:i] := a[m+31:m]
+		m := m + 32
+	ELSE
+		dst[i+31:i] := src[i+31:i]
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vexpandps"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256" name="_mm256_mask_expandloadu_ps">
+	<type>Floating Point</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Load</category>
+	<parameter varname="src" type="__m256"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="mem_addr" type="void const*"/>
+	<description>Load contiguous active single-precision (32-bit) floating-point elements from unaligned memory at "mem_addr" (those with their respective bit set in mask "k"), and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+	<operation>
+m := 0
+FOR j := 0 to 7
+	i := j*32
+	IF k[j]
+		dst[i+31:i] := MEM[mem_addr+m+31:mem_addr+m]
+		m := m + 32
+	ELSE
+		dst[i+31:i] := src[i+31:i]
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vexpandps"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256" name="_mm256_maskz_expand_ps">
+	<type>Floating Point</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m256"/>
+	<description>Load contiguous active single-precision (32-bit) floating-point elements from "a" (those with their respective bit set in mask "k"), and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+	<operation>
+m := 0
+FOR j := 0 to 7
+	i := j*32
+	IF k[j]
+		dst[i+31:i] := a[m+31:m]
+		m := m + 32
+	ELSE
+		dst[i+31:i] := 0
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vexpandps"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256" name="_mm256_maskz_expandloadu_ps">
+	<type>Floating Point</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Load</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="mem_addr" type="void const*"/>
+	<description>Load contiguous active single-precision (32-bit) floating-point elements from unaligned memory at "mem_addr" (those with their respective bit set in mask "k"), and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+	<operation>
+m := 0
+FOR j := 0 to 7
+	i := j*32
+	IF k[j]
+		dst[i+31:i] := MEM[mem_addr+m+31:mem_addr+m]
+		m := m + 32
+	ELSE
+		dst[i+31:i] := 0
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vexpandps"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128" name="_mm_mask_expand_ps">
+	<type>Floating Point</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="src" type="__m128"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128"/>
+	<description>Load contiguous active single-precision (32-bit) floating-point elements from "a" (those with their respective bit set in mask "k"), and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+	<operation>
+m := 0
+FOR j := 0 to 3
+	i := j*32
+	IF k[j]
+		dst[i+31:i] := a[m+31:m]
+		m := m + 32
+	ELSE
+		dst[i+31:i] := src[i+31:i]
+	FI
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vexpandps"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128" name="_mm_mask_expandloadu_ps">
+	<type>Floating Point</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Load</category>
+	<parameter varname="src" type="__m128"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="mem_addr" type="void const*"/>
+	<description>Load contiguous active single-precision (32-bit) floating-point elements from unaligned memory at "mem_addr" (those with their respective bit set in mask "k"), and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+	<operation>
+m := 0
+FOR j := 0 to 3
+	i := j*32
+	IF k[j]
+		dst[i+31:i] := MEM[mem_addr+m+31:mem_addr+m]
+		m := m + 32
+	ELSE
+		dst[i+31:i] := src[i+31:i]
+	FI
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vexpandps"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128" name="_mm_maskz_expand_ps">
+	<type>Floating Point</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128"/>
+	<description>Load contiguous active single-precision (32-bit) floating-point elements from "a" (those with their respective bit set in mask "k"), and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+	<operation>
+m := 0
+FOR j := 0 to 3
+	i := j*32
+	IF k[j]
+		dst[i+31:i] := a[m+31:m]
+		m := m + 32
+	ELSE
+		dst[i+31:i] := 0
+	FI
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vexpandps"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128" name="_mm_maskz_expandloadu_ps">
+	<type>Floating Point</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Load</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="mem_addr" type="void const*"/>
+	<description>Load contiguous active single-precision (32-bit) floating-point elements from unaligned memory at "mem_addr" (those with their respective bit set in mask "k"), and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+	<operation>
+m := 0
+FOR j := 0 to 3
+	i := j*32
+	IF k[j]
+		dst[i+31:i] := MEM[mem_addr+m+31:mem_addr+m]
+		m := m + 32
+	ELSE
+		dst[i+31:i] := 0
+	FI
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vexpandps"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128" name="_mm256_extractf32x4_ps">
+	<type>Floating Point</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="a" type="__m256"/>
+	<parameter varname="imm8" type="int"/>
+	<description>Extract 128 bits (composed of 4 packed single-precision (32-bit) floating-point elements) from "a", selected with "imm8", and store the result in "dst".</description>
+	<operation>
+CASE imm8[7:0] of
+0: dst[127:0] := a[127:0]
+1: dst[127:0] := a[255:128]
+ESAC
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vextractf32x4"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128" name="_mm256_mask_extractf32x4_ps">
+	<type>Floating Point</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="src" type="__m128"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m256"/>
+	<parameter varname="imm8" type="int"/>
+	<description>Extract 128 bits (composed of 4 packed single-precision (32-bit) floating-point elements) from "a", selected with "imm8", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). </description>
+	<operation>
+CASE imm8[7:0] of
+0: tmp[127:0] := a[127:0]
+1: tmp[127:0] := a[255:128]
+ESAC
+FOR j := 0 to 3
+	i := j*32
+	IF k[j]
+		dst[i+31:i] := tmp[i+31:i]
+	ELSE
+		dst[i+31:i] := src[i+31:i]
+	FI
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vextractf32x4"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128" name="_mm256_maskz_extractf32x4_ps">
+	<type>Floating Point</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m256"/>
+	<parameter varname="imm8" type="int"/>
+	<description>Extract 128 bits (composed of 4 packed single-precision (32-bit) floating-point elements) from "a", selected with "imm8", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). </description>
+	<operation>
+CASE imm8[7:0] of
+0: tmp[127:0] := a[127:0]
+1: tmp[127:0] := a[255:128]
+ESAC
+FOR j := 0 to 3
+	i := j*32
+	IF k[j]
+		dst[i+31:i] := tmp[i+31:i]
+	ELSE
+		dst[i+31:i] := 0
+	FI
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vextractf32x4"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256" name="_mm512_extractf32x8_ps">
+	<type>Floating Point</type>
+	<CPUID>AVX512DQ</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="a" type="__m512"/>
+	<parameter varname="imm8" type="int"/>
+	<description>Extract 256 bits (composed of 8 packed single-precision (32-bit) floating-point elements) from "a", selected with "imm8", and store the result in "dst".</description>
+	<operation>
+CASE imm8[7:0] of
+0: dst[255:0] := a[255:0]
+1: dst[255:0] := a[511:256]
+ESAC
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vextractf32x8"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256" name="_mm512_mask_extractf32x8_ps">
+	<type>Floating Point</type>
+	<CPUID>AVX512DQ</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="src" type="__m256"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m512"/>
+	<parameter varname="imm8" type="int"/>
+	<description>Extract 256 bits (composed of 8 packed single-precision (32-bit) floating-point elements) from "a", selected with "imm8", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). </description>
+	<operation>
+CASE imm8[7:0] of
+0: tmp[255:0] := a[255:0]
+1: tmp[255:0] := a[511:256]
+ESAC
+
+FOR j := 0 to 7
+	i := j*32
+	IF k[j]
+		dst[i+31:i] := tmp[i+31:i]
+	ELSE
+		dst[i+31:i] := src[i+31:i]
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vextractf32x8"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256" name="_mm512_maskz_extractf32x8_ps">
+	<type>Floating Point</type>
+	<CPUID>AVX512DQ</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m512"/>
+	<parameter varname="imm8" type="int"/>
+	<description>Extract 256 bits (composed of 8 packed single-precision (32-bit) floating-point elements) from "a", selected with "imm8", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). </description>
+	<operation>
+CASE imm8[7:0] of
+0: tmp[255:0] := a[255:0]
+1: tmp[255:0] := a[511:256]
+ESAC
+
+FOR j := 0 to 7
+	i := j*32
+	IF k[j]
+		dst[i+31:i] := tmp[i+31:i]
+	ELSE
+		dst[i+31:i] := 0
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vextractf32x8"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128d" name="_mm256_extractf64x2_pd">
+	<type>Floating Point</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512DQ</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="a" type="__m256d"/>
+	<parameter varname="imm8" type="int"/>
+	<description>Extract 128 bits (composed of 2 packed double-precision (64-bit) floating-point elements) from "a", selected with "imm8", and store the result in "dst".</description>
+	<operation>
+CASE imm8[7:0] of
+0: dst[127:0] := a[127:0]
+1: dst[127:0] := a[255:128]
+ESAC
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vextractf64x2"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128d" name="_mm256_mask_extractf64x2_pd">
+	<type>Floating Point</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512DQ</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="src" type="__m128d"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m256d"/>
+	<parameter varname="imm8" type="int"/>
+	<description>Extract 128 bits (composed of 2 packed double-precision (64-bit) floating-point elements) from "a", selected with "imm8", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). </description>
+	<operation>
+CASE imm8[7:0] of
+0: tmp[127:0] := a[127:0]
+1: tmp[127:0] := a[255:128]
+ESAC
+
+FOR j := 0 to 1
+	i := j*64
+	IF k[j]
+		dst[i+63:i] := tmp[i+63:i]
+	ELSE
+		dst[i+63:i] := src[i+63:i]
+	FI
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vextractf64x2"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128d" name="_mm256_maskz_extractf64x2_pd">
+	<type>Floating Point</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512DQ</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m256d"/>
+	<parameter varname="imm8" type="int"/>
+	<description>Extract 128 bits (composed of 2 packed double-precision (64-bit) floating-point elements) from "a", selected with "imm8", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). </description>
+	<operation>
+CASE imm8[7:0] of
+0: tmp[127:0] := a[127:0]
+1: tmp[127:0] := a[255:128]
+ESAC
+
+FOR j := 0 to 1
+	i := j*64
+	IF k[j]
+		dst[i+63:i] := tmp[i+63:i]
+	ELSE
+		dst[i+63:i] := 0
+	FI
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vextractf64x2"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128d" name="_mm512_extractf64x2_pd">
+	<type>Floating Point</type>
+	<CPUID>AVX512DQ</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="a" type="__m512d"/>
+	<parameter varname="imm8" type="int"/>
+	<description>Extract 128 bits (composed of 2 packed double-precision (64-bit) floating-point elements) from "a", selected with "imm8", and store the result in "dst".</description>
+	<operation>
+CASE imm8[7:0] of
+0: dst[127:0] := a[127:0]
+1: dst[127:0] := a[255:128]
+2: dst[127:0] := a[383:256]
+3: dst[127:0] := a[511:384]
+ESAC
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vextractf64x2"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128d" name="_mm512_mask_extractf64x2_pd">
+	<type>Floating Point</type>
+	<CPUID>AVX512DQ</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="src" type="__m128d"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m512d"/>
+	<parameter varname="imm8" type="int"/>
+	<description>Extract 128 bits (composed of 2 packed double-precision (64-bit) floating-point elements) from "a", selected with "imm8", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). </description>
+	<operation>
+CASE imm8[7:0] of
+0: tmp[127:0] := a[127:0]
+1: tmp[127:0] := a[255:128]
+2: tmp[127:0] := a[383:256]
+3: tmp[127:0] := a[511:384]
+ESAC
+
+FOR j := 0 to 1
+	i := j*64
+	IF k[j]
+		dst[i+63:i] := tmp[i+63:i]
+	ELSE
+		dst[i+63:i] := src[i+63:i]
+	FI
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vextractf64x2"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128d" name="_mm512_maskz_extractf64x2_pd">
+	<type>Floating Point</type>
+	<CPUID>AVX512DQ</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m512d"/>
+	<parameter varname="imm8" type="int"/>
+	<description>Extract 128 bits (composed of 2 packed double-precision (64-bit) floating-point elements) from "a", selected with "imm8", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). </description>
+	<operation>
+CASE imm8[7:0] of
+0: tmp[127:0] := a[127:0]
+1: tmp[127:0] := a[255:128]
+2: tmp[127:0] := a[383:256]
+3: tmp[127:0] := a[511:384]
+ESAC
+
+FOR j := 0 to 1
+	i := j*64
+	IF k[j]
+		dst[i+63:i] := tmp[i+63:i]
+	ELSE
+		dst[i+63:i] := 0
+	FI
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vextractf64x2"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm256_extracti32x4_epi32">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="a" type="__m256i"/>
+	<parameter varname="imm8" type="int"/>
+	<description>Extract 128 bits (composed of 4 packed 32-bit integers) from "a", selected with "imm8", and store the result in "dst".</description>
+	<operation>
+CASE imm8[7:0] of
+0: dst[127:0] := a[127:0]
+1: dst[127:0] := a[255:128]
+ESAC
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vextracti32x4"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm256_mask_extracti32x4_epi32">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="src" type="__m128i"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m256i"/>
+	<parameter varname="imm8" type="int"/>
+	<description>Extract 128 bits (composed of 4 packed 32-bit integers) from "a", selected with "imm8", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). </description>
+	<operation>
+CASE imm8[7:0] of
+0: tmp[127:0] := a[127:0]
+1: tmp[127:0] := a[255:128]
+ESAC
+FOR j := 0 to 3
+	i := j*32
+	IF k[j]
+		dst[i+31:i] := tmp[i+31:i]
+	ELSE
+		dst[i+31:i] := src[i+31:i]
+	FI
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vextracti32x4"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm256_maskz_extracti32x4_epi32">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m256i"/>
+	<parameter varname="imm8" type="int"/>
+	<description>Extract 128 bits (composed of 4 packed 32-bit integers) from "a", selected with "imm8", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). </description>
+	<operation>
+CASE imm8[7:0] of
+0: tmp[127:0] := a[127:0]
+1: tmp[127:0] := a[255:128]
+ESAC
+FOR j := 0 to 3
+	i := j*32
+	IF k[j]
+		dst[i+31:i] := tmp[i+31:i]
+	ELSE
+		dst[i+31:i] := 0
+	FI
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vextracti32x4"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256i" name="_mm512_extracti32x8_epi32">
+	<type>Integer</type>
+	<CPUID>AVX512DQ</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="a" type="__m512i"/>
+	<parameter varname="imm8" type="int"/>
+	<description>Extract 256 bits (composed of 8 packed 32-bit integers) from "a", selected with "imm8", and store the result in "dst".</description>
+	<operation>
+CASE imm8[7:0] of
+0: dst[255:0] := a[255:0]
+1: dst[255:0] := a[511:256]
+ESAC
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vextracti32x8"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256i" name="_mm512_mask_extracti32x8_epi32">
+	<type>Integer</type>
+	<CPUID>AVX512DQ</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="src" type="__m256i"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m512i"/>
+	<parameter varname="imm8" type="int"/>
+	<description>Extract 256 bits (composed of 8 packed 32-bit integers) from "a", selected with "imm8", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). </description>
+	<operation>
+CASE imm8[7:0] of
+0: tmp[255:0] := a[255:0]
+1: tmp[255:0] := a[511:256]
+ESAC
+
+FOR j := 0 to 7
+	i := j*32
+	IF k[j]
+		dst[i+31:i] := tmp[i+31:i]
+	ELSE
+		dst[i+31:i] := src[i+31:i]
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vextracti32x8"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256i" name="_mm512_maskz_extracti32x8_epi32">
+	<type>Integer</type>
+	<CPUID>AVX512DQ</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m512i"/>
+	<parameter varname="imm8" type="int"/>
+	<description>Extract 256 bits (composed of 8 packed 32-bit integers) from "a", selected with "imm8", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). </description>
+	<operation>
+CASE imm8[7:0] of
+0: tmp[255:0] := a[255:0]
+1: tmp[255:0] := a[511:256]
+ESAC
+
+FOR j := 0 to 7
+	i := j*32
+	IF k[j]
+		dst[i+31:i] := tmp[i+31:i]
+	ELSE
+		dst[i+31:i] := 0
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vextracti32x8"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm256_extracti64x2_epi64">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512DQ</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="a" type="__m256i"/>
+	<parameter varname="imm8" type="int"/>
+	<description>Extract 128 bits (composed of 2 packed 64-bit integers) from "a", selected with "imm8", and store the result in "dst".</description>
+	<operation>
+CASE imm8[7:0] of
+0: dst[127:0] := a[127:0]
+1: dst[127:0] := a[255:128]
+ESAC
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vextracti64x2"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm256_mask_extracti64x2_epi64">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512DQ</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="src" type="__m128i"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m256i"/>
+	<parameter varname="imm8" type="int"/>
+	<description>Extract 128 bits (composed of 2 packed 64-bit integers) from "a", selected with "imm8", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). </description>
+	<operation>
+CASE imm8[7:0] of
+0: tmp[127:0] := a[127:0]
+1: tmp[127:0] := a[255:128]
+ESAC
+
+FOR j := 0 to 1
+	i := j*64
+	IF k[j]
+		dst[i+63:i] := tmp[i+63:i]
+	ELSE
+		dst[i+63:i] := src[i+63:i]
+	FI
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vextracti64x2"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm256_maskz_extracti64x2_epi64">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512DQ</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m256i"/>
+	<parameter varname="imm8" type="int"/>
+	<description>Extract 128 bits (composed of 2 packed 64-bit integers) from "a", selected with "imm8", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). </description>
+	<operation>
+CASE imm8[7:0] of
+0: tmp[127:0] := a[127:0]
+1: tmp[127:0] := a[255:128]
+ESAC
+
+FOR j := 0 to 1
+	i := j*64
+	IF k[j]
+		dst[i+63:i] := tmp[i+63:i]
+	ELSE
+		dst[i+63:i] := 0
+	FI
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vextracti64x2"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm512_extracti64x2_epi64">
+	<type>Integer</type>
+	<CPUID>AVX512DQ</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="a" type="__m512i"/>
+	<parameter varname="imm8" type="int"/>
+	<description>Extract 128 bits (composed of 2 packed 64-bit integers) from "a", selected with "imm8", and store the result in "dst".</description>
+	<operation>
+CASE imm8[7:0] of
+0: dst[127:0] := a[127:0]
+1: dst[127:0] := a[255:128]
+2: dst[127:0] := a[383:256]
+3: dst[127:0] := a[511:384]
+ESAC
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vextracti64x2"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm512_mask_extracti64x2_epi64">
+	<type>Integer</type>
+	<CPUID>AVX512DQ</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="src" type="__m128i"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m512i"/>
+	<parameter varname="imm8" type="int"/>
+	<description>Extract 128 bits (composed of 2 packed 64-bit integers) from "a", selected with "imm8", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). </description>
+	<operation>
+CASE imm8[7:0] of
+0: tmp[127:0] := a[127:0]
+1: tmp[127:0] := a[255:128]
+2: tmp[127:0] := a[383:256]
+3: tmp[127:0] := a[511:384]
+ESAC
+
+FOR j := 0 to 1
+	i := j*64
+	IF k[j]
+		dst[i+63:i] := tmp[i+63:i]
+	ELSE
+		dst[i+63:i] := src[i+63:i]
+	FI
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vextracti64x2"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm512_maskz_extracti64x2_epi64">
+	<type>Integer</type>
+	<CPUID>AVX512DQ</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m512i"/>
+	<parameter varname="imm8" type="int"/>
+	<description>Extract 128 bits (composed of 2 packed 64-bit integers) from "a", selected with "imm8", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). </description>
+	<operation>
+CASE imm8[7:0] of
+0: tmp[127:0] := a[127:0]
+1: tmp[127:0] := a[255:128]
+2: tmp[127:0] := a[383:256]
+3: tmp[127:0] := a[511:384]
+ESAC
+
+FOR j := 0 to 1
+	i := j*64
+	IF k[j]
+		dst[i+63:i] := tmp[i+63:i]
+	ELSE
+		dst[i+63:i] := 0
+	FI
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vextracti64x2"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256d" name="_mm256_fixupimm_pd">
+	<type>Floating Point</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="a" type="__m256d"/>
+	<parameter varname="b" type="__m256d"/>
+	<parameter varname="c" type="__m256i"/>
+	<parameter varname="imm8" type="int"/>
+	<description>Fix up packed double-precision (64-bit) floating-point elements in "a" and "b" using packed 64-bit integers in "c", and store the results in "dst". "imm8" is used to set the required flags reporting.</description>
+	<operation>
+enum TOKEN_TYPE {
+	QNAN_TOKEN := 0, 
+	SNAN_TOKEN := 1, 
+	ZERO_VALUE_TOKEN := 2, 
+	ONE_VALUE_TOKEN := 3, 
+	NEG_INF_TOKEN := 4, 
+	POS_INF_TOKEN := 5, 
+	NEG_VALUE_TOKEN := 6, 
+	POS_VALUE_TOKEN := 7
+}
+FIXUPIMMPD(src1[63:0], src2[63:0], src3[63:0], imm8[7:0]){
+	tsrc[63:0] := ((src2[62:52] == 0) AND (MXCSR.DAZ == 1)) ? 0.0 : src2[63:0]
+	CASE(tsrc[63:0] of TOKEN_TYPE)
+	QNAN_TOKEN:j := 0
+	SNAN_TOKEN:j := 1
+	ZERO_VALUE_TOKEN: j := 2
+	ONE_VALUE_TOKEN: j := 3
+	NEG_INF_TOKEN: j := 4
+	POS_INF_TOKEN: j := 5
+	NEG_VALUE_TOKEN: j := 6
+	POS_VALUE_TOKEN: j := 7
+	ESAC
+	
+	token_response[3:0] := src3[3+4*j:4*j]
+	
+	CASE(token_response[3:0]) of
+	0 : dest[63:0] := src1[63:0]
+	1 : dest[63:0] := tsrc[63:0]
+	2 : dest[63:0] := QNaN(tsrc[63:0])
+	3 : dest[63:0] := QNAN_Indefinite
+	4 : dest[63:0] := -INF
+	5 : dest[63:0] := +INF
+	6 : dest[63:0] := tsrc.sign? -INF : +INF
+	7 : dest[63:0] := -0
+	8 : dest[63:0] := +0
+	9 : dest[63:0] := -1
+	10: dest[63:0] := +1
+	11: dest[63:0] := 1/2
+	12: dest[63:0] := 90.0
+	13: dest[63:0] := PI/2
+	14: dest[63:0] := MAX_FLOAT
+	15: dest[63:0] := -MAX_FLOAT
+	ESAC
+	
+	CASE(tsrc[63:0] of TOKEN_TYPE)
+	ZERO_VALUE_TOKEN: if imm8[0] then set #ZE
+	ZERO_VALUE_TOKEN: if imm8[1] then set #IE
+	ONE_VALUE_TOKEN: if imm8[2] then set #ZE
+	ONE_VALUE_TOKEN: if imm8[3] then set #IE
+	SNAN_TOKEN: if imm8[4] then set #IE
+	NEG_INF_TOKEN: if imm8[5] then set #IE
+	NEG_VALUE_TOKEN: if imm8[6] then set #IE
+	POS_INF_TOKEN: if imm8[7] then set #IE
+	ESAC
+	RETURN dest[63:0]
+}
+
+FOR j := 0 to 3
+	i := j*64
+	dst[i+63:i] := FIXUPIMMPD(a[i+63:i], b[i+63:i], c[i+63:i], imm8[7:0])
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vfixupimmpd"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256d" name="_mm256_mask_fixupimm_pd">
+	<type>Floating Point</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="a" type="__m256d"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="b" type="__m256d"/>
+	<parameter varname="c" type="__m256i"/>
+	<parameter varname="imm8" type="int"/>
+	<description>Fix up packed double-precision (64-bit) floating-point elements in "a" and "b" using packed 64-bit integers in "c", and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). "imm8" is used to set the required flags reporting.	</description>
+	<operation>
+enum TOKEN_TYPE {
+	QNAN_TOKEN := 0, 
+	SNAN_TOKEN := 1, 
+	ZERO_VALUE_TOKEN := 2, 
+	ONE_VALUE_TOKEN := 3, 
+	NEG_INF_TOKEN := 4, 
+	POS_INF_TOKEN := 5, 
+	NEG_VALUE_TOKEN := 6, 
+	POS_VALUE_TOKEN := 7
+}
+FIXUPIMMPD(src1[63:0], src2[63:0], src3[63:0], imm8[7:0]){
+	tsrc[63:0] := ((src2[62:52] == 0) AND (MXCSR.DAZ == 1)) ? 0.0 : src2[63:0]
+	CASE(tsrc[63:0] of TOKEN_TYPE)
+	QNAN_TOKEN:j := 0
+	SNAN_TOKEN:j := 1
+	ZERO_VALUE_TOKEN: j := 2
+	ONE_VALUE_TOKEN: j := 3
+	NEG_INF_TOKEN: j := 4
+	POS_INF_TOKEN: j := 5
+	NEG_VALUE_TOKEN: j := 6
+	POS_VALUE_TOKEN: j := 7
+	ESAC
+	
+	token_response[3:0] := src3[3+4*j:4*j]
+	
+	CASE(token_response[3:0]) of
+	0 : dest[63:0] := src1[63:0]
+	1 : dest[63:0] := tsrc[63:0]
+	2 : dest[63:0] := QNaN(tsrc[63:0])
+	3 : dest[63:0] := QNAN_Indefinite
+	4 : dest[63:0] := -INF
+	5 : dest[63:0] := +INF
+	6 : dest[63:0] := tsrc.sign? -INF : +INF
+	7 : dest[63:0] := -0
+	8 : dest[63:0] := +0
+	9 : dest[63:0] := -1
+	10: dest[63:0] := +1
+	11: dest[63:0] := 1/2
+	12: dest[63:0] := 90.0
+	13: dest[63:0] := PI/2
+	14: dest[63:0] := MAX_FLOAT
+	15: dest[63:0] := -MAX_FLOAT
+	ESAC
+	
+	CASE(tsrc[63:0] of TOKEN_TYPE)
+	ZERO_VALUE_TOKEN: if imm8[0] then set #ZE
+	ZERO_VALUE_TOKEN: if imm8[1] then set #IE
+	ONE_VALUE_TOKEN: if imm8[2] then set #ZE
+	ONE_VALUE_TOKEN: if imm8[3] then set #IE
+	SNAN_TOKEN: if imm8[4] then set #IE
+	NEG_INF_TOKEN: if imm8[5] then set #IE
+	NEG_VALUE_TOKEN: if imm8[6] then set #IE
+	POS_INF_TOKEN: if imm8[7] then set #IE
+	ESAC
+	RETURN dest[63:0]
+}
+
+FOR j := 0 to 3
+	i := j*64
+	IF k[j]
+		dst[i+63:i] := FIXUPIMMPD(a[i+63:i], b[i+63:i], c[i+63:i], imm8[7:0])
+	ELSE
+		dst[i+63:i] := a[i+63:i]
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vfixupimmpd"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256d" name="_mm256_maskz_fixupimm_pd">
+	<type>Floating Point</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m256d"/>
+	<parameter varname="b" type="__m256d"/>
+	<parameter varname="c" type="__m256i"/>
+	<parameter varname="imm8" type="int"/>
+	<description>Fix up packed double-precision (64-bit) floating-point elements in "a" and "b" using packed 64-bit integers in "c", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). "imm8" is used to set the required flags reporting.</description>
+	<operation>
+enum TOKEN_TYPE {
+	QNAN_TOKEN := 0, 
+	SNAN_TOKEN := 1, 
+	ZERO_VALUE_TOKEN := 2, 
+	ONE_VALUE_TOKEN := 3, 
+	NEG_INF_TOKEN := 4, 
+	POS_INF_TOKEN := 5, 
+	NEG_VALUE_TOKEN := 6, 
+	POS_VALUE_TOKEN := 7
+}
+FIXUPIMMPD(src1[63:0], src2[63:0], src3[63:0], imm8[7:0]){
+	tsrc[63:0] := ((src2[62:52] == 0) AND (MXCSR.DAZ == 1)) ? 0.0 : src2[63:0]
+	CASE(tsrc[63:0] of TOKEN_TYPE)
+	QNAN_TOKEN:j := 0
+	SNAN_TOKEN:j := 1
+	ZERO_VALUE_TOKEN: j := 2
+	ONE_VALUE_TOKEN: j := 3
+	NEG_INF_TOKEN: j := 4
+	POS_INF_TOKEN: j := 5
+	NEG_VALUE_TOKEN: j := 6
+	POS_VALUE_TOKEN: j := 7
+	ESAC
+	
+	token_response[3:0] := src3[3+4*j:4*j]
+	
+	CASE(token_response[3:0]) of
+	0 : dest[63:0] := src1[63:0]
+	1 : dest[63:0] := tsrc[63:0]
+	2 : dest[63:0] := QNaN(tsrc[63:0])
+	3 : dest[63:0] := QNAN_Indefinite
+	4 : dest[63:0] := -INF
+	5 : dest[63:0] := +INF
+	6 : dest[63:0] := tsrc.sign? -INF : +INF
+	7 : dest[63:0] := -0
+	8 : dest[63:0] := +0
+	9 : dest[63:0] := -1
+	10: dest[63:0] := +1
+	11: dest[63:0] := 1/2
+	12: dest[63:0] := 90.0
+	13: dest[63:0] := PI/2
+	14: dest[63:0] := MAX_FLOAT
+	15: dest[63:0] := -MAX_FLOAT
+	ESAC
+	
+	CASE(tsrc[63:0] of TOKEN_TYPE)
+	ZERO_VALUE_TOKEN: if imm8[0] then set #ZE
+	ZERO_VALUE_TOKEN: if imm8[1] then set #IE
+	ONE_VALUE_TOKEN: if imm8[2] then set #ZE
+	ONE_VALUE_TOKEN: if imm8[3] then set #IE
+	SNAN_TOKEN: if imm8[4] then set #IE
+	NEG_INF_TOKEN: if imm8[5] then set #IE
+	NEG_VALUE_TOKEN: if imm8[6] then set #IE
+	POS_INF_TOKEN: if imm8[7] then set #IE
+	ESAC
+	RETURN dest[63:0]
+}
+
+FOR j := 0 to 3
+	i := j*64
+	IF k[j]
+		dst[i+63:i] := FIXUPIMMPD(a[i+63:i], b[i+63:i], c[i+63:i], imm8[7:0])
+	ELSE
+		dst[i+63:i] := 0
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vfixupimmpd"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128d" name="_mm_fixupimm_pd">
+	<type>Floating Point</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="a" type="__m128d"/>
+	<parameter varname="b" type="__m128d"/>
+	<parameter varname="c" type="__m128i"/>
+	<parameter varname="imm8" type="int"/>
+	<description>Fix up packed double-precision (64-bit) floating-point elements in "a" and "b" using packed 64-bit integers in "c", and store the results in "dst". "imm8" is used to set the required flags reporting.</description>
+	<operation>
+enum TOKEN_TYPE {
+	QNAN_TOKEN := 0, 
+	SNAN_TOKEN := 1, 
+	ZERO_VALUE_TOKEN := 2, 
+	ONE_VALUE_TOKEN := 3, 
+	NEG_INF_TOKEN := 4, 
+	POS_INF_TOKEN := 5, 
+	NEG_VALUE_TOKEN := 6, 
+	POS_VALUE_TOKEN := 7
+}
+FIXUPIMMPD(src1[63:0], src2[63:0], src3[63:0], imm8[7:0]){
+	tsrc[63:0] := ((src2[62:52] == 0) AND (MXCSR.DAZ == 1)) ? 0.0 : src2[63:0]
+	CASE(tsrc[63:0] of TOKEN_TYPE)
+	QNAN_TOKEN:j := 0
+	SNAN_TOKEN:j := 1
+	ZERO_VALUE_TOKEN: j := 2
+	ONE_VALUE_TOKEN: j := 3
+	NEG_INF_TOKEN: j := 4
+	POS_INF_TOKEN: j := 5
+	NEG_VALUE_TOKEN: j := 6
+	POS_VALUE_TOKEN: j := 7
+	ESAC
+	
+	token_response[3:0] := src3[3+4*j:4*j]
+	
+	CASE(token_response[3:0]) of
+	0 : dest[63:0] := src1[63:0]
+	1 : dest[63:0] := tsrc[63:0]
+	2 : dest[63:0] := QNaN(tsrc[63:0])
+	3 : dest[63:0] := QNAN_Indefinite
+	4 : dest[63:0] := -INF
+	5 : dest[63:0] := +INF
+	6 : dest[63:0] := tsrc.sign? -INF : +INF
+	7 : dest[63:0] := -0
+	8 : dest[63:0] := +0
+	9 : dest[63:0] := -1
+	10: dest[63:0] := +1
+	11: dest[63:0] := 1/2
+	12: dest[63:0] := 90.0
+	13: dest[63:0] := PI/2
+	14: dest[63:0] := MAX_FLOAT
+	15: dest[63:0] := -MAX_FLOAT
+	ESAC
+	
+	CASE(tsrc[63:0] of TOKEN_TYPE)
+	ZERO_VALUE_TOKEN: if imm8[0] then set #ZE
+	ZERO_VALUE_TOKEN: if imm8[1] then set #IE
+	ONE_VALUE_TOKEN: if imm8[2] then set #ZE
+	ONE_VALUE_TOKEN: if imm8[3] then set #IE
+	SNAN_TOKEN: if imm8[4] then set #IE
+	NEG_INF_TOKEN: if imm8[5] then set #IE
+	NEG_VALUE_TOKEN: if imm8[6] then set #IE
+	POS_INF_TOKEN: if imm8[7] then set #IE
+	ESAC
+	RETURN dest[63:0]
+}
+
+FOR j := 0 to 1
+	i := j*64
+	dst[i+63:i] := FIXUPIMMPD(a[i+63:i], b[i+63:i], c[i+63:i], imm8[7:0])
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vfixupimmpd"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128d" name="_mm_mask_fixupimm_pd">
+	<type>Floating Point</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="a" type="__m128d"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="b" type="__m128d"/>
+	<parameter varname="c" type="__m128i"/>
+	<parameter varname="imm8" type="int"/>
+	<description>Fix up packed double-precision (64-bit) floating-point elements in "a" and "b" using packed 64-bit integers in "c", and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). "imm8" is used to set the required flags reporting.	</description>
+	<operation>
+enum TOKEN_TYPE {
+	QNAN_TOKEN := 0, 
+	SNAN_TOKEN := 1, 
+	ZERO_VALUE_TOKEN := 2, 
+	ONE_VALUE_TOKEN := 3, 
+	NEG_INF_TOKEN := 4, 
+	POS_INF_TOKEN := 5, 
+	NEG_VALUE_TOKEN := 6, 
+	POS_VALUE_TOKEN := 7
+}
+FIXUPIMMPD(src1[63:0], src2[63:0], src3[63:0], imm8[7:0]){
+	tsrc[63:0] := ((src2[62:52] == 0) AND (MXCSR.DAZ == 1)) ? 0.0 : src2[63:0]
+	CASE(tsrc[63:0] of TOKEN_TYPE)
+	QNAN_TOKEN:j := 0
+	SNAN_TOKEN:j := 1
+	ZERO_VALUE_TOKEN: j := 2
+	ONE_VALUE_TOKEN: j := 3
+	NEG_INF_TOKEN: j := 4
+	POS_INF_TOKEN: j := 5
+	NEG_VALUE_TOKEN: j := 6
+	POS_VALUE_TOKEN: j := 7
+	ESAC
+	
+	token_response[3:0] := src3[3+4*j:4*j]
+	
+	CASE(token_response[3:0]) of
+	0 : dest[63:0] := src1[63:0]
+	1 : dest[63:0] := tsrc[63:0]
+	2 : dest[63:0] := QNaN(tsrc[63:0])
+	3 : dest[63:0] := QNAN_Indefinite
+	4 : dest[63:0] := -INF
+	5 : dest[63:0] := +INF
+	6 : dest[63:0] := tsrc.sign? -INF : +INF
+	7 : dest[63:0] := -0
+	8 : dest[63:0] := +0
+	9 : dest[63:0] := -1
+	10: dest[63:0] := +1
+	11: dest[63:0] := 1/2
+	12: dest[63:0] := 90.0
+	13: dest[63:0] := PI/2
+	14: dest[63:0] := MAX_FLOAT
+	15: dest[63:0] := -MAX_FLOAT
+	ESAC
+	
+	CASE(tsrc[63:0] of TOKEN_TYPE)
+	ZERO_VALUE_TOKEN: if imm8[0] then set #ZE
+	ZERO_VALUE_TOKEN: if imm8[1] then set #IE
+	ONE_VALUE_TOKEN: if imm8[2] then set #ZE
+	ONE_VALUE_TOKEN: if imm8[3] then set #IE
+	SNAN_TOKEN: if imm8[4] then set #IE
+	NEG_INF_TOKEN: if imm8[5] then set #IE
+	NEG_VALUE_TOKEN: if imm8[6] then set #IE
+	POS_INF_TOKEN: if imm8[7] then set #IE
+	ESAC
+	RETURN dest[63:0]
+}
+
+FOR j := 0 to 1
+	i := j*64
+	IF k[j]
+		dst[i+63:i] := FIXUPIMMPD(a[i+63:i], b[i+63:i], c[i+63:i], imm8[7:0])
+	ELSE
+		dst[i+63:i] := a[i+63:i]
+	FI
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vfixupimmpd"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128d" name="_mm_maskz_fixupimm_pd">
+	<type>Floating Point</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128d"/>
+	<parameter varname="b" type="__m128d"/>
+	<parameter varname="c" type="__m128i"/>
+	<parameter varname="imm8" type="int"/>
+	<description>Fix up packed double-precision (64-bit) floating-point elements in "a" and "b" using packed 64-bit integers in "c", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). "imm8" is used to set the required flags reporting.</description>
+	<operation>
+enum TOKEN_TYPE {
+	QNAN_TOKEN := 0, 
+	SNAN_TOKEN := 1, 
+	ZERO_VALUE_TOKEN := 2, 
+	ONE_VALUE_TOKEN := 3, 
+	NEG_INF_TOKEN := 4, 
+	POS_INF_TOKEN := 5, 
+	NEG_VALUE_TOKEN := 6, 
+	POS_VALUE_TOKEN := 7
+}
+FIXUPIMMPD(src1[63:0], src2[63:0], src3[63:0], imm8[7:0]){
+	tsrc[63:0] := ((src2[62:52] == 0) AND (MXCSR.DAZ == 1)) ? 0.0 : src2[63:0]
+	CASE(tsrc[63:0] of TOKEN_TYPE)
+	QNAN_TOKEN:j := 0
+	SNAN_TOKEN:j := 1
+	ZERO_VALUE_TOKEN: j := 2
+	ONE_VALUE_TOKEN: j := 3
+	NEG_INF_TOKEN: j := 4
+	POS_INF_TOKEN: j := 5
+	NEG_VALUE_TOKEN: j := 6
+	POS_VALUE_TOKEN: j := 7
+	ESAC
+	
+	token_response[3:0] := src3[3+4*j:4*j]
+	
+	CASE(token_response[3:0]) of
+	0 : dest[63:0] := src1[63:0]
+	1 : dest[63:0] := tsrc[63:0]
+	2 : dest[63:0] := QNaN(tsrc[63:0])
+	3 : dest[63:0] := QNAN_Indefinite
+	4 : dest[63:0] := -INF
+	5 : dest[63:0] := +INF
+	6 : dest[63:0] := tsrc.sign? -INF : +INF
+	7 : dest[63:0] := -0
+	8 : dest[63:0] := +0
+	9 : dest[63:0] := -1
+	10: dest[63:0] := +1
+	11: dest[63:0] := 1/2
+	12: dest[63:0] := 90.0
+	13: dest[63:0] := PI/2
+	14: dest[63:0] := MAX_FLOAT
+	15: dest[63:0] := -MAX_FLOAT
+	ESAC
+	
+	CASE(tsrc[63:0] of TOKEN_TYPE)
+	ZERO_VALUE_TOKEN: if imm8[0] then set #ZE
+	ZERO_VALUE_TOKEN: if imm8[1] then set #IE
+	ONE_VALUE_TOKEN: if imm8[2] then set #ZE
+	ONE_VALUE_TOKEN: if imm8[3] then set #IE
+	SNAN_TOKEN: if imm8[4] then set #IE
+	NEG_INF_TOKEN: if imm8[5] then set #IE
+	NEG_VALUE_TOKEN: if imm8[6] then set #IE
+	POS_INF_TOKEN: if imm8[7] then set #IE
+	ESAC
+	RETURN dest[63:0]
+}
+
+FOR j := 0 to 1
+	i := j*64
+	IF k[j]
+		dst[i+63:i] := FIXUPIMMPD(a[i+63:i], b[i+63:i], c[i+63:i], imm8[7:0])
+	ELSE
+		dst[i+63:i] := 0
+	FI
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vfixupimmpd"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256" name="_mm256_fixupimm_ps">
+	<type>Floating Point</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="a" type="__m256"/>
+	<parameter varname="b" type="__m256"/>
+	<parameter varname="c" type="__m256i"/>
+	<parameter varname="imm8" type="int"/>
+	<description>Fix up packed single-precision (32-bit) floating-point elements in "a" and "b" using packed 32-bit integers in "c", and store the results in "dst". "imm8" is used to set the required flags reporting.</description>
+	<operation>
+enum TOKEN_TYPE {
+	QNAN_TOKEN := 0, 
+	SNAN_TOKEN := 1, 
+	ZERO_VALUE_TOKEN := 2, 
+	ONE_VALUE_TOKEN := 3, 
+	NEG_INF_TOKEN := 4, 
+	POS_INF_TOKEN := 5, 
+	NEG_VALUE_TOKEN := 6, 
+	POS_VALUE_TOKEN := 7
+}
+FIXUPIMMPD(src1[31:0], src2[31:0], src3[31:0], imm8[7:0]){
+	tsrc[31:0] := ((src2[30:23] == 0) AND (MXCSR.DAZ == 1)) ? 0.0 : src2[31:0]
+	CASE(tsrc[31:0] of TOKEN_TYPE)
+	QNAN_TOKEN:j := 0
+	SNAN_TOKEN:j := 1
+	ZERO_VALUE_TOKEN: j := 2
+	ONE_VALUE_TOKEN: j := 3
+	NEG_INF_TOKEN: j := 4
+	POS_INF_TOKEN: j := 5
+	NEG_VALUE_TOKEN: j := 6
+	POS_VALUE_TOKEN: j := 7
+	ESAC
+	
+	token_response[3:0] := src3[3+4*j:4*j]
+	
+	CASE(token_response[3:0]) of
+	0 : dest[31:0] := src1[31:0]
+	1 : dest[31:0] := tsrc[31:0]
+	2 : dest[31:0] := QNaN(tsrc[31:0])
+	3 : dest[31:0] := QNAN_Indefinite
+	4 : dest[31:0] := -INF
+	5 : dest[31:0] := +INF
+	6 : dest[31:0] := tsrc.sign? -INF : +INF
+	7 : dest[31:0] := -0
+	8 : dest[31:0] := +0
+	9 : dest[31:0] := -1
+	10: dest[31:0] := +1
+	11: dest[31:0] := 1/2
+	12: dest[31:0] := 90.0
+	13: dest[31:0] := PI/2
+	14: dest[31:0] := MAX_FLOAT
+	15: dest[31:0] := -MAX_FLOAT
+	ESAC
+	
+	CASE(tsrc[31:0] of TOKEN_TYPE)
+	ZERO_VALUE_TOKEN: if imm8[0] then set #ZE
+	ZERO_VALUE_TOKEN: if imm8[1] then set #IE
+	ONE_VALUE_TOKEN: if imm8[2] then set #ZE
+	ONE_VALUE_TOKEN: if imm8[3] then set #IE
+	SNAN_TOKEN: if imm8[4] then set #IE
+	NEG_INF_TOKEN: if imm8[5] then set #IE
+	NEG_VALUE_TOKEN: if imm8[6] then set #IE
+	POS_INF_TOKEN: if imm8[7] then set #IE
+	ESAC
+	RETURN dest[31:0]
+}
+
+FOR j := 0 to 7
+	i := j*32
+	dst[i+31:i] := FIXUPIMMPD(a[i+31:i], b[i+31:i], c[i+31:i], imm8[7:0])
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vfixupimmps"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256" name="_mm256_mask_fixupimm_ps">
+	<type>Floating Point</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="a" type="__m256"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="b" type="__m256"/>
+	<parameter varname="c" type="__m256i"/>
+	<parameter varname="imm8" type="int"/>
+	<description>Fix up packed single-precision (32-bit) floating-point elements in "a" and "b" using packed 32-bit integers in "c", and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). "imm8" is used to set the required flags reporting.</description>
+	<operation>
+enum TOKEN_TYPE {
+	QNAN_TOKEN := 0, 
+	SNAN_TOKEN := 1, 
+	ZERO_VALUE_TOKEN := 2, 
+	ONE_VALUE_TOKEN := 3, 
+	NEG_INF_TOKEN := 4, 
+	POS_INF_TOKEN := 5, 
+	NEG_VALUE_TOKEN := 6, 
+	POS_VALUE_TOKEN := 7
+}
+FIXUPIMMPD(src1[31:0], src2[31:0], src3[31:0], imm8[7:0]){
+	tsrc[31:0] := ((src2[30:23] == 0) AND (MXCSR.DAZ == 1)) ? 0.0 : src2[31:0]
+	CASE(tsrc[31:0] of TOKEN_TYPE)
+	QNAN_TOKEN:j := 0
+	SNAN_TOKEN:j := 1
+	ZERO_VALUE_TOKEN: j := 2
+	ONE_VALUE_TOKEN: j := 3
+	NEG_INF_TOKEN: j := 4
+	POS_INF_TOKEN: j := 5
+	NEG_VALUE_TOKEN: j := 6
+	POS_VALUE_TOKEN: j := 7
+	ESAC
+	
+	token_response[3:0] := src3[3+4*j:4*j]
+	
+	CASE(token_response[3:0]) of
+	0 : dest[31:0] := src1[31:0]
+	1 : dest[31:0] := tsrc[31:0]
+	2 : dest[31:0] := QNaN(tsrc[31:0])
+	3 : dest[31:0] := QNAN_Indefinite
+	4 : dest[31:0] := -INF
+	5 : dest[31:0] := +INF
+	6 : dest[31:0] := tsrc.sign? -INF : +INF
+	7 : dest[31:0] := -0
+	8 : dest[31:0] := +0
+	9 : dest[31:0] := -1
+	10: dest[31:0] := +1
+	11: dest[31:0] := 1/2
+	12: dest[31:0] := 90.0
+	13: dest[31:0] := PI/2
+	14: dest[31:0] := MAX_FLOAT
+	15: dest[31:0] := -MAX_FLOAT
+	ESAC
+	
+	CASE(tsrc[31:0] of TOKEN_TYPE)
+	ZERO_VALUE_TOKEN: if imm8[0] then set #ZE
+	ZERO_VALUE_TOKEN: if imm8[1] then set #IE
+	ONE_VALUE_TOKEN: if imm8[2] then set #ZE
+	ONE_VALUE_TOKEN: if imm8[3] then set #IE
+	SNAN_TOKEN: if imm8[4] then set #IE
+	NEG_INF_TOKEN: if imm8[5] then set #IE
+	NEG_VALUE_TOKEN: if imm8[6] then set #IE
+	POS_INF_TOKEN: if imm8[7] then set #IE
+	ESAC
+	RETURN dest[31:0]
+}
+
+FOR j := 0 to 7
+	i := j*32
+	IF k[j]
+		dst[i+31:i] := FIXUPIMMPD(a[i+31:i], b[i+31:i], c[i+31:i], imm8[7:0])
+	ELSE
+		dst[i+31:i] := a[i+31:i]
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vfixupimmps"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256" name="_mm256_maskz_fixupimm_ps">
+	<type>Floating Point</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m256"/>
+	<parameter varname="b" type="__m256"/>
+	<parameter varname="c" type="__m256i"/>
+	<parameter varname="imm8" type="int"/>
+	<description>Fix up packed single-precision (32-bit) floating-point elements in "a" and "b" using packed 32-bit integers in "c", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). "imm8" is used to set the required flags reporting.</description>
+	<operation>
+enum TOKEN_TYPE {
+	QNAN_TOKEN := 0, 
+	SNAN_TOKEN := 1, 
+	ZERO_VALUE_TOKEN := 2, 
+	ONE_VALUE_TOKEN := 3, 
+	NEG_INF_TOKEN := 4, 
+	POS_INF_TOKEN := 5, 
+	NEG_VALUE_TOKEN := 6, 
+	POS_VALUE_TOKEN := 7
+}
+FIXUPIMMPD(src1[31:0], src2[31:0], src3[31:0], imm8[7:0]){
+	tsrc[31:0] := ((src2[30:23] == 0) AND (MXCSR.DAZ == 1)) ? 0.0 : src2[31:0]
+	CASE(tsrc[31:0] of TOKEN_TYPE)
+	QNAN_TOKEN:j := 0
+	SNAN_TOKEN:j := 1
+	ZERO_VALUE_TOKEN: j := 2
+	ONE_VALUE_TOKEN: j := 3
+	NEG_INF_TOKEN: j := 4
+	POS_INF_TOKEN: j := 5
+	NEG_VALUE_TOKEN: j := 6
+	POS_VALUE_TOKEN: j := 7
+	ESAC
+	
+	token_response[3:0] := src3[3+4*j:4*j]
+	
+	CASE(token_response[3:0]) of
+	0 : dest[31:0] := src1[31:0]
+	1 : dest[31:0] := tsrc[31:0]
+	2 : dest[31:0] := QNaN(tsrc[31:0])
+	3 : dest[31:0] := QNAN_Indefinite
+	4 : dest[31:0] := -INF
+	5 : dest[31:0] := +INF
+	6 : dest[31:0] := tsrc.sign? -INF : +INF
+	7 : dest[31:0] := -0
+	8 : dest[31:0] := +0
+	9 : dest[31:0] := -1
+	10: dest[31:0] := +1
+	11: dest[31:0] := 1/2
+	12: dest[31:0] := 90.0
+	13: dest[31:0] := PI/2
+	14: dest[31:0] := MAX_FLOAT
+	15: dest[31:0] := -MAX_FLOAT
+	ESAC
+	
+	CASE(tsrc[31:0] of TOKEN_TYPE)
+	ZERO_VALUE_TOKEN: if imm8[0] then set #ZE
+	ZERO_VALUE_TOKEN: if imm8[1] then set #IE
+	ONE_VALUE_TOKEN: if imm8[2] then set #ZE
+	ONE_VALUE_TOKEN: if imm8[3] then set #IE
+	SNAN_TOKEN: if imm8[4] then set #IE
+	NEG_INF_TOKEN: if imm8[5] then set #IE
+	NEG_VALUE_TOKEN: if imm8[6] then set #IE
+	POS_INF_TOKEN: if imm8[7] then set #IE
+	ESAC
+	RETURN dest[31:0]
+}
+
+FOR j := 0 to 7
+	i := j*32
+	IF k[j]
+		dst[i+31:i] := FIXUPIMMPD(a[i+31:i], b[i+31:i], c[i+31:i], imm8[7:0])
+	ELSE
+		dst[i+31:i] := 0
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vfixupimmps"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128" name="_mm_fixupimm_ps">
+	<type>Floating Point</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="a" type="__m128"/>
+	<parameter varname="b" type="__m128"/>
+	<parameter varname="c" type="__m128i"/>
+	<parameter varname="imm8" type="int"/>
+	<description>Fix up packed single-precision (32-bit) floating-point elements in "a" and "b" using packed 32-bit integers in "c", and store the results in "dst". "imm8" is used to set the required flags reporting.</description>
+	<operation>
+enum TOKEN_TYPE {
+	QNAN_TOKEN := 0, 
+	SNAN_TOKEN := 1, 
+	ZERO_VALUE_TOKEN := 2, 
+	ONE_VALUE_TOKEN := 3, 
+	NEG_INF_TOKEN := 4, 
+	POS_INF_TOKEN := 5, 
+	NEG_VALUE_TOKEN := 6, 
+	POS_VALUE_TOKEN := 7
+}
+FIXUPIMMPD(src1[31:0], src2[31:0], src3[31:0], imm8[7:0]){
+	tsrc[31:0] := ((src2[30:23] == 0) AND (MXCSR.DAZ == 1)) ? 0.0 : src2[31:0]
+	CASE(tsrc[31:0] of TOKEN_TYPE)
+	QNAN_TOKEN:j := 0
+	SNAN_TOKEN:j := 1
+	ZERO_VALUE_TOKEN: j := 2
+	ONE_VALUE_TOKEN: j := 3
+	NEG_INF_TOKEN: j := 4
+	POS_INF_TOKEN: j := 5
+	NEG_VALUE_TOKEN: j := 6
+	POS_VALUE_TOKEN: j := 7
+	ESAC
+	
+	token_response[3:0] := src3[3+4*j:4*j]
+	
+	CASE(token_response[3:0]) of
+	0 : dest[31:0] := src1[31:0]
+	1 : dest[31:0] := tsrc[31:0]
+	2 : dest[31:0] := QNaN(tsrc[31:0])
+	3 : dest[31:0] := QNAN_Indefinite
+	4 : dest[31:0] := -INF
+	5 : dest[31:0] := +INF
+	6 : dest[31:0] := tsrc.sign? -INF : +INF
+	7 : dest[31:0] := -0
+	8 : dest[31:0] := +0
+	9 : dest[31:0] := -1
+	10: dest[31:0] := +1
+	11: dest[31:0] := 1/2
+	12: dest[31:0] := 90.0
+	13: dest[31:0] := PI/2
+	14: dest[31:0] := MAX_FLOAT
+	15: dest[31:0] := -MAX_FLOAT
+	ESAC
+	
+	CASE(tsrc[31:0] of TOKEN_TYPE)
+	ZERO_VALUE_TOKEN: if imm8[0] then set #ZE
+	ZERO_VALUE_TOKEN: if imm8[1] then set #IE
+	ONE_VALUE_TOKEN: if imm8[2] then set #ZE
+	ONE_VALUE_TOKEN: if imm8[3] then set #IE
+	SNAN_TOKEN: if imm8[4] then set #IE
+	NEG_INF_TOKEN: if imm8[5] then set #IE
+	NEG_VALUE_TOKEN: if imm8[6] then set #IE
+	POS_INF_TOKEN: if imm8[7] then set #IE
+	ESAC
+	RETURN dest[31:0]
+}
+
+FOR j := 0 to 3
+	i := j*32
+	dst[i+31:i] := FIXUPIMMPD(a[i+31:i], b[i+31:i], c[i+31:i], imm8[7:0])
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vfixupimmps"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128" name="_mm_mask_fixupimm_ps">
+	<type>Floating Point</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="a" type="__m128"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="b" type="__m128"/>
+	<parameter varname="c" type="__m128i"/>
+	<parameter varname="imm8" type="int"/>
+	<description>Fix up packed single-precision (32-bit) floating-point elements in "a" and "b" using packed 32-bit integers in "c", and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). "imm8" is used to set the required flags reporting.</description>
+	<operation>
+enum TOKEN_TYPE {
+	QNAN_TOKEN := 0, 
+	SNAN_TOKEN := 1, 
+	ZERO_VALUE_TOKEN := 2, 
+	ONE_VALUE_TOKEN := 3, 
+	NEG_INF_TOKEN := 4, 
+	POS_INF_TOKEN := 5, 
+	NEG_VALUE_TOKEN := 6, 
+	POS_VALUE_TOKEN := 7
+}
+FIXUPIMMPD(src1[31:0], src2[31:0], src3[31:0], imm8[7:0]){
+	tsrc[31:0] := ((src2[30:23] == 0) AND (MXCSR.DAZ == 1)) ? 0.0 : src2[31:0]
+	CASE(tsrc[31:0] of TOKEN_TYPE)
+	QNAN_TOKEN:j := 0
+	SNAN_TOKEN:j := 1
+	ZERO_VALUE_TOKEN: j := 2
+	ONE_VALUE_TOKEN: j := 3
+	NEG_INF_TOKEN: j := 4
+	POS_INF_TOKEN: j := 5
+	NEG_VALUE_TOKEN: j := 6
+	POS_VALUE_TOKEN: j := 7
+	ESAC
+	
+	token_response[3:0] := src3[3+4*j:4*j]
+	
+	CASE(token_response[3:0]) of
+	0 : dest[31:0] := src1[31:0]
+	1 : dest[31:0] := tsrc[31:0]
+	2 : dest[31:0] := QNaN(tsrc[31:0])
+	3 : dest[31:0] := QNAN_Indefinite
+	4 : dest[31:0] := -INF
+	5 : dest[31:0] := +INF
+	6 : dest[31:0] := tsrc.sign? &#x2013;INF : +INF
+	7 : dest[31:0] := -0
+	8 : dest[31:0] := +0
+	9 : dest[31:0] := -1
+	10: dest[31:0] := +1
+	11: dest[31:0] := 1&#x2044;2
+	12: dest[31:0] := 90.0
+	13: dest[31:0] := PI/2
+	14: dest[31:0] := MAX_FLOAT
+	15: dest[31:0] := -MAX_FLOAT
+	ESAC
+	
+	CASE(tsrc[31:0] of TOKEN_TYPE)
+	ZERO_VALUE_TOKEN: if imm8[0] then set #ZE
+	ZERO_VALUE_TOKEN: if imm8[1] then set #IE
+	ONE_VALUE_TOKEN: if imm8[2] then set #ZE
+	ONE_VALUE_TOKEN: if imm8[3] then set #IE
+	SNAN_TOKEN: if imm8[4] then set #IE
+	NEG_INF_TOKEN: if imm8[5] then set #IE
+	NEG_VALUE_TOKEN: if imm8[6] then set #IE
+	POS_INF_TOKEN: if imm8[7] then set #IE
+	ESAC
+	RETURN dest[31:0]
+}
+
+FOR j := 0 to 3
+	i := j*32
+	IF k[j]
+		dst[i+31:i] := FIXUPIMMPD(a[i+31:i], b[i+31:i], c[i+31:i], imm8[7:0])
+	ELSE
+		dst[i+31:i] := a[i+31:i]
+	FI
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vfixupimmps"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128" name="_mm_maskz_fixupimm_ps">
+	<type>Floating Point</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128"/>
+	<parameter varname="b" type="__m128"/>
+	<parameter varname="c" type="__m128i"/>
+	<parameter varname="imm8" type="int"/>
+	<description>Fix up packed single-precision (32-bit) floating-point elements in "a" and "b" using packed 32-bit integers in "c", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). "imm8" is used to set the required flags reporting.</description>
+	<operation>
+enum TOKEN_TYPE {
+	QNAN_TOKEN := 0, 
+	SNAN_TOKEN := 1, 
+	ZERO_VALUE_TOKEN := 2, 
+	ONE_VALUE_TOKEN := 3, 
+	NEG_INF_TOKEN := 4, 
+	POS_INF_TOKEN := 5, 
+	NEG_VALUE_TOKEN := 6, 
+	POS_VALUE_TOKEN := 7
+}
+FIXUPIMMPD(src1[31:0], src2[31:0], src3[31:0], imm8[7:0]){
+	tsrc[31:0] := ((src2[30:23] == 0) AND (MXCSR.DAZ == 1)) ? 0.0 : src2[31:0]
+	CASE(tsrc[31:0] of TOKEN_TYPE)
+	QNAN_TOKEN:j := 0
+	SNAN_TOKEN:j := 1
+	ZERO_VALUE_TOKEN: j := 2
+	ONE_VALUE_TOKEN: j := 3
+	NEG_INF_TOKEN: j := 4
+	POS_INF_TOKEN: j := 5
+	NEG_VALUE_TOKEN: j := 6
+	POS_VALUE_TOKEN: j := 7
+	ESAC
+	
+	token_response[3:0] := src3[3+4*j:4*j]
+	
+	CASE(token_response[3:0]) of
+	0 : dest[31:0] := src1[31:0]
+	1 : dest[31:0] := tsrc[31:0]
+	2 : dest[31:0] := QNaN(tsrc[31:0])
+	3 : dest[31:0] := QNAN_Indefinite
+	4 : dest[31:0] := -INF
+	5 : dest[31:0] := +INF
+	6 : dest[31:0] := tsrc.sign? &#x2013;INF : +INF
+	7 : dest[31:0] := -0
+	8 : dest[31:0] := +0
+	9 : dest[31:0] := -1
+	10: dest[31:0] := +1
+	11: dest[31:0] := 1&#x2044;2
+	12: dest[31:0] := 90.0
+	13: dest[31:0] := PI/2
+	14: dest[31:0] := MAX_FLOAT
+	15: dest[31:0] := -MAX_FLOAT
+	ESAC
+	
+	CASE(tsrc[31:0] of TOKEN_TYPE)
+	ZERO_VALUE_TOKEN: if imm8[0] then set #ZE
+	ZERO_VALUE_TOKEN: if imm8[1] then set #IE
+	ONE_VALUE_TOKEN: if imm8[2] then set #ZE
+	ONE_VALUE_TOKEN: if imm8[3] then set #IE
+	SNAN_TOKEN: if imm8[4] then set #IE
+	NEG_INF_TOKEN: if imm8[5] then set #IE
+	NEG_VALUE_TOKEN: if imm8[6] then set #IE
+	POS_INF_TOKEN: if imm8[7] then set #IE
+	ESAC
+	RETURN dest[31:0]
+}
+
+FOR j := 0 to 3
+	i := j*32
+	IF k[j]
+		dst[i+31:i] := FIXUPIMMPD(a[i+31:i], b[i+31:i], c[i+31:i], imm8[7:0])
+	ELSE
+		dst[i+31:i] := 0
+	FI
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vfixupimmps"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256d" name="_mm256_mask3_fmadd_pd">
+	<type>Floating Point</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="a" type="__m256d"/>
+	<parameter varname="b" type="__m256d"/>
+	<parameter varname="c" type="__m256d"/>
+	<parameter varname="k" type="__mmask8"/>
+	<description>Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", add the intermediate result to packed elements in "c", and store the results in "dst" using writemask "k" (elements are copied from "c" when the corresponding mask bit is not set). 
+	</description>
+	<operation>
+FOR j := 0 to 3
+	i := j*64
+	IF k[j]
+		dst[i+63:i] := (a[i+63:i] * b[i+63:i]) + c[i+63:i]
+	ELSE
+		dst[i+63:i] := c[i+63:i]
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vfmadd132pd"/>
+	<instruction name="vfmadd213pd"/>
+	<instruction name="vfmadd231pd"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256d" name="_mm256_mask_fmadd_pd">
+	<type>Floating Point</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="a" type="__m256d"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="b" type="__m256d"/>
+	<parameter varname="c" type="__m256d"/>
+	<description>Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", add the intermediate result to packed elements in "c", and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). 
+	</description>
+	<operation>
+FOR j := 0 to 3
+	i := j*64
+	IF k[j]
+		dst[i+63:i] := (a[i+63:i] * b[i+63:i]) + c[i+63:i]
+	ELSE
+		dst[i+63:i] := a[i+63:i]
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vfmadd132pd"/>
+	<instruction name="vfmadd213pd"/>
+	<instruction name="vfmadd231pd"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256d" name="_mm256_maskz_fmadd_pd">
+	<type>Floating Point</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m256d"/>
+	<parameter varname="b" type="__m256d"/>
+	<parameter varname="c" type="__m256d"/>
+	<description>Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", add the intermediate result to packed elements in "c", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). 
+	</description>
+	<operation>
+FOR j := 0 to 3
+	i := j*64
+	IF k[j]
+		dst[i+63:i] := (a[i+63:i] * b[i+63:i]) + c[i+63:i]
+	ELSE
+		dst[i+63:i] := 0
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vfmadd132pd"/>
+	<instruction name="vfmadd213pd"/>
+	<instruction name="vfmadd231pd"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128d" name="_mm_mask3_fmadd_pd">
+	<type>Floating Point</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="a" type="__m128d"/>
+	<parameter varname="b" type="__m128d"/>
+	<parameter varname="c" type="__m128d"/>
+	<parameter varname="k" type="__mmask8"/>
+	<description>Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", add the intermediate result to packed elements in "c", and store the results in "dst" using writemask "k" (elements are copied from "c" when the corresponding mask bit is not set). 
+	</description>
+	<operation>
+FOR j := 0 to 1
+	i := j*64
+	IF k[j]
+		dst[i+63:i] := (a[i+63:i] * b[i+63:i]) + c[i+63:i]
+	ELSE
+		dst[i+63:i] := c[i+63:i]
+	FI
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vfmadd132pd"/>
+	<instruction name="vfmadd213pd"/>
+	<instruction name="vfmadd231pd"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128d" name="_mm_mask_fmadd_pd">
+	<type>Floating Point</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="a" type="__m128d"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="b" type="__m128d"/>
+	<parameter varname="c" type="__m128d"/>
+	<description>Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", add the intermediate result to packed elements in "c", and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). 
+	</description>
+	<operation>
+FOR j := 0 to 1
+	i := j*64
+	IF k[j]
+		dst[i+63:i] := (a[i+63:i] * b[i+63:i]) + c[i+63:i]
+	ELSE
+		dst[i+63:i] := a[i+63:i]
+	FI
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vfmadd132pd"/>
+	<instruction name="vfmadd213pd"/>
+	<instruction name="vfmadd231pd"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128d" name="_mm_maskz_fmadd_pd">
+	<type>Floating Point</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128d"/>
+	<parameter varname="b" type="__m128d"/>
+	<parameter varname="c" type="__m128d"/>
+	<description>Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", add the intermediate result to packed elements in "c", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). 
+	</description>
+	<operation>
+FOR j := 0 to 1
+	i := j*64
+	IF k[j]
+		dst[i+63:i] := (a[i+63:i] * b[i+63:i]) + c[i+63:i]
+	ELSE
+		dst[i+63:i] := 0
+	FI
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vfmadd132pd"/>
+	<instruction name="vfmadd213pd"/>
+	<instruction name="vfmadd231pd"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256" name="_mm256_mask3_fmadd_ps">
+	<type>Floating Point</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="a" type="__m256"/>
+	<parameter varname="b" type="__m256"/>
+	<parameter varname="c" type="__m256"/>
+	<parameter varname="k" type="__mmask8"/>
+	<description>Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", add the intermediate result to packed elements in "c", and store the results in "dst" using writemask "k" (elements are copied from "c" when the corresponding mask bit is not set). 
+	</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*32
+	IF k[j]
+		dst[i+31:i] := (a[i+31:i] * b[i+31:i]) + c[i+31:i]
+	ELSE
+		dst[i+31:i] := c[i+31:i]
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vfmadd132ps"/>
+	<instruction name="vfmadd213ps"/>
+	<instruction name="vfmadd231ps"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256" name="_mm256_mask_fmadd_ps">
+	<type>Floating Point</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="a" type="__m256"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="b" type="__m256"/>
+	<parameter varname="c" type="__m256"/>
+	<description>Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", add the intermediate result to packed elements in "c", and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). 
+	</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*32
+	IF k[j]
+		dst[i+31:i] := (a[i+31:i] * b[i+31:i]) + c[i+31:i]
+	ELSE
+		dst[i+31:i] := a[i+31:i]
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vfmadd132ps"/>
+	<instruction name="vfmadd213ps"/>
+	<instruction name="vfmadd231ps"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256" name="_mm256_maskz_fmadd_ps">
+	<type>Floating Point</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m256"/>
+	<parameter varname="b" type="__m256"/>
+	<parameter varname="c" type="__m256"/>
+	<description>Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", add the intermediate result to packed elements in "c", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). 
+	</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*32
+	IF k[j]
+		dst[i+31:i] := (a[i+31:i] * b[i+31:i]) + c[i+31:i]
+	ELSE
+		dst[i+31:i] := 0
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vfmadd132ps"/>
+	<instruction name="vfmadd213ps"/>
+	<instruction name="vfmadd231ps"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128" name="_mm_mask3_fmadd_ps">
+	<type>Floating Point</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="a" type="__m128"/>
+	<parameter varname="b" type="__m128"/>
+	<parameter varname="c" type="__m128"/>
+	<parameter varname="k" type="__mmask8"/>
+	<description>Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", add the intermediate result to packed elements in "c", and store the results in "dst" using writemask "k" (elements are copied from "c" when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 3
+	i := j*32
+	IF k[j]
+		dst[i+31:i] := (a[i+31:i] * b[i+31:i]) + c[i+31:i]
+	ELSE
+		dst[i+31:i] := c[i+31:i]
+	FI
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vfmadd132ps"/>
+	<instruction name="vfmadd213ps"/>
+	<instruction name="vfmadd231ps"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128" name="_mm_mask_fmadd_ps">
+	<type>Floating Point</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="a" type="__m128"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="b" type="__m128"/>
+	<parameter varname="c" type="__m128"/>
+	<description>Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", add the intermediate result to packed elements in "c", and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). 
+	</description>
+	<operation>
+FOR j := 0 to 3
+	i := j*32
+	IF k[j]
+		dst[i+31:i] := (a[i+31:i] * b[i+31:i]) + c[i+31:i]
+	ELSE
+		dst[i+31:i] := a[i+31:i]
+	FI
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vfmadd132ps"/>
+	<instruction name="vfmadd213ps"/>
+	<instruction name="vfmadd231ps"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128" name="_mm_maskz_fmadd_ps">
+	<type>Floating Point</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128"/>
+	<parameter varname="b" type="__m128"/>
+	<parameter varname="c" type="__m128"/>
+	<description>Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", add the intermediate result to packed elements in "c", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). 
+	</description>
+	<operation>
+FOR j := 0 to 3
+	i := j*32
+	IF k[j]
+		dst[i+31:i] := (a[i+31:i] * b[i+31:i]) + c[i+31:i]
+	ELSE
+		dst[i+31:i] := 0
+	FI
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vfmadd132ps"/>
+	<instruction name="vfmadd213ps"/>
+	<instruction name="vfmadd231ps"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256d" name="_mm256_mask3_fmaddsub_pd">
+	<type>Floating Point</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="a" type="__m256d"/>
+	<parameter varname="b" type="__m256d"/>
+	<parameter varname="c" type="__m256d"/>
+	<parameter varname="k" type="__mmask8"/>
+	<description>Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", alternatively add and subtract packed elements in "c" to/from the intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "c" when the corresponding mask bit is not set).  </description>
+	<operation>
+FOR j := 0 to 3
+	i := j*64
+	IF k[j]
+		IF (j is even) 
+			dst[i+63:i] := (a[i+63:i] * b[i+63:i]) - c[i+63:i]
+		ELSE
+			dst[i+63:i] := (a[i+63:i] * b[i+63:i]) + c[i+63:i]
+		FI
+	ELSE
+		dst[i+63:i] := c[i+63:i]
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vfmaddsub132pd"/>
+	<instruction name="vfmaddsub213pd"/>
+	<instruction name="vfmaddsub231pd"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256d" name="_mm256_mask_fmaddsub_pd">
+	<type>Floating Point</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="a" type="__m256d"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="b" type="__m256d"/>
+	<parameter varname="c" type="__m256d"/>
+	<description>Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", alternatively add and subtract packed elements in "c" to/from the intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 3
+	i := j*64
+	IF k[j]
+		IF (j is even) 
+			dst[i+63:i] := (a[i+63:i] * b[i+63:i]) - c[i+63:i]
+		ELSE
+			dst[i+63:i] := (a[i+63:i] * b[i+63:i]) + c[i+63:i]
+		FI
+	ELSE
+		dst[i+63:i] := a[i+63:i]
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vfmaddsub132pd"/>
+	<instruction name="vfmaddsub213pd"/>
+	<instruction name="vfmaddsub231pd"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256d" name="_mm256_maskz_fmaddsub_pd">
+	<type>Floating Point</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m256d"/>
+	<parameter varname="b" type="__m256d"/>
+	<parameter varname="c" type="__m256d"/>
+	<description>Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", alternatively add and subtract packed elements in "c" to/from the intermediate result, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). 
+	</description>
+	<operation>
+FOR j := 0 to 3
+	i := j*64
+	IF k[j]
+		IF (j is even) 
+			dst[i+63:i] := (a[i+63:i] * b[i+63:i]) - c[i+63:i]
+		ELSE
+			dst[i+63:i] := (a[i+63:i] * b[i+63:i]) + c[i+63:i]
+		FI
+	ELSE
+		dst[i+63:i] := 0
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vfmaddsub132pd"/>
+	<instruction name="vfmaddsub213pd"/>
+	<instruction name="vfmaddsub231pd"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128d" name="_mm_mask3_fmaddsub_pd">
+	<type>Floating Point</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="a" type="__m128d"/>
+	<parameter varname="b" type="__m128d"/>
+	<parameter varname="c" type="__m128d"/>
+	<parameter varname="k" type="__mmask8"/>
+	<description>Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", alternatively add and subtract packed elements in "c" to/from the intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "c" when the corresponding mask bit is not set).  </description>
+	<operation>
+FOR j := 0 to 1
+	i := j*64
+	IF k[j]
+		IF (j is even) 
+			dst[i+63:i] := (a[i+63:i] * b[i+63:i]) - c[i+63:i]
+		ELSE
+			dst[i+63:i] := (a[i+63:i] * b[i+63:i]) + c[i+63:i]
+		FI
+	ELSE
+		dst[i+63:i] := c[i+63:i]
+	FI
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vfmaddsub132pd"/>
+	<instruction name="vfmaddsub213pd"/>
+	<instruction name="vfmaddsub231pd"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128d" name="_mm_mask_fmaddsub_pd">
+	<type>Floating Point</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="a" type="__m128d"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="b" type="__m128d"/>
+	<parameter varname="c" type="__m128d"/>
+	<description>Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", alternatively add and subtract packed elements in "c" to/from the intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 1
+	i := j*64
+	IF k[j]
+		IF (j is even) 
+			dst[i+63:i] := (a[i+63:i] * b[i+63:i]) - c[i+63:i]
+		ELSE
+			dst[i+63:i] := (a[i+63:i] * b[i+63:i]) + c[i+63:i]
+		FI
+	ELSE
+		dst[i+63:i] := a[i+63:i]
+	FI
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vfmaddsub132pd"/>
+	<instruction name="vfmaddsub213pd"/>
+	<instruction name="vfmaddsub231pd"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128d" name="_mm_maskz_fmaddsub_pd">
+	<type>Floating Point</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128d"/>
+	<parameter varname="b" type="__m128d"/>
+	<parameter varname="c" type="__m128d"/>
+	<description>Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", alternatively add and subtract packed elements in "c" to/from the intermediate result, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). 
+	</description>
+	<operation>
+FOR j := 0 to 1
+	i := j*64
+	IF k[j]
+		IF (j is even) 
+			dst[i+63:i] := (a[i+63:i] * b[i+63:i]) - c[i+63:i]
+		ELSE
+			dst[i+63:i] := (a[i+63:i] * b[i+63:i]) + c[i+63:i]
+		FI
+	ELSE
+		dst[i+63:i] := 0
+	FI
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vfmaddsub132pd"/>
+	<instruction name="vfmaddsub213pd"/>
+	<instruction name="vfmaddsub231pd"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256" name="_mm256_mask3_fmaddsub_ps">
+	<type>Floating Point</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="a" type="__m256"/>
+	<parameter varname="b" type="__m256"/>
+	<parameter varname="c" type="__m256"/>
+	<parameter varname="k" type="__mmask8"/>
+	<description>Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", alternatively add and subtract packed elements in "c" to/from the intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "c" when the corresponding mask bit is not set).  </description>
+	<operation>
+FOR j := 0 to 7
+	i := j*32
+	IF k[j]
+		IF (j is even) 
+			dst[i+31:i] := (a[i+31:i] * b[i+31:i]) - c[i+31:i]
+		ELSE
+			dst[i+31:i] := (a[i+31:i] * b[i+31:i]) + c[i+31:i]
+		FI
+	ELSE
+		dst[i+31:i] := c[i+31:i]
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vfmaddsub132ps"/>
+	<instruction name="vfmaddsub213ps"/>
+	<instruction name="vfmaddsub231ps"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256" name="_mm256_mask_fmaddsub_ps">
+	<type>Floating Point</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="a" type="__m256"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="b" type="__m256"/>
+	<parameter varname="c" type="__m256"/>
+	<description>Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", alternatively add and subtract packed elements in "c" to/from the intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 7
+	i := j*32
+	IF k[j]
+		IF (j is even) 
+			dst[i+31:i] := (a[i+31:i] * b[i+31:i]) - c[i+31:i]
+		ELSE
+			dst[i+31:i] := (a[i+31:i] * b[i+31:i]) + c[i+31:i]
+		FI
+	ELSE
+		dst[i+31:i] := a[i+31:i]
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vfmaddsub132ps"/>
+	<instruction name="vfmaddsub213ps"/>
+	<instruction name="vfmaddsub231ps"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256" name="_mm256_maskz_fmaddsub_ps">
+	<type>Floating Point</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m256"/>
+	<parameter varname="b" type="__m256"/>
+	<parameter varname="c" type="__m256"/>
+	<description>Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", alternatively add and subtract packed elements in "c" to/from the intermediate result, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). 
+	</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*32
+	IF k[j]
+		IF (j is even) 
+			dst[i+31:i] := (a[i+31:i] * b[i+31:i]) - c[i+31:i]
+		ELSE
+			dst[i+31:i] := (a[i+31:i] * b[i+31:i]) + c[i+31:i]
+		FI
+	ELSE
+		dst[i+31:i] := 0
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vfmaddsub132ps"/>
+	<instruction name="vfmaddsub213ps"/>
+	<instruction name="vfmaddsub231ps"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128" name="_mm_mask3_fmaddsub_ps">
+	<type>Floating Point</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="a" type="__m128"/>
+	<parameter varname="b" type="__m128"/>
+	<parameter varname="c" type="__m128"/>
+	<parameter varname="k" type="__mmask8"/>
+	<description>Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", alternatively add and subtract packed elements in "c" to/from the intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "c" when the corresponding mask bit is not set).  </description>
+	<operation>
+FOR j := 0 to 3
+	i := j*32
+	IF k[j]
+		IF (j is even) 
+			dst[i+31:i] := (a[i+31:i] * b[i+31:i]) - c[i+31:i]
+		ELSE
+			dst[i+31:i] := (a[i+31:i] * b[i+31:i]) + c[i+31:i]
+		FI
+	ELSE
+		dst[i+31:i] := c[i+31:i]
+	FI
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vfmaddsub132ps"/>
+	<instruction name="vfmaddsub213ps"/>
+	<instruction name="vfmaddsub231ps"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128" name="_mm_mask_fmaddsub_ps">
+	<type>Floating Point</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="a" type="__m128"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="b" type="__m128"/>
+	<parameter varname="c" type="__m128"/>
+	<description>Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", alternatively add and subtract packed elements in "c" to/from the intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 3
+	i := j*32
+	IF k[j]
+		IF (j is even) 
+			dst[i+31:i] := (a[i+31:i] * b[i+31:i]) - c[i+31:i]
+		ELSE
+			dst[i+31:i] := (a[i+31:i] * b[i+31:i]) + c[i+31:i]
+		FI
+	ELSE
+		dst[i+31:i] := a[i+31:i]
+	FI
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vfmaddsub132ps"/>
+	<instruction name="vfmaddsub213ps"/>
+	<instruction name="vfmaddsub231ps"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128" name="_mm_maskz_fmaddsub_ps">
+	<type>Floating Point</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128"/>
+	<parameter varname="b" type="__m128"/>
+	<parameter varname="c" type="__m128"/>
+	<description>Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", alternatively add and subtract packed elements in "c" to/from the intermediate result, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). 
+	</description>
+	<operation>
+FOR j := 0 to 3
+	i := j*32
+	IF k[j]
+		IF (j is even) 
+			dst[i+31:i] := (a[i+31:i] * b[i+31:i]) - c[i+31:i]
+		ELSE
+			dst[i+31:i] := (a[i+31:i] * b[i+31:i]) + c[i+31:i]
+		FI
+	ELSE
+		dst[i+31:i] := 0
+	FI
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vfmaddsub132ps"/>
+	<instruction name="vfmaddsub213ps"/>
+	<instruction name="vfmaddsub231ps"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256d" name="_mm256_mask3_fmsub_pd">
+	<type>Floating Point</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="a" type="__m256d"/>
+	<parameter varname="b" type="__m256d"/>
+	<parameter varname="c" type="__m256d"/>
+	<parameter varname="k" type="__mmask8"/>
+	<description>Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "c" when the corresponding mask bit is not set).  </description>
+	<operation>
+FOR j := 0 to 3
+	i := j*64
+	IF k[j]
+		dst[i+63:i] := (a[i+63:i] * b[i+63:i]) - c[i+63:i]
+	ELSE
+		dst[i+63:i] := c[i+63:i]
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vfmsub132pd"/>
+	<instruction name="vfmsub213pd"/>
+	<instruction name="vfmsub231pd"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256d" name="_mm256_mask_fmsub_pd">
+	<type>Floating Point</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="a" type="__m256d"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="b" type="__m256d"/>
+	<parameter varname="c" type="__m256d"/>
+	<description>Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 3
+	i := j*64
+	IF k[j]
+		dst[i+63:i] := (a[i+63:i] * b[i+63:i]) - c[i+63:i]
+	ELSE
+		dst[i+63:i] := a[i+63:i]
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vfmsub132pd"/>
+	<instruction name="vfmsub213pd"/>
+	<instruction name="vfmsub231pd"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256d" name="_mm256_maskz_fmsub_pd">
+	<type>Floating Point</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m256d"/>
+	<parameter varname="b" type="__m256d"/>
+	<parameter varname="c" type="__m256d"/>
+	<description>Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the intermediate result, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). 
+	</description>
+	<operation>
+FOR j := 0 to 3
+	i := j*64
+	IF k[j]
+		dst[i+63:i] := (a[i+63:i] * b[i+63:i]) - c[i+63:i]
+	ELSE
+		dst[i+63:i] := 0
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vfmsub132pd"/>
+	<instruction name="vfmsub213pd"/>
+	<instruction name="vfmsub231pd"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128d" name="_mm_mask3_fmsub_pd">
+	<type>Floating Point</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="a" type="__m128d"/>
+	<parameter varname="b" type="__m128d"/>
+	<parameter varname="c" type="__m128d"/>
+	<parameter varname="k" type="__mmask8"/>
+	<description>Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "c" when the corresponding mask bit is not set).  </description>
+	<operation>
+FOR j := 0 to 1
+	i := j*64
+	IF k[j]
+		dst[i+63:i] := (a[i+63:i] * b[i+63:i]) - c[i+63:i]
+	ELSE
+		dst[i+63:i] := c[i+63:i]
+	FI
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vfmsub132pd"/>
+	<instruction name="vfmsub213pd"/>
+	<instruction name="vfmsub231pd"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128d" name="_mm_mask_fmsub_pd">
+	<type>Floating Point</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="a" type="__m128d"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="b" type="__m128d"/>
+	<parameter varname="c" type="__m128d"/>
+	<description>Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 1
+	i := j*64
+	IF k[j]
+		dst[i+63:i] := (a[i+63:i] * b[i+63:i]) - c[i+63:i]
+	ELSE
+		dst[i+63:i] := a[i+63:i]
+	FI
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vfmsub132pd"/>
+	<instruction name="vfmsub213pd"/>
+	<instruction name="vfmsub231pd"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128d" name="_mm_maskz_fmsub_pd">
+	<type>Floating Point</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128d"/>
+	<parameter varname="b" type="__m128d"/>
+	<parameter varname="c" type="__m128d"/>
+	<description>Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the intermediate result, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). 
+	</description>
+	<operation>
+FOR j := 0 to 1
+	i := j*64
+	IF k[j]
+		dst[i+63:i] := (a[i+63:i] * b[i+63:i]) - c[i+63:i]
+	ELSE
+		dst[i+63:i] := 0
+	FI
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vfmsub132pd"/>
+	<instruction name="vfmsub213pd"/>
+	<instruction name="vfmsub231pd"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256" name="_mm256_mask3_fmsub_ps">
+	<type>Floating Point</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="a" type="__m256"/>
+	<parameter varname="b" type="__m256"/>
+	<parameter varname="c" type="__m256"/>
+	<parameter varname="k" type="__mmask8"/>
+	<description>Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "c" when the corresponding mask bit is not set).  </description>
+	<operation>
+FOR j := 0 to 7
+	i := j*32
+	IF k[j]
+		dst[i+31:i] := (a[i+31:i] * b[i+31:i]) - c[i+31:i]
+	ELSE
+		dst[i+31:i] := c[i+31:i]
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vfmsub132ps"/>
+	<instruction name="vfmsub213ps"/>
+	<instruction name="vfmsub231ps"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256" name="_mm256_mask_fmsub_ps">
+	<type>Floating Point</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="a" type="__m256"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="b" type="__m256"/>
+	<parameter varname="c" type="__m256"/>
+	<description>Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 7
+	i := j*32
+	IF k[j]
+		dst[i+31:i] := (a[i+31:i] * b[i+31:i]) - c[i+31:i]
+	ELSE
+		dst[i+31:i] := a[i+31:i]
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vfmsub132ps"/>
+	<instruction name="vfmsub213ps"/>
+	<instruction name="vfmsub231ps"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256" name="_mm256_maskz_fmsub_ps">
+	<type>Floating Point</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m256"/>
+	<parameter varname="b" type="__m256"/>
+	<parameter varname="c" type="__m256"/>
+	<description>Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the intermediate result, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). 
+	</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*32
+	IF k[j]
+		dst[i+31:i] := (a[i+31:i] * b[i+31:i]) - c[i+31:i]
+	ELSE
+		dst[i+31:i] := 0
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vfmsub132ps"/>
+	<instruction name="vfmsub213ps"/>
+	<instruction name="vfmsub231ps"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128" name="_mm_mask3_fmsub_ps">
+	<type>Floating Point</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="a" type="__m128"/>
+	<parameter varname="b" type="__m128"/>
+	<parameter varname="c" type="__m128"/>
+	<parameter varname="k" type="__mmask8"/>
+	<description>Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "c" when the corresponding mask bit is not set).  </description>
+	<operation>
+FOR j := 0 to 3
+	i := j*32
+	IF k[j]
+		dst[i+31:i] := (a[i+31:i] * b[i+31:i]) - c[i+31:i]
+	ELSE
+		dst[i+31:i] := c[i+31:i]
+	FI
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vfmsub132ps"/>
+	<instruction name="vfmsub213ps"/>
+	<instruction name="vfmsub231ps"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128" name="_mm_mask_fmsub_ps">
+	<type>Floating Point</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="a" type="__m128"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="b" type="__m128"/>
+	<parameter varname="c" type="__m128"/>
+	<description>Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 3
+	i := j*32
+	IF k[j]
+		dst[i+31:i] := (a[i+31:i] * b[i+31:i]) - c[i+31:i]
+	ELSE
+		dst[i+31:i] := a[i+31:i]
+	FI
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vfmsub132ps"/>
+	<instruction name="vfmsub213ps"/>
+	<instruction name="vfmsub231ps"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128" name="_mm_maskz_fmsub_ps">
+	<type>Floating Point</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128"/>
+	<parameter varname="b" type="__m128"/>
+	<parameter varname="c" type="__m128"/>
+	<description>Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the intermediate result, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). 
+	</description>
+	<operation>
+FOR j := 0 to 3
+	i := j*32
+	IF k[j]
+		dst[i+31:i] := (a[i+31:i] * b[i+31:i]) - c[i+31:i]
+	ELSE
+		dst[i+31:i] := 0
+	FI
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vfmsub132ps"/>
+	<instruction name="vfmsub213ps"/>
+	<instruction name="vfmsub231ps"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256d" name="_mm256_mask3_fmsubadd_pd">
+	<type>Floating Point</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="a" type="__m256d"/>
+	<parameter varname="b" type="__m256d"/>
+	<parameter varname="c" type="__m256d"/>
+	<parameter varname="k" type="__mmask8"/>
+	<description>Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", alternatively subtract and add packed elements in "c" from/to the intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "c" when the corresponding mask bit is not set).  </description>
+	<operation>
+FOR j := 0 to 3
+	i := j*64
+	IF k[j]
+		IF (j is even) 
+			dst[i+63:i] := (a[i+63:i] * b[i+63:i]) + c[i+63:i]
+		ELSE
+			dst[i+63:i] := (a[i+63:i] * b[i+63:i]) - c[i+63:i]
+		FI
+	ELSE
+		dst[i+63:i] := c[i+63:i]
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vfmsubadd132pd"/>
+	<instruction name="vfmsubadd213pd"/>
+	<instruction name="vfmsubadd231pd"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256d" name="_mm256_mask_fmsubadd_pd">
+	<type>Floating Point</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="a" type="__m256d"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="b" type="__m256d"/>
+	<parameter varname="c" type="__m256d"/>
+	<description>Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", alternatively subtract and add packed elements in "c" from/to the intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 3
+	i := j*64
+	IF k[j]
+		IF (j is even) 
+			dst[i+63:i] := (a[i+63:i] * b[i+63:i]) + c[i+63:i]
+		ELSE
+			dst[i+63:i] := (a[i+63:i] * b[i+63:i]) - c[i+63:i]
+		FI
+	ELSE
+		dst[i+63:i] := a[i+63:i]
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vfmsubadd132pd"/>
+	<instruction name="vfmsubadd213pd"/>
+	<instruction name="vfmsubadd231pd"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256d" name="_mm256_maskz_fmsubadd_pd">
+	<type>Floating Point</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m256d"/>
+	<parameter varname="b" type="__m256d"/>
+	<parameter varname="c" type="__m256d"/>
+	<description>Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", alternatively subtract and add packed elements in "c" from/to the intermediate result, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). 
+	</description>
+	<operation>
+FOR j := 0 to 3
+	i := j*64
+	IF k[j]
+		IF (j is even) 
+			dst[i+63:i] := (a[i+63:i] * b[i+63:i]) + c[i+63:i]
+		ELSE
+			dst[i+63:i] := (a[i+63:i] * b[i+63:i]) - c[i+63:i]
+		FI
+	ELSE
+		dst[i+63:i] := 0
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vfmsubadd132pd"/>
+	<instruction name="vfmsubadd213pd"/>
+	<instruction name="vfmsubadd231pd"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128d" name="_mm_mask3_fmsubadd_pd">
+	<type>Floating Point</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="a" type="__m128d"/>
+	<parameter varname="b" type="__m128d"/>
+	<parameter varname="c" type="__m128d"/>
+	<parameter varname="k" type="__mmask8"/>
+	<description>Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", alternatively subtract and add packed elements in "c" from/to the intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "c" when the corresponding mask bit is not set).  </description>
+	<operation>
+FOR j := 0 to 1
+	i := j*64
+	IF k[j]
+		IF (j is even) 
+			dst[i+63:i] := (a[i+63:i] * b[i+63:i]) + c[i+63:i]
+		ELSE
+			dst[i+63:i] := (a[i+63:i] * b[i+63:i]) - c[i+63:i]
+		FI
+	ELSE
+		dst[i+63:i] := c[i+63:i]
+	FI
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vfmsubadd132pd"/>
+	<instruction name="vfmsubadd213pd"/>
+	<instruction name="vfmsubadd231pd"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128d" name="_mm_mask_fmsubadd_pd">
+	<type>Floating Point</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="a" type="__m128d"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="b" type="__m128d"/>
+	<parameter varname="c" type="__m128d"/>
+	<description>Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", alternatively subtract and add packed elements in "c" from/to the intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 1 
+	i := j*64
+	IF k[j]
+		IF (j is even) 
+			dst[i+63:i] := (a[i+63:i] * b[i+63:i]) + c[i+63:i]
+		ELSE
+			dst[i+63:i] := (a[i+63:i] * b[i+63:i]) - c[i+63:i]
+		FI
+	ELSE
+		dst[i+63:i] := a[i+63:i]
+	FI
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vfmsubadd132pd"/>
+	<instruction name="vfmsubadd213pd"/>
+	<instruction name="vfmsubadd231pd"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128d" name="_mm_maskz_fmsubadd_pd">
+	<type>Floating Point</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128d"/>
+	<parameter varname="b" type="__m128d"/>
+	<parameter varname="c" type="__m128d"/>
+	<description>Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", alternatively subtract and add packed elements in "c" from/to the intermediate result, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). 
+	</description>
+	<operation>
+FOR j := 0 to 1
+	i := j*64
+	IF k[j]
+		IF (j is even) 
+			dst[i+63:i] := (a[i+63:i] * b[i+63:i]) + c[i+63:i]
+		ELSE
+			dst[i+63:i] := (a[i+63:i] * b[i+63:i]) - c[i+63:i]
+		FI
+	ELSE
+		dst[i+63:i] := 0
+	FI
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vfmsubadd132pd"/>
+	<instruction name="vfmsubadd213pd"/>
+	<instruction name="vfmsubadd231pd"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256" name="_mm256_mask3_fmsubadd_ps">
+	<type>Floating Point</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="a" type="__m256"/>
+	<parameter varname="b" type="__m256"/>
+	<parameter varname="c" type="__m256"/>
+	<parameter varname="k" type="__mmask8"/>
+	<description>Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", alternatively subtract and add packed elements in "c" from/to the intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "c" when the corresponding mask bit is not set).  </description>
+	<operation>
+FOR j := 0 to 7
+	i := j*32
+	IF k[j]
+		IF (j is even) 
+			dst[i+31:i] := (a[i+31:i] * b[i+31:i]) + c[i+31:i]
+		ELSE
+			dst[i+31:i] := (a[i+31:i] * b[i+31:i]) - c[i+31:i]
+		FI
+	ELSE
+		dst[i+31:i] := c[i+31:i]
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vfmsubadd132ps"/>
+	<instruction name="vfmsubadd213ps"/>
+	<instruction name="vfmsubadd231ps"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256" name="_mm256_mask_fmsubadd_ps">
+	<type>Floating Point</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="a" type="__m256"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="b" type="__m256"/>
+	<parameter varname="c" type="__m256"/>
+	<description>Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", alternatively subtract and add packed elements in "c" from/to the intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 7
+	i := j*32
+	IF k[j]
+		IF (j is even) 
+			dst[i+31:i] := (a[i+31:i] * b[i+31:i]) + c[i+31:i]
+		ELSE
+			dst[i+31:i] := (a[i+31:i] * b[i+31:i]) - c[i+31:i]
+		FI
+	ELSE
+		dst[i+31:i] := a[i+31:i]
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vfmsubadd132ps"/>
+	<instruction name="vfmsubadd213ps"/>
+	<instruction name="vfmsubadd231ps"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256" name="_mm256_maskz_fmsubadd_ps">
+	<type>Floating Point</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m256"/>
+	<parameter varname="b" type="__m256"/>
+	<parameter varname="c" type="__m256"/>
+	<description>Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", alternatively subtract and add packed elements in "c" from/to the intermediate result, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). 
+	</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*32
+	IF k[j]
+		IF (j is even) 
+			dst[i+31:i] := (a[i+31:i] * b[i+31:i]) + c[i+31:i]
+		ELSE
+			dst[i+31:i] := (a[i+31:i] * b[i+31:i]) - c[i+31:i]
+		FI
+	ELSE
+		dst[i+31:i] := 0
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vfmsubadd132ps"/>
+	<instruction name="vfmsubadd213ps"/>
+	<instruction name="vfmsubadd231ps"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128" name="_mm_mask3_fmsubadd_ps">
+	<type>Floating Point</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="a" type="__m128"/>
+	<parameter varname="b" type="__m128"/>
+	<parameter varname="c" type="__m128"/>
+	<parameter varname="k" type="__mmask8"/>
+	<description>Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", alternatively subtract and add packed elements in "c" from/to the intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "c" when the corresponding mask bit is not set).  </description>
+	<operation>
+FOR j := 0 to 3
+	i := j*32
+	IF k[j]
+		IF (j is even) 
+			dst[i+31:i] := (a[i+31:i] * b[i+31:i]) + c[i+31:i]
+		ELSE
+			dst[i+31:i] := (a[i+31:i] * b[i+31:i]) - c[i+31:i]
+		FI
+	ELSE
+		dst[i+31:i] := c[i+31:i]
+	FI
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vfmsubadd132ps"/>
+	<instruction name="vfmsubadd213ps"/>
+	<instruction name="vfmsubadd231ps"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128" name="_mm_mask_fmsubadd_ps">
+	<type>Floating Point</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="a" type="__m128"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="b" type="__m128"/>
+	<parameter varname="c" type="__m128"/>
+	<description>Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", alternatively subtract and add packed elements in "c" from/to the intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 3
+	i := j*32
+	IF k[j]
+		IF (j is even) 
+			dst[i+31:i] := (a[i+31:i] * b[i+31:i]) + c[i+31:i]
+		ELSE
+			dst[i+31:i] := (a[i+31:i] * b[i+31:i]) - c[i+31:i]
+		FI
+	ELSE
+		dst[i+31:i] := a[i+31:i]
+	FI
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vfmsubadd132ps"/>
+	<instruction name="vfmsubadd213ps"/>
+	<instruction name="vfmsubadd231ps"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128" name="_mm_maskz_fmsubadd_ps">
+	<type>Floating Point</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128"/>
+	<parameter varname="b" type="__m128"/>
+	<parameter varname="c" type="__m128"/>
+	<description>Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", alternatively subtract and add packed elements in "c" from/to the intermediate result, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). 
+	</description>
+	<operation>
+FOR j := 0 to 3
+	i := j*32
+	IF k[j]
+		IF (j is even) 
+			dst[i+31:i] := (a[i+31:i] * b[i+31:i]) + c[i+31:i]
+		ELSE
+			dst[i+31:i] := (a[i+31:i] * b[i+31:i]) - c[i+31:i]
+		FI
+	ELSE
+		dst[i+31:i] := 0
+	FI
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vfmsubadd132ps"/>
+	<instruction name="vfmsubadd213ps"/>
+	<instruction name="vfmsubadd231ps"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256d" name="_mm256_mask3_fnmadd_pd">
+	<type>Floating Point</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="a" type="__m256d"/>
+	<parameter varname="b" type="__m256d"/>
+	<parameter varname="c" type="__m256d"/>
+	<parameter varname="k" type="__mmask8"/>
+	<description>Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", add the negated intermediate result to packed elements in "c", and store the results in "dst" using writemask "k" (elements are copied from "c" when the corresponding mask bit is not set).  </description>
+	<operation>
+FOR j := 0 to 3
+	i := j*64
+	IF k[j]
+		dst[i+63:i] := -(a[i+63:i] * b[i+63:i]) + c[i+63:i]
+	ELSE
+		dst[i+63:i] := c[i+63:i]
+	FI
+ENDFOR	
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vfnmadd132pd"/>
+	<instruction name="vfnmadd213pd"/>
+	<instruction name="vfnmadd231pd"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256d" name="_mm256_mask_fnmadd_pd">
+	<type>Floating Point</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="a" type="__m256d"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="b" type="__m256d"/>
+	<parameter varname="c" type="__m256d"/>
+	<description>Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", add the negated intermediate result to packed elements in "c", and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set).  </description>
+	<operation>
+FOR j := 0 to 3
+	i := j*64
+	IF k[j]
+		dst[i+63:i] := -(a[i+63:i] * b[i+63:i]) + c[i+63:i]
+	ELSE
+		dst[i+63:i] := a[i+63:i]
+	FI
+ENDFOR	
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vfnmadd132pd"/>
+	<instruction name="vfnmadd213pd"/>
+	<instruction name="vfnmadd231pd"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256d" name="_mm256_maskz_fnmadd_pd">
+	<type>Floating Point</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m256d"/>
+	<parameter varname="b" type="__m256d"/>
+	<parameter varname="c" type="__m256d"/>
+	<description>Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", add the negated intermediate result to packed elements in "c", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 3
+	i := j*64
+	IF k[j]
+		dst[i+63:i] := -(a[i+63:i] * b[i+63:i]) + c[i+63:i]
+	ELSE
+		dst[i+63:i] := 0
+	FI
+ENDFOR	
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vfnmadd132pd"/>
+	<instruction name="vfnmadd213pd"/>
+	<instruction name="vfnmadd231pd"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128d" name="_mm_mask3_fnmadd_pd">
+	<type>Floating Point</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="a" type="__m128d"/>
+	<parameter varname="b" type="__m128d"/>
+	<parameter varname="c" type="__m128d"/>
+	<parameter varname="k" type="__mmask8"/>
+	<description>Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", add the negated intermediate result to packed elements in "c", and store the results in "dst" using writemask "k" (elements are copied from "c" when the corresponding mask bit is not set).  </description>
+	<operation>
+FOR j := 0 to 1
+	i := j*64
+	IF k[j]
+		dst[i+63:i] := -(a[i+63:i] * b[i+63:i]) + c[i+63:i]
+	ELSE
+		dst[i+63:i] := c[i+63:i]
+	FI
+ENDFOR	
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vfnmadd132pd"/>
+	<instruction name="vfnmadd213pd"/>
+	<instruction name="vfnmadd231pd"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128d" name="_mm_mask_fnmadd_pd">
+	<type>Floating Point</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="a" type="__m128d"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="b" type="__m128d"/>
+	<parameter varname="c" type="__m128d"/>
+	<description>Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", add the negated intermediate result to packed elements in "c", and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set).  </description>
+	<operation>
+FOR j := 0 to 1
+	i := j*64
+	IF k[j]
+		dst[i+63:i] := -(a[i+63:i] * b[i+63:i]) + c[i+63:i]
+	ELSE
+		dst[i+63:i] := a[i+63:i]
+	FI
+ENDFOR	
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vfnmadd132pd"/>
+	<instruction name="vfnmadd213pd"/>
+	<instruction name="vfnmadd231pd"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128d" name="_mm_maskz_fnmadd_pd">
+	<type>Floating Point</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128d"/>
+	<parameter varname="b" type="__m128d"/>
+	<parameter varname="c" type="__m128d"/>
+	<description>Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", add the negated intermediate result to packed elements in "c", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 1
+	i := j*64
+	IF k[j]
+		dst[i+63:i] := -(a[i+63:i] * b[i+63:i]) + c[i+63:i]
+	ELSE
+		dst[i+63:i] := 0
+	FI
+ENDFOR	
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vfnmadd132pd"/>
+	<instruction name="vfnmadd213pd"/>
+	<instruction name="vfnmadd231pd"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256" name="_mm256_mask3_fnmadd_ps">
+	<type>Floating Point</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="a" type="__m256"/>
+	<parameter varname="b" type="__m256"/>
+	<parameter varname="c" type="__m256"/>
+	<parameter varname="k" type="__mmask8"/>
+	<description>Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", add the negated intermediate result to packed elements in "c", and store the results in "dst" using writemask "k" (elements are copied from "c" when the corresponding mask bit is not set).  </description>
+	<operation>
+FOR j := 0 to 7
+	i := j*32
+	IF k[j]
+		dst[i+31:i] := -(a[i+31:i] * b[i+31:i]) + c[i+31:i]
+	ELSE
+		dst[i+31:i] := c[i+31:i]
+	FI
+ENDFOR	
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vfnmadd132ps"/>
+	<instruction name="vfnmadd213ps"/>
+	<instruction name="vfnmadd231ps"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256" name="_mm256_mask_fnmadd_ps">
+	<type>Floating Point</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="a" type="__m256"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="b" type="__m256"/>
+	<parameter varname="c" type="__m256"/>
+	<description>Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", add the negated intermediate result to packed elements in "c", and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set).  </description>
+	<operation>
+FOR j := 0 to 7
+	i := j*32
+	IF k[j]
+		dst[i+31:i] := -(a[i+31:i] * b[i+31:i]) + c[i+31:i]
+	ELSE
+		dst[i+31:i] := a[i+31:i]
+	FI
+ENDFOR	
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vfnmadd132ps"/>
+	<instruction name="vfnmadd213ps"/>
+	<instruction name="vfnmadd231ps"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256" name="_mm256_maskz_fnmadd_ps">
+	<type>Floating Point</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m256"/>
+	<parameter varname="b" type="__m256"/>
+	<parameter varname="c" type="__m256"/>
+	<description>Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", add the negated intermediate result to packed elements in "c", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 7
+	i := j*32
+	IF k[j]
+		dst[i+31:i] := -(a[i+31:i] * b[i+31:i]) + c[i+31:i]
+	ELSE
+		dst[i+31:i] := 0
+	FI
+ENDFOR	
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vfnmadd132ps"/>
+	<instruction name="vfnmadd213ps"/>
+	<instruction name="vfnmadd231ps"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128" name="_mm_mask3_fnmadd_ps">
+	<type>Floating Point</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="a" type="__m128"/>
+	<parameter varname="b" type="__m128"/>
+	<parameter varname="c" type="__m128"/>
+	<parameter varname="k" type="__mmask8"/>
+	<description>Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", add the negated intermediate result to packed elements in "c", and store the results in "dst" using writemask "k" (elements are copied from "c" when the corresponding mask bit is not set).  </description>
+	<operation>
+FOR j := 0 to 3
+	i := j*32
+	IF k[j]
+		dst[i+31:i] := -(a[i+31:i] * b[i+31:i]) + c[i+31:i]
+	ELSE
+		dst[i+31:i] := c[i+31:i]
+	FI
+ENDFOR	
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vfnmadd132ps"/>
+	<instruction name="vfnmadd213ps"/>
+	<instruction name="vfnmadd231ps"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128" name="_mm_mask_fnmadd_ps">
+	<type>Floating Point</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="a" type="__m128"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="b" type="__m128"/>
+	<parameter varname="c" type="__m128"/>
+	<description>Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", add the negated intermediate result to packed elements in "c", and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set).  </description>
+	<operation>
+FOR j := 0 to 3
+	i := j*32
+	IF k[j]
+		dst[i+31:i] := -(a[i+31:i] * b[i+31:i]) + c[i+31:i]
+	ELSE
+		dst[i+31:i] := a[i+31:i]
+	FI
+ENDFOR	
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vfnmadd132ps"/>
+	<instruction name="vfnmadd213ps"/>
+	<instruction name="vfnmadd231ps"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128" name="_mm_maskz_fnmadd_ps">
+	<type>Floating Point</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128"/>
+	<parameter varname="b" type="__m128"/>
+	<parameter varname="c" type="__m128"/>
+	<description>Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", add the negated intermediate result to packed elements in "c", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 3
+	i := j*32
+	IF k[j]
+		dst[i+31:i] := -(a[i+31:i] * b[i+31:i]) + c[i+31:i]
+	ELSE
+		dst[i+31:i] := 0
+	FI
+ENDFOR	
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vfnmadd132ps"/>
+	<instruction name="vfnmadd213ps"/>
+	<instruction name="vfnmadd231ps"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256d" name="_mm256_mask3_fnmsub_pd">
+	<type>Floating Point</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="a" type="__m256d"/>
+	<parameter varname="b" type="__m256d"/>
+	<parameter varname="c" type="__m256d"/>
+	<parameter varname="k" type="__mmask8"/>
+	<description>Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the negated intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "c" when the corresponding mask bit is not set).  </description>
+	<operation>
+FOR j := 0 to 3
+	i := j*64
+	IF k[j]
+		dst[i+63:i] := -(a[i+63:i] * b[i+63:i]) - c[i+63:i]
+	ELSE
+		dst[i+63:i] := c[i+63:i]
+	FI
+ENDFOR	
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vfnmsub132pd"/>
+	<instruction name="vfnmsub213pd"/>
+	<instruction name="vfnmsub231pd"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256d" name="_mm256_mask_fnmsub_pd">
+	<type>Floating Point</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="a" type="__m256d"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="b" type="__m256d"/>
+	<parameter varname="c" type="__m256d"/>
+	<description>Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the negated intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set).  </description>
+	<operation>
+FOR j := 0 to 3
+	i := j*64
+	IF k[j]
+		dst[i+63:i] := -(a[i+63:i] * b[i+63:i]) - c[i+63:i]
+	ELSE
+		dst[i+63:i] := a[i+63:i]
+	FI
+ENDFOR	
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vfnmsub132pd"/>
+	<instruction name="vfnmsub213pd"/>
+	<instruction name="vfnmsub231pd"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256d" name="_mm256_maskz_fnmsub_pd">
+	<type>Floating Point</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m256d"/>
+	<parameter varname="b" type="__m256d"/>
+	<parameter varname="c" type="__m256d"/>
+	<description>Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the negated intermediate result, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 3
+	i := j*64
+	IF k[j]
+		dst[i+63:i] := -(a[i+63:i] * b[i+63:i]) - c[i+63:i]
+	ELSE
+		dst[i+63:i] := 0
+	FI
+ENDFOR	
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vfnmsub132pd"/>
+	<instruction name="vfnmsub213pd"/>
+	<instruction name="vfnmsub231pd"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128d" name="_mm_mask3_fnmsub_pd">
+	<type>Floating Point</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="a" type="__m128d"/>
+	<parameter varname="b" type="__m128d"/>
+	<parameter varname="c" type="__m128d"/>
+	<parameter varname="k" type="__mmask8"/>
+	<description>Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the negated intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "c" when the corresponding mask bit is not set).  </description>
+	<operation>
+FOR j := 0 to 1
+	i := j*64
+	IF k[j]
+		dst[i+63:i] := -(a[i+63:i] * b[i+63:i]) - c[i+63:i]
+	ELSE
+		dst[i+63:i] := c[i+63:i]
+	FI
+ENDFOR	
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vfnmsub132pd"/>
+	<instruction name="vfnmsub213pd"/>
+	<instruction name="vfnmsub231pd"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128d" name="_mm_mask_fnmsub_pd">
+	<type>Floating Point</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="a" type="__m128d"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="b" type="__m128d"/>
+	<parameter varname="c" type="__m128d"/>
+	<description>Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the negated intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set).  </description>
+	<operation>
+FOR j := 0 to 1
+	i := j*64
+	IF k[j]
+		dst[i+63:i] := -(a[i+63:i] * b[i+63:i]) - c[i+63:i]
+	ELSE
+		dst[i+63:i] := a[i+63:i]
+	FI
+ENDFOR	
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vfnmsub132pd"/>
+	<instruction name="vfnmsub213pd"/>
+	<instruction name="vfnmsub231pd"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128d" name="_mm_maskz_fnmsub_pd">
+	<type>Floating Point</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128d"/>
+	<parameter varname="b" type="__m128d"/>
+	<parameter varname="c" type="__m128d"/>
+	<description>Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the negated intermediate result, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 1
+	i := j*64
+	IF k[j]
+		dst[i+63:i] := -(a[i+63:i] * b[i+63:i]) - c[i+63:i]
+	ELSE
+		dst[i+63:i] := 0
+	FI
+ENDFOR	
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vfnmsub132pd"/>
+	<instruction name="vfnmsub213pd"/>
+	<instruction name="vfnmsub231pd"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256" name="_mm256_mask3_fnmsub_ps">
+	<type>Floating Point</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="a" type="__m256"/>
+	<parameter varname="b" type="__m256"/>
+	<parameter varname="c" type="__m256"/>
+	<parameter varname="k" type="__mmask8"/>
+	<description>Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the negated intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "c" when the corresponding mask bit is not set).  </description>
+	<operation>
+FOR j := 0 to 7
+	i := j*32
+	IF k[j]
+		dst[i+31:i] := -(a[i+31:i] * b[i+31:i]) - c[i+31:i]
+	ELSE
+		dst[i+31:i] := c[i+31:i]
+	FI
+ENDFOR	
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vfnmsub132ps"/>
+	<instruction name="vfnmsub213ps"/>
+	<instruction name="vfnmsub231ps"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256" name="_mm256_mask_fnmsub_ps">
+	<type>Floating Point</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="a" type="__m256"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="b" type="__m256"/>
+	<parameter varname="c" type="__m256"/>
+	<description>Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the negated intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set).  </description>
+	<operation>
+FOR j := 0 to 7
+	i := j*32
+	IF k[j]
+		dst[i+31:i] := -(a[i+31:i] * b[i+31:i]) - c[i+31:i]
+	ELSE
+		dst[i+31:i] := a[i+31:i]
+	FI
+ENDFOR	
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vfnmsub132ps"/>
+	<instruction name="vfnmsub213ps"/>
+	<instruction name="vfnmsub231ps"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256" name="_mm256_maskz_fnmsub_ps">
+	<type>Floating Point</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m256"/>
+	<parameter varname="b" type="__m256"/>
+	<parameter varname="c" type="__m256"/>
+	<description>Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the negated intermediate result, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*32
+	IF k[j]
+		dst[i+31:i] := -(a[i+31:i] * b[i+31:i]) - c[i+31:i]
+	ELSE
+		dst[i+31:i] := 0
+	FI
+ENDFOR	
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vfnmsub132ps"/>
+	<instruction name="vfnmsub213ps"/>
+	<instruction name="vfnmsub231ps"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128" name="_mm_mask3_fnmsub_ps">
+	<type>Floating Point</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="a" type="__m128"/>
+	<parameter varname="b" type="__m128"/>
+	<parameter varname="c" type="__m128"/>
+	<parameter varname="k" type="__mmask8"/>
+	<description>Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the negated intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "c" when the corresponding mask bit is not set).  </description>
+	<operation>
+FOR j := 0 to 3
+	i := j*32
+	IF k[j]
+		dst[i+31:i] := -(a[i+31:i] * b[i+31:i]) - c[i+31:i]
+	ELSE
+		dst[i+31:i] := c[i+31:i]
+	FI
+ENDFOR	
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vfnmsub132ps"/>
+	<instruction name="vfnmsub213ps"/>
+	<instruction name="vfnmsub231ps"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128" name="_mm_mask_fnmsub_ps">
+	<type>Floating Point</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="a" type="__m128"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="b" type="__m128"/>
+	<parameter varname="c" type="__m128"/>
+	<description>Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the negated intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set).  </description>
+	<operation>
+FOR j := 0 to 3
+	i := j*32
+	IF k[j]
+		dst[i+31:i] := -(a[i+31:i] * b[i+31:i]) - c[i+31:i]
+	ELSE
+		dst[i+31:i] := a[i+31:i]
+	FI
+ENDFOR	
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vfnmsub132ps"/>
+	<instruction name="vfnmsub213ps"/>
+	<instruction name="vfnmsub231ps"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128" name="_mm_maskz_fnmsub_ps">
+	<type>Floating Point</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128"/>
+	<parameter varname="b" type="__m128"/>
+	<parameter varname="c" type="__m128"/>
+	<description>Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the negated intermediate result, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 3
+	i := j*32
+	IF k[j]
+		dst[i+31:i] := -(a[i+31:i] * b[i+31:i]) - c[i+31:i]
+	ELSE
+		dst[i+31:i] := 0
+	FI
+ENDFOR	
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vfnmsub132ps"/>
+	<instruction name="vfnmsub213ps"/>
+	<instruction name="vfnmsub231ps"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__mmask8" name="_mm256_fpclass_pd_mask">
+	<type>Floating Point</type>
+	<type>Mask</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512DQ</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="a" type="__m256d"/>
+	<parameter varname="imm8" type="int"/>
+	<description>Test packed double-precision (64-bit) floating-point elements in "a" for special categories specified by "imm8", and store the results in mask vector "k".
+	[fpclass_note]
+	</description>
+	<operation>
+FOR j := 0 to 3
+	i := j*64
+	k[j] := CheckFPClass_FP64(a[i+63:i], imm8[7:0])
+ENDFOR
+k[MAX:4] := 0
+	</operation>
+	<instruction name="vfpclasspd"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__mmask8" name="_mm256_mask_fpclass_pd_mask">
+	<type>Floating Point</type>
+	<type>Mask</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512DQ</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="k1" type="__mmask8"/>
+	<parameter varname="a" type="__m256d"/>
+	<parameter varname="imm8" type="int"/>
+	<description>Test packed double-precision (64-bit) floating-point elements in "a" for special categories specified by "imm8", and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set).
+	[fpclass_note]
+	</description>
+	<operation>
+FOR j := 0 to 3
+	i := j*64
+	IF k1[j]
+		k[j] := CheckFPClass_FP64(a[i+63:i], imm8[7:0])
+	ELSE
+		k[j] := 0
+	FI
+ENDFOR
+k[MAX:4] := 0
+	</operation>
+	<instruction name="vfpclasspd"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__mmask8" name="_mm512_fpclass_pd_mask">
+	<type>Floating Point</type>
+	<type>Mask</type>
+	<CPUID>AVX512DQ</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="a" type="__m512d"/>
+	<parameter varname="imm8" type="int"/>
+	<description>Test packed double-precision (64-bit) floating-point elements in "a" for special categories specified by "imm8", and store the results in mask vector "k".
+	[fpclass_note]
+	</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*64
+	k[j] := CheckFPClass_FP64(a[i+63:i], imm8[7:0])
+ENDFOR
+k[MAX:8] := 0
+	</operation>
+	<instruction name="vfpclasspd"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__mmask8" name="_mm512_mask_fpclass_pd_mask">
+	<type>Floating Point</type>
+	<type>Mask</type>
+	<CPUID>AVX512DQ</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="k1" type="__mmask8"/>
+	<parameter varname="a" type="__m512d"/>
+	<parameter varname="imm8" type="int"/>
+	<description>Test packed double-precision (64-bit) floating-point elements in "a" for special categories specified by "imm8", and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set).
+	[fpclass_note]
+	</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*64
+	IF k1[j]
+		k[j] := CheckFPClass_FP64(a[i+63:i], imm8[7:0])
+	ELSE
+		k[j] := 0
+	FI
+ENDFOR
+k[MAX:8] := 0
+	</operation>
+	<instruction name="vfpclasspd"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__mmask8" name="_mm_fpclass_pd_mask">
+	<type>Floating Point</type>
+	<type>Mask</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512DQ</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="a" type="__m128d"/>
+	<parameter varname="imm8" type="int"/>
+	<description>Test packed double-precision (64-bit) floating-point elements in "a" for special categories specified by "imm8", and store the results in mask vector "k".
+	[fpclass_note]
+	</description>
+	<operation>
+FOR j := 0 to 1
+	i := j*64
+	k[j] := CheckFPClass_FP64(a[i+63:i], imm8[7:0])
+ENDFOR
+k[MAX:2] := 0
+	</operation>
+	<instruction name="vfpclasspd"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__mmask8" name="_mm_mask_fpclass_pd_mask">
+	<type>Floating Point</type>
+	<type>Mask</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512DQ</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="k1" type="__mmask8"/>
+	<parameter varname="a" type="__m128d"/>
+	<parameter varname="imm8" type="int"/>
+	<description>Test packed double-precision (64-bit) floating-point elements in "a" for special categories specified by "imm8", and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set).
+	[fpclass_note]
+	</description>
+	<operation>
+FOR j := 0 to 1
+	i := j*64
+	IF k1[j]
+		k[j] := CheckFPClass_FP64(a[i+63:i], imm8[7:0])
+	ELSE
+		k[j] := 0
+	FI
+ENDFOR
+k[MAX:2] := 0
+	</operation>
+	<instruction name="vfpclasspd"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__mmask8" name="_mm256_fpclass_ps_mask">
+	<type>Floating Point</type>
+	<type>Mask</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512DQ</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="a" type="__m256"/>
+	<parameter varname="imm8" type="int"/>
+	<description>Test packed single-precision (32-bit) floating-point elements in "a" for special categories specified by "imm8", and store the results in mask vector "k".
+	[fpclass_note]
+	</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*32
+	k[j] := CheckFPClass_FP32(a[i+31:i], imm8[7:0])
+ENDFOR
+k[MAX:8] := 0
+	</operation>
+	<instruction name="vfpclassps"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__mmask8" name="_mm256_mask_fpclass_ps_mask">
+	<type>Floating Point</type>
+	<type>Mask</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512DQ</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="k1" type="__mmask8"/>
+	<parameter varname="a" type="__m256"/>
+	<parameter varname="imm8" type="int"/>
+	<description>Test packed single-precision (32-bit) floating-point elements in "a" for special categories specified by "imm8", and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set).
+	[fpclass_note]
+	</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*32
+	IF k1[j]
+		k[j] := CheckFPClass_FP32(a[i+31:i], imm8[7:0])
+	ELSE
+		k[j] := 0
+	FI
+ENDFOR
+k[MAX:8] := 0
+	</operation>
+	<instruction name="vfpclassps"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__mmask16" name="_mm512_fpclass_ps_mask">
+	<type>Floating Point</type>
+	<type>Mask</type>
+	<CPUID>AVX512DQ</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="a" type="__m512"/>
+	<parameter varname="imm8" type="int"/>
+	<description>Test packed single-precision (32-bit) floating-point elements in "a" for special categories specified by "imm8", and store the results in mask vector "k".
+	[fpclass_note]
+	</description>
+	<operation>
+FOR j := 0 to 15
+	i := j*32
+	k[j] := CheckFPClass_FP32(a[i+31:i], imm8[7:0])
+ENDFOR
+k[MAX:16] := 0
+	</operation>
+	<instruction name="vfpclassps"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__mmask16" name="_mm512_mask_fpclass_ps_mask">
+	<type>Floating Point</type>
+	<type>Mask</type>
+	<CPUID>AVX512DQ</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="k1" type="__mmask16"/>
+	<parameter varname="a" type="__m512"/>
+	<parameter varname="imm8" type="int"/>
+	<description>Test packed single-precision (32-bit) floating-point elements in "a" for special categories specified by "imm8", and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set).
+	[fpclass_note]
+	</description>
+	<operation>
+FOR j := 0 to 15
+	i := j*32
+	IF k1[j]
+		k[j] := CheckFPClass_FP32(a[i+31:i], imm8[7:0])
+	ELSE
+		k[j] := 0
+	FI
+ENDFOR
+k[MAX:16] := 0
+	</operation>
+	<instruction name="vfpclassps"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__mmask8" name="_mm_fpclass_ps_mask">
+	<type>Floating Point</type>
+	<type>Mask</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512DQ</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="a" type="__m128"/>
+	<parameter varname="imm8" type="int"/>
+	<description>Test packed single-precision (32-bit) floating-point elements in "a" for special categories specified by "imm8", and store the results in mask vector "k".
+	[fpclass_note]
+	</description>
+	<operation>
+FOR j := 0 to 3
+	i := j*32
+	k[j] := CheckFPClass_FP32(a[i+31:i], imm8[7:0])
+ENDFOR
+k[MAX:4] := 0
+	</operation>
+	<instruction name="vfpclassps"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__mmask8" name="_mm_mask_fpclass_ps_mask">
+	<type>Floating Point</type>
+	<type>Mask</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512DQ</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="k1" type="__mmask8"/>
+	<parameter varname="a" type="__m128"/>
+	<parameter varname="imm8" type="int"/>
+	<description>Test packed single-precision (32-bit) floating-point elements in "a" for special categories specified by "imm8", and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set).
+	[fpclass_note]
+	</description>
+	<operation>
+FOR j := 0 to 3
+	i := j*32
+	IF k1[j]
+		k[j] := CheckFPClass_FP32(a[i+31:i], imm8[7:0])
+	ELSE
+		k[j] := 0
+	FI
+ENDFOR
+k[MAX:4] := 0
+	</operation>
+	<instruction name="vfpclassps"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__mmask8" name="_mm_fpclass_sd_mask">
+	<type>Floating Point</type>
+	<type>Mask</type>
+	<CPUID>AVX512DQ</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="a" type="__m128d"/>
+	<parameter varname="imm8" type="int"/>
+	<description>Test the lower double-precision (64-bit) floating-point element in "a" for special categories specified by "imm8", and store the result in mask vector "k".
+	[fpclass_note]
+	</description>
+	<operation>
+k[0] := CheckFPClass_FP64(a[63:0], imm8[7:0])
+k[MAX:1] := 0
+	</operation>
+	<instruction name="vfpclasssd"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__mmask8" name="_mm_mask_fpclass_sd_mask">
+	<type>Floating Point</type>
+	<type>Mask</type>
+	<CPUID>AVX512DQ</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="k1" type="__mmask8"/>
+	<parameter varname="a" type="__m128d"/>
+	<parameter varname="imm8" type="int"/>
+	<description>Test the lower double-precision (64-bit) floating-point element in "a" for special categories specified by "imm8", and store the result in mask vector "k" using zeromask "k1" (the element is zeroed out when mask bit 0 is not set).
+	[fpclass_note]
+	</description>
+	<operation>
+IF k1[0]
+	k[0] := CheckFPClass_FP64(a[63:0], imm8[7:0])
+ELSE
+	k[0] := 0
+FI
+k[MAX:1] := 0
+	</operation>
+	<instruction name="vfpclasssd"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__mmask8" name="_mm_fpclass_ss_mask">
+	<type>Floating Point</type>
+	<type>Mask</type>
+	<CPUID>AVX512DQ</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="a" type="__m128"/>
+	<parameter varname="imm8" type="int"/>
+	<description>Test the lower single-precision (32-bit) floating-point element in "a" for special categories specified by "imm8", and store the result in mask vector "k".
+	[fpclass_note]
+	</description>
+	<operation>
+k[0] := CheckFPClass_FP32(a[31:0], imm8[7:0])
+k[MAX:1] := 0
+	</operation>
+	<instruction name="vfpclassss"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__mmask8" name="_mm_mask_fpclass_ss_mask">
+	<type>Floating Point</type>
+	<type>Mask</type>
+	<CPUID>AVX512DQ</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="k1" type="__mmask8"/>
+	<parameter varname="a" type="__m128"/>
+	<parameter varname="imm8" type="int"/>
+	<description>Test the lower single-precision (32-bit) floating-point element in "a" for special categories specified by "imm8", and store the result in mask vector "k" using zeromask "k1" (the element is zeroed out when mask bit 0 is not set).
+	[fpclass_note]
+	</description>
+	<operation>
+IF k1[0]
+	k[0] := CheckFPClass_FP32(a[31:0], imm8[7:0])
+ELSE
+	k[0] := 0
+FI
+k[MAX:1] := 0
+	</operation>
+	<instruction name="vfpclassss"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256d" name="_mm256_mmask_i32gather_pd">
+	<type>Floating Point</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Load</category>
+	<parameter varname="src" type="__m256d"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="vindex" type="__m128i"/>
+	<parameter varname="base_addr" type="void const*"/>
+	<parameter varname="scale" type="const int"/>
+	<description>
+	Gather double-precision (64-bit) floating-point elements from memory using 32-bit indices. 64-bit elements are loaded from addresses starting at "base_addr" and offset by each 32-bit element in "vindex" (each index is scaled by the factor in "scale"). Gathered elements are merged into "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). "scale" should be 1, 2, 4 or 8.
+	</description>
+	<operation>
+FOR j := 0 to 3
+	i := j*64
+	m := j*32
+	IF k[j]
+		dst[i+63:i] := MEM[base_addr + SignExtend(vindex[m+31:m])*scale]
+		k[j] := 0
+	ELSE
+		dst[i+63:i] := src[i+63:i]
+	FI
+ENDFOR
+k[MAX:4] := 0
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vgatherdpd"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128d" name="_mm_mmask_i32gather_pd">
+	<type>Floating Point</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Load</category>
+	<parameter varname="src" type="__m128d"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="vindex" type="__m128i"/>
+	<parameter varname="base_addr" type="void const*"/>
+	<parameter varname="scale" type="const int"/>
+	<description>
+	Gather double-precision (64-bit) floating-point elements from memory using 32-bit indices. 64-bit elements are loaded from addresses starting at "base_addr" and offset by each 32-bit element in "vindex" (each index is scaled by the factor in "scale"). Gathered elements are merged into "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). "scale" should be 1, 2, 4 or 8.
+	</description>
+	<operation>
+FOR j := 0 to 1
+	i := j*64
+	m := j*32
+	IF k[j]
+		dst[i+63:i] := MEM[base_addr + SignExtend(vindex[m+31:m])*scale]
+		k[j] := 0
+	ELSE
+		dst[i+63:i] := src[i+63:i]
+	FI
+ENDFOR
+k[MAX:2] := 0
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vgatherdpd"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256" name="_mm256_mmask_i32gather_ps">
+	<type>Floating Point</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Load</category>
+	<parameter varname="src" type="__m256"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="vindex" type="__m256i"/>
+	<parameter varname="base_addr" type="void const*"/>
+	<parameter varname="scale" type="const int"/>
+	<description>
+	Gather single-precision (32-bit) floating-point elements from memory using 32-bit indices. 32-bit elements are loaded from addresses starting at "base_addr" and offset by each 32-bit element in "vindex" (each index is scaled by the factor in "scale"). Gathered elements are merged into "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). "scale" should be 1, 2, 4 or 8.
+	</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*32
+	IF k[j]
+		dst[i+31:i] := MEM[base_addr + SignExtend(vindex[i+31:i])*scale]
+		k[j] := 0
+	ELSE
+		dst[i+31:i] := src[i+31:i]
+	FI
+ENDFOR
+k[MAX:8] := 0
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vgatherdps"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128" name="_mm_mmask_i32gather_ps">
+	<type>Floating Point</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Load</category>
+	<parameter varname="src" type="__m128"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="vindex" type="__m128i"/>
+	<parameter varname="base_addr" type="void const*"/>
+	<parameter varname="scale" type="const int"/>
+	<description>
+	Gather single-precision (32-bit) floating-point elements from memory using 32-bit indices. 32-bit elements are loaded from addresses starting at "base_addr" and offset by each 32-bit element in "vindex" (each index is scaled by the factor in "scale"). Gathered elements are merged into "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). "scale" should be 1, 2, 4 or 8.
+	</description>
+	<operation>
+FOR j := 0 to 3
+	i := j*32
+	IF k[j]
+		dst[i+31:i] := MEM[base_addr + SignExtend(vindex[i+31:i])*scale]
+		k[j] := 0
+	ELSE
+		dst[i+31:i] := src[i+31:i]
+	FI
+ENDFOR
+k[MAX:4] := 0
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vgatherdps"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256d" name="_mm256_mmask_i64gather_pd">
+	<type>Floating Point</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Load</category>
+	<parameter varname="src" type="__m256d"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="vindex" type="__m256i"/>
+	<parameter varname="base_addr" type="void const*"/>
+	<parameter varname="scale" type="const int"/>
+	<description>
+	Gather double-precision (64-bit) floating-point elements from memory using 64-bit indices. 64-bit elements are loaded from addresses starting at "base_addr" and offset by each 64-bit element in "vindex" (each index is scaled by the factor in "scale"). Gathered elements are merged into "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). "scale" should be 1, 2, 4 or 8.
+	</description>
+	<operation>
+FOR j := 0 to 3
+	i := j*64
+	IF k[j]
+		dst[i+63:i] := MEM[base_addr + SignExtend(vindex[i+63:i])*scale]
+		k[j] := 0
+	ELSE
+		dst[i+63:i] := src[i+63:i]
+	FI
+ENDFOR
+k[MAX:4] := 0
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vgatherqpd"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128d" name="_mm_mmask_i64gather_pd">
+	<type>Floating Point</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Load</category>
+	<parameter varname="src" type="__m128d"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="vindex" type="__m128i"/>
+	<parameter varname="base_addr" type="void const*"/>
+	<parameter varname="scale" type="const int"/>
+	<description>
+	Gather double-precision (64-bit) floating-point elements from memory using 64-bit indices. 64-bit elements are loaded from addresses starting at "base_addr" and offset by each 64-bit element in "vindex" (each index is scaled by the factor in "scale"). Gathered elements are merged into "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). "scale" should be 1, 2, 4 or 8.
+	</description>
+	<operation>
+FOR j := 0 to 1
+	i := j*64
+	IF k[j]
+		dst[i+63:i] := MEM[base_addr + SignExtend(vindex[i+63:i])*scale]
+		k[j] := 0
+	ELSE
+		dst[i+63:i] := src[i+63:i]
+	FI
+ENDFOR
+k[MAX:2] := 0
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vgatherqpd"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128" name="_mm256_mmask_i64gather_ps">
+	<type>Floating Point</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Load</category>
+	<parameter varname="src" type="__m128"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="vindex" type="__m256i"/>
+	<parameter varname="base_addr" type="void const*"/>
+	<parameter varname="scale" type="const int"/>
+	<description>
+	Gather single-precision (32-bit) floating-point elements from memory using 64-bit indices. 32-bit elements are loaded from addresses starting at "base_addr" and offset by each 64-bit element in "vindex" (each index is scaled by the factor in "scale"). Gathered elements are merged into "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). "scale" should be 1, 2, 4 or 8.
+	</description>
+	<operation>
+FOR j := 0 to 3
+	i := j*32
+	m := j*64
+	IF k[j]
+		dst[i+31:i] := MEM[base_addr + SignExtend(vindex[m+63:m])*scale]
+		k[j] := 0
+	ELSE
+		dst[i+31:i] := src[i+31:i]
+	FI
+ENDFOR
+k[MAX:4] := 0
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vgatherqps"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128" name="_mm_mmask_i64gather_ps">
+	<type>Floating Point</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Load</category>
+	<parameter varname="src" type="__m128"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="vindex" type="__m128i"/>
+	<parameter varname="base_addr" type="void const*"/>
+	<parameter varname="scale" type="const int"/>
+	<description>
+	Gather single-precision (32-bit) floating-point elements from memory using 64-bit indices. 32-bit elements are loaded from addresses starting at "base_addr" and offset by each 64-bit element in "vindex" (each index is scaled by the factor in "scale"). Gathered elements are merged into "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). "scale" should be 1, 2, 4 or 8.
+	</description>
+	<operation>
+FOR j := 0 to 1
+	i := j*32
+	m := j*64
+	IF k[j]
+		dst[i+31:i] := MEM[base_addr + SignExtend(vindex[m+63:m])*scale]
+		k[j] := 0
+	ELSE
+		dst[i+31:i] := src[i+31:i]
+	FI
+ENDFOR
+k[MAX:2] := 0
+dst[MAX:64] := 0
+	</operation>
+	<instruction name="vgatherqps"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256d" name="_mm256_getexp_pd">
+	<type>Floating Point</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="a" type="__m256d"/>
+	<description>Convert the exponent of each packed double-precision (64-bit) floating-point element in "a" to a double-precision (64-bit) floating-point number representing the integer exponent, and store the results in "dst". This intrinsic essentially calculates "floor(log2(x))" for each element.</description>
+	<operation>
+FOR j := 0 to 3
+	i := j*64
+	dst[i+63:i] := ConvertExpFP64(a[i+63:i])
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vgetexppd"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256d" name="_mm256_mask_getexp_pd">
+	<type>Floating Point</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="src" type="__m256d"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m256d"/>
+	<description>Convert the exponent of each packed double-precision (64-bit) floating-point element in "a" to a double-precision (64-bit) floating-point number representing the integer exponent, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). This intrinsic essentially calculates "floor(log2(x))" for each element.</description>
+	<operation>
+FOR j := 0 to 3
+	i := j*64
+	IF k[j]
+		dst[i+63:i] := ConvertExpFP64(a[i+63:i])
+	ELSE
+		dst[i+63:i] := src[i+63:i]
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vgetexppd"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256d" name="_mm256_maskz_getexp_pd">
+	<type>Floating Point</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m256d"/>
+	<description>Convert the exponent of each packed double-precision (64-bit) floating-point element in "a" to a double-precision (64-bit) floating-point number representing the integer exponent, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). This intrinsic essentially calculates "floor(log2(x))" for each element.</description>
+	<operation>
+FOR j := 0 to 3
+	i := j*64
+	IF k[j]
+		dst[i+63:i] := ConvertExpFP64(a[i+63:i])
+	ELSE
+		dst[i+63:i] := 0
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vgetexppd"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128d" name="_mm_getexp_pd">
+	<type>Floating Point</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="a" type="__m128d"/>
+	<description>Convert the exponent of each packed double-precision (64-bit) floating-point element in "a" to a double-precision (64-bit) floating-point number representing the integer exponent, and store the results in "dst". This intrinsic essentially calculates "floor(log2(x))" for each element.</description>
+	<operation>
+FOR j := 0 to 1
+	i := j*64
+	dst[i+63:i] := ConvertExpFP64(a[i+63:i])
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vgetexppd"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128d" name="_mm_mask_getexp_pd">
+	<type>Floating Point</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="src" type="__m128d"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128d"/>
+	<description>Convert the exponent of each packed double-precision (64-bit) floating-point element in "a" to a double-precision (64-bit) floating-point number representing the integer exponent, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). This intrinsic essentially calculates "floor(log2(x))" for each element.</description>
+	<operation>
+FOR j := 0 to 1
+	i := j*64
+	IF k[j]
+		dst[i+63:i] := ConvertExpFP64(a[i+63:i])
+	ELSE
+		dst[i+63:i] := src[i+63:i]
+	FI
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vgetexppd"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128d" name="_mm_maskz_getexp_pd">
+	<type>Floating Point</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128d"/>
+	<description>Convert the exponent of each packed double-precision (64-bit) floating-point element in "a" to a double-precision (64-bit) floating-point number representing the integer exponent, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). This intrinsic essentially calculates "floor(log2(x))" for each element.</description>
+	<operation>
+FOR j := 0 to 1
+	i := j*64
+	IF k[j]
+		dst[i+63:i] := ConvertExpFP64(a[i+63:i])
+	ELSE
+		dst[i+63:i] := 0
+	FI
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vgetexppd"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256" name="_mm256_getexp_ps">
+	<type>Floating Point</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="a" type="__m256"/>
+	<description>Convert the exponent of each packed single-precision (32-bit) floating-point element in "a" to a single-precision (32-bit) floating-point number representing the integer exponent, and store the results in "dst". This intrinsic essentially calculates "floor(log2(x))" for each element.</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*32
+	dst[i+31:i] := ConvertExpFP32(a[i+31:i])
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vgetexpps"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256" name="_mm256_mask_getexp_ps">
+	<type>Floating Point</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="src" type="__m256"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m256"/>
+	<description>Convert the exponent of each packed single-precision (32-bit) floating-point element in "a" to a single-precision (32-bit) floating-point number representing the integer exponent, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). This intrinsic essentially calculates "floor(log2(x))" for each element.</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*32
+	IF k[j]
+		dst[i+31:i] := ConvertExpFP32(a[i+31:i])
+	ELSE
+		dst[i+31:i] := src[i+31:i]
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vgetexpps"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256" name="_mm256_maskz_getexp_ps">
+	<type>Floating Point</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m256"/>
+	<description>Convert the exponent of each packed single-precision (32-bit) floating-point element in "a" to a single-precision (32-bit) floating-point number representing the integer exponent, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). This intrinsic essentially calculates "floor(log2(x))" for each element.</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*32
+	IF k[j]
+		dst[i+31:i] := ConvertExpFP32(a[i+31:i])
+	ELSE
+		dst[i+31:i] := 0
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vgetexpps"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128" name="_mm_getexp_ps">
+	<type>Floating Point</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="a" type="__m128"/>
+	<description>Convert the exponent of each packed single-precision (32-bit) floating-point element in "a" to a single-precision (32-bit) floating-point number representing the integer exponent, and store the results in "dst". This intrinsic essentially calculates "floor(log2(x))" for each element.</description>
+	<operation>
+FOR j := 0 to 3
+	i := j*32
+	dst[i+31:i] := ConvertExpFP32(a[i+31:i])
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vgetexpps"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128" name="_mm_mask_getexp_ps">
+	<type>Floating Point</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="src" type="__m128"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128"/>
+	<description>Convert the exponent of each packed single-precision (32-bit) floating-point element in "a" to a single-precision (32-bit) floating-point number representing the integer exponent, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). This intrinsic essentially calculates "floor(log2(x))" for each element.</description>
+	<operation>
+FOR j := 0 to 3
+	i := j*32
+	IF k[j]
+		dst[i+31:i] := ConvertExpFP32(a[i+31:i])
+	ELSE
+		dst[i+31:i] := src[i+31:i]
+	FI
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vgetexpps"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128" name="_mm_maskz_getexp_ps">
+	<type>Floating Point</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128"/>
+	<description>Convert the exponent of each packed single-precision (32-bit) floating-point element in "a" to a single-precision (32-bit) floating-point number representing the integer exponent, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). This intrinsic essentially calculates "floor(log2(x))" for each element.</description>
+	<operation>
+FOR j := 0 to 3
+	i := j*32
+	IF k[j]
+		dst[i+31:i] := ConvertExpFP32(a[i+31:i])
+	ELSE
+		dst[i+31:i] := 0
+	FI
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vgetexpps"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256d" name="_mm256_getmant_pd">
+	<type>Floating Point</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="a" type="__m256d"/>
+	<parameter varname="interv" type="_MM_MANTISSA_NORM_ENUM"/>
+	<parameter varname="sc" type="_MM_MANTISSA_SIGN_ENUM"/>
+	<description>Normalize the mantissas of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst". This intrinsic essentially calculates "&#xB1;(2^k)*|x.significand|", where "k" depends on the interval range defined by "interv" and the sign depends on "sc" and the source sign.
+	[getmant_note]</description>
+	<operation>
+FOR j := 0 to 3
+	i := j*64
+	dst[i+63:i] := GetNormalizedMantissa(a[i+63:i], sc, interv)
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vgetmantpd"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256d" name="_mm256_mask_getmant_pd">
+	<type>Floating Point</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="src" type="__m256d"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m256d"/>
+	<parameter varname="interv" type="_MM_MANTISSA_NORM_ENUM"/>
+	<parameter varname="sc" type="_MM_MANTISSA_SIGN_ENUM"/>
+	<description>Normalize the mantissas of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). This intrinsic essentially calculates "&#xB1;(2^k)*|x.significand|", where "k" depends on the interval range defined by "interv" and the sign depends on "sc" and the source sign.
+	[getmant_note]</description>
+	<operation>
+FOR j := 0 to 3
+	i := j*64
+	IF k[j]
+		dst[i+63:i] := GetNormalizedMantissa(a[i+63:i], sc, interv)
+	ELSE
+		dst[i+63:i] := src[i+63:i]
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vgetmantpd"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256d" name="_mm256_maskz_getmant_pd">
+	<type>Floating Point</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m256d"/>
+	<parameter varname="interv" type="_MM_MANTISSA_NORM_ENUM"/>
+	<parameter varname="sc" type="_MM_MANTISSA_SIGN_ENUM"/>
+	<description>Normalize the mantissas of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). This intrinsic essentially calculates "&#xB1;(2^k)*|x.significand|", where "k" depends on the interval range defined by "interv" and the sign depends on "sc" and the source sign.
+	[getmant_note]</description>
+	<operation>
+FOR j := 0 to 3
+	i := j*64
+	IF k[j]
+		dst[i+63:i] := GetNormalizedMantissa(a[i+63:i], sc, interv)
+	ELSE
+		dst[i+63:i] := 0
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vgetmantpd"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128d" name="_mm_getmant_pd">
+	<type>Floating Point</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="a" type="__m128d"/>
+	<parameter varname="interv" type="_MM_MANTISSA_NORM_ENUM"/>
+	<parameter varname="sc" type="_MM_MANTISSA_SIGN_ENUM"/>
+	<description>Normalize the mantissas of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst". This intrinsic essentially calculates "&#xB1;(2^k)*|x.significand|", where "k" depends on the interval range defined by "interv" and the sign depends on "sc" and the source sign.
+	[getmant_note]</description>
+	<operation>
+FOR j := 0 to 1
+	i := j*64
+	dst[i+63:i] := GetNormalizedMantissa(a[i+63:i], sc, interv)
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vgetmantpd"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128d" name="_mm_mask_getmant_pd">
+	<type>Floating Point</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="src" type="__m128d"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128d"/>
+	<parameter varname="interv" type="_MM_MANTISSA_NORM_ENUM"/>
+	<parameter varname="sc" type="_MM_MANTISSA_SIGN_ENUM"/>
+	<description>Normalize the mantissas of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). This intrinsic essentially calculates "&#xB1;(2^k)*|x.significand|", where "k" depends on the interval range defined by "interv" and the sign depends on "sc" and the source sign.
+	[getmant_note]</description>
+	<operation>
+FOR j := 0 to 1
+	i := j*64
+	IF k[j]
+		dst[i+63:i] := GetNormalizedMantissa(a[i+63:i], sc, interv)
+	ELSE
+		dst[i+63:i] := src[i+63:i]
+	FI
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vgetmantpd"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128d" name="_mm_maskz_getmant_pd">
+	<type>Floating Point</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128d"/>
+	<parameter varname="interv" type="_MM_MANTISSA_NORM_ENUM"/>
+	<parameter varname="sc" type="_MM_MANTISSA_SIGN_ENUM"/>
+	<description>Normalize the mantissas of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). This intrinsic essentially calculates "&#xB1;(2^k)*|x.significand|", where "k" depends on the interval range defined by "interv" and the sign depends on "sc" and the source sign.
+	[getmant_note]</description>
+	<operation>
+FOR j := 0 to 1
+	i := j*64
+	IF k[j]
+		dst[i+63:i] := GetNormalizedMantissa(a[i+63:i], sc, interv)
+	ELSE
+		dst[i+63:i] := 0
+	FI
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vgetmantpd"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256" name="_mm256_getmant_ps">
+	<type>Floating Point</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="a" type="__m256"/>
+	<parameter varname="interv" type="_MM_MANTISSA_NORM_ENUM"/>
+	<parameter varname="sc" type="_MM_MANTISSA_SIGN_ENUM"/>
+	<description>Normalize the mantissas of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst". This intrinsic essentially calculates "&#xB1;(2^k)*|x.significand|", where "k" depends on the interval range defined by "interv" and the sign depends on "sc" and the source sign.
+	[getmant_note]</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*32
+	dst[i+31:i] := GetNormalizedMantissa(a[i+31:i], sc, interv)
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vgetmantps"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256" name="_mm256_mask_getmant_ps">
+	<type>Floating Point</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="src" type="__m256"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m256"/>
+	<parameter varname="interv" type="_MM_MANTISSA_NORM_ENUM"/>
+	<parameter varname="sc" type="_MM_MANTISSA_SIGN_ENUM"/>
+	<description>Normalize the mantissas of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). This intrinsic essentially calculates "&#xB1;(2^k)*|x.significand|", where "k" depends on the interval range defined by "interv" and the sign depends on "sc" and the source sign.
+	[getmant_note]</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*32
+	IF k[j]
+		dst[i+31:i] := GetNormalizedMantissa(a[i+31:i], sc, interv)
+	ELSE
+		dst[i+31:i] := src[i+31:i]
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vgetmantps"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256" name="_mm256_maskz_getmant_ps">
+	<type>Floating Point</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m256"/>
+	<parameter varname="interv" type="_MM_MANTISSA_NORM_ENUM"/>
+	<parameter varname="sc" type="_MM_MANTISSA_SIGN_ENUM"/>
+	<description>Normalize the mantissas of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). This intrinsic essentially calculates "&#xB1;(2^k)*|x.significand|", where "k" depends on the interval range defined by "interv" and the sign depends on "sc" and the source sign.
+	[getmant_note]</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*32
+	IF k[j]
+		dst[i+31:i] := GetNormalizedMantissa(a[i+31:i], sc, interv)
+	ELSE
+		dst[i+31:i] := 0
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vgetmantps"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128" name="_mm_getmant_ps">
+	<type>Floating Point</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="a" type="__m128"/>
+	<parameter varname="interv" type="_MM_MANTISSA_NORM_ENUM"/>
+	<parameter varname="sc" type="_MM_MANTISSA_SIGN_ENUM"/>
+	<description>Normalize the mantissas of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst". This intrinsic essentially calculates "&#xB1;(2^k)*|x.significand|", where "k" depends on the interval range defined by "interv" and the sign depends on "sc" and the source sign.
+	[getmant_note]</description>
+	<operation>
+FOR j := 0 to 3
+	i := j*32
+	dst[i+31:i] := GetNormalizedMantissa(a[i+31:i], sc, interv)
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vgetmantps"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128" name="_mm_mask_getmant_ps">
+	<type>Floating Point</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="src" type="__m128"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128"/>
+	<parameter varname="interv" type="_MM_MANTISSA_NORM_ENUM"/>
+	<parameter varname="sc" type="_MM_MANTISSA_SIGN_ENUM"/>
+	<description>Normalize the mantissas of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). This intrinsic essentially calculates "&#xB1;(2^k)*|x.significand|", where "k" depends on the interval range defined by "interv" and the sign depends on "sc" and the source sign.
+	[getmant_note]</description>
+	<operation>
+FOR j := 0 to 3
+	i := j*32
+	IF k[j]
+		dst[i+31:i] := GetNormalizedMantissa(a[i+31:i], sc, interv)
+	ELSE
+		dst[i+31:i] := src[i+31:i]
+	FI
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vgetmantps"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128" name="_mm_maskz_getmant_ps">
+	<type>Floating Point</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128"/>
+	<parameter varname="interv" type="_MM_MANTISSA_NORM_ENUM"/>
+	<parameter varname="sc" type="_MM_MANTISSA_SIGN_ENUM"/>
+	<description>Normalize the mantissas of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). This intrinsic essentially calculates "&#xB1;(2^k)*|x.significand|", where "k" depends on the interval range defined by "interv" and the sign depends on "sc" and the source sign.
+	[getmant_note]</description>
+	<operation>
+FOR j := 0 to 3
+	i := j*32
+	IF k[j]
+		dst[i+31:i] := GetNormalizedMantissa(a[i+31:i], sc, interv)
+	ELSE
+		dst[i+31:i] := 0
+	FI
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vgetmantps"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256" name="_mm256_insertf32x4">
+	<type>Floating Point</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="a" type="__m256"/>
+	<parameter varname="b" type="__m128"/>
+	<parameter varname="imm8" type="int"/>
+	<description>Copy "a" to "dst", then insert 128 bits (composed of 4 packed single-precision (32-bit) floating-point elements) from "b" into "dst" at the location specified by "imm8".</description>
+	<operation>
+dst[255:0] := a[255:0]
+CASE (imm8[0]) OF
+0: dst[127:0] := b[127:0]
+1: dst[255:128] := b[127:0]
+ESAC
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vinsertf32x4"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256" name="_mm256_mask_insertf32x4">
+	<type>Floating Point</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="src" type="__m256"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m256"/>
+	<parameter varname="b" type="__m128"/>
+	<parameter varname="imm8" type="int"/>
+	<description>Copy "a" to "tmp", then insert 128 bits (composed of 4 packed single-precision (32-bit) floating-point elements) from "b" into "tmp" at the location specified by "imm8".  Store "tmp" to "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). 
+	</description>
+	<operation>
+tmp[255:0] := a[255:0]
+CASE (imm8[0]) OF
+0: tmp[127:0] := b[127:0]
+1: tmp[255:128] := b[127:0]
+ESAC
+FOR j := 0 to 7
+	i := j*32
+	IF k[j]
+		dst[i+31:i] := tmp[i+31:i]
+	ELSE
+		dst[i+31:i] := src[i+31:i]
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vinsertf32x4"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256" name="_mm256_maskz_insertf32x4">
+	<type>Floating Point</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m256"/>
+	<parameter varname="b" type="__m128"/>
+	<parameter varname="imm8" type="int"/>
+	<description>Copy "a" to "tmp", then insert 128 bits (composed of 4 packed single-precision (32-bit) floating-point elements) from "b" into "tmp" at the location specified by "imm8".  Store "tmp" to "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). 
+	</description>
+	<operation>
+tmp[255:0] := a[255:0]
+CASE (imm8[0]) OF
+0: tmp[127:0] := b[127:0]
+1: tmp[255:128] := b[127:0]
+ESAC
+FOR j := 0 to 7
+	i := j*32
+	IF k[j]
+		dst[i+31:i] := tmp[i+31:i]
+	ELSE
+		dst[i+31:i] := 0
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vinsertf32x4"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512" name="_mm512_insertf32x8">
+	<type>Floating Point</type>
+	<CPUID>AVX512DQ</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="a" type="__m512"/>
+	<parameter varname="b" type="__m256"/>
+	<parameter varname="imm8" type="int"/>
+	<description>Copy "a" to "dst", then insert 256 bits (composed of 8 packed single-precision (32-bit) floating-point elements) from "b" into "dst" at the location specified by "imm8".
+	</description>
+	<operation>
+dst[511:0] := a[511:0]
+CASE (imm8[0]) OF
+0: dst[255:0] := b[255:0]
+1: dst[511:256] := b[255:0]
+ESAC
+dst[MAX:512] := 0
+	</operation>
+	<instruction name="vinsertf32x8"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512" name="_mm512_mask_insertf32x8">
+	<type>Floating Point</type>
+	<CPUID>AVX512DQ</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="src" type="__m512"/>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="a" type="__m512"/>
+	<parameter varname="b" type="__m256"/>
+	<parameter varname="imm8" type="int"/>
+	<description>Copy "a" to "tmp", then insert 256 bits (composed of 8 packed single-precision (32-bit) floating-point elements) from "b" into "tmp" at the location specified by "imm8".  Store "tmp" to "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). 
+	</description>
+	<operation>
+tmp[511:0] := a[511:0]
+CASE (imm8[0]) OF
+0: tmp[255:0] := b[255:0]
+1: tmp[511:256] := b[255:0]
+ESAC
+FOR j := 0 to 15
+	i := j*32
+	IF k[j]
+		dst[i+31:i] := tmp[i+31:i]
+	ELSE
+		dst[i+31:i] := src[i+31:i]
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name="vinsertf32x8"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512" name="_mm512_maskz_insertf32x8">
+	<type>Floating Point</type>
+	<CPUID>AVX512DQ</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="a" type="__m512"/>
+	<parameter varname="b" type="__m256"/>
+	<parameter varname="imm8" type="int"/>
+	<description>Copy "a" to "tmp", then insert 256 bits (composed of 8 packed single-precision (32-bit) floating-point elements) from "b" into "tmp" at the location specified by "imm8".  Store "tmp" to "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). 
+	</description>
+	<operation>
+tmp[511:0] := a[511:0]
+CASE (imm8[0]) OF
+0: tmp[255:0] := b[255:0]
+1: tmp[511:256] := b[255:0]
+ESAC
+FOR j := 0 to 15
+	i := j*32
+	IF k[j]
+		dst[i+31:i] := tmp[i+31:i]
+	ELSE
+		dst[i+31:i] := 0
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name="vinsertf32x8"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256d" name="_mm256_insertf64x2">
+	<type>Floating Point</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512DQ</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="a" type="__m256d"/>
+	<parameter varname="b" type="__m128d"/>
+	<parameter varname="imm8" type="int"/>
+	<description>Copy "a" to "dst", then insert 128 bits (composed of 2 packed double-precision (64-bit) floating-point elements) from "b" into "dst" at the location specified by "imm8".</description>
+	<operation>
+dst[255:0] := a[255:0]
+CASE imm8[0] of
+0: dst[127:0] := b[127:0]
+1: dst[255:128] := b[127:0]
+ESAC
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vinsertf64x2"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256d" name="_mm256_mask_insertf64x2">
+	<type>Floating Point</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512DQ</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="src" type="__m256d"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m256d"/>
+	<parameter varname="b" type="__m128d"/>
+	<parameter varname="imm8" type="int"/>
+	<description>Copy "a" to "tmp", then insert 128 bits (composed of 2 packed double-precision (64-bit) floating-point elements) from "b" into "tmp" at the location specified by "imm8".  Store "tmp" to "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). 
+	</description>
+	<operation>
+tmp[255:0] := a[255:0]
+CASE (imm8[0]) of
+0: tmp[127:0] := b[127:0]
+1: tmp[255:128] := b[127:0]
+ESAC
+FOR j := 0 to 3
+	i := j*64
+	IF k[j]
+		dst[i+63:i] := tmp[i+63:i]
+	ELSE
+		dst[i+63:i] := src[i+63:i]
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vinsertf64x2"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256d" name="_mm256_maskz_insertf64x2">
+	<type>Floating Point</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512DQ</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m256d"/>
+	<parameter varname="b" type="__m128d"/>
+	<parameter varname="imm8" type="int"/>
+	<description>Copy "a" to "tmp", then insert 128 bits (composed of 2 packed double-precision (64-bit) floating-point elements) from "b" into "tmp" at the location specified by "imm8".  Store "tmp" to "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). 
+	</description>
+	<operation>
+tmp[255:0] := a[255:0]
+CASE (imm8[0]) of
+0: tmp[127:0] := b[127:0]
+1: tmp[255:128] := b[127:0]
+ESAC
+FOR j := 0 to 3
+	i := j*64
+	IF k[j]
+		dst[i+63:i] := tmp[i+63:i]
+	ELSE
+		dst[i+63:i] := 0
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vinsertf64x2"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512d" name="_mm512_insertf64x2">
+	<type>Floating Point</type>
+	<CPUID>AVX512DQ</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="a" type="__m512d"/>
+	<parameter varname="b" type="__m128d"/>
+	<parameter varname="imm8" type="int"/>
+	<description>Copy "a" to "dst", then insert 128 bits (composed of 2 packed double-precision (64-bit) floating-point elements) from "b" into "dst" at the location specified by "imm8".</description>
+	<operation>
+dst[511:0] := a[511:0]
+CASE imm8[1:0] of
+0: dst[127:0] := b[127:0]
+1: dst[255:128] := b[127:0]
+2: dst[383:256] := b[127:0]
+3: dst[511:384] := b[127:0]
+ESAC
+dst[MAX:512] := 0
+	</operation>
+	<instruction name="vinsertf64x2"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512d" name="_mm512_mask_insertf64x2">
+	<type>Floating Point</type>
+	<CPUID>AVX512DQ</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="src" type="__m512d"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m512d"/>
+	<parameter varname="b" type="__m128d"/>
+	<parameter varname="imm8" type="int"/>
+	<description>Copy "a" to "tmp", then insert 128 bits (composed of 2 packed double-precision (64-bit) floating-point elements) from "b" into "tmp" at the location specified by "imm8".  Store "tmp" to "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). 
+	</description>
+	<operation>
+tmp[511:0] := a[511:0]
+CASE (imm8[1:0]) of
+0: tmp[127:0] := b[127:0]
+1: tmp[255:128] := b[127:0]
+2: tmp[383:256] := b[127:0]
+3: tmp[511:384] := b[127:0]
+ESAC
+FOR j := 0 to 7
+	i := j*64
+	IF k[j]
+		dst[i+63:i] := tmp[i+63:i]
+	ELSE
+		dst[i+63:i] := src[i+63:i]
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name="vinsertf64x2"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512d" name="_mm512_maskz_insertf64x2">
+	<type>Floating Point</type>
+	<CPUID>AVX512DQ</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m512d"/>
+	<parameter varname="b" type="__m128d"/>
+	<parameter varname="imm8" type="int"/>
+	<description>Copy "a" to "tmp", then insert 128 bits (composed of 2 packed double-precision (64-bit) floating-point elements) from "b" into "tmp" at the location specified by "imm8".  Store "tmp" to "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). 
+	</description>
+	<operation>
+tmp[511:0] := a[511:0]
+CASE (imm8[1:0]) of
+0: tmp[127:0] := b[127:0]
+1: tmp[255:128] := b[127:0]
+2: tmp[383:256] := b[127:0]
+3: tmp[511:384] := b[127:0]
+ESAC
+FOR j := 0 to 7
+	i := j*64
+	IF k[j]
+		dst[i+63:i] := tmp[i+63:i]
+	ELSE
+		dst[i+63:i] := 0
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name="vinsertf64x2"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256i" name="_mm256_inserti32x4">
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="a" type="__m256i"/>
+	<parameter varname="b" type="__m128i"/>
+	<parameter varname="imm8" type="int"/>
+	<description>Copy "a" to "dst", then insert 128 bits (composed of 4 packed 32-bit integers) from "b" into "dst" at the location specified by "imm8".</description>
+	<operation>
+dst[255:0] := a[255:0]
+CASE (imm8[0]) of
+0: dst[127:0] := b[127:0]
+1: dst[255:128] := b[127:0]
+ESAC
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vinserti32x4"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256i" name="_mm256_mask_inserti32x4">
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="src" type="__m256i"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m256i"/>
+	<parameter varname="b" type="__m128i"/>
+	<parameter varname="imm8" type="int"/>
+	<description>Copy "a" to "tmp", then insert 128 bits (composed of 4 packed 32-bit integers) from "b" into "tmp" at the location specified by "imm8".  Store "tmp" to "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). 
+	</description>
+	<operation>
+tmp[255:0] := a[255:0]
+CASE (imm8[0]) of
+0: tmp[127:0] := b[127:0]
+1: tmp[255:128] := b[127:0]
+ESAC
+FOR j := 0 to 7
+	i := j*32
+	IF k[j]
+		dst[i+31:i] := tmp[i+31:i]
+	ELSE
+		dst[i+31:i] := src[i+31:i]
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vinserti32x4"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256i" name="_mm256_maskz_inserti32x4">
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m256i"/>
+	<parameter varname="b" type="__m128i"/>
+	<parameter varname="imm8" type="int"/>
+	<description>Copy "a" to "tmp", then insert 128 bits (composed of 4 packed 32-bit integers) from "b" into "tmp" at the location specified by "imm8".  Store "tmp" to "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). 
+	</description>
+	<operation>
+tmp[255:0] := a[255:0]
+CASE (imm8[0]) of
+0: tmp[127:0] := b[127:0]
+1: tmp[255:128] := b[127:0]
+ESAC
+FOR j := 0 to 7
+	i := j*32
+	IF k[j]
+		dst[i+31:i] := tmp[i+31:i]
+	ELSE
+		dst[i+31:i] := 0
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vinserti32x4"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512i" name="_mm512_inserti32x8">
+	<CPUID>AVX512DQ</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="a" type="__m512i"/>
+	<parameter varname="b" type="__m256i"/>
+	<parameter varname="imm8" type="int"/>
+	<description>Copy "a" to "dst", then insert 256 bits (composed of 8 packed 32-bit integers) from "b" into "dst" at the location specified by "imm8".</description>
+	<operation>
+dst[511:0] := a[511:0]
+CASE imm8[0] of
+0: dst[255:0] := b[255:0]
+1: dst[511:256] := b[255:0]
+ESAC
+dst[MAX:512] := 0
+	</operation>
+	<instruction name="vinserti32x8"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512i" name="_mm512_mask_inserti32x8">
+	<CPUID>AVX512DQ</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="src" type="__m512i"/>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="a" type="__m512i"/>
+	<parameter varname="b" type="__m256i"/>
+	<parameter varname="imm8" type="int"/>
+	<description>Copy "a" to "tmp", then insert 256 bits (composed of 8 packed 32-bit integers) from "b" into "tmp" at the location specified by "imm8".  Store "tmp" to "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). 
+	</description>
+	<operation>
+tmp[511:0] := a[511:0]
+CASE (imm8[0]) OF
+0: tmp[255:0] := b[255:0]
+1: tmp[511:256] := b[255:0]
+ESAC
+FOR j := 0 to 15
+	i := j*32
+	IF k[j]
+		dst[i+31:i] := tmp[i+31:i]
+	ELSE
+		dst[i+31:i] := src[i+31:i]
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name="vinserti32x8"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512i" name="_mm512_maskz_inserti32x8">
+	<CPUID>AVX512DQ</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="a" type="__m512i"/>
+	<parameter varname="b" type="__m256i"/>
+	<parameter varname="imm8" type="int"/>
+	<description>Copy "a" to "tmp", then insert 256 bits (composed of 8 packed 32-bit integers) from "b" into "tmp" at the location specified by "imm8".  Store "tmp" to "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). 
+	</description>
+	<operation>
+tmp[511:0] := a[511:0]
+CASE (imm8[0]) OF
+0: tmp[255:0] := b[255:0]
+1: tmp[511:256] := b[255:0]
+ESAC
+FOR j := 0 to 15
+	i := j*32
+	IF k[j]
+		dst[i+31:i] := tmp[i+31:i]
+	ELSE
+		dst[i+31:i] := 0
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name="vinserti32x8"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256i" name="_mm256_inserti64x2">
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512DQ</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="a" type="__m256i"/>
+	<parameter varname="b" type="__m128i"/>
+	<parameter varname="imm8" type="int"/>
+	<description>Copy "a" to "dst", then insert 128 bits (composed of 2 packed 64-bit integers) from "b" into "dst" at the location specified by "imm8".</description>
+	<operation>
+dst[255:0] := a[255:0]
+CASE imm8[0] of
+0: dst[127:0] := b[127:0]
+1: dst[255:128] := b[127:0]
+ESAC
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vinserti64x2"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256i" name="_mm256_mask_inserti64x2">
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512DQ</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="src" type="__m256i"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m256i"/>
+	<parameter varname="b" type="__m128i"/>
+	<parameter varname="imm8" type="int"/>
+	<description>Copy "a" to "tmp", then insert 128 bits (composed of 2 packed 64-bit integers) from "b" into "tmp" at the location specified by "imm8".  Store "tmp" to "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). 
+	</description>
+	<operation>
+tmp[255:0] := a[255:0]
+CASE (imm8[0]) of
+0: tmp[127:0] := b[127:0]
+1: tmp[255:128] := b[127:0]
+ESAC
+FOR j := 0 to 3
+	i := j*64
+	IF k[j]
+		dst[i+63:i] := tmp[i+63:i]
+	ELSE
+		dst[i+63:i] := src[i+63:i]
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vinserti64x2"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256i" name="_mm256_maskz_inserti64x2">
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512DQ</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m256i"/>
+	<parameter varname="b" type="__m128i"/>
+	<parameter varname="imm8" type="int"/>
+	<description>Copy "a" to "tmp", then insert 128 bits (composed of 2 packed 64-bit integers) from "b" into "tmp" at the location specified by "imm8".  Store "tmp" to "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). 
+	</description>
+	<operation>
+tmp[255:0] := a[255:0]
+CASE (imm8[0]) of
+0: tmp[127:0] := b[127:0]
+1: tmp[255:128] := b[127:0]
+ESAC
+FOR j := 0 to 3
+	i := j*64
+	IF k[j]
+		dst[i+63:i] := tmp[i+63:i]
+	ELSE
+		dst[i+63:i] := 0
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vinserti64x2"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512i" name="_mm512_inserti64x2">
+	<CPUID>AVX512DQ</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="a" type="__m512i"/>
+	<parameter varname="b" type="__m128i"/>
+	<parameter varname="imm8" type="int"/>
+	<description>Copy "a" to "dst", then insert 128 bits (composed of 2 packed 64-bit integers) from "b" into "dst" at the location specified by "imm8".</description>
+	<operation>
+dst[511:0] := a[511:0]
+CASE imm8[1:0] of
+0: dst[127:0] := b[127:0]
+1: dst[255:128] := b[127:0]
+2: dst[383:256] := b[127:0]
+3: dst[511:384] := b[127:0]
+ESAC
+dst[MAX:512] := 0
+	</operation>
+	<instruction name="vinserti64x2"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512i" name="_mm512_mask_inserti64x2">
+	<CPUID>AVX512DQ</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="src" type="__m512i"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m512i"/>
+	<parameter varname="b" type="__m128i"/>
+	<parameter varname="imm8" type="int"/>
+	<description>Copy "a" to "tmp", then insert 128 bits (composed of 2 packed 64-bit integers) from "b" into "tmp" at the location specified by "imm8".  Store "tmp" to "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). 
+	</description>
+	<operation>
+tmp[511:0] := a[511:0]
+CASE (imm8[1:0]) of
+0: tmp[127:0] := b[127:0]
+1: tmp[255:128] := b[127:0]
+2: tmp[383:256] := b[127:0]
+3: tmp[511:384] := b[127:0]
+ESAC
+FOR j := 0 to 7
+	i := j*64
+	IF k[j]
+		dst[i+63:i] := tmp[i+63:i]
+	ELSE
+		dst[i+63:i] := src[i+63:i]
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name="vinserti64x2"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512i" name="_mm512_maskz_inserti64x2">
+	<CPUID>AVX512DQ</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m512i"/>
+	<parameter varname="b" type="__m128i"/>
+	<parameter varname="imm8" type="int"/>
+	<description>Copy "a" to "tmp", then insert 128 bits (composed of 2 packed 64-bit integers) from "b" into "tmp" at the location specified by "imm8".  Store "tmp" to "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). 
+	</description>
+	<operation>
+tmp[511:0] := a[511:0]
+CASE (imm8[1:0]) of
+0: tmp[127:0] := b[127:0]
+1: tmp[255:128] := b[127:0]
+2: tmp[383:256] := b[127:0]
+3: tmp[511:384] := b[127:0]
+ESAC
+FOR j := 0 to 7
+	i := j*64
+	IF k[j]
+		dst[i+63:i] := tmp[i+63:i]
+	ELSE
+		dst[i+63:i] := 0
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name="vinserti64x2"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256d" name="_mm256_mask_max_pd">
+	<type>Floating Point</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="src" type="__m256d"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m256d"/>
+	<parameter varname="b" type="__m256d"/>
+	<description>Compare packed double-precision (64-bit) floating-point elements in "a" and "b", and store packed maximum values in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 3
+	i := j*64
+	IF k[j]
+		dst[i+63:i] := MAX(a[i+63:i], b[i+63:i])
+	ELSE
+		dst[i+63:i] := src[i+63:i]
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vmaxpd"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256d" name="_mm256_maskz_max_pd">
+	<type>Floating Point</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m256d"/>
+	<parameter varname="b" type="__m256d"/>
+	<description>Compare packed double-precision (64-bit) floating-point elements in "a" and "b", and store packed maximum values in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).
+   </description>
+	<operation>
+FOR j := 0 to 3
+	i := j*64
+	IF k[j]
+		dst[i+63:i] := MAX(a[i+63:i], b[i+63:i])
+	ELSE
+		dst[i+63:i] := 0
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vmaxpd"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128d" name="_mm_mask_max_pd">
+	<type>Floating Point</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="src" type="__m128d"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128d"/>
+	<parameter varname="b" type="__m128d"/>
+	<description>Compare packed double-precision (64-bit) floating-point elements in "a" and "b", and store packed maximum values in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 1
+	i := j*64
+	IF k[j]
+		dst[i+63:i] := MAX(a[i+63:i], b[i+63:i])
+	ELSE
+		dst[i+63:i] := src[i+63:i]
+	FI
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vmaxpd"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128d" name="_mm_maskz_max_pd">
+	<type>Floating Point</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128d"/>
+	<parameter varname="b" type="__m128d"/>
+	<description>Compare packed double-precision (64-bit) floating-point elements in "a" and "b", and store packed maximum values in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).
+   </description>
+	<operation>
+FOR j := 0 to 1
+	i := j*64
+	IF k[j]
+		dst[i+63:i] := MAX(a[i+63:i], b[i+63:i])
+	ELSE
+		dst[i+63:i] := 0
+	FI
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vmaxpd"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256" name="_mm256_mask_max_ps">
+	<type>Floating Point</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="src" type="__m256"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m256"/>
+	<parameter varname="b" type="__m256"/>
+	<description>Compare packed single-precision (32-bit) floating-point elements in "a" and "b", and store packed maximum values in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 7
+	i := j*32
+	IF k[j]
+		dst[i+31:i] := MAX(a[i+31:i], b[i+31:i])
+	ELSE
+		dst[i+31:i] := src[i+31:i]
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vmaxps"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256" name="_mm256_maskz_max_ps">
+	<type>Floating Point</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m256"/>
+	<parameter varname="b" type="__m256"/>
+	<description>Compare packed single-precision (32-bit) floating-point elements in "a" and "b", and store packed maximum values in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).
+   </description>
+	<operation>
+FOR j := 0 to 7
+	i := j*32
+	IF k[j]
+		dst[i+31:i] := MAX(a[i+31:i], b[i+31:i])
+	ELSE
+		dst[i+31:i] := 0
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vmaxps"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128" name="_mm_mask_max_ps">
+	<type>Floating Point</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="src" type="__m128"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128"/>
+	<parameter varname="b" type="__m128"/>
+	<description>Compare packed single-precision (32-bit) floating-point elements in "a" and "b", and store packed maximum values in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 3
+	i := j*32
+	IF k[j]
+		dst[i+31:i] := MAX(a[i+31:i], b[i+31:i])
+	ELSE
+		dst[i+31:i] := src[i+31:i]
+	FI
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vmaxps"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128" name="_mm_maskz_max_ps">
+	<type>Floating Point</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128"/>
+	<parameter varname="b" type="__m128"/>
+	<description>Compare packed single-precision (32-bit) floating-point elements in "a" and "b", and store packed maximum values in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).
+   </description>
+	<operation>
+FOR j := 0 to 3
+	i := j*32
+	IF k[j]
+		dst[i+31:i] := MAX(a[i+31:i], b[i+31:i])
+	ELSE
+		dst[i+31:i] := 0
+	FI
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vmaxps"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256d" name="_mm256_mask_min_pd">
+	<type>Floating Point</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="src" type="__m256d"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m256d"/>
+	<parameter varname="b" type="__m256d"/>
+	<description>Compare packed double-precision (64-bit) floating-point elements in "a" and "b", and store packed minimum values in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 3
+	i := j*64
+	IF k[j]
+		dst[i+63:i] := MIN(a[i+63:i], b[i+63:i])
+	ELSE
+		dst[i+63:i] := src[i+63:i]
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vminpd"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256d" name="_mm256_maskz_min_pd">
+	<type>Floating Point</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m256d"/>
+	<parameter varname="b" type="__m256d"/>
+	<description>Compare packed double-precision (64-bit) floating-point elements in "a" and "b", and store packed minimum values in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).
+   </description>
+	<operation>
+FOR j := 0 to 3
+	i := j*64
+	IF k[j]
+		dst[i+63:i] := MIN(a[i+63:i], b[i+63:i])
+	ELSE
+		dst[i+63:i] := 0
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vminpd"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128d" name="_mm_mask_min_pd">
+	<type>Floating Point</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="src" type="__m128d"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128d"/>
+	<parameter varname="b" type="__m128d"/>
+	<description>Compare packed double-precision (64-bit) floating-point elements in "a" and "b", and store packed minimum values in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 1
+	i := j*64
+	IF k[j]
+		dst[i+63:i] := MIN(a[i+63:i], b[i+63:i])
+	ELSE
+		dst[i+63:i] := src[i+63:i]
+	FI
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vminpd"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128d" name="_mm_maskz_min_pd">
+	<type>Floating Point</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128d"/>
+	<parameter varname="b" type="__m128d"/>
+	<description>Compare packed double-precision (64-bit) floating-point elements in "a" and "b", and store packed minimum values in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).
+   </description>
+	<operation>
+FOR j := 0 to 1
+	i := j*64
+	IF k[j]
+		dst[i+63:i] := MIN(a[i+63:i], b[i+63:i])
+	ELSE
+		dst[i+63:i] := 0
+	FI
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vminpd"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256" name="_mm256_mask_min_ps">
+	<type>Floating Point</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="src" type="__m256"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m256"/>
+	<parameter varname="b" type="__m256"/>
+	<description>Compare packed single-precision (32-bit) floating-point elements in "a" and "b", and store packed minimum values in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 7
+	i := j*32
+	IF k[j]
+		dst[i+31:i] := MIN(a[i+31:i], b[i+31:i])
+	ELSE
+		dst[i+31:i] := src[i+31:i]
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vminps"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256" name="_mm256_maskz_min_ps">
+	<type>Floating Point</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m256"/>
+	<parameter varname="b" type="__m256"/>
+	<description>Compare packed single-precision (32-bit) floating-point elements in "a" and "b", and store packed minimum values in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).
+   </description>
+	<operation>
+FOR j := 0 to 7
+	i := j*32
+	IF k[j]
+		dst[i+31:i] := MIN(a[i+31:i], b[i+31:i])
+	ELSE
+		dst[i+31:i] := 0
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vminps"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128" name="_mm_mask_min_ps">
+	<type>Floating Point</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="src" type="__m128"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128"/>
+	<parameter varname="b" type="__m128"/>
+	<description>Compare packed single-precision (32-bit) floating-point elements in "a" and "b", and store packed minimum values in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 3
+	i := j*32
+	IF k[j]
+		dst[i+31:i] := MIN(a[i+31:i], b[i+31:i])
+	ELSE
+		dst[i+31:i] := src[i+31:i]
+	FI
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vminps"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128" name="_mm_maskz_min_ps">
+	<type>Floating Point</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128"/>
+	<parameter varname="b" type="__m128"/>
+	<description>Compare packed single-precision (32-bit) floating-point elements in "a" and "b", and store packed minimum values in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).
+   </description>
+	<operation>
+FOR j := 0 to 3
+	i := j*32
+	IF k[j]
+		dst[i+31:i] := MIN(a[i+31:i], b[i+31:i])
+	ELSE
+		dst[i+31:i] := 0
+	FI
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vminps"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256d" name="_mm256_mask_load_pd">
+	<type>Floating Point</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Load</category>
+	<parameter varname="src" type="__m256d"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="mem_addr" type="void const*"/>
+	<description>Load packed double-precision (64-bit) floating-point elements from memory into "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). "mem_addr" must be aligned on a 32-byte boundary or a general-protection exception may be generated.</description>
+	<operation>
+FOR j := 0 to 3
+	i := j*64
+	IF k[j]
+		dst[i+63:i] := MEM[mem_addr+i+63:mem_addr+i]
+	ELSE
+		dst[i+63:i] := src[i+63:i]
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vmovapd"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256d" name="_mm256_mask_mov_pd">
+	<type>Floating Point</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Move</category>
+	<parameter varname="src" type="__m256d"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m256d"/>
+	<description>Move packed double-precision (64-bit) floating-point elements from "a" to "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 3
+	i := j*64
+	IF k[j]
+		dst[i+63:i] := a[i+63:i]
+	ELSE
+		dst[i+63:i] := src[i+63:i]
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vmovapd"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="void" name="_mm256_mask_store_pd">
+	<type>Floating Point</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Store</category>
+	<parameter varname="mem_addr" type="void*"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m256d"/>
+	<description>Store packed double-precision (64-bit) floating-point elements from "a" into memory using writemask "k".
+	"mem_addr" must be aligned on a 32-byte boundary or a general-protection exception may be generated.</description>
+	<operation>
+FOR j := 0 to 3
+	i := j*64
+	IF k[j]
+		MEM[mem_addr+i+63:mem_addr+i] := a[i+63:i]
+	FI
+ENDFOR
+	</operation>
+	<instruction name="vmovapd"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256d" name="_mm256_maskz_load_pd">
+	<type>Floating Point</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Load</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="mem_addr" type="void const*"/>
+	<description>Load packed double-precision (64-bit) floating-point elements from memory into "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). "mem_addr" must be aligned on a 32-byte boundary or a general-protection exception may be generated.</description>
+	<operation>
+FOR j := 0 to 3
+	i := j*64
+	IF k[j]
+		dst[i+63:i] := MEM[mem_addr+i+63:mem_addr+i]
+	ELSE
+		dst[i+63:i] := 0
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vmovapd"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256d" name="_mm256_maskz_mov_pd">
+	<type>Floating Point</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Move</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m256d"/>
+	<description>Move packed double-precision (64-bit) floating-point elements from "a" into "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 3
+	i := j*64
+	IF k[j]
+		dst[i+63:i] := a[i+63:i]
+	ELSE
+		dst[i+63:i] := 0
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vmovapd"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128d" name="_mm_mask_load_pd">
+	<type>Floating Point</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Load</category>
+	<parameter varname="src" type="__m128d"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="mem_addr" type="void const*"/>
+	<description>Load packed double-precision (64-bit) floating-point elements from memory into "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). "mem_addr" must be aligned on a 16-byte boundary or a general-protection exception may be generated.</description>
+	<operation>
+FOR j := 0 to 1
+	i := j*64
+	IF k[j]
+		dst[i+63:i] := MEM[mem_addr+i+63:mem_addr+i]
+	ELSE
+		dst[i+63:i] := src[i+63:i]
+	FI
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vmovapd"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128d" name="_mm_mask_mov_pd">
+	<type>Floating Point</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Move</category>
+	<parameter varname="src" type="__m128d"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128d"/>
+	<description>Move packed double-precision (64-bit) floating-point elements from "a" to "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 1
+	i := j*64
+	IF k[j]
+		dst[i+63:i] := a[i+63:i]
+	ELSE
+		dst[i+63:i] := src[i+63:i]
+	FI
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vmovapd"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="void" name="_mm_mask_store_pd">
+	<type>Floating Point</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Store</category>
+	<parameter varname="mem_addr" type="void*"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128d"/>
+	<description>Store packed double-precision (64-bit) floating-point elements from "a" into memory using writemask "k".
+	"mem_addr" must be aligned on a 16-byte boundary or a general-protection exception may be generated.</description>
+	<operation>
+FOR j := 0 to 1
+	i := j*64
+	IF k[j]
+		MEM[mem_addr+i+63:mem_addr+i] := a[i+63:i]
+	FI
+ENDFOR
+	</operation>
+	<instruction name="vmovapd"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128d" name="_mm_maskz_load_pd">
+	<type>Floating Point</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Load</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="mem_addr" type="void const*"/>
+	<description>Load packed double-precision (64-bit) floating-point elements from memory into "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). "mem_addr" must be aligned on a 16-byte boundary or a general-protection exception may be generated.</description>
+	<operation>
+FOR j := 0 to 1
+	i := j*64
+	IF k[j]
+		dst[i+63:i] := MEM[mem_addr+i+63:mem_addr+i]
+	ELSE
+		dst[i+63:i] := 0
+	FI
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vmovapd"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128d" name="_mm_maskz_mov_pd">
+	<type>Floating Point</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Move</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128d"/>
+	<description>Move packed double-precision (64-bit) floating-point elements from "a" into "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 1
+	i := j*64
+	IF k[j]
+		dst[i+63:i] := a[i+63:i]
+	ELSE
+		dst[i+63:i] := 0
+	FI
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vmovapd"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256" name="_mm256_mask_load_ps">
+	<type>Floating Point</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Load</category>
+	<parameter varname="src" type="__m256"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="mem_addr" type="void const*"/>
+	<description>Load packed single-precision (32-bit) floating-point elements from memory into "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). "mem_addr" must be aligned on a 32-byte boundary or a general-protection exception may be generated.</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*32
+	IF k[j]
+		dst[i+31:i] := MEM[mem_addr+i+31:mem_addr+i]
+	ELSE
+		dst[i+31:i] := src[i+31:i]
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vmovaps"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256" name="_mm256_mask_mov_ps">
+	<type>Floating Point</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Move</category>
+	<parameter varname="src" type="__m256"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m256"/>
+	<description>Move packed single-precision (32-bit) floating-point elements from "a" to "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 7
+	i := j*32
+	IF k[j]
+		dst[i+31:i] := a[i+31:i]
+	ELSE
+		dst[i+31:i] := src[i+31:i]
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vmovaps"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="void" name="_mm256_mask_store_ps">
+	<type>Floating Point</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Store</category>
+	<parameter varname="mem_addr" type="void*"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m256"/>
+	<description>Store packed single-precision (32-bit) floating-point elements from "a" into memory using writemask "k".
+	"mem_addr" must be aligned on a 32-byte boundary or a general-protection exception may be generated.</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*32
+	IF k[j]
+		MEM[mem_addr+i+31:mem_addr+i] := a[i+31:i]
+	FI
+ENDFOR
+	</operation>
+	<instruction name="vmovaps"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256" name="_mm256_maskz_load_ps">
+	<type>Floating Point</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Load</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="mem_addr" type="void const*"/>
+	<description>Load packed single-precision (32-bit) floating-point elements from memory into "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). "mem_addr" must be aligned on a 32-byte boundary or a general-protection exception may be generated.</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*32
+	IF k[j]
+		dst[i+31:i] := MEM[mem_addr+i+31:mem_addr+i]
+	ELSE
+		dst[i+31:i] := 0
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vmovaps"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256" name="_mm256_maskz_mov_ps">
+	<type>Floating Point</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Move</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m256"/>
+	<description>Move packed single-precision (32-bit) floating-point elements from "a" into "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*32
+	IF k[j]
+		dst[i+31:i] := a[i+31:i]
+	ELSE
+		dst[i+31:i] := 0
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vmovaps"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128" name="_mm_mask_load_ps">
+	<type>Floating Point</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Load</category>
+	<parameter varname="src" type="__m128"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="mem_addr" type="void const*"/>
+	<description>Load packed single-precision (32-bit) floating-point elements from memory into "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). "mem_addr" must be aligned on a 16-byte boundary or a general-protection exception may be generated.</description>
+	<operation>
+FOR j := 0 to 3
+	i := j*32
+	IF k[j]
+		dst[i+31:i] := MEM[mem_addr+i+31:mem_addr+i]
+	ELSE
+		dst[i+31:i] := src[i+31:i]
+	FI
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vmovaps"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128" name="_mm_mask_mov_ps">
+	<type>Floating Point</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Move</category>
+	<parameter varname="src" type="__m128"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128"/>
+	<description>Move packed single-precision (32-bit) floating-point elements from "a" to "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 3
+	i := j*32
+	IF k[j]
+		dst[i+31:i] := a[i+31:i]
+	ELSE
+		dst[i+31:i] := src[i+31:i]
+	FI
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vmovaps"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="void" name="_mm_mask_store_ps">
+	<type>Floating Point</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Store</category>
+	<parameter varname="mem_addr" type="void*"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128"/>
+	<description>Store packed single-precision (32-bit) floating-point elements from "a" into memory using writemask "k".
+	"mem_addr" must be aligned on a 16-byte boundary or a general-protection exception may be generated.</description>
+	<operation>
+FOR j := 0 to 3
+	i := j*32
+	IF k[j]
+		MEM[mem_addr+i+31:mem_addr+i] := a[i+31:i]
+	FI
+ENDFOR
+	</operation>
+	<instruction name="vmovaps"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128" name="_mm_maskz_load_ps">
+	<type>Floating Point</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Load</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="mem_addr" type="void const*"/>
+	<description>Load packed single-precision (32-bit) floating-point elements from memory into "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). "mem_addr" must be aligned on a 16-byte boundary or a general-protection exception may be generated.</description>
+	<operation>
+FOR j := 0 to 3
+	i := j*32
+	IF k[j]
+		dst[i+31:i] := MEM[mem_addr+i+31:mem_addr+i]
+	ELSE
+		dst[i+31:i] := 0
+	FI
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vmovaps"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128" name="_mm_maskz_mov_ps">
+	<type>Floating Point</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Move</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128"/>
+	<description>Move packed single-precision (32-bit) floating-point elements from "a" into "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 3
+	i := j*32
+	IF k[j]
+		dst[i+31:i] := a[i+31:i]
+	ELSE
+		dst[i+31:i] := 0
+	FI
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vmovaps"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256d" name="_mm256_mask_movedup_pd">
+	<type>Floating Point</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Move</category>
+	<parameter varname="src" type="__m256d"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m256d"/>
+	<description>Duplicate even-indexed double-precision (64-bit) floating-point elements from "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+	<operation>
+tmp[63:0] := a[63:0]
+tmp[127:64] := a[63:0]
+tmp[191:128] := a[191:128]
+tmp[255:192] := a[191:128]
+FOR j := 0 to 3
+	i := j*64
+	IF k[j]
+		dst[i+63:i] := tmp[i+63:i]
+	ELSE
+		dst[i+63:i] := src[i+63:i]
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vmovddup"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256d" name="_mm256_maskz_movedup_pd">
+	<type>Floating Point</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Move</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m256d"/>
+	<description>Duplicate even-indexed double-precision (64-bit) floating-point elements from "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). </description>
+	<operation>
+tmp[63:0] := a[63:0]
+tmp[127:64] := a[63:0]
+tmp[191:128] := a[191:128]
+tmp[255:192] := a[191:128]
+FOR j := 0 to 3
+	i := j*64
+	IF k[j]
+		dst[i+63:i] := tmp[i+63:i]
+	ELSE
+		dst[i+63:i] := 0
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vmovddup"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128d" name="_mm_mask_movedup_pd">
+	<type>Floating Point</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Move</category>
+	<parameter varname="src" type="__m128d"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128d"/>
+	<description>Duplicate even-indexed double-precision (64-bit) floating-point elements from "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+	<operation>
+tmp[63:0] := a[63:0]
+tmp[127:64] := a[63:0]
+FOR j := 0 to 1
+	i := j*64
+	IF k[j]
+		dst[i+63:i] := tmp[i+63:i]
+	ELSE
+		dst[i+63:i] := src[i+63:i]
+	FI
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vmovddup"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128d" name="_mm_maskz_movedup_pd">
+	<type>Floating Point</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Move</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128d"/>
+	<description>Duplicate even-indexed double-precision (64-bit) floating-point elements from "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). </description>
+	<operation>
+tmp[63:0] := a[63:0]
+tmp[127:64] := a[63:0]
+FOR j := 0 to 1
+	i := j*64
+	IF k[j]
+		dst[i+63:i] := tmp[i+63:i]
+	ELSE
+		dst[i+63:i] := 0
+	FI
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vmovddup"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256i" name="_mm256_mask_load_epi32">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Load</category>
+	<parameter varname="src" type="__m256i"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="mem_addr" type="void const*"/>
+	<description>Load packed 32-bit integers from memory into "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). 
+	"mem_addr" must be aligned on a 32-byte boundary or a general-protection exception may be generated.</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*32
+	IF k[j]
+		dst[i+31:i] := MEM[mem_addr+i+31:mem_addr+i]
+	ELSE
+		dst[i+31:i] := src[i+31:i]
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vmovdqa32"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256i" name="_mm256_mask_mov_epi32">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Move</category>
+	<parameter varname="src" type="__m256i"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m256i"/>
+	<description>Move packed 32-bit integers from "a" to "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 7
+	i := j*32
+	IF k[j]
+		dst[i+31:i] := a[i+31:i]
+	ELSE
+		dst[i+31:i] := src[i+31:i]
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vmovdqa32"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="void" name="_mm256_mask_store_epi32">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Store</category>
+	<parameter varname="mem_addr" type="void*"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m256i"/>
+	<description>Store packed 32-bit integers from "a" into memory using writemask "k".
+	"mem_addr" must be aligned on a 32-byte boundary or a general-protection exception may be generated.
+	</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*32
+	IF k[j]
+		MEM[mem_addr+i+31:mem_addr+i] := a[i+31:i]
+	FI
+ENDFOR
+	</operation>
+	<instruction name="vmovdqa32"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256i" name="_mm256_maskz_load_epi32">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Load</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="mem_addr" type="void const*"/>
+	<description>Load packed 32-bit integers from memory into "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). 
+	"mem_addr" must be aligned on a 32-byte boundary or a general-protection exception may be generated.</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*32
+	IF k[j]
+		dst[i+31:i] := MEM[mem_addr+i+31:mem_addr+i]
+	ELSE
+		dst[i+31:i] := 0
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vmovdqa32"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256i" name="_mm256_maskz_mov_epi32">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Move</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m256i"/>
+	<description>Move packed 32-bit integers from "a" into "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*32
+	IF k[j]
+		dst[i+31:i] := a[i+31:i]
+	ELSE
+		dst[i+31:i] := 0
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vmovdqa32"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm_mask_load_epi32">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Load</category>
+	<parameter varname="src" type="__m128i"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="mem_addr" type="void const*"/>
+	<description>Load packed 32-bit integers from memory into "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). 
+	"mem_addr" must be aligned on a 16-byte boundary or a general-protection exception may be generated.</description>
+	<operation>
+FOR j := 0 to 3
+	i := j*32
+	IF k[j]
+		dst[i+31:i] := MEM[mem_addr+i+31:mem_addr+i]
+	ELSE
+		dst[i+31:i] := src[i+31:i]
+	FI
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vmovdqa32"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm_mask_mov_epi32">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Move</category>
+	<parameter varname="src" type="__m128i"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128i"/>
+	<description>Move packed 32-bit integers from "a" to "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 3
+	i := j*32
+	IF k[j]
+		dst[i+31:i] := a[i+31:i]
+	ELSE
+		dst[i+31:i] := src[i+31:i]
+	FI
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vmovdqa32"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="void" name="_mm_mask_store_epi32">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Store</category>
+	<parameter varname="mem_addr" type="void*"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128i"/>
+	<description>Store packed 32-bit integers from "a" into memory using writemask "k".
+	"mem_addr" must be aligned on a 16-byte boundary or a general-protection exception may be generated.
+	</description>
+	<operation>
+FOR j := 0 to 3
+	i := j*32
+	IF k[j]
+		MEM[mem_addr+i+31:mem_addr+i] := a[i+31:i]
+	FI
+ENDFOR
+	</operation>
+	<instruction name="vmovdqa32"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm_maskz_load_epi32">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Load</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="mem_addr" type="void const*"/>
+	<description>Load packed 32-bit integers from memory into "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). 
+	"mem_addr" must be aligned on a 16-byte boundary or a general-protection exception may be generated.</description>
+	<operation>
+FOR j := 0 to 3
+	i := j*32
+	IF k[j]
+		dst[i+31:i] := MEM[mem_addr+i+31:mem_addr+i]
+	ELSE
+		dst[i+31:i] := 0
+	FI
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vmovdqa32"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm_maskz_mov_epi32">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Move</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128i"/>
+	<description>Move packed 32-bit integers from "a" into "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 3
+	i := j*32
+	IF k[j]
+		dst[i+31:i] := a[i+31:i]
+	ELSE
+		dst[i+31:i] := 0
+	FI
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vmovdqa32"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256i" name="_mm256_mask_load_epi64">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Load</category>
+	<parameter varname="src" type="__m256i"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="mem_addr" type="void const*"/>
+	<description>Load packed 64-bit integers from memory into "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). 
+	"mem_addr" must be aligned on a 32-byte boundary or a general-protection exception may be generated.</description>
+	<operation>
+FOR j := 0 to 3
+	i := j*64
+	IF k[j]
+		dst[i+63:i] := MEM[mem_addr+i+63:mem_addr+i]
+	ELSE
+		dst[i+63:i] := src[i+63:i]
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vmovdqa64"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256i" name="_mm256_mask_mov_epi64">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Move</category>
+	<parameter varname="src" type="__m256i"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m256i"/>
+	<description>Move packed 64-bit integers from "a" to "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 3
+	i := j*64
+	IF k[j]
+		dst[i+63:i] := a[i+63:i]
+	ELSE
+		dst[i+63:i] := src[i+63:i]
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vmovdqa64"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="void" name="_mm256_mask_store_epi64">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Store</category>
+	<parameter varname="mem_addr" type="void*"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m256i"/>
+	<description>Store packed 64-bit integers from "a" into memory using writemask "k".
+	"mem_addr" must be aligned on a 32-byte boundary or a general-protection exception may be generated.
+	</description>
+	<operation>
+FOR j := 0 to 3
+	i := j*64
+	IF k[j]
+		MEM[mem_addr+i+63:mem_addr+i] := a[i+63:i]
+	FI
+ENDFOR
+	</operation>
+	<instruction name="vmovdqa64"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256i" name="_mm256_maskz_load_epi64">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Load</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="mem_addr" type="void const*"/>
+	<description>Load packed 64-bit integers from memory into "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). 
+	"mem_addr" must be aligned on a 32-byte boundary or a general-protection exception may be generated.
+	</description>
+	<operation>
+FOR j := 0 to 3
+	i := j*64
+	IF k[j]
+		dst[i+63:i] := MEM[mem_addr+i+63:mem_addr+i]
+	ELSE
+		dst[i+63:i] := 0
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vmovdqa64"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256i" name="_mm256_maskz_mov_epi64">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Move</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m256i"/>
+	<description>Move packed 64-bit integers from "a" into "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 3
+	i := j*64
+	IF k[j]
+		dst[i+63:i] := a[i+63:i]
+	ELSE
+		dst[i+63:i] := 0
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vmovdqa64"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm_mask_load_epi64">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Load</category>
+	<parameter varname="src" type="__m128i"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="mem_addr" type="void const*"/>
+	<description>Load packed 64-bit integers from memory into "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). 
+	"mem_addr" must be aligned on a 16-byte boundary or a general-protection exception may be generated.</description>
+	<operation>
+FOR j := 0 to 1
+	i := j*64
+	IF k[j]
+		dst[i+63:i] := MEM[mem_addr+i+63:mem_addr+i]
+	ELSE
+		dst[i+63:i] := src[i+63:i]
+	FI
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vmovdqa64"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm_mask_mov_epi64">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Move</category>
+	<parameter varname="src" type="__m128i"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128i"/>
+	<description>Move packed 64-bit integers from "a" to "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 1
+	i := j*64
+	IF k[j]
+		dst[i+63:i] := a[i+63:i]
+	ELSE
+		dst[i+63:i] := src[i+63:i]
+	FI
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vmovdqa64"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="void" name="_mm_mask_store_epi64">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Store</category>
+	<parameter varname="mem_addr" type="void*"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128i"/>
+	<description>Store packed 64-bit integers from "a" into memory using writemask "k".
+	"mem_addr" must be aligned on a 16-byte boundary or a general-protection exception may be generated.
+	</description>
+	<operation>
+FOR j := 0 to 1
+	i := j*64
+	IF k[j]
+		MEM[mem_addr+i+63:mem_addr+i] := a[i+63:i]
+	FI
+ENDFOR
+	</operation>
+	<instruction name="vmovdqa64"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm_maskz_load_epi64">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Load</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="mem_addr" type="void const*"/>
+	<description>Load packed 64-bit integers from memory into "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). 
+	"mem_addr" must be aligned on a 16-byte boundary or a general-protection exception may be generated.
+	</description>
+	<operation>
+FOR j := 0 to 1
+	i := j*64
+	IF k[j]
+		dst[i+63:i] := MEM[mem_addr+i+63:mem_addr+i]
+	ELSE
+		dst[i+63:i] := 0
+	FI
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vmovdqa64"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm_maskz_mov_epi64">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Move</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128i"/>
+	<description>Move packed 64-bit integers from "a" into "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 1
+	i := j*64
+	IF k[j]
+		dst[i+63:i] := a[i+63:i]
+	ELSE
+		dst[i+63:i] := 0
+	FI
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vmovdqa64"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256i" name="_mm256_mask_loadu_epi16">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512BW</CPUID>
+	<category>Load</category>
+	<parameter varname="src" type="__m256i"/>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="mem_addr" type="void const*"/>
+	<description>Load packed 16-bit integers from memory into "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). 
+	"mem_addr" does not need to be aligned on any particular boundary.
+	</description>
+	<operation>
+FOR j := 0 to 15
+	i := j*16
+	IF k[j]
+		dst[i+15:i] := MEM[mem_addr+i+15:mem_addr+i]
+	ELSE
+		dst[i+15:i] := src[i+15:i]
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vmovdqu16"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256i" name="_mm256_mask_mov_epi16">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512BW</CPUID>
+	<category>Move</category>
+	<parameter varname="src" type="__m256i"/>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="a" type="__m256i"/>
+	<description>Move packed 16-bit integers from "a" into "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 15
+	i := j*16
+	IF k[j]
+		dst[i+15:i] := a[i+15:i]
+	ELSE
+		dst[i+15:i] := src[i+15:i]
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vmovdqu16"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="void" name="_mm256_mask_storeu_epi16">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512BW</CPUID>
+	<category>Store</category>
+	<parameter varname="mem_addr" type="void*"/>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="a" type="__m256i"/>
+	<description>Store packed 16-bit integers from "a" into memory using writemask "k".
+	"mem_addr" does not need to be aligned on any particular boundary.
+	</description>
+	<operation>
+FOR j := 0 to 15
+	i := j*16
+	IF k[j]
+		MEM[mem_addr+i+15:mem_addr+i] := a[i+15:i]
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vmovdqu16"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256i" name="_mm256_maskz_loadu_epi16">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512BW</CPUID>
+	<category>Load</category>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="mem_addr" type="void const*"/>
+	<description>Load packed 16-bit integers from memory into "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).
+	"mem_addr" does not need to be aligned on any particular boundary.
+	</description>
+	<operation>
+FOR j := 0 to 15
+	i := j*16
+	IF k[j]
+		dst[i+15:i] := MEM[mem_addr+i+15:mem_addr+i]
+	ELSE
+		dst[i+15:i] := 0
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vmovdqu16"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256i" name="_mm256_maskz_mov_epi16">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512BW</CPUID>
+	<category>Move</category>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="a" type="__m256i"/>
+	<description>Move packed 16-bit integers from "a" into "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).
+	</description>
+	<operation>
+FOR j := 0 to 15
+	i := j*16
+	IF k[j]
+		dst[i+15:i] := a[i+15:i]
+	ELSE
+		dst[i+15:i] := 0
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vmovdqu16"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512i" name="_mm512_mask_loadu_epi16">
+	<type>Integer</type>
+	<CPUID>AVX512BW</CPUID>
+	<category>Load</category>
+	<parameter varname="src" type="__m512i"/>
+	<parameter varname="k" type="__mmask32"/>
+	<parameter varname="mem_addr" type="void const*"/>
+	<description>Load packed 16-bit integers from memory into "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). 
+	"mem_addr" does not need to be aligned on any particular boundary.
+	</description>
+	<operation>
+FOR j := 0 to 31
+	i := j*16
+	IF k[j]
+		dst[i+15:i] := MEM[mem_addr+i+15:mem_addr+i]
+	ELSE
+		dst[i+15:i] := src[i+15:i]
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name="vmovdqu16"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512i" name="_mm512_mask_mov_epi16">
+	<type>Integer</type>
+	<CPUID>AVX512BW</CPUID>
+	<category>Move</category>
+	<parameter varname="src" type="__m512i"/>
+	<parameter varname="k" type="__mmask32"/>
+	<parameter varname="a" type="__m512i"/>
+	<description>Move packed 16-bit integers from "a" into "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 31
+	i := j*16
+	IF k[j]
+		dst[i+15:i] := a[i+15:i]
+	ELSE
+		dst[i+15:i] := src[i+15:i]
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name="vmovdqu16"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="void" name="_mm512_mask_storeu_epi16">
+	<type>Integer</type>
+	<CPUID>AVX512BW</CPUID>
+	<category>Store</category>
+	<parameter varname="mem_addr" type="void*"/>
+	<parameter varname="k" type="__mmask32"/>
+	<parameter varname="a" type="__m512i"/>
+	<description>Store packed 16-bit integers from "a" into memory using writemask "k".
+	"mem_addr" does not need to be aligned on any particular boundary.
+	</description>
+	<operation>
+FOR j := 0 to 31
+	i := j*16
+	IF k[j]
+		MEM[mem_addr+i+15:mem_addr+i] := a[i+15:i]
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name="vmovdqu16"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512i" name="_mm512_maskz_loadu_epi16">
+	<type>Integer</type>
+	<CPUID>AVX512BW</CPUID>
+	<category>Load</category>
+	<parameter varname="k" type="__mmask32"/>
+	<parameter varname="mem_addr" type="void const*"/>
+	<description>Load packed 16-bit integers from memory into "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).
+	"mem_addr" does not need to be aligned on any particular boundary.
+	</description>
+	<operation>
+FOR j := 0 to 31
+	i := j*16
+	IF k[j]
+		dst[i+15:i] := MEM[mem_addr+i+15:mem_addr+i]
+	ELSE
+		dst[i+15:i] := 0
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name="vmovdqu16"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512i" name="_mm512_maskz_mov_epi16">
+	<type>Integer</type>
+	<CPUID>AVX512BW</CPUID>
+	<category>Move</category>
+	<parameter varname="k" type="__mmask32"/>
+	<parameter varname="a" type="__m512i"/>
+	<description>Move packed 16-bit integers from "a" into "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).
+	</description>
+	<operation>
+FOR j := 0 to 31
+	i := j*16
+	IF k[j]
+		dst[i+15:i] := a[i+15:i]
+	ELSE
+		dst[i+15:i] := 0
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name="vmovdqu16"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm_mask_loadu_epi16">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512BW</CPUID>
+	<category>Load</category>
+	<parameter varname="src" type="__m128i"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="mem_addr" type="void const*"/>
+	<description>Load packed 16-bit integers from memory into "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). 
+	"mem_addr" does not need to be aligned on any particular boundary.
+	</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*16
+	IF k[j]
+		dst[i+15:i] := MEM[mem_addr+i+15:mem_addr+i]
+	ELSE
+		dst[i+15:i] := src[i+15:i]
+	FI
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vmovdqu16"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm_mask_mov_epi16">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512BW</CPUID>
+	<category>Move</category>
+	<parameter varname="src" type="__m128i"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128i"/>
+	<description>Move packed 16-bit integers from "a" into "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 7
+	i := j*16
+	IF k[j]
+		dst[i+15:i] := a[i+15:i]
+	ELSE
+		dst[i+15:i] := src[i+15:i]
+	FI
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vmovdqu16"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="void" name="_mm_mask_storeu_epi16">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512BW</CPUID>
+	<category>Store</category>
+	<parameter varname="mem_addr" type="void*"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128i"/>
+	<description>Store packed 16-bit integers from "a" into memory using writemask "k".
+	"mem_addr" does not need to be aligned on any particular boundary.
+	</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*16
+	IF k[j]
+		MEM[mem_addr+i+15:mem_addr+i] := a[i+15:i]
+	FI
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vmovdqu16"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm_maskz_loadu_epi16">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512BW</CPUID>
+	<category>Load</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="mem_addr" type="void const*"/>
+	<description>Load packed 16-bit integers from memory into "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).
+	"mem_addr" does not need to be aligned on any particular boundary.
+	</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*16
+	IF k[j]
+		dst[i+15:i] := MEM[mem_addr+i+15:mem_addr+i]
+	ELSE
+		dst[i+15:i] := 0
+	FI
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vmovdqu16"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm_maskz_mov_epi16">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512BW</CPUID>
+	<category>Move</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128i"/>
+	<description>Move packed 16-bit integers from "a" into "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).
+	</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*16
+	IF k[j]
+		dst[i+15:i] := a[i+15:i]
+	ELSE
+		dst[i+15:i] := 0
+	FI
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vmovdqu16"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256i" name="_mm256_mask_loadu_epi32">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Load</category>
+	<parameter varname="src" type="__m256i"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="mem_addr" type="void const*"/>
+	<description>Load packed 32-bit integers from memory into "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). 
+	"mem_addr" does not need to be aligned on any particular boundary.
+	</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*32
+	IF k[j]
+		dst[i+31:i] := MEM[mem_addr+i+31:mem_addr+i]
+	ELSE
+		dst[i+31:i] := src[i+31:i]
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vmovdqu32"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="void" name="_mm256_mask_storeu_epi32">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Store</category>
+	<parameter varname="mem_addr" type="void*"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m256i"/>
+	<description>Store packed 32-bit integers from "a" into memory using writemask "k".
+	"mem_addr" does not need to be aligned on any particular boundary.
+	</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*32
+	IF k[j]
+		MEM[mem_addr+i+31:mem_addr+i] := a[i+31:i]
+	FI
+ENDFOR
+	</operation>
+	<instruction name="vmovdqu32"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256i" name="_mm256_maskz_loadu_epi32">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Load</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="mem_addr" type="void const*"/>
+	<description>Load packed 32-bit integers from memory into "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).
+	"mem_addr" does not need to be aligned on any particular boundary.
+	</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*32
+	IF k[j]
+		dst[i+31:i] := MEM[mem_addr+i+31:mem_addr+i]
+	ELSE
+		dst[i+31:i] := 0
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vmovdqu32"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm_mask_loadu_epi32">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Load</category>
+	<parameter varname="src" type="__m128i"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="mem_addr" type="void const*"/>
+	<description>Load packed 32-bit integers from memory into "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). 
+	"mem_addr" does not need to be aligned on any particular boundary.
+	</description>
+	<operation>
+FOR j := 0 to 3
+	i := j*32
+	IF k[j]
+		dst[i+31:i] := MEM[mem_addr+i+31:mem_addr+i]
+	ELSE
+		dst[i+31:i] := src[i+31:i]
+	FI
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vmovdqu32"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="void" name="_mm_mask_storeu_epi32">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Store</category>
+	<parameter varname="mem_addr" type="void*"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128i"/>
+	<description>Store packed 32-bit integers from "a" into memory using writemask "k".
+	"mem_addr" does not need to be aligned on any particular boundary.
+	</description>
+	<operation>
+FOR j := 0 to 3
+	i := j*32
+	IF k[j]
+		MEM[mem_addr+i+31:mem_addr+i] := a[i+31:i]
+	FI
+ENDFOR
+	</operation>
+	<instruction name="vmovdqu32"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm_maskz_loadu_epi32">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Load</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="mem_addr" type="void const*"/>
+	<description>Load packed 32-bit integers from memory into "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).
+	"mem_addr" does not need to be aligned on any particular boundary.
+	</description>
+	<operation>
+FOR j := 0 to 3
+	i := j*32
+	IF k[j]
+		dst[i+31:i] := MEM[mem_addr+i+31:mem_addr+i]
+	ELSE
+		dst[i+31:i] := 0
+	FI
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vmovdqu32"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256i" name="_mm256_mask_loadu_epi64">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Load</category>
+	<parameter varname="src" type="__m256i"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="mem_addr" type="void const*"/>
+	<description>Load packed 64-bit integers from memory into "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). 
+	"mem_addr" does not need to be aligned on any particular boundary.
+	</description>
+	<operation>
+FOR j := 0 to 3
+	i := j*64
+	IF k[j]
+		dst[i+63:i] := MEM[mem_addr+i+63:mem_addr+i]
+	ELSE
+		dst[i+63:i] := src[i+63:i]
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vmovdqu64"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="void" name="_mm256_mask_storeu_epi64">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Store</category>
+	<parameter varname="mem_addr" type="void*"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m256i"/>
+	<description>Store packed 64-bit integers from "a" into memory using writemask "k".
+	"mem_addr" does not need to be aligned on any particular boundary.
+	</description>
+	<operation>
+FOR j := 0 to 3
+	i := j*64
+	IF k[j]
+		MEM[mem_addr+i+63:mem_addr+i] := a[i+63:i]
+	FI
+ENDFOR
+	</operation>
+	<instruction name="vmovdqu64"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256i" name="_mm256_maskz_loadu_epi64">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Load</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="mem_addr" type="void const*"/>
+	<description>Load packed 64-bit integers from memory into "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).
+	"mem_addr" does not need to be aligned on any particular boundary.
+	</description>
+	<operation>
+FOR j := 0 to 3
+	i := j*64
+	IF k[j]
+		dst[i+63:i] := MEM[mem_addr+i+63:mem_addr+i]
+	ELSE
+		dst[i+63:i] := 0
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vmovdqu64"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm_mask_loadu_epi64">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Load</category>
+	<parameter varname="src" type="__m128i"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="mem_addr" type="void const*"/>
+	<description>Load packed 64-bit integers from memory into "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). 
+	"mem_addr" does not need to be aligned on any particular boundary.
+	</description>
+	<operation>
+FOR j := 0 to 1
+	i := j*64
+	IF k[j]
+		dst[i+63:i] := MEM[mem_addr+i+63:mem_addr+i]
+	ELSE
+		dst[i+63:i] := src[i+63:i]
+	FI
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vmovdqu64"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="void" name="_mm_mask_storeu_epi64">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Store</category>
+	<parameter varname="mem_addr" type="void*"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128i"/>
+	<description>Store packed 64-bit integers from "a" into memory using writemask "k".
+	"mem_addr" does not need to be aligned on any particular boundary.
+	</description>
+	<operation>
+FOR j := 0 to 1
+	i := j*64
+	IF k[j]
+		MEM[mem_addr+i+63:mem_addr+i] := a[i+63:i]
+	FI
+ENDFOR
+	</operation>
+	<instruction name="vmovdqu64"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm_maskz_loadu_epi64">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Load</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="mem_addr" type="void const*"/>
+	<description>Load packed 64-bit integers from memory into "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).
+	"mem_addr" does not need to be aligned on any particular boundary.
+	</description>
+	<operation>
+FOR j := 0 to 1
+	i := j*64
+	IF k[j]
+		dst[i+63:i] := MEM[mem_addr+i+63:mem_addr+i]
+	ELSE
+		dst[i+63:i] := 0
+	FI
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vmovdqu64"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256i" name="_mm256_mask_loadu_epi8">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512BW</CPUID>
+	<category>Load</category>
+	<parameter varname="src" type="__m256i"/>
+	<parameter varname="k" type="__mmask32"/>
+	<parameter varname="mem_addr" type="void const*"/>
+	<description>Load packed 8-bit integers from memory into "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). 
+	"mem_addr" does not need to be aligned on any particular boundary.
+	</description>
+	<operation>
+FOR j := 0 to 31
+	i := j*8
+	IF k[j]
+		dst[i+7:i] := MEM[mem_addr+i+7:mem_addr+i]
+	ELSE
+		dst[i+7:i] := src[i+7:i]
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vmovdqu8"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256i" name="_mm256_mask_mov_epi8">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512BW</CPUID>
+	<category>Move</category>
+	<parameter varname="src" type="__m256i"/>
+	<parameter varname="k" type="__mmask32"/>
+	<parameter varname="a" type="__m256i"/>
+	<description>Move packed 8-bit integers from "a" into "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 31
+	i := j*8
+	IF k[j]
+		dst[i+7:i] := a[i+7:i]
+	ELSE
+		dst[i+7:i] := src[i+7:i]
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vmovdqu8"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="void" name="_mm256_mask_storeu_epi8">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512BW</CPUID>
+	<category>Store</category>
+	<parameter varname="mem_addr" type="void*"/>
+	<parameter varname="k" type="__mmask32"/>
+	<parameter varname="a" type="__m256i"/>
+	<description>Store packed 8-bit integers from "a" into memory using writemask "k".
+	"mem_addr" does not need to be aligned on any particular boundary.
+	</description>
+	<operation>
+FOR j := 0 to 31
+	i := j*8
+	IF k[j]
+		MEM[mem_addr+i+7:mem_addr+i] := a[i+7:i]
+	FI
+ENDFOR
+	</operation>
+	<instruction name="vmovdqu8"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256i" name="_mm256_maskz_loadu_epi8">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512BW</CPUID>
+	<category>Load</category>
+	<parameter varname="k" type="__mmask32"/>
+	<parameter varname="mem_addr" type="void const*"/>
+	<description>Load packed 8-bit integers from memory into "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).
+	"mem_addr" does not need to be aligned on any particular boundary.
+	</description>
+	<operation>
+FOR j := 0 to 31
+	i := j*8
+	IF k[j]
+		dst[i+7:i] := MEM[mem_addr+i+7:mem_addr+i]
+	ELSE
+		dst[i+7:i] := 0
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vmovdqu8"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256i" name="_mm256_maskz_mov_epi8">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512BW</CPUID>
+	<category>Move</category>
+	<parameter varname="k" type="__mmask32"/>
+	<parameter varname="a" type="__m256i"/>
+	<description>Move packed 8-bit integers from "a" into "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).
+	</description>
+	<operation>
+FOR j := 0 to 31
+	i := j*8
+	IF k[j]
+		dst[i+7:i] := a[i+7:i]
+	ELSE
+		dst[i+7:i] := 0
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vmovdqu8"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512i" name="_mm512_mask_loadu_epi8">
+	<type>Integer</type>
+	<CPUID>AVX512BW</CPUID>
+	<category>Load</category>
+	<parameter varname="src" type="__m512i"/>
+	<parameter varname="k" type="__mmask64"/>
+	<parameter varname="mem_addr" type="void const*"/>
+	<description>Load packed 8-bit integers from memory into "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). 
+	"mem_addr" does not need to be aligned on any particular boundary.
+	</description>
+	<operation>
+FOR j := 0 to 63
+	i := j*8
+	IF k[j]
+		dst[i+7:i] := MEM[mem_addr+i+7:mem_addr+i]
+	ELSE
+		dst[i+7:i] := src[i+7:i]
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name="vmovdqu8"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512i" name="_mm512_mask_mov_epi8">
+	<type>Integer</type>
+	<CPUID>AVX512BW</CPUID>
+	<category>Move</category>
+	<parameter varname="src" type="__m512i"/>
+	<parameter varname="k" type="__mmask64"/>
+	<parameter varname="a" type="__m512i"/>
+	<description>Move packed 8-bit integers from "a" into "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 63
+	i := j*8
+	IF k[j]
+		dst[i+7:i] := a[i+7:i]
+	ELSE
+		dst[i+7:i] := src[i+7:i]
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name="vmovdqu8"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="void" name="_mm512_mask_storeu_epi8">
+	<type>Integer</type>
+	<CPUID>AVX512BW</CPUID>
+	<category>Store</category>
+	<parameter varname="mem_addr" type="void*"/>
+	<parameter varname="k" type="__mmask64"/>
+	<parameter varname="a" type="__m512i"/>
+	<description>Store packed 8-bit integers from "a" into memory using writemask "k".
+	"mem_addr" does not need to be aligned on any particular boundary.
+	</description>
+	<operation>
+FOR j := 0 to 63
+	i := j*8
+	IF k[j]
+		MEM[mem_addr+i+7:mem_addr+i] := a[i+7:i]
+	FI
+ENDFOR
+	</operation>
+	<instruction name="vmovdqu8"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512i" name="_mm512_maskz_loadu_epi8">
+	<type>Integer</type>
+	<CPUID>AVX512BW</CPUID>
+	<category>Load</category>
+	<parameter varname="k" type="__mmask64"/>
+	<parameter varname="mem_addr" type="void const*"/>
+	<description>Load packed 8-bit integers from memory into "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).
+	"mem_addr" does not need to be aligned on any particular boundary.
+	</description>
+	<operation>
+FOR j := 0 to 63
+	i := j*8
+	IF k[j]
+		dst[i+7:i] := MEM[mem_addr+i+7:mem_addr+i]
+	ELSE
+		dst[i+7:i] := 0
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name="vmovdqu8"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512i" name="_mm512_maskz_mov_epi8">
+	<type>Integer</type>
+	<CPUID>AVX512BW</CPUID>
+	<category>Move</category>
+	<parameter varname="k" type="__mmask64"/>
+	<parameter varname="a" type="__m512i"/>
+	<description>Move packed 8-bit integers from "a" into "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).
+	</description>
+	<operation>
+FOR j := 0 to 63
+	i := j*8
+	IF k[j]
+		dst[i+7:i] := a[i+7:i]
+	ELSE
+		dst[i+7:i] := 0
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name="vmovdqu8"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm_mask_loadu_epi8">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512BW</CPUID>
+	<category>Load</category>
+	<parameter varname="src" type="__m128i"/>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="mem_addr" type="void const*"/>
+	<description>Load packed 8-bit integers from memory into "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). 
+	"mem_addr" does not need to be aligned on any particular boundary.
+	</description>
+	<operation>
+FOR j := 0 to 15
+	i := j*8
+	IF k[j]
+		dst[i+7:i] := MEM[mem_addr+i+7:mem_addr+i]
+	ELSE
+		dst[i+7:i] := src[i+7:i]
+	FI
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vmovdqu8"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm_mask_mov_epi8">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512BW</CPUID>
+	<category>Move</category>
+	<parameter varname="src" type="__m128i"/>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="a" type="__m128i"/>
+	<description>Move packed 8-bit integers from "a" into "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 15
+	i := j*8
+	IF k[j]
+		dst[i+7:i] := a[i+7:i]
+	ELSE
+		dst[i+7:i] := src[i+7:i]
+	FI
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vmovdqu8"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="void" name="_mm_mask_storeu_epi8">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512BW</CPUID>
+	<category>Store</category>
+	<parameter varname="mem_addr" type="void*"/>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="a" type="__m128i"/>
+	<description>Store packed 8-bit integers from "a" into memory using writemask "k".
+	"mem_addr" does not need to be aligned on any particular boundary.
+	</description>
+	<operation>
+FOR j := 0 to 15
+	i := j*8
+	IF k[j]
+		MEM[mem_addr+i+7:mem_addr+i] := a[i+7:i]
+	FI
+ENDFOR
+	</operation>
+	<instruction name="vmovdqu8"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm_maskz_loadu_epi8">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512BW</CPUID>
+	<category>Load</category>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="mem_addr" type="void const*"/>
+	<description>Load packed 8-bit integers from memory into "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).
+	"mem_addr" does not need to be aligned on any particular boundary.
+	</description>
+	<operation>
+FOR j := 0 to 15
+	i := j*8
+	IF k[j]
+		dst[i+7:i] := MEM[mem_addr+i+7:mem_addr+i]
+	ELSE
+		dst[i+7:i] := 0
+	FI
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vmovdqu8"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm_maskz_mov_epi8">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512BW</CPUID>
+	<category>Move</category>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="a" type="__m128i"/>
+	<description>Move packed 8-bit integers from "a" into "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).
+	</description>
+	<operation>
+FOR j := 0 to 15
+	i := j*8
+	IF k[j]
+		dst[i+7:i] := a[i+7:i]
+	ELSE
+		dst[i+7:i] := 0
+	FI
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vmovdqu8"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256" name="_mm256_mask_movehdup_ps">
+	<type>Floating Point</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Move</category>
+	<parameter varname="src" type="__m256"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m256"/>
+	<description>Duplicate odd-indexed single-precision (32-bit) floating-point elements from "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+	<operation>
+tmp[31:0] := a[63:32] 
+tmp[63:32] := a[63:32] 
+tmp[95:64] := a[127:96] 
+tmp[127:96] := a[127:96]
+tmp[159:128] := a[191:160] 
+tmp[191:160] := a[191:160] 
+tmp[223:192] := a[255:224] 
+tmp[255:224] := a[255:224]
+FOR j := 0 to 7
+	i := j*32
+	IF k[j]
+		dst[i+31:i] := tmp[i+31:i]
+	ELSE
+		dst[i+31:i] := src[i+31:i]
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vmovshdup"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256" name="_mm256_maskz_movehdup_ps">
+	<type>Floating Point</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Move</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m256"/>
+	<description>Duplicate odd-indexed single-precision (32-bit) floating-point elements from "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). </description>
+	<operation>
+tmp[31:0] := a[63:32] 
+tmp[63:32] := a[63:32] 
+tmp[95:64] := a[127:96] 
+tmp[127:96] := a[127:96]
+tmp[159:128] := a[191:160] 
+tmp[191:160] := a[191:160] 
+tmp[223:192] := a[255:224] 
+tmp[255:224] := a[255:224]
+FOR j := 0 to 7
+	i := j*32
+	IF k[j]
+		dst[i+31:i] := tmp[i+31:i]
+	ELSE
+		dst[i+31:i] := 0
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vmovshdup"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128" name="_mm_mask_movehdup_ps">
+	<type>Floating Point</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Move</category>
+	<parameter varname="src" type="__m128"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128"/>
+	<description>Duplicate odd-indexed single-precision (32-bit) floating-point elements from "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+	<operation>
+tmp[31:0] := a[63:32] 
+tmp[63:32] := a[63:32] 
+tmp[95:64] := a[127:96] 
+tmp[127:96] := a[127:96]
+FOR j := 0 to 3
+	i := j*32
+	IF k[j]
+		dst[i+31:i] := tmp[i+31:i]
+	ELSE
+		dst[i+31:i] := src[i+31:i]
+	FI
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vmovshdup"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128" name="_mm_maskz_movehdup_ps">
+	<type>Floating Point</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Move</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128"/>
+	<description>Duplicate odd-indexed single-precision (32-bit) floating-point elements from "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). </description>
+	<operation>
+tmp[31:0] := a[63:32] 
+tmp[63:32] := a[63:32] 
+tmp[95:64] := a[127:96] 
+tmp[127:96] := a[127:96]
+FOR j := 0 to 3
+	i := j*32
+	IF k[j]
+		dst[i+31:i] := tmp[i+31:i]
+	ELSE
+		dst[i+31:i] := 0
+	FI
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vmovshdup"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256" name="_mm256_mask_moveldup_ps">
+	<type>Floating Point</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Move</category>
+	<parameter varname="src" type="__m256"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m256"/>
+	<description>Duplicate even-indexed single-precision (32-bit) floating-point elements from "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+	<operation>
+tmp[31:0] := a[31:0] 
+tmp[63:32] := a[31:0] 
+tmp[95:64] := a[95:64] 
+tmp[127:96] := a[95:64]
+tmp[159:128] := a[159:128] 
+tmp[191:160] := a[159:128] 
+tmp[223:192] := a[223:192] 
+tmp[255:224] := a[223:192]
+FOR j := 0 to 7
+	i := j*32
+	IF k[j]
+		dst[i+31:i] := tmp[i+31:i]
+	ELSE
+		dst[i+31:i] := src[i+31:i]
+	FI
+ENDFOR	
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vmovsldup"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256" name="_mm256_maskz_moveldup_ps">
+	<type>Floating Point</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Move</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m256"/>
+	<description>Duplicate even-indexed single-precision (32-bit) floating-point elements from "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). </description>
+	<operation>
+tmp[31:0] := a[31:0] 
+tmp[63:32] := a[31:0] 
+tmp[95:64] := a[95:64] 
+tmp[127:96] := a[95:64]
+tmp[159:128] := a[159:128] 
+tmp[191:160] := a[159:128] 
+tmp[223:192] := a[223:192] 
+tmp[255:224] := a[223:192]
+FOR j := 0 to 7
+	i := j*32
+	IF k[j]
+		dst[i+31:i] := tmp[i+31:i]
+	ELSE
+		dst[i+31:i] := 0
+	FI
+ENDFOR
+dst[MAX:256] := 0	
+	</operation>
+	<instruction name="vmovsldup"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128" name="_mm_mask_moveldup_ps">
+	<type>Floating Point</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Move</category>
+	<parameter varname="src" type="__m128"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128"/>
+	<description>Duplicate even-indexed single-precision (32-bit) floating-point elements from "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+	<operation>
+tmp[31:0] := a[31:0] 
+tmp[63:32] := a[31:0] 
+tmp[95:64] := a[95:64] 
+tmp[127:96] := a[95:64]
+FOR j := 0 to 3
+	i := j*32
+	IF k[j]
+		dst[i+31:i] := tmp[i+31:i]
+	ELSE
+		dst[i+31:i] := src[i+31:i]
+	FI
+ENDFOR	
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vmovsldup"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128" name="_mm_maskz_moveldup_ps">
+	<type>Floating Point</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Move</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128"/>
+	<description>Duplicate even-indexed single-precision (32-bit) floating-point elements from "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). </description>
+	<operation>
+tmp[31:0] := a[31:0] 
+tmp[63:32] := a[31:0] 
+tmp[95:64] := a[95:64] 
+tmp[127:96] := a[95:64]
+FOR j := 0 to 3
+	i := j*32
+	IF k[j]
+		dst[i+31:i] := tmp[i+31:i]
+	ELSE
+		dst[i+31:i] := 0
+	FI
+ENDFOR
+dst[MAX:128] := 0	
+	</operation>
+	<instruction name="vmovsldup"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256d" name="_mm256_mask_loadu_pd">
+	<type>Floating Point</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Load</category>
+	<parameter varname="src" type="__m256d"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="mem_addr" type="void const*"/>
+	<description>Load packed double-precision (64-bit) floating-point elements from memory into "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).
+	"mem_addr" does not need to be aligned on any particular boundary. </description>
+	<operation>
+FOR j := 0 to 3
+	i := j*64
+	IF k[j]
+		dst[i+63:i] := MEM[mem_addr+i+63:mem_addr+i]
+	ELSE
+		dst[i+63:i] := src[i+63:i]
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vmovupd"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="void" name="_mm256_mask_storeu_pd">
+	<type>Floating Point</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Store</category>
+	<parameter varname="mem_addr" type="void*"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m256d"/>
+	<description>Store packed double-precision (64-bit) floating-point elements from "a" into memory using writemask "k".
+	"mem_addr" does not need to be aligned on any particular boundary.</description>
+	<operation>
+FOR j := 0 to 3
+	i := j*64
+	IF k[j]
+		MEM[mem_addr+i+63:mem_addr+i] := a[i+63:i]
+	FI
+ENDFOR
+	</operation>
+	<instruction name="vmovupd"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256d" name="_mm256_maskz_loadu_pd">
+	<type>Floating Point</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Load</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="mem_addr" type="void const*"/>
+	<description>Load packed double-precision (64-bit) floating-point elements from memory into "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).
+	"mem_addr" does not need to be aligned on any particular boundary.
+	</description>
+	<operation>
+FOR j := 0 to 3
+	i := j*64
+	IF k[j]
+		dst[i+63:i] := MEM[mem_addr+i+63:mem_addr+i]
+	ELSE
+		dst[i+63:i] := 0
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vmovupd"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128d" name="_mm_mask_loadu_pd">
+	<type>Floating Point</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Load</category>
+	<parameter varname="src" type="__m128d"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="mem_addr" type="void const*"/>
+	<description>Load packed double-precision (64-bit) floating-point elements from memory into "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).
+	"mem_addr" does not need to be aligned on any particular boundary. </description>
+	<operation>
+FOR j := 0 to 1
+	i := j*64
+	IF k[j]
+		dst[i+63:i] := MEM[mem_addr+i+63:mem_addr+i]
+	ELSE
+		dst[i+63:i] := src[i+63:i]
+	FI
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vmovupd"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="void" name="_mm_mask_storeu_pd">
+	<type>Floating Point</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Store</category>
+	<parameter varname="mem_addr" type="void*"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128d"/>
+	<description>Store packed double-precision (64-bit) floating-point elements from "a" into memory using writemask "k".
+	"mem_addr" does not need to be aligned on any particular boundary.</description>
+	<operation>
+FOR j := 0 to 1
+	i := j*64
+	IF k[j]
+		MEM[mem_addr+i+63:mem_addr+i] := a[i+63:i]
+	FI
+ENDFOR
+	</operation>
+	<instruction name="vmovupd"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128d" name="_mm_maskz_loadu_pd">
+	<type>Floating Point</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Load</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="mem_addr" type="void const*"/>
+	<description>Load packed double-precision (64-bit) floating-point elements from memory into "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).
+	"mem_addr" does not need to be aligned on any particular boundary.
+	</description>
+	<operation>
+FOR j := 0 to 1
+	i := j*64
+	IF k[j]
+		dst[i+63:i] := MEM[mem_addr+i+63:mem_addr+i]
+	ELSE
+		dst[i+63:i] := 0
+	FI
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vmovupd"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256" name="_mm256_mask_loadu_ps">
+	<type>Floating Point</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Load</category>
+	<parameter varname="src" type="__m256"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="mem_addr" type="void const*"/>
+	<description>Load packed single-precision (32-bit) floating-point elements from memory into "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).
+	"mem_addr" does not need to be aligned on any particular boundary.
+	</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*32
+	IF k[j]
+		dst[i+31:i] := MEM[mem_addr+i+31:mem_addr+i]
+	ELSE
+		dst[i+31:i] := src[i+31:i]
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vmovups"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="void" name="_mm256_mask_storeu_ps">
+	<type>Floating Point</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Store</category>
+	<parameter varname="mem_addr" type="void*"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m256"/>
+	<description>Store packed single-precision (32-bit) floating-point elements from "a" into memory using writemask "k".
+	"mem_addr" does not need to be aligned on any particular boundary.
+	</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*32
+	IF k[j]
+		MEM[mem_addr+i+31:mem_addr+i] := a[i+31:i]
+	FI
+ENDFOR
+	</operation>
+	<instruction name="vmovups"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256" name="_mm256_maskz_loadu_ps">
+	<type>Floating Point</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Load</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="mem_addr" type="void const*"/>
+	<description>Load packed single-precision (32-bit) floating-point elements from memory into "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).
+	"mem_addr" does not need to be aligned on any particular boundary.
+	</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*32
+	IF k[j]
+		dst[i+31:i] := MEM[mem_addr+i+31:mem_addr+i]
+	ELSE
+		dst[i+31:i] := 0
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vmovups"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128" name="_mm_mask_loadu_ps">
+	<type>Floating Point</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Load</category>
+	<parameter varname="src" type="__m128"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="mem_addr" type="void const*"/>
+	<description>Load packed single-precision (32-bit) floating-point elements from memory into "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).
+	"mem_addr" does not need to be aligned on any particular boundary.
+	</description>
+	<operation>
+FOR j := 0 to 3
+	i := j*32
+	IF k[j]
+		dst[i+31:i] := MEM[mem_addr+i+31:mem_addr+i]
+	ELSE
+		dst[i+31:i] := src[i+31:i]
+	FI
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vmovups"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="void" name="_mm_mask_storeu_ps">
+	<type>Floating Point</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Store</category>
+	<parameter varname="mem_addr" type="void*"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128"/>
+	<description>Store packed single-precision (32-bit) floating-point elements from "a" into memory using writemask "k".
+	"mem_addr" does not need to be aligned on any particular boundary.
+	</description>
+	<operation>
+FOR j := 0 to 3
+	i := j*32
+	IF k[j]
+		MEM[mem_addr+i+31:mem_addr+i] := a[i+31:i]
+	FI
+ENDFOR
+	</operation>
+	<instruction name="vmovups"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128" name="_mm_maskz_loadu_ps">
+	<type>Floating Point</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Load</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="mem_addr" type="void const*"/>
+	<description>Load packed single-precision (32-bit) floating-point elements from memory into "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).
+	"mem_addr" does not need to be aligned on any particular boundary.
+	</description>
+	<operation>
+FOR j := 0 to 3
+	i := j*32
+	IF k[j]
+		dst[i+31:i] := MEM[mem_addr+i+31:mem_addr+i]
+	ELSE
+		dst[i+31:i] := 0
+	FI
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vmovups"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256d" name="_mm256_mask_mul_pd">
+	<type>Floating Point</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="src" type="__m256d"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m256d"/>
+	<parameter varname="b" type="__m256d"/>
+	<description>Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 3
+	i := j*64
+	IF k[j]
+		dst[i+63:i] := a[i+63:i] * b[i+63:i]
+	ELSE
+		dst[i+63:i] := src[i+63:i]
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vmulpd"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256d" name="_mm256_maskz_mul_pd">
+	<type>Floating Point</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m256d"/>
+	<parameter varname="b" type="__m256d"/>
+	<description>Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).
+   </description>
+	<operation>
+FOR j := 0 to 3
+	i := j*64
+	IF k[j]
+		dst[i+63:i] := a[i+63:i] * b[i+63:i]
+	ELSE
+		dst[i+63:i] := 0
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vmulpd"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128d" name="_mm_mask_mul_pd">
+	<type>Floating Point</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="src" type="__m128d"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128d"/>
+	<parameter varname="b" type="__m128d"/>
+	<description>Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 1
+	i := j*64
+	IF k[j]
+		dst[i+63:i] := a[i+63:i] * b[i+63:i]
+	ELSE
+		dst[i+63:i] := src[i+63:i]
+	FI
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vmulpd"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128d" name="_mm_maskz_mul_pd">
+	<type>Floating Point</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128d"/>
+	<parameter varname="b" type="__m128d"/>
+	<description>Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).
+   </description>
+	<operation>
+FOR j := 0 to 1
+	i := j*64
+	IF k[j]
+		dst[i+63:i] := a[i+63:i] * b[i+63:i]
+	ELSE
+		dst[i+63:i] := 0
+	FI
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vmulpd"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256" name="_mm256_mask_mul_ps">
+	<type>Floating Point</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="src" type="__m256"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m256"/>
+	<parameter varname="b" type="__m256"/>
+	<description>Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*32
+	IF k[j]
+		dst[i+31:i] := a[i+31:i] * b[i+31:i]
+	ELSE
+		dst[i+31:i] := src[i+31:i]
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vmulps"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256" name="_mm256_maskz_mul_ps">
+	<type>Floating Point</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m256"/>
+	<parameter varname="b" type="__m256"/>
+	<description>Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).
+   </description>
+	<operation>
+FOR j := 0 to 7
+	i := j*32
+	IF k[j]
+		dst[i+31:i] := a[i+31:i] * b[i+31:i]
+	ELSE
+		dst[i+31:i] := 0
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vmulps"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128" name="_mm_mask_mul_ps">
+	<type>Floating Point</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="src" type="__m128"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128"/>
+	<parameter varname="b" type="__m128"/>
+	<description>Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 3
+	i := j*32
+	IF k[j]
+		dst[i+31:i] := a[i+31:i] * b[i+31:i]
+	ELSE
+		dst[i+31:i] := src[i+31:i]
+	FI
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vmulps"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128" name="_mm_maskz_mul_ps">
+	<type>Floating Point</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128"/>
+	<parameter varname="b" type="__m128"/>
+	<description>Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).
+   </description>
+	<operation>
+FOR j := 0 to 3
+	i := j*32
+	IF k[j]
+		dst[i+31:i] := a[i+31:i] * b[i+31:i]
+	ELSE
+		dst[i+31:i] := 0
+	FI
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vmulps"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256d" name="_mm256_mask_or_pd">
+	<type>Floating Point</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512DQ</CPUID>
+	<category>Logical</category>
+	<parameter varname="src" type="__m256d"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m256d"/>
+	<parameter varname="b" type="__m256d"/>
+	<description>Compute the bitwise OR of packed double-precision (64-bit) floating-point elements in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 3
+	i := j*64
+	IF k[j]
+		dst[i+63:i] := a[i+63:i] BITWISE OR b[i+63:i]
+	ELSE
+		dst[i+63:i] := src[i+63:i]
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vorpd"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256d" name="_mm256_maskz_or_pd">
+	<type>Floating Point</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512DQ</CPUID>
+	<category>Logical</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m256d"/>
+	<parameter varname="b" type="__m256d"/>
+	<description>Compute the bitwise OR of packed double-precision (64-bit) floating-point elements in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 3
+	i := j*64
+	IF k[j]
+		dst[i+63:i] := a[i+63:i] BITWISE OR b[i+63:i]
+	ELSE
+		dst[i+63:i] := 0
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vorpd"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512d" name="_mm512_mask_or_pd">
+	<type>Floating Point</type>
+	<CPUID>AVX512DQ</CPUID>
+	<category>Logical</category>
+	<parameter varname="src" type="__m512d"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m512d"/>
+	<parameter varname="b" type="__m512d"/>
+	<description>Compute the bitwise OR of packed double-precision (64-bit) floating-point elements in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 7
+	i := j*64
+	IF k[j]
+		dst[i+63:i] := a[i+63:i] BITWISE OR b[i+63:i]
+	ELSE
+		dst[i+63:i] := src[i+63:i]
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name="vorpd"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512d" name="_mm512_maskz_or_pd">
+	<type>Floating Point</type>
+	<CPUID>AVX512DQ</CPUID>
+	<category>Logical</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m512d"/>
+	<parameter varname="b" type="__m512d"/>
+	<description>Compute the bitwise OR of packed double-precision (64-bit) floating-point elements in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).
+	</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*64
+	IF k[j]
+		dst[i+63:i] := a[i+63:i] BITWISE OR b[i+63:i]
+	ELSE
+		dst[i+63:i] := 0
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name="vorpd"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512d" name="_mm512_or_pd">
+	<type>Floating Point</type>
+	<CPUID>AVX512DQ</CPUID>
+	<category>Logical</category>
+	<parameter varname="a" type="__m512d"/>
+	<parameter varname="b" type="__m512d"/>
+	<description>Compute the bitwise OR of packed double-precision (64-bit) floating-point elements in "a" and "b", and store the results in "dst".
+	</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*64
+	dst[i+63:i] := a[i+63:i] BITWISE OR b[i+63:i]
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name="vorpd"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128d" name="_mm_mask_or_pd">
+	<type>Floating Point</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512DQ</CPUID>
+	<category>Logical</category>
+	<parameter varname="src" type="__m128d"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128d"/>
+	<parameter varname="b" type="__m128d"/>
+	<description>Compute the bitwise OR of packed double-precision (64-bit) floating-point elements in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 1
+	i := j*64
+	IF k[j]
+		dst[i+63:i] := a[i+63:i] BITWISE OR b[i+63:i]
+	ELSE
+		dst[i+63:i] := src[i+63:i]
+	FI
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vorpd"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128d" name="_mm_maskz_or_pd">
+	<type>Floating Point</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512DQ</CPUID>
+	<category>Logical</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128d"/>
+	<parameter varname="b" type="__m128d"/>
+	<description>Compute the bitwise OR of packed double-precision (64-bit) floating-point elements in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 1
+	i := j*64
+	IF k[j]
+		dst[i+63:i] := a[i+63:i] BITWISE OR b[i+63:i]
+	ELSE
+		dst[i+63:i] := 0
+	FI
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vorpd"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256" name="_mm256_mask_or_ps">
+	<type>Floating Point</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512DQ</CPUID>
+	<category>Logical</category>
+	<parameter varname="src" type="__m256"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m256"/>
+	<parameter varname="b" type="__m256"/>
+	<description>Compute the bitwise OR of packed single-precision (32-bit) floating-point elements in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 7
+	i := j*32
+	IF k[j]
+		dst[i+31:i] := a[i+31:i] BITWISE OR b[i+31:i]
+	ELSE
+		dst[i+31:i] := src[i+31:i]
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vorps"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256" name="_mm256_maskz_or_ps">
+	<type>Floating Point</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512DQ</CPUID>
+	<category>Logical</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m256"/>
+	<parameter varname="b" type="__m256"/>
+	<description>Compute the bitwise OR of packed single-precision (32-bit) floating-point elements in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*32
+	IF k[j]
+		dst[i+31:i] := a[i+31:i] BITWISE OR b[i+31:i]
+	ELSE
+		dst[i+31:i] := 0
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vorps"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512" name="_mm512_mask_or_ps">
+	<type>Floating Point</type>
+	<CPUID>AVX512DQ</CPUID>
+	<category>Logical</category>
+	<parameter varname="src" type="__m512"/>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="a" type="__m512"/>
+	<parameter varname="b" type="__m512"/>
+	<description>Compute the bitwise OR of packed single-precision (32-bit) floating-point elements in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 15
+	i := j*32
+	IF k[j]
+		dst[i+31:i] := a[i+31:i] BITWISE OR b[i+31:i]
+	ELSE
+		dst[i+31:i] := src[i+31:i]
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name="vorps"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512" name="_mm512_maskz_or_ps">
+	<type>Floating Point</type>
+	<CPUID>AVX512DQ</CPUID>
+	<category>Logical</category>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="a" type="__m512"/>
+	<parameter varname="b" type="__m512"/>
+	<description>Compute the bitwise OR of packed single-precision (32-bit) floating-point elements in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).
+	</description>
+	<operation>
+FOR j := 0 to 15
+	i := j*32
+	IF k[j]
+		dst[i+31:i] := a[i+31:i] BITWISE OR b[i+31:i]
+	ELSE
+		dst[i+31:i] := 0
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name="vorps"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512" name="_mm512_or_ps">
+	<type>Floating Point</type>
+	<CPUID>AVX512DQ</CPUID>
+	<category>Logical</category>
+	<parameter varname="a" type="__m512"/>
+	<parameter varname="b" type="__m512"/>
+	<description>Compute the bitwise OR of packed single-precision (32-bit) floating-point elements in "a" and "b", and store the results in "dst".
+	</description>
+	<operation>
+FOR j := 0 to 15
+	i := j*32
+	dst[i+31:i] := a[i+31:i] BITWISE OR b[i+31:i]
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name="vorps"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128" name="_mm_mask_or_ps">
+	<type>Floating Point</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512DQ</CPUID>
+	<category>Logical</category>
+	<parameter varname="src" type="__m128"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128"/>
+	<parameter varname="b" type="__m128"/>
+	<description>Compute the bitwise OR of packed single-precision (32-bit) floating-point elements in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 3
+	i := j*32
+	IF k[j]
+		dst[i+31:i] := a[i+31:i] BITWISE OR b[i+31:i]
+	ELSE
+		dst[i+31:i] := src[i+31:i]
+	FI
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vorps"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128" name="_mm_maskz_or_ps">
+	<type>Floating Point</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512DQ</CPUID>
+	<category>Logical</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128"/>
+	<parameter varname="b" type="__m128"/>
+	<description>Compute the bitwise OR of packed single-precision (32-bit) floating-point elements in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 3
+	i := j*32
+	IF k[j]
+		dst[i+31:i] := a[i+31:i] BITWISE OR b[i+31:i]
+	ELSE
+		dst[i+31:i] := 0
+	FI
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vorps"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256i" name="_mm256_mask_abs_epi8">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512BW</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="src" type="__m256i"/>
+	<parameter varname="k" type="__mmask32"/>
+	<parameter varname="a" type="__m256i"/>
+	<description>Compute the absolute value of packed 8-bit integers in "a", and store the unsigned results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 31
+	i := j*8
+	IF k[j]
+		dst[i+7:i] := ABS(a[i+7:i])
+	ELSE
+		dst[i+7:i] := src[i+7:i]
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vpabsb"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256i" name="_mm256_maskz_abs_epi8">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512BW</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="k" type="__mmask32"/>
+	<parameter varname="a" type="__m256i"/>
+	<description>Compute the absolute value of packed 8-bit integers in "a", and store the unsigned results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 31
+	i := j*8
+	IF k[j]
+		dst[i+7:i] := ABS(a[i+7:i])
+	ELSE
+		dst[i+7:i] := 0
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vpabsb"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512i" name="_mm512_abs_epi8">
+	<type>Integer</type>
+	<CPUID>AVX512BW</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="a" type="__m512i"/>
+	<description>Compute the absolute value of packed 8-bit integers in "a", and store the unsigned results in "dst". </description>
+	<operation>
+FOR j := 0 to 63
+	i := j*8
+	dst[i+7:i] := ABS(a[i+7:i])
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name="vpabsb"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512i" name="_mm512_mask_abs_epi8">
+	<type>Integer</type>
+	<CPUID>AVX512BW</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="src" type="__m512i"/>
+	<parameter varname="k" type="__mmask64"/>
+	<parameter varname="a" type="__m512i"/>
+	<description>Compute the absolute value of packed 8-bit integers in "a", and store the unsigned results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 63
+	i := j*8
+	IF k[j]
+		dst[i+7:i] := ABS(a[i+7:i])
+	ELSE
+		dst[i+7:i] := src[i+7:i]
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name="vpabsb"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512i" name="_mm512_maskz_abs_epi8">
+	<type>Integer</type>
+	<CPUID>AVX512BW</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="k" type="__mmask64"/>
+	<parameter varname="a" type="__m512i"/>
+	<description>Compute the absolute value of packed 8-bit integers in "a", and store the unsigned results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 63
+	i := j*8
+	IF k[j]
+		dst[i+7:i] := ABS(a[i+7:i])
+	ELSE
+		dst[i+7:i] := 0
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name="vpabsb"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm_mask_abs_epi8">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512BW</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="src" type="__m128i"/>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="a" type="__m128i"/>
+	<description>Compute the absolute value of packed 8-bit integers in "a", and store the unsigned results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 15
+	i := j*8
+	IF k[j]
+		dst[i+7:i] := ABS(a[i+7:i])
+	ELSE
+		dst[i+7:i] := src[i+7:i]
+	FI
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vpabsb"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm_maskz_abs_epi8">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512BW</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="a" type="__m128i"/>
+	<description>Compute the absolute value of packed 8-bit integers in "a", and store the unsigned results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 15
+	i := j*8
+	IF k[j]
+		dst[i+7:i] := ABS(a[i+7:i])
+	ELSE
+		dst[i+7:i] := 0
+	FI
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vpabsb"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256i" name="_mm256_mask_abs_epi32">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="src" type="__m256i"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m256i"/>
+	<description>Compute the absolute value of packed 32-bit integers in "a", and store the unsigned results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 7
+	i := j*32
+	IF k[j]
+		dst[i+31:i] := ABS(a[i+31:i])
+	ELSE
+		dst[i+31:i] := src[i+31:i]
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vpabsd"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256i" name="_mm256_maskz_abs_epi32">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m256i"/>
+	<description>Compute the absolute value of packed 32-bit integers in "a", and store the unsigned results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 7
+	i := j*32
+	IF k[j]
+		dst[i+31:i] := ABS(a[i+31:i])
+	ELSE
+		dst[i+31:i] := 0
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vpabsd"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm_mask_abs_epi32">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="src" type="__m128i"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128i"/>
+	<description>Compute the absolute value of packed 32-bit integers in "a", and store the unsigned results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 3
+	i := j*32
+	IF k[j]
+		dst[i+31:i] := ABS(a[i+31:i])
+	ELSE
+		dst[i+31:i] := src[i+31:i]
+	FI
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vpabsd"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm_maskz_abs_epi32">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128i"/>
+	<description>Compute the absolute value of packed 32-bit integers in "a", and store the unsigned results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 3
+	i := j*32
+	IF k[j]
+		dst[i+31:i] := ABS(a[i+31:i])
+	ELSE
+		dst[i+31:i] := 0
+	FI
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vpabsd"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256i" name="_mm256_abs_epi64">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="a" type="__m256i"/>
+	<description>Compute the absolute value of packed 64-bit integers in "a", and store the unsigned results in "dst". </description>
+	<operation>
+FOR j := 0 to 3
+	i := j*64
+	dst[i+63:i] := ABS(a[i+63:i])
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vpabsq"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256i" name="_mm256_mask_abs_epi64">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="src" type="__m256i"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m256i"/>
+	<description>Compute the absolute value of packed 64-bit integers in "a", and store the unsigned results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 3
+	i := j*64
+	IF k[j]
+		dst[i+63:i] := ABS(a[i+63:i])
+	ELSE
+		dst[i+63:i] := src[i+63:i]
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vpabsq"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256i" name="_mm256_maskz_abs_epi64">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m256i"/>
+	<description>Compute the absolute value of packed 64-bit integers in "a", and store the unsigned results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 3
+	i := j*64
+	IF k[j]
+		dst[i+63:i] := ABS(a[i+63:i])
+	ELSE
+		dst[i+63:i] := 0
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vpabsq"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm_abs_epi64">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="a" type="__m128i"/>
+	<description>Compute the absolute value of packed 64-bit integers in "a", and store the unsigned results in "dst". </description>
+	<operation>
+FOR j := 0 to 1
+	i := j*64
+	dst[i+63:i] := ABS(a[i+63:i])
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vpabsq"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm_mask_abs_epi64">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="src" type="__m128i"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128i"/>
+	<description>Compute the absolute value of packed 64-bit integers in "a", and store the unsigned results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 1
+	i := j*64
+	IF k[j]
+		dst[i+63:i] := ABS(a[i+63:i])
+	ELSE
+		dst[i+63:i] := src[i+63:i]
+	FI
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vpabsq"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm_maskz_abs_epi64">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128i"/>
+	<description>Compute the absolute value of packed 64-bit integers in "a", and store the unsigned results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 1
+	i := j*64
+	IF k[j]
+		dst[i+63:i] := ABS(a[i+63:i])
+	ELSE
+		dst[i+63:i] := 0
+	FI
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vpabsq"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256i" name="_mm256_mask_abs_epi16">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512BW</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="src" type="__m256i"/>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="a" type="__m256i"/>
+	<description>Compute the absolute value of packed 16-bit integers in "a", and store the unsigned results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 15
+	i := j*16
+	IF k[j]
+		dst[i+15:i] := ABS(a[i+15:i])
+	ELSE
+		dst[i+15:i] := src[i+15:i]
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vpabsw"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256i" name="_mm256_maskz_abs_epi16">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512BW</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="a" type="__m256i"/>
+	<description>Compute the absolute value of packed 16-bit integers in "a", and store the unsigned results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 15
+	i := j*16
+	IF k[j]
+		dst[i+15:i] := ABS(a[i+15:i])
+	ELSE
+		dst[i+15:i] := 0
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vpabsw"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512i" name="_mm512_abs_epi16">
+	<type>Integer</type>
+	<CPUID>AVX512BW</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="a" type="__m512i"/>
+	<description>Compute the absolute value of packed 16-bit integers in "a", and store the unsigned results in "dst". </description>
+	<operation>
+FOR j := 0 to 31
+	i := j*16
+	dst[i+15:i] := ABS(a[i+15:i])
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name="vpabsw"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512i" name="_mm512_mask_abs_epi16">
+	<type>Integer</type>
+	<CPUID>AVX512BW</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="src" type="__m512i"/>
+	<parameter varname="k" type="__mmask32"/>
+	<parameter varname="a" type="__m512i"/>
+	<description>Compute the absolute value of packed 16-bit integers in "a", and store the unsigned results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 31
+	i := j*16
+	IF k[j]
+		dst[i+15:i] := ABS(a[i+15:i])
+	ELSE
+		dst[i+15:i] := src[i+15:i]
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name="vpabsw"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512i" name="_mm512_maskz_abs_epi16">
+	<type>Integer</type>
+	<CPUID>AVX512BW</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="k" type="__mmask32"/>
+	<parameter varname="a" type="__m512i"/>
+	<description>Compute the absolute value of packed 16-bit integers in "a", and store the unsigned results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 31
+	i := j*16
+	IF k[j]
+		dst[i+15:i] := ABS(a[i+15:i])
+	ELSE
+		dst[i+15:i] := 0
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name="vpabsw"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm_mask_abs_epi16">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512BW</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="src" type="__m128i"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128i"/>
+	<description>Compute the absolute value of packed 16-bit integers in "a", and store the unsigned results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 7
+	i := j*16
+	IF k[j]
+		dst[i+15:i] := ABS(a[i+15:i])
+	ELSE
+		dst[i+15:i] := src[i+15:i]
+	FI
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vpabsw"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm_maskz_abs_epi16">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512BW</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128i"/>
+	<description>Compute the absolute value of packed 16-bit integers in "a", and store the unsigned results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*16
+	IF k[j]
+		dst[i+15:i] := ABS(a[i+15:i])
+	ELSE
+		dst[i+15:i] := 0
+	FI
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vpabsw"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256i" name="_mm256_mask_packs_epi32">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512BW</CPUID>
+	<category>Convert</category>
+	<category>Miscellaneous</category>
+	<parameter varname="src" type="__m256i"/>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="a" type="__m256i"/>
+	<parameter varname="b" type="__m256i"/>
+	<description>Convert packed 32-bit integers from "a" and "b" to packed 16-bit integers using signed saturation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+	<operation>
+tmp_dst[15:0] := Saturate_Int32_To_Int16 (a[31:0])
+tmp_dst[31:16] := Saturate_Int32_To_Int16 (a[63:32])
+tmp_dst[47:32] := Saturate_Int32_To_Int16 (a[95:64])
+tmp_dst[63:48] := Saturate_Int32_To_Int16 (a[127:96])
+tmp_dst[79:64] := Saturate_Int32_To_Int16 (b[31:0])
+tmp_dst[95:80] := Saturate_Int32_To_Int16 (b[63:32])
+tmp_dst[111:96] := Saturate_Int32_To_Int16 (b[95:64])
+tmp_dst[127:112] := Saturate_Int32_To_Int16 (b[127:96])
+tmp_dst[143:128] := Saturate_Int32_To_Int16 (a[159:128])
+tmp_dst[159:144] := Saturate_Int32_To_Int16 (a[191:160])
+tmp_dst[175:160] := Saturate_Int32_To_Int16 (a[223:192])
+tmp_dst[191:176] := Saturate_Int32_To_Int16 (a[255:224])
+tmp_dst[207:192] := Saturate_Int32_To_Int16 (b[159:128])
+tmp_dst[223:208] := Saturate_Int32_To_Int16 (b[191:160])
+tmp_dst[239:224] := Saturate_Int32_To_Int16 (b[223:192])
+tmp_dst[255:240] := Saturate_Int32_To_Int16 (b[255:224])
+
+FOR j := 0 to 15
+	i := j*16
+	IF k[j]
+		dst[i+15:i] := tmp_dst[i+15:i]
+	ELSE
+		dst[i+15:i] := src[i+15:i]
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vpackssdw"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256i" name="_mm256_maskz_packs_epi32">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512BW</CPUID>
+	<category>Convert</category>
+	<category>Miscellaneous</category>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="a" type="__m256i"/>
+	<parameter varname="b" type="__m256i"/>
+	<description>Convert packed 32-bit integers from "a" and "b" to packed 16-bit integers using signed saturation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).
+	</description>
+	<operation>
+tmp_dst[15:0] := Saturate_Int32_To_Int16 (a[31:0])
+tmp_dst[31:16] := Saturate_Int32_To_Int16 (a[63:32])
+tmp_dst[47:32] := Saturate_Int32_To_Int16 (a[95:64])
+tmp_dst[63:48] := Saturate_Int32_To_Int16 (a[127:96])
+tmp_dst[79:64] := Saturate_Int32_To_Int16 (b[31:0])
+tmp_dst[95:80] := Saturate_Int32_To_Int16 (b[63:32])
+tmp_dst[111:96] := Saturate_Int32_To_Int16 (b[95:64])
+tmp_dst[127:112] := Saturate_Int32_To_Int16 (b[127:96])
+tmp_dst[143:128] := Saturate_Int32_To_Int16 (a[159:128])
+tmp_dst[159:144] := Saturate_Int32_To_Int16 (a[191:160])
+tmp_dst[175:160] := Saturate_Int32_To_Int16 (a[223:192])
+tmp_dst[191:176] := Saturate_Int32_To_Int16 (a[255:224])
+tmp_dst[207:192] := Saturate_Int32_To_Int16 (b[159:128])
+tmp_dst[223:208] := Saturate_Int32_To_Int16 (b[191:160])
+tmp_dst[239:224] := Saturate_Int32_To_Int16 (b[223:192])
+tmp_dst[255:240] := Saturate_Int32_To_Int16 (b[255:224])
+
+FOR j := 0 to 15
+	i := j*16
+	IF k[j]
+		dst[i+15:i] := tmp_dst[i+15:i]
+	ELSE
+		dst[i+15:i] := 0
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vpackssdw"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512i" name="_mm512_mask_packs_epi32">
+	<type>Integer</type>
+	<CPUID>AVX512BW</CPUID>
+	<category>Convert</category>
+	<category>Miscellaneous</category>
+	<parameter varname="src" type="__m512i"/>
+	<parameter varname="k" type="__mmask32"/>
+	<parameter varname="a" type="__m512i"/>
+	<parameter varname="b" type="__m512i"/>
+	<description>Convert packed 32-bit integers from "a" and "b" to packed 16-bit integers using signed saturation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+	<operation>
+tmp_dst[15:0] := Saturate_Int32_To_Int16 (a[31:0])
+tmp_dst[31:16] := Saturate_Int32_To_Int16 (a[63:32])
+tmp_dst[47:32] := Saturate_Int32_To_Int16 (a[95:64])
+tmp_dst[63:48] := Saturate_Int32_To_Int16 (a[127:96])
+tmp_dst[79:64] := Saturate_Int32_To_Int16 (b[31:0])
+tmp_dst[95:80] := Saturate_Int32_To_Int16 (b[63:32])
+tmp_dst[111:96] := Saturate_Int32_To_Int16 (b[95:64])
+tmp_dst[127:112] := Saturate_Int32_To_Int16 (b[127:96])
+tmp_dst[143:128] := Saturate_Int32_To_Int16 (a[159:128])
+tmp_dst[159:144] := Saturate_Int32_To_Int16 (a[191:160])
+tmp_dst[175:160] := Saturate_Int32_To_Int16 (a[223:192])
+tmp_dst[191:176] := Saturate_Int32_To_Int16 (a[255:224])
+tmp_dst[207:192] := Saturate_Int32_To_Int16 (b[159:128])
+tmp_dst[223:208] := Saturate_Int32_To_Int16 (b[191:160])
+tmp_dst[239:224] := Saturate_Int32_To_Int16 (b[223:192])
+tmp_dst[255:240] := Saturate_Int32_To_Int16 (b[255:224])
+tmp_dst[271:256] := Saturate_Int32_To_Int16 (a[287:256])
+tmp_dst[287:272] := Saturate_Int32_To_Int16 (a[319:288])
+tmp_dst[303:288] := Saturate_Int32_To_Int16 (a[351:320])
+tmp_dst[319:304] := Saturate_Int32_To_Int16 (a[383:352])
+tmp_dst[335:320] := Saturate_Int32_To_Int16 (b[287:256])
+tmp_dst[351:336] := Saturate_Int32_To_Int16 (b[319:288])
+tmp_dst[367:352] := Saturate_Int32_To_Int16 (b[351:320])
+tmp_dst[383:368] := Saturate_Int32_To_Int16 (b[383:352])
+tmp_dst[399:384] := Saturate_Int32_To_Int16 (a[415:384])
+tmp_dst[415:400] := Saturate_Int32_To_Int16 (a[447:416])
+tmp_dst[431:416] := Saturate_Int32_To_Int16 (a[479:448])
+tmp_dst[447:432] := Saturate_Int32_To_Int16 (a[511:480])
+tmp_dst[463:448] := Saturate_Int32_To_Int16 (b[415:384])
+tmp_dst[479:464] := Saturate_Int32_To_Int16 (b[447:416])
+tmp_dst[495:480] := Saturate_Int32_To_Int16 (b[479:448])
+tmp_dst[511:496] := Saturate_Int32_To_Int16 (b[511:480])
+
+FOR j := 0 to 31
+	i := j*16
+	IF k[j]
+		dst[i+15:i] := tmp_dst[i+15:i]
+	ELSE
+		dst[i+15:i] := src[i+15:i]
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name="vpackssdw"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512i" name="_mm512_maskz_packs_epi32">
+	<type>Integer</type>
+	<CPUID>AVX512BW</CPUID>
+	<category>Convert</category>
+	<category>Miscellaneous</category>
+	<parameter varname="k" type="__mmask32"/>
+	<parameter varname="a" type="__m512i"/>
+	<parameter varname="b" type="__m512i"/>
+	<description>Convert packed 32-bit integers from "a" and "b" to packed 16-bit integers using signed saturation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).
+	</description>
+	<operation>
+tmp_dst[15:0] := Saturate_Int32_To_Int16 (a[31:0])
+tmp_dst[31:16] := Saturate_Int32_To_Int16 (a[63:32])
+tmp_dst[47:32] := Saturate_Int32_To_Int16 (a[95:64])
+tmp_dst[63:48] := Saturate_Int32_To_Int16 (a[127:96])
+tmp_dst[79:64] := Saturate_Int32_To_Int16 (b[31:0])
+tmp_dst[95:80] := Saturate_Int32_To_Int16 (b[63:32])
+tmp_dst[111:96] := Saturate_Int32_To_Int16 (b[95:64])
+tmp_dst[127:112] := Saturate_Int32_To_Int16 (b[127:96])
+tmp_dst[143:128] := Saturate_Int32_To_Int16 (a[159:128])
+tmp_dst[159:144] := Saturate_Int32_To_Int16 (a[191:160])
+tmp_dst[175:160] := Saturate_Int32_To_Int16 (a[223:192])
+tmp_dst[191:176] := Saturate_Int32_To_Int16 (a[255:224])
+tmp_dst[207:192] := Saturate_Int32_To_Int16 (b[159:128])
+tmp_dst[223:208] := Saturate_Int32_To_Int16 (b[191:160])
+tmp_dst[239:224] := Saturate_Int32_To_Int16 (b[223:192])
+tmp_dst[255:240] := Saturate_Int32_To_Int16 (b[255:224])
+tmp_dst[271:256] := Saturate_Int32_To_Int16 (a[287:256])
+tmp_dst[287:272] := Saturate_Int32_To_Int16 (a[319:288])
+tmp_dst[303:288] := Saturate_Int32_To_Int16 (a[351:320])
+tmp_dst[319:304] := Saturate_Int32_To_Int16 (a[383:352])
+tmp_dst[335:320] := Saturate_Int32_To_Int16 (b[287:256])
+tmp_dst[351:336] := Saturate_Int32_To_Int16 (b[319:288])
+tmp_dst[367:352] := Saturate_Int32_To_Int16 (b[351:320])
+tmp_dst[383:368] := Saturate_Int32_To_Int16 (b[383:352])
+tmp_dst[399:384] := Saturate_Int32_To_Int16 (a[415:384])
+tmp_dst[415:400] := Saturate_Int32_To_Int16 (a[447:416])
+tmp_dst[431:416] := Saturate_Int32_To_Int16 (a[479:448])
+tmp_dst[447:432] := Saturate_Int32_To_Int16 (a[511:480])
+tmp_dst[463:448] := Saturate_Int32_To_Int16 (b[415:384])
+tmp_dst[479:464] := Saturate_Int32_To_Int16 (b[447:416])
+tmp_dst[495:480] := Saturate_Int32_To_Int16 (b[479:448])
+tmp_dst[511:496] := Saturate_Int32_To_Int16 (b[511:480])
+
+FOR j := 0 to 31
+	i := j*16
+	IF k[j]
+		dst[i+15:i] := tmp_dst[i+15:i]
+	ELSE
+		dst[i+15:i] := 0
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name="vpackssdw"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512i" name="_mm512_packs_epi32">
+	<type>Integer</type>
+	<CPUID>AVX512BW</CPUID>
+	<category>Convert</category>
+	<category>Miscellaneous</category>
+	<parameter varname="a" type="__m512i"/>
+	<parameter varname="b" type="__m512i"/>
+	<description>Convert packed 32-bit integers from "a" and "b" to packed 16-bit integers using signed saturation, and store the results in "dst".</description>
+	<operation>
+dst[15:0] := Saturate_Int32_To_Int16 (a[31:0])
+dst[31:16] := Saturate_Int32_To_Int16 (a[63:32])
+dst[47:32] := Saturate_Int32_To_Int16 (a[95:64])
+dst[63:48] := Saturate_Int32_To_Int16 (a[127:96])
+dst[79:64] := Saturate_Int32_To_Int16 (b[31:0])
+dst[95:80] := Saturate_Int32_To_Int16 (b[63:32])
+dst[111:96] := Saturate_Int32_To_Int16 (b[95:64])
+dst[127:112] := Saturate_Int32_To_Int16 (b[127:96])
+dst[143:128] := Saturate_Int32_To_Int16 (a[159:128])
+dst[159:144] := Saturate_Int32_To_Int16 (a[191:160])
+dst[175:160] := Saturate_Int32_To_Int16 (a[223:192])
+dst[191:176] := Saturate_Int32_To_Int16 (a[255:224])
+dst[207:192] := Saturate_Int32_To_Int16 (b[159:128])
+dst[223:208] := Saturate_Int32_To_Int16 (b[191:160])
+dst[239:224] := Saturate_Int32_To_Int16 (b[223:192])
+dst[255:240] := Saturate_Int32_To_Int16 (b[255:224])
+dst[271:256] := Saturate_Int32_To_Int16 (a[287:256])
+dst[287:272] := Saturate_Int32_To_Int16 (a[319:288])
+dst[303:288] := Saturate_Int32_To_Int16 (a[351:320])
+dst[319:304] := Saturate_Int32_To_Int16 (a[383:352])
+dst[335:320] := Saturate_Int32_To_Int16 (b[287:256])
+dst[351:336] := Saturate_Int32_To_Int16 (b[319:288])
+dst[367:352] := Saturate_Int32_To_Int16 (b[351:320])
+dst[383:368] := Saturate_Int32_To_Int16 (b[383:352])
+dst[399:384] := Saturate_Int32_To_Int16 (a[415:384])
+dst[415:400] := Saturate_Int32_To_Int16 (a[447:416])
+dst[431:416] := Saturate_Int32_To_Int16 (a[479:448])
+dst[447:432] := Saturate_Int32_To_Int16 (a[511:480])
+dst[463:448] := Saturate_Int32_To_Int16 (b[415:384])
+dst[479:464] := Saturate_Int32_To_Int16 (b[447:416])
+dst[495:480] := Saturate_Int32_To_Int16 (b[479:448])
+dst[511:496] := Saturate_Int32_To_Int16 (b[511:480])
+dst[MAX:512] := 0
+	</operation>
+	<instruction name="vpackssdw"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm_mask_packs_epi32">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512BW</CPUID>
+	<category>Convert</category>
+	<category>Miscellaneous</category>
+	<parameter varname="src" type="__m128i"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128i"/>
+	<parameter varname="b" type="__m128i"/>
+	<description>Convert packed 32-bit integers from "a" and "b" to packed 16-bit integers using signed saturation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+	<operation>
+tmp_dst[15:0] := Saturate_Int32_To_Int16 (a[31:0])
+tmp_dst[31:16] := Saturate_Int32_To_Int16 (a[63:32])
+tmp_dst[47:32] := Saturate_Int32_To_Int16 (a[95:64])
+tmp_dst[63:48] := Saturate_Int32_To_Int16 (a[127:96])
+tmp_dst[79:64] := Saturate_Int32_To_Int16 (b[31:0])
+tmp_dst[95:80] := Saturate_Int32_To_Int16 (b[63:32])
+tmp_dst[111:96] := Saturate_Int32_To_Int16 (b[95:64])
+tmp_dst[127:112] := Saturate_Int32_To_Int16 (b[127:96])
+
+FOR j := 0 to 7
+	i := j*16
+	IF k[j]
+		dst[i+15:i] := tmp_dst[i+15:i]
+	ELSE
+		dst[i+15:i] := src[i+15:i]
+	FI
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vpackssdw"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm_maskz_packs_epi32">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512BW</CPUID>
+	<category>Convert</category>
+	<category>Miscellaneous</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128i"/>
+	<parameter varname="b" type="__m128i"/>
+	<description>Convert packed 32-bit integers from "a" and "b" to packed 16-bit integers using signed saturation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).
+	</description>
+	<operation>
+tmp_dst[15:0] := Saturate_Int32_To_Int16 (a[31:0])
+tmp_dst[31:16] := Saturate_Int32_To_Int16 (a[63:32])
+tmp_dst[47:32] := Saturate_Int32_To_Int16 (a[95:64])
+tmp_dst[63:48] := Saturate_Int32_To_Int16 (a[127:96])
+tmp_dst[79:64] := Saturate_Int32_To_Int16 (b[31:0])
+tmp_dst[95:80] := Saturate_Int32_To_Int16 (b[63:32])
+tmp_dst[111:96] := Saturate_Int32_To_Int16 (b[95:64])
+tmp_dst[127:112] := Saturate_Int32_To_Int16 (b[127:96])
+
+FOR j := 0 to 7
+	i := j*16
+	IF k[j]
+		dst[i+15:i] := tmp_dst[i+15:i]
+	ELSE
+		dst[i+15:i] := 0
+	FI
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vpackssdw"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256i" name="_mm256_mask_packs_epi16">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512BW</CPUID>
+	<category>Convert</category>
+	<category>Miscellaneous</category>
+	<parameter varname="src" type="__m256i"/>
+	<parameter varname="k" type="__mmask32"/>
+	<parameter varname="a" type="__m256i"/>
+	<parameter varname="b" type="__m256i"/>
+	<description>Convert packed 16-bit integers from "a" and "b" to packed 8-bit integers using signed saturation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+	<operation>
+tmp_dst[7:0] := Saturate_Int16_To_Int8 (a[15:0])
+tmp_dst[15:8] := Saturate_Int16_To_Int8 (a[31:16])
+tmp_dst[23:16] := Saturate_Int16_To_Int8 (a[47:32])
+tmp_dst[31:24] := Saturate_Int16_To_Int8 (a[63:48])
+tmp_dst[39:32] := Saturate_Int16_To_Int8 (a[79:64])
+tmp_dst[47:40] := Saturate_Int16_To_Int8 (a[95:80])
+tmp_dst[55:48] := Saturate_Int16_To_Int8 (a[111:96])
+tmp_dst[63:56] := Saturate_Int16_To_Int8 (a[127:112])
+tmp_dst[71:64] := Saturate_Int16_To_Int8 (b[15:0])
+tmp_dst[79:72] := Saturate_Int16_To_Int8 (b[31:16])
+tmp_dst[87:80] := Saturate_Int16_To_Int8 (b[47:32])
+tmp_dst[95:88] := Saturate_Int16_To_Int8 (b[63:48])
+tmp_dst[103:96] := Saturate_Int16_To_Int8 (b[79:64])
+tmp_dst[111:104] := Saturate_Int16_To_Int8 (b[95:80])
+tmp_dst[119:112] := Saturate_Int16_To_Int8 (b[111:96])
+tmp_dst[127:120] := Saturate_Int16_To_Int8 (b[127:112])
+tmp_dst[135:128] := Saturate_Int16_To_Int8 (a[143:128])
+tmp_dst[143:136] := Saturate_Int16_To_Int8 (a[159:144])
+tmp_dst[151:144] := Saturate_Int16_To_Int8 (a[175:160])
+tmp_dst[159:152] := Saturate_Int16_To_Int8 (a[191:176])
+tmp_dst[167:160] := Saturate_Int16_To_Int8 (a[207:192])
+tmp_dst[175:168] := Saturate_Int16_To_Int8 (a[223:208])
+tmp_dst[183:176] := Saturate_Int16_To_Int8 (a[239:224])
+tmp_dst[191:184] := Saturate_Int16_To_Int8 (a[255:240])
+tmp_dst[199:192] := Saturate_Int16_To_Int8 (b[143:128])
+tmp_dst[207:200] := Saturate_Int16_To_Int8 (b[159:144])
+tmp_dst[215:208] := Saturate_Int16_To_Int8 (b[175:160])
+tmp_dst[223:216] := Saturate_Int16_To_Int8 (b[191:176])
+tmp_dst[231:224] := Saturate_Int16_To_Int8 (b[207:192])
+tmp_dst[239:232] := Saturate_Int16_To_Int8 (b[223:208])
+tmp_dst[247:240] := Saturate_Int16_To_Int8 (b[239:224])
+tmp_dst[255:248] := Saturate_Int16_To_Int8 (b[255:240])
+
+FOR j := 0 to 31
+	i := j*8
+	IF k[j]
+		dst[i+7:i] := tmp_dst[i+7:i]
+	ELSE
+		dst[i+7:i] := src[i+7:i]
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vpacksswb"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256i" name="_mm256_maskz_packs_epi16">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512BW</CPUID>
+	<category>Convert</category>
+	<category>Miscellaneous</category>
+	<parameter varname="k" type="__mmask32"/>
+	<parameter varname="a" type="__m256i"/>
+	<parameter varname="b" type="__m256i"/>
+	<description>Convert packed 16-bit integers from "a" and "b" to packed 8-bit integers using signed saturation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).
+	</description>
+	<operation>
+tmp_dst[7:0] := Saturate_Int16_To_Int8 (a[15:0])
+tmp_dst[15:8] := Saturate_Int16_To_Int8 (a[31:16])
+tmp_dst[23:16] := Saturate_Int16_To_Int8 (a[47:32])
+tmp_dst[31:24] := Saturate_Int16_To_Int8 (a[63:48])
+tmp_dst[39:32] := Saturate_Int16_To_Int8 (a[79:64])
+tmp_dst[47:40] := Saturate_Int16_To_Int8 (a[95:80])
+tmp_dst[55:48] := Saturate_Int16_To_Int8 (a[111:96])
+tmp_dst[63:56] := Saturate_Int16_To_Int8 (a[127:112])
+tmp_dst[71:64] := Saturate_Int16_To_Int8 (b[15:0])
+tmp_dst[79:72] := Saturate_Int16_To_Int8 (b[31:16])
+tmp_dst[87:80] := Saturate_Int16_To_Int8 (b[47:32])
+tmp_dst[95:88] := Saturate_Int16_To_Int8 (b[63:48])
+tmp_dst[103:96] := Saturate_Int16_To_Int8 (b[79:64])
+tmp_dst[111:104] := Saturate_Int16_To_Int8 (b[95:80])
+tmp_dst[119:112] := Saturate_Int16_To_Int8 (b[111:96])
+tmp_dst[127:120] := Saturate_Int16_To_Int8 (b[127:112])
+tmp_dst[135:128] := Saturate_Int16_To_Int8 (a[143:128])
+tmp_dst[143:136] := Saturate_Int16_To_Int8 (a[159:144])
+tmp_dst[151:144] := Saturate_Int16_To_Int8 (a[175:160])
+tmp_dst[159:152] := Saturate_Int16_To_Int8 (a[191:176])
+tmp_dst[167:160] := Saturate_Int16_To_Int8 (a[207:192])
+tmp_dst[175:168] := Saturate_Int16_To_Int8 (a[223:208])
+tmp_dst[183:176] := Saturate_Int16_To_Int8 (a[239:224])
+tmp_dst[191:184] := Saturate_Int16_To_Int8 (a[255:240])
+tmp_dst[199:192] := Saturate_Int16_To_Int8 (b[143:128])
+tmp_dst[207:200] := Saturate_Int16_To_Int8 (b[159:144])
+tmp_dst[215:208] := Saturate_Int16_To_Int8 (b[175:160])
+tmp_dst[223:216] := Saturate_Int16_To_Int8 (b[191:176])
+tmp_dst[231:224] := Saturate_Int16_To_Int8 (b[207:192])
+tmp_dst[239:232] := Saturate_Int16_To_Int8 (b[223:208])
+tmp_dst[247:240] := Saturate_Int16_To_Int8 (b[239:224])
+tmp_dst[255:248] := Saturate_Int16_To_Int8 (b[255:240])
+
+FOR j := 0 to 31
+	i := j*8
+	IF k[j]
+		dst[i+7:i] := tmp_dst[i+7:i]
+	ELSE
+		dst[i+7:i] := 0
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vpacksswb"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512i" name="_mm512_mask_packs_epi16">
+	<type>Integer</type>
+	<CPUID>AVX512BW</CPUID>
+	<category>Convert</category>
+	<category>Miscellaneous</category>
+	<parameter varname="src" type="__m512i"/>
+	<parameter varname="k" type="__mmask64"/>
+	<parameter varname="a" type="__m512i"/>
+	<parameter varname="b" type="__m512i"/>
+	<description>Convert packed 16-bit integers from "a" and "b" to packed 8-bit integers using signed saturation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+	<operation>
+tmp_dst[7:0] := Saturate_Int16_To_Int8 (a[15:0])
+tmp_dst[15:8] := Saturate_Int16_To_Int8 (a[31:16])
+tmp_dst[23:16] := Saturate_Int16_To_Int8 (a[47:32])
+tmp_dst[31:24] := Saturate_Int16_To_Int8 (a[63:48])
+tmp_dst[39:32] := Saturate_Int16_To_Int8 (a[79:64])
+tmp_dst[47:40] := Saturate_Int16_To_Int8 (a[95:80])
+tmp_dst[55:48] := Saturate_Int16_To_Int8 (a[111:96])
+tmp_dst[63:56] := Saturate_Int16_To_Int8 (a[127:112])
+tmp_dst[71:64] := Saturate_Int16_To_Int8 (b[15:0])
+tmp_dst[79:72] := Saturate_Int16_To_Int8 (b[31:16])
+tmp_dst[87:80] := Saturate_Int16_To_Int8 (b[47:32])
+tmp_dst[95:88] := Saturate_Int16_To_Int8 (b[63:48])
+tmp_dst[103:96] := Saturate_Int16_To_Int8 (b[79:64])
+tmp_dst[111:104] := Saturate_Int16_To_Int8 (b[95:80])
+tmp_dst[119:112] := Saturate_Int16_To_Int8 (b[111:96])
+tmp_dst[127:120] := Saturate_Int16_To_Int8 (b[127:112])
+tmp_dst[135:128] := Saturate_Int16_To_Int8 (a[143:128])
+tmp_dst[143:136] := Saturate_Int16_To_Int8 (a[159:144])
+tmp_dst[151:144] := Saturate_Int16_To_Int8 (a[175:160])
+tmp_dst[159:152] := Saturate_Int16_To_Int8 (a[191:176])
+tmp_dst[167:160] := Saturate_Int16_To_Int8 (a[207:192])
+tmp_dst[175:168] := Saturate_Int16_To_Int8 (a[223:208])
+tmp_dst[183:176] := Saturate_Int16_To_Int8 (a[239:224])
+tmp_dst[191:184] := Saturate_Int16_To_Int8 (a[255:240])
+tmp_dst[199:192] := Saturate_Int16_To_Int8 (b[143:128])
+tmp_dst[207:200] := Saturate_Int16_To_Int8 (b[159:144])
+tmp_dst[215:208] := Saturate_Int16_To_Int8 (b[175:160])
+tmp_dst[223:216] := Saturate_Int16_To_Int8 (b[191:176])
+tmp_dst[231:224] := Saturate_Int16_To_Int8 (b[207:192])
+tmp_dst[239:232] := Saturate_Int16_To_Int8 (b[223:208])
+tmp_dst[247:240] := Saturate_Int16_To_Int8 (b[239:224])
+tmp_dst[255:248] := Saturate_Int16_To_Int8 (b[255:240])
+tmp_dst[263:256] := Saturate_Int16_To_Int8 (a[271:256])
+tmp_dst[271:264] := Saturate_Int16_To_Int8 (a[287:272])
+tmp_dst[279:272] := Saturate_Int16_To_Int8 (a[303:288])
+tmp_dst[287:280] := Saturate_Int16_To_Int8 (a[319:304])
+tmp_dst[295:288] := Saturate_Int16_To_Int8 (a[335:320])
+tmp_dst[303:296] := Saturate_Int16_To_Int8 (a[351:336])
+tmp_dst[311:304] := Saturate_Int16_To_Int8 (a[367:352])
+tmp_dst[319:312] := Saturate_Int16_To_Int8 (a[383:368])
+tmp_dst[327:320] := Saturate_Int16_To_Int8 (b[271:256])
+tmp_dst[335:328] := Saturate_Int16_To_Int8 (b[287:272])
+tmp_dst[343:336] := Saturate_Int16_To_Int8 (b[303:288])
+tmp_dst[351:344] := Saturate_Int16_To_Int8 (b[319:304])
+tmp_dst[359:352] := Saturate_Int16_To_Int8 (b[335:320])
+tmp_dst[367:360] := Saturate_Int16_To_Int8 (b[351:336])
+tmp_dst[375:368] := Saturate_Int16_To_Int8 (b[367:352])
+tmp_dst[383:376] := Saturate_Int16_To_Int8 (b[383:368])
+tmp_dst[391:384] := Saturate_Int16_To_Int8 (a[399:384])
+tmp_dst[399:392] := Saturate_Int16_To_Int8 (a[415:400])
+tmp_dst[407:400] := Saturate_Int16_To_Int8 (a[431:416])
+tmp_dst[415:408] := Saturate_Int16_To_Int8 (a[447:432])
+tmp_dst[423:416] := Saturate_Int16_To_Int8 (a[463:448])
+tmp_dst[431:424] := Saturate_Int16_To_Int8 (a[479:464])
+tmp_dst[439:432] := Saturate_Int16_To_Int8 (a[495:480])
+tmp_dst[447:440] := Saturate_Int16_To_Int8 (a[511:496])
+tmp_dst[455:448] := Saturate_Int16_To_Int8 (b[399:384])
+tmp_dst[463:456] := Saturate_Int16_To_Int8 (b[415:400])
+tmp_dst[471:464] := Saturate_Int16_To_Int8 (b[431:416])
+tmp_dst[479:472] := Saturate_Int16_To_Int8 (b[447:432])
+tmp_dst[487:480] := Saturate_Int16_To_Int8 (b[463:448])
+tmp_dst[495:488] := Saturate_Int16_To_Int8 (b[479:464])
+tmp_dst[503:496] := Saturate_Int16_To_Int8 (b[495:480])
+tmp_dst[511:504] := Saturate_Int16_To_Int8 (b[511:496])
+
+FOR j := 0 to 63
+	i := j*8
+	IF k[j]
+		dst[i+7:i] := tmp_dst[i+7:i]
+	ELSE
+		dst[i+7:i] := src[i+7:i]
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name="vpacksswb"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512i" name="_mm512_maskz_packs_epi16">
+	<type>Integer</type>
+	<CPUID>AVX512BW</CPUID>
+	<category>Convert</category>
+	<category>Miscellaneous</category>
+	<parameter varname="k" type="__mmask64"/>
+	<parameter varname="a" type="__m512i"/>
+	<parameter varname="b" type="__m512i"/>
+	<description>Convert packed 16-bit integers from "a" and "b" to packed 8-bit integers using signed saturation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).
+	</description>
+	<operation>
+tmp_dst[7:0] := Saturate_Int16_To_Int8 (a[15:0])
+tmp_dst[15:8] := Saturate_Int16_To_Int8 (a[31:16])
+tmp_dst[23:16] := Saturate_Int16_To_Int8 (a[47:32])
+tmp_dst[31:24] := Saturate_Int16_To_Int8 (a[63:48])
+tmp_dst[39:32] := Saturate_Int16_To_Int8 (a[79:64])
+tmp_dst[47:40] := Saturate_Int16_To_Int8 (a[95:80])
+tmp_dst[55:48] := Saturate_Int16_To_Int8 (a[111:96])
+tmp_dst[63:56] := Saturate_Int16_To_Int8 (a[127:112])
+tmp_dst[71:64] := Saturate_Int16_To_Int8 (b[15:0])
+tmp_dst[79:72] := Saturate_Int16_To_Int8 (b[31:16])
+tmp_dst[87:80] := Saturate_Int16_To_Int8 (b[47:32])
+tmp_dst[95:88] := Saturate_Int16_To_Int8 (b[63:48])
+tmp_dst[103:96] := Saturate_Int16_To_Int8 (b[79:64])
+tmp_dst[111:104] := Saturate_Int16_To_Int8 (b[95:80])
+tmp_dst[119:112] := Saturate_Int16_To_Int8 (b[111:96])
+tmp_dst[127:120] := Saturate_Int16_To_Int8 (b[127:112])
+tmp_dst[135:128] := Saturate_Int16_To_Int8 (a[143:128])
+tmp_dst[143:136] := Saturate_Int16_To_Int8 (a[159:144])
+tmp_dst[151:144] := Saturate_Int16_To_Int8 (a[175:160])
+tmp_dst[159:152] := Saturate_Int16_To_Int8 (a[191:176])
+tmp_dst[167:160] := Saturate_Int16_To_Int8 (a[207:192])
+tmp_dst[175:168] := Saturate_Int16_To_Int8 (a[223:208])
+tmp_dst[183:176] := Saturate_Int16_To_Int8 (a[239:224])
+tmp_dst[191:184] := Saturate_Int16_To_Int8 (a[255:240])
+tmp_dst[199:192] := Saturate_Int16_To_Int8 (b[143:128])
+tmp_dst[207:200] := Saturate_Int16_To_Int8 (b[159:144])
+tmp_dst[215:208] := Saturate_Int16_To_Int8 (b[175:160])
+tmp_dst[223:216] := Saturate_Int16_To_Int8 (b[191:176])
+tmp_dst[231:224] := Saturate_Int16_To_Int8 (b[207:192])
+tmp_dst[239:232] := Saturate_Int16_To_Int8 (b[223:208])
+tmp_dst[247:240] := Saturate_Int16_To_Int8 (b[239:224])
+tmp_dst[255:248] := Saturate_Int16_To_Int8 (b[255:240])
+tmp_dst[263:256] := Saturate_Int16_To_Int8 (a[271:256])
+tmp_dst[271:264] := Saturate_Int16_To_Int8 (a[287:272])
+tmp_dst[279:272] := Saturate_Int16_To_Int8 (a[303:288])
+tmp_dst[287:280] := Saturate_Int16_To_Int8 (a[319:304])
+tmp_dst[295:288] := Saturate_Int16_To_Int8 (a[335:320])
+tmp_dst[303:296] := Saturate_Int16_To_Int8 (a[351:336])
+tmp_dst[311:304] := Saturate_Int16_To_Int8 (a[367:352])
+tmp_dst[319:312] := Saturate_Int16_To_Int8 (a[383:368])
+tmp_dst[327:320] := Saturate_Int16_To_Int8 (b[271:256])
+tmp_dst[335:328] := Saturate_Int16_To_Int8 (b[287:272])
+tmp_dst[343:336] := Saturate_Int16_To_Int8 (b[303:288])
+tmp_dst[351:344] := Saturate_Int16_To_Int8 (b[319:304])
+tmp_dst[359:352] := Saturate_Int16_To_Int8 (b[335:320])
+tmp_dst[367:360] := Saturate_Int16_To_Int8 (b[351:336])
+tmp_dst[375:368] := Saturate_Int16_To_Int8 (b[367:352])
+tmp_dst[383:376] := Saturate_Int16_To_Int8 (b[383:368])
+tmp_dst[391:384] := Saturate_Int16_To_Int8 (a[399:384])
+tmp_dst[399:392] := Saturate_Int16_To_Int8 (a[415:400])
+tmp_dst[407:400] := Saturate_Int16_To_Int8 (a[431:416])
+tmp_dst[415:408] := Saturate_Int16_To_Int8 (a[447:432])
+tmp_dst[423:416] := Saturate_Int16_To_Int8 (a[463:448])
+tmp_dst[431:424] := Saturate_Int16_To_Int8 (a[479:464])
+tmp_dst[439:432] := Saturate_Int16_To_Int8 (a[495:480])
+tmp_dst[447:440] := Saturate_Int16_To_Int8 (a[511:496])
+tmp_dst[455:448] := Saturate_Int16_To_Int8 (b[399:384])
+tmp_dst[463:456] := Saturate_Int16_To_Int8 (b[415:400])
+tmp_dst[471:464] := Saturate_Int16_To_Int8 (b[431:416])
+tmp_dst[479:472] := Saturate_Int16_To_Int8 (b[447:432])
+tmp_dst[487:480] := Saturate_Int16_To_Int8 (b[463:448])
+tmp_dst[495:488] := Saturate_Int16_To_Int8 (b[479:464])
+tmp_dst[503:496] := Saturate_Int16_To_Int8 (b[495:480])
+tmp_dst[511:504] := Saturate_Int16_To_Int8 (b[511:496])
+
+FOR j := 0 to 63
+	i := j*8
+	IF k[j]
+		dst[i+7:i] := tmp_dst[i+7:i]
+	ELSE
+		dst[i+7:i] := 0
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name="vpacksswb"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512i" name="_mm512_packs_epi16">
+	<type>Integer</type>
+	<CPUID>AVX512BW</CPUID>
+	<category>Convert</category>
+	<category>Miscellaneous</category>
+	<parameter varname="a" type="__m512i"/>
+	<parameter varname="b" type="__m512i"/>
+	<description>Convert packed 16-bit integers from "a" and "b" to packed 8-bit integers using signed saturation, and store the results in "dst".
+	</description>
+	<operation>
+dst[7:0] := Saturate_Int16_To_Int8 (a[15:0])
+dst[15:8] := Saturate_Int16_To_Int8 (a[31:16])
+dst[23:16] := Saturate_Int16_To_Int8 (a[47:32])
+dst[31:24] := Saturate_Int16_To_Int8 (a[63:48])
+dst[39:32] := Saturate_Int16_To_Int8 (a[79:64])
+dst[47:40] := Saturate_Int16_To_Int8 (a[95:80])
+dst[55:48] := Saturate_Int16_To_Int8 (a[111:96])
+dst[63:56] := Saturate_Int16_To_Int8 (a[127:112])
+dst[71:64] := Saturate_Int16_To_Int8 (b[15:0])
+dst[79:72] := Saturate_Int16_To_Int8 (b[31:16])
+dst[87:80] := Saturate_Int16_To_Int8 (b[47:32])
+dst[95:88] := Saturate_Int16_To_Int8 (b[63:48])
+dst[103:96] := Saturate_Int16_To_Int8 (b[79:64])
+dst[111:104] := Saturate_Int16_To_Int8 (b[95:80])
+dst[119:112] := Saturate_Int16_To_Int8 (b[111:96])
+dst[127:120] := Saturate_Int16_To_Int8 (b[127:112])
+dst[135:128] := Saturate_Int16_To_Int8 (a[143:128])
+dst[143:136] := Saturate_Int16_To_Int8 (a[159:144])
+dst[151:144] := Saturate_Int16_To_Int8 (a[175:160])
+dst[159:152] := Saturate_Int16_To_Int8 (a[191:176])
+dst[167:160] := Saturate_Int16_To_Int8 (a[207:192])
+dst[175:168] := Saturate_Int16_To_Int8 (a[223:208])
+dst[183:176] := Saturate_Int16_To_Int8 (a[239:224])
+dst[191:184] := Saturate_Int16_To_Int8 (a[255:240])
+dst[199:192] := Saturate_Int16_To_Int8 (b[143:128])
+dst[207:200] := Saturate_Int16_To_Int8 (b[159:144])
+dst[215:208] := Saturate_Int16_To_Int8 (b[175:160])
+dst[223:216] := Saturate_Int16_To_Int8 (b[191:176])
+dst[231:224] := Saturate_Int16_To_Int8 (b[207:192])
+dst[239:232] := Saturate_Int16_To_Int8 (b[223:208])
+dst[247:240] := Saturate_Int16_To_Int8 (b[239:224])
+dst[255:248] := Saturate_Int16_To_Int8 (b[255:240])
+dst[263:256] := Saturate_Int16_To_Int8 (a[271:256])
+dst[271:264] := Saturate_Int16_To_Int8 (a[287:272])
+dst[279:272] := Saturate_Int16_To_Int8 (a[303:288])
+dst[287:280] := Saturate_Int16_To_Int8 (a[319:304])
+dst[295:288] := Saturate_Int16_To_Int8 (a[335:320])
+dst[303:296] := Saturate_Int16_To_Int8 (a[351:336])
+dst[311:304] := Saturate_Int16_To_Int8 (a[367:352])
+dst[319:312] := Saturate_Int16_To_Int8 (a[383:368])
+dst[327:320] := Saturate_Int16_To_Int8 (b[271:256])
+dst[335:328] := Saturate_Int16_To_Int8 (b[287:272])
+dst[343:336] := Saturate_Int16_To_Int8 (b[303:288])
+dst[351:344] := Saturate_Int16_To_Int8 (b[319:304])
+dst[359:352] := Saturate_Int16_To_Int8 (b[335:320])
+dst[367:360] := Saturate_Int16_To_Int8 (b[351:336])
+dst[375:368] := Saturate_Int16_To_Int8 (b[367:352])
+dst[383:376] := Saturate_Int16_To_Int8 (b[383:368])
+dst[391:384] := Saturate_Int16_To_Int8 (a[399:384])
+dst[399:392] := Saturate_Int16_To_Int8 (a[415:400])
+dst[407:400] := Saturate_Int16_To_Int8 (a[431:416])
+dst[415:408] := Saturate_Int16_To_Int8 (a[447:432])
+dst[423:416] := Saturate_Int16_To_Int8 (a[463:448])
+dst[431:424] := Saturate_Int16_To_Int8 (a[479:464])
+dst[439:432] := Saturate_Int16_To_Int8 (a[495:480])
+dst[447:440] := Saturate_Int16_To_Int8 (a[511:496])
+dst[455:448] := Saturate_Int16_To_Int8 (b[399:384])
+dst[463:456] := Saturate_Int16_To_Int8 (b[415:400])
+dst[471:464] := Saturate_Int16_To_Int8 (b[431:416])
+dst[479:472] := Saturate_Int16_To_Int8 (b[447:432])
+dst[487:480] := Saturate_Int16_To_Int8 (b[463:448])
+dst[495:488] := Saturate_Int16_To_Int8 (b[479:464])
+dst[503:496] := Saturate_Int16_To_Int8 (b[495:480])
+dst[511:504] := Saturate_Int16_To_Int8 (b[511:496])
+dst[MAX:512] := 0
+	</operation>
+	<instruction name="vpacksswb"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm_mask_packs_epi16">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512BW</CPUID>
+	<category>Convert</category>
+	<category>Miscellaneous</category>
+	<parameter varname="src" type="__m128i"/>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="a" type="__m128i"/>
+	<parameter varname="b" type="__m128i"/>
+	<description>Convert packed 16-bit integers from "a" and "b" to packed 8-bit integers using signed saturation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+	<operation>
+tmp_dst[7:0] := Saturate_Int16_To_Int8 (a[15:0])
+tmp_dst[15:8] := Saturate_Int16_To_Int8 (a[31:16])
+tmp_dst[23:16] := Saturate_Int16_To_Int8 (a[47:32])
+tmp_dst[31:24] := Saturate_Int16_To_Int8 (a[63:48])
+tmp_dst[39:32] := Saturate_Int16_To_Int8 (a[79:64])
+tmp_dst[47:40] := Saturate_Int16_To_Int8 (a[95:80])
+tmp_dst[55:48] := Saturate_Int16_To_Int8 (a[111:96])
+tmp_dst[63:56] := Saturate_Int16_To_Int8 (a[127:112])
+tmp_dst[71:64] := Saturate_Int16_To_Int8 (b[15:0])
+tmp_dst[79:72] := Saturate_Int16_To_Int8 (b[31:16])
+tmp_dst[87:80] := Saturate_Int16_To_Int8 (b[47:32])
+tmp_dst[95:88] := Saturate_Int16_To_Int8 (b[63:48])
+tmp_dst[103:96] := Saturate_Int16_To_Int8 (b[79:64])
+tmp_dst[111:104] := Saturate_Int16_To_Int8 (b[95:80])
+tmp_dst[119:112] := Saturate_Int16_To_Int8 (b[111:96])
+tmp_dst[127:120] := Saturate_Int16_To_Int8 (b[127:112])
+
+FOR j := 0 to 15
+	i := j*8
+	IF k[j]
+		dst[i+7:i] := tmp_dst[i+7:i]
+	ELSE
+		dst[i+7:i] := src[i+7:i]
+	FI
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vpacksswb"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm_maskz_packs_epi16">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512BW</CPUID>
+	<category>Convert</category>
+	<category>Miscellaneous</category>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="a" type="__m128i"/>
+	<parameter varname="b" type="__m128i"/>
+	<description>Convert packed 16-bit integers from "a" and "b" to packed 8-bit integers using signed saturation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).
+	</description>
+	<operation>
+tmp_dst[7:0] := Saturate_Int16_To_Int8 (a[15:0])
+tmp_dst[15:8] := Saturate_Int16_To_Int8 (a[31:16])
+tmp_dst[23:16] := Saturate_Int16_To_Int8 (a[47:32])
+tmp_dst[31:24] := Saturate_Int16_To_Int8 (a[63:48])
+tmp_dst[39:32] := Saturate_Int16_To_Int8 (a[79:64])
+tmp_dst[47:40] := Saturate_Int16_To_Int8 (a[95:80])
+tmp_dst[55:48] := Saturate_Int16_To_Int8 (a[111:96])
+tmp_dst[63:56] := Saturate_Int16_To_Int8 (a[127:112])
+tmp_dst[71:64] := Saturate_Int16_To_Int8 (b[15:0])
+tmp_dst[79:72] := Saturate_Int16_To_Int8 (b[31:16])
+tmp_dst[87:80] := Saturate_Int16_To_Int8 (b[47:32])
+tmp_dst[95:88] := Saturate_Int16_To_Int8 (b[63:48])
+tmp_dst[103:96] := Saturate_Int16_To_Int8 (b[79:64])
+tmp_dst[111:104] := Saturate_Int16_To_Int8 (b[95:80])
+tmp_dst[119:112] := Saturate_Int16_To_Int8 (b[111:96])
+tmp_dst[127:120] := Saturate_Int16_To_Int8 (b[127:112])
+
+FOR j := 0 to 15
+	i := j*8
+	IF k[j]
+		dst[i+7:i] := tmp_dst[i+7:i]
+	ELSE
+		dst[i+7:i] := 0
+	FI
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vpacksswb"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256i" name="_mm256_mask_packus_epi32">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512BW</CPUID>
+	<category>Convert</category>
+	<category>Miscellaneous</category>
+	<parameter varname="src" type="__m256i"/>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="a" type="__m256i"/>
+	<parameter varname="b" type="__m256i"/>
+	<description>Convert packed 32-bit integers from "a" and "b" to packed 16-bit integers using unsigned saturation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+	<operation>
+tmp_dst[15:0] := Saturate_Int32_To_UnsignedInt16 (a[31:0])
+tmp_dst[31:16] := Saturate_Int32_To_UnsignedInt16 (a[63:32])
+tmp_dst[47:32] := Saturate_Int32_To_UnsignedInt16 (a[95:64])
+tmp_dst[63:48] := Saturate_Int32_To_UnsignedInt16 (a[127:96])
+tmp_dst[79:64] := Saturate_Int32_To_UnsignedInt16 (b[31:0])
+tmp_dst[95:80] := Saturate_Int32_To_UnsignedInt16 (b[63:32])
+tmp_dst[111:96] := Saturate_Int32_To_UnsignedInt16 (b[95:64])
+tmp_dst[127:112] := Saturate_Int32_To_UnsignedInt16 (b[127:96])
+tmp_dst[143:128] := Saturate_Int32_To_UnsignedInt16 (a[159:128])
+tmp_dst[159:144] := Saturate_Int32_To_UnsignedInt16 (a[191:160])
+tmp_dst[175:160] := Saturate_Int32_To_UnsignedInt16 (a[223:192])
+tmp_dst[191:176] := Saturate_Int32_To_UnsignedInt16 (a[255:224])
+tmp_dst[207:192] := Saturate_Int32_To_UnsignedInt16 (b[159:128])
+tmp_dst[223:208] := Saturate_Int32_To_UnsignedInt16 (b[191:160])
+tmp_dst[239:224] := Saturate_Int32_To_UnsignedInt16 (b[223:192])
+tmp_dst[255:240] := Saturate_Int32_To_UnsignedInt16 (b[255:224])
+
+FOR j := 0 to 15
+	i := j*16
+	IF k[j]
+		dst[i+15:i] := tmp_dst[i+15:i]
+	ELSE
+		dst[i+15:i] := src[i+15:i]
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vpackusdw"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256i" name="_mm256_maskz_packus_epi32">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512BW</CPUID>
+	<category>Convert</category>
+	<category>Miscellaneous</category>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="a" type="__m256i"/>
+	<parameter varname="b" type="__m256i"/>
+	<description>Convert packed 32-bit integers from "a" and "b" to packed 16-bit integers using unsigned saturation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+	<operation>
+tmp_dst[15:0] := Saturate_Int32_To_UnsignedInt16 (a[31:0])
+tmp_dst[31:16] := Saturate_Int32_To_UnsignedInt16 (a[63:32])
+tmp_dst[47:32] := Saturate_Int32_To_UnsignedInt16 (a[95:64])
+tmp_dst[63:48] := Saturate_Int32_To_UnsignedInt16 (a[127:96])
+tmp_dst[79:64] := Saturate_Int32_To_UnsignedInt16 (b[31:0])
+tmp_dst[95:80] := Saturate_Int32_To_UnsignedInt16 (b[63:32])
+tmp_dst[111:96] := Saturate_Int32_To_UnsignedInt16 (b[95:64])
+tmp_dst[127:112] := Saturate_Int32_To_UnsignedInt16 (b[127:96])
+tmp_dst[143:128] := Saturate_Int32_To_UnsignedInt16 (a[159:128])
+tmp_dst[159:144] := Saturate_Int32_To_UnsignedInt16 (a[191:160])
+tmp_dst[175:160] := Saturate_Int32_To_UnsignedInt16 (a[223:192])
+tmp_dst[191:176] := Saturate_Int32_To_UnsignedInt16 (a[255:224])
+tmp_dst[207:192] := Saturate_Int32_To_UnsignedInt16 (b[159:128])
+tmp_dst[223:208] := Saturate_Int32_To_UnsignedInt16 (b[191:160])
+tmp_dst[239:224] := Saturate_Int32_To_UnsignedInt16 (b[223:192])
+tmp_dst[255:240] := Saturate_Int32_To_UnsignedInt16 (b[255:224])
+
+FOR j := 0 to 15
+	i := j*16
+	IF k[j]
+		dst[i+15:i] := tmp_dst[i+15:i]
+	ELSE
+		dst[i+15:i] := 0
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vpackusdw"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512i" name="_mm512_mask_packus_epi32">
+	<type>Integer</type>
+	<CPUID>AVX512BW</CPUID>
+	<category>Convert</category>
+	<category>Miscellaneous</category>
+	<parameter varname="src" type="__m512i"/>
+	<parameter varname="k" type="__mmask32"/>
+	<parameter varname="a" type="__m512i"/>
+	<parameter varname="b" type="__m512i"/>
+	<description>Convert packed 32-bit integers from "a" and "b" to packed 16-bit integers using unsigned saturation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+	<operation>
+tmp_dst[15:0] := Saturate_Int32_To_UnsignedInt16 (a[31:0])
+tmp_dst[31:16] := Saturate_Int32_To_UnsignedInt16 (a[63:32])
+tmp_dst[47:32] := Saturate_Int32_To_UnsignedInt16 (a[95:64])
+tmp_dst[63:48] := Saturate_Int32_To_UnsignedInt16 (a[127:96])
+tmp_dst[79:64] := Saturate_Int32_To_UnsignedInt16 (b[31:0])
+tmp_dst[95:80] := Saturate_Int32_To_UnsignedInt16 (b[63:32])
+tmp_dst[111:96] := Saturate_Int32_To_UnsignedInt16 (b[95:64])
+tmp_dst[127:112] := Saturate_Int32_To_UnsignedInt16 (b[127:96])
+tmp_dst[143:128] := Saturate_Int32_To_UnsignedInt16 (a[159:128])
+tmp_dst[159:144] := Saturate_Int32_To_UnsignedInt16 (a[191:160])
+tmp_dst[175:160] := Saturate_Int32_To_UnsignedInt16 (a[223:192])
+tmp_dst[191:176] := Saturate_Int32_To_UnsignedInt16 (a[255:224])
+tmp_dst[207:192] := Saturate_Int32_To_UnsignedInt16 (b[159:128])
+tmp_dst[223:208] := Saturate_Int32_To_UnsignedInt16 (b[191:160])
+tmp_dst[239:224] := Saturate_Int32_To_UnsignedInt16 (b[223:192])
+tmp_dst[255:240] := Saturate_Int32_To_UnsignedInt16 (b[255:224])
+tmp_dst[271:256] := Saturate_Int32_To_UnsignedInt16 (a[287:256])
+tmp_dst[287:272] := Saturate_Int32_To_UnsignedInt16 (a[319:288])
+tmp_dst[303:288] := Saturate_Int32_To_UnsignedInt16 (a[351:320])
+tmp_dst[319:304] := Saturate_Int32_To_UnsignedInt16 (a[383:352])
+tmp_dst[335:320] := Saturate_Int32_To_UnsignedInt16 (b[287:256])
+tmp_dst[351:336] := Saturate_Int32_To_UnsignedInt16 (b[319:288])
+tmp_dst[367:352] := Saturate_Int32_To_UnsignedInt16 (b[351:320])
+tmp_dst[383:368] := Saturate_Int32_To_UnsignedInt16 (b[383:352])
+tmp_dst[399:384] := Saturate_Int32_To_UnsignedInt16 (a[415:384])
+tmp_dst[415:400] := Saturate_Int32_To_UnsignedInt16 (a[447:416])
+tmp_dst[431:416] := Saturate_Int32_To_UnsignedInt16 (a[479:448])
+tmp_dst[447:432] := Saturate_Int32_To_UnsignedInt16 (a[511:480])
+tmp_dst[463:448] := Saturate_Int32_To_UnsignedInt16 (b[415:384])
+tmp_dst[479:464] := Saturate_Int32_To_UnsignedInt16 (b[447:416])
+tmp_dst[495:480] := Saturate_Int32_To_UnsignedInt16 (b[479:448])
+tmp_dst[511:496] := Saturate_Int32_To_UnsignedInt16 (b[511:480])
+
+FOR j := 0 to 31
+	i := j*16
+	IF k[j]
+		dst[i+15:i] := tmp_dst[i+15:i]
+	ELSE
+		dst[i+15:i] := src[i+15:i]
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name="vpackusdw"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512i" name="_mm512_maskz_packus_epi32">
+	<type>Integer</type>
+	<CPUID>AVX512BW</CPUID>
+	<category>Convert</category>
+	<category>Miscellaneous</category>
+	<parameter varname="k" type="__mmask32"/>
+	<parameter varname="a" type="__m512i"/>
+	<parameter varname="b" type="__m512i"/>
+	<description>Convert packed 32-bit integers from "a" and "b" to packed 16-bit integers using unsigned saturation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+	<operation>
+tmp_dst[15:0] := Saturate_Int32_To_UnsignedInt16 (a[31:0])
+tmp_dst[31:16] := Saturate_Int32_To_UnsignedInt16 (a[63:32])
+tmp_dst[47:32] := Saturate_Int32_To_UnsignedInt16 (a[95:64])
+tmp_dst[63:48] := Saturate_Int32_To_UnsignedInt16 (a[127:96])
+tmp_dst[79:64] := Saturate_Int32_To_UnsignedInt16 (b[31:0])
+tmp_dst[95:80] := Saturate_Int32_To_UnsignedInt16 (b[63:32])
+tmp_dst[111:96] := Saturate_Int32_To_UnsignedInt16 (b[95:64])
+tmp_dst[127:112] := Saturate_Int32_To_UnsignedInt16 (b[127:96])
+tmp_dst[143:128] := Saturate_Int32_To_UnsignedInt16 (a[159:128])
+tmp_dst[159:144] := Saturate_Int32_To_UnsignedInt16 (a[191:160])
+tmp_dst[175:160] := Saturate_Int32_To_UnsignedInt16 (a[223:192])
+tmp_dst[191:176] := Saturate_Int32_To_UnsignedInt16 (a[255:224])
+tmp_dst[207:192] := Saturate_Int32_To_UnsignedInt16 (b[159:128])
+tmp_dst[223:208] := Saturate_Int32_To_UnsignedInt16 (b[191:160])
+tmp_dst[239:224] := Saturate_Int32_To_UnsignedInt16 (b[223:192])
+tmp_dst[255:240] := Saturate_Int32_To_UnsignedInt16 (b[255:224])
+tmp_dst[271:256] := Saturate_Int32_To_UnsignedInt16 (a[287:256])
+tmp_dst[287:272] := Saturate_Int32_To_UnsignedInt16 (a[319:288])
+tmp_dst[303:288] := Saturate_Int32_To_UnsignedInt16 (a[351:320])
+tmp_dst[319:304] := Saturate_Int32_To_UnsignedInt16 (a[383:352])
+tmp_dst[335:320] := Saturate_Int32_To_UnsignedInt16 (b[287:256])
+tmp_dst[351:336] := Saturate_Int32_To_UnsignedInt16 (b[319:288])
+tmp_dst[367:352] := Saturate_Int32_To_UnsignedInt16 (b[351:320])
+tmp_dst[383:368] := Saturate_Int32_To_UnsignedInt16 (b[383:352])
+tmp_dst[399:384] := Saturate_Int32_To_UnsignedInt16 (a[415:384])
+tmp_dst[415:400] := Saturate_Int32_To_UnsignedInt16 (a[447:416])
+tmp_dst[431:416] := Saturate_Int32_To_UnsignedInt16 (a[479:448])
+tmp_dst[447:432] := Saturate_Int32_To_UnsignedInt16 (a[511:480])
+tmp_dst[463:448] := Saturate_Int32_To_UnsignedInt16 (b[415:384])
+tmp_dst[479:464] := Saturate_Int32_To_UnsignedInt16 (b[447:416])
+tmp_dst[495:480] := Saturate_Int32_To_UnsignedInt16 (b[479:448])
+tmp_dst[511:496] := Saturate_Int32_To_UnsignedInt16 (b[511:480])
+
+FOR j := 0 to 31
+	i := j*16
+	IF k[j]
+		dst[i+15:i] := tmp_dst[i+15:i]
+	ELSE
+		dst[i+15:i] := 0
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name="vpackusdw"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512i" name="_mm512_packus_epi32">
+	<type>Integer</type>
+	<CPUID>AVX512BW</CPUID>
+	<category>Convert</category>
+	<category>Miscellaneous</category>
+	<parameter varname="a" type="__m512i"/>
+	<parameter varname="b" type="__m512i"/>
+	<description>Convert packed 32-bit integers from "a" and "b" to packed 16-bit integers using unsigned saturation, and store the results in "dst".</description>
+	<operation>
+dst[15:0] := Saturate_Int32_To_UnsignedInt16 (a[31:0])
+dst[31:16] := Saturate_Int32_To_UnsignedInt16 (a[63:32])
+dst[47:32] := Saturate_Int32_To_UnsignedInt16 (a[95:64])
+dst[63:48] := Saturate_Int32_To_UnsignedInt16 (a[127:96])
+dst[79:64] := Saturate_Int32_To_UnsignedInt16 (b[31:0])
+dst[95:80] := Saturate_Int32_To_UnsignedInt16 (b[63:32])
+dst[111:96] := Saturate_Int32_To_UnsignedInt16 (b[95:64])
+dst[127:112] := Saturate_Int32_To_UnsignedInt16 (b[127:96])
+dst[143:128] := Saturate_Int32_To_UnsignedInt16 (a[159:128])
+dst[159:144] := Saturate_Int32_To_UnsignedInt16 (a[191:160])
+dst[175:160] := Saturate_Int32_To_UnsignedInt16 (a[223:192])
+dst[191:176] := Saturate_Int32_To_UnsignedInt16 (a[255:224])
+dst[207:192] := Saturate_Int32_To_UnsignedInt16 (b[159:128])
+dst[223:208] := Saturate_Int32_To_UnsignedInt16 (b[191:160])
+dst[239:224] := Saturate_Int32_To_UnsignedInt16 (b[223:192])
+dst[255:240] := Saturate_Int32_To_UnsignedInt16 (b[255:224])
+dst[271:256] := Saturate_Int32_To_UnsignedInt16 (a[287:256])
+dst[287:272] := Saturate_Int32_To_UnsignedInt16 (a[319:288])
+dst[303:288] := Saturate_Int32_To_UnsignedInt16 (a[351:320])
+dst[319:304] := Saturate_Int32_To_UnsignedInt16 (a[383:352])
+dst[335:320] := Saturate_Int32_To_UnsignedInt16 (b[287:256])
+dst[351:336] := Saturate_Int32_To_UnsignedInt16 (b[319:288])
+dst[367:352] := Saturate_Int32_To_UnsignedInt16 (b[351:320])
+dst[383:368] := Saturate_Int32_To_UnsignedInt16 (b[383:352])
+dst[399:384] := Saturate_Int32_To_UnsignedInt16 (a[415:384])
+dst[415:400] := Saturate_Int32_To_UnsignedInt16 (a[447:416])
+dst[431:416] := Saturate_Int32_To_UnsignedInt16 (a[479:448])
+dst[447:432] := Saturate_Int32_To_UnsignedInt16 (a[511:480])
+dst[463:448] := Saturate_Int32_To_UnsignedInt16 (b[415:384])
+dst[479:464] := Saturate_Int32_To_UnsignedInt16 (b[447:416])
+dst[495:480] := Saturate_Int32_To_UnsignedInt16 (b[479:448])
+dst[511:496] := Saturate_Int32_To_UnsignedInt16 (b[511:480])
+dst[MAX:512] := 0
+	</operation>
+	<instruction name="vpackusdw"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm_mask_packus_epi32">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512BW</CPUID>
+	<category>Convert</category>
+	<category>Miscellaneous</category>
+	<parameter varname="src" type="__m128i"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128i"/>
+	<parameter varname="b" type="__m128i"/>
+	<description>Convert packed 32-bit integers from "a" and "b" to packed 16-bit integers using unsigned saturation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+	<operation>
+tmp_dst[15:0] := Saturate_Int32_To_UnsignedInt16 (a[31:0])
+tmp_dst[31:16] := Saturate_Int32_To_UnsignedInt16 (a[63:32])
+tmp_dst[47:32] := Saturate_Int32_To_UnsignedInt16 (a[95:64])
+tmp_dst[63:48] := Saturate_Int32_To_UnsignedInt16 (a[127:96])
+tmp_dst[79:64] := Saturate_Int32_To_UnsignedInt16 (b[31:0])
+tmp_dst[95:80] := Saturate_Int32_To_UnsignedInt16 (b[63:32])
+tmp_dst[111:96] := Saturate_Int32_To_UnsignedInt16 (b[95:64])
+tmp_dst[127:112] := Saturate_Int32_To_UnsignedInt16 (b[127:96])
+
+FOR j := 0 to 7
+	i := j*16
+	IF k[j]
+		dst[i+15:i] := tmp_dst[i+15:i]
+	ELSE
+		dst[i+15:i] := src[i+15:i]
+	FI
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vpackusdw"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm_maskz_packus_epi32">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512BW</CPUID>
+	<category>Convert</category>
+	<category>Miscellaneous</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128i"/>
+	<parameter varname="b" type="__m128i"/>
+	<description>Convert packed 32-bit integers from "a" and "b" to packed 16-bit integers using unsigned saturation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+	<operation>
+tmp_dst[15:0] := Saturate_Int32_To_UnsignedInt16 (a[31:0])
+tmp_dst[31:16] := Saturate_Int32_To_UnsignedInt16 (a[63:32])
+tmp_dst[47:32] := Saturate_Int32_To_UnsignedInt16 (a[95:64])
+tmp_dst[63:48] := Saturate_Int32_To_UnsignedInt16 (a[127:96])
+tmp_dst[79:64] := Saturate_Int32_To_UnsignedInt16 (b[31:0])
+tmp_dst[95:80] := Saturate_Int32_To_UnsignedInt16 (b[63:32])
+tmp_dst[111:96] := Saturate_Int32_To_UnsignedInt16 (b[95:64])
+tmp_dst[127:112] := Saturate_Int32_To_UnsignedInt16 (b[127:96])
+
+FOR j := 0 to 7
+	i := j*16
+	IF k[j]
+		dst[i+15:i] := tmp_dst[i+15:i]
+	ELSE
+		dst[i+15:i] := 0
+	FI
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vpackusdw"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256i" name="_mm256_mask_packus_epi16">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512BW</CPUID>
+	<category>Convert</category>
+	<category>Miscellaneous</category>
+	<parameter varname="src" type="__m256i"/>
+	<parameter varname="k" type="__mmask32"/>
+	<parameter varname="a" type="__m256i"/>
+	<parameter varname="b" type="__m256i"/>
+	<description>Convert packed 16-bit integers from "a" and "b" to packed 8-bit integers using unsigned saturation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+	<operation>
+tmp_dst[7:0] := Saturate_Int16_To_UnsignedInt8 (a[15:0])
+tmp_dst[15:8] := Saturate_Int16_To_UnsignedInt8 (a[31:16])
+tmp_dst[23:16] := Saturate_Int16_To_UnsignedInt8 (a[47:32])
+tmp_dst[31:24] := Saturate_Int16_To_UnsignedInt8 (a[63:48])
+tmp_dst[39:32] := Saturate_Int16_To_UnsignedInt8 (a[79:64])
+tmp_dst[47:40] := Saturate_Int16_To_UnsignedInt8 (a[95:80])
+tmp_dst[55:48] := Saturate_Int16_To_UnsignedInt8 (a[111:96])
+tmp_dst[63:56] := Saturate_Int16_To_UnsignedInt8 (a[127:112])
+tmp_dst[71:64] := Saturate_Int16_To_UnsignedInt8 (b[15:0])
+tmp_dst[79:72] := Saturate_Int16_To_UnsignedInt8 (b[31:16])
+tmp_dst[87:80] := Saturate_Int16_To_UnsignedInt8 (b[47:32])
+tmp_dst[95:88] := Saturate_Int16_To_UnsignedInt8 (b[63:48])
+tmp_dst[103:96] := Saturate_Int16_To_UnsignedInt8 (b[79:64])
+tmp_dst[111:104] := Saturate_Int16_To_UnsignedInt8 (b[95:80])
+tmp_dst[119:112] := Saturate_Int16_To_UnsignedInt8 (b[111:96])
+tmp_dst[127:120] := Saturate_Int16_To_UnsignedInt8 (b[127:112])
+tmp_dst[135:128] := Saturate_Int16_To_UnsignedInt8 (a[143:128])
+tmp_dst[143:136] := Saturate_Int16_To_UnsignedInt8 (a[159:144])
+tmp_dst[151:144] := Saturate_Int16_To_UnsignedInt8 (a[175:160])
+tmp_dst[159:152] := Saturate_Int16_To_UnsignedInt8 (a[191:176])
+tmp_dst[167:160] := Saturate_Int16_To_UnsignedInt8 (a[207:192])
+tmp_dst[175:168] := Saturate_Int16_To_UnsignedInt8 (a[223:208])
+tmp_dst[183:176] := Saturate_Int16_To_UnsignedInt8 (a[239:224])
+tmp_dst[191:184] := Saturate_Int16_To_UnsignedInt8 (a[255:240])
+tmp_dst[199:192] := Saturate_Int16_To_UnsignedInt8 (b[143:128])
+tmp_dst[207:200] := Saturate_Int16_To_UnsignedInt8 (b[159:144])
+tmp_dst[215:208] := Saturate_Int16_To_UnsignedInt8 (b[175:160])
+tmp_dst[223:216] := Saturate_Int16_To_UnsignedInt8 (b[191:176])
+tmp_dst[231:224] := Saturate_Int16_To_UnsignedInt8 (b[207:192])
+tmp_dst[239:232] := Saturate_Int16_To_UnsignedInt8 (b[223:208])
+tmp_dst[247:240] := Saturate_Int16_To_UnsignedInt8 (b[239:224])
+tmp_dst[255:248] := Saturate_Int16_To_UnsignedInt8 (b[255:240])
+
+FOR j := 0 to 31
+	i := j*8
+	IF k[j]
+		dst[i+7:i] := tmp_dst[i+7:i]
+	ELSE
+		dst[i+7:i] := src[i+7:i]
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vpackuswb"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256i" name="_mm256_maskz_packus_epi16">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512BW</CPUID>
+	<category>Convert</category>
+	<category>Miscellaneous</category>
+	<parameter varname="k" type="__mmask32"/>
+	<parameter varname="a" type="__m256i"/>
+	<parameter varname="b" type="__m256i"/>
+	<description>Convert packed 16-bit integers from "a" and "b" to packed 8-bit integers using unsigned saturation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+	<operation>
+tmp_dst[7:0] := Saturate_Int16_To_UnsignedInt8 (a[15:0])
+tmp_dst[15:8] := Saturate_Int16_To_UnsignedInt8 (a[31:16])
+tmp_dst[23:16] := Saturate_Int16_To_UnsignedInt8 (a[47:32])
+tmp_dst[31:24] := Saturate_Int16_To_UnsignedInt8 (a[63:48])
+tmp_dst[39:32] := Saturate_Int16_To_UnsignedInt8 (a[79:64])
+tmp_dst[47:40] := Saturate_Int16_To_UnsignedInt8 (a[95:80])
+tmp_dst[55:48] := Saturate_Int16_To_UnsignedInt8 (a[111:96])
+tmp_dst[63:56] := Saturate_Int16_To_UnsignedInt8 (a[127:112])
+tmp_dst[71:64] := Saturate_Int16_To_UnsignedInt8 (b[15:0])
+tmp_dst[79:72] := Saturate_Int16_To_UnsignedInt8 (b[31:16])
+tmp_dst[87:80] := Saturate_Int16_To_UnsignedInt8 (b[47:32])
+tmp_dst[95:88] := Saturate_Int16_To_UnsignedInt8 (b[63:48])
+tmp_dst[103:96] := Saturate_Int16_To_UnsignedInt8 (b[79:64])
+tmp_dst[111:104] := Saturate_Int16_To_UnsignedInt8 (b[95:80])
+tmp_dst[119:112] := Saturate_Int16_To_UnsignedInt8 (b[111:96])
+tmp_dst[127:120] := Saturate_Int16_To_UnsignedInt8 (b[127:112])
+tmp_dst[135:128] := Saturate_Int16_To_UnsignedInt8 (a[143:128])
+tmp_dst[143:136] := Saturate_Int16_To_UnsignedInt8 (a[159:144])
+tmp_dst[151:144] := Saturate_Int16_To_UnsignedInt8 (a[175:160])
+tmp_dst[159:152] := Saturate_Int16_To_UnsignedInt8 (a[191:176])
+tmp_dst[167:160] := Saturate_Int16_To_UnsignedInt8 (a[207:192])
+tmp_dst[175:168] := Saturate_Int16_To_UnsignedInt8 (a[223:208])
+tmp_dst[183:176] := Saturate_Int16_To_UnsignedInt8 (a[239:224])
+tmp_dst[191:184] := Saturate_Int16_To_UnsignedInt8 (a[255:240])
+tmp_dst[199:192] := Saturate_Int16_To_UnsignedInt8 (b[143:128])
+tmp_dst[207:200] := Saturate_Int16_To_UnsignedInt8 (b[159:144])
+tmp_dst[215:208] := Saturate_Int16_To_UnsignedInt8 (b[175:160])
+tmp_dst[223:216] := Saturate_Int16_To_UnsignedInt8 (b[191:176])
+tmp_dst[231:224] := Saturate_Int16_To_UnsignedInt8 (b[207:192])
+tmp_dst[239:232] := Saturate_Int16_To_UnsignedInt8 (b[223:208])
+tmp_dst[247:240] := Saturate_Int16_To_UnsignedInt8 (b[239:224])
+tmp_dst[255:248] := Saturate_Int16_To_UnsignedInt8 (b[255:240])
+
+FOR j := 0 to 31
+	i := j*8
+	IF k[j]
+		dst[i+7:i] := tmp_dst[i+7:i]
+	ELSE
+		dst[i+7:i] := 0
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vpackuswb"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512i" name="_mm512_mask_packus_epi16">
+	<type>Integer</type>
+	<CPUID>AVX512BW</CPUID>
+	<category>Convert</category>
+	<category>Miscellaneous</category>
+	<parameter varname="src" type="__m512i"/>
+	<parameter varname="k" type="__mmask64"/>
+	<parameter varname="a" type="__m512i"/>
+	<parameter varname="b" type="__m512i"/>
+	<description>Convert packed 16-bit integers from "a" and "b" to packed 8-bit integers using unsigned saturation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+	<operation>
+tmp_dst[7:0] := Saturate_Int16_To_UnsignedInt8 (a[15:0])
+tmp_dst[15:8] := Saturate_Int16_To_UnsignedInt8 (a[31:16])
+tmp_dst[23:16] := Saturate_Int16_To_UnsignedInt8 (a[47:32])
+tmp_dst[31:24] := Saturate_Int16_To_UnsignedInt8 (a[63:48])
+tmp_dst[39:32] := Saturate_Int16_To_UnsignedInt8 (a[79:64])
+tmp_dst[47:40] := Saturate_Int16_To_UnsignedInt8 (a[95:80])
+tmp_dst[55:48] := Saturate_Int16_To_UnsignedInt8 (a[111:96])
+tmp_dst[63:56] := Saturate_Int16_To_UnsignedInt8 (a[127:112])
+tmp_dst[71:64] := Saturate_Int16_To_UnsignedInt8 (b[15:0])
+tmp_dst[79:72] := Saturate_Int16_To_UnsignedInt8 (b[31:16])
+tmp_dst[87:80] := Saturate_Int16_To_UnsignedInt8 (b[47:32])
+tmp_dst[95:88] := Saturate_Int16_To_UnsignedInt8 (b[63:48])
+tmp_dst[103:96] := Saturate_Int16_To_UnsignedInt8 (b[79:64])
+tmp_dst[111:104] := Saturate_Int16_To_UnsignedInt8 (b[95:80])
+tmp_dst[119:112] := Saturate_Int16_To_UnsignedInt8 (b[111:96])
+tmp_dst[127:120] := Saturate_Int16_To_UnsignedInt8 (b[127:112])
+tmp_dst[135:128] := Saturate_Int16_To_UnsignedInt8 (a[143:128])
+tmp_dst[143:136] := Saturate_Int16_To_UnsignedInt8 (a[159:144])
+tmp_dst[151:144] := Saturate_Int16_To_UnsignedInt8 (a[175:160])
+tmp_dst[159:152] := Saturate_Int16_To_UnsignedInt8 (a[191:176])
+tmp_dst[167:160] := Saturate_Int16_To_UnsignedInt8 (a[207:192])
+tmp_dst[175:168] := Saturate_Int16_To_UnsignedInt8 (a[223:208])
+tmp_dst[183:176] := Saturate_Int16_To_UnsignedInt8 (a[239:224])
+tmp_dst[191:184] := Saturate_Int16_To_UnsignedInt8 (a[255:240])
+tmp_dst[199:192] := Saturate_Int16_To_UnsignedInt8 (b[143:128])
+tmp_dst[207:200] := Saturate_Int16_To_UnsignedInt8 (b[159:144])
+tmp_dst[215:208] := Saturate_Int16_To_UnsignedInt8 (b[175:160])
+tmp_dst[223:216] := Saturate_Int16_To_UnsignedInt8 (b[191:176])
+tmp_dst[231:224] := Saturate_Int16_To_UnsignedInt8 (b[207:192])
+tmp_dst[239:232] := Saturate_Int16_To_UnsignedInt8 (b[223:208])
+tmp_dst[247:240] := Saturate_Int16_To_UnsignedInt8 (b[239:224])
+tmp_dst[255:248] := Saturate_Int16_To_UnsignedInt8 (b[255:240])
+tmp_dst[263:256] := Saturate_Int16_To_UnsignedInt8 (a[271:256])
+tmp_dst[271:264] := Saturate_Int16_To_UnsignedInt8 (a[287:272])
+tmp_dst[279:272] := Saturate_Int16_To_UnsignedInt8 (a[303:288])
+tmp_dst[287:280] := Saturate_Int16_To_UnsignedInt8 (a[319:304])
+tmp_dst[295:288] := Saturate_Int16_To_UnsignedInt8 (a[335:320])
+tmp_dst[303:296] := Saturate_Int16_To_UnsignedInt8 (a[351:336])
+tmp_dst[311:304] := Saturate_Int16_To_UnsignedInt8 (a[367:352])
+tmp_dst[319:312] := Saturate_Int16_To_UnsignedInt8 (a[383:368])
+tmp_dst[327:320] := Saturate_Int16_To_UnsignedInt8 (b[271:256])
+tmp_dst[335:328] := Saturate_Int16_To_UnsignedInt8 (b[287:272])
+tmp_dst[343:336] := Saturate_Int16_To_UnsignedInt8 (b[303:288])
+tmp_dst[351:344] := Saturate_Int16_To_UnsignedInt8 (b[319:304])
+tmp_dst[359:352] := Saturate_Int16_To_UnsignedInt8 (b[335:320])
+tmp_dst[367:360] := Saturate_Int16_To_UnsignedInt8 (b[351:336])
+tmp_dst[375:368] := Saturate_Int16_To_UnsignedInt8 (b[367:352])
+tmp_dst[383:376] := Saturate_Int16_To_UnsignedInt8 (b[383:368])
+tmp_dst[391:384] := Saturate_Int16_To_UnsignedInt8 (a[399:384])
+tmp_dst[399:392] := Saturate_Int16_To_UnsignedInt8 (a[415:400])
+tmp_dst[407:400] := Saturate_Int16_To_UnsignedInt8 (a[431:416])
+tmp_dst[415:408] := Saturate_Int16_To_UnsignedInt8 (a[447:432])
+tmp_dst[423:416] := Saturate_Int16_To_UnsignedInt8 (a[463:448])
+tmp_dst[431:424] := Saturate_Int16_To_UnsignedInt8 (a[479:464])
+tmp_dst[439:432] := Saturate_Int16_To_UnsignedInt8 (a[495:480])
+tmp_dst[447:440] := Saturate_Int16_To_UnsignedInt8 (a[511:496])
+tmp_dst[455:448] := Saturate_Int16_To_UnsignedInt8 (b[399:384])
+tmp_dst[463:456] := Saturate_Int16_To_UnsignedInt8 (b[415:400])
+tmp_dst[471:464] := Saturate_Int16_To_UnsignedInt8 (b[431:416])
+tmp_dst[479:472] := Saturate_Int16_To_UnsignedInt8 (b[447:432])
+tmp_dst[487:480] := Saturate_Int16_To_UnsignedInt8 (b[463:448])
+tmp_dst[495:488] := Saturate_Int16_To_UnsignedInt8 (b[479:464])
+tmp_dst[503:496] := Saturate_Int16_To_UnsignedInt8 (b[495:480])
+tmp_dst[511:504] := Saturate_Int16_To_UnsignedInt8 (b[511:496])
+
+FOR j := 0 to 63
+	i := j*8
+	IF k[j]
+		dst[i+7:i] := tmp_dst[i+7:i]
+	ELSE
+		dst[i+7:i] := src[i+7:i]
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name="vpackuswb"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512i" name="_mm512_maskz_packus_epi16">
+	<type>Integer</type>
+	<CPUID>AVX512BW</CPUID>
+	<category>Convert</category>
+	<category>Miscellaneous</category>
+	<parameter varname="k" type="__mmask64"/>
+	<parameter varname="a" type="__m512i"/>
+	<parameter varname="b" type="__m512i"/>
+	<description>Convert packed 16-bit integers from "a" and "b" to packed 8-bit integers using unsigned saturation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+	<operation>
+tmp_dst[7:0] := Saturate_Int16_To_UnsignedInt8 (a[15:0])
+tmp_dst[15:8] := Saturate_Int16_To_UnsignedInt8 (a[31:16])
+tmp_dst[23:16] := Saturate_Int16_To_UnsignedInt8 (a[47:32])
+tmp_dst[31:24] := Saturate_Int16_To_UnsignedInt8 (a[63:48])
+tmp_dst[39:32] := Saturate_Int16_To_UnsignedInt8 (a[79:64])
+tmp_dst[47:40] := Saturate_Int16_To_UnsignedInt8 (a[95:80])
+tmp_dst[55:48] := Saturate_Int16_To_UnsignedInt8 (a[111:96])
+tmp_dst[63:56] := Saturate_Int16_To_UnsignedInt8 (a[127:112])
+tmp_dst[71:64] := Saturate_Int16_To_UnsignedInt8 (b[15:0])
+tmp_dst[79:72] := Saturate_Int16_To_UnsignedInt8 (b[31:16])
+tmp_dst[87:80] := Saturate_Int16_To_UnsignedInt8 (b[47:32])
+tmp_dst[95:88] := Saturate_Int16_To_UnsignedInt8 (b[63:48])
+tmp_dst[103:96] := Saturate_Int16_To_UnsignedInt8 (b[79:64])
+tmp_dst[111:104] := Saturate_Int16_To_UnsignedInt8 (b[95:80])
+tmp_dst[119:112] := Saturate_Int16_To_UnsignedInt8 (b[111:96])
+tmp_dst[127:120] := Saturate_Int16_To_UnsignedInt8 (b[127:112])
+tmp_dst[135:128] := Saturate_Int16_To_UnsignedInt8 (a[143:128])
+tmp_dst[143:136] := Saturate_Int16_To_UnsignedInt8 (a[159:144])
+tmp_dst[151:144] := Saturate_Int16_To_UnsignedInt8 (a[175:160])
+tmp_dst[159:152] := Saturate_Int16_To_UnsignedInt8 (a[191:176])
+tmp_dst[167:160] := Saturate_Int16_To_UnsignedInt8 (a[207:192])
+tmp_dst[175:168] := Saturate_Int16_To_UnsignedInt8 (a[223:208])
+tmp_dst[183:176] := Saturate_Int16_To_UnsignedInt8 (a[239:224])
+tmp_dst[191:184] := Saturate_Int16_To_UnsignedInt8 (a[255:240])
+tmp_dst[199:192] := Saturate_Int16_To_UnsignedInt8 (b[143:128])
+tmp_dst[207:200] := Saturate_Int16_To_UnsignedInt8 (b[159:144])
+tmp_dst[215:208] := Saturate_Int16_To_UnsignedInt8 (b[175:160])
+tmp_dst[223:216] := Saturate_Int16_To_UnsignedInt8 (b[191:176])
+tmp_dst[231:224] := Saturate_Int16_To_UnsignedInt8 (b[207:192])
+tmp_dst[239:232] := Saturate_Int16_To_UnsignedInt8 (b[223:208])
+tmp_dst[247:240] := Saturate_Int16_To_UnsignedInt8 (b[239:224])
+tmp_dst[255:248] := Saturate_Int16_To_UnsignedInt8 (b[255:240])
+tmp_dst[263:256] := Saturate_Int16_To_UnsignedInt8 (a[271:256])
+tmp_dst[271:264] := Saturate_Int16_To_UnsignedInt8 (a[287:272])
+tmp_dst[279:272] := Saturate_Int16_To_UnsignedInt8 (a[303:288])
+tmp_dst[287:280] := Saturate_Int16_To_UnsignedInt8 (a[319:304])
+tmp_dst[295:288] := Saturate_Int16_To_UnsignedInt8 (a[335:320])
+tmp_dst[303:296] := Saturate_Int16_To_UnsignedInt8 (a[351:336])
+tmp_dst[311:304] := Saturate_Int16_To_UnsignedInt8 (a[367:352])
+tmp_dst[319:312] := Saturate_Int16_To_UnsignedInt8 (a[383:368])
+tmp_dst[327:320] := Saturate_Int16_To_UnsignedInt8 (b[271:256])
+tmp_dst[335:328] := Saturate_Int16_To_UnsignedInt8 (b[287:272])
+tmp_dst[343:336] := Saturate_Int16_To_UnsignedInt8 (b[303:288])
+tmp_dst[351:344] := Saturate_Int16_To_UnsignedInt8 (b[319:304])
+tmp_dst[359:352] := Saturate_Int16_To_UnsignedInt8 (b[335:320])
+tmp_dst[367:360] := Saturate_Int16_To_UnsignedInt8 (b[351:336])
+tmp_dst[375:368] := Saturate_Int16_To_UnsignedInt8 (b[367:352])
+tmp_dst[383:376] := Saturate_Int16_To_UnsignedInt8 (b[383:368])
+tmp_dst[391:384] := Saturate_Int16_To_UnsignedInt8 (a[399:384])
+tmp_dst[399:392] := Saturate_Int16_To_UnsignedInt8 (a[415:400])
+tmp_dst[407:400] := Saturate_Int16_To_UnsignedInt8 (a[431:416])
+tmp_dst[415:408] := Saturate_Int16_To_UnsignedInt8 (a[447:432])
+tmp_dst[423:416] := Saturate_Int16_To_UnsignedInt8 (a[463:448])
+tmp_dst[431:424] := Saturate_Int16_To_UnsignedInt8 (a[479:464])
+tmp_dst[439:432] := Saturate_Int16_To_UnsignedInt8 (a[495:480])
+tmp_dst[447:440] := Saturate_Int16_To_UnsignedInt8 (a[511:496])
+tmp_dst[455:448] := Saturate_Int16_To_UnsignedInt8 (b[399:384])
+tmp_dst[463:456] := Saturate_Int16_To_UnsignedInt8 (b[415:400])
+tmp_dst[471:464] := Saturate_Int16_To_UnsignedInt8 (b[431:416])
+tmp_dst[479:472] := Saturate_Int16_To_UnsignedInt8 (b[447:432])
+tmp_dst[487:480] := Saturate_Int16_To_UnsignedInt8 (b[463:448])
+tmp_dst[495:488] := Saturate_Int16_To_UnsignedInt8 (b[479:464])
+tmp_dst[503:496] := Saturate_Int16_To_UnsignedInt8 (b[495:480])
+tmp_dst[511:504] := Saturate_Int16_To_UnsignedInt8 (b[511:496])
+
+FOR j := 0 to 63
+	i := j*8
+	IF k[j]
+		dst[i+7:i] := tmp_dst[i+7:i]
+	ELSE
+		dst[i+7:i] := 0
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name="vpackuswb"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512i" name="_mm512_packus_epi16">
+	<type>Integer</type>
+	<CPUID>AVX512BW</CPUID>
+	<category>Convert</category>
+	<category>Miscellaneous</category>
+	<parameter varname="a" type="__m512i"/>
+	<parameter varname="b" type="__m512i"/>
+	<description>Convert packed 16-bit integers from "a" and "b" to packed 8-bit integers using unsigned saturation, and store the results in "dst".</description>
+	<operation>
+dst[7:0] := Saturate_Int16_To_UnsignedInt8 (a[15:0])
+dst[15:8] := Saturate_Int16_To_UnsignedInt8 (a[31:16])
+dst[23:16] := Saturate_Int16_To_UnsignedInt8 (a[47:32])
+dst[31:24] := Saturate_Int16_To_UnsignedInt8 (a[63:48])
+dst[39:32] := Saturate_Int16_To_UnsignedInt8 (a[79:64])
+dst[47:40] := Saturate_Int16_To_UnsignedInt8 (a[95:80])
+dst[55:48] := Saturate_Int16_To_UnsignedInt8 (a[111:96])
+dst[63:56] := Saturate_Int16_To_UnsignedInt8 (a[127:112])
+dst[71:64] := Saturate_Int16_To_UnsignedInt8 (b[15:0])
+dst[79:72] := Saturate_Int16_To_UnsignedInt8 (b[31:16])
+dst[87:80] := Saturate_Int16_To_UnsignedInt8 (b[47:32])
+dst[95:88] := Saturate_Int16_To_UnsignedInt8 (b[63:48])
+dst[103:96] := Saturate_Int16_To_UnsignedInt8 (b[79:64])
+dst[111:104] := Saturate_Int16_To_UnsignedInt8 (b[95:80])
+dst[119:112] := Saturate_Int16_To_UnsignedInt8 (b[111:96])
+dst[127:120] := Saturate_Int16_To_UnsignedInt8 (b[127:112])
+dst[135:128] := Saturate_Int16_To_UnsignedInt8 (a[143:128])
+dst[143:136] := Saturate_Int16_To_UnsignedInt8 (a[159:144])
+dst[151:144] := Saturate_Int16_To_UnsignedInt8 (a[175:160])
+dst[159:152] := Saturate_Int16_To_UnsignedInt8 (a[191:176])
+dst[167:160] := Saturate_Int16_To_UnsignedInt8 (a[207:192])
+dst[175:168] := Saturate_Int16_To_UnsignedInt8 (a[223:208])
+dst[183:176] := Saturate_Int16_To_UnsignedInt8 (a[239:224])
+dst[191:184] := Saturate_Int16_To_UnsignedInt8 (a[255:240])
+dst[199:192] := Saturate_Int16_To_UnsignedInt8 (b[143:128])
+dst[207:200] := Saturate_Int16_To_UnsignedInt8 (b[159:144])
+dst[215:208] := Saturate_Int16_To_UnsignedInt8 (b[175:160])
+dst[223:216] := Saturate_Int16_To_UnsignedInt8 (b[191:176])
+dst[231:224] := Saturate_Int16_To_UnsignedInt8 (b[207:192])
+dst[239:232] := Saturate_Int16_To_UnsignedInt8 (b[223:208])
+dst[247:240] := Saturate_Int16_To_UnsignedInt8 (b[239:224])
+dst[255:248] := Saturate_Int16_To_UnsignedInt8 (b[255:240])
+dst[263:256] := Saturate_Int16_To_UnsignedInt8 (a[271:256])
+dst[271:264] := Saturate_Int16_To_UnsignedInt8 (a[287:272])
+dst[279:272] := Saturate_Int16_To_UnsignedInt8 (a[303:288])
+dst[287:280] := Saturate_Int16_To_UnsignedInt8 (a[319:304])
+dst[295:288] := Saturate_Int16_To_UnsignedInt8 (a[335:320])
+dst[303:296] := Saturate_Int16_To_UnsignedInt8 (a[351:336])
+dst[311:304] := Saturate_Int16_To_UnsignedInt8 (a[367:352])
+dst[319:312] := Saturate_Int16_To_UnsignedInt8 (a[383:368])
+dst[327:320] := Saturate_Int16_To_UnsignedInt8 (b[271:256])
+dst[335:328] := Saturate_Int16_To_UnsignedInt8 (b[287:272])
+dst[343:336] := Saturate_Int16_To_UnsignedInt8 (b[303:288])
+dst[351:344] := Saturate_Int16_To_UnsignedInt8 (b[319:304])
+dst[359:352] := Saturate_Int16_To_UnsignedInt8 (b[335:320])
+dst[367:360] := Saturate_Int16_To_UnsignedInt8 (b[351:336])
+dst[375:368] := Saturate_Int16_To_UnsignedInt8 (b[367:352])
+dst[383:376] := Saturate_Int16_To_UnsignedInt8 (b[383:368])
+dst[391:384] := Saturate_Int16_To_UnsignedInt8 (a[399:384])
+dst[399:392] := Saturate_Int16_To_UnsignedInt8 (a[415:400])
+dst[407:400] := Saturate_Int16_To_UnsignedInt8 (a[431:416])
+dst[415:408] := Saturate_Int16_To_UnsignedInt8 (a[447:432])
+dst[423:416] := Saturate_Int16_To_UnsignedInt8 (a[463:448])
+dst[431:424] := Saturate_Int16_To_UnsignedInt8 (a[479:464])
+dst[439:432] := Saturate_Int16_To_UnsignedInt8 (a[495:480])
+dst[447:440] := Saturate_Int16_To_UnsignedInt8 (a[511:496])
+dst[455:448] := Saturate_Int16_To_UnsignedInt8 (b[399:384])
+dst[463:456] := Saturate_Int16_To_UnsignedInt8 (b[415:400])
+dst[471:464] := Saturate_Int16_To_UnsignedInt8 (b[431:416])
+dst[479:472] := Saturate_Int16_To_UnsignedInt8 (b[447:432])
+dst[487:480] := Saturate_Int16_To_UnsignedInt8 (b[463:448])
+dst[495:488] := Saturate_Int16_To_UnsignedInt8 (b[479:464])
+dst[503:496] := Saturate_Int16_To_UnsignedInt8 (b[495:480])
+dst[511:504] := Saturate_Int16_To_UnsignedInt8 (b[511:496])
+dst[MAX:512] := 0
+	</operation>
+	<instruction name="vpackuswb"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm_mask_packus_epi16">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512BW</CPUID>
+	<category>Convert</category>
+	<category>Miscellaneous</category>
+	<parameter varname="src" type="__m128i"/>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="a" type="__m128i"/>
+	<parameter varname="b" type="__m128i"/>
+	<description>Convert packed 16-bit integers from "a" and "b" to packed 8-bit integers using unsigned saturation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+	<operation>
+tmp_dst[7:0] := Saturate_Int16_To_UnsignedInt8 (a[15:0])
+tmp_dst[15:8] := Saturate_Int16_To_UnsignedInt8 (a[31:16])
+tmp_dst[23:16] := Saturate_Int16_To_UnsignedInt8 (a[47:32])
+tmp_dst[31:24] := Saturate_Int16_To_UnsignedInt8 (a[63:48])
+tmp_dst[39:32] := Saturate_Int16_To_UnsignedInt8 (a[79:64])
+tmp_dst[47:40] := Saturate_Int16_To_UnsignedInt8 (a[95:80])
+tmp_dst[55:48] := Saturate_Int16_To_UnsignedInt8 (a[111:96])
+tmp_dst[63:56] := Saturate_Int16_To_UnsignedInt8 (a[127:112])
+tmp_dst[71:64] := Saturate_Int16_To_UnsignedInt8 (b[15:0])
+tmp_dst[79:72] := Saturate_Int16_To_UnsignedInt8 (b[31:16])
+tmp_dst[87:80] := Saturate_Int16_To_UnsignedInt8 (b[47:32])
+tmp_dst[95:88] := Saturate_Int16_To_UnsignedInt8 (b[63:48])
+tmp_dst[103:96] := Saturate_Int16_To_UnsignedInt8 (b[79:64])
+tmp_dst[111:104] := Saturate_Int16_To_UnsignedInt8 (b[95:80])
+tmp_dst[119:112] := Saturate_Int16_To_UnsignedInt8 (b[111:96])
+tmp_dst[127:120] := Saturate_Int16_To_UnsignedInt8 (b[127:112])
+
+FOR j := 0 to 15
+	i := j*8
+	IF k[j]
+		dst[i+7:i] := tmp_dst[i+7:i]
+	ELSE
+		dst[i+7:i] := src[i+7:i]
+	FI
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vpackuswb"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm_maskz_packus_epi16">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512BW</CPUID>
+	<category>Convert</category>
+	<category>Miscellaneous</category>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="a" type="__m128i"/>
+	<parameter varname="b" type="__m128i"/>
+	<description>Convert packed 16-bit integers from "a" and "b" to packed 8-bit integers using unsigned saturation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+	<operation>
+tmp_dst[7:0] := Saturate_Int16_To_UnsignedInt8 (a[15:0])
+tmp_dst[15:8] := Saturate_Int16_To_UnsignedInt8 (a[31:16])
+tmp_dst[23:16] := Saturate_Int16_To_UnsignedInt8 (a[47:32])
+tmp_dst[31:24] := Saturate_Int16_To_UnsignedInt8 (a[63:48])
+tmp_dst[39:32] := Saturate_Int16_To_UnsignedInt8 (a[79:64])
+tmp_dst[47:40] := Saturate_Int16_To_UnsignedInt8 (a[95:80])
+tmp_dst[55:48] := Saturate_Int16_To_UnsignedInt8 (a[111:96])
+tmp_dst[63:56] := Saturate_Int16_To_UnsignedInt8 (a[127:112])
+tmp_dst[71:64] := Saturate_Int16_To_UnsignedInt8 (b[15:0])
+tmp_dst[79:72] := Saturate_Int16_To_UnsignedInt8 (b[31:16])
+tmp_dst[87:80] := Saturate_Int16_To_UnsignedInt8 (b[47:32])
+tmp_dst[95:88] := Saturate_Int16_To_UnsignedInt8 (b[63:48])
+tmp_dst[103:96] := Saturate_Int16_To_UnsignedInt8 (b[79:64])
+tmp_dst[111:104] := Saturate_Int16_To_UnsignedInt8 (b[95:80])
+tmp_dst[119:112] := Saturate_Int16_To_UnsignedInt8 (b[111:96])
+tmp_dst[127:120] := Saturate_Int16_To_UnsignedInt8 (b[127:112])
+
+FOR j := 0 to 15
+	i := j*8
+	IF k[j]
+		dst[i+7:i] := tmp_dst[i+7:i]
+	ELSE
+		dst[i+7:i] := 0
+	FI
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vpackuswb"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256i" name="_mm256_mask_add_epi8">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512BW</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="src" type="__m256i"/>
+	<parameter varname="k" type="__mmask32"/>
+	<parameter varname="a" type="__m256i"/>
+	<parameter varname="b" type="__m256i"/>
+	<description>Add packed 8-bit integers in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 31
+	i := j*8
+	IF k[j]
+		dst[i+7:i] := a[i+7:i] + b[i+7:i]
+	ELSE
+		dst[i+7:i] := src[i+7:i]
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vpaddb"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256i" name="_mm256_maskz_add_epi8">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512BW</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="k" type="__mmask32"/>
+	<parameter varname="a" type="__m256i"/>
+	<parameter varname="b" type="__m256i"/>
+	<description>Add packed 8-bit integers in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 31
+	i := j*8
+	IF k[j]
+		dst[i+7:i] := a[i+7:i] + b[i+7:i]
+	ELSE
+		dst[i+7:i] := 0
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vpaddb"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512i" name="_mm512_add_epi8">
+	<type>Integer</type>
+	<CPUID>AVX512BW</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="a" type="__m512i"/>
+	<parameter varname="b" type="__m512i"/>
+	<description>Add packed 8-bit integers in "a" and "b", and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 63
+	i := j*8
+	dst[i+7:i] := a[i+7:i] + b[i+7:i]
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name="vpaddb"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512i" name="_mm512_mask_add_epi8">
+	<type>Integer</type>
+	<CPUID>AVX512BW</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="src" type="__m512i"/>
+	<parameter varname="k" type="__mmask64"/>
+	<parameter varname="a" type="__m512i"/>
+	<parameter varname="b" type="__m512i"/>
+	<description>Add packed 8-bit integers in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 63
+	i := j*8
+	IF k[j]
+		dst[i+7:i] := a[i+7:i] + b[i+7:i]
+	ELSE
+		dst[i+7:i] := src[i+7:i]
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name="vpaddb"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512i" name="_mm512_maskz_add_epi8">
+	<type>Integer</type>
+	<CPUID>AVX512BW</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="k" type="__mmask64"/>
+	<parameter varname="a" type="__m512i"/>
+	<parameter varname="b" type="__m512i"/>
+	<description>Add packed 8-bit integers in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 63
+	i := j*8
+	IF k[j]
+		dst[i+7:i] := a[i+7:i] + b[i+7:i]
+	ELSE
+		dst[i+7:i] := 0
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name="vpaddb"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm_mask_add_epi8">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512BW</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="src" type="__m128i"/>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="a" type="__m128i"/>
+	<parameter varname="b" type="__m128i"/>
+	<description>Add packed 8-bit integers in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 15
+	i := j*8
+	IF k[j]
+		dst[i+7:i] := a[i+7:i] + b[i+7:i]
+	ELSE
+		dst[i+7:i] := src[i+7:i]
+	FI
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vpaddb"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm_maskz_add_epi8">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512BW</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="a" type="__m128i"/>
+	<parameter varname="b" type="__m128i"/>
+	<description>Add packed 8-bit integers in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 15
+	i := j*8
+	IF k[j]
+		dst[i+7:i] := a[i+7:i] + b[i+7:i]
+	ELSE
+		dst[i+7:i] := 0
+	FI
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vpaddb"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256i" name="_mm256_mask_add_epi32">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="src" type="__m256i"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m256i"/>
+	<parameter varname="b" type="__m256i"/>
+	<description>Add packed 32-bit integers in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 7
+	i := j*32
+	IF k[j]
+		dst[i+31:i] := a[i+31:i] + b[i+31:i]
+	ELSE
+		dst[i+31:i] := src[i+31:i]
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vpaddd"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256i" name="_mm256_maskz_add_epi32">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m256i"/>
+	<parameter varname="b" type="__m256i"/>
+	<description>Add packed 32-bit integers in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*32
+	IF k[j]
+		dst[i+31:i] := a[i+31:i] + b[i+31:i]
+	ELSE
+		dst[i+31:i] := 0
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vpaddd"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm_mask_add_epi32">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="src" type="__m128i"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128i"/>
+	<parameter varname="b" type="__m128i"/>
+	<description>Add packed 32-bit integers in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 3
+	i := j*32
+	IF k[j]
+		dst[i+31:i] := a[i+31:i] + b[i+31:i]
+	ELSE
+		dst[i+31:i] := src[i+31:i]
+	FI
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vpaddd"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm_maskz_add_epi32">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128i"/>
+	<parameter varname="b" type="__m128i"/>
+	<description>Add packed 32-bit integers in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 3
+	i := j*32
+	IF k[j]
+		dst[i+31:i] := a[i+31:i] + b[i+31:i]
+	ELSE
+		dst[i+31:i] := 0
+	FI
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vpaddd"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256i" name="_mm256_mask_add_epi64">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="src" type="__m256i"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m256i"/>
+	<parameter varname="b" type="__m256i"/>
+	<description>Add packed 64-bit integers in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 3
+	i := j*64
+	IF k[j]
+		dst[i+63:i] := a[i+63:i] + b[i+63:i]
+	ELSE
+		dst[i+63:i] := src[i+63:i]
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vpaddq"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256i" name="_mm256_maskz_add_epi64">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m256i"/>
+	<parameter varname="b" type="__m256i"/>
+	<description>Add packed 64-bit integers in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 3
+	i := j*64
+	IF k[j]
+		dst[i+63:i] := a[i+63:i] + b[i+63:i]
+	ELSE
+		dst[i+63:i] := 0
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vpaddq"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm_mask_add_epi64">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="src" type="__m128i"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128i"/>
+	<parameter varname="b" type="__m128i"/>
+	<description>Add packed 64-bit integers in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 1
+	i := j*64
+	IF k[j]
+		dst[i+63:i] := a[i+63:i] + b[i+63:i]
+	ELSE
+		dst[i+63:i] := src[i+63:i]
+	FI
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vpaddq"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm_maskz_add_epi64">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128i"/>
+	<parameter varname="b" type="__m128i"/>
+	<description>Add packed 64-bit integers in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 1
+	i := j*64
+	IF k[j]
+		dst[i+63:i] := a[i+63:i] + b[i+63:i]
+	ELSE
+		dst[i+63:i] := 0
+	FI
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vpaddq"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256i" name="_mm256_mask_adds_epi8">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512BW</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="src" type="__m256i"/>
+	<parameter varname="k" type="__mmask32"/>
+	<parameter varname="a" type="__m256i"/>
+	<parameter varname="b" type="__m256i"/>
+	<description>Add packed 8-bit integers in "a" and "b" using saturation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 31
+	i := j*8
+	IF k[j]
+		dst[i+7:i] := Saturate_To_Int8( a[i+7:i] + b[i+7:i] )
+	ELSE
+		dst[i+7:i] := src[i+7:i]
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vpaddsb"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256i" name="_mm256_maskz_adds_epi8">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512BW</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="k" type="__mmask32"/>
+	<parameter varname="a" type="__m256i"/>
+	<parameter varname="b" type="__m256i"/>
+	<description>Add packed 8-bit integers in "a" and "b" using saturation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).
+	</description>
+	<operation>
+FOR j := 0 to 31
+	i := j*8
+	IF k[j]
+		dst[i+7:i] := Saturate_To_Int8( a[i+7:i] + b[i+7:i] )
+	ELSE
+		dst[i+7:i] := 0
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vpaddsb"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512i" name="_mm512_adds_epi8">
+	<type>Integer</type>
+	<CPUID>AVX512BW</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="a" type="__m512i"/>
+	<parameter varname="b" type="__m512i"/>
+	<description>Add packed 8-bit integers in "a" and "b" using saturation, and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 63
+	i := j*8
+	dst[i+7:i] := Saturate_To_Int8( a[i+7:i] + b[i+7:i] )
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name="vpaddsb"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512i" name="_mm512_mask_adds_epi8">
+	<type>Integer</type>
+	<CPUID>AVX512BW</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="src" type="__m512i"/>
+	<parameter varname="k" type="__mmask64"/>
+	<parameter varname="a" type="__m512i"/>
+	<parameter varname="b" type="__m512i"/>
+	<description>Add packed 8-bit integers in "a" and "b" using saturation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 63
+	i := j*8
+	IF k[j]
+		dst[i+7:i] := Saturate_To_Int8( a[i+7:i] + b[i+7:i] )
+	ELSE
+		dst[i+7:i] := src[i+7:i]
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name="vpaddsb"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512i" name="_mm512_maskz_adds_epi8">
+	<type>Integer</type>
+	<CPUID>AVX512BW</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="k" type="__mmask64"/>
+	<parameter varname="a" type="__m512i"/>
+	<parameter varname="b" type="__m512i"/>
+	<description>Add packed 8-bit integers in "a" and "b" using saturation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).
+	</description>
+	<operation>
+FOR j := 0 to 63
+	i := j*8
+	IF k[j]
+		dst[i+7:i] := Saturate_To_Int8( a[i+7:i] + b[i+7:i] )
+	ELSE
+		dst[i+7:i] := 0
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name="vpaddsb"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm_mask_adds_epi8">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512BW</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="src" type="__m128i"/>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="a" type="__m128i"/>
+	<parameter varname="b" type="__m128i"/>
+	<description>Add packed 8-bit integers in "a" and "b" using saturation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 15
+	i := j*8
+	IF k[j]
+		dst[i+7:i] := Saturate_To_Int8( a[i+7:i] + b[i+7:i] )
+	ELSE
+		dst[i+7:i] := src[i+7:i]
+	FI
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vpaddsb"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm_maskz_adds_epi8">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512BW</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="a" type="__m128i"/>
+	<parameter varname="b" type="__m128i"/>
+	<description>Add packed 8-bit integers in "a" and "b" using saturation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).
+	</description>
+	<operation>
+FOR j := 0 to 15
+	i := j*8
+	IF k[j]
+		dst[i+7:i] := Saturate_To_Int8( a[i+7:i] + b[i+7:i] )
+	ELSE
+		dst[i+7:i] := 0
+	FI
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vpaddsb"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256i" name="_mm256_mask_adds_epi16">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512BW</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="src" type="__m256i"/>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="a" type="__m256i"/>
+	<parameter varname="b" type="__m256i"/>
+	<description>Add packed 16-bit integers in "a" and "b" using saturation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 15
+	i := j*16
+	IF k[j]
+		dst[i+15:i] := Saturate_To_Int16( a[i+15:i] + b[i+15:i] )
+	ELSE
+		dst[i+15:i] := src[i+15:i]
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vpaddsw"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256i" name="_mm256_maskz_adds_epi16">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512BW</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="a" type="__m256i"/>
+	<parameter varname="b" type="__m256i"/>
+	<description>Add packed 16-bit integers in "a" and "b" using saturation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).
+	</description>
+	<operation>
+FOR j := 0 to 15
+	i := j*16
+	IF k[j]
+		dst[i+15:i] := Saturate_To_Int16( a[i+15:i] + b[i+15:i] )
+	ELSE
+		dst[i+15:i] := 0
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vpaddsw"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512i" name="_mm512_adds_epi16">
+	<type>Integer</type>
+	<CPUID>AVX512BW</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="a" type="__m512i"/>
+	<parameter varname="b" type="__m512i"/>
+	<description>Add packed 16-bit integers in "a" and "b" using saturation, and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 31
+	i := j*16
+	dst[i+15:i] := Saturate_To_Int16( a[i+15:i] + b[i+15:i] )
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name="vpaddsw"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512i" name="_mm512_mask_adds_epi16">
+	<type>Integer</type>
+	<CPUID>AVX512BW</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="src" type="__m512i"/>
+	<parameter varname="k" type="__mmask32"/>
+	<parameter varname="a" type="__m512i"/>
+	<parameter varname="b" type="__m512i"/>
+	<description>Add packed 16-bit integers in "a" and "b" using saturation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 31
+	i := j*16
+	IF k[j]
+		dst[i+15:i] := Saturate_To_Int16( a[i+15:i] + b[i+15:i] )
+	ELSE
+		dst[i+15:i] := src[i+15:i]
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name="vpaddsw"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512i" name="_mm512_maskz_adds_epi16">
+	<type>Integer</type>
+	<CPUID>AVX512BW</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="k" type="__mmask32"/>
+	<parameter varname="a" type="__m512i"/>
+	<parameter varname="b" type="__m512i"/>
+	<description>Add packed 16-bit integers in "a" and "b" using saturation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).
+	</description>
+	<operation>
+FOR j := 0 to 31
+	i := j*16
+	IF k[j]
+		dst[i+15:i] := Saturate_To_Int16( a[i+15:i] + b[i+15:i] )
+	ELSE
+		dst[i+15:i] := 0
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name="vpaddsw"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm_mask_adds_epi16">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512BW</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="src" type="__m128i"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128i"/>
+	<parameter varname="b" type="__m128i"/>
+	<description>Add packed 16-bit integers in "a" and "b" using saturation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 7
+	i := j*16
+	IF k[j]
+		dst[i+15:i] := Saturate_To_Int16( a[i+15:i] + b[i+15:i] )
+	ELSE
+		dst[i+15:i] := src[i+15:i]
+	FI
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vpaddsw"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm_maskz_adds_epi16">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512BW</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128i"/>
+	<parameter varname="b" type="__m128i"/>
+	<description>Add packed 16-bit integers in "a" and "b" using saturation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).
+	</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*16
+	IF k[j]
+		dst[i+15:i] := Saturate_To_Int16( a[i+15:i] + b[i+15:i] )
+	ELSE
+		dst[i+15:i] := 0
+	FI
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vpaddsw"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256i" name="_mm256_mask_adds_epu8">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512BW</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="src" type="__m256i"/>
+	<parameter varname="k" type="__mmask32"/>
+	<parameter varname="a" type="__m256i"/>
+	<parameter varname="b" type="__m256i"/>
+	<description>Add packed unsigned 8-bit integers in "a" and "b" using saturation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 31
+	i := j*8
+	IF k[j]
+		dst[i+7:i] := Saturate_To_UnsignedInt8( a[i+7:i] + b[i+7:i] )
+	ELSE
+		dst[i+7:i] := src[i+7:i]
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vpaddusb"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256i" name="_mm256_maskz_adds_epu8">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512BW</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="k" type="__mmask32"/>
+	<parameter varname="a" type="__m256i"/>
+	<parameter varname="b" type="__m256i"/>
+	<description>Add packed unsigned 8-bit integers in "a" and "b" using saturation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).
+	</description>
+	<operation>
+FOR j := 0 to 31
+	i := j*8
+	IF k[j]
+		dst[i+7:i] := Saturate_To_UnsignedInt8( a[i+7:i] + b[i+7:i] )
+	ELSE
+		dst[i+7:i] := 0
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vpaddusb"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512i" name="_mm512_adds_epu8">
+	<type>Integer</type>
+	<CPUID>AVX512BW</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="a" type="__m512i"/>
+	<parameter varname="b" type="__m512i"/>
+	<description>Add packed unsigned 8-bit integers in "a" and "b" using saturation, and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 63
+	i := j*8
+	dst[i+7:i] := Saturate_To_UnsignedInt8( a[i+7:i] + b[i+7:i] )
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name="vpaddusb"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512i" name="_mm512_mask_adds_epu8">
+	<type>Integer</type>
+	<CPUID>AVX512BW</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="src" type="__m512i"/>
+	<parameter varname="k" type="__mmask64"/>
+	<parameter varname="a" type="__m512i"/>
+	<parameter varname="b" type="__m512i"/>
+	<description>Add packed unsigned 8-bit integers in "a" and "b" using saturation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 63
+	i := j*8
+	IF k[j]
+		dst[i+7:i] := Saturate_To_UnsignedInt8( a[i+7:i] + b[i+7:i] )
+	ELSE
+		dst[i+7:i] := src[i+7:i]
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name="vpaddusb"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512i" name="_mm512_maskz_adds_epu8">
+	<type>Integer</type>
+	<CPUID>AVX512BW</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="k" type="__mmask64"/>
+	<parameter varname="a" type="__m512i"/>
+	<parameter varname="b" type="__m512i"/>
+	<description>Add packed unsigned 8-bit integers in "a" and "b" using saturation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).
+	</description>
+	<operation>
+FOR j := 0 to 63
+	i := j*8
+	IF k[j]
+		dst[i+7:i] := Saturate_To_UnsignedInt8( a[i+7:i] + b[i+7:i] )
+	ELSE
+		dst[i+7:i] := 0
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name="vpaddusb"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm_mask_adds_epu8">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512BW</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="src" type="__m128i"/>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="a" type="__m128i"/>
+	<parameter varname="b" type="__m128i"/>
+	<description>Add packed unsigned 8-bit integers in "a" and "b" using saturation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 15
+	i := j*8
+	IF k[j]
+		dst[i+7:i] := Saturate_To_UnsignedInt8( a[i+7:i] + b[i+7:i] )
+	ELSE
+		dst[i+7:i] := src[i+7:i]
+	FI
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vpaddusb"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm_maskz_adds_epu8">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512BW</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="a" type="__m128i"/>
+	<parameter varname="b" type="__m128i"/>
+	<description>Add packed unsigned 8-bit integers in "a" and "b" using saturation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).
+	</description>
+	<operation>
+FOR j := 0 to 15
+	i := j*8
+	IF k[j]
+		dst[i+7:i] := Saturate_To_UnsignedInt8( a[i+7:i] + b[i+7:i] )
+	ELSE
+		dst[i+7:i] := 0
+	FI
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vpaddusb"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256i" name="_mm256_mask_adds_epu16">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512BW</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="src" type="__m256i"/>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="a" type="__m256i"/>
+	<parameter varname="b" type="__m256i"/>
+	<description>Add packed unsigned 16-bit integers in "a" and "b" using saturation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 15
+	i := j*16
+	IF k[j]
+		dst[i+15:i] := Saturate_To_UnsignedInt16( a[i+15:i] + b[i+15:i] )
+	ELSE
+		dst[i+15:i] := src[i+15:i]
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vpaddusw"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256i" name="_mm256_maskz_adds_epu16">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512BW</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="a" type="__m256i"/>
+	<parameter varname="b" type="__m256i"/>
+	<description>Add packed unsigned 16-bit integers in "a" and "b" using saturation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).
+	</description>
+	<operation>
+FOR j := 0 to 15
+	i := j*16
+	IF k[j]
+		dst[i+15:i] := Saturate_To_UnsignedInt16( a[i+15:i] + b[i+15:i] )
+	ELSE
+		dst[i+15:i] := 0
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vpaddusw"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512i" name="_mm512_adds_epu16">
+	<type>Integer</type>
+	<CPUID>AVX512BW</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="a" type="__m512i"/>
+	<parameter varname="b" type="__m512i"/>
+	<description>Add packed unsigned 16-bit integers in "a" and "b" using saturation, and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 31
+	i := j*16
+	dst[i+15:i] := Saturate_To_UnsignedInt16( a[i+15:i] + b[i+15:i] )
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name="vpaddusw"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512i" name="_mm512_mask_adds_epu16">
+	<type>Integer</type>
+	<CPUID>AVX512BW</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="src" type="__m512i"/>
+	<parameter varname="k" type="__mmask32"/>
+	<parameter varname="a" type="__m512i"/>
+	<parameter varname="b" type="__m512i"/>
+	<description>Add packed unsigned 16-bit integers in "a" and "b" using saturation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 31
+	i := j*16
+	IF k[j]
+		dst[i+15:i] := Saturate_To_UnsignedInt16( a[i+15:i] + b[i+15:i] )
+	ELSE
+		dst[i+15:i] := src[i+15:i]
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name="vpaddusw"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512i" name="_mm512_maskz_adds_epu16">
+	<type>Integer</type>
+	<CPUID>AVX512BW</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="k" type="__mmask32"/>
+	<parameter varname="a" type="__m512i"/>
+	<parameter varname="b" type="__m512i"/>
+	<description>Add packed unsigned 16-bit integers in "a" and "b" using saturation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).
+	</description>
+	<operation>
+FOR j := 0 to 31
+	i := j*16
+	IF k[j]
+		dst[i+15:i] := Saturate_To_UnsignedInt16( a[i+15:i] + b[i+15:i] )
+	ELSE
+		dst[i+15:i] := 0
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name="vpaddusw"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm_mask_adds_epu16">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512BW</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="src" type="__m128i"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128i"/>
+	<parameter varname="b" type="__m128i"/>
+	<description>Add packed unsigned 16-bit integers in "a" and "b" using saturation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 7
+	i := j*16
+	IF k[j]
+		dst[i+15:i] := Saturate_To_UnsignedInt16( a[i+15:i] + b[i+15:i] )
+	ELSE
+		dst[i+15:i] := src[i+15:i]
+	FI
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vpaddusw"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm_maskz_adds_epu16">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512BW</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128i"/>
+	<parameter varname="b" type="__m128i"/>
+	<description>Add packed unsigned 16-bit integers in "a" and "b" using saturation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).
+	</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*16
+	IF k[j]
+		dst[i+15:i] := Saturate_To_UnsignedInt16( a[i+15:i] + b[i+15:i] )
+	ELSE
+		dst[i+15:i] := 0
+	FI
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vpaddusw"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256i" name="_mm256_mask_add_epi16">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512BW</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="src" type="__m256i"/>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="a" type="__m256i"/>
+	<parameter varname="b" type="__m256i"/>
+	<description>Add packed 16-bit integers in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 15
+	i := j*16
+	IF k[j]
+		dst[i+15:i] := a[i+15:i] + b[i+15:i]
+	ELSE
+		dst[i+15:i] := src[i+15:i]
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vpaddw"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256i" name="_mm256_maskz_add_epi16">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512BW</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="a" type="__m256i"/>
+	<parameter varname="b" type="__m256i"/>
+	<description>Add packed 16-bit integers in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 15
+	i := j*16
+	IF k[j]
+		dst[i+15:i] := a[i+15:i] + b[i+15:i]
+	ELSE
+		dst[i+15:i] := 0
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vpaddw"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512i" name="_mm512_add_epi16">
+	<type>Integer</type>
+	<CPUID>AVX512BW</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="a" type="__m512i"/>
+	<parameter varname="b" type="__m512i"/>
+	<description>Add packed 16-bit integers in "a" and "b", and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 31
+	i := j*16
+	dst[i+15:i] := a[i+15:i] + b[i+15:i]
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name="vpaddw"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512i" name="_mm512_mask_add_epi16">
+	<type>Integer</type>
+	<CPUID>AVX512BW</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="src" type="__m512i"/>
+	<parameter varname="k" type="__mmask32"/>
+	<parameter varname="a" type="__m512i"/>
+	<parameter varname="b" type="__m512i"/>
+	<description>Add packed 16-bit integers in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 31
+	i := j*16
+	IF k[j]
+		dst[i+15:i] := a[i+15:i] + b[i+15:i]
+	ELSE
+		dst[i+15:i] := src[i+15:i]
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name="vpaddw"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512i" name="_mm512_maskz_add_epi16">
+	<type>Integer</type>
+	<CPUID>AVX512BW</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="k" type="__mmask32"/>
+	<parameter varname="a" type="__m512i"/>
+	<parameter varname="b" type="__m512i"/>
+	<description>Add packed 16-bit integers in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 31
+	i := j*16
+	IF k[j]
+		dst[i+15:i] := a[i+15:i] + b[i+15:i]
+	ELSE
+		dst[i+15:i] := 0
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name="vpaddw"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm_mask_add_epi16">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512BW</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="src" type="__m128i"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128i"/>
+	<parameter varname="b" type="__m128i"/>
+	<description>Add packed 16-bit integers in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 7
+	i := j*16
+	IF k[j]
+		dst[i+15:i] := a[i+15:i] + b[i+15:i]
+	ELSE
+		dst[i+15:i] := src[i+15:i]
+	FI
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vpaddw"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm_maskz_add_epi16">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512BW</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128i"/>
+	<parameter varname="b" type="__m128i"/>
+	<description>Add packed 16-bit integers in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*16
+	IF k[j]
+		dst[i+15:i] := a[i+15:i] + b[i+15:i]
+	ELSE
+		dst[i+15:i] := 0
+	FI
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vpaddw"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256i" name="_mm256_mask_alignr_epi8">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512BW</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="src" type="__m256i"/>
+	<parameter varname="k" type="__mmask32"/>
+	<parameter varname="a" type="__m256i"/>
+	<parameter varname="b" type="__m256i"/>
+	<parameter varname="count" type="const int"/>
+	<description>Concatenate pairs of 16-byte blocks in "a" and "b" into a 32-byte temporary result, shift the result right by "count" bytes, and store the low 16 bytes in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 1
+	i := j*128
+	tmp[255:0] := ((a[i+127:i] &lt;&lt; 128) OR b[i+127:i]) &gt;&gt; (count[7:0]*8)
+	tmp_dst[i+127:i] := tmp[127:0]
+ENDFOR
+
+FOR j := 0 to 31
+	i := j*8
+	IF k[j]
+		dst[i+7:i] := tmp_dst[i+7:i]
+	ELSE
+		dst[i+7:i] := src[i+7:i]
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vpalignr"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256i" name="_mm256_maskz_alignr_epi8">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512BW</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="k" type="__mmask32"/>
+	<parameter varname="a" type="__m256i"/>
+	<parameter varname="b" type="__m256i"/>
+	<parameter varname="count" type="const int"/>
+	<description>Concatenate pairs of 16-byte blocks in "a" and "b" into a 32-byte temporary result, shift the result right by "count" bytes, and store the low 16 bytes in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 1
+	i := j*128
+	tmp[255:0] := ((a[i+127:i] &lt;&lt; 128) OR b[i+127:i]) &gt;&gt; (count[7:0]*8)
+	tmp_dst[i+127:i] := tmp[127:0]
+ENDFOR
+
+FOR j := 0 to 31
+	i := j*8
+	IF k[j]
+		dst[i+7:i] := tmp_dst[i+7:i]
+	ELSE
+		dst[i+7:i] := 0
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vpalignr"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512i" name="_mm512_alignr_epi8">
+	<type>Integer</type>
+	<CPUID>AVX512BW</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="a" type="__m512i"/>
+	<parameter varname="b" type="__m512i"/>
+	<parameter varname="count" type="const int"/>
+	<description>Concatenate pairs of 16-byte blocks in "a" and "b" into a 32-byte temporary result, shift the result right by "count" bytes, and store the low 16 bytes in "dst". </description>
+	<operation>
+FOR j := 0 to 3
+	i := j*128
+	tmp[255:0] := ((a[i+127:i] &lt;&lt; 128) OR b[i+127:i]) &gt;&gt; (count[7:0]*8)
+	dst[i+127:i] := tmp[127:0]
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name="vpalignr"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512i" name="_mm512_mask_alignr_epi8">
+	<type>Integer</type>
+	<CPUID>AVX512BW</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="src" type="__m512i"/>
+	<parameter varname="k" type="__mmask64"/>
+	<parameter varname="a" type="__m512i"/>
+	<parameter varname="b" type="__m512i"/>
+	<parameter varname="count" type="const int"/>
+	<description>Concatenate pairs of 16-byte blocks in "a" and "b" into a 32-byte temporary result, shift the result right by "count" bytes, and store the low 16 bytes in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 3
+	i := j*128
+	tmp[255:0] := ((a[i+127:i] &lt;&lt; 128) OR b[i+127:i]) &gt;&gt; (count[7:0]*8)
+	tmp_dst[i+127:i] := tmp[127:0]
+ENDFOR
+
+FOR j := 0 to 63
+	i := j*8
+	IF k[j]
+		dst[i+7:i] := tmp_dst[i+7:i]
+	ELSE
+		dst[i+7:i] := src[i+7:i]
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name="vpalignr"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512i" name="_mm512_maskz_alignr_epi8">
+	<type>Integer</type>
+	<CPUID>AVX512BW</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="k" type="__mmask64"/>
+	<parameter varname="a" type="__m512i"/>
+	<parameter varname="b" type="__m512i"/>
+	<parameter varname="count" type="const int"/>
+	<description>Concatenate pairs of 16-byte blocks in "a" and "b" into a 32-byte temporary result, shift the result right by "count" bytes, and store the low 16 bytes in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 3
+	i := j*128
+	tmp[255:0] := ((a[i+127:i] &lt;&lt; 128) OR b[i+127:i]) &gt;&gt; (count[7:0]*8)
+	tmp_dst[i+127:i] := tmp[127:0]
+ENDFOR
+
+FOR j := 0 to 63
+	i := j*8
+	IF k[j]
+		dst[i+7:i] := tmp_dst[i+7:i]
+	ELSE
+		dst[i+7:i] := 0
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name="vpalignr"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm_mask_alignr_epi8">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512BW</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="src" type="__m128i"/>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="a" type="__m128i"/>
+	<parameter varname="b" type="__m128i"/>
+	<parameter varname="count" type="const int"/>
+	<description>Concatenate pairs of 16-byte blocks in "a" and "b" into a 32-byte temporary result, shift the result right by "count" bytes, and store the low 16 bytes in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). </description>
+	<operation>
+tmp_dst[255:0] := ((a[127:0] &lt;&lt; 128) OR b[127:0]) &gt;&gt; (count[7:0]*8)
+
+FOR j := 0 to 15
+	i := j*8
+	IF k[j]
+		dst[i+7:i] := tmp_dst[i+7:i]
+	ELSE
+		dst[i+7:i] := src[i+7:i]
+	FI
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vpalignr"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm_maskz_alignr_epi8">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512BW</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="a" type="__m128i"/>
+	<parameter varname="b" type="__m128i"/>
+	<parameter varname="count" type="const int"/>
+	<description>Concatenate pairs of 16-byte blocks in "a" and "b" into a 32-byte temporary result, shift the result right by "count" bytes, and store the low 16 bytes in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+	<operation>
+tmp_dst[255:0] := ((a[127:0] &lt;&lt; 128) OR b[127:0]) &gt;&gt; (count[7:0]*8)
+
+FOR j := 0 to 15
+	i := j*8
+	IF k[j]
+		dst[i+7:i] := tmp_dst[i+7:i]
+	ELSE
+		dst[i+7:i] := 0
+	FI
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vpalignr"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256i" name="_mm256_mask_and_epi32">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Logical</category>
+	<parameter varname="src" type="__m256i"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m256i"/>
+	<parameter varname="b" type="__m256i"/>
+	<description>Compute the bitwise AND of packed 32-bit integers in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 7
+	i := j*32
+	IF k[j]
+		dst[i+31:i] := a[i+31:i] AND b[i+31:i]
+	ELSE
+		dst[i+31:i] := src[i+31:i]
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vpandd"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256i" name="_mm256_maskz_and_epi32">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Logical</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m256i"/>
+	<parameter varname="b" type="__m256i"/>
+	<description>Compute the bitwise AND of packed 32-bit integers in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*32
+	IF k[j]
+		dst[i+31:i] := a[i+31:i] AND b[i+31:i]
+	ELSE
+		dst[i+31:i] := 0
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vpandd"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm_mask_and_epi32">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Logical</category>
+	<parameter varname="src" type="__m128i"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128i"/>
+	<parameter varname="b" type="__m128i"/>
+	<description>Compute the bitwise AND of packed 32-bit integers in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 3
+	i := j*32
+	IF k[j]
+		dst[i+31:i] := a[i+31:i] AND b[i+31:i]
+	ELSE
+		dst[i+31:i] := src[i+31:i]
+	FI
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vpandd"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm_maskz_and_epi32">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Logical</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128i"/>
+	<parameter varname="b" type="__m128i"/>
+	<description>Compute the bitwise AND of packed 32-bit integers in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 3
+	i := j*32
+	IF k[j]
+		dst[i+31:i] := a[i+31:i] AND b[i+31:i]
+	ELSE
+		dst[i+31:i] := 0
+	FI
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vpandd"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256i" name="_mm256_mask_andnot_epi32">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Logical</category>
+	<parameter varname="src" type="__m256i"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m256i"/>
+	<parameter varname="b" type="__m256i"/>
+	<description>Compute the bitwise NOT of packed 32-bit integers in "a" and then AND with "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 7
+	i := j*32
+	IF k[j]
+		dst[i+31:i] := ((NOT a[i+31:i]) AND b[i+31:i])
+	ELSE
+		dst[i+31:i] := src[i+31:i]
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vpandnd"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256i" name="_mm256_maskz_andnot_epi32">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Logical</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m256i"/>
+	<parameter varname="b" type="__m256i"/>
+	<description>Compute the bitwise NOT of packed 32-bit integers in "a" and then AND with "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*32
+	IF k[j]
+		dst[i+31:i] := (NOT a[i+31:i]) AND b[i+31:i]
+	ELSE
+		dst[i+31:i] := 0
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vpandnd"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm_mask_andnot_epi32">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Logical</category>
+	<parameter varname="src" type="__m128i"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128i"/>
+	<parameter varname="b" type="__m128i"/>
+	<description>Compute the bitwise NOT of packed 32-bit integers in "a" and then AND with "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 3
+	i := j*32
+	IF k[j]
+		dst[i+31:i] := ((NOT a[i+31:i]) AND b[i+31:i])
+	ELSE
+		dst[i+31:i] := src[i+31:i]
+	FI
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vpandnd"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm_maskz_andnot_epi32">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Logical</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128i"/>
+	<parameter varname="b" type="__m128i"/>
+	<description>Compute the bitwise NOT of packed 32-bit integers in "a" and then AND with "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 3
+	i := j*32
+	IF k[j]
+		dst[i+31:i] := (NOT a[i+31:i]) AND b[i+31:i]
+	ELSE
+		dst[i+31:i] := 0
+	FI
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vpandnd"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256i" name="_mm256_mask_andnot_epi64">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Logical</category>
+	<parameter varname="src" type="__m256i"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m256i"/>
+	<parameter varname="b" type="__m256i"/>
+	<description>Compute the bitwise NOT of packed 64-bit integers in "a" and then AND with "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 3
+	i := j*64
+	IF k[j]
+		dst[i+63:i] := ((NOT a[i+63:i]) AND b[i+63:i])
+	ELSE
+		dst[i+63:i] := src[i+63:i]
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vpandnq"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256i" name="_mm256_maskz_andnot_epi64">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Logical</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m256i"/>
+	<parameter varname="b" type="__m256i"/>
+	<description>Compute the bitwise NOT of packed 64-bit integers in "a" and then AND with "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 3
+	i := j*64
+	IF k[j]
+		dst[i+63:i] := (NOT a[i+63:i]) AND b[i+63:i]
+	ELSE
+		dst[i+63:i] := 0
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vpandnq"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm_mask_andnot_epi64">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Logical</category>
+	<parameter varname="src" type="__m128i"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128i"/>
+	<parameter varname="b" type="__m128i"/>
+	<description>Compute the bitwise NOT of packed 64-bit integers in "a" and then AND with "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 1
+	i := j*64
+	IF k[j]
+		dst[i+63:i] := ((NOT a[i+63:i]) AND b[i+63:i])
+	ELSE
+		dst[i+63:i] := src[i+63:i]
+	FI
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vpandnq"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm_maskz_andnot_epi64">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Logical</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128i"/>
+	<parameter varname="b" type="__m128i"/>
+	<description>Compute the bitwise NOT of packed 64-bit integers in "a" and then AND with "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 1
+	i := j*64
+	IF k[j]
+		dst[i+63:i] := (NOT a[i+63:i]) AND b[i+63:i]
+	ELSE
+		dst[i+63:i] := 0
+	FI
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vpandnq"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256i" name="_mm256_mask_and_epi64">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Logical</category>
+	<parameter varname="src" type="__m256i"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m256i"/>
+	<parameter varname="b" type="__m256i"/>
+	<description>Compute the bitwise AND of packed 64-bit integers in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 3
+	i := j*64
+	IF k[j]
+		dst[i+63:i] := a[i+63:i] AND b[i+63:i]
+	ELSE
+		dst[i+63:i] := src[i+63:i]
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vpandq"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256i" name="_mm256_maskz_and_epi64">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Logical</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m256i"/>
+	<parameter varname="b" type="__m256i"/>
+	<description>Compute the bitwise AND of packed 64-bit integers in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 3
+	i := j*64
+	IF k[j]
+		dst[i+63:i] := a[i+63:i] AND b[i+63:i]
+	ELSE
+		dst[i+63:i] := 0
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vpandq"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm_mask_and_epi64">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Logical</category>
+	<parameter varname="src" type="__m128i"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128i"/>
+	<parameter varname="b" type="__m128i"/>
+	<description>Compute the bitwise AND of packed 64-bit integers in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 1
+	i := j*64
+	IF k[j]
+		dst[i+63:i] := a[i+63:i] AND b[i+63:i]
+	ELSE
+		dst[i+63:i] := src[i+63:i]
+	FI
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vpandq"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm_maskz_and_epi64">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Logical</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128i"/>
+	<parameter varname="b" type="__m128i"/>
+	<description>Compute the bitwise AND of packed 64-bit integers in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 1
+	i := j*64
+	IF k[j]
+		dst[i+63:i] := a[i+63:i] AND b[i+63:i]
+	ELSE
+		dst[i+63:i] := 0
+	FI
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vpandq"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256i" name="_mm256_mask_avg_epu8">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512BW</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="src" type="__m256i"/>
+	<parameter varname="k" type="__mmask32"/>
+	<parameter varname="a" type="__m256i"/>
+	<parameter varname="b" type="__m256i"/>
+	<description>Average packed unsigned 8-bit integers in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 31
+	i := j*8
+	IF k[j]
+		dst[i+7:i] := (a[i+7:i] + b[i+7:i] + 1) &gt;&gt; 1
+	ELSE
+		dst[i+7:i] := src[i+7:i]
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vpavgb"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256i" name="_mm256_maskz_avg_epu8">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512BW</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="k" type="__mmask32"/>
+	<parameter varname="a" type="__m256i"/>
+	<parameter varname="b" type="__m256i"/>
+	<description>Average packed unsigned 8-bit integers in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 31
+	i := j*8
+	IF k[j]
+		dst[i+7:i] := (a[i+7:i] + b[i+7:i] + 1) &gt;&gt; 1
+	ELSE
+		dst[i+7:i] := 0
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vpavgb"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512i" name="_mm512_avg_epu8">
+	<type>Integer</type>
+	<CPUID>AVX512BW</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="a" type="__m512i"/>
+	<parameter varname="b" type="__m512i"/>
+	<description>Average packed unsigned 8-bit integers in "a" and "b", and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 63
+	i := j*8
+	dst[i+7:i] := (a[i+7:i] + b[i+7:i] + 1) &gt;&gt; 1
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name="vpavgb"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512i" name="_mm512_mask_avg_epu8">
+	<type>Integer</type>
+	<CPUID>AVX512BW</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="src" type="__m512i"/>
+	<parameter varname="k" type="__mmask64"/>
+	<parameter varname="a" type="__m512i"/>
+	<parameter varname="b" type="__m512i"/>
+	<description>Average packed unsigned 8-bit integers in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 63
+	i := j*8
+	IF k[j]
+		dst[i+7:i] := (a[i+7:i] + b[i+7:i] + 1) &gt;&gt; 1
+	ELSE
+		dst[i+7:i] := src[i+7:i]
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name="vpavgb"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512i" name="_mm512_maskz_avg_epu8">
+	<type>Integer</type>
+	<CPUID>AVX512BW</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="k" type="__mmask64"/>
+	<parameter varname="a" type="__m512i"/>
+	<parameter varname="b" type="__m512i"/>
+	<description>Average packed unsigned 8-bit integers in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 63
+	i := j*8
+	IF k[j]
+		dst[i+7:i] := (a[i+7:i] + b[i+7:i] + 1) &gt;&gt; 1
+	ELSE
+		dst[i+7:i] := 0
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name="vpavgb"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm_mask_avg_epu8">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512BW</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="src" type="__m128i"/>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="a" type="__m128i"/>
+	<parameter varname="b" type="__m128i"/>
+	<description>Average packed unsigned 8-bit integers in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 15
+	i := j*8
+	IF k[j]
+		dst[i+7:i] := (a[i+7:i] + b[i+7:i] + 1) &gt;&gt; 1
+	ELSE
+		dst[i+7:i] := src[i+7:i]
+	FI
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vpavgb"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm_maskz_avg_epu8">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512BW</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="a" type="__m128i"/>
+	<parameter varname="b" type="__m128i"/>
+	<description>Average packed unsigned 8-bit integers in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 15
+	i := j*8
+	IF k[j]
+		dst[i+7:i] := (a[i+7:i] + b[i+7:i] + 1) &gt;&gt; 1
+	ELSE
+		dst[i+7:i] := 0
+	FI
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vpavgb"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256i" name="_mm256_mask_avg_epu16">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512BW</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="src" type="__m256i"/>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="a" type="__m256i"/>
+	<parameter varname="b" type="__m256i"/>
+	<description>Average packed unsigned 16-bit integers in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 15
+	i := j*16
+	IF k[j]
+		dst[i+15:i] := (a[i+15:i] + b[i+15:i] + 1) &gt;&gt; 1
+	ELSE
+		dst[i+15:i] := src[i+15:i]
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vpavgw"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256i" name="_mm256_maskz_avg_epu16">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512BW</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="a" type="__m256i"/>
+	<parameter varname="b" type="__m256i"/>
+	<description>Average packed unsigned 16-bit integers in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).
+	</description>
+	<operation>
+FOR j := 0 to 15
+	i := j*16
+	IF k[j]
+		dst[i+15:i] := (a[i+15:i] + b[i+15:i] + 1) &gt;&gt; 1
+	ELSE
+		dst[i+15:i] := 0
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vpavgw"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512i" name="_mm512_avg_epu16">
+	<type>Integer</type>
+	<CPUID>AVX512BW</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="a" type="__m512i"/>
+	<parameter varname="b" type="__m512i"/>
+	<description>Average packed unsigned 16-bit integers in "a" and "b", and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 31
+	i := j*16
+	dst[i+15:i] := (a[i+15:i] + b[i+15:i] + 1) &gt;&gt; 1
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name="vpavgw"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512i" name="_mm512_mask_avg_epu16">
+	<type>Integer</type>
+	<CPUID>AVX512BW</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="src" type="__m512i"/>
+	<parameter varname="k" type="__mmask32"/>
+	<parameter varname="a" type="__m512i"/>
+	<parameter varname="b" type="__m512i"/>
+	<description>Average packed unsigned 16-bit integers in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 31
+	i := j*16
+	IF k[j]
+		dst[i+15:i] := (a[i+15:i] + b[i+15:i] + 1) &gt;&gt; 1
+	ELSE
+		dst[i+15:i] := src[i+15:i]
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name="vpavgw"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512i" name="_mm512_maskz_avg_epu16">
+	<type>Integer</type>
+	<CPUID>AVX512BW</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="k" type="__mmask32"/>
+	<parameter varname="a" type="__m512i"/>
+	<parameter varname="b" type="__m512i"/>
+	<description>Average packed unsigned 16-bit integers in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).
+	</description>
+	<operation>
+FOR j := 0 to 31
+	i := j*16
+	IF k[j]
+		dst[i+15:i] := (a[i+15:i] + b[i+15:i] + 1) &gt;&gt; 1
+	ELSE
+		dst[i+15:i] := 0
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name="vpavgw"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm_mask_avg_epu16">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512BW</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="src" type="__m128i"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128i"/>
+	<parameter varname="b" type="__m128i"/>
+	<description>Average packed unsigned 16-bit integers in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 7
+	i := j*16
+	IF k[j]
+		dst[i+15:i] := (a[i+15:i] + b[i+15:i] + 1) &gt;&gt; 1
+	ELSE
+		dst[i+15:i] := src[i+15:i]
+	FI
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vpavgw"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm_maskz_avg_epu16">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512BW</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128i"/>
+	<parameter varname="b" type="__m128i"/>
+	<description>Average packed unsigned 16-bit integers in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).
+	</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*16
+	IF k[j]
+		dst[i+15:i] := (a[i+15:i] + b[i+15:i] + 1) &gt;&gt; 1
+	ELSE
+		dst[i+15:i] := 0
+	FI
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vpavgw"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256i" name="_mm256_mask_blend_epi8">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512BW</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="k" type="__mmask32"/>
+	<parameter varname="a" type="__m256i"/>
+	<parameter varname="b" type="__m256i"/>
+	<description>Blend packed 8-bit integers from "a" and "b" using control mask "k" (each element is taken from "b" when the corresponding mask bit is set, and from "a" otherwise), and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 31
+	i := j*8
+	IF k[j]
+		dst[i+7:i] := b[i+7:i]
+	ELSE
+		dst[i+7:i] := a[i+7:i]
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vpblendmb"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512i" name="_mm512_mask_blend_epi8">
+	<type>Integer</type>
+	<CPUID>AVX512BW</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="k" type="__mmask64"/>
+	<parameter varname="a" type="__m512i"/>
+	<parameter varname="b" type="__m512i"/>
+	<description>Blend packed 8-bit integers from "a" and "b" using control mask "k" (each element is taken from "b" when the corresponding mask bit is set, and from "a" otherwise), and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 63
+	i := j*8
+	IF k[j]
+		dst[i+7:i] := b[i+7:i]
+	ELSE
+		dst[i+7:i] := a[i+7:i]
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name="vpblendmb"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm_mask_blend_epi8">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512BW</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="a" type="__m128i"/>
+	<parameter varname="b" type="__m128i"/>
+	<description>Blend packed 8-bit integers from "a" and "b" using control mask "k" (each element is taken from "b" when the corresponding mask bit is set, and from "a" otherwise), and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 15
+	i := j*8
+	IF k[j]
+		dst[i+7:i] := b[i+7:i]
+	ELSE
+		dst[i+7:i] := a[i+7:i]
+	FI
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vpblendmb"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256i" name="_mm256_mask_blend_epi32">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m256i"/>
+	<parameter varname="b" type="__m256i"/>
+	<description>Blend packed 32-bit integers from "a" and "b" using control mask "k" (each element is taken from "b" when the corresponding mask bit is set, and from "a" otherwise), and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*32
+	IF k[j]
+		dst[i+31:i] := b[i+31:i]
+	ELSE
+		dst[i+31:i] := a[i+31:i]
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vpblendmd"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm_mask_blend_epi32">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128i"/>
+	<parameter varname="b" type="__m128i"/>
+	<description>Blend packed 32-bit integers from "a" and "b" using control mask "k" (each element is taken from "b" when the corresponding mask bit is set, and from "a" otherwise), and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 3
+	i := j*32
+	IF k[j]
+		dst[i+31:i] := b[i+31:i]
+	ELSE
+		dst[i+31:i] := a[i+31:i]
+	FI
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vpblendmd"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256i" name="_mm256_mask_blend_epi64">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m256i"/>
+	<parameter varname="b" type="__m256i"/>
+	<description>Blend packed 64-bit integers from "a" and "b" using control mask "k" (each element is taken from "b" when the corresponding mask bit is set, and from "a" otherwise), and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 3
+	i := j*64
+	IF k[j]
+		dst[i+63:i] := b[i+63:i]
+	ELSE
+		dst[i+63:i] := a[i+63:i]
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vpblendmq"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm_mask_blend_epi64">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128i"/>
+	<parameter varname="b" type="__m128i"/>
+	<description>Blend packed 64-bit integers from "a" and "b" using control mask "k" (each element is taken from "b" when the corresponding mask bit is set, and from "a" otherwise), and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 1
+	i := j*64
+	IF k[j]
+		dst[i+63:i] := b[i+63:i]
+	ELSE
+		dst[i+63:i] := a[i+63:i]
+	FI
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vpblendmq"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256i" name="_mm256_mask_blend_epi16">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512BW</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="a" type="__m256i"/>
+	<parameter varname="b" type="__m256i"/>
+	<description>Blend packed 16-bit integers from "a" and "b" using control mask "k" (each element is taken from "b" when the corresponding mask bit is set, and from "a" otherwise), and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 15
+	i := j*16
+	IF k[j]
+		dst[i+15:i] := b[i+15:i]
+	ELSE
+		dst[i+15:i] := a[i+15:i]
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vpblendmw"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512i" name="_mm512_mask_blend_epi16">
+	<type>Integer</type>
+	<CPUID>AVX512BW</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="k" type="__mmask32"/>
+	<parameter varname="a" type="__m512i"/>
+	<parameter varname="b" type="__m512i"/>
+	<description>Blend packed 16-bit integers from "a" and "b" using control mask "k" (each element is taken from "b" when the corresponding mask bit is set, and from "a" otherwise), and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 31
+	i := j*16
+	IF k[j]
+		dst[i+15:i] := b[i+15:i]
+	ELSE
+		dst[i+15:i] := a[i+15:i]
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name="vpblendmw"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm_mask_blend_epi16">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512BW</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128i"/>
+	<parameter varname="b" type="__m128i"/>
+	<description>Blend packed 16-bit integers from "a" and "b" using control mask "k" (each element is taken from "b" when the corresponding mask bit is set, and from "a" otherwise), and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*16
+	IF k[j]
+		dst[i+15:i] := b[i+15:i]
+	ELSE
+		dst[i+15:i] := a[i+15:i]
+	FI
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vpblendmw"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256i" name="_mm256_mask_broadcastb_epi8">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512BW</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="src" type="__m256i"/>
+	<parameter varname="k" type="__mmask32"/>
+	<parameter varname="a" type="__m128i"/>
+	<description>Broadcast the low packed 8-bit integer from "a" to all elements of "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 31
+	i := j*8
+	IF k[j]
+		dst[i+7:i] := a[7:0]
+	ELSE
+		dst[i+7:i] := src[i+7:i]
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vpbroadcastb"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256i" name="_mm256_mask_set1_epi8">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512BW</CPUID>
+	<category>Set</category>
+	<parameter varname="src" type="__m256i"/>
+	<parameter varname="k" type="__mmask32"/>
+	<parameter varname="a" type="char"/>
+	<description>Broadcast 8-bit integer "a" to all elements of "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 31
+	i := j*8
+	IF k[j]
+		dst[i+7:i] := a[7:0]
+	ELSE
+		dst[i+7:i] := src[i+7:i]
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vpbroadcastb"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256i" name="_mm256_maskz_broadcastb_epi8">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512BW</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="k" type="__mmask32"/>
+	<parameter varname="a" type="__m128i"/>
+	<description>Broadcast the low packed 8-bit integer from "a" to all elements of "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 31
+	i := j*8
+	IF k[j]
+		dst[i+7:i] := a[7:0]
+	ELSE
+		dst[i+7:i] := 0
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vpbroadcastb"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256i" name="_mm256_maskz_set1_epi8">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512BW</CPUID>
+	<category>Set</category>
+	<parameter varname="k" type="__mmask32"/>
+	<parameter varname="a" type="char"/>
+	<description>Broadcast 8-bit integer "a" to all elements of "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 31
+	i := j*8
+	IF k[j]
+		dst[i+7:i] := a[7:0]
+	ELSE
+		dst[i+7:i] := 0
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vpbroadcastb"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512i" name="_mm512_broadcastb_epi8">
+	<type>Integer</type>
+	<CPUID>AVX512BW</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="a" type="__m128i"/>
+	<description>Broadcast the low packed 8-bit integer from "a" to all elements of "dst".</description>
+	<operation>
+FOR j := 0 to 63
+	i := j*8
+	dst[i+7:i] := a[7:0]
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name="vpbroadcastb"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512i" name="_mm512_mask_broadcastb_epi8">
+	<type>Integer</type>
+	<CPUID>AVX512BW</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="src" type="__m512i"/>
+	<parameter varname="k" type="__mmask64"/>
+	<parameter varname="a" type="__m128i"/>
+	<description>Broadcast the low packed 8-bit integer from "a" to all elements of "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 63
+	i := j*8
+	IF k[j]
+		dst[i+7:i] := a[7:0]
+	ELSE
+		dst[i+7:i] := src[i+7:i]
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name="vpbroadcastb"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512i" name="_mm512_mask_set1_epi8">
+	<type>Integer</type>
+	<CPUID>AVX512BW</CPUID>
+	<category>Set</category>
+	<parameter varname="src" type="__m512i"/>
+	<parameter varname="k" type="__mmask64"/>
+	<parameter varname="a" type="char"/>
+	<description>Broadcast 8-bit integer "a" to all elements of "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 63
+	i := j*8
+	IF k[j]
+		dst[i+7:i] := a[7:0]
+	ELSE
+		dst[i+7:i] := src[i+7:i]
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name="vpbroadcastb"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512i" name="_mm512_maskz_broadcastb_epi8">
+	<type>Integer</type>
+	<CPUID>AVX512BW</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="k" type="__mmask64"/>
+	<parameter varname="a" type="__m128i"/>
+	<description>Broadcast the low packed 8-bit integer from "a" to all elements of "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 63
+	i := j*8
+	IF k[j]
+		dst[i+7:i] := a[7:0]
+	ELSE
+		dst[i+7:i] := 0
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name="vpbroadcastb"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512i" name="_mm512_maskz_set1_epi8">
+	<type>Integer</type>
+	<CPUID>AVX512BW</CPUID>
+	<category>Set</category>
+	<parameter varname="k" type="__mmask64"/>
+	<parameter varname="a" type="char"/>
+	<description>Broadcast 8-bit integer "a" to all elements of "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 63
+	i := j*8
+	IF k[j]
+		dst[i+7:i] := a[7:0]
+	ELSE
+		dst[i+7:i] := 0
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name="vpbroadcastb"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm_mask_broadcastb_epi8">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512BW</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="src" type="__m128i"/>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="a" type="__m128i"/>
+	<description>Broadcast the low packed 8-bit integer from "a" to all elements of "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 15
+	i := j*8
+	IF k[j]
+		dst[i+7:i] := a[7:0]
+	ELSE
+		dst[i+7:i] := src[i+7:i]
+	FI
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vpbroadcastb"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm_mask_set1_epi8">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512BW</CPUID>
+	<category>Set</category>
+	<parameter varname="src" type="__m128i"/>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="a" type="char"/>
+	<description>Broadcast 8-bit integer "a" to all elements of "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 15
+	i := j*8
+	IF k[j]
+		dst[i+7:i] := a[7:0]
+	ELSE
+		dst[i+7:i] := src[i+7:i]
+	FI
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vpbroadcastb"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm_maskz_broadcastb_epi8">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512BW</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="a" type="__m128i"/>
+	<description>Broadcast the low packed 8-bit integer from "a" to all elements of "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 15
+	i := j*8
+	IF k[j]
+		dst[i+7:i] := a[7:0]
+	ELSE
+		dst[i+7:i] := 0
+	FI
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vpbroadcastb"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm_maskz_set1_epi8">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512BW</CPUID>
+	<category>Set</category>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="a" type="char"/>
+	<description>Broadcast 8-bit integer "a" to all elements of "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 15
+	i := j*8
+	IF k[j]
+		dst[i+7:i] := a[7:0]
+	ELSE
+		dst[i+7:i] := 0
+	FI
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vpbroadcastb"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256i" name="_mm256_mask_broadcastd_epi32">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="src" type="__m256i"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128i"/>
+	<description>Broadcast the low packed 32-bit integer from "a" to all elements of "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 7
+	i := j*32
+	IF k[j]
+		dst[i+31:i] := a[31:0]
+	ELSE
+		dst[i+31:i] := src[i+31:i]
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vpbroadcastd"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256i" name="_mm256_mask_set1_epi32">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Set</category>
+	<parameter varname="src" type="__m256i"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="int"/>
+	<description>Broadcast 32-bit integer "a" to all elements of "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 7
+	i := j*32
+	IF k[j]
+		dst[i+31:i] := a[31:0]
+	ELSE
+		dst[i+31:i] := src[i+31:i]
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vpbroadcastd"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256i" name="_mm256_maskz_broadcastd_epi32">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128i"/>
+	<description>Broadcast the low packed 32-bit integer from "a" to all elements of "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 7
+	i := j*32
+	IF k[j]
+		dst[i+31:i] := a[31:0]
+	ELSE
+		dst[i+31:i] := 0
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vpbroadcastd"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256i" name="_mm256_maskz_set1_epi32">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Set</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="int"/>
+	<description>Broadcast 32-bit integer "a" to all elements of "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*32
+	IF k[j]
+		dst[i+31:i] := a[31:0]
+	ELSE
+		dst[i+31:i] := 0
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vpbroadcastd"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm_mask_broadcastd_epi32">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="src" type="__m128i"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128i"/>
+	<description>Broadcast the low packed 32-bit integer from "a" to all elements of "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 3
+	i := j*32
+	IF k[j]
+		dst[i+31:i] := a[31:0]
+	ELSE
+		dst[i+31:i] := src[i+31:i]
+	FI
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vpbroadcastd"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm_mask_set1_epi32">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Set</category>
+	<parameter varname="src" type="__m128i"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="int"/>
+	<description>Broadcast 32-bit integer "a" to all elements of "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 3
+	i := j*32
+	IF k[j]
+		dst[i+31:i] := a[31:0]
+	ELSE
+		dst[i+31:i] := src[i+31:i]
+	FI
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vpbroadcastd"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm_maskz_broadcastd_epi32">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128i"/>
+	<description>Broadcast the low packed 32-bit integer from "a" to all elements of "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 3
+	i := j*32
+	IF k[j]
+		dst[i+31:i] := a[31:0]
+	ELSE
+		dst[i+31:i] := 0
+	FI
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vpbroadcastd"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm_maskz_set1_epi32">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Set</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="int"/>
+	<description>Broadcast 32-bit integer "a" to all elements of "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 3
+	i := j*32
+	IF k[j]
+		dst[i+31:i] := a[31:0]
+	ELSE
+		dst[i+31:i] := 0
+	FI
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vpbroadcastd"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256i" name="_mm256_broadcastmb_epi64">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512CD</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="k" type="__mmask8"/>
+	<description>Broadcast the low 8 bits of input mask "k", zero-extended, to all 64-bit elements of "dst".</description>
+	<operation>
+FOR j := 0 to 3
+	i := j*64
+	dst[i+63:i] := ZeroExtend(k[7:0])
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vpbroadcastmb2q"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm_broadcastmb_epi64">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512CD</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="k" type="__mmask8"/>
+	<description>Broadcast the low 8 bits of input mask "k", zero-extended, to all 64-bit elements of "dst".</description>
+	<operation>
+FOR j := 0 to 1
+	i := j*64
+	dst[i+63:i] := ZeroExtend(k[7:0])
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vpbroadcastmb2q"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256i" name="_mm256_broadcastmw_epi32">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512CD</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="k" type="__mmask16"/>
+	<description>Broadcast the low 16 bits of input mask "k", zero-extended, to all 32-bit elements of "dst".</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*32
+	dst[i+31:i] := ZeroExtend(k[15:0])
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vpbroadcastmw2d"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm_broadcastmw_epi32">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512CD</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="k" type="__mmask16"/>
+	<description>Broadcast the low 16 bits of input mask "k", zero-extended, to all 32-bit elements of "dst".</description>
+	<operation>
+FOR j := 0 to 3
+	i := j*32
+	dst[i+31:i] := ZeroExtend(k[15:0])
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vpbroadcastmw2d"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256i" name="_mm256_mask_broadcastq_epi64">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="src" type="__m256i"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128i"/>
+	<description>Broadcast the low packed 64-bit integer from "a" to all elements of "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 3
+	i := j*64
+	IF k[j]
+		dst[i+63:i] := a[63:0]
+	ELSE
+		dst[i+63:i] := src[i+63:i]
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vpbroadcastq"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256i" name="_mm256_mask_set1_epi64">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Set</category>
+	<parameter varname="src" type="__m256i"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__int64"/>
+	<description>Broadcast 64-bit integer "a" to all elements of "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 3
+	i := j*64
+	IF k[j]
+		dst[i+63:i] := a[63:0]
+	ELSE
+		dst[i+63:i] := src[i+63:i]
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vpbroadcastq"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256i" name="_mm256_maskz_broadcastq_epi64">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128i"/>
+	<description>Broadcast the low packed 64-bit integer from "a" to all elements of "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 3
+	i := j*64
+	IF k[j]
+		dst[i+63:i] := a[63:0]
+	ELSE
+		dst[i+63:i] := 0
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vpbroadcastq"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256i" name="_mm256_maskz_set1_epi64">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Set</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__int64"/>
+	<description>Broadcast 64-bit integer "a" to all elements of "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 3
+	i := j*64
+	IF k[j]
+		dst[i+63:i] := a[63:0]
+	ELSE
+		dst[i+63:i] := 0
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vpbroadcastq"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm_mask_broadcastq_epi64">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="src" type="__m128i"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128i"/>
+	<description>Broadcast the low packed 64-bit integer from "a" to all elements of "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 1
+	i := j*64
+	IF k[j]
+		dst[i+63:i] := a[63:0]
+	ELSE
+		dst[i+63:i] := src[i+63:i]
+	FI
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vpbroadcastq"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm_mask_set1_epi64">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Set</category>
+	<parameter varname="src" type="__m128i"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__int64"/>
+	<description>Broadcast 64-bit integer "a" to all elements of "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 1
+	i := j*64
+	IF k[j]
+		dst[i+63:i] := a[63:0]
+	ELSE
+		dst[i+63:i] := src[i+63:i]
+	FI
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vpbroadcastq"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm_maskz_broadcastq_epi64">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128i"/>
+	<description>Broadcast the low packed 64-bit integer from "a" to all elements of "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 1
+	i := j*64
+	IF k[j]
+		dst[i+63:i] := a[63:0]
+	ELSE
+		dst[i+63:i] := 0
+	FI
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vpbroadcastq"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm_maskz_set1_epi64">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Set</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__int64"/>
+	<description>Broadcast 64-bit integer "a" to all elements of "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 1
+	i := j*64
+	IF k[j]
+		dst[i+63:i] := a[63:0]
+	ELSE
+		dst[i+63:i] := 0
+	FI
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vpbroadcastq"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256i" name="_mm256_mask_broadcastw_epi16">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512BW</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="src" type="__m256i"/>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="a" type="__m128i"/>
+	<description>Broadcast the low packed 16-bit integer from "a" to all elements of "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 15
+	i := j*16
+	IF k[j]
+		dst[i+15:i] := a[15:0]
+	ELSE
+		dst[i+15:i] := src[i+15:i]
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vpbroadcastw"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256i" name="_mm256_mask_set1_epi16">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512BW</CPUID>
+	<category>Set</category>
+	<parameter varname="src" type="__m256i"/>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="a" type="short"/>
+	<description>Broadcast 16-bit integer "a" to all elements of "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 15
+	i := j*16
+	IF k[j]
+		dst[i+15:i] := a[15:0]
+	ELSE
+		dst[i+15:i] := src[i+15:i]
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vpbroadcastw"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256i" name="_mm256_maskz_broadcastw_epi16">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512BW</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="a" type="__m128i"/>
+	<description>Broadcast the low packed 16-bit integer from "a" to all elements of "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 15
+	i := j*16
+	IF k[j]
+		dst[i+15:i] := a[15:0]
+	ELSE
+		dst[i+15:i] := 0
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vpbroadcastw"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256i" name="_mm256_maskz_set1_epi16">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512BW</CPUID>
+	<category>Set</category>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="a" type="short"/>
+	<description>Broadcast 16-bit integer "a" to all elements of "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).
+	</description>
+	<operation>
+FOR j := 0 to 15
+	i := j*16
+	IF k[j]
+		dst[i+15:i] := a[15:0]
+	ELSE
+		dst[i+15:i] := 0
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vpbroadcastw"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512i" name="_mm512_broadcastw_epi16">
+	<type>Integer</type>
+	<CPUID>AVX512BW</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="a" type="__m128i"/>
+	<description>Broadcast the low packed 16-bit integer from "a" to all elements of "dst".</description>
+	<operation>
+FOR j := 0 to 31
+	i := j*16
+	dst[i+15:i] := a[15:0]
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name="vpbroadcastw"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512i" name="_mm512_mask_broadcastw_epi16">
+	<type>Integer</type>
+	<CPUID>AVX512BW</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="src" type="__m512i"/>
+	<parameter varname="k" type="__mmask32"/>
+	<parameter varname="a" type="__m128i"/>
+	<description>Broadcast the low packed 16-bit integer from "a" to all elements of "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 31
+	i := j*16
+	IF k[j]
+		dst[i+15:i] := a[15:0]
+	ELSE
+		dst[i+15:i] := src[i+15:i]
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name="vpbroadcastw"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512i" name="_mm512_mask_set1_epi16">
+	<type>Integer</type>
+	<CPUID>AVX512BW</CPUID>
+	<category>Set</category>
+	<parameter varname="src" type="__m512i"/>
+	<parameter varname="k" type="__mmask32"/>
+	<parameter varname="a" type="short"/>
+	<description>Broadcast 16-bit integer "a" to all elements of "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 31
+	i := j*16
+	IF k[j]
+		dst[i+15:i] := a[15:0]
+	ELSE
+		dst[i+15:i] := src[i+15:i]
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name="vpbroadcastw"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512i" name="_mm512_maskz_broadcastw_epi16">
+	<type>Integer</type>
+	<CPUID>AVX512BW</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="k" type="__mmask32"/>
+	<parameter varname="a" type="__m128i"/>
+	<description>Broadcast the low packed 16-bit integer from "a" to all elements of "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 31
+	i := j*16
+	IF k[j]
+		dst[i+15:i] := a[15:0]
+	ELSE
+		dst[i+15:i] := 0
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name="vpbroadcastw"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512i" name="_mm512_maskz_set1_epi16">
+	<type>Integer</type>
+	<CPUID>AVX512BW</CPUID>
+	<category>Set</category>
+	<parameter varname="k" type="__mmask32"/>
+	<parameter varname="a" type="short"/>
+	<description>Broadcast 16-bit integer "a" to all elements of "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).
+	</description>
+	<operation>
+FOR j := 0 to 31
+	i := j*16
+	IF k[j]
+		dst[i+15:i] := a[15:0]
+	ELSE
+		dst[i+15:i] := 0
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name="vpbroadcastw"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm_mask_broadcastw_epi16">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512BW</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="src" type="__m128i"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128i"/>
+	<description>Broadcast the low packed 16-bit integer from "a" to all elements of "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*16
+	IF k[j]
+		dst[i+15:i] := a[15:0]
+	ELSE
+		dst[i+15:i] := src[i+15:i]
+	FI
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vpbroadcastw"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm_mask_set1_epi16">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512BW</CPUID>
+	<category>Set</category>
+	<parameter varname="src" type="__m128i"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="short"/>
+	<description>Broadcast 16-bit integer "a" to all elements of "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 7
+	i := j*16
+	IF k[j]
+		dst[i+15:i] := a[15:0]
+	ELSE
+		dst[i+15:i] := src[i+15:i]
+	FI
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vpbroadcastw"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm_maskz_broadcastw_epi16">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512BW</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128i"/>
+	<description>Broadcast the low packed 16-bit integer from "a" to all elements of "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*16
+	IF k[j]
+		dst[i+15:i] := a[15:0]
+	ELSE
+		dst[i+15:i] := 0
+	FI
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vpbroadcastw"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm_maskz_set1_epi16">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512BW</CPUID>
+	<category>Set</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="short"/>
+	<description>Broadcast 16-bit integer "a" to all elements of "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).
+	</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*16
+	IF k[j]
+		dst[i+15:i] := a[15:0]
+	ELSE
+		dst[i+15:i] := 0
+	FI
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vpbroadcastw"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__mmask32" name="_mm256_cmp_epi8_mask">
+	<type>Integer</type>
+	<type>Mask</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512BW</CPUID>
+	<category>Compare</category>
+	<parameter varname="a" type="__m256i"/>
+	<parameter varname="b" type="__m256i"/>
+	<parameter varname="imm8" type="const int"/>
+	<description>Compare packed 8-bit integers in "a" and "b" based on the comparison operand specified by "imm8", and store the results in mask vector "k".</description>
+	<operation>
+CASE (imm8[7:0]) OF
+0: OP := _MM_CMPINT_EQ
+1: OP := _MM_CMPINT_LT
+2: OP := _MM_CMPINT_LE
+3: OP := _MM_CMPINT_FALSE
+4: OP := _MM_CMPINT_NE
+5: OP := _MM_CMPINT_NLT
+6: OP := _MM_CMPINT_NLE
+7: OP := _MM_CMPINT_TRUE
+ESAC
+FOR j := 0 to 31
+	i := j*8
+	k[j] := ( a[i+7:i] OP b[i+7:i] ) ? 1 : 0
+ENDFOR
+k[MAX:32] := 0
+	</operation>
+	<instruction name="vpcmpb"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__mmask32" name="_mm256_cmpeq_epi8_mask">
+	<type>Integer</type>
+	<type>Mask</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512BW</CPUID>
+	<category>Compare</category>
+	<parameter varname="a" type="__m256i"/>
+	<parameter varname="b" type="__m256i"/>
+	<description>Compare packed 8-bit integers in "a" and "b" for equality, and store the results in mask vector "k".</description>
+	<operation>
+FOR j := 0 to 31
+	i := j*8
+	k[j] := ( a[i+7:i] == b[i+7:i] ) ? 1 : 0
+ENDFOR
+k[MAX:32] := 0
+	</operation>
+	<instruction name="vpcmpb"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__mmask32" name="_mm256_cmpge_epi8_mask">
+	<type>Integer</type>
+	<type>Mask</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512BW</CPUID>
+	<category>Compare</category>
+	<parameter varname="a" type="__m256i"/>
+	<parameter varname="b" type="__m256i"/>
+	<description>Compare packed 8-bit integers in "a" and "b" for greater-than-or-equal, and store the results in mask vector "k".</description>
+	<operation>
+FOR j := 0 to 31
+	i := j*8
+	k[j] := ( a[i+7:i] &gt;= b[i+7:i] ) ? 1 : 0
+ENDFOR
+k[MAX:32] := 0
+	</operation>
+	<instruction name="vpcmpb"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__mmask32" name="_mm256_cmpgt_epi8_mask">
+	<type>Integer</type>
+	<type>Mask</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512BW</CPUID>
+	<category>Compare</category>
+	<parameter varname="a" type="__m256i"/>
+	<parameter varname="b" type="__m256i"/>
+	<description>Compare packed 8-bit integers in "a" and "b" for greater-than, and store the results in mask vector "k".</description>
+	<operation>
+FOR j := 0 to 31
+	i := j*8
+	k[j] := ( a[i+7:i] &gt; b[i+7:i] ) ? 1 : 0
+ENDFOR
+k[MAX:32] := 0
+	</operation>
+	<instruction name="vpcmpb"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__mmask32" name="_mm256_cmple_epi8_mask">
+	<type>Integer</type>
+	<type>Mask</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512BW</CPUID>
+	<category>Compare</category>
+	<parameter varname="a" type="__m256i"/>
+	<parameter varname="b" type="__m256i"/>
+	<description>Compare packed 8-bit integers in "a" and "b" for less-than-or-equal, and store the results in mask vector "k".</description>
+	<operation>
+FOR j := 0 to 31
+	i := j*8
+	k[j] := ( a[i+7:i] &lt;= b[i+7:i] ) ? 1 : 0
+ENDFOR
+k[MAX:32] := 0
+	</operation>
+	<instruction name="vpcmpb"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__mmask32" name="_mm256_cmplt_epi8_mask">
+	<type>Integer</type>
+	<type>Mask</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512BW</CPUID>
+	<category>Compare</category>
+	<parameter varname="a" type="__m256i"/>
+	<parameter varname="b" type="__m256i"/>
+	<description>Compare packed 8-bit integers in "a" and "b" for less-than, and store the results in mask vector "k".</description>
+	<operation>
+FOR j := 0 to 31
+	i := j*8
+	k[j] := ( a[i+7:i] &lt; b[i+7:i] ) ? 1 : 0
+ENDFOR
+k[MAX:32] := 0
+	</operation>
+	<instruction name="vpcmpb"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__mmask32" name="_mm256_cmpneq_epi8_mask">
+	<type>Integer</type>
+	<type>Mask</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512BW</CPUID>
+	<category>Compare</category>
+	<parameter varname="a" type="__m256i"/>
+	<parameter varname="b" type="__m256i"/>
+	<description>Compare packed 8-bit integers in "a" and "b" for not-equal, and store the results in mask vector "k".</description>
+	<operation>
+FOR j := 0 to 31
+	i := j*8
+	k[j] := ( a[i+7:i] != b[i+7:i] ) ? 1 : 0
+ENDFOR
+k[MAX:32] := 0
+	</operation>
+	<instruction name="vpcmpb"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__mmask32" name="_mm256_mask_cmp_epi8_mask">
+	<type>Integer</type>
+	<type>Mask</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512BW</CPUID>
+	<category>Compare</category>
+	<parameter varname="k1" type="__mmask32"/>
+	<parameter varname="a" type="__m256i"/>
+	<parameter varname="b" type="__m256i"/>
+	<parameter varname="imm8" type="const int"/>
+	<description>Compare packed 8-bit integers in "a" and "b" based on the comparison operand specified by "imm8", and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set).</description>
+	<operation>
+CASE (imm8[7:0]) OF
+0: OP := _MM_CMPINT_EQ
+1: OP := _MM_CMPINT_LT
+2: OP := _MM_CMPINT_LE
+3: OP := _MM_CMPINT_FALSE
+4: OP := _MM_CMPINT_NE
+5: OP := _MM_CMPINT_NLT
+6: OP := _MM_CMPINT_NLE
+7: OP := _MM_CMPINT_TRUE
+ESAC
+FOR j := 0 to 31
+	i := j*8
+	IF k1[j]
+		k[j] := ( a[i+7:i] OP b[i+7:i] ) ? 1 : 0
+	ELSE
+		k[j] := 0
+	FI
+ENDFOR
+k[MAX:32] := 0
+	</operation>
+	<instruction name="vpcmpb"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__mmask32" name="_mm256_mask_cmpeq_epi8_mask">
+	<type>Integer</type>
+	<type>Mask</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512BW</CPUID>
+	<category>Compare</category>
+	<parameter varname="k1" type="__mmask32"/>
+	<parameter varname="a" type="__m256i"/>
+	<parameter varname="b" type="__m256i"/>
+	<description>Compare packed 8-bit integers in "a" and "b" for equality, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 31
+	i := j*8
+	IF k1[j]
+		k[j] := ( a[i+7:i] == b[i+7:i] ) ? 1 : 0
+	ELSE
+		k[j] := 0
+	FI
+ENDFOR
+k[MAX:32] := 0
+	</operation>
+	<instruction name="vpcmpb"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__mmask32" name="_mm256_mask_cmpge_epi8_mask">
+	<type>Integer</type>
+	<type>Mask</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512BW</CPUID>
+	<category>Compare</category>
+	<parameter varname="k1" type="__mmask32"/>
+	<parameter varname="a" type="__m256i"/>
+	<parameter varname="b" type="__m256i"/>
+	<description>Compare packed 8-bit integers in "a" and "b" for greater-than-or-equal, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 31
+	i := j*8
+	IF k1[j]
+		k[j] := ( a[i+7:i] &gt;= b[i+7:i] ) ? 1 : 0
+	ELSE
+		k[j] := 0
+	FI
+ENDFOR
+k[MAX:32] := 0
+	</operation>
+	<instruction name="vpcmpb"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__mmask32" name="_mm256_mask_cmpgt_epi8_mask">
+	<type>Integer</type>
+	<type>Mask</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512BW</CPUID>
+	<category>Compare</category>
+	<parameter varname="k1" type="__mmask32"/>
+	<parameter varname="a" type="__m256i"/>
+	<parameter varname="b" type="__m256i"/>
+	<description>Compare packed 8-bit integers in "a" and "b" for greater-than, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 31
+	i := j*8
+	IF k1[j]
+		k[j] := ( a[i+7:i] &gt; b[i+7:i] ) ? 1 : 0
+	ELSE
+		k[j] := 0
+	FI
+ENDFOR
+k[MAX:32] := 0
+	</operation>
+	<instruction name="vpcmpb"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__mmask32" name="_mm256_mask_cmple_epi8_mask">
+	<type>Integer</type>
+	<type>Mask</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512BW</CPUID>
+	<category>Compare</category>
+	<parameter varname="k1" type="__mmask32"/>
+	<parameter varname="a" type="__m256i"/>
+	<parameter varname="b" type="__m256i"/>
+	<description>Compare packed 8-bit integers in "a" and "b" for less-than-or-equal, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 31
+	i := j*8
+	IF k1[j]
+		k[j] := ( a[i+7:i] &lt;= b[i+7:i] ) ? 1 : 0
+	ELSE
+		k[j] := 0
+	FI
+ENDFOR
+k[MAX:32] := 0
+	</operation>
+	<instruction name="vpcmpb"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__mmask32" name="_mm256_mask_cmplt_epi8_mask">
+	<type>Integer</type>
+	<type>Mask</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512BW</CPUID>
+	<category>Compare</category>
+	<parameter varname="k1" type="__mmask32"/>
+	<parameter varname="a" type="__m256i"/>
+	<parameter varname="b" type="__m256i"/>
+	<description>Compare packed 8-bit integers in "a" and "b" for less-than, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 31
+	i := j*8
+	IF k1[j]
+		k[j] := ( a[i+7:i] &lt; b[i+7:i] ) ? 1 : 0
+	ELSE
+		k[j] := 0
+	FI
+ENDFOR
+k[MAX:32] := 0
+	</operation>
+	<instruction name="vpcmpb"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__mmask32" name="_mm256_mask_cmpneq_epi8_mask">
+	<type>Integer</type>
+	<type>Mask</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512BW</CPUID>
+	<category>Compare</category>
+	<parameter varname="k1" type="__mmask32"/>
+	<parameter varname="a" type="__m256i"/>
+	<parameter varname="b" type="__m256i"/>
+	<description>Compare packed 8-bit integers in "a" and "b" for not-equal, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 31
+	i := j*8
+	IF k1[j]
+		k[j] := ( a[i+7:i] != b[i+7:i] ) ? 1 : 0
+	ELSE
+		k[j] := 0
+	FI
+ENDFOR
+k[MAX:32] := 0
+	</operation>
+	<instruction name="vpcmpb"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__mmask64" name="_mm512_cmp_epi8_mask">
+	<type>Integer</type>
+	<type>Mask</type>
+	<CPUID>AVX512BW</CPUID>
+	<category>Compare</category>
+	<parameter varname="a" type="__m512i"/>
+	<parameter varname="b" type="__m512i"/>
+	<parameter varname="imm8" type="const int"/>
+	<description>Compare packed 8-bit integers in "a" and "b" based on the comparison operand specified by "imm8", and store the results in mask vector "k".</description>
+	<operation>
+CASE (imm8[7:0]) OF
+0: OP := _MM_CMPINT_EQ
+1: OP := _MM_CMPINT_LT
+2: OP := _MM_CMPINT_LE
+3: OP := _MM_CMPINT_FALSE
+4: OP := _MM_CMPINT_NE
+5: OP := _MM_CMPINT_NLT
+6: OP := _MM_CMPINT_NLE
+7: OP := _MM_CMPINT_TRUE
+ESAC
+FOR j := 0 to 63
+	i := j*8
+	k[j] := ( a[i+7:i] OP b[i+7:i] ) ? 1 : 0
+ENDFOR
+k[MAX:64] := 0
+	</operation>
+	<instruction name="vpcmpb"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__mmask64" name="_mm512_cmpeq_epi8_mask">
+	<type>Integer</type>
+	<type>Mask</type>
+	<CPUID>AVX512BW</CPUID>
+	<category>Compare</category>
+	<parameter varname="a" type="__m512i"/>
+	<parameter varname="b" type="__m512i"/>
+	<description>Compare packed 8-bit integers in "a" and "b" for equality, and store the results in mask vector "k".</description>
+	<operation>
+FOR j := 0 to 63
+	i := j*8
+	k[j] := ( a[i+7:i] == b[i+7:i] ) ? 1 : 0
+ENDFOR
+k[MAX:64] := 0
+	</operation>
+	<instruction name="vpcmpb"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__mmask64" name="_mm512_cmpge_epi8_mask">
+	<type>Integer</type>
+	<type>Mask</type>
+	<CPUID>AVX512BW</CPUID>
+	<category>Compare</category>
+	<parameter varname="a" type="__m512i"/>
+	<parameter varname="b" type="__m512i"/>
+	<description>Compare packed 8-bit integers in "a" and "b" for greater-than-or-equal, and store the results in mask vector "k".</description>
+	<operation>
+FOR j := 0 to 63
+	i := j*8
+	k[j] := ( a[i+7:i] &gt;= b[i+7:i] ) ? 1 : 0
+ENDFOR
+k[MAX:64] := 0
+	</operation>
+	<instruction name="vpcmpb"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__mmask64" name="_mm512_cmpgt_epi8_mask">
+	<type>Integer</type>
+	<type>Mask</type>
+	<CPUID>AVX512BW</CPUID>
+	<category>Compare</category>
+	<parameter varname="a" type="__m512i"/>
+	<parameter varname="b" type="__m512i"/>
+	<description>Compare packed 8-bit integers in "a" and "b" for greater-than, and store the results in mask vector "k".</description>
+	<operation>
+FOR j := 0 to 63
+	i := j*8
+	k[j] := ( a[i+7:i] &gt; b[i+7:i] ) ? 1 : 0
+ENDFOR
+k[MAX:64] := 0
+	</operation>
+	<instruction name="vpcmpb"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__mmask64" name="_mm512_cmple_epi8_mask">
+	<type>Integer</type>
+	<type>Mask</type>
+	<CPUID>AVX512BW</CPUID>
+	<category>Compare</category>
+	<parameter varname="a" type="__m512i"/>
+	<parameter varname="b" type="__m512i"/>
+	<description>Compare packed 8-bit integers in "a" and "b" for less-than-or-equal, and store the results in mask vector "k".</description>
+	<operation>
+FOR j := 0 to 63
+	i := j*8
+	k[j] := ( a[i+7:i] &lt;= b[i+7:i] ) ? 1 : 0
+ENDFOR
+k[MAX:64] := 0
+	</operation>
+	<instruction name="vpcmpb"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__mmask64" name="_mm512_cmplt_epi8_mask">
+	<type>Integer</type>
+	<type>Mask</type>
+	<CPUID>AVX512BW</CPUID>
+	<category>Compare</category>
+	<parameter varname="a" type="__m512i"/>
+	<parameter varname="b" type="__m512i"/>
+	<description>Compare packed 8-bit integers in "a" and "b" for less-than, and store the results in mask vector "k".</description>
+	<operation>
+FOR j := 0 to 63
+	i := j*8
+	k[j] := ( a[i+7:i] &lt; b[i+7:i] ) ? 1 : 0
+ENDFOR
+k[MAX:64] := 0
+	</operation>
+	<instruction name="vpcmpb"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__mmask64" name="_mm512_cmpneq_epi8_mask">
+	<type>Integer</type>
+	<type>Mask</type>
+	<CPUID>AVX512BW</CPUID>
+	<category>Compare</category>
+	<parameter varname="a" type="__m512i"/>
+	<parameter varname="b" type="__m512i"/>
+	<description>Compare packed 8-bit integers in "a" and "b" for not-equal, and store the results in mask vector "k".</description>
+	<operation>
+FOR j := 0 to 63
+	i := j*8
+	k[j] := ( a[i+7:i] != b[i+7:i] ) ? 1 : 0
+ENDFOR
+k[MAX:64] := 0
+	</operation>
+	<instruction name="vpcmpb"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__mmask64" name="_mm512_mask_cmp_epi8_mask">
+	<type>Integer</type>
+	<type>Mask</type>
+	<CPUID>AVX512BW</CPUID>
+	<category>Compare</category>
+	<parameter varname="k1" type="__mmask64"/>
+	<parameter varname="a" type="__m512i"/>
+	<parameter varname="b" type="__m512i"/>
+	<parameter varname="imm8" type="const int"/>
+	<description>Compare packed 8-bit integers in "a" and "b" based on the comparison operand specified by "imm8", and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set).</description>
+	<operation>
+CASE (imm8[7:0]) OF
+0: OP := _MM_CMPINT_EQ
+1: OP := _MM_CMPINT_LT
+2: OP := _MM_CMPINT_LE
+3: OP := _MM_CMPINT_FALSE
+4: OP := _MM_CMPINT_NE
+5: OP := _MM_CMPINT_NLT
+6: OP := _MM_CMPINT_NLE
+7: OP := _MM_CMPINT_TRUE
+ESAC
+FOR j := 0 to 63
+	i := j*8
+	IF k1[j]
+		k[j] := ( a[i+7:i] OP b[i+7:i] ) ? 1 : 0
+	ELSE
+		k[j] := 0
+	FI
+ENDFOR
+k[MAX:64] := 0
+	</operation>
+	<instruction name="vpcmpb"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__mmask64" name="_mm512_mask_cmpeq_epi8_mask">
+	<type>Integer</type>
+	<type>Mask</type>
+	<CPUID>AVX512BW</CPUID>
+	<category>Compare</category>
+	<parameter varname="k1" type="__mmask64"/>
+	<parameter varname="a" type="__m512i"/>
+	<parameter varname="b" type="__m512i"/>
+	<description>Compare packed 8-bit integers in "a" and "b" for equality, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 63
+	i := j*8
+	IF k1[j]
+		k[j] := ( a[i+7:i] == b[i+7:i] ) ? 1 : 0
+	ELSE
+		k[j] := 0
+	FI
+ENDFOR
+k[MAX:64] := 0
+	</operation>
+	<instruction name="vpcmpb"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__mmask64" name="_mm512_mask_cmpge_epi8_mask">
+	<type>Integer</type>
+	<type>Mask</type>
+	<CPUID>AVX512BW</CPUID>
+	<category>Compare</category>
+	<parameter varname="k1" type="__mmask64"/>
+	<parameter varname="a" type="__m512i"/>
+	<parameter varname="b" type="__m512i"/>
+	<description>Compare packed 8-bit integers in "a" and "b" for greater-than-or-equal, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 63
+	i := j*8
+	IF k1[j]
+		k[j] := ( a[i+7:i] &gt;= b[i+7:i] ) ? 1 : 0
+	ELSE
+		k[j] := 0
+	FI
+ENDFOR
+k[MAX:64] := 0
+	</operation>
+	<instruction name="vpcmpb"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__mmask64" name="_mm512_mask_cmpgt_epi8_mask">
+	<type>Integer</type>
+	<type>Mask</type>
+	<CPUID>AVX512BW</CPUID>
+	<category>Compare</category>
+	<parameter varname="k1" type="__mmask64"/>
+	<parameter varname="a" type="__m512i"/>
+	<parameter varname="b" type="__m512i"/>
+	<description>Compare packed 8-bit integers in "a" and "b" for greater-than, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 63
+	i := j*8
+	IF k1[j]
+		k[j] := ( a[i+7:i] &gt; b[i+7:i] ) ? 1 : 0
+	ELSE
+		k[j] := 0
+	FI
+ENDFOR
+k[MAX:64] := 0
+	</operation>
+	<instruction name="vpcmpb"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__mmask64" name="_mm512_mask_cmple_epi8_mask">
+	<type>Integer</type>
+	<type>Mask</type>
+	<CPUID>AVX512BW</CPUID>
+	<category>Compare</category>
+	<parameter varname="k1" type="__mmask64"/>
+	<parameter varname="a" type="__m512i"/>
+	<parameter varname="b" type="__m512i"/>
+	<description>Compare packed 8-bit integers in "a" and "b" for less-than-or-equal, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 63
+	i := j*8
+	IF k1[j]
+		k[j] := ( a[i+7:i] &lt;= b[i+7:i] ) ? 1 : 0
+	ELSE
+		k[j] := 0
+	FI
+ENDFOR
+k[MAX:64] := 0
+	</operation>
+	<instruction name="vpcmpb"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__mmask64" name="_mm512_mask_cmplt_epi8_mask">
+	<type>Integer</type>
+	<type>Mask</type>
+	<CPUID>AVX512BW</CPUID>
+	<category>Compare</category>
+	<parameter varname="k1" type="__mmask64"/>
+	<parameter varname="a" type="__m512i"/>
+	<parameter varname="b" type="__m512i"/>
+	<description>Compare packed 8-bit integers in "a" and "b" for less-than, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 63
+	i := j*8
+	IF k1[j]
+		k[j] := ( a[i+7:i] &lt; b[i+7:i] ) ? 1 : 0
+	ELSE
+		k[j] := 0
+	FI
+ENDFOR
+k[MAX:64] := 0
+	</operation>
+	<instruction name="vpcmpb"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__mmask64" name="_mm512_mask_cmpneq_epi8_mask">
+	<type>Integer</type>
+	<type>Mask</type>
+	<CPUID>AVX512BW</CPUID>
+	<category>Compare</category>
+	<parameter varname="k1" type="__mmask64"/>
+	<parameter varname="a" type="__m512i"/>
+	<parameter varname="b" type="__m512i"/>
+	<description>Compare packed 8-bit integers in "a" and "b" for not-equal, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 63
+	i := j*8
+	IF k1[j]
+		k[j] := ( a[i+7:i] != b[i+7:i] ) ? 1 : 0
+	ELSE
+		k[j] := 0
+	FI
+ENDFOR
+k[MAX:64] := 0
+	</operation>
+	<instruction name="vpcmpb"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__mmask16" name="_mm_cmp_epi8_mask">
+	<type>Integer</type>
+	<type>Mask</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512BW</CPUID>
+	<category>Compare</category>
+	<parameter varname="a" type="__m128i"/>
+	<parameter varname="b" type="__m128i"/>
+	<parameter varname="imm8" type="const int"/>
+	<description>Compare packed 8-bit integers in "a" and "b" based on the comparison operand specified by "imm8", and store the results in mask vector "k".</description>
+	<operation>
+CASE (imm8[7:0]) OF
+0: OP := _MM_CMPINT_EQ
+1: OP := _MM_CMPINT_LT
+2: OP := _MM_CMPINT_LE
+3: OP := _MM_CMPINT_FALSE
+4: OP := _MM_CMPINT_NE
+5: OP := _MM_CMPINT_NLT
+6: OP := _MM_CMPINT_NLE
+7: OP := _MM_CMPINT_TRUE
+ESAC
+FOR j := 0 to 15
+	i := j*8
+	k[j] := ( a[i+7:i] OP b[i+7:i] ) ? 1 : 0
+ENDFOR
+k[MAX:16] := 0
+	</operation>
+	<instruction name="vpcmpb"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__mmask16" name="_mm_cmpeq_epi8_mask">
+	<type>Integer</type>
+	<type>Mask</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512BW</CPUID>
+	<category>Compare</category>
+	<parameter varname="a" type="__m128i"/>
+	<parameter varname="b" type="__m128i"/>
+	<description>Compare packed 8-bit integers in "a" and "b" for equality, and store the results in mask vector "k".</description>
+	<operation>
+FOR j := 0 to 15
+	i := j*8
+	k[j] := ( a[i+7:i] == b[i+7:i] ) ? 1 : 0
+ENDFOR
+k[MAX:16] := 0
+	</operation>
+	<instruction name="vpcmpb"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__mmask16" name="_mm_cmpge_epi8_mask">
+	<type>Integer</type>
+	<type>Mask</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512BW</CPUID>
+	<category>Compare</category>
+	<parameter varname="a" type="__m128i"/>
+	<parameter varname="b" type="__m128i"/>
+	<description>Compare packed 8-bit integers in "a" and "b" for greater-than-or-equal, and store the results in mask vector "k".</description>
+	<operation>
+FOR j := 0 to 15
+	i := j*8
+	k[j] := ( a[i+7:i] &gt;= b[i+7:i] ) ? 1 : 0
+ENDFOR
+k[MAX:16] := 0
+	</operation>
+	<instruction name="vpcmpb"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__mmask16" name="_mm_cmpgt_epi8_mask">
+	<type>Integer</type>
+	<type>Mask</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512BW</CPUID>
+	<category>Compare</category>
+	<parameter varname="a" type="__m128i"/>
+	<parameter varname="b" type="__m128i"/>
+	<description>Compare packed 8-bit integers in "a" and "b" for greater-than, and store the results in mask vector "k".</description>
+	<operation>
+FOR j := 0 to 15
+	i := j*8
+	k[j] := ( a[i+7:i] &gt; b[i+7:i] ) ? 1 : 0
+ENDFOR
+k[MAX:16] := 0
+	</operation>
+	<instruction name="vpcmpb"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__mmask16" name="_mm_cmple_epi8_mask">
+	<type>Integer</type>
+	<type>Mask</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512BW</CPUID>
+	<category>Compare</category>
+	<parameter varname="a" type="__m128i"/>
+	<parameter varname="b" type="__m128i"/>
+	<description>Compare packed 8-bit integers in "a" and "b" for less-than-or-equal, and store the results in mask vector "k".</description>
+	<operation>
+FOR j := 0 to 15
+	i := j*8
+	k[j] := ( a[i+7:i] &lt;= b[i+7:i] ) ? 1 : 0
+ENDFOR
+k[MAX:16] := 0
+	</operation>
+	<instruction name="vpcmpb"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__mmask16" name="_mm_cmplt_epi8_mask">
+	<type>Integer</type>
+	<type>Mask</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512BW</CPUID>
+	<category>Compare</category>
+	<parameter varname="a" type="__m128i"/>
+	<parameter varname="b" type="__m128i"/>
+	<description>Compare packed 8-bit integers in "a" and "b" for less-than, and store the results in mask vector "k".</description>
+	<operation>
+FOR j := 0 to 15
+	i := j*8
+	k[j] := ( a[i+7:i] &lt; b[i+7:i] ) ? 1 : 0
+ENDFOR
+k[MAX:16] := 0
+	</operation>
+	<instruction name="vpcmpb"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__mmask16" name="_mm_cmpneq_epi8_mask">
+	<type>Integer</type>
+	<type>Mask</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512BW</CPUID>
+	<category>Compare</category>
+	<parameter varname="a" type="__m128i"/>
+	<parameter varname="b" type="__m128i"/>
+	<description>Compare packed 8-bit integers in "a" and "b" for not-equal, and store the results in mask vector "k".</description>
+	<operation>
+FOR j := 0 to 15
+	i := j*8
+	k[j] := ( a[i+7:i] != b[i+7:i] ) ? 1 : 0
+ENDFOR
+k[MAX:16] := 0
+	</operation>
+	<instruction name="vpcmpb"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__mmask16" name="_mm_mask_cmp_epi8_mask">
+	<type>Integer</type>
+	<type>Mask</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512BW</CPUID>
+	<category>Compare</category>
+	<parameter varname="k1" type="__mmask16"/>
+	<parameter varname="a" type="__m128i"/>
+	<parameter varname="b" type="__m128i"/>
+	<parameter varname="imm8" type="const int"/>
+	<description>Compare packed 8-bit integers in "a" and "b" based on the comparison operand specified by "imm8", and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set).</description>
+	<operation>
+CASE (imm8[7:0]) OF
+0: OP := _MM_CMPINT_EQ
+1: OP := _MM_CMPINT_LT
+2: OP := _MM_CMPINT_LE
+3: OP := _MM_CMPINT_FALSE
+4: OP := _MM_CMPINT_NE
+5: OP := _MM_CMPINT_NLT
+6: OP := _MM_CMPINT_NLE
+7: OP := _MM_CMPINT_TRUE
+ESAC
+FOR j := 0 to 15
+	i := j*8
+	IF k1[j]
+		k[j] := ( a[i+7:i] OP b[i+7:i] ) ? 1 : 0
+	ELSE
+		k[j] := 0
+	FI
+ENDFOR
+k[MAX:16] := 0
+	</operation>
+	<instruction name="vpcmpb"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__mmask16" name="_mm_mask_cmpeq_epi8_mask">
+	<type>Integer</type>
+	<type>Mask</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512BW</CPUID>
+	<category>Compare</category>
+	<parameter varname="k1" type="__mmask16"/>
+	<parameter varname="a" type="__m128i"/>
+	<parameter varname="b" type="__m128i"/>
+	<description>Compare packed 8-bit integers in "a" and "b" for equality, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 15
+	i := j*8
+	IF k1[j]
+		k[j] := ( a[i+7:i] == b[i+7:i] ) ? 1 : 0
+	ELSE
+		k[j] := 0
+	FI
+ENDFOR
+k[MAX:16] := 0
+	</operation>
+	<instruction name="vpcmpb"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__mmask16" name="_mm_mask_cmpge_epi8_mask">
+	<type>Integer</type>
+	<type>Mask</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512BW</CPUID>
+	<category>Compare</category>
+	<parameter varname="k1" type="__mmask16"/>
+	<parameter varname="a" type="__m128i"/>
+	<parameter varname="b" type="__m128i"/>
+	<description>Compare packed 8-bit integers in "a" and "b" for greater-than-or-equal, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 15
+	i := j*8
+	IF k1[j]
+		k[j] := ( a[i+7:i] &gt;= b[i+7:i] ) ? 1 : 0
+	ELSE
+		k[j] := 0
+	FI
+ENDFOR
+k[MAX:16] := 0
+	</operation>
+	<instruction name="vpcmpb"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__mmask16" name="_mm_mask_cmpgt_epi8_mask">
+	<type>Integer</type>
+	<type>Mask</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512BW</CPUID>
+	<category>Compare</category>
+	<parameter varname="k1" type="__mmask16"/>
+	<parameter varname="a" type="__m128i"/>
+	<parameter varname="b" type="__m128i"/>
+	<description>Compare packed 8-bit integers in "a" and "b" for greater-than, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 15
+	i := j*8
+	IF k1[j]
+		k[j] := ( a[i+7:i] &gt; b[i+7:i] ) ? 1 : 0
+	ELSE
+		k[j] := 0
+	FI
+ENDFOR
+k[MAX:16] := 0
+	</operation>
+	<instruction name="vpcmpb"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__mmask16" name="_mm_mask_cmple_epi8_mask">
+	<type>Integer</type>
+	<type>Mask</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512BW</CPUID>
+	<category>Compare</category>
+	<parameter varname="k1" type="__mmask16"/>
+	<parameter varname="a" type="__m128i"/>
+	<parameter varname="b" type="__m128i"/>
+	<description>Compare packed 8-bit integers in "a" and "b" for less-than-or-equal, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 15
+	i := j*8
+	IF k1[j]
+		k[j] := ( a[i+7:i] &lt;= b[i+7:i] ) ? 1 : 0
+	ELSE
+		k[j] := 0
+	FI
+ENDFOR
+k[MAX:16] := 0
+	</operation>
+	<instruction name="vpcmpb"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__mmask16" name="_mm_mask_cmplt_epi8_mask">
+	<type>Integer</type>
+	<type>Mask</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512BW</CPUID>
+	<category>Compare</category>
+	<parameter varname="k1" type="__mmask16"/>
+	<parameter varname="a" type="__m128i"/>
+	<parameter varname="b" type="__m128i"/>
+	<description>Compare packed 8-bit integers in "a" and "b" for less-than, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 15
+	i := j*8
+	IF k1[j]
+		k[j] := ( a[i+7:i] &lt; b[i+7:i] ) ? 1 : 0
+	ELSE
+		k[j] := 0
+	FI
+ENDFOR
+k[MAX:16] := 0
+	</operation>
+	<instruction name="vpcmpb"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__mmask16" name="_mm_mask_cmpneq_epi8_mask">
+	<type>Integer</type>
+	<type>Mask</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512BW</CPUID>
+	<category>Compare</category>
+	<parameter varname="k1" type="__mmask16"/>
+	<parameter varname="a" type="__m128i"/>
+	<parameter varname="b" type="__m128i"/>
+	<description>Compare packed 8-bit integers in "a" and "b" for not-equal, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 15
+	i := j*8
+	IF k1[j]
+		k[j] := ( a[i+7:i] != b[i+7:i] ) ? 1 : 0
+	ELSE
+		k[j] := 0
+	FI
+ENDFOR
+k[MAX:16] := 0
+	</operation>
+	<instruction name="vpcmpb"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__mmask8" name="_mm256_cmp_epi32_mask">
+	<type>Integer</type>
+	<type>Mask</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Compare</category>
+	<parameter varname="a" type="__m256i"/>
+	<parameter varname="b" type="__m256i"/>
+	<parameter varname="imm8" type="const _MM_CMPINT_ENUM"/>
+	<description>Compare packed 32-bit integers in "a" and "b" based on the comparison operand specified by "imm8", and store the results in mask vector "k".</description>
+	<operation>
+CASE (imm8[7:0]) OF
+0: OP := _MM_CMPINT_EQ
+1: OP := _MM_CMPINT_LT
+2: OP := _MM_CMPINT_LE
+3: OP := _MM_CMPINT_FALSE
+4: OP := _MM_CMPINT_NE
+5: OP := _MM_CMPINT_NLT
+6: OP := _MM_CMPINT_NLE
+7: OP := _MM_CMPINT_TRUE
+ESAC
+FOR j := 0 to 7
+	i := j*32
+	k[j] := ( a[i+31:i] OP b[i+31:i] ) ? 1 : 0
+ENDFOR
+k[MAX:8] := 0
+	</operation>
+	<instruction name="vpcmpd"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__mmask8" name="_mm256_cmpeq_epi32_mask">
+	<type>Integer</type>
+	<type>Mask</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Compare</category>
+	<parameter varname="a" type="__m256i"/>
+	<parameter varname="b" type="__m256i"/>
+	<description>Compare packed 32-bit integers in "a" and "b" for equality, and store the results in mask vector "k".</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*32
+	k[j] := ( a[i+31:i] == b[i+31:i] ) ? 1 : 0
+ENDFOR
+k[MAX:8] := 0
+	</operation>
+	<instruction name="vpcmpd"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__mmask8" name="_mm256_cmpge_epi32_mask">
+	<type>Integer</type>
+	<type>Mask</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Compare</category>
+	<parameter varname="a" type="__m256i"/>
+	<parameter varname="b" type="__m256i"/>
+	<description>Compare packed 32-bit integers in "a" and "b" for greater-than-or-equal, and store the results in mask vector "k".</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*32
+	k[j] := ( a[i+31:i] &gt;= b[i+31:i] ) ? 1 : 0
+ENDFOR
+k[MAX:8] := 0
+	</operation>
+	<instruction name="vpcmpd"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__mmask8" name="_mm256_cmpgt_epi32_mask">
+	<type>Integer</type>
+	<type>Mask</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Compare</category>
+	<parameter varname="a" type="__m256i"/>
+	<parameter varname="b" type="__m256i"/>
+	<description>Compare packed 32-bit integers in "a" and "b" for greater-than, and store the results in mask vector "k".</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*32
+	k[j] := ( a[i+31:i] &gt; b[i+31:i] ) ? 1 : 0
+ENDFOR
+k[MAX:8] := 0
+	</operation>
+	<instruction name="vpcmpd"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__mmask8" name="_mm256_cmple_epi32_mask">
+	<type>Integer</type>
+	<type>Mask</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Compare</category>
+	<parameter varname="a" type="__m256i"/>
+	<parameter varname="b" type="__m256i"/>
+	<description>Compare packed 32-bit integers in "a" and "b" for less-than-or-equal, and store the results in mask vector "k".</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*32
+	k[j] := ( a[i+31:i] &lt;= b[i+31:i] ) ? 1 : 0
+ENDFOR
+k[MAX:8] := 0
+	</operation>
+	<instruction name="vpcmpd"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__mmask8" name="_mm256_cmplt_epi32_mask">
+	<type>Integer</type>
+	<type>Mask</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Compare</category>
+	<parameter varname="a" type="__m256i"/>
+	<parameter varname="b" type="__m256i"/>
+	<description>Compare packed 32-bit integers in "a" and "b" for less-than, and store the results in mask vector "k".</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*32
+	k[j] := ( a[i+31:i] &lt; b[i+31:i] ) ? 1 : 0
+ENDFOR
+k[MAX:8] := 0
+	</operation>
+	<instruction name="vpcmpd"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__mmask8" name="_mm256_cmpneq_epi32_mask">
+	<type>Integer</type>
+	<type>Mask</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Compare</category>
+	<parameter varname="a" type="__m256i"/>
+	<parameter varname="b" type="__m256i"/>
+	<description>Compare packed 32-bit integers in "a" and "b" for not-equal, and store the results in mask vector "k".</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*32
+	k[j] := ( a[i+31:i] != b[i+31:i] ) ? 1 : 0
+ENDFOR
+k[MAX:8] := 0
+	</operation>
+	<instruction name="vpcmpd"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__mmask8" name="_mm256_mask_cmp_epi32_mask">
+	<type>Integer</type>
+	<type>Mask</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Compare</category>
+	<parameter varname="k1" type="__mmask8"/>
+	<parameter varname="a" type="__m256i"/>
+	<parameter varname="b" type="__m256i"/>
+	<parameter varname="imm8" type="const _MM_CMPINT_ENUM"/>
+	<description>Compare packed 32-bit integers in "a" and "b" based on the comparison operand specified by "imm8", and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set).</description>
+	<operation>
+CASE (imm8[7:0]) OF
+0: OP := _MM_CMPINT_EQ
+1: OP := _MM_CMPINT_LT
+2: OP := _MM_CMPINT_LE
+3: OP := _MM_CMPINT_FALSE
+4: OP := _MM_CMPINT_NE
+5: OP := _MM_CMPINT_NLT
+6: OP := _MM_CMPINT_NLE
+7: OP := _MM_CMPINT_TRUE
+ESAC
+FOR j := 0 to 7
+	i := j*32
+	IF k1[j]
+		k[j] := ( a[i+31:i] OP b[i+31:i] ) ? 1 : 0
+	ELSE 
+		k[j] := 0
+	FI
+ENDFOR
+k[MAX:8] := 0
+	</operation>
+	<instruction name="vpcmpd"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__mmask8" name="_mm256_mask_cmpeq_epi32_mask">
+	<type>Integer</type>
+	<type>Mask</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Compare</category>
+	<parameter varname="k1" type="__mmask8"/>
+	<parameter varname="a" type="__m256i"/>
+	<parameter varname="b" type="__m256i"/>
+	<description>Compare packed 32-bit integers in "a" and "b" for equality, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*32
+	IF k1[j]
+		k[j] := ( a[i+31:i] == b[i+31:i] ) ? 1 : 0
+	ELSE 
+		k[j] := 0
+	FI
+ENDFOR
+k[MAX:8] := 0
+	</operation>
+	<instruction name="vpcmpd"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__mmask8" name="_mm256_mask_cmpge_epi32_mask">
+	<type>Integer</type>
+	<type>Mask</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Compare</category>
+	<parameter varname="k1" type="__mmask8"/>
+	<parameter varname="a" type="__m256i"/>
+	<parameter varname="b" type="__m256i"/>
+	<description>Compare packed 32-bit integers in "a" and "b" for greater-than-or-equal, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*32
+	IF k1[j]
+		k[j] := ( a[i+31:i] &gt;= b[i+31:i] ) ? 1 : 0
+	ELSE 
+		k[j] := 0
+	FI
+ENDFOR
+k[MAX:8] := 0
+	</operation>
+	<instruction name="vpcmpd"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__mmask8" name="_mm256_mask_cmpgt_epi32_mask">
+	<type>Integer</type>
+	<type>Mask</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Compare</category>
+	<parameter varname="k1" type="__mmask8"/>
+	<parameter varname="a" type="__m256i"/>
+	<parameter varname="b" type="__m256i"/>
+	<description>Compare packed 32-bit integers in "a" and "b" for greater-than, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*32
+	IF k1[j]
+		k[j] := ( a[i+31:i] &gt; b[i+31:i] ) ? 1 : 0
+	ELSE 
+		k[j] := 0
+	FI
+ENDFOR
+k[MAX:8] := 0
+	</operation>
+	<instruction name="vpcmpd"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__mmask8" name="_mm256_mask_cmple_epi32_mask">
+	<type>Integer</type>
+	<type>Mask</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Compare</category>
+	<parameter varname="k1" type="__mmask8"/>
+	<parameter varname="a" type="__m256i"/>
+	<parameter varname="b" type="__m256i"/>
+	<description>Compare packed 32-bit integers in "a" and "b" for less-than-or-equal, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*32
+	IF k1[j]
+		k[j] := ( a[i+31:i] &lt;= b[i+31:i] ) ? 1 : 0
+	ELSE 
+		k[j] := 0
+	FI
+ENDFOR
+k[MAX:8] := 0
+	</operation>
+	<instruction name="vpcmpd"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__mmask8" name="_mm256_mask_cmplt_epi32_mask">
+	<type>Integer</type>
+	<type>Mask</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Compare</category>
+	<parameter varname="k1" type="__mmask8"/>
+	<parameter varname="a" type="__m256i"/>
+	<parameter varname="b" type="__m256i"/>
+	<description>Compare packed 32-bit integers in "a" and "b" for less-than, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*32
+	IF k1[j]
+		k[j] := ( a[i+31:i] &lt; b[i+31:i] ) ? 1 : 0
+	ELSE 
+		k[j] := 0
+	FI
+ENDFOR
+k[MAX:8] := 0
+	</operation>
+	<instruction name="vpcmpd"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__mmask8" name="_mm256_mask_cmpneq_epi32_mask">
+	<type>Integer</type>
+	<type>Mask</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Compare</category>
+	<parameter varname="k1" type="__mmask8"/>
+	<parameter varname="a" type="__m256i"/>
+	<parameter varname="b" type="__m256i"/>
+	<description>Compare packed 32-bit integers in "a" and "b" for not-equal, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*32
+	IF k1[j]
+		k[j] := ( a[i+31:i] != b[i+31:i] ) ? 1 : 0
+	ELSE 
+		k[j] := 0
+	FI
+ENDFOR
+k[MAX:8] := 0
+	</operation>
+	<instruction name="vpcmpd"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__mmask8" name="_mm_cmp_epi32_mask">
+	<type>Integer</type>
+	<type>Mask</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Compare</category>
+	<parameter varname="a" type="__m128i"/>
+	<parameter varname="b" type="__m128i"/>
+	<parameter varname="imm8" type="const _MM_CMPINT_ENUM"/>
+	<description>Compare packed 32-bit integers in "a" and "b" based on the comparison operand specified by "imm8", and store the results in mask vector "k".</description>
+	<operation>
+CASE (imm8[7:0]) OF
+0: OP := _MM_CMPINT_EQ
+1: OP := _MM_CMPINT_LT
+2: OP := _MM_CMPINT_LE
+3: OP := _MM_CMPINT_FALSE
+4: OP := _MM_CMPINT_NE
+5: OP := _MM_CMPINT_NLT
+6: OP := _MM_CMPINT_NLE
+7: OP := _MM_CMPINT_TRUE
+ESAC
+FOR j := 0 to 3
+	i := j*32
+	k[j] := ( a[i+31:i] OP b[i+31:i] ) ? 1 : 0
+ENDFOR
+k[MAX:4] := 0
+	</operation>
+	<instruction name="vpcmpd"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__mmask8" name="_mm_cmpeq_epi32_mask">
+	<type>Integer</type>
+	<type>Mask</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Compare</category>
+	<parameter varname="a" type="__m128i"/>
+	<parameter varname="b" type="__m128i"/>
+	<description>Compare packed 32-bit integers in "a" and "b" for equality, and store the results in mask vector "k".</description>
+	<operation>
+FOR j := 0 to 3
+	i := j*32
+	k[j] := ( a[i+31:i] == b[i+31:i] ) ? 1 : 0
+ENDFOR
+k[MAX:4] := 0
+	</operation>
+	<instruction name="vpcmpd"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__mmask8" name="_mm_cmpge_epi32_mask">
+	<type>Integer</type>
+	<type>Mask</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Compare</category>
+	<parameter varname="a" type="__m128i"/>
+	<parameter varname="b" type="__m128i"/>
+	<description>Compare packed 32-bit integers in "a" and "b" for greater-than-or-equal, and store the results in mask vector "k".</description>
+	<operation>
+FOR j := 0 to 3
+	i := j*32
+	k[j] := ( a[i+31:i] &gt;= b[i+31:i] ) ? 1 : 0
+ENDFOR
+k[MAX:4] := 0
+	</operation>
+	<instruction name="vpcmpd"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__mmask8" name="_mm_cmpgt_epi32_mask">
+	<type>Integer</type>
+	<type>Mask</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Compare</category>
+	<parameter varname="a" type="__m128i"/>
+	<parameter varname="b" type="__m128i"/>
+	<description>Compare packed 32-bit integers in "a" and "b" for greater-than, and store the results in mask vector "k".</description>
+	<operation>
+FOR j := 0 to 3
+	i := j*32
+	k[j] := ( a[i+31:i] &gt; b[i+31:i] ) ? 1 : 0
+ENDFOR
+k[MAX:4] := 0
+	</operation>
+	<instruction name="vpcmpd"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__mmask8" name="_mm_cmple_epi32_mask">
+	<type>Integer</type>
+	<type>Mask</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Compare</category>
+	<parameter varname="a" type="__m128i"/>
+	<parameter varname="b" type="__m128i"/>
+	<description>Compare packed 32-bit integers in "a" and "b" for less-than-or-equal, and store the results in mask vector "k".</description>
+	<operation>
+FOR j := 0 to 3
+	i := j*32
+	k[j] := ( a[i+31:i] &lt;= b[i+31:i] ) ? 1 : 0
+ENDFOR
+k[MAX:4] := 0
+	</operation>
+	<instruction name="vpcmpd"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__mmask8" name="_mm_cmplt_epi32_mask">
+	<type>Integer</type>
+	<type>Mask</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Compare</category>
+	<parameter varname="a" type="__m128i"/>
+	<parameter varname="b" type="__m128i"/>
+	<description>Compare packed 32-bit integers in "a" and "b" for less-than, and store the results in mask vector "k".</description>
+	<operation>
+FOR j := 0 to 3
+	i := j*32
+	k[j] := ( a[i+31:i] &lt; b[i+31:i] ) ? 1 : 0
+ENDFOR
+k[MAX:4] := 0
+	</operation>
+	<instruction name="vpcmpd"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__mmask8" name="_mm_cmpneq_epi32_mask">
+	<type>Integer</type>
+	<type>Mask</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Compare</category>
+	<parameter varname="a" type="__m128i"/>
+	<parameter varname="b" type="__m128i"/>
+	<description>Compare packed 32-bit integers in "a" and "b" for not-equal, and store the results in mask vector "k".</description>
+	<operation>
+FOR j := 0 to 3
+	i := j*32
+	k[j] := ( a[i+31:i] != b[i+31:i] ) ? 1 : 0
+ENDFOR
+k[MAX:4] := 0
+	</operation>
+	<instruction name="vpcmpd"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__mmask8" name="_mm_mask_cmp_epi32_mask">
+	<type>Integer</type>
+	<type>Mask</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Compare</category>
+	<parameter varname="k1" type="__mmask8"/>
+	<parameter varname="a" type="__m128i"/>
+	<parameter varname="b" type="__m128i"/>
+	<parameter varname="imm8" type="const _MM_CMPINT_ENUM"/>
+	<description>Compare packed 32-bit integers in "a" and "b" based on the comparison operand specified by "imm8", and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set).</description>
+	<operation>
+CASE (imm8[7:0]) OF
+0: OP := _MM_CMPINT_EQ
+1: OP := _MM_CMPINT_LT
+2: OP := _MM_CMPINT_LE
+3: OP := _MM_CMPINT_FALSE
+4: OP := _MM_CMPINT_NE
+5: OP := _MM_CMPINT_NLT
+6: OP := _MM_CMPINT_NLE
+7: OP := _MM_CMPINT_TRUE
+ESAC
+FOR j := 0 to 3
+	i := j*32
+	IF k1[j]
+		k[j] := ( a[i+31:i] OP b[i+31:i] ) ? 1 : 0
+	ELSE 
+		k[j] := 0
+	FI
+ENDFOR
+k[MAX:4] := 0
+	</operation>
+	<instruction name="vpcmpd"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__mmask8" name="_mm_mask_cmpeq_epi32_mask">
+	<type>Integer</type>
+	<type>Mask</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Compare</category>
+	<parameter varname="k1" type="__mmask8"/>
+	<parameter varname="a" type="__m128i"/>
+	<parameter varname="b" type="__m128i"/>
+	<description>Compare packed 32-bit integers in "a" and "b" for equality, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 3
+	i := j*32
+	IF k1[j]
+		k[j] := ( a[i+31:i] == b[i+31:i] ) ? 1 : 0
+	ELSE 
+		k[j] := 0
+	FI
+ENDFOR
+k[MAX:4] := 0
+	</operation>
+	<instruction name="vpcmpd"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__mmask8" name="_mm_mask_cmpge_epi32_mask">
+	<type>Integer</type>
+	<type>Mask</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Compare</category>
+	<parameter varname="k1" type="__mmask8"/>
+	<parameter varname="a" type="__m128i"/>
+	<parameter varname="b" type="__m128i"/>
+	<description>Compare packed 32-bit integers in "a" and "b" for greater-than-or-equal, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 3
+	i := j*32
+	IF k1[j]
+		k[j] := ( a[i+31:i] &gt;= b[i+31:i] ) ? 1 : 0
+	ELSE 
+		k[j] := 0
+	FI
+ENDFOR
+k[MAX:4] := 0
+	</operation>
+	<instruction name="vpcmpd"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__mmask8" name="_mm_mask_cmpgt_epi32_mask">
+	<type>Integer</type>
+	<type>Mask</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Compare</category>
+	<parameter varname="k1" type="__mmask8"/>
+	<parameter varname="a" type="__m128i"/>
+	<parameter varname="b" type="__m128i"/>
+	<!-- parameter "b" is declared on the preceding line; a garbled duplicate of that tag previously sat here as stray character data -->
+	<description>Compare packed 32-bit integers in "a" and "b" for greater-than, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 3
+	i := j*32
+	IF k1[j]
+		k[j] := ( a[i+31:i] &gt; b[i+31:i] ) ? 1 : 0
+	ELSE 
+		k[j] := 0
+	FI
+ENDFOR
+k[MAX:4] := 0
+	</operation>
+	<instruction name="vpcmpd"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__mmask8" name="_mm_mask_cmple_epi32_mask">
+	<type>Integer</type>
+	<type>Mask</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Compare</category>
+	<parameter varname="k1" type="__mmask8"/>
+	<parameter varname="a" type="__m128i"/>
+	<parameter varname="b" type="__m128i"/>
+	<!-- parameter "b" is declared on the preceding line; a garbled duplicate of that tag previously sat here as stray character data -->
+	<description>Compare packed 32-bit integers in "a" and "b" for less-than-or-equal, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 3
+	i := j*32
+	IF k1[j]
+		k[j] := ( a[i+31:i] &lt;= b[i+31:i] ) ? 1 : 0
+	ELSE 
+		k[j] := 0
+	FI
+ENDFOR
+k[MAX:4] := 0
+	</operation>
+	<instruction name="vpcmpd"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__mmask8" name="_mm_mask_cmplt_epi32_mask">
+	<type>Integer</type>
+	<type>Mask</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Compare</category>
+	<parameter varname="k1" type="__mmask8"/>
+	<parameter varname="a" type="__m128i"/>
+	<parameter varname="b" type="__m128i"/>
+	<!-- parameter "b" is declared on the preceding line; a garbled duplicate of that tag previously sat here as stray character data -->
+	<description>Compare packed 32-bit integers in "a" and "b" for less-than, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 3
+	i := j*32
+	IF k1[j]
+		k[j] := ( a[i+31:i] &lt; b[i+31:i] ) ? 1 : 0
+	ELSE 
+		k[j] := 0
+	FI
+ENDFOR
+k[MAX:4] := 0
+	</operation>
+	<instruction name="vpcmpd"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__mmask8" name="_mm_mask_cmpneq_epi32_mask">
+	<type>Integer</type>
+	<type>Mask</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Compare</category>
+	<parameter varname="k1" type="__mmask8"/>
+	<parameter varname="a" type="__m128i"/>
+	<parameter varname="b" type="__m128i"/>
+	<!-- parameter "b" is declared on the preceding line; a garbled duplicate of that tag previously sat here as stray character data -->
+	<description>Compare packed 32-bit integers in "a" and "b" for not-equal, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 3
+	i := j*32
+	IF k1[j]
+		k[j] := ( a[i+31:i] != b[i+31:i] ) ? 1 : 0
+	ELSE 
+		k[j] := 0
+	FI
+ENDFOR
+k[MAX:4] := 0
+	</operation>
+	<instruction name="vpcmpd"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__mmask8" name="_mm256_cmp_epi64_mask">
+	<type>Integer</type>
+	<type>Mask</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Compare</category>
+	<parameter varname="a" type="__m256i"/>
+	<parameter varname="b" type="__m256i"/>
+	<parameter varname="imm8" type="const _MM_CMPINT_ENUM"/>
+	<description>Compare packed 64-bit integers in "a" and "b" based on the comparison operand specified by "imm8", and store the results in mask vector "k".</description>
+	<operation>
+CASE (imm8[7:0]) OF
+0: OP := _MM_CMPINT_EQ
+1: OP := _MM_CMPINT_LT
+2: OP := _MM_CMPINT_LE
+3: OP := _MM_CMPINT_FALSE
+4: OP := _MM_CMPINT_NE
+5: OP := _MM_CMPINT_NLT
+6: OP := _MM_CMPINT_NLE
+7: OP := _MM_CMPINT_TRUE
+ESAC
+FOR j := 0 to 3
+	i := j*64
+	k[j] := ( a[i+63:i] OP b[i+63:i] ) ? 1 : 0
+ENDFOR
+k[MAX:4] := 0
+	</operation>
+	<instruction name="vpcmpq"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__mmask8" name="_mm256_cmpeq_epi64_mask">
+	<type>Integer</type>
+	<type>Mask</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Compare</category>
+	<parameter varname="a" type="__m256i"/>
+	<parameter varname="b" type="__m256i"/>
+	<description>Compare packed 64-bit integers in "a" and "b" for equality, and store the results in mask vector "k".</description>
+	<operation>
+FOR j := 0 to 3
+	i := j*64
+	k[j] := ( a[i+63:i] == b[i+63:i] ) ? 1 : 0
+ENDFOR
+k[MAX:4] := 0
+	</operation>
+	<instruction name="vpcmpq"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__mmask8" name="_mm256_cmpge_epi64_mask">
+	<type>Integer</type>
+	<type>Mask</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Compare</category>
+	<parameter varname="a" type="__m256i"/>
+	<parameter varname="b" type="__m256i"/>
+	<description>Compare packed 64-bit integers in "a" and "b" for greater-than-or-equal, and store the results in mask vector "k".</description>
+	<operation>
+FOR j := 0 to 3
+	i := j*64
+	k[j] := ( a[i+63:i] &gt;= b[i+63:i] ) ? 1 : 0
+ENDFOR
+k[MAX:4] := 0
+	</operation>
+	<instruction name="vpcmpq"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__mmask8" name="_mm256_cmpgt_epi64_mask">
+	<type>Integer</type>
+	<type>Mask</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Compare</category>
+	<parameter varname="a" type="__m256i"/>
+	<parameter varname="b" type="__m256i"/>
+	<description>Compare packed 64-bit integers in "a" and "b" for greater-than, and store the results in mask vector "k".</description>
+	<operation>
+FOR j := 0 to 3
+	i := j*64
+	k[j] := ( a[i+63:i] &gt; b[i+63:i] ) ? 1 : 0
+ENDFOR
+k[MAX:4] := 0
+	</operation>
+	<instruction name="vpcmpq"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__mmask8" name="_mm256_cmple_epi64_mask">
+	<type>Integer</type>
+	<type>Mask</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Compare</category>
+	<parameter varname="a" type="__m256i"/>
+	<parameter varname="b" type="__m256i"/>
+	<description>Compare packed 64-bit integers in "a" and "b" for less-than-or-equal, and store the results in mask vector "k".</description>
+	<operation>
+FOR j := 0 to 3
+	i := j*64
+	k[j] := ( a[i+63:i] &lt;= b[i+63:i] ) ? 1 : 0
+ENDFOR
+k[MAX:4] := 0
+	</operation>
+	<instruction name="vpcmpq"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__mmask8" name="_mm256_cmplt_epi64_mask">
+	<type>Integer</type>
+	<type>Mask</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Compare</category>
+	<parameter varname="a" type="__m256i"/>
+	<parameter varname="b" type="__m256i"/>
+	<description>Compare packed 64-bit integers in "a" and "b" for less-than, and store the results in mask vector "k".</description>
+	<operation>
+FOR j := 0 to 3
+	i := j*64
+	k[j] := ( a[i+63:i] &lt; b[i+63:i] ) ? 1 : 0
+ENDFOR
+k[MAX:4] := 0
+	</operation>
+	<instruction name="vpcmpq"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__mmask8" name="_mm256_cmpneq_epi64_mask">
+	<type>Integer</type>
+	<type>Mask</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Compare</category>
+	<parameter varname="a" type="__m256i"/>
+	<parameter varname="b" type="__m256i"/>
+	<description>Compare packed 64-bit integers in "a" and "b" for not-equal, and store the results in mask vector "k".</description>
+	<operation>
+FOR j := 0 to 3
+	i := j*64
+	k[j] := ( a[i+63:i] != b[i+63:i] ) ? 1 : 0
+ENDFOR
+k[MAX:4] := 0
+	</operation>
+	<instruction name="vpcmpq"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__mmask8" name="_mm256_mask_cmp_epi64_mask">
+	<type>Integer</type>
+	<type>Mask</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Compare</category>
+	<parameter varname="k1" type="__mmask8"/>
+	<parameter varname="a" type="__m256i"/>
+	<parameter varname="b" type="__m256i"/>
+	<parameter varname="imm8" type="const _MM_CMPINT_ENUM"/>
+	<description>Compare packed 64-bit integers in "a" and "b" based on the comparison operand specified by "imm8", and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set).</description>
+	<operation>
+CASE (imm8[7:0]) OF
+0: OP := _MM_CMPINT_EQ
+1: OP := _MM_CMPINT_LT
+2: OP := _MM_CMPINT_LE
+3: OP := _MM_CMPINT_FALSE
+4: OP := _MM_CMPINT_NE
+5: OP := _MM_CMPINT_NLT
+6: OP := _MM_CMPINT_NLE
+7: OP := _MM_CMPINT_TRUE
+ESAC
+FOR j := 0 to 3
+	i := j*64
+	IF k1[j]
+		k[j] := ( a[i+63:i] OP b[i+63:i] ) ? 1 : 0
+	ELSE 
+		k[j] := 0
+	FI
+ENDFOR
+k[MAX:4] := 0
+	</operation>
+	<instruction name="vpcmpq"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__mmask8" name="_mm256_mask_cmpeq_epi64_mask">
+	<type>Integer</type>
+	<type>Mask</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Compare</category>
+	<parameter varname="k1" type="__mmask8"/>
+	<parameter varname="a" type="__m256i"/>
+	<parameter varname="b" type="__m256i"/>
+	<description>Compare packed 64-bit integers in "a" and "b" for equality, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 3
+	i := j*64
+	IF k1[j]
+		k[j] := ( a[i+63:i] == b[i+63:i] ) ? 1 : 0
+	ELSE 
+		k[j] := 0
+	FI
+ENDFOR
+k[MAX:4] := 0
+	</operation>
+	<instruction name="vpcmpq"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__mmask8" name="_mm256_mask_cmpge_epi64_mask">
+	<type>Integer</type>
+	<type>Mask</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Compare</category>
+	<parameter varname="k1" type="__mmask8"/>
+	<parameter varname="a" type="__m256i"/>
+	<parameter varname="b" type="__m256i"/>
+	<description>Compare packed 64-bit integers in "a" and "b" for greater-than-or-equal, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 3
+	i := j*64
+	IF k1[j]
+		k[j] := ( a[i+63:i] &gt;= b[i+63:i] ) ? 1 : 0
+	ELSE 
+		k[j] := 0
+	FI
+ENDFOR
+k[MAX:4] := 0
+	</operation>
+	<instruction name="vpcmpq"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__mmask8" name="_mm256_mask_cmpgt_epi64_mask">
+	<type>Integer</type>
+	<type>Mask</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Compare</category>
+	<parameter varname="k1" type="__mmask8"/>
+	<parameter varname="a" type="__m256i"/>
+	<parameter varname="b" type="__m256i"/>
+	<description>Compare packed 64-bit integers in "a" and "b" for greater-than, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 3
+	i := j*64
+	IF k1[j]
+		k[j] := ( a[i+63:i] &gt; b[i+63:i] ) ? 1 : 0
+	ELSE 
+		k[j] := 0
+	FI
+ENDFOR
+k[MAX:4] := 0
+	</operation>
+	<instruction name="vpcmpq"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__mmask8" name="_mm256_mask_cmple_epi64_mask">
+	<type>Integer</type>
+	<type>Mask</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Compare</category>
+	<parameter varname="k1" type="__mmask8"/>
+	<parameter varname="a" type="__m256i"/>
+	<parameter varname="b" type="__m256i"/>
+	<description>Compare packed 64-bit integers in "a" and "b" for less-than-or-equal, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 3
+	i := j*64
+	IF k1[j]
+		k[j] := ( a[i+63:i] &lt;= b[i+63:i] ) ? 1 : 0
+	ELSE 
+		k[j] := 0
+	FI
+ENDFOR
+k[MAX:4] := 0
+	</operation>
+	<instruction name="vpcmpq"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__mmask8" name="_mm256_mask_cmplt_epi64_mask">
+	<type>Integer</type>
+	<type>Mask</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Compare</category>
+	<parameter varname="k1" type="__mmask8"/>
+	<parameter varname="a" type="__m256i"/>
+	<parameter varname="b" type="__m256i"/>
+	<description>Compare packed 64-bit integers in "a" and "b" for less-than, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 3
+	i := j*64
+	IF k1[j]
+		k[j] := ( a[i+63:i] &lt; b[i+63:i] ) ? 1 : 0
+	ELSE 
+		k[j] := 0
+	FI
+ENDFOR
+k[MAX:4] := 0
+	</operation>
+	<instruction name="vpcmpq"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__mmask8" name="_mm256_mask_cmpneq_epi64_mask">
+	<type>Integer</type>
+	<type>Mask</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Compare</category>
+	<parameter varname="k1" type="__mmask8"/>
+	<parameter varname="a" type="__m256i"/>
+	<parameter varname="b" type="__m256i"/>
+	<description>Compare packed 64-bit integers in "a" and "b" for not-equal, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 3
+	i := j*64
+	IF k1[j]
+		k[j] := ( a[i+63:i] != b[i+63:i] ) ? 1 : 0
+	ELSE 
+		k[j] := 0
+	FI
+ENDFOR
+k[MAX:4] := 0
+	</operation>
+	<instruction name="vpcmpq"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__mmask8" name="_mm_cmp_epi64_mask">
+	<type>Integer</type>
+	<type>Mask</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Compare</category>
+	<parameter varname="a" type="__m128i"/>
+	<parameter varname="b" type="__m128i"/>
+	<parameter varname="imm8" type="const _MM_CMPINT_ENUM"/>
+	<description>Compare packed 64-bit integers in "a" and "b" based on the comparison operand specified by "imm8", and store the results in mask vector "k".</description>
+	<operation>
+CASE (imm8[7:0]) OF
+0: OP := _MM_CMPINT_EQ
+1: OP := _MM_CMPINT_LT
+2: OP := _MM_CMPINT_LE
+3: OP := _MM_CMPINT_FALSE
+4: OP := _MM_CMPINT_NE
+5: OP := _MM_CMPINT_NLT
+6: OP := _MM_CMPINT_NLE
+7: OP := _MM_CMPINT_TRUE
+ESAC
+FOR j := 0 to 1
+	i := j*64
+	k[j] := ( a[i+63:i] OP b[i+63:i] ) ? 1 : 0
+ENDFOR
+k[MAX:2] := 0
+	</operation>
+	<instruction name="vpcmpq"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__mmask8" name="_mm_cmpeq_epi64_mask">
+	<type>Integer</type>
+	<type>Mask</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Compare</category>
+	<parameter varname="a" type="__m128i"/>
+	<parameter varname="b" type="__m128i"/>
+	<description>Compare packed 64-bit integers in "a" and "b" for equality, and store the results in mask vector "k".</description>
+	<operation>
+FOR j := 0 to 1
+	i := j*64
+	k[j] := ( a[i+63:i] == b[i+63:i] ) ? 1 : 0
+ENDFOR
+k[MAX:2] := 0
+	</operation>
+	<instruction name="vpcmpq"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__mmask8" name="_mm_cmpge_epi64_mask">
+	<type>Integer</type>
+	<type>Mask</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Compare</category>
+	<parameter varname="a" type="__m128i"/>
+	<parameter varname="b" type="__m128i"/>
+	<description>Compare packed 64-bit integers in "a" and "b" for greater-than-or-equal, and store the results in mask vector "k".</description>
+	<operation>
+FOR j := 0 to 1
+	i := j*64
+	k[j] := ( a[i+63:i] &gt;= b[i+63:i] ) ? 1 : 0
+ENDFOR
+k[MAX:2] := 0
+	</operation>
+	<instruction name="vpcmpq"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__mmask8" name="_mm_cmpgt_epi64_mask">
+	<type>Integer</type>
+	<type>Mask</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Compare</category>
+	<parameter varname="a" type="__m128i"/>
+	<parameter varname="b" type="__m128i"/>
+	<description>Compare packed 64-bit integers in "a" and "b" for greater-than, and store the results in mask vector "k".</description>
+	<operation>
+FOR j := 0 to 1
+	i := j*64
+	k[j] := ( a[i+63:i] &gt; b[i+63:i] ) ? 1 : 0
+ENDFOR
+k[MAX:2] := 0
+	</operation>
+	<instruction name="vpcmpq"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__mmask8" name="_mm_cmple_epi64_mask">
+	<type>Integer</type>
+	<type>Mask</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Compare</category>
+	<parameter varname="a" type="__m128i"/>
+	<parameter varname="b" type="__m128i"/>
+	<description>Compare packed 64-bit integers in "a" and "b" for less-than-or-equal, and store the results in mask vector "k".</description>
+	<operation>
+FOR j := 0 to 1
+	i := j*64
+	k[j] := ( a[i+63:i] &lt;= b[i+63:i] ) ? 1 : 0
+ENDFOR
+k[MAX:2] := 0
+	</operation>
+	<instruction name="vpcmpq"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__mmask8" name="_mm_cmplt_epi64_mask">
+	<type>Integer</type>
+	<type>Mask</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Compare</category>
+	<parameter varname="a" type="__m128i"/>
+	<parameter varname="b" type="__m128i"/>
+	<description>Compare packed 64-bit integers in "a" and "b" for less-than, and store the results in mask vector "k".</description>
+	<operation>
+FOR j := 0 to 1
+	i := j*64
+	k[j] := ( a[i+63:i] &lt; b[i+63:i] ) ? 1 : 0
+ENDFOR
+k[MAX:2] := 0
+	</operation>
+	<instruction name="vpcmpq"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__mmask8" name="_mm_cmpneq_epi64_mask">
+	<type>Integer</type>
+	<type>Mask</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Compare</category>
+	<parameter varname="a" type="__m128i"/>
+	<parameter varname="b" type="__m128i"/>
+	<description>Compare packed 64-bit integers in "a" and "b" for not-equal, and store the results in mask vector "k".</description>
+	<operation>
+FOR j := 0 to 1
+	i := j*64
+	k[j] := ( a[i+63:i] != b[i+63:i] ) ? 1 : 0
+ENDFOR
+k[MAX:2] := 0
+	</operation>
+	<instruction name="vpcmpq"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__mmask8" name="_mm_mask_cmp_epi64_mask">
+	<type>Integer</type>
+	<type>Mask</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Compare</category>
+	<parameter varname="k1" type="__mmask8"/>
+	<parameter varname="a" type="__m128i"/>
+	<parameter varname="b" type="__m128i"/>
+	<parameter varname="imm8" type="const _MM_CMPINT_ENUM"/>
+	<description>Compare packed 64-bit integers in "a" and "b" based on the comparison operand specified by "imm8", and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set).</description>
+	<operation>
+CASE (imm8[7:0]) OF
+0: OP := _MM_CMPINT_EQ
+1: OP := _MM_CMPINT_LT
+2: OP := _MM_CMPINT_LE
+3: OP := _MM_CMPINT_FALSE
+4: OP := _MM_CMPINT_NE
+5: OP := _MM_CMPINT_NLT
+6: OP := _MM_CMPINT_NLE
+7: OP := _MM_CMPINT_TRUE
+ESAC
+FOR j := 0 to 1
+	i := j*64
+	IF k1[j]
+		k[j] := ( a[i+63:i] OP b[i+63:i] ) ? 1 : 0
+	ELSE 
+		k[j] := 0
+	FI
+ENDFOR
+k[MAX:2] := 0
+	</operation>
+	<instruction name="vpcmpq"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__mmask8" name="_mm_mask_cmpeq_epi64_mask">
+	<type>Integer</type>
+	<type>Mask</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Compare</category>
+	<parameter varname="k1" type="__mmask8"/>
+	<parameter varname="a" type="__m128i"/>
+	<parameter varname="b" type="__m128i"/>
+	<description>Compare packed 64-bit integers in "a" and "b" for equality, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 1
+	i := j*64
+	IF k1[j]
+		k[j] := ( a[i+63:i] == b[i+63:i] ) ? 1 : 0
+	ELSE 
+		k[j] := 0
+	FI
+ENDFOR
+k[MAX:2] := 0
+	</operation>
+	<instruction name="vpcmpq"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__mmask8" name="_mm_mask_cmpge_epi64_mask">
+	<type>Integer</type>
+	<type>Mask</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Compare</category>
+	<parameter varname="k1" type="__mmask8"/>
+	<parameter varname="a" type="__m128i"/>
+	<parameter varname="b" type="__m128i"/>
+	<description>Compare packed 64-bit integers in "a" and "b" for greater-than-or-equal, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 1
+	i := j*64
+	IF k1[j]
+		k[j] := ( a[i+63:i] &gt;= b[i+63:i] ) ? 1 : 0
+	ELSE 
+		k[j] := 0
+	FI
+ENDFOR
+k[MAX:2] := 0
+	</operation>
+	<instruction name="vpcmpq"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__mmask8" name="_mm_mask_cmpgt_epi64_mask">
+	<type>Integer</type>
+	<type>Mask</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Compare</category>
+	<parameter varname="k1" type="__mmask8"/>
+	<parameter varname="a" type="__m128i"/>
+	<parameter varname="b" type="__m128i"/>
+	<description>Compare packed 64-bit integers in "a" and "b" for greater-than, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 1
+	i := j*64
+	IF k1[j]
+		k[j] := ( a[i+63:i] &gt; b[i+63:i] ) ? 1 : 0
+	ELSE 
+		k[j] := 0
+	FI
+ENDFOR
+k[MAX:2] := 0
+	</operation>
+	<instruction name="vpcmpq"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__mmask8" name="_mm_mask_cmple_epi64_mask">
+	<type>Integer</type>
+	<type>Mask</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Compare</category>
+	<parameter varname="k1" type="__mmask8"/>
+	<parameter varname="a" type="__m128i"/>
+	<parameter varname="b" type="__m128i"/>
+	<description>Compare packed 64-bit integers in "a" and "b" for less-than-or-equal, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 1
+	i := j*64
+	IF k1[j]
+		k[j] := ( a[i+63:i] &lt;= b[i+63:i] ) ? 1 : 0
+	ELSE 
+		k[j] := 0
+	FI
+ENDFOR
+k[MAX:2] := 0
+	</operation>
+	<instruction name="vpcmpq"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__mmask8" name="_mm_mask_cmplt_epi64_mask">
+	<type>Integer</type>
+	<type>Mask</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Compare</category>
+	<parameter varname="k1" type="__mmask8"/>
+	<parameter varname="a" type="__m128i"/>
+	<parameter varname="b" type="__m128i"/>
+	<description>Compare packed 64-bit integers in "a" and "b" for less-than, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 1
+	i := j*64
+	IF k1[j]
+		k[j] := ( a[i+63:i] &lt; b[i+63:i] ) ? 1 : 0
+	ELSE 
+		k[j] := 0
+	FI
+ENDFOR
+k[MAX:2] := 0
+	</operation>
+	<instruction name="vpcmpq"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__mmask8" name="_mm_mask_cmpneq_epi64_mask">
+	<type>Integer</type>
+	<type>Mask</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Compare</category>
+	<parameter varname="k1" type="__mmask8"/>
+	<parameter varname="a" type="__m128i"/>
+	<parameter varname="b" type="__m128i"/>
+	<description>Compare packed 64-bit integers in "a" and "b" for not-equal, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 1
+	i := j*64
+	IF k1[j]
+		k[j] := ( a[i+63:i] != b[i+63:i] ) ? 1 : 0
+	ELSE 
+		k[j] := 0
+	FI
+ENDFOR
+k[MAX:2] := 0
+	</operation>
+	<instruction name="vpcmpq"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__mmask32" name="_mm256_cmp_epu8_mask">
+	<type>Integer</type>
+	<type>Mask</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512BW</CPUID>
+	<category>Compare</category>
+	<parameter varname="a" type="__m256i"/>
+	<parameter varname="b" type="__m256i"/>
+	<parameter varname="imm8" type="const int"/>
+	<description>Compare packed unsigned 8-bit integers in "a" and "b" based on the comparison operand specified by "imm8", and store the results in mask vector "k".</description>
+	<operation>
+CASE (imm8[7:0]) OF
+0: OP := _MM_CMPINT_EQ
+1: OP := _MM_CMPINT_LT
+2: OP := _MM_CMPINT_LE
+3: OP := _MM_CMPINT_FALSE
+4: OP := _MM_CMPINT_NE
+5: OP := _MM_CMPINT_NLT
+6: OP := _MM_CMPINT_NLE
+7: OP := _MM_CMPINT_TRUE
+ESAC
+FOR j := 0 to 31
+	i := j*8
+	k[j] := ( a[i+7:i] OP b[i+7:i] ) ? 1 : 0
+ENDFOR
+k[MAX:32] := 0
+	</operation>
+	<instruction name="vpcmpub"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__mmask32" name="_mm256_cmpeq_epu8_mask">
+	<type>Integer</type>
+	<type>Mask</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512BW</CPUID>
+	<category>Compare</category>
+	<parameter varname="a" type="__m256i"/>
+	<parameter varname="b" type="__m256i"/>
+	<description>Compare packed unsigned 8-bit integers in "a" and "b" for equality, and store the results in mask vector "k".</description>
+	<operation>
+FOR j := 0 to 31
+	i := j*8
+	k[j] := ( a[i+7:i] == b[i+7:i] ) ? 1 : 0
+ENDFOR
+k[MAX:32] := 0
+	</operation>
+	<instruction name="vpcmpub"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__mmask32" name="_mm256_cmpge_epu8_mask">
+	<type>Integer</type>
+	<type>Mask</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512BW</CPUID>
+	<category>Compare</category>
+	<parameter varname="a" type="__m256i"/>
+	<parameter varname="b" type="__m256i"/>
+	<description>Compare packed unsigned 8-bit integers in "a" and "b" for greater-than-or-equal, and store the results in mask vector "k".</description>
+	<operation>
+FOR j := 0 to 31
+	i := j*8
+	k[j] := ( a[i+7:i] &gt;= b[i+7:i] ) ? 1 : 0
+ENDFOR
+k[MAX:32] := 0
+	</operation>
+	<instruction name="vpcmpub"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__mmask32" name="_mm256_cmpgt_epu8_mask">
+	<type>Integer</type>
+	<type>Mask</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512BW</CPUID>
+	<category>Compare</category>
+	<parameter varname="a" type="__m256i"/>
+	<parameter varname="b" type="__m256i"/>
+	<description>Compare packed unsigned 8-bit integers in "a" and "b" for greater-than, and store the results in mask vector "k".</description>
+	<operation>
+FOR j := 0 to 31
+	i := j*8
+	k[j] := ( a[i+7:i] &gt; b[i+7:i] ) ? 1 : 0
+ENDFOR
+k[MAX:32] := 0
+	</operation>
+	<instruction name="vpcmpub"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__mmask32" name="_mm256_cmple_epu8_mask">
+	<type>Integer</type>
+	<type>Mask</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512BW</CPUID>
+	<category>Compare</category>
+	<parameter varname="a" type="__m256i"/>
+	<parameter varname="b" type="__m256i"/>
+	<description>Compare packed unsigned 8-bit integers in "a" and "b" for less-than-or-equal, and store the results in mask vector "k".</description>
+	<operation>
+FOR j := 0 to 31
+	i := j*8
+	k[j] := ( a[i+7:i] &lt;= b[i+7:i] ) ? 1 : 0
+ENDFOR
+k[MAX:32] := 0
+	</operation>
+	<instruction name="vpcmpub"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__mmask32" name="_mm256_cmplt_epu8_mask">
+	<type>Integer</type>
+	<type>Mask</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512BW</CPUID>
+	<category>Compare</category>
+	<parameter varname="a" type="__m256i"/>
+	<parameter varname="b" type="__m256i"/>
+	<description>Compare packed unsigned 8-bit integers in "a" and "b" for less-than, and store the results in mask vector "k".</description>
+	<operation>
+FOR j := 0 to 31
+	i := j*8
+	k[j] := ( a[i+7:i] &lt; b[i+7:i] ) ? 1 : 0
+ENDFOR
+k[MAX:32] := 0
+	</operation>
+	<instruction name="vpcmpub"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__mmask32" name="_mm256_cmpneq_epu8_mask">
+	<type>Integer</type>
+	<type>Mask</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512BW</CPUID>
+	<category>Compare</category>
+	<parameter varname="a" type="__m256i"/>
+	<parameter varname="b" type="__m256i"/>
+	<description>Compare packed unsigned 8-bit integers in "a" and "b" for not-equal, and store the results in mask vector "k".</description>
+	<operation>
+FOR j := 0 to 31
+	i := j*8
+	k[j] := ( a[i+7:i] != b[i+7:i] ) ? 1 : 0
+ENDFOR
+k[MAX:32] := 0
+	</operation>
+	<instruction name="vpcmpub"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__mmask32" name="_mm256_mask_cmp_epu8_mask">
+	<type>Integer</type>
+	<type>Mask</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512BW</CPUID>
+	<category>Compare</category>
+	<parameter varname="k1" type="__mmask32"/>
+	<parameter varname="a" type="__m256i"/>
+	<parameter varname="b" type="__m256i"/>
+	<parameter varname="imm8" type="const int"/>
+	<description>Compare packed unsigned 8-bit integers in "a" and "b" based on the comparison operand specified by "imm8", and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set).</description>
+	<operation>
+CASE (imm8[7:0]) OF
+0: OP := _MM_CMPINT_EQ
+1: OP := _MM_CMPINT_LT
+2: OP := _MM_CMPINT_LE
+3: OP := _MM_CMPINT_FALSE
+4: OP := _MM_CMPINT_NE
+5: OP := _MM_CMPINT_NLT
+6: OP := _MM_CMPINT_NLE
+7: OP := _MM_CMPINT_TRUE
+ESAC
+FOR j := 0 to 31
+	i := j*8
+	IF k1[j]
+		k[j] := ( a[i+7:i] OP b[i+7:i] ) ? 1 : 0
+	ELSE
+		k[j] := 0
+	FI
+ENDFOR
+k[MAX:32] := 0
+	</operation>
+	<instruction name="vpcmpub"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__mmask32" name="_mm256_mask_cmpeq_epu8_mask">
+	<type>Integer</type>
+	<type>Mask</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512BW</CPUID>
+	<category>Compare</category>
+	<parameter varname="k1" type="__mmask32"/>
+	<parameter varname="a" type="__m256i"/>
+	<parameter varname="b" type="__m256i"/>
+	<description>Compare packed unsigned 8-bit integers in "a" and "b" for equality, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 31
+	i := j*8
+	IF k1[j]
+		k[j] := ( a[i+7:i] == b[i+7:i] ) ? 1 : 0
+	ELSE
+		k[j] := 0
+	FI
+ENDFOR
+k[MAX:32] := 0
+	</operation>
+	<instruction name="vpcmpub"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__mmask32" name="_mm256_mask_cmpge_epu8_mask">
+	<type>Integer</type>
+	<type>Mask</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512BW</CPUID>
+	<category>Compare</category>
+	<parameter varname="k1" type="__mmask32"/>
+	<parameter varname="a" type="__m256i"/>
+	<parameter varname="b" type="__m256i"/>
+	<description>Compare packed unsigned 8-bit integers in "a" and "b" for greater-than-or-equal, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 31
+	i := j*8
+	IF k1[j]
+		k[j] := ( a[i+7:i] &gt;= b[i+7:i] ) ? 1 : 0
+	ELSE
+		k[j] := 0
+	FI
+ENDFOR
+k[MAX:32] := 0
+	</operation>
+	<instruction name="vpcmpub"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__mmask32" name="_mm256_mask_cmpgt_epu8_mask">
+	<type>Integer</type>
+	<type>Mask</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512BW</CPUID>
+	<category>Compare</category>
+	<parameter varname="k1" type="__mmask32"/>
+	<parameter varname="a" type="__m256i"/>
+	<parameter varname="b" type="__m256i"/>
+	<description>Compare packed unsigned 8-bit integers in "a" and "b" for greater-than, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 31
+	i := j*8
+	IF k1[j]
+		k[j] := ( a[i+7:i] &gt; b[i+7:i] ) ? 1 : 0
+	ELSE
+		k[j] := 0
+	FI
+ENDFOR
+k[MAX:32] := 0
+	</operation>
+	<instruction name="vpcmpub"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__mmask32" name="_mm256_mask_cmple_epu8_mask">
+	<type>Integer</type>
+	<type>Mask</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512BW</CPUID>
+	<category>Compare</category>
+	<parameter varname="k1" type="__mmask32"/>
+	<parameter varname="a" type="__m256i"/>
+	<parameter varname="b" type="__m256i"/>
+	<description>Compare packed unsigned 8-bit integers in "a" and "b" for less-than-or-equal, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 31
+	i := j*8
+	IF k1[j]
+		k[j] := ( a[i+7:i] &lt;= b[i+7:i] ) ? 1 : 0
+	ELSE
+		k[j] := 0
+	FI
+ENDFOR
+k[MAX:32] := 0
+	</operation>
+	<instruction name="vpcmpub"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__mmask32" name="_mm256_mask_cmplt_epu8_mask">
+	<type>Integer</type>
+	<type>Mask</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512BW</CPUID>
+	<category>Compare</category>
+	<parameter varname="k1" type="__mmask32"/>
+	<parameter varname="a" type="__m256i"/>
+	<parameter varname="b" type="__m256i"/>
+	<description>Compare packed unsigned 8-bit integers in "a" and "b" for less-than, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 31
+	i := j*8
+	IF k1[j]
+		k[j] := ( a[i+7:i] &lt; b[i+7:i] ) ? 1 : 0
+	ELSE
+		k[j] := 0
+	FI
+ENDFOR
+k[MAX:32] := 0
+	</operation>
+	<instruction name="vpcmpub"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__mmask32" name="_mm256_mask_cmpneq_epu8_mask">
+	<type>Integer</type>
+	<type>Mask</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512BW</CPUID>
+	<category>Compare</category>
+	<parameter varname="k1" type="__mmask32"/>
+	<parameter varname="a" type="__m256i"/>
+	<parameter varname="b" type="__m256i"/>
+	<description>Compare packed unsigned 8-bit integers in "a" and "b" for not-equal, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 31
+	i := j*8
+	IF k1[j]
+		k[j] := ( a[i+7:i] != b[i+7:i] ) ? 1 : 0
+	ELSE
+		k[j] := 0
+	FI
+ENDFOR
+k[MAX:32] := 0
+	</operation>
+	<instruction name="vpcmpub"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__mmask64" name="_mm512_cmp_epu8_mask">
+	<type>Integer</type>
+	<type>Mask</type>
+	<CPUID>AVX512BW</CPUID>
+	<category>Compare</category>
+	<parameter varname="a" type="__m512i"/>
+	<parameter varname="b" type="__m512i"/>
+	<parameter varname="imm8" type="const int"/>
+	<description>Compare packed unsigned 8-bit integers in "a" and "b" based on the comparison operand specified by "imm8", and store the results in mask vector "k".</description>
+	<operation>
+CASE (imm8[7:0]) OF
+0: OP := _MM_CMPINT_EQ
+1: OP := _MM_CMPINT_LT
+2: OP := _MM_CMPINT_LE
+3: OP := _MM_CMPINT_FALSE
+4: OP := _MM_CMPINT_NE
+5: OP := _MM_CMPINT_NLT
+6: OP := _MM_CMPINT_NLE
+7: OP := _MM_CMPINT_TRUE
+ESAC
+FOR j := 0 to 63
+	i := j*8
+	k[j] := ( a[i+7:i] OP b[i+7:i] ) ? 1 : 0
+ENDFOR
+k[MAX:64] := 0
+	</operation>
+	<instruction name="vpcmpub"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__mmask64" name="_mm512_cmpeq_epu8_mask">
+	<type>Integer</type>
+	<type>Mask</type>
+	<CPUID>AVX512BW</CPUID>
+	<category>Compare</category>
+	<parameter varname="a" type="__m512i"/>
+	<parameter varname="b" type="__m512i"/>
+	<description>Compare packed unsigned 8-bit integers in "a" and "b" for equality, and store the results in mask vector "k".</description>
+	<operation>
+FOR j := 0 to 63
+	i := j*8
+	k[j] := ( a[i+7:i] == b[i+7:i] ) ? 1 : 0
+ENDFOR
+k[MAX:64] := 0
+	</operation>
+	<instruction name="vpcmpub"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__mmask64" name="_mm512_cmpge_epu8_mask">
+	<type>Integer</type>
+	<type>Mask</type>
+	<CPUID>AVX512BW</CPUID>
+	<category>Compare</category>
+	<parameter varname="a" type="__m512i"/>
+	<parameter varname="b" type="__m512i"/>
+	<description>Compare packed unsigned 8-bit integers in "a" and "b" for greater-than-or-equal, and store the results in mask vector "k".</description>
+	<operation>
+FOR j := 0 to 63
+	i := j*8
+	k[j] := ( a[i+7:i] &gt;= b[i+7:i] ) ? 1 : 0
+ENDFOR
+k[MAX:64] := 0
+	</operation>
+	<instruction name="vpcmpub"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__mmask64" name="_mm512_cmpgt_epu8_mask">
+	<type>Integer</type>
+	<type>Mask</type>
+	<CPUID>AVX512BW</CPUID>
+	<category>Compare</category>
+	<parameter varname="a" type="__m512i"/>
+	<parameter varname="b" type="__m512i"/>
+	<description>Compare packed unsigned 8-bit integers in "a" and "b" for greater-than, and store the results in mask vector "k".</description>
+	<operation>
+FOR j := 0 to 63
+	i := j*8
+	k[j] := ( a[i+7:i] &gt; b[i+7:i] ) ? 1 : 0
+ENDFOR
+k[MAX:64] := 0
+	</operation>
+	<instruction name="vpcmpub"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__mmask64" name="_mm512_cmple_epu8_mask">
+	<type>Integer</type>
+	<type>Mask</type>
+	<CPUID>AVX512BW</CPUID>
+	<category>Compare</category>
+	<parameter varname="a" type="__m512i"/>
+	<parameter varname="b" type="__m512i"/>
+	<description>Compare packed unsigned 8-bit integers in "a" and "b" for less-than-or-equal, and store the results in mask vector "k".</description>
+	<operation>
+FOR j := 0 to 63
+	i := j*8
+	k[j] := ( a[i+7:i] &lt;= b[i+7:i] ) ? 1 : 0
+ENDFOR
+k[MAX:64] := 0
+	</operation>
+	<instruction name="vpcmpub"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__mmask64" name="_mm512_cmplt_epu8_mask">
+	<type>Integer</type>
+	<type>Mask</type>
+	<CPUID>AVX512BW</CPUID>
+	<category>Compare</category>
+	<parameter varname="a" type="__m512i"/>
+	<parameter varname="b" type="__m512i"/>
+	<description>Compare packed unsigned 8-bit integers in "a" and "b" for less-than, and store the results in mask vector "k".</description>
+	<operation>
+FOR j := 0 to 63
+	i := j*8
+	k[j] := ( a[i+7:i] &lt; b[i+7:i] ) ? 1 : 0
+ENDFOR
+k[MAX:64] := 0
+	</operation>
+	<instruction name="vpcmpub"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__mmask64" name="_mm512_cmpneq_epu8_mask">
+	<type>Integer</type>
+	<type>Mask</type>
+	<CPUID>AVX512BW</CPUID>
+	<category>Compare</category>
+	<parameter varname="a" type="__m512i"/>
+	<parameter varname="b" type="__m512i"/>
+	<description>Compare packed unsigned 8-bit integers in "a" and "b" for not-equal, and store the results in mask vector "k".</description>
+	<operation>
+FOR j := 0 to 63
+	i := j*8
+	k[j] := ( a[i+7:i] != b[i+7:i] ) ? 1 : 0
+ENDFOR
+k[MAX:64] := 0
+	</operation>
+	<instruction name="vpcmpub"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__mmask64" name="_mm512_mask_cmp_epu8_mask">
+	<type>Integer</type>
+	<type>Mask</type>
+	<CPUID>AVX512BW</CPUID>
+	<category>Compare</category>
+	<parameter varname="k1" type="__mmask64"/>
+	<parameter varname="a" type="__m512i"/>
+	<parameter varname="b" type="__m512i"/>
+	<parameter varname="imm8" type="const int"/>
+	<description>Compare packed unsigned 8-bit integers in "a" and "b" based on the comparison operand specified by "imm8", and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set).</description>
+	<operation>
+CASE (imm8[7:0]) OF
+0: OP := _MM_CMPINT_EQ
+1: OP := _MM_CMPINT_LT
+2: OP := _MM_CMPINT_LE
+3: OP := _MM_CMPINT_FALSE
+4: OP := _MM_CMPINT_NE
+5: OP := _MM_CMPINT_NLT
+6: OP := _MM_CMPINT_NLE
+7: OP := _MM_CMPINT_TRUE
+ESAC
+FOR j := 0 to 63
+	i := j*8
+	IF k1[j]
+		k[j] := ( a[i+7:i] OP b[i+7:i] ) ? 1 : 0
+	ELSE
+		k[j] := 0
+	FI
+ENDFOR
+k[MAX:64] := 0
+	</operation>
+	<instruction name="vpcmpub"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__mmask64" name="_mm512_mask_cmpeq_epu8_mask">
+	<type>Integer</type>
+	<type>Mask</type>
+	<CPUID>AVX512BW</CPUID>
+	<category>Compare</category>
+	<parameter varname="k1" type="__mmask64"/>
+	<parameter varname="a" type="__m512i"/>
+	<parameter varname="b" type="__m512i"/>
+	<description>Compare packed unsigned 8-bit integers in "a" and "b" for equality, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 63
+	i := j*8
+	IF k1[j]
+		k[j] := ( a[i+7:i] == b[i+7:i] ) ? 1 : 0
+	ELSE
+		k[j] := 0
+	FI
+ENDFOR
+k[MAX:64] := 0
+	</operation>
+	<instruction name="vpcmpub"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__mmask64" name="_mm512_mask_cmpge_epu8_mask">
+	<type>Integer</type>
+	<type>Mask</type>
+	<CPUID>AVX512BW</CPUID>
+	<category>Compare</category>
+	<parameter varname="k1" type="__mmask64"/>
+	<parameter varname="a" type="__m512i"/>
+	<parameter varname="b" type="__m512i"/>
+	<description>Compare packed unsigned 8-bit integers in "a" and "b" for greater-than-or-equal, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 63
+	i := j*8
+	IF k1[j]
+		k[j] := ( a[i+7:i] &gt;= b[i+7:i] ) ? 1 : 0
+	ELSE
+		k[j] := 0
+	FI
+ENDFOR
+k[MAX:64] := 0
+	</operation>
+	<instruction name="vpcmpub"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__mmask64" name="_mm512_mask_cmpgt_epu8_mask">
+	<type>Integer</type>
+	<type>Mask</type>
+	<CPUID>AVX512BW</CPUID>
+	<category>Compare</category>
+	<parameter varname="k1" type="__mmask64"/>
+	<parameter varname="a" type="__m512i"/>
+	<parameter varname="b" type="__m512i"/>
+	<description>Compare packed unsigned 8-bit integers in "a" and "b" for greater-than, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 63
+	i := j*8
+	IF k1[j]
+		k[j] := ( a[i+7:i] &gt; b[i+7:i] ) ? 1 : 0
+	ELSE
+		k[j] := 0
+	FI
+ENDFOR
+k[MAX:64] := 0
+	</operation>
+	<instruction name="vpcmpub"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__mmask64" name="_mm512_mask_cmple_epu8_mask">
+	<type>Integer</type>
+	<type>Mask</type>
+	<CPUID>AVX512BW</CPUID>
+	<category>Compare</category>
+	<parameter varname="k1" type="__mmask64"/>
+	<parameter varname="a" type="__m512i"/>
+	<parameter varname="b" type="__m512i"/>
+	<description>Compare packed unsigned 8-bit integers in "a" and "b" for less-than-or-equal, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 63
+	i := j*8
+	IF k1[j]
+		k[j] := ( a[i+7:i] &lt;= b[i+7:i] ) ? 1 : 0
+	ELSE
+		k[j] := 0
+	FI
+ENDFOR
+k[MAX:64] := 0
+	</operation>
+	<instruction name="vpcmpub"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__mmask64" name="_mm512_mask_cmplt_epu8_mask">
+	<type>Integer</type>
+	<type>Mask</type>
+	<CPUID>AVX512BW</CPUID>
+	<category>Compare</category>
+	<parameter varname="k1" type="__mmask64"/>
+	<parameter varname="a" type="__m512i"/>
+	<parameter varname="b" type="__m512i"/>
+	<description>Compare packed unsigned 8-bit integers in "a" and "b" for less-than, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 63
+	i := j*8
+	IF k1[j]
+		k[j] := ( a[i+7:i] &lt; b[i+7:i] ) ? 1 : 0
+	ELSE
+		k[j] := 0
+	FI
+ENDFOR
+k[MAX:64] := 0
+	</operation>
+	<instruction name="vpcmpub"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__mmask64" name="_mm512_mask_cmpneq_epu8_mask">
+	<type>Integer</type>
+	<type>Mask</type>
+	<CPUID>AVX512BW</CPUID>
+	<category>Compare</category>
+	<parameter varname="k1" type="__mmask64"/>
+	<parameter varname="a" type="__m512i"/>
+	<parameter varname="b" type="__m512i"/>
+	<description>Compare packed unsigned 8-bit integers in "a" and "b" for not-equal, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 63
+	i := j*8
+	IF k1[j]
+		k[j] := ( a[i+7:i] != b[i+7:i] ) ? 1 : 0
+	ELSE
+		k[j] := 0
+	FI
+ENDFOR
+k[MAX:64] := 0
+	</operation>
+	<instruction name="vpcmpub"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__mmask16" name="_mm_cmp_epu8_mask">
+	<type>Integer</type>
+	<type>Mask</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512BW</CPUID>
+	<category>Compare</category>
+	<parameter varname="a" type="__m128i"/>
+	<parameter varname="b" type="__m128i"/>
+	<parameter varname="imm8" type="const int"/>
+	<description>Compare packed unsigned 8-bit integers in "a" and "b" based on the comparison operand specified by "imm8", and store the results in mask vector "k".</description>
+	<operation>
+CASE (imm8[7:0]) OF
+0: OP := _MM_CMPINT_EQ
+1: OP := _MM_CMPINT_LT
+2: OP := _MM_CMPINT_LE
+3: OP := _MM_CMPINT_FALSE
+4: OP := _MM_CMPINT_NE
+5: OP := _MM_CMPINT_NLT
+6: OP := _MM_CMPINT_NLE
+7: OP := _MM_CMPINT_TRUE
+ESAC
+FOR j := 0 to 15
+	i := j*8
+	k[j] := ( a[i+7:i] OP b[i+7:i] ) ? 1 : 0
+ENDFOR
+k[MAX:16] := 0
+	</operation>
+	<instruction name="vpcmpub"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__mmask16" name="_mm_cmpeq_epu8_mask">
+	<type>Integer</type>
+	<type>Mask</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512BW</CPUID>
+	<category>Compare</category>
+	<parameter varname="a" type="__m128i"/>
+	<parameter varname="b" type="__m128i"/>
+	<description>Compare packed unsigned 8-bit integers in "a" and "b" for equality, and store the results in mask vector "k".</description>
+	<operation>
+FOR j := 0 to 15
+	i := j*8
+	k[j] := ( a[i+7:i] == b[i+7:i] ) ? 1 : 0
+ENDFOR
+k[MAX:16] := 0
+	</operation>
+	<instruction name="vpcmpub"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__mmask16" name="_mm_cmpge_epu8_mask">
+	<type>Integer</type>
+	<type>Mask</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512BW</CPUID>
+	<category>Compare</category>
+	<parameter varname="a" type="__m128i"/>
+	<parameter varname="b" type="__m128i"/>
+	<description>Compare packed unsigned 8-bit integers in "a" and "b" for greater-than-or-equal, and store the results in mask vector "k".</description>
+	<operation>
+FOR j := 0 to 15
+	i := j*8
+	k[j] := ( a[i+7:i] &gt;= b[i+7:i] ) ? 1 : 0
+ENDFOR
+k[MAX:16] := 0
+	</operation>
+	<instruction name="vpcmpub"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__mmask16" name="_mm_cmpgt_epu8_mask">
+	<type>Integer</type>
+	<type>Mask</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512BW</CPUID>
+	<category>Compare</category>
+	<parameter varname="a" type="__m128i"/>
+	<parameter varname="b" type="__m128i"/>
+	<description>Compare packed unsigned 8-bit integers in "a" and "b" for greater-than, and store the results in mask vector "k".</description>
+	<operation>
+FOR j := 0 to 15
+	i := j*8
+	k[j] := ( a[i+7:i] &gt; b[i+7:i] ) ? 1 : 0
+ENDFOR
+k[MAX:16] := 0
+	</operation>
+	<instruction name="vpcmpub"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__mmask16" name="_mm_cmple_epu8_mask">
+	<type>Integer</type>
+	<type>Mask</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512BW</CPUID>
+	<category>Compare</category>
+	<parameter varname="a" type="__m128i"/>
+	<parameter varname="b" type="__m128i"/>
+	<description>Compare packed unsigned 8-bit integers in "a" and "b" for less-than-or-equal, and store the results in mask vector "k".</description>
+	<operation>
+FOR j := 0 to 15
+	i := j*8
+	k[j] := ( a[i+7:i] &lt;= b[i+7:i] ) ? 1 : 0
+ENDFOR
+k[MAX:16] := 0
+	</operation>
+	<instruction name="vpcmpub"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__mmask16" name="_mm_cmplt_epu8_mask">
+	<type>Integer</type>
+	<type>Mask</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512BW</CPUID>
+	<category>Compare</category>
+	<parameter varname="a" type="__m128i"/>
+	<parameter varname="b" type="__m128i"/>
+	<description>Compare packed unsigned 8-bit integers in "a" and "b" for less-than, and store the results in mask vector "k".</description>
+	<operation>
+FOR j := 0 to 15
+	i := j*8
+	k[j] := ( a[i+7:i] &lt; b[i+7:i] ) ? 1 : 0
+ENDFOR
+k[MAX:16] := 0
+	</operation>
+	<instruction name="vpcmpub"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__mmask16" name="_mm_cmpneq_epu8_mask">
+	<type>Integer</type>
+	<type>Mask</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512BW</CPUID>
+	<category>Compare</category>
+	<parameter varname="a" type="__m128i"/>
+	<parameter varname="b" type="__m128i"/>
+	<description>Compare packed unsigned 8-bit integers in "a" and "b" for not-equal, and store the results in mask vector "k".</description>
+	<operation>
+FOR j := 0 to 15
+	i := j*8
+	k[j] := ( a[i+7:i] != b[i+7:i] ) ? 1 : 0
+ENDFOR
+k[MAX:16] := 0
+	</operation>
+	<instruction name="vpcmpub"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__mmask16" name="_mm_mask_cmp_epu8_mask">
+	<type>Integer</type>
+	<type>Mask</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512BW</CPUID>
+	<category>Compare</category>
+	<parameter varname="k1" type="__mmask16"/>
+	<parameter varname="a" type="__m128i"/>
+	<parameter varname="b" type="__m128i"/>
+	<parameter varname="imm8" type="const int"/>
+	<description>Compare packed unsigned 8-bit integers in "a" and "b" based on the comparison operand specified by "imm8", and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set).</description>
+	<operation>
+CASE (imm8[7:0]) OF
+0: OP := _MM_CMPINT_EQ
+1: OP := _MM_CMPINT_LT
+2: OP := _MM_CMPINT_LE
+3: OP := _MM_CMPINT_FALSE
+4: OP := _MM_CMPINT_NE
+5: OP := _MM_CMPINT_NLT
+6: OP := _MM_CMPINT_NLE
+7: OP := _MM_CMPINT_TRUE
+ESAC
+FOR j := 0 to 15
+	i := j*8
+	IF k1[j]
+		k[j] := ( a[i+7:i] OP b[i+7:i] ) ? 1 : 0
+	ELSE
+		k[j] := 0
+	FI
+ENDFOR
+k[MAX:16] := 0
+	</operation>
+	<instruction name="vpcmpub"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__mmask16" name="_mm_mask_cmpeq_epu8_mask">
+	<type>Integer</type>
+	<type>Mask</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512BW</CPUID>
+	<category>Compare</category>
+	<parameter varname="k1" type="__mmask16"/>
+	<parameter varname="a" type="__m128i"/>
+	<parameter varname="b" type="__m128i"/>
+	<description>Compare packed unsigned 8-bit integers in "a" and "b" for equality, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 15
+	i := j*8
+	IF k1[j]
+		k[j] := ( a[i+7:i] == b[i+7:i] ) ? 1 : 0
+	ELSE
+		k[j] := 0
+	FI
+ENDFOR
+k[MAX:16] := 0
+	</operation>
+	<instruction name="vpcmpub"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__mmask16" name="_mm_mask_cmpge_epu8_mask">
+	<type>Integer</type>
+	<type>Mask</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512BW</CPUID>
+	<category>Compare</category>
+	<parameter varname="k1" type="__mmask16"/>
+	<parameter varname="a" type="__m128i"/>
+	<parameter varname="b" type="__m128i"/>
+	<description>Compare packed unsigned 8-bit integers in "a" and "b" for greater-than-or-equal, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 15
+	i := j*8
+	IF k1[j]
+		k[j] := ( a[i+7:i] &gt;= b[i+7:i] ) ? 1 : 0
+	ELSE
+		k[j] := 0
+	FI
+ENDFOR
+k[MAX:16] := 0
+	</operation>
+	<instruction name="vpcmpub"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__mmask16" name="_mm_mask_cmpgt_epu8_mask">
+	<type>Integer</type>
+	<type>Mask</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512BW</CPUID>
+	<category>Compare</category>
+	<parameter varname="k1" type="__mmask16"/>
+	<parameter varname="a" type="__m128i"/>
+	<parameter varname="b" type="__m128i"/>
+	<description>Compare packed unsigned 8-bit integers in "a" and "b" for greater-than, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 15
+	i := j*8
+	IF k1[j]
+		k[j] := ( a[i+7:i] &gt; b[i+7:i] ) ? 1 : 0
+	ELSE
+		k[j] := 0
+	FI
+ENDFOR
+k[MAX:16] := 0
+	</operation>
+	<instruction name="vpcmpub"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__mmask16" name="_mm_mask_cmple_epu8_mask">
+	<type>Integer</type>
+	<type>Mask</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512BW</CPUID>
+	<category>Compare</category>
+	<parameter varname="k1" type="__mmask16"/>
+	<parameter varname="a" type="__m128i"/>
+	<parameter varname="b" type="__m128i"/>
+	<description>Compare packed unsigned 8-bit integers in "a" and "b" for less-than-or-equal, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 15
+	i := j*8
+	IF k1[j]
+		k[j] := ( a[i+7:i] &lt;= b[i+7:i] ) ? 1 : 0
+	ELSE
+		k[j] := 0
+	FI
+ENDFOR
+k[MAX:16] := 0
+	</operation>
+	<instruction name="vpcmpub"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__mmask16" name="_mm_mask_cmplt_epu8_mask">
+	<type>Integer</type>
+	<type>Mask</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512BW</CPUID>
+	<category>Compare</category>
+	<parameter varname="k1" type="__mmask16"/>
+	<parameter varname="a" type="__m128i"/>
+	<parameter varname="b" type="__m128i"/>
+	<description>Compare packed unsigned 8-bit integers in "a" and "b" for less-than, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 15
+	i := j*8
+	IF k1[j]
+		k[j] := ( a[i+7:i] &lt; b[i+7:i] ) ? 1 : 0
+	ELSE
+		k[j] := 0
+	FI
+ENDFOR
+k[MAX:16] := 0
+	</operation>
+	<instruction name="vpcmpub"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__mmask16" name="_mm_mask_cmpneq_epu8_mask">
+	<type>Integer</type>
+	<type>Mask</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512BW</CPUID>
+	<category>Compare</category>
+	<parameter varname="k1" type="__mmask16"/>
+	<parameter varname="a" type="__m128i"/>
+	<parameter varname="b" type="__m128i"/>
+	<description>Compare packed unsigned 8-bit integers in "a" and "b" for not-equal, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 15
+	i := j*8
+	IF k1[j]
+		k[j] := ( a[i+7:i] != b[i+7:i] ) ? 1 : 0
+	ELSE
+		k[j] := 0
+	FI
+ENDFOR
+k[MAX:16] := 0
+	</operation>
+	<instruction name="vpcmpub"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__mmask8" name="_mm256_cmp_epu32_mask">
+	<type>Integer</type>
+	<type>Mask</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Compare</category>
+	<parameter varname="a" type="__m256i"/>
+	<parameter varname="b" type="__m256i"/>
+	<parameter varname="imm8" type="const _MM_CMPINT_ENUM"/>
+	<description>Compare packed unsigned 32-bit integers in "a" and "b" based on the comparison operand specified by "imm8", and store the results in mask vector "k".</description>
+	<operation>
+CASE (imm8[7:0]) OF
+0: OP := _MM_CMPINT_EQ
+1: OP := _MM_CMPINT_LT
+2: OP := _MM_CMPINT_LE
+3: OP := _MM_CMPINT_FALSE
+4: OP := _MM_CMPINT_NE
+5: OP := _MM_CMPINT_NLT
+6: OP := _MM_CMPINT_NLE
+7: OP := _MM_CMPINT_TRUE
+ESAC
+FOR j := 0 to 7
+	i := j*32
+	k[j] := ( a[i+31:i] OP b[i+31:i] ) ? 1 : 0
+ENDFOR
+k[MAX:8] := 0
+	</operation>
+	<instruction name="vpcmpud"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__mmask8" name="_mm256_cmpeq_epu32_mask">
+	<type>Integer</type>
+	<type>Mask</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Compare</category>
+	<parameter varname="a" type="__m256i"/>
+	<parameter varname="b" type="__m256i"/>
+	<description>Compare packed unsigned 32-bit integers in "a" and "b" for equality, and store the results in mask vector "k".</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*32
+	k[j] := ( a[i+31:i] == b[i+31:i] ) ? 1 : 0
+ENDFOR
+k[MAX:8] := 0
+	</operation>
+	<instruction name="vpcmpud"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__mmask8" name="_mm256_cmpge_epu32_mask">
+	<type>Integer</type>
+	<type>Mask</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Compare</category>
+	<parameter varname="a" type="__m256i"/>
+	<parameter varname="b" type="__m256i"/>
+	<description>Compare packed unsigned 32-bit integers in "a" and "b" for greater-than-or-equal, and store the results in mask vector "k".</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*32
+	k[j] := ( a[i+31:i] &gt;= b[i+31:i] ) ? 1 : 0
+ENDFOR
+k[MAX:8] := 0
+	</operation>
+	<instruction name="vpcmpud"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__mmask8" name="_mm256_cmpgt_epu32_mask">
+	<type>Integer</type>
+	<type>Mask</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Compare</category>
+	<parameter varname="a" type="__m256i"/>
+	<parameter varname="b" type="__m256i"/>
+	<description>Compare packed unsigned 32-bit integers in "a" and "b" for greater-than, and store the results in mask vector "k".</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*32
+	k[j] := ( a[i+31:i] &gt; b[i+31:i] ) ? 1 : 0
+ENDFOR
+k[MAX:8] := 0
+	</operation>
+	<instruction name="vpcmpud"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__mmask8" name="_mm256_cmple_epu32_mask">
+	<type>Integer</type>
+	<type>Mask</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Compare</category>
+	<parameter varname="a" type="__m256i"/>
+	<parameter varname="b" type="__m256i"/>
+	<description>Compare packed unsigned 32-bit integers in "a" and "b" for less-than-or-equal, and store the results in mask vector "k".</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*32
+	k[j] := ( a[i+31:i] &lt;= b[i+31:i] ) ? 1 : 0
+ENDFOR
+k[MAX:8] := 0
+	</operation>
+	<instruction name="vpcmpud"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__mmask8" name="_mm256_cmplt_epu32_mask">
+	<type>Integer</type>
+	<type>Mask</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Compare</category>
+	<parameter varname="a" type="__m256i"/>
+	<parameter varname="b" type="__m256i"/>
+	<description>Compare packed unsigned 32-bit integers in "a" and "b" for less-than, and store the results in mask vector "k".</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*32
+	k[j] := ( a[i+31:i] &lt; b[i+31:i] ) ? 1 : 0
+ENDFOR
+k[MAX:8] := 0
+	</operation>
+	<instruction name="vpcmpud"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__mmask8" name="_mm256_cmpneq_epu32_mask">
+	<type>Integer</type>
+	<type>Mask</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Compare</category>
+	<parameter varname="a" type="__m256i"/>
+	<parameter varname="b" type="__m256i"/>
+	<description>Compare packed unsigned 32-bit integers in "a" and "b" for not-equal, and store the results in mask vector "k".</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*32
+	k[j] := ( a[i+31:i] != b[i+31:i] ) ? 1 : 0
+ENDFOR
+k[MAX:8] := 0
+	</operation>
+	<instruction name="vpcmpud"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__mmask8" name="_mm256_mask_cmp_epu32_mask">
+	<type>Integer</type>
+	<type>Mask</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Compare</category>
+	<parameter varname="k1" type="__mmask8"/>
+	<parameter varname="a" type="__m256i"/>
+	<parameter varname="b" type="__m256i"/>
+	<parameter varname="imm8" type="const _MM_CMPINT_ENUM"/>
+	<description>Compare packed unsigned 32-bit integers in "a" and "b" based on the comparison operand specified by "imm8", and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set).</description>
+	<operation>
+CASE (imm8[7:0]) OF
+0: OP := _MM_CMPINT_EQ
+1: OP := _MM_CMPINT_LT
+2: OP := _MM_CMPINT_LE
+3: OP := _MM_CMPINT_FALSE
+4: OP := _MM_CMPINT_NE
+5: OP := _MM_CMPINT_NLT
+6: OP := _MM_CMPINT_NLE
+7: OP := _MM_CMPINT_TRUE
+ESAC
+FOR j := 0 to 7
+	i := j*32
+	IF k1[j]
+		k[j] := ( a[i+31:i] OP b[i+31:i] ) ? 1 : 0
+	ELSE 
+		k[j] := 0
+	FI
+ENDFOR
+k[MAX:8] := 0
+	</operation>
+	<instruction name="vpcmpud"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__mmask8" name="_mm256_mask_cmpeq_epu32_mask">
+	<type>Integer</type>
+	<type>Mask</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Compare</category>
+	<parameter varname="k1" type="__mmask8"/>
+	<parameter varname="a" type="__m256i"/>
+	<parameter varname="b" type="__m256i"/>
+	<description>Compare packed unsigned 32-bit integers in "a" and "b" for equality, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*32
+	IF k1[j]
+		k[j] := ( a[i+31:i] == b[i+31:i] ) ? 1 : 0
+	ELSE 
+		k[j] := 0
+	FI
+ENDFOR
+k[MAX:8] := 0
+	</operation>
+	<instruction name="vpcmpud"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__mmask8" name="_mm256_mask_cmpge_epu32_mask">
+	<type>Integer</type>
+	<type>Mask</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Compare</category>
+	<parameter varname="k1" type="__mmask8"/>
+	<parameter varname="a" type="__m256i"/>
+	<parameter varname="b" type="__m256i"/>
+	<description>Compare packed unsigned 32-bit integers in "a" and "b" for greater-than-or-equal, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*32
+	IF k1[j]
+		k[j] := ( a[i+31:i] &gt;= b[i+31:i] ) ? 1 : 0
+	ELSE 
+		k[j] := 0
+	FI
+ENDFOR
+k[MAX:8] := 0
+	</operation>
+	<instruction name="vpcmpud"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__mmask8" name="_mm256_mask_cmpgt_epu32_mask">
+	<type>Integer</type>
+	<type>Mask</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Compare</category>
+	<parameter varname="k1" type="__mmask8"/>
+	<parameter varname="a" type="__m256i"/>
+	<parameter varname="b" type="__m256i"/>
+	<description>Compare packed unsigned 32-bit integers in "a" and "b" for greater-than, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*32
+	IF k1[j]
+		k[j] := ( a[i+31:i] &gt; b[i+31:i] ) ? 1 : 0
+	ELSE 
+		k[j] := 0
+	FI
+ENDFOR
+k[MAX:8] := 0
+	</operation>
+	<instruction name="vpcmpud"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__mmask8" name="_mm256_mask_cmple_epu32_mask">
+	<type>Integer</type>
+	<type>Mask</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Compare</category>
+	<parameter varname="k1" type="__mmask8"/>
+	<parameter varname="a" type="__m256i"/>
+	<parameter varname="b" type="__m256i"/>
+	<description>Compare packed unsigned 32-bit integers in "a" and "b" for less-than-or-equal, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*32
+	IF k1[j]
+		k[j] := ( a[i+31:i] &lt;= b[i+31:i] ) ? 1 : 0
+	ELSE 
+		k[j] := 0
+	FI
+ENDFOR
+k[MAX:8] := 0
+	</operation>
+	<instruction name="vpcmpud"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__mmask8" name="_mm256_mask_cmplt_epu32_mask">
+	<type>Integer</type>
+	<type>Mask</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Compare</category>
+	<parameter varname="k1" type="__mmask8"/>
+	<parameter varname="a" type="__m256i"/>
+	<parameter varname="b" type="__m256i"/>
+	<description>Compare packed unsigned 32-bit integers in "a" and "b" for less-than, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*32
+	IF k1[j]
+		k[j] := ( a[i+31:i] &lt; b[i+31:i] ) ? 1 : 0
+	ELSE 
+		k[j] := 0
+	FI
+ENDFOR
+k[MAX:8] := 0
+	</operation>
+	<instruction name="vpcmpud"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__mmask8" name="_mm256_mask_cmpneq_epu32_mask">
+	<type>Integer</type>
+	<type>Mask</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Compare</category>
+	<parameter varname="k1" type="__mmask8"/>
+	<parameter varname="a" type="__m256i"/>
+	<parameter varname="b" type="__m256i"/>
+	<description>Compare packed unsigned 32-bit integers in "a" and "b" for not-equal, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*32
+	IF k1[j]
+		k[j] := ( a[i+31:i] != b[i+31:i] ) ? 1 : 0
+	ELSE 
+		k[j] := 0
+	FI
+ENDFOR
+k[MAX:8] := 0
+	</operation>
+	<instruction name="vpcmpud"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__mmask8" name="_mm_cmp_epu32_mask">
+	<type>Integer</type>
+	<type>Mask</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Compare</category>
+	<parameter varname="a" type="__m128i"/>
+	<parameter varname="b" type="__m128i"/>
+	<parameter varname="imm8" type="const _MM_CMPINT_ENUM"/>
+	<description>Compare packed unsigned 32-bit integers in "a" and "b" based on the comparison operand specified by "imm8", and store the results in mask vector "k".</description>
+	<operation>
+CASE (imm8[7:0]) OF
+0: OP := _MM_CMPINT_EQ
+1: OP := _MM_CMPINT_LT
+2: OP := _MM_CMPINT_LE
+3: OP := _MM_CMPINT_FALSE
+4: OP := _MM_CMPINT_NE
+5: OP := _MM_CMPINT_NLT
+6: OP := _MM_CMPINT_NLE
+7: OP := _MM_CMPINT_TRUE
+ESAC
+FOR j := 0 to 3
+	i := j*32
+	k[j] := ( a[i+31:i] OP b[i+31:i] ) ? 1 : 0
+ENDFOR
+k[MAX:4] := 0
+	</operation>
+	<instruction name="vpcmpud"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__mmask8" name="_mm_cmpeq_epu32_mask">
+	<type>Integer</type>
+	<type>Mask</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Compare</category>
+	<parameter varname="a" type="__m128i"/>
+	<parameter varname="b" type="__m128i"/>
+	<description>Compare packed unsigned 32-bit integers in "a" and "b" for equality, and store the results in mask vector "k".</description>
+	<operation>
+FOR j := 0 to 3
+	i := j*32
+	k[j] := ( a[i+31:i] == b[i+31:i] ) ? 1 : 0
+ENDFOR
+k[MAX:4] := 0
+	</operation>
+	<instruction name="vpcmpud"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__mmask8" name="_mm_cmpge_epu32_mask">
+	<type>Integer</type>
+	<type>Mask</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Compare</category>
+	<parameter varname="a" type="__m128i"/>
+	<parameter varname="b" type="__m128i"/>
+	<description>Compare packed unsigned 32-bit integers in "a" and "b" for greater-than-or-equal, and store the results in mask vector "k".</description>
+	<operation>
+FOR j := 0 to 3
+	i := j*32
+	k[j] := ( a[i+31:i] &gt;= b[i+31:i] ) ? 1 : 0
+ENDFOR
+k[MAX:4] := 0
+	</operation>
+	<instruction name="vpcmpud"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__mmask8" name="_mm_cmpgt_epu32_mask">
+	<type>Integer</type>
+	<type>Mask</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Compare</category>
+	<parameter varname="a" type="__m128i"/>
+	<parameter varname="b" type="__m128i"/>
+	<description>Compare packed unsigned 32-bit integers in "a" and "b" for greater-than, and store the results in mask vector "k".</description>
+	<operation>
+FOR j := 0 to 3
+	i := j*32
+	k[j] := ( a[i+31:i] &gt; b[i+31:i] ) ? 1 : 0
+ENDFOR
+k[MAX:4] := 0
+	</operation>
+	<instruction name="vpcmpud"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__mmask8" name="_mm_cmple_epu32_mask">
+	<type>Integer</type>
+	<type>Mask</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Compare</category>
+	<parameter varname="a" type="__m128i"/>
+	<parameter varname="b" type="__m128i"/>
+	<description>Compare packed unsigned 32-bit integers in "a" and "b" for less-than-or-equal, and store the results in mask vector "k".</description>
+	<operation>
+FOR j := 0 to 3
+	i := j*32
+	k[j] := ( a[i+31:i] &lt;= b[i+31:i] ) ? 1 : 0
+ENDFOR
+k[MAX:4] := 0
+	</operation>
+	<instruction name="vpcmpud"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__mmask8" name="_mm_cmplt_epu32_mask">
+	<type>Integer</type>
+	<type>Mask</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Compare</category>
+	<parameter varname="a" type="__m128i"/>
+	<parameter varname="b" type="__m128i"/>
+	<description>Compare packed unsigned 32-bit integers in "a" and "b" for less-than, and store the results in mask vector "k".</description>
+	<operation>
+FOR j := 0 to 3
+	i := j*32
+	k[j] := ( a[i+31:i] &lt; b[i+31:i] ) ? 1 : 0
+ENDFOR
+k[MAX:4] := 0
+	</operation>
+	<instruction name="vpcmpud"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__mmask8" name="_mm_cmpneq_epu32_mask">
+	<type>Integer</type>
+	<type>Mask</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Compare</category>
+	<parameter varname="a" type="__m128i"/>
+	<parameter varname="b" type="__m128i"/>
+	<description>Compare packed unsigned 32-bit integers in "a" and "b" for not-equal, and store the results in mask vector "k".</description>
+	<operation>
+FOR j := 0 to 3
+	i := j*32
+	k[j] := ( a[i+31:i] != b[i+31:i] ) ? 1 : 0
+ENDFOR
+k[MAX:4] := 0
+	</operation>
+	<instruction name="vpcmpud"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__mmask8" name="_mm_mask_cmp_epu32_mask">
+	<type>Integer</type>
+	<type>Mask</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Compare</category>
+	<parameter varname="k1" type="__mmask8"/>
+	<parameter varname="a" type="__m128i"/>
+	<parameter varname="b" type="__m128i"/>
+	<parameter varname="imm8" type="const _MM_CMPINT_ENUM"/>
+	<description>Compare packed unsigned 32-bit integers in "a" and "b" based on the comparison operand specified by "imm8", and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set).</description>
+	<operation>
+CASE (imm8[7:0]) OF
+0: OP := _MM_CMPINT_EQ
+1: OP := _MM_CMPINT_LT
+2: OP := _MM_CMPINT_LE
+3: OP := _MM_CMPINT_FALSE
+4: OP := _MM_CMPINT_NE
+5: OP := _MM_CMPINT_NLT
+6: OP := _MM_CMPINT_NLE
+7: OP := _MM_CMPINT_TRUE
+ESAC
+FOR j := 0 to 3
+	i := j*32
+	IF k1[j]
+		k[j] := ( a[i+31:i] OP b[i+31:i] ) ? 1 : 0
+	ELSE 
+		k[j] := 0
+	FI
+ENDFOR
+k[MAX:4] := 0
+	</operation>
+	<instruction name="vpcmpud"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__mmask8" name="_mm_mask_cmpeq_epu32_mask">
+	<type>Integer</type>
+	<type>Mask</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Compare</category>
+	<parameter varname="k1" type="__mmask8"/>
+	<parameter varname="a" type="__m128i"/>
+	<parameter varname="b" type="__m128i"/>
+	<description>Compare packed unsigned 32-bit integers in "a" and "b" for equality, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 3
+	i := j*32
+	IF k1[j]
+		k[j] := ( a[i+31:i] == b[i+31:i] ) ? 1 : 0
+	ELSE 
+		k[j] := 0
+	FI
+ENDFOR
+k[MAX:4] := 0
+	</operation>
+	<instruction name="vpcmpud"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__mmask8" name="_mm_mask_cmpge_epu32_mask">
+	<type>Integer</type>
+	<type>Mask</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Compare</category>
+	<parameter varname="k1" type="__mmask8"/>
+	<parameter varname="a" type="__m128i"/>
+	<parameter varname="b" type="__m128i"/>
+	<description>Compare packed unsigned 32-bit integers in "a" and "b" for greater-than-or-equal, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 3
+	i := j*32
+	IF k1[j]
+		k[j] := ( a[i+31:i] &gt;= b[i+31:i] ) ? 1 : 0
+	ELSE 
+		k[j] := 0
+	FI
+ENDFOR
+k[MAX:4] := 0
+	</operation>
+	<instruction name="vpcmpud"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__mmask8" name="_mm_mask_cmpgt_epu32_mask">
+	<type>Integer</type>
+	<type>Mask</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Compare</category>
+	<parameter varname="k1" type="__mmask8"/>
+	<parameter varname="a" type="__m128i"/>
+	<parameter varname="b" type="__m128i"/>
+	<!-- stray garbled duplicate of the "b" parameter element removed here -->
+	<description>Compare packed unsigned 32-bit integers in "a" and "b" for greater-than, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 3
+	i := j*32
+	IF k1[j]
+		k[j] := ( a[i+31:i] &gt; b[i+31:i] ) ? 1 : 0
+	ELSE 
+		k[j] := 0
+	FI
+ENDFOR
+k[MAX:4] := 0
+	</operation>
+	<instruction name="vpcmpud"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__mmask8" name="_mm_mask_cmple_epu32_mask">
+	<type>Integer</type>
+	<type>Mask</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Compare</category>
+	<parameter varname="k1" type="__mmask8"/>
+	<parameter varname="a" type="__m128i"/>
+	<parameter varname="b" type="__m128i"/>
+	<!-- stray garbled duplicate of the "b" parameter element removed here -->
+	<description>Compare packed unsigned 32-bit integers in "a" and "b" for less-than-or-equal, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 3
+	i := j*32
+	IF k1[j]
+		k[j] := ( a[i+31:i] &lt;= b[i+31:i] ) ? 1 : 0
+	ELSE 
+		k[j] := 0
+	FI
+ENDFOR
+k[MAX:4] := 0
+	</operation>
+	<instruction name="vpcmpud"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__mmask8" name="_mm_mask_cmplt_epu32_mask">
+	<type>Integer</type>
+	<type>Mask</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Compare</category>
+	<parameter varname="k1" type="__mmask8"/>
+	<parameter varname="a" type="__m128i"/>
+	<parameter varname="b" type="__m128i"/>
+	<!-- stray garbled duplicate of the "b" parameter element removed here -->
+	<description>Compare packed unsigned 32-bit integers in "a" and "b" for less-than, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 3
+	i := j*32
+	IF k1[j]
+		k[j] := ( a[i+31:i] &lt; b[i+31:i] ) ? 1 : 0
+	ELSE 
+		k[j] := 0
+	FI
+ENDFOR
+k[MAX:4] := 0
+	</operation>
+	<instruction name="vpcmpud"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__mmask8" name="_mm_mask_cmpneq_epu32_mask">
+	<type>Integer</type>
+	<type>Mask</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Compare</category>
+	<parameter varname="k1" type="__mmask8"/>
+	<parameter varname="a" type="__m128i"/>
+	<parameter varname="b" type="__m128i"/>
+	<!-- stray garbled duplicate of the "b" parameter element removed here -->
+	<description>Compare packed unsigned 32-bit integers in "a" and "b" for not-equal, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 3
+	i := j*32
+	IF k1[j]
+		k[j] := ( a[i+31:i] != b[i+31:i] ) ? 1 : 0
+	ELSE 
+		k[j] := 0
+	FI
+ENDFOR
+k[MAX:4] := 0
+	</operation>
+	<instruction name="vpcmpud"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__mmask8" name="_mm256_cmp_epu64_mask">
+	<type>Integer</type>
+	<type>Mask</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Compare</category>
+	<parameter varname="a" type="__m256i"/>
+	<parameter varname="b" type="__m256i"/>
+	<parameter varname="imm8" type="const _MM_CMPINT_ENUM"/>
+	<description>Compare packed unsigned 64-bit integers in "a" and "b" based on the comparison operand specified by "imm8", and store the results in mask vector "k".</description>
+	<operation>
+CASE (imm8[7:0]) OF
+0: OP := _MM_CMPINT_EQ
+1: OP := _MM_CMPINT_LT
+2: OP := _MM_CMPINT_LE
+3: OP := _MM_CMPINT_FALSE
+4: OP := _MM_CMPINT_NE
+5: OP := _MM_CMPINT_NLT
+6: OP := _MM_CMPINT_NLE
+7: OP := _MM_CMPINT_TRUE
+ESAC
+FOR j := 0 to 3
+	i := j*64
+	k[j] := ( a[i+63:i] OP b[i+63:i] ) ? 1 : 0
+ENDFOR
+k[MAX:4] := 0
+	</operation>
+	<instruction name="vpcmpuq"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__mmask8" name="_mm256_cmpeq_epu64_mask">
+	<type>Integer</type>
+	<type>Mask</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Compare</category>
+	<parameter varname="a" type="__m256i"/>
+	<parameter varname="b" type="__m256i"/>
+	<description>Compare packed unsigned 64-bit integers in "a" and "b" for equality, and store the results in mask vector "k".</description>
+	<operation>
+FOR j := 0 to 3
+	i := j*64
+	k[j] := ( a[i+63:i] == b[i+63:i] ) ? 1 : 0
+ENDFOR
+k[MAX:4] := 0
+	</operation>
+	<instruction name="vpcmpuq"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__mmask8" name="_mm256_cmpge_epu64_mask">
+	<type>Integer</type>
+	<type>Mask</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Compare</category>
+	<parameter varname="a" type="__m256i"/>
+	<parameter varname="b" type="__m256i"/>
+	<description>Compare packed unsigned 64-bit integers in "a" and "b" for greater-than-or-equal, and store the results in mask vector "k".</description>
+	<operation>
+FOR j := 0 to 3
+	i := j*64
+	k[j] := ( a[i+63:i] &gt;= b[i+63:i] ) ? 1 : 0
+ENDFOR
+k[MAX:4] := 0
+	</operation>
+	<instruction name="vpcmpuq"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__mmask8" name="_mm256_cmpgt_epu64_mask">
+	<type>Integer</type>
+	<type>Mask</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Compare</category>
+	<parameter varname="a" type="__m256i"/>
+	<parameter varname="b" type="__m256i"/>
+	<description>Compare packed unsigned 64-bit integers in "a" and "b" for greater-than, and store the results in mask vector "k".</description>
+	<operation>
+FOR j := 0 to 3
+	i := j*64
+	k[j] := ( a[i+63:i] &gt; b[i+63:i] ) ? 1 : 0
+ENDFOR
+k[MAX:4] := 0
+	</operation>
+	<instruction name="vpcmpuq"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__mmask8" name="_mm256_cmple_epu64_mask">
+	<type>Integer</type>
+	<type>Mask</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Compare</category>
+	<parameter varname="a" type="__m256i"/>
+	<parameter varname="b" type="__m256i"/>
+	<description>Compare packed unsigned 64-bit integers in "a" and "b" for less-than-or-equal, and store the results in mask vector "k".</description>
+	<operation>
+FOR j := 0 to 3
+	i := j*64
+	k[j] := ( a[i+63:i] &lt;= b[i+63:i] ) ? 1 : 0
+ENDFOR
+k[MAX:4] := 0
+	</operation>
+	<instruction name="vpcmpuq"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__mmask8" name="_mm256_cmplt_epu64_mask">
+	<type>Integer</type>
+	<type>Mask</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Compare</category>
+	<parameter varname="a" type="__m256i"/>
+	<parameter varname="b" type="__m256i"/>
+	<description>Compare packed unsigned 64-bit integers in "a" and "b" for less-than, and store the results in mask vector "k".</description>
+	<operation>
+FOR j := 0 to 3
+	i := j*64
+	k[j] := ( a[i+63:i] &lt; b[i+63:i] ) ? 1 : 0
+ENDFOR
+k[MAX:4] := 0
+	</operation>
+	<instruction name="vpcmpuq"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__mmask8" name="_mm256_cmpneq_epu64_mask">
+	<type>Integer</type>
+	<type>Mask</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Compare</category>
+	<parameter varname="a" type="__m256i"/>
+	<parameter varname="b" type="__m256i"/>
+	<description>Compare packed unsigned 64-bit integers in "a" and "b" for not-equal, and store the results in mask vector "k".</description>
+	<operation>
+FOR j := 0 to 3
+	i := j*64
+	k[j] := ( a[i+63:i] != b[i+63:i] ) ? 1 : 0
+ENDFOR
+k[MAX:4] := 0
+	</operation>
+	<instruction name="vpcmpuq"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__mmask8" name="_mm256_mask_cmp_epu64_mask">
+	<type>Integer</type>
+	<type>Mask</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Compare</category>
+	<parameter varname="k1" type="__mmask8"/>
+	<parameter varname="a" type="__m256i"/>
+	<parameter varname="b" type="__m256i"/>
+	<parameter varname="imm8" type="const _MM_CMPINT_ENUM"/>
+	<description>Compare packed unsigned 64-bit integers in "a" and "b" based on the comparison operand specified by "imm8", and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set).</description>
+	<operation>
+CASE (imm8[7:0]) OF
+0: OP := _MM_CMPINT_EQ
+1: OP := _MM_CMPINT_LT
+2: OP := _MM_CMPINT_LE
+3: OP := _MM_CMPINT_FALSE
+4: OP := _MM_CMPINT_NE
+5: OP := _MM_CMPINT_NLT
+6: OP := _MM_CMPINT_NLE
+7: OP := _MM_CMPINT_TRUE
+ESAC
+FOR j := 0 to 3
+	i := j*64
+	IF k1[j]
+		k[j] := ( a[i+63:i] OP b[i+63:i] ) ? 1 : 0
+	ELSE 
+		k[j] := 0
+	FI
+ENDFOR
+k[MAX:4] := 0
+	</operation>
+	<instruction name="vpcmpuq"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__mmask8" name="_mm256_mask_cmpeq_epu64_mask">
+	<type>Integer</type>
+	<type>Mask</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Compare</category>
+	<parameter varname="k1" type="__mmask8"/>
+	<parameter varname="a" type="__m256i"/>
+	<parameter varname="b" type="__m256i"/>
+	<description>Compare packed unsigned 64-bit integers in "a" and "b" for equality, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 3
+	i := j*64
+	IF k1[j]
+		k[j] := ( a[i+63:i] == b[i+63:i] ) ? 1 : 0
+	ELSE 
+		k[j] := 0
+	FI
+ENDFOR
+k[MAX:4] := 0
+	</operation>
+	<instruction name="vpcmpuq"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__mmask8" name="_mm256_mask_cmpge_epu64_mask">
+	<type>Integer</type>
+	<type>Mask</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Compare</category>
+	<parameter varname="k1" type="__mmask8"/>
+	<parameter varname="a" type="__m256i"/>
+	<parameter varname="b" type="__m256i"/>
+	<description>Compare packed unsigned 64-bit integers in "a" and "b" for greater-than-or-equal, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 3
+	i := j*64
+	IF k1[j]
+		k[j] := ( a[i+63:i] &gt;= b[i+63:i] ) ? 1 : 0
+	ELSE 
+		k[j] := 0
+	FI
+ENDFOR
+k[MAX:4] := 0
+	</operation>
+	<instruction name="vpcmpuq"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__mmask8" name="_mm256_mask_cmpgt_epu64_mask">
+	<type>Integer</type>
+	<type>Mask</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Compare</category>
+	<parameter varname="k1" type="__mmask8"/>
+	<parameter varname="a" type="__m256i"/>
+	<parameter varname="b" type="__m256i"/>
+	<description>Compare packed unsigned 64-bit integers in "a" and "b" for greater-than, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 3
+	i := j*64
+	IF k1[j]
+		k[j] := ( a[i+63:i] &gt; b[i+63:i] ) ? 1 : 0
+	ELSE 
+		k[j] := 0
+	FI
+ENDFOR
+k[MAX:4] := 0
+	</operation>
+	<instruction name="vpcmpuq"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__mmask8" name="_mm256_mask_cmple_epu64_mask">
+	<type>Integer</type>
+	<type>Mask</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Compare</category>
+	<parameter varname="k1" type="__mmask8"/>
+	<parameter varname="a" type="__m256i"/>
+	<parameter varname="b" type="__m256i"/>
+	<description>Compare packed unsigned 64-bit integers in "a" and "b" for less-than-or-equal, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 3
+	i := j*64
+	IF k1[j]
+		k[j] := ( a[i+63:i] &lt;= b[i+63:i] ) ? 1 : 0
+	ELSE 
+		k[j] := 0
+	FI
+ENDFOR
+k[MAX:4] := 0
+	</operation>
+	<instruction name="vpcmpuq"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__mmask8" name="_mm256_mask_cmplt_epu64_mask">
+	<type>Integer</type>
+	<type>Mask</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Compare</category>
+	<parameter varname="k1" type="__mmask8"/>
+	<parameter varname="a" type="__m256i"/>
+	<parameter varname="b" type="__m256i"/>
+	<description>Compare packed unsigned 64-bit integers in "a" and "b" for less-than, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 3
+	i := j*64
+	IF k1[j]
+		k[j] := ( a[i+63:i] &lt; b[i+63:i] ) ? 1 : 0
+	ELSE 
+		k[j] := 0
+	FI
+ENDFOR
+k[MAX:4] := 0
+	</operation>
+	<instruction name="vpcmpuq"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__mmask8" name="_mm256_mask_cmpneq_epu64_mask">
+	<type>Integer</type>
+	<type>Mask</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Compare</category>
+	<parameter varname="k1" type="__mmask8"/>
+	<parameter varname="a" type="__m256i"/>
+	<parameter varname="b" type="__m256i"/>
+	<description>Compare packed unsigned 64-bit integers in "a" and "b" for not-equal, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 3
+	i := j*64
+	IF k1[j]
+		k[j] := ( a[i+63:i] != b[i+63:i] ) ? 1 : 0
+	ELSE 
+		k[j] := 0
+	FI
+ENDFOR
+k[MAX:4] := 0
+	</operation>
+	<instruction name="vpcmpuq"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__mmask8" name="_mm_cmp_epu64_mask">
+	<type>Integer</type>
+	<type>Mask</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Compare</category>
+	<parameter varname="a" type="__m128i"/>
+	<parameter varname="b" type="__m128i"/>
+	<parameter varname="imm8" type="const _MM_CMPINT_ENUM"/>
+	<description>Compare packed unsigned 64-bit integers in "a" and "b" based on the comparison operand specified by "imm8", and store the results in mask vector "k".</description>
+	<operation>
+CASE (imm8[7:0]) OF
+0: OP := _MM_CMPINT_EQ
+1: OP := _MM_CMPINT_LT
+2: OP := _MM_CMPINT_LE
+3: OP := _MM_CMPINT_FALSE
+4: OP := _MM_CMPINT_NE
+5: OP := _MM_CMPINT_NLT
+6: OP := _MM_CMPINT_NLE
+7: OP := _MM_CMPINT_TRUE
+ESAC
+FOR j := 0 to 1
+	i := j*64
+	k[j] := ( a[i+63:i] OP b[i+63:i] ) ? 1 : 0
+ENDFOR
+k[MAX:2] := 0
+	</operation>
+	<instruction name="vpcmpuq"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__mmask8" name="_mm_cmpeq_epu64_mask">
+	<type>Integer</type>
+	<type>Mask</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Compare</category>
+	<parameter varname="a" type="__m128i"/>
+	<parameter varname="b" type="__m128i"/>
+	<description>Compare packed unsigned 64-bit integers in "a" and "b" for equality, and store the results in mask vector "k".</description>
+	<operation>
+FOR j := 0 to 1
+	i := j*64
+	k[j] := ( a[i+63:i] == b[i+63:i] ) ? 1 : 0
+ENDFOR
+k[MAX:2] := 0
+	</operation>
+	<instruction name="vpcmpuq"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__mmask8" name="_mm_cmpge_epu64_mask">
+	<type>Integer</type>
+	<type>Mask</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Compare</category>
+	<parameter varname="a" type="__m128i"/>
+	<parameter varname="b" type="__m128i"/>
+	<description>Compare packed unsigned 64-bit integers in "a" and "b" for greater-than-or-equal, and store the results in mask vector "k".</description>
+	<operation>
+FOR j := 0 to 1
+	i := j*64
+	k[j] := ( a[i+63:i] &gt;= b[i+63:i] ) ? 1 : 0
+ENDFOR
+k[MAX:2] := 0
+	</operation>
+	<instruction name="vpcmpuq"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__mmask8" name="_mm_cmpgt_epu64_mask">
+	<type>Integer</type>
+	<type>Mask</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Compare</category>
+	<parameter varname="a" type="__m128i"/>
+	<parameter varname="b" type="__m128i"/>
+	<description>Compare packed unsigned 64-bit integers in "a" and "b" for greater-than, and store the results in mask vector "k".</description>
+	<operation>
+FOR j := 0 to 1
+	i := j*64
+	k[j] := ( a[i+63:i] &gt; b[i+63:i] ) ? 1 : 0
+ENDFOR
+k[MAX:2] := 0
+	</operation>
+	<instruction name="vpcmpuq"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__mmask8" name="_mm_cmple_epu64_mask">
+	<type>Integer</type>
+	<type>Mask</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Compare</category>
+	<parameter varname="a" type="__m128i"/>
+	<parameter varname="b" type="__m128i"/>
+	<description>Compare packed unsigned 64-bit integers in "a" and "b" for less-than-or-equal, and store the results in mask vector "k".</description>
+	<operation>
+FOR j := 0 to 1
+	i := j*64
+	k[j] := ( a[i+63:i] &lt;= b[i+63:i] ) ? 1 : 0
+ENDFOR
+k[MAX:2] := 0
+	</operation>
+	<instruction name="vpcmpuq"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__mmask8" name="_mm_cmplt_epu64_mask">
+	<type>Integer</type>
+	<type>Mask</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Compare</category>
+	<parameter varname="a" type="__m128i"/>
+	<parameter varname="b" type="__m128i"/>
+	<description>Compare packed unsigned 64-bit integers in "a" and "b" for less-than, and store the results in mask vector "k".</description>
+	<operation>
+FOR j := 0 to 1
+	i := j*64
+	k[j] := ( a[i+63:i] &lt; b[i+63:i] ) ? 1 : 0
+ENDFOR
+k[MAX:2] := 0
+	</operation>
+	<instruction name="vpcmpuq"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__mmask8" name="_mm_cmpneq_epu64_mask">
+	<type>Integer</type>
+	<type>Mask</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Compare</category>
+	<parameter varname="a" type="__m128i"/>
+	<parameter varname="b" type="__m128i"/>
+	<description>Compare packed unsigned 64-bit integers in "a" and "b" for not-equal, and store the results in mask vector "k".</description>
+	<operation>
+FOR j := 0 to 1
+	i := j*64
+	k[j] := ( a[i+63:i] != b[i+63:i] ) ? 1 : 0
+ENDFOR
+k[MAX:2] := 0
+	</operation>
+	<instruction name="vpcmpuq"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__mmask8" name="_mm_mask_cmp_epu64_mask">
+	<type>Integer</type>
+	<type>Mask</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Compare</category>
+	<parameter varname="k1" type="__mmask8"/>
+	<parameter varname="a" type="__m128i"/>
+	<parameter varname="b" type="__m128i"/>
+	<parameter varname="imm8" type="const _MM_CMPINT_ENUM"/>
+	<description>Compare packed unsigned 64-bit integers in "a" and "b" based on the comparison operand specified by "imm8", and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set).</description>
+	<operation>
+CASE (imm8[7:0]) OF
+0: OP := _MM_CMPINT_EQ
+1: OP := _MM_CMPINT_LT
+2: OP := _MM_CMPINT_LE
+3: OP := _MM_CMPINT_FALSE
+4: OP := _MM_CMPINT_NE
+5: OP := _MM_CMPINT_NLT
+6: OP := _MM_CMPINT_NLE
+7: OP := _MM_CMPINT_TRUE
+ESAC
+FOR j := 0 to 1
+	i := j*64
+	IF k1[j]
+		k[j] := ( a[i+63:i] OP b[i+63:i] ) ? 1 : 0
+	ELSE 
+		k[j] := 0
+	FI
+ENDFOR
+k[MAX:2] := 0
+	</operation>
+	<instruction name="vpcmpuq"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__mmask8" name="_mm_mask_cmpeq_epu64_mask">
+	<type>Integer</type>
+	<type>Mask</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Compare</category>
+	<parameter varname="k1" type="__mmask8"/>
+	<parameter varname="a" type="__m128i"/>
+	<parameter varname="b" type="__m128i"/>
+	<description>Compare packed unsigned 64-bit integers in "a" and "b" for equality, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 1
+	i := j*64
+	IF k1[j]
+		k[j] := ( a[i+63:i] == b[i+63:i] ) ? 1 : 0
+	ELSE 
+		k[j] := 0
+	FI
+ENDFOR
+k[MAX:2] := 0
+	</operation>
+	<instruction name="vpcmpuq"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__mmask8" name="_mm_mask_cmpge_epu64_mask">
+	<type>Integer</type>
+	<type>Mask</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Compare</category>
+	<parameter varname="k1" type="__mmask8"/>
+	<parameter varname="a" type="__m128i"/>
+	<parameter varname="b" type="__m128i"/>
+	<description>Compare packed unsigned 64-bit integers in "a" and "b" for greater-than-or-equal, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 1
+	i := j*64
+	IF k1[j]
+		k[j] := ( a[i+63:i] &gt;= b[i+63:i] ) ? 1 : 0
+	ELSE 
+		k[j] := 0
+	FI
+ENDFOR
+k[MAX:2] := 0
+	</operation>
+	<instruction name="vpcmpuq"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__mmask8" name="_mm_mask_cmpgt_epu64_mask">
+	<type>Integer</type>
+	<type>Mask</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Compare</category>
+	<parameter varname="k1" type="__mmask8"/>
+	<parameter varname="a" type="__m128i"/>
+	<parameter varname="b" type="__m128i"/>
+	<description>Compare packed unsigned 64-bit integers in "a" and "b" for greater-than, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 1
+	i := j*64
+	IF k1[j]
+		k[j] := ( a[i+63:i] &gt; b[i+63:i] ) ? 1 : 0
+	ELSE 
+		k[j] := 0
+	FI
+ENDFOR
+k[MAX:2] := 0
+	</operation>
+	<instruction name="vpcmpuq"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__mmask8" name="_mm_mask_cmple_epu64_mask">
+	<type>Integer</type>
+	<type>Mask</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Compare</category>
+	<parameter varname="k1" type="__mmask8"/>
+	<parameter varname="a" type="__m128i"/>
+	<parameter varname="b" type="__m128i"/>
+	<description>Compare packed unsigned 64-bit integers in "a" and "b" for less-than-or-equal, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 1
+	i := j*64
+	IF k1[j]
+		k[j] := ( a[i+63:i] &lt;= b[i+63:i] ) ? 1 : 0
+	ELSE 
+		k[j] := 0
+	FI
+ENDFOR
+k[MAX:2] := 0
+	</operation>
+	<instruction name="vpcmpuq"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__mmask8" name="_mm_mask_cmplt_epu64_mask">
+	<type>Integer</type>
+	<type>Mask</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Compare</category>
+	<parameter varname="k1" type="__mmask8"/>
+	<parameter varname="a" type="__m128i"/>
+	<parameter varname="b" type="__m128i"/>
+	<description>Compare packed unsigned 64-bit integers in "a" and "b" for less-than, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 1
+	i := j*64
+	IF k1[j]
+		k[j] := ( a[i+63:i] &lt; b[i+63:i] ) ? 1 : 0
+	ELSE 
+		k[j] := 0
+	FI
+ENDFOR
+k[MAX:2] := 0
+	</operation>
+	<instruction name="vpcmpuq"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__mmask8" name="_mm_mask_cmpneq_epu64_mask">
+	<type>Integer</type>
+	<type>Mask</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Compare</category>
+	<parameter varname="k1" type="__mmask8"/>
+	<parameter varname="a" type="__m128i"/>
+	<parameter varname="b" type="__m128i"/>
+	<description>Compare packed unsigned 64-bit integers in "a" and "b" for not-equal, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 1
+	i := j*64
+	IF k1[j]
+		k[j] := ( a[i+63:i] != b[i+63:i] ) ? 1 : 0
+	ELSE 
+		k[j] := 0
+	FI
+ENDFOR
+k[MAX:2] := 0
+	</operation>
+	<instruction name="vpcmpuq"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__mmask16" name="_mm256_cmp_epu16_mask">
+	<type>Integer</type>
+	<type>Mask</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512BW</CPUID>
+	<category>Compare</category>
+	<parameter varname="a" type="__m256i"/>
+	<parameter varname="b" type="__m256i"/>
+	<parameter varname="imm8" type="const int"/>
+	<description>Compare packed unsigned 16-bit integers in "a" and "b" based on the comparison operand specified by "imm8", and store the results in mask vector "k".</description>
+	<operation>
+CASE (imm8[7:0]) OF
+0: OP := _MM_CMPINT_EQ
+1: OP := _MM_CMPINT_LT
+2: OP := _MM_CMPINT_LE
+3: OP := _MM_CMPINT_FALSE
+4: OP := _MM_CMPINT_NE
+5: OP := _MM_CMPINT_NLT
+6: OP := _MM_CMPINT_NLE
+7: OP := _MM_CMPINT_TRUE
+ESAC
+FOR j := 0 to 15
+	i := j*16
+	k[j] := ( a[i+15:i] OP b[i+15:i] ) ? 1 : 0
+ENDFOR
+k[MAX:16] := 0
+	</operation>
+	<instruction name="vpcmpuw"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__mmask16" name="_mm256_cmpeq_epu16_mask">
+	<type>Integer</type>
+	<type>Mask</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512BW</CPUID>
+	<category>Compare</category>
+	<parameter varname="a" type="__m256i"/>
+	<parameter varname="b" type="__m256i"/>
+	<description>Compare packed unsigned 16-bit integers in "a" and "b" for equality, and store the results in mask vector "k".</description>
+	<operation>
+FOR j := 0 to 15
+	i := j*16
+	k[j] := ( a[i+15:i] == b[i+15:i] ) ? 1 : 0
+ENDFOR
+k[MAX:16] := 0
+	</operation>
+	<instruction name="vpcmpuw"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__mmask16" name="_mm256_cmpge_epu16_mask">
+	<type>Integer</type>
+	<type>Mask</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512BW</CPUID>
+	<category>Compare</category>
+	<parameter varname="a" type="__m256i"/>
+	<parameter varname="b" type="__m256i"/>
+	<description>Compare packed unsigned 16-bit integers in "a" and "b" for greater-than-or-equal, and store the results in mask vector "k".</description>
+	<operation>
+FOR j := 0 to 15
+	i := j*16
+	k[j] := ( a[i+15:i] &gt;= b[i+15:i] ) ? 1 : 0
+ENDFOR
+k[MAX:16] := 0
+	</operation>
+	<instruction name="vpcmpuw"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__mmask16" name="_mm256_cmpgt_epu16_mask">
+	<type>Integer</type>
+	<type>Mask</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512BW</CPUID>
+	<category>Compare</category>
+	<parameter varname="a" type="__m256i"/>
+	<parameter varname="b" type="__m256i"/>
+	<description>Compare packed unsigned 16-bit integers in "a" and "b" for greater-than, and store the results in mask vector "k".</description>
+	<operation>
+FOR j := 0 to 15
+	i := j*16
+	k[j] := ( a[i+15:i] &gt; b[i+15:i] ) ? 1 : 0
+ENDFOR
+k[MAX:16] := 0
+	</operation>
+	<instruction name="vpcmpuw"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__mmask16" name="_mm256_cmple_epu16_mask">
+	<type>Integer</type>
+	<type>Mask</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512BW</CPUID>
+	<category>Compare</category>
+	<parameter varname="a" type="__m256i"/>
+	<parameter varname="b" type="__m256i"/>
+	<description>Compare packed unsigned 16-bit integers in "a" and "b" for less-than-or-equal, and store the results in mask vector "k".</description>
+	<operation>
+FOR j := 0 to 15
+	i := j*16
+	k[j] := ( a[i+15:i] &lt;= b[i+15:i] ) ? 1 : 0
+ENDFOR
+k[MAX:16] := 0
+	</operation>
+	<instruction name="vpcmpuw"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__mmask16" name="_mm256_cmplt_epu16_mask">
+	<type>Integer</type>
+	<type>Mask</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512BW</CPUID>
+	<category>Compare</category>
+	<parameter varname="a" type="__m256i"/>
+	<parameter varname="b" type="__m256i"/>
+	<description>Compare packed unsigned 16-bit integers in "a" and "b" for less-than, and store the results in mask vector "k".</description>
+	<operation>
+FOR j := 0 to 15
+	i := j*16
+	k[j] := ( a[i+15:i] &lt; b[i+15:i] ) ? 1 : 0
+ENDFOR
+k[MAX:16] := 0
+	</operation>
+	<instruction name="vpcmpuw"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__mmask16" name="_mm256_cmpneq_epu16_mask">
+	<type>Integer</type>
+	<type>Mask</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512BW</CPUID>
+	<category>Compare</category>
+	<parameter varname="a" type="__m256i"/>
+	<parameter varname="b" type="__m256i"/>
+	<description>Compare packed unsigned 16-bit integers in "a" and "b" for not-equal, and store the results in mask vector "k".</description>
+	<operation>
+FOR j := 0 to 15
+	i := j*16
+	k[j] := ( a[i+15:i] != b[i+15:i] ) ? 1 : 0
+ENDFOR
+k[MAX:16] := 0
+	</operation>
+	<instruction name="vpcmpuw"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__mmask16" name="_mm256_mask_cmp_epu16_mask">
+	<type>Integer</type>
+	<type>Mask</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512BW</CPUID>
+	<category>Compare</category>
+	<parameter varname="k1" type="__mmask16"/>
+	<parameter varname="a" type="__m256i"/>
+	<parameter varname="b" type="__m256i"/>
+	<parameter varname="imm8" type="const int"/>
+	<description>Compare packed unsigned 16-bit integers in "a" and "b" based on the comparison operand specified by "imm8", and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set).</description>
+	<operation>
+CASE (imm8[7:0]) OF
+0: OP := _MM_CMPINT_EQ
+1: OP := _MM_CMPINT_LT
+2: OP := _MM_CMPINT_LE
+3: OP := _MM_CMPINT_FALSE
+4: OP := _MM_CMPINT_NE
+5: OP := _MM_CMPINT_NLT
+6: OP := _MM_CMPINT_NLE
+7: OP := _MM_CMPINT_TRUE
+ESAC
+FOR j := 0 to 15
+	i := j*16
+	IF k1[j]
+		k[j] := ( a[i+15:i] OP b[i+15:i] ) ? 1 : 0
+	ELSE 
+		k[j] := 0
+	FI
+ENDFOR
+k[MAX:16] := 0
+	</operation>
+	<instruction name="vpcmpuw"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__mmask16" name="_mm256_mask_cmpeq_epu16_mask">
+	<type>Integer</type>
+	<type>Mask</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512BW</CPUID>
+	<category>Compare</category>
+	<parameter varname="k1" type="__mmask16"/>
+	<parameter varname="a" type="__m256i"/>
+	<parameter varname="b" type="__m256i"/>
+	<description>Compare packed unsigned 16-bit integers in "a" and "b" for equality, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 15
+	i := j*16
+	IF k1[j]
+		k[j] := ( a[i+15:i] == b[i+15:i] ) ? 1 : 0
+	ELSE 
+		k[j] := 0
+	FI
+ENDFOR
+k[MAX:16] := 0
+	</operation>
+	<instruction name="vpcmpuw"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__mmask16" name="_mm256_mask_cmpge_epu16_mask">
+	<type>Integer</type>
+	<type>Mask</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512BW</CPUID>
+	<category>Compare</category>
+	<parameter varname="k1" type="__mmask16"/>
+	<parameter varname="a" type="__m256i"/>
+	<parameter varname="b" type="__m256i"/>
+	<description>Compare packed unsigned 16-bit integers in "a" and "b" for greater-than-or-equal, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 15
+	i := j*16
+	IF k1[j]
+		k[j] := ( a[i+15:i] &gt;= b[i+15:i] ) ? 1 : 0
+	ELSE 
+		k[j] := 0
+	FI
+ENDFOR
+k[MAX:16] := 0
+	</operation>
+	<instruction name="vpcmpuw"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__mmask16" name="_mm256_mask_cmpgt_epu16_mask">
+	<type>Integer</type>
+	<type>Mask</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512BW</CPUID>
+	<category>Compare</category>
+	<parameter varname="k1" type="__mmask16"/>
+	<parameter varname="a" type="__m256i"/>
+	<parameter varname="b" type="__m256i"/>
+	<description>Compare packed unsigned 16-bit integers in "a" and "b" for greater-than, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 15
+	i := j*16
+	IF k1[j]
+		k[j] := ( a[i+15:i] &gt; b[i+15:i] ) ? 1 : 0
+	ELSE 
+		k[j] := 0
+	FI
+ENDFOR
+k[MAX:16] := 0
+	</operation>
+	<instruction name="vpcmpuw"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__mmask16" name="_mm256_mask_cmple_epu16_mask">
+	<type>Integer</type>
+	<type>Mask</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512BW</CPUID>
+	<category>Compare</category>
+	<parameter varname="k1" type="__mmask16"/>
+	<parameter varname="a" type="__m256i"/>
+	<parameter varname="b" type="__m256i"/>
+	<description>Compare packed unsigned 16-bit integers in "a" and "b" for less-than-or-equal, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 15
+	i := j*16
+	IF k1[j]
+		k[j] := ( a[i+15:i] &lt;= b[i+15:i] ) ? 1 : 0
+	ELSE 
+		k[j] := 0
+	FI
+ENDFOR
+k[MAX:16] := 0
+	</operation>
+	<instruction name="vpcmpuw"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__mmask16" name="_mm256_mask_cmplt_epu16_mask">
+	<type>Integer</type>
+	<type>Mask</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512BW</CPUID>
+	<category>Compare</category>
+	<parameter varname="k1" type="__mmask16"/>
+	<parameter varname="a" type="__m256i"/>
+	<parameter varname="b" type="__m256i"/>
+	<description>Compare packed unsigned 16-bit integers in "a" and "b" for less-than, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 15
+	i := j*16
+	IF k1[j]
+		k[j] := ( a[i+15:i] &lt; b[i+15:i] ) ? 1 : 0
+	ELSE 
+		k[j] := 0
+	FI
+ENDFOR
+k[MAX:16] := 0
+	</operation>
+	<instruction name="vpcmpuw"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__mmask16" name="_mm256_mask_cmpneq_epu16_mask">
+	<type>Integer</type>
+	<type>Mask</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512BW</CPUID>
+	<category>Compare</category>
+	<parameter varname="k1" type="__mmask16"/>
+	<parameter varname="a" type="__m256i"/>
+	<parameter varname="b" type="__m256i"/>
+	<description>Compare packed unsigned 16-bit integers in "a" and "b" for not-equal, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 15
+	i := j*16
+	IF k1[j]
+		k[j] := ( a[i+15:i] != b[i+15:i] ) ? 1 : 0
+	ELSE 
+		k[j] := 0
+	FI
+ENDFOR
+k[MAX:16] := 0
+	</operation>
+	<instruction name="vpcmpuw"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__mmask32" name="_mm512_cmp_epu16_mask">
+	<type>Integer</type>
+	<type>Mask</type>
+	<CPUID>AVX512BW</CPUID>
+	<category>Compare</category>
+	<parameter varname="a" type="__m512i"/>
+	<parameter varname="b" type="__m512i"/>
+	<parameter varname="imm8" type="const int"/>
+	<description>Compare packed unsigned 16-bit integers in "a" and "b" based on the comparison operand specified by "imm8", and store the results in mask vector "k".</description>
+	<operation>
+CASE (imm8[7:0]) OF
+0: OP := _MM_CMPINT_EQ
+1: OP := _MM_CMPINT_LT
+2: OP := _MM_CMPINT_LE
+3: OP := _MM_CMPINT_FALSE
+4: OP := _MM_CMPINT_NE
+5: OP := _MM_CMPINT_NLT
+6: OP := _MM_CMPINT_NLE
+7: OP := _MM_CMPINT_TRUE
+ESAC
+FOR j := 0 to 31
+	i := j*16
+	k[j] := ( a[i+15:i] OP b[i+15:i] ) ? 1 : 0
+ENDFOR
+k[MAX:32] := 0
+	</operation>
+	<instruction name="vpcmpuw"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__mmask32" name="_mm512_cmpeq_epu16_mask">
+	<type>Integer</type>
+	<type>Mask</type>
+	<CPUID>AVX512BW</CPUID>
+	<category>Compare</category>
+	<parameter varname="a" type="__m512i"/>
+	<parameter varname="b" type="__m512i"/>
+	<description>Compare packed unsigned 16-bit integers in "a" and "b" for equality, and store the results in mask vector "k".</description>
+	<operation>
+FOR j := 0 to 31
+	i := j*16
+	k[j] := ( a[i+15:i] == b[i+15:i] ) ? 1 : 0
+ENDFOR
+k[MAX:32] := 0
+	</operation>
+	<instruction name="vpcmpuw"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__mmask32" name="_mm512_cmpge_epu16_mask">
+	<type>Integer</type>
+	<type>Mask</type>
+	<CPUID>AVX512BW</CPUID>
+	<category>Compare</category>
+	<parameter varname="a" type="__m512i"/>
+	<parameter varname="b" type="__m512i"/>
+	<description>Compare packed unsigned 16-bit integers in "a" and "b" for greater-than-or-equal, and store the results in mask vector "k".</description>
+	<operation>
+FOR j := 0 to 31
+	i := j*16
+	k[j] := ( a[i+15:i] &gt;= b[i+15:i] ) ? 1 : 0
+ENDFOR
+k[MAX:32] := 0
+	</operation>
+	<instruction name="vpcmpuw"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__mmask32" name="_mm512_cmpgt_epu16_mask">
+	<type>Integer</type>
+	<type>Mask</type>
+	<CPUID>AVX512BW</CPUID>
+	<category>Compare</category>
+	<parameter varname="a" type="__m512i"/>
+	<parameter varname="b" type="__m512i"/>
+	<description>Compare packed unsigned 16-bit integers in "a" and "b" for greater-than, and store the results in mask vector "k".</description>
+	<operation>
+FOR j := 0 to 31
+	i := j*16
+	k[j] := ( a[i+15:i] &gt; b[i+15:i] ) ? 1 : 0
+ENDFOR
+k[MAX:32] := 0
+	</operation>
+	<instruction name="vpcmpuw"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__mmask32" name="_mm512_cmple_epu16_mask">
+	<type>Integer</type>
+	<type>Mask</type>
+	<CPUID>AVX512BW</CPUID>
+	<category>Compare</category>
+	<parameter varname="a" type="__m512i"/>
+	<parameter varname="b" type="__m512i"/>
+	<description>Compare packed unsigned 16-bit integers in "a" and "b" for less-than-or-equal, and store the results in mask vector "k".</description>
+	<operation>
+FOR j := 0 to 31
+	i := j*16
+	k[j] := ( a[i+15:i] &lt;= b[i+15:i] ) ? 1 : 0
+ENDFOR
+k[MAX:32] := 0
+	</operation>
+	<instruction name="vpcmpuw"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__mmask32" name="_mm512_cmplt_epu16_mask">
+	<type>Integer</type>
+	<type>Mask</type>
+	<CPUID>AVX512BW</CPUID>
+	<category>Compare</category>
+	<parameter varname="a" type="__m512i"/>
+	<parameter varname="b" type="__m512i"/>
+	<description>Compare packed unsigned 16-bit integers in "a" and "b" for less-than, and store the results in mask vector "k".</description>
+	<operation>
+FOR j := 0 to 31
+	i := j*16
+	k[j] := ( a[i+15:i] &lt; b[i+15:i] ) ? 1 : 0
+ENDFOR
+k[MAX:32] := 0
+	</operation>
+	<instruction name="vpcmpuw"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__mmask32" name="_mm512_cmpneq_epu16_mask">
+	<type>Integer</type>
+	<type>Mask</type>
+	<CPUID>AVX512BW</CPUID>
+	<category>Compare</category>
+	<parameter varname="a" type="__m512i"/>
+	<parameter varname="b" type="__m512i"/>
+	<description>Compare packed unsigned 16-bit integers in "a" and "b" for not-equal, and store the results in mask vector "k".</description>
+	<operation>
+FOR j := 0 to 31
+	i := j*16
+	k[j] := ( a[i+15:i] != b[i+15:i] ) ? 1 : 0
+ENDFOR
+k[MAX:32] := 0
+	</operation>
+	<instruction name="vpcmpuw"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__mmask32" name="_mm512_mask_cmp_epu16_mask">
+	<type>Integer</type>
+	<type>Mask</type>
+	<CPUID>AVX512BW</CPUID>
+	<category>Compare</category>
+	<parameter varname="k1" type="__mmask32"/>
+	<parameter varname="a" type="__m512i"/>
+	<parameter varname="b" type="__m512i"/>
+	<parameter varname="imm8" type="const int"/>
+	<description>Compare packed unsigned 16-bit integers in "a" and "b" based on the comparison operand specified by "imm8", and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set).</description>
+	<operation>
+CASE (imm8[7:0]) OF
+0: OP := _MM_CMPINT_EQ
+1: OP := _MM_CMPINT_LT
+2: OP := _MM_CMPINT_LE
+3: OP := _MM_CMPINT_FALSE
+4: OP := _MM_CMPINT_NE
+5: OP := _MM_CMPINT_NLT
+6: OP := _MM_CMPINT_NLE
+7: OP := _MM_CMPINT_TRUE
+ESAC
+FOR j := 0 to 31
+	i := j*16
+	IF k1[j]
+		k[j] := ( a[i+15:i] OP b[i+15:i] ) ? 1 : 0
+	ELSE 
+		k[j] := 0
+	FI
+ENDFOR
+k[MAX:32] := 0
+	</operation>
+	<instruction name="vpcmpuw"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__mmask32" name="_mm512_mask_cmpeq_epu16_mask">
+	<type>Integer</type>
+	<type>Mask</type>
+	<CPUID>AVX512BW</CPUID>
+	<category>Compare</category>
+	<parameter varname="k1" type="__mmask32"/>
+	<parameter varname="a" type="__m512i"/>
+	<parameter varname="b" type="__m512i"/>
+	<description>Compare packed unsigned 16-bit integers in "a" and "b" for equality, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 31
+	i := j*16
+	IF k1[j]
+		k[j] := ( a[i+15:i] == b[i+15:i] ) ? 1 : 0
+	ELSE 
+		k[j] := 0
+	FI
+ENDFOR
+k[MAX:32] := 0
+	</operation>
+	<instruction name="vpcmpuw"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__mmask32" name="_mm512_mask_cmpge_epu16_mask">
+	<type>Integer</type>
+	<type>Mask</type>
+	<CPUID>AVX512BW</CPUID>
+	<category>Compare</category>
+	<parameter varname="k1" type="__mmask32"/>
+	<parameter varname="a" type="__m512i"/>
+	<parameter varname="b" type="__m512i"/>
+	<description>Compare packed unsigned 16-bit integers in "a" and "b" for greater-than-or-equal, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 31
+	i := j*16
+	IF k1[j]
+		k[j] := ( a[i+15:i] &gt;= b[i+15:i] ) ? 1 : 0
+	ELSE 
+		k[j] := 0
+	FI
+ENDFOR
+k[MAX:32] := 0
+	</operation>
+	<instruction name="vpcmpuw"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__mmask32" name="_mm512_mask_cmpgt_epu16_mask">
+	<type>Integer</type>
+	<type>Mask</type>
+	<CPUID>AVX512BW</CPUID>
+	<category>Compare</category>
+	<parameter varname="k1" type="__mmask32"/>
+	<parameter varname="a" type="__m512i"/>
+	<parameter varname="b" type="__m512i"/>
+	<description>Compare packed unsigned 16-bit integers in "a" and "b" for greater-than, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 31
+	i := j*16
+	IF k1[j]
+		k[j] := ( a[i+15:i] &gt; b[i+15:i] ) ? 1 : 0
+	ELSE 
+		k[j] := 0
+	FI
+ENDFOR
+k[MAX:32] := 0
+	</operation>
+	<instruction name="vpcmpuw"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__mmask32" name="_mm512_mask_cmple_epu16_mask">
+	<type>Integer</type>
+	<type>Mask</type>
+	<CPUID>AVX512BW</CPUID>
+	<category>Compare</category>
+	<parameter varname="k1" type="__mmask32"/>
+	<parameter varname="a" type="__m512i"/>
+	<parameter varname="b" type="__m512i"/>
+	<description>Compare packed unsigned 16-bit integers in "a" and "b" for less-than-or-equal, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 31
+	i := j*16
+	IF k1[j]
+		k[j] := ( a[i+15:i] &lt;= b[i+15:i] ) ? 1 : 0
+	ELSE 
+		k[j] := 0
+	FI
+ENDFOR
+k[MAX:32] := 0
+	</operation>
+	<instruction name="vpcmpuw"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__mmask32" name="_mm512_mask_cmplt_epu16_mask">
+	<type>Integer</type>
+	<type>Mask</type>
+	<CPUID>AVX512BW</CPUID>
+	<category>Compare</category>
+	<parameter varname="k1" type="__mmask32"/>
+	<parameter varname="a" type="__m512i"/>
+	<parameter varname="b" type="__m512i"/>
+	<description>Compare packed unsigned 16-bit integers in "a" and "b" for less-than, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 31
+	i := j*16
+	IF k1[j]
+		k[j] := ( a[i+15:i] &lt; b[i+15:i] ) ? 1 : 0
+	ELSE 
+		k[j] := 0
+	FI
+ENDFOR
+k[MAX:32] := 0
+	</operation>
+	<instruction name="vpcmpuw"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__mmask32" name="_mm512_mask_cmpneq_epu16_mask">
+	<type>Integer</type>
+	<type>Mask</type>
+	<CPUID>AVX512BW</CPUID>
+	<category>Compare</category>
+	<parameter varname="k1" type="__mmask32"/>
+	<parameter varname="a" type="__m512i"/>
+	<parameter varname="b" type="__m512i"/>
+	<description>Compare packed unsigned 16-bit integers in "a" and "b" for not-equal, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 31
+	i := j*16
+	IF k1[j]
+		k[j] := ( a[i+15:i] != b[i+15:i] ) ? 1 : 0
+	ELSE 
+		k[j] := 0
+	FI
+ENDFOR
+k[MAX:32] := 0
+	</operation>
+	<instruction name="vpcmpuw"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__mmask8" name="_mm_cmp_epu16_mask">
+	<type>Integer</type>
+	<type>Mask</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512BW</CPUID>
+	<category>Compare</category>
+	<parameter varname="a" type="__m128i"/>
+	<parameter varname="b" type="__m128i"/>
+	<parameter varname="imm8" type="const int"/>
+	<description>Compare packed unsigned 16-bit integers in "a" and "b" based on the comparison operand specified by "imm8", and store the results in mask vector "k".</description>
+	<operation>
+CASE (imm8[7:0]) OF
+0: OP := _MM_CMPINT_EQ
+1: OP := _MM_CMPINT_LT
+2: OP := _MM_CMPINT_LE
+3: OP := _MM_CMPINT_FALSE
+4: OP := _MM_CMPINT_NE
+5: OP := _MM_CMPINT_NLT
+6: OP := _MM_CMPINT_NLE
+7: OP := _MM_CMPINT_TRUE
+ESAC
+FOR j := 0 to 7
+	i := j*16
+	k[j] := ( a[i+15:i] OP b[i+15:i] ) ? 1 : 0
+ENDFOR
+k[MAX:8] := 0
+	</operation>
+	<instruction name="vpcmpuw"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__mmask8" name="_mm_cmpeq_epu16_mask">
+	<type>Integer</type>
+	<type>Mask</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512BW</CPUID>
+	<category>Compare</category>
+	<parameter varname="a" type="__m128i"/>
+	<parameter varname="b" type="__m128i"/>
+	<description>Compare packed unsigned 16-bit integers in "a" and "b" for equality, and store the results in mask vector "k".</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*16
+	k[j] := ( a[i+15:i] == b[i+15:i] ) ? 1 : 0
+ENDFOR
+k[MAX:8] := 0
+	</operation>
+	<instruction name="vpcmpuw"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__mmask8" name="_mm_cmpge_epu16_mask">
+	<type>Integer</type>
+	<type>Mask</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512BW</CPUID>
+	<category>Compare</category>
+	<parameter varname="a" type="__m128i"/>
+	<parameter varname="b" type="__m128i"/>
+	<description>Compare packed unsigned 16-bit integers in "a" and "b" for greater-than-or-equal, and store the results in mask vector "k".</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*16
+	k[j] := ( a[i+15:i] &gt;= b[i+15:i] ) ? 1 : 0
+ENDFOR
+k[MAX:8] := 0
+	</operation>
+	<instruction name="vpcmpuw"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__mmask8" name="_mm_cmpgt_epu16_mask">
+	<type>Integer</type>
+	<type>Mask</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512BW</CPUID>
+	<category>Compare</category>
+	<parameter varname="a" type="__m128i"/>
+	<parameter varname="b" type="__m128i"/>
+	<description>Compare packed unsigned 16-bit integers in "a" and "b" for greater-than, and store the results in mask vector "k".</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*16
+	k[j] := ( a[i+15:i] &gt; b[i+15:i] ) ? 1 : 0
+ENDFOR
+k[MAX:8] := 0
+	</operation>
+	<instruction name="vpcmpuw"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__mmask8" name="_mm_cmple_epu16_mask">
+	<type>Integer</type>
+	<type>Mask</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512BW</CPUID>
+	<category>Compare</category>
+	<parameter varname="a" type="__m128i"/>
+	<parameter varname="b" type="__m128i"/>
+	<description>Compare packed unsigned 16-bit integers in "a" and "b" for less-than-or-equal, and store the results in mask vector "k".</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*16
+	k[j] := ( a[i+15:i] &lt;= b[i+15:i] ) ? 1 : 0
+ENDFOR
+k[MAX:8] := 0
+	</operation>
+	<instruction name="vpcmpuw"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__mmask8" name="_mm_cmplt_epu16_mask">
+	<type>Integer</type>
+	<type>Mask</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512BW</CPUID>
+	<category>Compare</category>
+	<parameter varname="a" type="__m128i"/>
+	<parameter varname="b" type="__m128i"/>
+	<description>Compare packed unsigned 16-bit integers in "a" and "b" for less-than, and store the results in mask vector "k".</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*16
+	k[j] := ( a[i+15:i] &lt; b[i+15:i] ) ? 1 : 0
+ENDFOR
+k[MAX:8] := 0
+	</operation>
+	<instruction name="vpcmpuw"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__mmask8" name="_mm_cmpneq_epu16_mask">
+	<type>Integer</type>
+	<type>Mask</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512BW</CPUID>
+	<category>Compare</category>
+	<parameter varname="a" type="__m128i"/>
+	<parameter varname="b" type="__m128i"/>
+	<description>Compare packed unsigned 16-bit integers in "a" and "b" for not-equal, and store the results in mask vector "k".</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*16
+	k[j] := ( a[i+15:i] != b[i+15:i] ) ? 1 : 0
+ENDFOR
+k[MAX:8] := 0
+	</operation>
+	<instruction name="vpcmpuw"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__mmask8" name="_mm_mask_cmp_epu16_mask">
+	<type>Integer</type>
+	<type>Mask</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512BW</CPUID>
+	<category>Compare</category>
+	<parameter varname="k1" type="__mmask8"/>
+	<parameter varname="a" type="__m128i"/>
+	<parameter varname="b" type="__m128i"/>
+	<parameter varname="imm8" type="const int"/>
+	<description>Compare packed unsigned 16-bit integers in "a" and "b" based on the comparison operand specified by "imm8", and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set).</description>
+	<operation>
+CASE (imm8[7:0]) OF
+0: OP := _MM_CMPINT_EQ
+1: OP := _MM_CMPINT_LT
+2: OP := _MM_CMPINT_LE
+3: OP := _MM_CMPINT_FALSE
+4: OP := _MM_CMPINT_NE
+5: OP := _MM_CMPINT_NLT
+6: OP := _MM_CMPINT_NLE
+7: OP := _MM_CMPINT_TRUE
+ESAC
+FOR j := 0 to 7
+	i := j*16
+	IF k1[j]
+		k[j] := ( a[i+15:i] OP b[i+15:i] ) ? 1 : 0
+	ELSE 
+		k[j] := 0
+	FI
+ENDFOR
+k[MAX:8] := 0
+	</operation>
+	<instruction name="vpcmpuw"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__mmask8" name="_mm_mask_cmpeq_epu16_mask">
+	<type>Integer</type>
+	<type>Mask</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512BW</CPUID>
+	<category>Compare</category>
+	<parameter varname="k1" type="__mmask8"/>
+	<parameter varname="a" type="__m128i"/>
+	<parameter varname="b" type="__m128i"/>
+	<description>Compare packed unsigned 16-bit integers in "a" and "b" for equality, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*16
+	IF k1[j]
+		k[j] := ( a[i+15:i] == b[i+15:i] ) ? 1 : 0
+	ELSE 
+		k[j] := 0
+	FI
+ENDFOR
+k[MAX:8] := 0
+	</operation>
+	<instruction name="vpcmpuw"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__mmask8" name="_mm_mask_cmpge_epu16_mask">
+	<type>Integer</type>
+	<type>Mask</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512BW</CPUID>
+	<category>Compare</category>
+	<parameter varname="k1" type="__mmask8"/>
+	<parameter varname="a" type="__m128i"/>
+	<parameter varname="b" type="__m128i"/>
+	<description>Compare packed unsigned 16-bit integers in "a" and "b" for greater-than-or-equal, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*16
+	IF k1[j]
+		k[j] := ( a[i+15:i] &gt;= b[i+15:i] ) ? 1 : 0
+	ELSE 
+		k[j] := 0
+	FI
+ENDFOR
+k[MAX:8] := 0
+	</operation>
+	<instruction name="vpcmpuw"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__mmask8" name="_mm_mask_cmpgt_epu16_mask">
+	<type>Integer</type>
+	<type>Mask</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512BW</CPUID>
+	<category>Compare</category>
+	<parameter varname="k1" type="__mmask8"/>
+	<parameter varname="a" type="__m128i"/>
+	<parameter varname="b" type="__m128i"/>
+	<description>Compare packed unsigned 16-bit integers in "a" and "b" for greater-than, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*16
+	IF k1[j]
+		k[j] := ( a[i+15:i] &gt; b[i+15:i] ) ? 1 : 0
+	ELSE 
+		k[j] := 0
+	FI
+ENDFOR
+k[MAX:8] := 0
+	</operation>
+	<instruction name="vpcmpuw"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__mmask8" name="_mm_mask_cmple_epu16_mask">
+	<type>Integer</type>
+	<type>Mask</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512BW</CPUID>
+	<category>Compare</category>
+	<parameter varname="k1" type="__mmask8"/>
+	<parameter varname="a" type="__m128i"/>
+	<parameter varname="b" type="__m128i"/>
+	<description>Compare packed unsigned 16-bit integers in "a" and "b" for less-than-or-equal, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*16
+	IF k1[j]
+		k[j] := ( a[i+15:i] &lt;= b[i+15:i] ) ? 1 : 0
+	ELSE 
+		k[j] := 0
+	FI
+ENDFOR
+k[MAX:8] := 0
+	</operation>
+	<instruction name="vpcmpuw"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__mmask8" name="_mm_mask_cmplt_epu16_mask">
+	<type>Integer</type>
+	<type>Mask</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512BW</CPUID>
+	<category>Compare</category>
+	<parameter varname="k1" type="__mmask8"/>
+	<parameter varname="a" type="__m128i"/>
+	<parameter varname="b" type="__m128i"/>
+	<description>Compare packed unsigned 16-bit integers in "a" and "b" for less-than, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*16
+	IF k1[j]
+		k[j] := ( a[i+15:i] &lt; b[i+15:i] ) ? 1 : 0
+	ELSE 
+		k[j] := 0
+	FI
+ENDFOR
+k[MAX:8] := 0
+	</operation>
+	<instruction name="vpcmpuw"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__mmask8" name="_mm_mask_cmpneq_epu16_mask">
+	<type>Integer</type>
+	<type>Mask</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512BW</CPUID>
+	<category>Compare</category>
+	<parameter varname="k1" type="__mmask8"/>
+	<parameter varname="a" type="__m128i"/>
+	<parameter varname="b" type="__m128i"/>
+	<description>Compare packed unsigned 16-bit integers in "a" and "b" for not-equal, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*16
+	IF k1[j]
+		k[j] := ( a[i+15:i] != b[i+15:i] ) ? 1 : 0
+	ELSE 
+		k[j] := 0
+	FI
+ENDFOR
+k[MAX:8] := 0
+	</operation>
+	<instruction name="vpcmpuw"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__mmask16" name="_mm256_cmp_epi16_mask">
+	<type>Integer</type>
+	<type>Mask</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512BW</CPUID>
+	<category>Compare</category>
+	<parameter varname="a" type="__m256i"/>
+	<parameter varname="b" type="__m256i"/>
+	<parameter varname="imm8" type="const int"/>
+	<description>Compare packed 16-bit integers in "a" and "b" based on the comparison operand specified by "imm8", and store the results in mask vector "k".</description>
+	<operation>
+CASE (imm8[7:0]) OF
+0: OP := _MM_CMPINT_EQ
+1: OP := _MM_CMPINT_LT
+2: OP := _MM_CMPINT_LE
+3: OP := _MM_CMPINT_FALSE
+4: OP := _MM_CMPINT_NE
+5: OP := _MM_CMPINT_NLT
+6: OP := _MM_CMPINT_NLE
+7: OP := _MM_CMPINT_TRUE
+ESAC
+FOR j := 0 to 15
+	i := j*16
+	k[j] := ( a[i+15:i] OP b[i+15:i] ) ? 1 : 0
+ENDFOR
+k[MAX:16] := 0
+	</operation>
+	<instruction name="vpcmpw"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__mmask16" name="_mm256_cmpeq_epi16_mask">
+	<type>Integer</type>
+	<type>Mask</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512BW</CPUID>
+	<category>Compare</category>
+	<parameter varname="a" type="__m256i"/>
+	<parameter varname="b" type="__m256i"/>
+	<description>Compare packed 16-bit integers in "a" and "b" for equality, and store the results in mask vector "k".</description>
+	<operation>
+FOR j := 0 to 15
+	i := j*16
+	k[j] := ( a[i+15:i] == b[i+15:i] ) ? 1 : 0
+ENDFOR
+k[MAX:16] := 0
+	</operation>
+	<instruction name="vpcmpw"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__mmask16" name="_mm256_cmpge_epi16_mask">
+	<type>Integer</type>
+	<type>Mask</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512BW</CPUID>
+	<category>Compare</category>
+	<parameter varname="a" type="__m256i"/>
+	<parameter varname="b" type="__m256i"/>
+	<description>Compare packed 16-bit integers in "a" and "b" for greater-than-or-equal, and store the results in mask vector "k".</description>
+	<operation>
+FOR j := 0 to 15
+	i := j*16
+	k[j] := ( a[i+15:i] &gt;= b[i+15:i] ) ? 1 : 0
+ENDFOR
+k[MAX:16] := 0
+	</operation>
+	<instruction name="vpcmpw"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__mmask16" name="_mm256_cmpgt_epi16_mask">
+	<type>Integer</type>
+	<type>Mask</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512BW</CPUID>
+	<category>Compare</category>
+	<parameter varname="a" type="__m256i"/>
+	<parameter varname="b" type="__m256i"/>
+	<description>Compare packed 16-bit integers in "a" and "b" for greater-than, and store the results in mask vector "k".</description>
+	<operation>
+FOR j := 0 to 15
+	i := j*16
+	k[j] := ( a[i+15:i] &gt; b[i+15:i] ) ? 1 : 0
+ENDFOR
+k[MAX:16] := 0
+	</operation>
+	<instruction name="vpcmpw"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__mmask16" name="_mm256_cmple_epi16_mask">
+	<type>Integer</type>
+	<type>Mask</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512BW</CPUID>
+	<category>Compare</category>
+	<parameter varname="a" type="__m256i"/>
+	<parameter varname="b" type="__m256i"/>
+	<description>Compare packed 16-bit integers in "a" and "b" for less-than-or-equal, and store the results in mask vector "k".</description>
+	<operation>
+FOR j := 0 to 15
+	i := j*16
+	k[j] := ( a[i+15:i] &lt;= b[i+15:i] ) ? 1 : 0
+ENDFOR
+k[MAX:16] := 0
+	</operation>
+	<instruction name="vpcmpw"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__mmask16" name="_mm256_cmplt_epi16_mask">
+	<type>Integer</type>
+	<type>Mask</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512BW</CPUID>
+	<category>Compare</category>
+	<parameter varname="a" type="__m256i"/>
+	<parameter varname="b" type="__m256i"/>
+	<description>Compare packed 16-bit integers in "a" and "b" for less-than, and store the results in mask vector "k".</description>
+	<operation>
+FOR j := 0 to 15
+	i := j*16
+	k[j] := ( a[i+15:i] &lt; b[i+15:i] ) ? 1 : 0
+ENDFOR
+k[MAX:16] := 0
+	</operation>
+	<instruction name="vpcmpw"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__mmask16" name="_mm256_cmpneq_epi16_mask">
+	<type>Integer</type>
+	<type>Mask</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512BW</CPUID>
+	<category>Compare</category>
+	<parameter varname="a" type="__m256i"/>
+	<parameter varname="b" type="__m256i"/>
+	<description>Compare packed 16-bit integers in "a" and "b" for not-equal, and store the results in mask vector "k".</description>
+	<operation>
+FOR j := 0 to 15
+	i := j*16
+	k[j] := ( a[i+15:i] != b[i+15:i] ) ? 1 : 0
+ENDFOR
+k[MAX:16] := 0
+	</operation>
+	<instruction name="vpcmpw"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__mmask16" name="_mm256_mask_cmp_epi16_mask">
+	<type>Integer</type>
+	<type>Mask</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512BW</CPUID>
+	<category>Compare</category>
+	<parameter varname="k1" type="__mmask16"/>
+	<parameter varname="a" type="__m256i"/>
+	<parameter varname="b" type="__m256i"/>
+	<parameter varname="imm8" type="const int"/>
+	<description>Compare packed 16-bit integers in "a" and "b" based on the comparison operand specified by "imm8", and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set).</description>
+	<operation>
+CASE (imm8[7:0]) OF
+0: OP := _MM_CMPINT_EQ
+1: OP := _MM_CMPINT_LT
+2: OP := _MM_CMPINT_LE
+3: OP := _MM_CMPINT_FALSE
+4: OP := _MM_CMPINT_NE
+5: OP := _MM_CMPINT_NLT
+6: OP := _MM_CMPINT_NLE
+7: OP := _MM_CMPINT_TRUE
+ESAC
+FOR j := 0 to 15
+	i := j*16
+	IF k1[j]
+		k[j] := ( a[i+15:i] OP b[i+15:i] ) ? 1 : 0
+	ELSE 
+		k[j] := 0
+	FI
+ENDFOR
+k[MAX:16] := 0
+	</operation>
+	<instruction name="vpcmpw"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__mmask16" name="_mm256_mask_cmpeq_epi16_mask">
+	<type>Integer</type>
+	<type>Mask</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512BW</CPUID>
+	<category>Compare</category>
+	<parameter varname="k1" type="__mmask16"/>
+	<parameter varname="a" type="__m256i"/>
+	<parameter varname="b" type="__m256i"/>
+	<description>Compare packed 16-bit integers in "a" and "b" for equality, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 15
+	i := j*16
+	IF k1[j]
+		k[j] := ( a[i+15:i] == b[i+15:i] ) ? 1 : 0
+	ELSE 
+		k[j] := 0
+	FI
+ENDFOR
+k[MAX:16] := 0
+	</operation>
+	<instruction name="vpcmpw"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__mmask16" name="_mm256_mask_cmpge_epi16_mask">
+	<type>Integer</type>
+	<type>Mask</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512BW</CPUID>
+	<category>Compare</category>
+	<parameter varname="k1" type="__mmask16"/>
+	<parameter varname="a" type="__m256i"/>
+	<parameter varname="b" type="__m256i"/>
+	<description>Compare packed 16-bit integers in "a" and "b" for greater-than-or-equal, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 15
+	i := j*16
+	IF k1[j]
+		k[j] := ( a[i+15:i] &gt;= b[i+15:i] ) ? 1 : 0
+	ELSE 
+		k[j] := 0
+	FI
+ENDFOR
+k[MAX:16] := 0
+	</operation>
+	<instruction name="vpcmpw"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__mmask16" name="_mm256_mask_cmpgt_epi16_mask">
+	<type>Integer</type>
+	<type>Mask</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512BW</CPUID>
+	<category>Compare</category>
+	<parameter varname="k1" type="__mmask16"/>
+	<parameter varname="a" type="__m256i"/>
+	<parameter varname="b" type="__m256i"/>
+	<description>Compare packed 16-bit integers in "a" and "b" for greater-than, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 15
+	i := j*16
+	IF k1[j]
+		k[j] := ( a[i+15:i] &gt; b[i+15:i] ) ? 1 : 0
+	ELSE 
+		k[j] := 0
+	FI
+ENDFOR
+k[MAX:16] := 0
+	</operation>
+	<instruction name="vpcmpw"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__mmask16" name="_mm256_mask_cmple_epi16_mask">
+	<type>Integer</type>
+	<type>Mask</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512BW</CPUID>
+	<category>Compare</category>
+	<parameter varname="k1" type="__mmask16"/>
+	<parameter varname="a" type="__m256i"/>
+	<parameter varname="b" type="__m256i"/>
+	<description>Compare packed 16-bit integers in "a" and "b" for less-than-or-equal, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 15
+	i := j*16
+	IF k1[j]
+		k[j] := ( a[i+15:i] &lt;= b[i+15:i] ) ? 1 : 0
+	ELSE 
+		k[j] := 0
+	FI
+ENDFOR
+k[MAX:16] := 0
+	</operation>
+	<instruction name="vpcmpw"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__mmask16" name="_mm256_mask_cmplt_epi16_mask">
+	<type>Integer</type>
+	<type>Mask</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512BW</CPUID>
+	<category>Compare</category>
+	<parameter varname="k1" type="__mmask16"/>
+	<parameter varname="a" type="__m256i"/>
+	<parameter varname="b" type="__m256i"/>
+	<description>Compare packed 16-bit integers in "a" and "b" for less-than, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 15
+	i := j*16
+	IF k1[j]
+		k[j] := ( a[i+15:i] &lt; b[i+15:i] ) ? 1 : 0
+	ELSE 
+		k[j] := 0
+	FI
+ENDFOR
+k[MAX:16] := 0
+	</operation>
+	<instruction name="vpcmpw"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__mmask16" name="_mm256_mask_cmpneq_epi16_mask">
+	<type>Integer</type>
+	<type>Mask</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512BW</CPUID>
+	<category>Compare</category>
+	<parameter varname="k1" type="__mmask16"/>
+	<parameter varname="a" type="__m256i"/>
+	<parameter varname="b" type="__m256i"/>
+	<description>Compare packed 16-bit integers in "a" and "b" for not-equal, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 15
+	i := j*16
+	IF k1[j]
+		k[j] := ( a[i+15:i] != b[i+15:i] ) ? 1 : 0
+	ELSE 
+		k[j] := 0
+	FI
+ENDFOR
+k[MAX:16] := 0
+	</operation>
+	<instruction name="vpcmpw"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__mmask32" name="_mm512_cmp_epi16_mask">
+	<type>Integer</type>
+	<type>Mask</type>
+	<CPUID>AVX512BW</CPUID>
+	<category>Compare</category>
+	<parameter varname="a" type="__m512i"/>
+	<parameter varname="b" type="__m512i"/>
+	<parameter varname="imm8" type="const int"/>
+	<description>Compare packed 16-bit integers in "a" and "b" based on the comparison operand specified by "imm8", and store the results in mask vector "k".</description>
+	<operation>
+CASE (imm8[7:0]) OF
+0: OP := _MM_CMPINT_EQ
+1: OP := _MM_CMPINT_LT
+2: OP := _MM_CMPINT_LE
+3: OP := _MM_CMPINT_FALSE
+4: OP := _MM_CMPINT_NE
+5: OP := _MM_CMPINT_NLT
+6: OP := _MM_CMPINT_NLE
+7: OP := _MM_CMPINT_TRUE
+ESAC
+FOR j := 0 to 31
+	i := j*16
+	k[j] := ( a[i+15:i] OP b[i+15:i] ) ? 1 : 0
+ENDFOR
+k[MAX:32] := 0
+	</operation>
+	<instruction name="vpcmpw"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__mmask32" name="_mm512_cmpeq_epi16_mask">
+	<type>Integer</type>
+	<type>Mask</type>
+	<CPUID>AVX512BW</CPUID>
+	<category>Compare</category>
+	<parameter varname="a" type="__m512i"/>
+	<parameter varname="b" type="__m512i"/>
+	<description>Compare packed 16-bit integers in "a" and "b" for equality, and store the results in mask vector "k".</description>
+	<operation>
+FOR j := 0 to 31
+	i := j*16
+	k[j] := ( a[i+15:i] == b[i+15:i] ) ? 1 : 0
+ENDFOR
+k[MAX:32] := 0
+	</operation>
+	<instruction name="vpcmpw"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__mmask32" name="_mm512_cmpge_epi16_mask">
+	<type>Integer</type>
+	<type>Mask</type>
+	<CPUID>AVX512BW</CPUID>
+	<category>Compare</category>
+	<parameter varname="a" type="__m512i"/>
+	<parameter varname="b" type="__m512i"/>
+	<description>Compare packed 16-bit integers in "a" and "b" for greater-than-or-equal, and store the results in mask vector "k".</description>
+	<operation>
+FOR j := 0 to 31
+	i := j*16
+	k[j] := ( a[i+15:i] &gt;= b[i+15:i] ) ? 1 : 0
+ENDFOR
+k[MAX:32] := 0
+	</operation>
+	<instruction name="vpcmpw"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__mmask32" name="_mm512_cmpgt_epi16_mask">
+	<type>Integer</type>
+	<type>Mask</type>
+	<CPUID>AVX512BW</CPUID>
+	<category>Compare</category>
+	<parameter varname="a" type="__m512i"/>
+	<parameter varname="b" type="__m512i"/>
+	<description>Compare packed 16-bit integers in "a" and "b" for greater-than, and store the results in mask vector "k".</description>
+	<operation>
+FOR j := 0 to 31
+	i := j*16
+	k[j] := ( a[i+15:i] &gt; b[i+15:i] ) ? 1 : 0
+ENDFOR
+k[MAX:32] := 0
+	</operation>
+	<instruction name="vpcmpw"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__mmask32" name="_mm512_cmple_epi16_mask">
+	<type>Integer</type>
+	<type>Mask</type>
+	<CPUID>AVX512BW</CPUID>
+	<category>Compare</category>
+	<parameter varname="a" type="__m512i"/>
+	<parameter varname="b" type="__m512i"/>
+	<description>Compare packed 16-bit integers in "a" and "b" for less-than-or-equal, and store the results in mask vector "k".</description>
+	<operation>
+FOR j := 0 to 31
+	i := j*16
+	k[j] := ( a[i+15:i] &lt;= b[i+15:i] ) ? 1 : 0
+ENDFOR
+k[MAX:32] := 0
+	</operation>
+	<instruction name="vpcmpw"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__mmask32" name="_mm512_cmplt_epi16_mask">
+	<type>Integer</type>
+	<type>Mask</type>
+	<CPUID>AVX512BW</CPUID>
+	<category>Compare</category>
+	<parameter varname="a" type="__m512i"/>
+	<parameter varname="b" type="__m512i"/>
+	<description>Compare packed 16-bit integers in "a" and "b" for less-than, and store the results in mask vector "k".</description>
+	<operation>
+FOR j := 0 to 31
+	i := j*16
+	k[j] := ( a[i+15:i] &lt; b[i+15:i] ) ? 1 : 0
+ENDFOR
+k[MAX:32] := 0
+	</operation>
+	<instruction name="vpcmpw"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__mmask32" name="_mm512_cmpneq_epi16_mask">
+	<type>Integer</type>
+	<type>Mask</type>
+	<CPUID>AVX512BW</CPUID>
+	<category>Compare</category>
+	<parameter varname="a" type="__m512i"/>
+	<parameter varname="b" type="__m512i"/>
+	<description>Compare packed 16-bit integers in "a" and "b" for not-equal, and store the results in mask vector "k".</description>
+	<operation>
+FOR j := 0 to 31
+	i := j*16
+	k[j] := ( a[i+15:i] != b[i+15:i] ) ? 1 : 0
+ENDFOR
+k[MAX:32] := 0
+	</operation>
+	<instruction name="vpcmpw"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__mmask32" name="_mm512_mask_cmp_epi16_mask">
+	<type>Integer</type>
+	<type>Mask</type>
+	<CPUID>AVX512BW</CPUID>
+	<category>Compare</category>
+	<parameter varname="k1" type="__mmask32"/>
+	<parameter varname="a" type="__m512i"/>
+	<parameter varname="b" type="__m512i"/>
+	<parameter varname="imm8" type="const int"/>
+	<description>Compare packed 16-bit integers in "a" and "b" based on the comparison operand specified by "imm8", and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set).</description>
+	<operation>
+CASE (imm8[7:0]) OF
+0: OP := _MM_CMPINT_EQ
+1: OP := _MM_CMPINT_LT
+2: OP := _MM_CMPINT_LE
+3: OP := _MM_CMPINT_FALSE
+4: OP := _MM_CMPINT_NE
+5: OP := _MM_CMPINT_NLT
+6: OP := _MM_CMPINT_NLE
+7: OP := _MM_CMPINT_TRUE
+ESAC
+FOR j := 0 to 31
+	i := j*16
+	IF k1[j]
+		k[j] := ( a[i+15:i] OP b[i+15:i] ) ? 1 : 0
+	ELSE 
+		k[j] := 0
+	FI
+ENDFOR
+k[MAX:32] := 0
+	</operation>
+	<instruction name="vpcmpw"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__mmask32" name="_mm512_mask_cmpeq_epi16_mask">
+	<type>Integer</type>
+	<type>Mask</type>
+	<CPUID>AVX512BW</CPUID>
+	<category>Compare</category>
+	<parameter varname="k1" type="__mmask32"/>
+	<parameter varname="a" type="__m512i"/>
+	<parameter varname="b" type="__m512i"/>
+	<description>Compare packed 16-bit integers in "a" and "b" for equality, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 31
+	i := j*16
+	IF k1[j]
+		k[j] := ( a[i+15:i] == b[i+15:i] ) ? 1 : 0
+	ELSE 
+		k[j] := 0
+	FI
+ENDFOR
+k[MAX:32] := 0
+	</operation>
+	<instruction name="vpcmpw"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__mmask32" name="_mm512_mask_cmpge_epi16_mask">
+	<type>Integer</type>
+	<type>Mask</type>
+	<CPUID>AVX512BW</CPUID>
+	<category>Compare</category>
+	<parameter varname="k1" type="__mmask32"/>
+	<parameter varname="a" type="__m512i"/>
+	<parameter varname="b" type="__m512i"/>
+	<description>Compare packed 16-bit integers in "a" and "b" for greater-than-or-equal, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 31
+	i := j*16
+	IF k1[j]
+		k[j] := ( a[i+15:i] &gt;= b[i+15:i] ) ? 1 : 0
+	ELSE 
+		k[j] := 0
+	FI
+ENDFOR
+k[MAX:32] := 0
+	</operation>
+	<instruction name="vpcmpw"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__mmask32" name="_mm512_mask_cmpgt_epi16_mask">
+	<type>Integer</type>
+	<type>Mask</type>
+	<CPUID>AVX512BW</CPUID>
+	<category>Compare</category>
+	<parameter varname="k1" type="__mmask32"/>
+	<parameter varname="a" type="__m512i"/>
+	<parameter varname="b" type="__m512i"/>
+	<description>Compare packed 16-bit integers in "a" and "b" for greater-than, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 31
+	i := j*16
+	IF k1[j]
+		k[j] := ( a[i+15:i] &gt; b[i+15:i] ) ? 1 : 0
+	ELSE 
+		k[j] := 0
+	FI
+ENDFOR
+k[MAX:32] := 0
+	</operation>
+	<instruction name="vpcmpw"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__mmask32" name="_mm512_mask_cmple_epi16_mask">
+	<type>Integer</type>
+	<type>Mask</type>
+	<CPUID>AVX512BW</CPUID>
+	<category>Compare</category>
+	<parameter varname="k1" type="__mmask32"/>
+	<parameter varname="a" type="__m512i"/>
+	<parameter varname="b" type="__m512i"/>
+	<description>Compare packed 16-bit integers in "a" and "b" for less-than-or-equal, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 31
+	i := j*16
+	IF k1[j]
+		k[j] := ( a[i+15:i] &lt;= b[i+15:i] ) ? 1 : 0
+	ELSE 
+		k[j] := 0
+	FI
+ENDFOR
+k[MAX:32] := 0
+	</operation>
+	<instruction name="vpcmpw"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__mmask32" name="_mm512_mask_cmplt_epi16_mask">
+	<type>Integer</type>
+	<type>Mask</type>
+	<CPUID>AVX512BW</CPUID>
+	<category>Compare</category>
+	<parameter varname="k1" type="__mmask32"/>
+	<parameter varname="a" type="__m512i"/>
+	<parameter varname="b" type="__m512i"/>
+	<description>Compare packed 16-bit integers in "a" and "b" for less-than, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 31
+	i := j*16
+	IF k1[j]
+		k[j] := ( a[i+15:i] &lt; b[i+15:i] ) ? 1 : 0
+	ELSE 
+		k[j] := 0
+	FI
+ENDFOR
+k[MAX:32] := 0
+	</operation>
+	<instruction name="vpcmpw"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__mmask32" name="_mm512_mask_cmpneq_epi16_mask">
+	<type>Integer</type>
+	<type>Mask</type>
+	<CPUID>AVX512BW</CPUID>
+	<category>Compare</category>
+	<parameter varname="k1" type="__mmask32"/>
+	<parameter varname="a" type="__m512i"/>
+	<parameter varname="b" type="__m512i"/>
+	<description>Compare packed 16-bit integers in "a" and "b" for not-equal, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 31
+	i := j*16
+	IF k1[j]
+		k[j] := ( a[i+15:i] != b[i+15:i] ) ? 1 : 0
+	ELSE 
+		k[j] := 0
+	FI
+ENDFOR
+k[MAX:32] := 0
+	</operation>
+	<instruction name="vpcmpw"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__mmask8" name="_mm_cmp_epi16_mask">
+	<type>Integer</type>
+	<type>Mask</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512BW</CPUID>
+	<category>Compare</category>
+	<parameter varname="a" type="__m128i"/>
+	<parameter varname="b" type="__m128i"/>
+	<parameter varname="imm8" type="const int"/>
+	<description>Compare packed 16-bit integers in "a" and "b" based on the comparison operand specified by "imm8", and store the results in mask vector "k".</description>
+	<operation>
+CASE (imm8[7:0]) OF
+0: OP := _MM_CMPINT_EQ
+1: OP := _MM_CMPINT_LT
+2: OP := _MM_CMPINT_LE
+3: OP := _MM_CMPINT_FALSE
+4: OP := _MM_CMPINT_NE
+5: OP := _MM_CMPINT_NLT
+6: OP := _MM_CMPINT_NLE
+7: OP := _MM_CMPINT_TRUE
+ESAC
+FOR j := 0 to 7
+	i := j*16
+	k[j] := ( a[i+15:i] OP b[i+15:i] ) ? 1 : 0
+ENDFOR
+k[MAX:8] := 0
+	</operation>
+	<instruction name="vpcmpw"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__mmask8" name="_mm_cmpeq_epi16_mask">
+	<type>Integer</type>
+	<type>Mask</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512BW</CPUID>
+	<category>Compare</category>
+	<parameter varname="a" type="__m128i"/>
+	<parameter varname="b" type="__m128i"/>
+	<description>Compare packed 16-bit integers in "a" and "b" for equality, and store the results in mask vector "k".</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*16
+	k[j] := ( a[i+15:i] == b[i+15:i] ) ? 1 : 0
+ENDFOR
+k[MAX:8] := 0
+	</operation>
+	<instruction name="vpcmpw"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__mmask8" name="_mm_cmpge_epi16_mask">
+	<type>Integer</type>
+	<type>Mask</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512BW</CPUID>
+	<category>Compare</category>
+	<parameter varname="a" type="__m128i"/>
+	<parameter varname="b" type="__m128i"/>
+	<description>Compare packed 16-bit integers in "a" and "b" for greater-than-or-equal, and store the results in mask vector "k".</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*16
+	k[j] := ( a[i+15:i] &gt;= b[i+15:i] ) ? 1 : 0
+ENDFOR
+k[MAX:8] := 0
+	</operation>
+	<instruction name="vpcmpw"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__mmask8" name="_mm_cmpgt_epi16_mask">
+	<type>Integer</type>
+	<type>Mask</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512BW</CPUID>
+	<category>Compare</category>
+	<parameter varname="a" type="__m128i"/>
+	<parameter varname="b" type="__m128i"/>
+	<description>Compare packed 16-bit integers in "a" and "b" for greater-than, and store the results in mask vector "k".</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*16
+	k[j] := ( a[i+15:i] &gt; b[i+15:i] ) ? 1 : 0
+ENDFOR
+k[MAX:8] := 0
+	</operation>
+	<instruction name="vpcmpw"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__mmask8" name="_mm_cmple_epi16_mask">
+	<type>Integer</type>
+	<type>Mask</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512BW</CPUID>
+	<category>Compare</category>
+	<parameter varname="a" type="__m128i"/>
+	<parameter varname="b" type="__m128i"/>
+	<description>Compare packed 16-bit integers in "a" and "b" for less-than-or-equal, and store the results in mask vector "k".</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*16
+	k[j] := ( a[i+15:i] &lt;= b[i+15:i] ) ? 1 : 0
+ENDFOR
+k[MAX:8] := 0
+	</operation>
+	<instruction name="vpcmpw"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__mmask8" name="_mm_cmplt_epi16_mask">
+	<type>Integer</type>
+	<type>Mask</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512BW</CPUID>
+	<category>Compare</category>
+	<parameter varname="a" type="__m128i"/>
+	<parameter varname="b" type="__m128i"/>
+	<description>Compare packed 16-bit integers in "a" and "b" for less-than, and store the results in mask vector "k".</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*16
+	k[j] := ( a[i+15:i] &lt; b[i+15:i] ) ? 1 : 0
+ENDFOR
+k[MAX:8] := 0
+	</operation>
+	<instruction name="vpcmpw"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__mmask8" name="_mm_cmpneq_epi16_mask">
+	<type>Integer</type>
+	<type>Mask</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512BW</CPUID>
+	<category>Compare</category>
+	<parameter varname="a" type="__m128i"/>
+	<parameter varname="b" type="__m128i"/>
+	<description>Compare packed 16-bit integers in "a" and "b" for not-equal, and store the results in mask vector "k".</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*16
+	k[j] := ( a[i+15:i] != b[i+15:i] ) ? 1 : 0
+ENDFOR
+k[MAX:8] := 0
+	</operation>
+	<instruction name="vpcmpw"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__mmask8" name="_mm_mask_cmp_epi16_mask">
+	<type>Integer</type>
+	<type>Mask</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512BW</CPUID>
+	<category>Compare</category>
+	<parameter varname="k1" type="__mmask8"/>
+	<parameter varname="a" type="__m128i"/>
+	<parameter varname="b" type="__m128i"/>
+	<parameter varname="imm8" type="const int"/>
+	<description>Compare packed 16-bit integers in "a" and "b" based on the comparison operand specified by "imm8", and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set).</description>
+	<operation>
+CASE (imm8[7:0]) OF
+0: OP := _MM_CMPINT_EQ
+1: OP := _MM_CMPINT_LT
+2: OP := _MM_CMPINT_LE
+3: OP := _MM_CMPINT_FALSE
+4: OP := _MM_CMPINT_NE
+5: OP := _MM_CMPINT_NLT
+6: OP := _MM_CMPINT_NLE
+7: OP := _MM_CMPINT_TRUE
+ESAC
+FOR j := 0 to 7
+	i := j*16
+	IF k1[j]
+		k[j] := ( a[i+15:i] OP b[i+15:i] ) ? 1 : 0
+	ELSE 
+		k[j] := 0
+	FI
+ENDFOR
+k[MAX:8] := 0
+	</operation>
+	<instruction name="vpcmpw"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__mmask8" name="_mm_mask_cmpeq_epi16_mask">
+	<type>Integer</type>
+	<type>Mask</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512BW</CPUID>
+	<category>Compare</category>
+	<parameter varname="k1" type="__mmask8"/>
+	<parameter varname="a" type="__m128i"/>
+	<parameter varname="b" type="__m128i"/>
+	<description>Compare packed 16-bit integers in "a" and "b" for equality, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*16
+	IF k1[j]
+		k[j] := ( a[i+15:i] == b[i+15:i] ) ? 1 : 0
+	ELSE 
+		k[j] := 0
+	FI
+ENDFOR
+k[MAX:8] := 0
+	</operation>
+	<instruction name="vpcmpw"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__mmask8" name="_mm_mask_cmpge_epi16_mask">
+	<type>Integer</type>
+	<type>Mask</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512BW</CPUID>
+	<category>Compare</category>
+	<parameter varname="k1" type="__mmask8"/>
+	<parameter varname="a" type="__m128i"/>
+	<parameter varname="b" type="__m128i"/>
+	<description>Compare packed 16-bit integers in "a" and "b" for greater-than-or-equal, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*16
+	IF k1[j]
+		k[j] := ( a[i+15:i] &gt;= b[i+15:i] ) ? 1 : 0
+	ELSE 
+		k[j] := 0
+	FI
+ENDFOR
+k[MAX:8] := 0
+	</operation>
+	<instruction name="vpcmpw"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__mmask8" name="_mm_mask_cmpgt_epi16_mask">
+	<type>Integer</type>
+	<type>Mask</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512BW</CPUID>
+	<category>Compare</category>
+	<parameter varname="k1" type="__mmask8"/>
+	<parameter varname="a" type="__m128i"/>
+	<parameter varname="b" type="__m128i"/>
+	<description>Compare packed 16-bit integers in "a" and "b" for greater-than, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*16
+	IF k1[j]
+		k[j] := ( a[i+15:i] &gt; b[i+15:i] ) ? 1 : 0
+	ELSE 
+		k[j] := 0
+	FI
+ENDFOR
+k[MAX:8] := 0
+	</operation>
+	<instruction name="vpcmpw"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__mmask8" name="_mm_mask_cmple_epi16_mask">
+	<type>Integer</type>
+	<type>Mask</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512BW</CPUID>
+	<category>Compare</category>
+	<parameter varname="k1" type="__mmask8"/>
+	<parameter varname="a" type="__m128i"/>
+	<parameter varname="b" type="__m128i"/>
+	<description>Compare packed 16-bit integers in "a" and "b" for less-than-or-equal, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*16
+	IF k1[j]
+		k[j] := ( a[i+15:i] &lt;= b[i+15:i] ) ? 1 : 0
+	ELSE 
+		k[j] := 0
+	FI
+ENDFOR
+k[MAX:8] := 0
+	</operation>
+	<instruction name="vpcmpw"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__mmask8" name="_mm_mask_cmplt_epi16_mask">
+	<type>Integer</type>
+	<type>Mask</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512BW</CPUID>
+	<category>Compare</category>
+	<parameter varname="k1" type="__mmask8"/>
+	<parameter varname="a" type="__m128i"/>
+	<parameter varname="b" type="__m128i"/>
+	<description>Compare packed 16-bit integers in "a" and "b" for less-than, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*16
+	IF k1[j]
+		k[j] := ( a[i+15:i] &lt; b[i+15:i] ) ? 1 : 0
+	ELSE 
+		k[j] := 0
+	FI
+ENDFOR
+k[MAX:8] := 0
+	</operation>
+	<instruction name="vpcmpw"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__mmask8" name="_mm_mask_cmpneq_epi16_mask">
+	<type>Integer</type>
+	<type>Mask</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512BW</CPUID>
+	<category>Compare</category>
+	<parameter varname="k1" type="__mmask8"/>
+	<parameter varname="a" type="__m128i"/>
+	<parameter varname="b" type="__m128i"/>
+	<description>Compare packed 16-bit integers in "a" and "b" for not-equal, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*16
+	IF k1[j]
+		k[j] := ( a[i+15:i] != b[i+15:i] ) ? 1 : 0
+	ELSE 
+		k[j] := 0
+	FI
+ENDFOR
+k[MAX:8] := 0
+	</operation>
+	<instruction name="vpcmpw"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256i" name="_mm256_mask_compress_epi32">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="src" type="__m256i"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m256i"/>
+	<description>Contiguously store the active 32-bit integers in "a" (those with their respective bit set in writemask "k") to "dst", and pass through the remaining elements from "src".</description>
+	<operation>
+size := 32
+m := 0
+FOR j := 0 to 7
+	i := j*32
+	IF k[j]
+		dst[m+size-1:m] := a[i+31:i]
+		m := m + size
+	FI
+ENDFOR
+dst[255:m] := src[255:m]
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vpcompressd"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="void" name="_mm256_mask_compressstoreu_epi32">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Store</category>
+	<parameter varname="base_addr" type="void*"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m256i"/>
+	<description>Contiguously store the active 32-bit integers in "a" (those with their respective bit set in writemask "k") to unaligned memory at "base_addr".</description>
+	<operation>
+size := 32
+m := base_addr
+FOR j := 0 to 7
+	i := j*32
+	IF k[j]
+		MEM[m+size-1:m] := a[i+31:i]
+		m := m + size
+	FI
+ENDFOR
+	</operation>
+	<instruction name="vpcompressd"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256i" name="_mm256_maskz_compress_epi32">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m256i"/>
+	<description>Contiguously store the active 32-bit integers in "a" (those with their respective bit set in zeromask "k") to "dst", and set the remaining elements to zero.</description>
+	<operation>
+size := 32
+m := 0
+FOR j := 0 to 7
+	i := j*32
+	IF k[j]
+		dst[m+size-1:m] := a[i+31:i]
+		m := m + size
+	FI
+ENDFOR
+dst[255:m] := 0
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vpcompressd"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm_mask_compress_epi32">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="src" type="__m128i"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128i"/>
+	<description>Contiguously store the active 32-bit integers in "a" (those with their respective bit set in writemask "k") to "dst", and pass through the remaining elements from "src".</description>
+	<operation>
+size := 32
+m := 0
+FOR j := 0 to 3
+	i := j*32
+	IF k[j]
+		dst[m+size-1:m] := a[i+31:i]
+		m := m + size
+	FI
+ENDFOR
+dst[127:m] := src[127:m]
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vpcompressd"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="void" name="_mm_mask_compressstoreu_epi32">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Store</category>
+	<parameter varname="base_addr" type="void*"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128i"/>
+	<description>Contiguously store the active 32-bit integers in "a" (those with their respective bit set in writemask "k") to unaligned memory at "base_addr".</description>
+	<operation>
+size := 32
+m := base_addr
+FOR j := 0 to 3
+	i := j*32
+	IF k[j]
+		MEM[m+size-1:m] := a[i+31:i]
+		m := m + size
+	FI
+ENDFOR
+	</operation>
+	<instruction name="vpcompressd"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm_maskz_compress_epi32">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128i"/>
+	<description>Contiguously store the active 32-bit integers in "a" (those with their respective bit set in zeromask "k") to "dst", and set the remaining elements to zero.</description>
+	<operation>
+size := 32
+m := 0
+FOR j := 0 to 3
+	i := j*32
+	IF k[j]
+		dst[m+size-1:m] := a[i+31:i]
+		m := m + size
+	FI
+ENDFOR
+dst[127:m] := 0
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vpcompressd"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256i" name="_mm256_mask_compress_epi64">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="src" type="__m256i"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m256i"/>
+	<description>Contiguously store the active 64-bit integers in "a" (those with their respective bit set in writemask "k") to "dst", and pass through the remaining elements from "src".</description>
+	<operation>
+size := 64
+m := 0
+FOR j := 0 to 3
+	i := j*64
+	IF k[j]
+		dst[m+size-1:m] := a[i+63:i]
+		m := m + size
+	FI
+ENDFOR
+dst[255:m] := src[255:m]
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vpcompressq"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="void" name="_mm256_mask_compressstoreu_epi64">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Store</category>
+	<parameter varname="base_addr" type="void*"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m256i"/>
+	<description>Contiguously store the active 64-bit integers in "a" (those with their respective bit set in writemask "k") to unaligned memory at "base_addr".</description>
+	<operation>
+size := 64
+m := base_addr
+FOR j := 0 to 3
+	i := j*64
+	IF k[j]
+		MEM[m+size-1:m] := a[i+63:i]
+		m := m + size
+	FI
+ENDFOR
+	</operation>
+	<instruction name="vpcompressq"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256i" name="_mm256_maskz_compress_epi64">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m256i"/>
+	<description>Contiguously store the active 64-bit integers in "a" (those with their respective bit set in zeromask "k") to "dst", and set the remaining elements to zero.</description>
+	<operation>
+size := 64
+m := 0
+FOR j := 0 to 3
+	i := j*64
+	IF k[j]
+		dst[m+size-1:m] := a[i+63:i]
+		m := m + size
+	FI
+ENDFOR
+dst[255:m] := 0
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vpcompressq"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm_mask_compress_epi64">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="src" type="__m128i"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128i"/>
+	<description>Contiguously store the active 64-bit integers in "a" (those with their respective bit set in writemask "k") to "dst", and pass through the remaining elements from "src".</description>
+	<operation>
+size := 64
+m := 0
+FOR j := 0 to 1
+	i := j*64
+	IF k[j]
+		dst[m+size-1:m] := a[i+63:i]
+		m := m + size
+	FI
+ENDFOR
+dst[127:m] := src[127:m]
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vpcompressq"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="void" name="_mm_mask_compressstoreu_epi64">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Store</category>
+	<parameter varname="base_addr" type="void*"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128i"/>
+	<description>Contiguously store the active 64-bit integers in "a" (those with their respective bit set in writemask "k") to unaligned memory at "base_addr".</description>
+	<operation>
+size := 64
+m := base_addr
+FOR j := 0 to 1
+	i := j*64
+	IF k[j]
+		MEM[m+size-1:m] := a[i+63:i]
+		m := m + size
+	FI
+ENDFOR
+	</operation>
+	<instruction name="vpcompressq"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm_maskz_compress_epi64">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128i"/>
+	<description>Contiguously store the active 64-bit integers in "a" (those with their respective bit set in zeromask "k") to "dst", and set the remaining elements to zero.</description>
+	<operation>
+size := 64
+m := 0
+FOR j := 0 to 1
+	i := j*64
+	IF k[j]
+		dst[m+size-1:m] := a[i+63:i]
+		m := m + size
+	FI
+ENDFOR
+dst[127:m] := 0
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vpcompressq"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256i" name="_mm256_conflict_epi32">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512CD</CPUID>
+	<category>Compare</category>
+	<parameter varname="a" type="__m256i"/>
+	<description>Test each 32-bit element of "a" for equality with all other elements in "a" closer to the least significant bit. Each element's comparison forms a zero extended bit vector in "dst".</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*32
+	FOR k := 0 to j-1
+		m := k*32
+		dst[i+k] := (a[i+31:i] == a[m+31:m]) ? 1 : 0
+	ENDFOR
+	dst[i+31:i+j] := 0
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vpconflictd"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256i" name="_mm256_mask_conflict_epi32">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512CD</CPUID>
+	<category>Compare</category>
+	<parameter varname="src" type="__m256i"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m256i"/>
+	<description>Test each 32-bit element of "a" for equality with all other elements in "a" closer to the least significant bit using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). Each element's comparison forms a zero extended bit vector in "dst".</description>	
+	<operation>
+FOR j := 0 to 7
+	i := j*32
+	IF k[j]
+		FOR l := 0 to j-1
+			m := l*32
+			dst[i+l] := (a[i+31:i] == a[m+31:m]) ? 1 : 0
+		ENDFOR
+		dst[i+31:i+j] := 0
+	ELSE
+		dst[i+31:i] := src[i+31:i]
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vpconflictd"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256i" name="_mm256_maskz_conflict_epi32">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512CD</CPUID>
+	<category>Compare</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m256i"/>
+	<description>Test each 32-bit element of "a" for equality with all other elements in "a" closer to the least significant bit using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). Each element's comparison forms a zero extended bit vector in "dst".</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*32
+	IF k[j]
+		FOR l := 0 to j-1
+			m := l*32
+			dst[i+l] := (a[i+31:i] == a[m+31:m]) ? 1 : 0
+		ENDFOR
+		dst[i+31:i+j] := 0
+	ELSE
+		dst[i+31:i] := 0
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vpconflictd"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm_conflict_epi32">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512CD</CPUID>
+	<category>Compare</category>
+	<parameter varname="a" type="__m128i"/>
+	<description>Test each 32-bit element of "a" for equality with all other elements in "a" closer to the least significant bit. Each element's comparison forms a zero extended bit vector in "dst".</description>
+	<operation>
+FOR j := 0 to 3
+	i := j*32
+	FOR k := 0 to j-1
+		m := k*32
+		dst[i+k] := (a[i+31:i] == a[m+31:m]) ? 1 : 0
+	ENDFOR
+	dst[i+31:i+j] := 0
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vpconflictd"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm_mask_conflict_epi32">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512CD</CPUID>
+	<category>Compare</category>
+	<parameter varname="src" type="__m128i"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128i"/>
+	<description>Test each 32-bit element of "a" for equality with all other elements in "a" closer to the least significant bit using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). Each element's comparison forms a zero extended bit vector in "dst".</description>	
+	<operation>
+FOR j := 0 to 3
+	i := j*32
+	IF k[j]
+		FOR l := 0 to j-1
+			m := l*32
+			dst[i+l] := (a[i+31:i] == a[m+31:m]) ? 1 : 0
+		ENDFOR
+		dst[i+31:i+j] := 0
+	ELSE
+		dst[i+31:i] := src[i+31:i]
+	FI
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vpconflictd"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm_maskz_conflict_epi32">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512CD</CPUID>
+	<category>Compare</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128i"/>
+	<description>Test each 32-bit element of "a" for equality with all other elements in "a" closer to the least significant bit using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). Each element's comparison forms a zero extended bit vector in "dst".</description>
+	<operation>
+FOR j := 0 to 3
+	i := j*32
+	IF k[j]
+		FOR l := 0 to j-1
+			m := l*32
+			dst[i+l] := (a[i+31:i] == a[m+31:m]) ? 1 : 0
+		ENDFOR
+		dst[i+31:i+j] := 0
+	ELSE
+		dst[i+31:i] := 0
+	FI
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vpconflictd"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256i" name="_mm256_conflict_epi64">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512CD</CPUID>
+	<category>Compare</category>
+	<parameter varname="a" type="__m256i"/>
+	<description>Test each 64-bit element of "a" for equality with all other elements in "a" closer to the least significant bit. Each element's comparison forms a zero extended bit vector in "dst".</description>
+	<operation>
+FOR j := 0 to 3
+	i := j*64
+	FOR k := 0 to j-1
+		m := k*64
+		dst[i+k] := (a[i+63:i] == a[m+63:m]) ? 1 : 0
+	ENDFOR
+	dst[i+63:i+j] := 0
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vpconflictq"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256i" name="_mm256_mask_conflict_epi64">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512CD</CPUID>
+	<category>Compare</category>
+	<parameter varname="src" type="__m256i"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m256i"/>
+	<description>Test each 64-bit element of "a" for equality with all other elements in "a" closer to the least significant bit using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). Each element's comparison forms a zero extended bit vector in "dst".</description>
+	<operation>
+FOR j := 0 to 3
+	i := j*64
+	IF k[j]
+		FOR l := 0 to j-1
+			m := l*64
+			dst[i+l] := (a[i+63:i] == a[m+63:m]) ? 1 : 0
+		ENDFOR
+		dst[i+63:i+j] := 0
+	ELSE
+		dst[i+63:i] := src[i+63:i]
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vpconflictq"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256i" name="_mm256_maskz_conflict_epi64">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512CD</CPUID>
+	<category>Compare</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m256i"/>
+	<description>Test each 64-bit element of "a" for equality with all other elements in "a" closer to the least significant bit using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). Each element's comparison forms a zero extended bit vector in "dst".</description>
+	<operation>
+FOR j := 0 to 3
+	i := j*64
+	IF k[j]
+		FOR l := 0 to j-1
+			m := l*64
+			dst[i+l] := (a[i+63:i] == a[m+63:m]) ? 1 : 0
+		ENDFOR
+		dst[i+63:i+j] := 0
+	ELSE
+		dst[i+63:i] := 0
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vpconflictq"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm_conflict_epi64">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512CD</CPUID>
+	<category>Compare</category>
+	<parameter varname="a" type="__m128i"/>
+	<description>Test each 64-bit element of "a" for equality with all other elements in "a" closer to the least significant bit. Each element's comparison forms a zero extended bit vector in "dst".</description>
+	<operation>
+FOR j := 0 to 1
+	i := j*64
+	FOR k := 0 to j-1
+		m := k*64
+		dst[i+k] := (a[i+63:i] == a[m+63:m]) ? 1 : 0
+	ENDFOR
+	dst[i+63:i+j] := 0
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vpconflictq"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm_mask_conflict_epi64">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512CD</CPUID>
+	<category>Compare</category>
+	<parameter varname="src" type="__m128i"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128i"/>
+	<description>Test each 64-bit element of "a" for equality with all other elements in "a" closer to the least significant bit using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). Each element's comparison forms a zero extended bit vector in "dst".</description>
+	<operation>
+FOR j := 0 to 1
+	i := j*64
+	IF k[j]
+		FOR l := 0 to j-1
+			m := l*64
+			dst[i+l] := (a[i+63:i] == a[m+63:m]) ? 1 : 0
+		ENDFOR
+		dst[i+63:i+j] := 0
+	ELSE
+		dst[i+63:i] := src[i+63:i]
+	FI
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vpconflictq"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm_maskz_conflict_epi64">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512CD</CPUID>
+	<category>Compare</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128i"/>
+	<description>Test each 64-bit element of "a" for equality with all other elements in "a" closer to the least significant bit using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). Each element's comparison forms a zero extended bit vector in "dst".</description>
+	<operation>
+FOR j := 0 to 1
+	i := j*64
+	IF k[j]
+		FOR l := 0 to j-1
+			m := l*64
+			dst[i+l] := (a[i+63:i] == a[m+63:m]) ? 1 : 0
+		ENDFOR
+		dst[i+63:i+j] := 0
+	ELSE
+		dst[i+63:i] := 0
+	FI
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vpconflictq"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256i" name="_mm256_mask_permutexvar_epi32">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="src" type="__m256i"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="idx" type="__m256i"/>
+	<parameter varname="a" type="__m256i"/>
+	<description>Shuffle 32-bit integers in "a" across lanes using the corresponding index in "idx", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*32
+	id := idx[i+2:i]*32
+	IF k[j]
+		dst[i+31:i] := a[id+31:id]
+	ELSE
+		dst[i+31:i] := src[i+31:i]
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vpermd"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256i" name="_mm256_maskz_permutexvar_epi32">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="idx" type="__m256i"/>
+	<parameter varname="a" type="__m256i"/>
+	<description>Shuffle 32-bit integers in "a" across lanes using the corresponding index in "idx", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*32
+	id := idx[i+2:i]*32
+	IF k[j]
+		dst[i+31:i] := a[id+31:id]
+	ELSE
+		dst[i+31:i] := 0
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vpermd"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256i" name="_mm256_permutexvar_epi32">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="idx" type="__m256i"/>
+	<parameter varname="a" type="__m256i"/>
+	<description>Shuffle 32-bit integers in "a" across lanes using the corresponding index in "idx", and store the results in "dst".
+	</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*32
+	id := idx[i+2:i]*32
+	dst[i+31:i] := a[id+31:id]
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vpermd"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256i" name="_mm256_mask2_permutex2var_epi32">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="a" type="__m256i"/>
+	<parameter varname="idx" type="__m256i"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="b" type="__m256i"/>
+	<description>Shuffle 32-bit integers in "a" and "b" across lanes using the corresponding selector and index in "idx", and store the results in "dst" using writemask "k" (elements are copied from "idx" when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*32
+	off := idx[i+2:i]*32
+	IF k[j]
+		dst[i+31:i] := idx[i+3] ? b[off+31:off] : a[off+31:off]
+	ELSE
+		dst[i+31:i] := idx[i+31:i]
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vpermi2d"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256i" name="_mm256_mask_permutex2var_epi32">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="a" type="__m256i"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="idx" type="__m256i"/>
+	<parameter varname="b" type="__m256i"/>
+	<description>Shuffle 32-bit integers in "a" and "b" across lanes using the corresponding selector and index in "idx", and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*32
+	off := idx[i+2:i]*32
+	IF k[j]
+		dst[i+31:i] := idx[i+3] ? b[off+31:off] : a[off+31:off]
+	ELSE
+		dst[i+31:i] := a[i+31:i]
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vpermt2d"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256i" name="_mm256_maskz_permutex2var_epi32">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m256i"/>
+	<parameter varname="idx" type="__m256i"/>
+	<parameter varname="b" type="__m256i"/>
+	<description>Shuffle 32-bit integers in "a" and "b" across lanes using the corresponding selector and index in "idx", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*32
+	off := idx[i+2:i]*32
+	IF k[j]
+		dst[i+31:i] := (idx[i+3]) ? b[off+31:off] : a[off+31:off]
+	ELSE
+		dst[i+31:i] := 0
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vpermi2d"/>
+	<instruction name="vpermt2d"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256i" name="_mm256_permutex2var_epi32">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="a" type="__m256i"/>
+	<parameter varname="idx" type="__m256i"/>
+	<parameter varname="b" type="__m256i"/>
+	<description>Shuffle 32-bit integers in "a" and "b" across lanes using the corresponding selector and index in "idx", and store the results in "dst".
+	</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*32
+	off := idx[i+2:i]*32
+	dst[i+31:i] := idx[i+3] ? b[off+31:off] : a[off+31:off]
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vpermi2d"/>
+	<instruction name="vpermt2d"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm_mask2_permutex2var_epi32">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="a" type="__m128i"/>
+	<parameter varname="idx" type="__m128i"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="b" type="__m128i"/>
+	<description>Shuffle 32-bit integers in "a" and "b" using the corresponding selector and index in "idx", and store the results in "dst" using writemask "k" (elements are copied from "idx" when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 3
+	i := j*32
+	off := idx[i+1:i]*32
+	IF k[j]
+		dst[i+31:i] := idx[i+2] ? b[off+31:off] : a[off+31:off]
+	ELSE
+		dst[i+31:i] := idx[i+31:i]
+	FI
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vpermi2d"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm_mask_permutex2var_epi32">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="a" type="__m128i"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="idx" type="__m128i"/>
+	<parameter varname="b" type="__m128i"/>
+	<description>Shuffle 32-bit integers in "a" and "b" using the corresponding selector and index in "idx", and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 3
+	i := j*32
+	off := idx[i+1:i]*32
+	IF k[j]
+		dst[i+31:i] := idx[i+2] ? b[off+31:off] : a[off+31:off]
+	ELSE
+		dst[i+31:i] := a[i+31:i]
+	FI
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vpermt2d"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm_maskz_permutex2var_epi32">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128i"/>
+	<parameter varname="idx" type="__m128i"/>
+	<parameter varname="b" type="__m128i"/>
+	<description>Shuffle 32-bit integers in "a" and "b" using the corresponding selector and index in "idx", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 3
+	i := j*32
+	off := idx[i+1:i]*32
+	IF k[j]
+		dst[i+31:i] := (idx[i+2]) ? b[off+31:off] : a[off+31:off]
+	ELSE
+		dst[i+31:i] := 0
+	FI
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vpermi2d"/>
+	<instruction name="vpermt2d"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm_permutex2var_epi32">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="a" type="__m128i"/>
+	<parameter varname="idx" type="__m128i"/>
+	<parameter varname="b" type="__m128i"/>
+	<description>Shuffle 32-bit integers in "a" and "b" using the corresponding selector and index in "idx", and store the results in "dst".
+	</description>
+	<operation>
+FOR j := 0 to 3
+	i := j*32
+	off := idx[i+1:i]*32
+	dst[i+31:i] := idx[i+2] ? b[off+31:off] : a[off+31:off]
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vpermi2d"/>
+	<instruction name="vpermt2d"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256d" name="_mm256_mask2_permutex2var_pd">
+	<type>Floating Point</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="a" type="__m256d"/>
+	<parameter varname="idx" type="__m256i"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="b" type="__m256d"/>
+	<description>Shuffle double-precision (64-bit) floating-point elements in "a" and "b" across lanes using the corresponding selector and index in "idx", and store the results in "dst" using writemask "k" (elements are copied from "idx" when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 3
+	i := j*64
+	off := idx[i+1:i]*64
+	IF k[j]
+		dst[i+63:i] := idx[i+2] ? b[off+63:off] : a[off+63:off]
+	ELSE
+		dst[i+63:i] := idx[i+63:i]
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vpermi2pd"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256d" name="_mm256_mask_permutex2var_pd">
+	<type>Floating Point</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="a" type="__m256d"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="idx" type="__m256i"/>
+	<parameter varname="b" type="__m256d"/>
+	<description>Shuffle double-precision (64-bit) floating-point elements in "a" and "b" across lanes using the corresponding selector and index in "idx", and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 3
+	i := j*64
+	off := idx[i+1:i]*64
+	IF k[j]
+		dst[i+63:i] := idx[i+2] ? b[off+63:off] : a[off+63:off]
+	ELSE
+		dst[i+63:i] := a[i+63:i]
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vpermt2pd"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256d" name="_mm256_maskz_permutex2var_pd">
+	<type>Floating Point</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m256d"/>
+	<parameter varname="idx" type="__m256i"/>
+	<parameter varname="b" type="__m256d"/>
+	<description>Shuffle double-precision (64-bit) floating-point elements in "a" and "b" across lanes using the corresponding selector and index in "idx", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 3
+	i := j*64
+	off := idx[i+1:i]*64
+	IF k[j]
+		dst[i+63:i] := (idx[i+2]) ? b[off+63:off] : a[off+63:off]
+	ELSE
+		dst[i+63:i] := 0
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vpermi2pd"/>
+	<instruction name="vpermt2pd"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256d" name="_mm256_permutex2var_pd">
+	<type>Floating Point</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="a" type="__m256d"/>
+	<parameter varname="idx" type="__m256i"/>
+	<parameter varname="b" type="__m256d"/>
+	<description>Shuffle double-precision (64-bit) floating-point elements in "a" and "b" across lanes using the corresponding selector and index in "idx", and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 3
+	i := j*64
+	off := idx[i+1:i]*64
+	dst[i+63:i] := idx[i+2] ? b[off+63:off] : a[off+63:off]
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vpermi2pd"/>
+	<instruction name="vpermt2pd"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128d" name="_mm_mask2_permutex2var_pd">
+	<type>Floating Point</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="a" type="__m128d"/>
+	<parameter varname="idx" type="__m128i"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="b" type="__m128d"/>
+	<description>Shuffle double-precision (64-bit) floating-point elements in "a" and "b" using the corresponding selector and index in "idx", and store the results in "dst" using writemask "k" (elements are copied from "idx" when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 1
+	i := j*64
+	off := idx[i]*64
+	IF k[j]
+		dst[i+63:i] := idx[i+1] ? b[off+63:off] : a[off+63:off]
+	ELSE
+		dst[i+63:i] := idx[i+63:i]
+	FI
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vpermi2pd"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128d" name="_mm_mask_permutex2var_pd">
+	<type>Floating Point</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="a" type="__m128d"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="idx" type="__m128i"/>
+	<parameter varname="b" type="__m128d"/>
+	<description>Shuffle double-precision (64-bit) floating-point elements in "a" and "b" using the corresponding selector and index in "idx", and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 1
+	i := j*64
+	off := idx[i]*64
+	IF k[j]
+		dst[i+63:i] := idx[i+1] ? b[off+63:off] : a[off+63:off]
+	ELSE
+		dst[i+63:i] := a[i+63:i]
+	FI
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vpermt2pd"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128d" name="_mm_maskz_permutex2var_pd">
+	<type>Floating Point</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128d"/>
+	<parameter varname="idx" type="__m128i"/>
+	<parameter varname="b" type="__m128d"/>
+	<description>Shuffle double-precision (64-bit) floating-point elements in "a" and "b" using the corresponding selector and index in "idx", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 1
+	i := j*64
+	off := idx[i]*64
+	IF k[j]
+		dst[i+63:i] := (idx[i+1]) ? b[off+63:off] : a[off+63:off]
+	ELSE
+		dst[i+63:i] := 0
+	FI
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vpermi2pd"/>
+	<instruction name="vpermt2pd"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128d" name="_mm_permutex2var_pd">
+	<type>Floating Point</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="a" type="__m128d"/>
+	<parameter varname="idx" type="__m128i"/>
+	<parameter varname="b" type="__m128d"/>
+	<description>Shuffle double-precision (64-bit) floating-point elements in "a" and "b" using the corresponding selector and index in "idx", and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 1
+	i := j*64
+	off := idx[i]*64
+	dst[i+63:i] := idx[i+1] ? b[off+63:off] : a[off+63:off]
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vpermi2pd"/>
+	<instruction name="vpermt2pd"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256" name="_mm256_mask2_permutex2var_ps">
+	<type>Floating Point</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="a" type="__m256"/>
+	<parameter varname="idx" type="__m256i"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="b" type="__m256"/>
+	<description>Shuffle single-precision (32-bit) floating-point elements in "a" and "b" across lanes using the corresponding selector and index in "idx", and store the results in "dst" using writemask "k" (elements are copied from "idx" when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*32
+	off := idx[i+2:i]*32
+	IF k[j]
+		dst[i+31:i] := idx[i+3] ? b[off+31:off] : a[off+31:off]
+	ELSE
+		dst[i+31:i] := idx[i+31:i]
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vpermi2ps"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256" name="_mm256_mask_permutex2var_ps">
+	<type>Floating Point</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="a" type="__m256"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="idx" type="__m256i"/>
+	<parameter varname="b" type="__m256"/>
+	<description>Shuffle single-precision (32-bit) floating-point elements in "a" and "b" across lanes using the corresponding selector and index in "idx", and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*32
+	off := idx[i+2:i]*32
+	IF k[j]
+		dst[i+31:i] := idx[i+3] ? b[off+31:off] : a[off+31:off]
+	ELSE
+		dst[i+31:i] := a[i+31:i]
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vpermt2ps"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256" name="_mm256_maskz_permutex2var_ps">
+	<type>Floating Point</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m256"/>
+	<parameter varname="idx" type="__m256i"/>
+	<parameter varname="b" type="__m256"/>
+	<description>Shuffle single-precision (32-bit) floating-point elements in "a" and "b" across lanes using the corresponding selector and index in "idx", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*32
+	off := idx[i+2:i]*32
+	IF k[j]
+		dst[i+31:i] := (idx[i+3]) ? b[off+31:off] : a[off+31:off]
+	ELSE
+		dst[i+31:i] := 0
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vpermi2ps"/>
+	<instruction name="vpermt2ps"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256" name="_mm256_permutex2var_ps">
+	<type>Floating Point</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="a" type="__m256"/>
+	<parameter varname="idx" type="__m256i"/>
+	<parameter varname="b" type="__m256"/>
+	<description>Shuffle single-precision (32-bit) floating-point elements in "a" and "b" across lanes using the corresponding selector and index in "idx", and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*32
+	off := idx[i+2:i]*32
+	dst[i+31:i] := idx[i+3] ? b[off+31:off] : a[off+31:off]
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vpermi2ps"/>
+	<instruction name="vpermt2ps"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128" name="_mm_mask2_permutex2var_ps">
+	<type>Floating Point</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="a" type="__m128"/>
+	<parameter varname="idx" type="__m128i"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="b" type="__m128"/>
+	<description>Shuffle single-precision (32-bit) floating-point elements in "a" and "b" using the corresponding selector and index in "idx", and store the results in "dst" using writemask "k" (elements are copied from "idx" when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 3
+	i := j*32
+	off := idx[i+1:i]*32
+	IF k[j]
+		dst[i+31:i] := idx[i+2] ? b[off+31:off] : a[off+31:off]
+	ELSE
+		dst[i+31:i] := idx[i+31:i]
+	FI
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vpermi2ps"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128" name="_mm_mask_permutex2var_ps">
+	<type>Floating Point</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="a" type="__m128"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="idx" type="__m128i"/>
+	<parameter varname="b" type="__m128"/>
+	<description>Shuffle single-precision (32-bit) floating-point elements in "a" and "b" using the corresponding selector and index in "idx", and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 3
+	i := j*32
+	off := idx[i+1:i]*32
+	IF k[j]
+		dst[i+31:i] := idx[i+2] ? b[off+31:off] : a[off+31:off]
+	ELSE
+		dst[i+31:i] := a[i+31:i]
+	FI
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vpermt2ps"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128" name="_mm_maskz_permutex2var_ps">
+	<type>Floating Point</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128"/>
+	<parameter varname="idx" type="__m128i"/>
+	<parameter varname="b" type="__m128"/>
+	<description>Shuffle single-precision (32-bit) floating-point elements in "a" and "b" using the corresponding selector and index in "idx", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 3
+	i := j*32
+	off := idx[i+1:i]*32
+	IF k[j]
+		dst[i+31:i] := (idx[i+2]) ? b[off+31:off] : a[off+31:off]
+	ELSE
+		dst[i+31:i] := 0
+	FI
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vpermi2ps"/>
+	<instruction name="vpermt2ps"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128" name="_mm_permutex2var_ps">
+	<type>Floating Point</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="a" type="__m128"/>
+	<parameter varname="idx" type="__m128i"/>
+	<parameter varname="b" type="__m128"/>
+	<description>Shuffle single-precision (32-bit) floating-point elements in "a" and "b" using the corresponding selector and index in "idx", and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 3
+	i := j*32
+	off := idx[i+1:i]*32
+	dst[i+31:i] := idx[i+2] ? b[off+31:off] : a[off+31:off]
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vpermi2ps"/>
+	<instruction name="vpermt2ps"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256i" name="_mm256_mask2_permutex2var_epi64">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="a" type="__m256i"/>
+	<parameter varname="idx" type="__m256i"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="b" type="__m256i"/>
+	<description>Shuffle 64-bit integers in "a" and "b" across lanes using the corresponding selector and index in "idx", and store the results in "dst" using writemask "k" (elements are copied from "idx" when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 3
+	i := j*64
+	off := idx[i+1:i]*64
+	IF k[j]
+		dst[i+63:i] := idx[i+2] ? b[off+63:off] : a[off+63:off]
+	ELSE
+		dst[i+63:i] := idx[i+63:i]
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vpermi2q"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256i" name="_mm256_mask_permutex2var_epi64">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="a" type="__m256i"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="idx" type="__m256i"/>
+	<parameter varname="b" type="__m256i"/>
+	<description>Shuffle 64-bit integers in "a" and "b" across lanes using the corresponding selector and index in "idx", and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 3
+	i := j*64
+	off := idx[i+1:i]*64
+	IF k[j]
+		dst[i+63:i] := idx[i+2] ? b[off+63:off] : a[off+63:off]
+	ELSE
+		dst[i+63:i] := a[i+63:i]
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vpermt2q"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256i" name="_mm256_maskz_permutex2var_epi64">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m256i"/>
+	<parameter varname="idx" type="__m256i"/>
+	<parameter varname="b" type="__m256i"/>
+	<description>Shuffle 64-bit integers in "a" and "b" across lanes using the corresponding selector and index in "idx", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 3
+	i := j*64
+	off := idx[i+1:i]*64
+	IF k[j]
+		dst[i+63:i] := (idx[i+2]) ? b[off+63:off] : a[off+63:off]
+	ELSE
+		dst[i+63:i] := 0
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vpermi2q"/>
+	<instruction name="vpermt2q"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256i" name="_mm256_permutex2var_epi64">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="a" type="__m256i"/>
+	<parameter varname="idx" type="__m256i"/>
+	<parameter varname="b" type="__m256i"/>
+	<description>Shuffle 64-bit integers in "a" and "b" across lanes using the corresponding selector and index in "idx", and store the results in "dst".
+	</description>
+	<operation>
+FOR j := 0 to 3
+	i := j*64
+	off := idx[i+1:i]*64
+	dst[i+63:i] := idx[i+2] ? b[off+63:off] : a[off+63:off]
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vpermi2q"/>
+	<instruction name="vpermt2q"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm_mask2_permutex2var_epi64">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="a" type="__m128i"/>
+	<parameter varname="idx" type="__m128i"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="b" type="__m128i"/>
+	<description>Shuffle 64-bit integers in "a" and "b" using the corresponding selector and index in "idx", and store the results in "dst" using writemask "k" (elements are copied from "idx" when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 1
+	i := j*64
+	off := idx[i]*64
+	IF k[j]
+		dst[i+63:i] := idx[i+1] ? b[off+63:off] : a[off+63:off]
+	ELSE
+		dst[i+63:i] := idx[i+63:i]
+	FI
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vpermi2q"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm_mask_permutex2var_epi64">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="a" type="__m128i"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="idx" type="__m128i"/>
+	<parameter varname="b" type="__m128i"/>
+	<description>Shuffle 64-bit integers in "a" and "b" using the corresponding selector and index in "idx", and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 1
+	i := j*64
+	off := idx[i]*64
+	IF k[j]
+		dst[i+63:i] := idx[i+1] ? b[off+63:off] : a[off+63:off]
+	ELSE
+		dst[i+63:i] := a[i+63:i]
+	FI
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vpermt2q"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm_maskz_permutex2var_epi64">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128i"/>
+	<parameter varname="idx" type="__m128i"/>
+	<parameter varname="b" type="__m128i"/>
+	<description>Shuffle 64-bit integers in "a" and "b" using the corresponding selector and index in "idx", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 1
+	i := j*64
+	off := idx[i]*64
+	IF k[j]
+		dst[i+63:i] := (idx[i+1]) ? b[off+63:off] : a[off+63:off]
+	ELSE
+		dst[i+63:i] := 0
+	FI
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vpermi2q"/>
+	<instruction name="vpermt2q"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm_permutex2var_epi64">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="a" type="__m128i"/>
+	<parameter varname="idx" type="__m128i"/>
+	<parameter varname="b" type="__m128i"/>
+	<description>Shuffle 64-bit integers in "a" and "b" using the corresponding selector and index in "idx", and store the results in "dst".
+	</description>
+	<operation>
+FOR j := 0 to 1
+	i := j*64
+	off := idx[i]*64
+	dst[i+63:i] := idx[i+1] ? b[off+63:off] : a[off+63:off]
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vpermi2q"/>
+	<instruction name="vpermt2q"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256i" name="_mm256_mask2_permutex2var_epi16">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512BW</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="a" type="__m256i"/>
+	<parameter varname="idx" type="__m256i"/>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="b" type="__m256i"/>
+	<description>Shuffle 16-bit integers in "a" and "b" across lanes using the corresponding selector and index in "idx", and store the results in "dst" using writemask "k" (elements are copied from "idx" when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 15
+	i := j*16
+	IF k[j]
+		off := 16*idx[i+3:i]
+		dst[i+15:i] := idx[i+4] ? b[off+15:off] : a[off+15:off]
+	ELSE
+		dst[i+15:i] := idx[i+15:i]
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vpermi2w"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256i" name="_mm256_mask_permutex2var_epi16">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512BW</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="a" type="__m256i"/>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="idx" type="__m256i"/>
+	<parameter varname="b" type="__m256i"/>
+	<description>Shuffle 16-bit integers in "a" and "b" across lanes using the corresponding selector and index in "idx", and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 15
+	i := j*16
+	IF k[j]
+		off := 16*idx[i+3:i]
+		dst[i+15:i] := idx[i+4] ? b[off+15:off] : a[off+15:off]
+	ELSE
+		dst[i+15:i] := a[i+15:i]
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vpermt2w"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256i" name="_mm256_maskz_permutex2var_epi16">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512BW</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="a" type="__m256i"/>
+	<parameter varname="idx" type="__m256i"/>
+	<parameter varname="b" type="__m256i"/>
+	<description>Shuffle 16-bit integers in "a" and "b" across lanes using the corresponding selector and index in "idx", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 15
+	i := j*16
+	IF k[j]
+		off := 16*idx[i+3:i]
+		dst[i+15:i] := idx[i+4] ? b[off+15:off] : a[off+15:off]
+	ELSE
+		dst[i+15:i] := 0
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vpermi2w"/>
+	<instruction name="vpermt2w"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256i" name="_mm256_permutex2var_epi16">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512BW</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="a" type="__m256i"/>
+	<parameter varname="idx" type="__m256i"/>
+	<parameter varname="b" type="__m256i"/>
+	<description>Shuffle 16-bit integers in "a" and "b" across lanes using the corresponding selector and index in "idx", and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 15
+	i := j*16
+	off := 16*idx[i+3:i]
+	dst[i+15:i] := idx[i+4] ? b[off+15:off] : a[off+15:off]
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vpermi2w"/>
+	<instruction name="vpermt2w"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512i" name="_mm512_mask2_permutex2var_epi16">
+	<type>Integer</type>
+	<CPUID>AVX512BW</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="a" type="__m512i"/>
+	<parameter varname="idx" type="__m512i"/>
+	<parameter varname="k" type="__mmask32"/>
+	<parameter varname="b" type="__m512i"/>
+	<description>Shuffle 16-bit integers in "a" and "b" across lanes using the corresponding selector and index in "idx", and store the results in "dst" using writemask "k" (elements are copied from "idx" when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 31
+	i := j*16
+	IF k[j]
+		off := 16*idx[i+4:i]
+		dst[i+15:i] := idx[i+5] ? b[off+15:off] : a[off+15:off]
+	ELSE
+		dst[i+15:i] := idx[i+15:i]
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name="vpermi2w"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512i" name="_mm512_mask_permutex2var_epi16">
+	<type>Integer</type>
+	<CPUID>AVX512BW</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="a" type="__m512i"/>
+	<parameter varname="k" type="__mmask32"/>
+	<parameter varname="idx" type="__m512i"/>
+	<parameter varname="b" type="__m512i"/>
+	<description>Shuffle 16-bit integers in "a" and "b" across lanes using the corresponding selector and index in "idx", and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 31
+	i := j*16
+	IF k[j]
+		off := 16*idx[i+4:i]
+		dst[i+15:i] := idx[i+5] ? b[off+15:off] : a[off+15:off]
+	ELSE
+		dst[i+15:i] := a[i+15:i]
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name="vpermt2w"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512i" name="_mm512_maskz_permutex2var_epi16">
+	<type>Integer</type>
+	<CPUID>AVX512BW</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="k" type="__mmask32"/>
+	<parameter varname="a" type="__m512i"/>
+	<parameter varname="idx" type="__m512i"/>
+	<parameter varname="b" type="__m512i"/>
+	<description>Shuffle 16-bit integers in "a" and "b" across lanes using the corresponding selector and index in "idx", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 31
+	i := j*16
+	IF k[j]
+		off := 16*idx[i+4:i]
+		dst[i+15:i] := idx[i+5] ? b[off+15:off] : a[off+15:off]
+	ELSE
+		dst[i+15:i] := 0
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name="vpermi2w"/>
+	<instruction name="vpermt2w"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512i" name="_mm512_permutex2var_epi16">
+	<type>Integer</type>
+	<CPUID>AVX512BW</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="a" type="__m512i"/>
+	<parameter varname="idx" type="__m512i"/>
+	<parameter varname="b" type="__m512i"/>
+	<description>Shuffle 16-bit integers in "a" and "b" across lanes using the corresponding selector and index in "idx", and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 31
+	i := j*16
+	off := 16*idx[i+4:i]
+	dst[i+15:i] := idx[i+5] ? b[off+15:off] : a[off+15:off]
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name="vpermi2w"/>
+	<instruction name="vpermt2w"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm_mask2_permutex2var_epi16">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512BW</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="a" type="__m128i"/>
+	<parameter varname="idx" type="__m128i"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="b" type="__m128i"/>
+	<description>Shuffle 16-bit integers in "a" and "b" using the corresponding selector and index in "idx", and store the results in "dst" using writemask "k" (elements are copied from "idx" when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 7
+	i := j*16
+	IF k[j]
+		off := 16*idx[i+2:i]
+		dst[i+15:i] := idx[i+3] ? b[off+15:off] : a[off+15:off]
+	ELSE
+		dst[i+15:i] := idx[i+15:i]
+	FI
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vpermi2w"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm_mask_permutex2var_epi16">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512BW</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="a" type="__m128i"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="idx" type="__m128i"/>
+	<parameter varname="b" type="__m128i"/>
+	<description>Shuffle 16-bit integers in "a" and "b" using the corresponding selector and index in "idx", and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 7
+	i := j*16
+	IF k[j]
+		off := 16*idx[i+2:i]
+		dst[i+15:i] := idx[i+3] ? b[off+15:off] : a[off+15:off]
+	ELSE
+		dst[i+15:i] := a[i+15:i]
+	FI
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vpermt2w"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm_maskz_permutex2var_epi16">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512BW</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128i"/>
+	<parameter varname="idx" type="__m128i"/>
+	<parameter varname="b" type="__m128i"/>
+	<description>Shuffle 16-bit integers in "a" and "b" using the corresponding selector and index in "idx", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*16
+	IF k[j]
+		off := 16*idx[i+2:i]
+		dst[i+15:i] := idx[i+3] ? b[off+15:off] : a[off+15:off]
+	ELSE
+		dst[i+15:i] := 0
+	FI
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vpermi2w"/>
+	<instruction name="vpermt2w"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm_permutex2var_epi16">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512BW</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="a" type="__m128i"/>
+	<parameter varname="idx" type="__m128i"/>
+	<parameter varname="b" type="__m128i"/>
+	<description>Shuffle 16-bit integers in "a" and "b" using the corresponding selector and index in "idx", and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*16
+	off := 16*idx[i+2:i]
+	dst[i+15:i] := idx[i+3] ? b[off+15:off] : a[off+15:off]
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vpermi2w"/>
+	<instruction name="vpermt2w"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256d" name="_mm256_mask_permute_pd">
+	<type>Floating Point</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="src" type="__m256d"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m256d"/>
+	<parameter varname="imm8" type="const int"/>
+	<description>Shuffle double-precision (64-bit) floating-point elements in "a" within 128-bit lanes using the control in "imm8", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+	<operation>
+IF (imm8[0] == 0) tmp_dst[63:0] := a[63:0]
+IF (imm8[0] == 1) tmp_dst[63:0] := a[127:64]
+IF (imm8[1] == 0) tmp_dst[127:64] := a[63:0]
+IF (imm8[1] == 1) tmp_dst[127:64] := a[127:64]
+IF (imm8[2] == 0) tmp_dst[191:128] := a[191:128]
+IF (imm8[2] == 1) tmp_dst[191:128] := a[255:192]
+IF (imm8[3] == 0) tmp_dst[255:192] := a[191:128]
+IF (imm8[3] == 1) tmp_dst[255:192] := a[255:192]
+FOR j := 0 to 3
+	i := j*64
+	IF k[j]
+		dst[i+63:i] := tmp_dst[i+63:i]
+	ELSE
+		dst[i+63:i] := src[i+63:i]
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vpermilpd"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256d" name="_mm256_mask_permutevar_pd">
+	<type>Floating Point</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="src" type="__m256d"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m256d"/>
+	<parameter varname="b" type="__m256i"/>
+	<description>Shuffle double-precision (64-bit) floating-point elements in "a" within 128-bit lanes using the control in "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+	<operation>
+IF (b[1] == 0) tmp_dst[63:0] := a[63:0]
+IF (b[1] == 1) tmp_dst[63:0] := a[127:64]
+IF (b[65] == 0) tmp_dst[127:64] := a[63:0]
+IF (b[65] == 1) tmp_dst[127:64] := a[127:64]
+IF (b[129] == 0) tmp_dst[191:128] := a[191:128]
+IF (b[129] == 1) tmp_dst[191:128] := a[255:192]
+IF (b[193] == 0) tmp_dst[255:192] := a[191:128]
+IF (b[193] == 1) tmp_dst[255:192] := a[255:192]
+FOR j := 0 to 3
+	i := j*64
+	IF k[j]
+		dst[i+63:i] := tmp_dst[i+63:i]
+	ELSE
+		dst[i+63:i] := src[i+63:i]
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vpermilpd"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256d" name="_mm256_maskz_permute_pd">
+	<type>Floating Point</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m256d"/>
+	<parameter varname="imm8" type="const int"/>
+	<description>Shuffle double-precision (64-bit) floating-point elements in "a" within 128-bit lanes using the control in "imm8", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+	<operation>
+IF (imm8[0] == 0) tmp_dst[63:0] := a[63:0]
+IF (imm8[0] == 1) tmp_dst[63:0] := a[127:64]
+IF (imm8[1] == 0) tmp_dst[127:64] := a[63:0]
+IF (imm8[1] == 1) tmp_dst[127:64] := a[127:64]
+IF (imm8[2] == 0) tmp_dst[191:128] := a[191:128]
+IF (imm8[2] == 1) tmp_dst[191:128] := a[255:192]
+IF (imm8[3] == 0) tmp_dst[255:192] := a[191:128]
+IF (imm8[3] == 1) tmp_dst[255:192] := a[255:192]
+FOR j := 0 to 3
+	i := j*64
+	IF k[j]
+		dst[i+63:i] := tmp_dst[i+63:i]
+	ELSE
+		dst[i+63:i] := 0
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vpermilpd"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256d" name="_mm256_maskz_permutevar_pd">
+	<type>Floating Point</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m256d"/>
+	<parameter varname="b" type="__m256i"/>
+	<description>Shuffle double-precision (64-bit) floating-point elements in "a" within 128-bit lanes using the control in "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+	<operation>
+IF (b[1] == 0) tmp_dst[63:0] := a[63:0]
+IF (b[1] == 1) tmp_dst[63:0] := a[127:64]
+IF (b[65] == 0) tmp_dst[127:64] := a[63:0]
+IF (b[65] == 1) tmp_dst[127:64] := a[127:64]
+IF (b[129] == 0) tmp_dst[191:128] := a[191:128]
+IF (b[129] == 1) tmp_dst[191:128] := a[255:192]
+IF (b[193] == 0) tmp_dst[255:192] := a[191:128]
+IF (b[193] == 1) tmp_dst[255:192] := a[255:192]
+FOR j := 0 to 3
+	i := j*64
+	IF k[j]
+		dst[i+63:i] := tmp_dst[i+63:i]
+	ELSE
+		dst[i+63:i] := 0
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vpermilpd"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128d" name="_mm_mask_permute_pd">
+	<type>Floating Point</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="src" type="__m128d"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128d"/>
+	<parameter varname="imm8" type="const int"/>
+	<description>Shuffle double-precision (64-bit) floating-point elements in "a" using the control in "imm8", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+	<operation>
+IF (imm8[0] == 0) tmp_dst[63:0] := a[63:0]
+IF (imm8[0] == 1) tmp_dst[63:0] := a[127:64]
+IF (imm8[1] == 0) tmp_dst[127:64] := a[63:0]
+IF (imm8[1] == 1) tmp_dst[127:64] := a[127:64]
+FOR j := 0 to 1
+	i := j*64
+	IF k[j]
+		dst[i+63:i] := tmp_dst[i+63:i]
+	ELSE
+		dst[i+63:i] := src[i+63:i]
+	FI
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vpermilpd"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128d" name="_mm_mask_permutevar_pd">
+	<type>Floating Point</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="src" type="__m128d"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128d"/>
+	<parameter varname="b" type="__m128i"/>
+	<description>Shuffle double-precision (64-bit) floating-point elements in "a" using the control in "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+	<operation>
+IF (b[1] == 0) tmp_dst[63:0] := a[63:0]
+IF (b[1] == 1) tmp_dst[63:0] := a[127:64]
+IF (b[65] == 0) tmp_dst[127:64] := a[63:0]
+IF (b[65] == 1) tmp_dst[127:64] := a[127:64]
+FOR j := 0 to 1
+	i := j*64
+	IF k[j]
+		dst[i+63:i] := tmp_dst[i+63:i]
+	ELSE
+		dst[i+63:i] := src[i+63:i]
+	FI
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vpermilpd"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128d" name="_mm_maskz_permute_pd">
+	<type>Floating Point</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128d"/>
+	<parameter varname="imm8" type="const int"/>
+	<description>Shuffle double-precision (64-bit) floating-point elements in "a" using the control in "imm8", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+	<operation>
+IF (imm8[0] == 0) tmp_dst[63:0] := a[63:0]
+IF (imm8[0] == 1) tmp_dst[63:0] := a[127:64]
+IF (imm8[1] == 0) tmp_dst[127:64] := a[63:0]
+IF (imm8[1] == 1) tmp_dst[127:64] := a[127:64]
+FOR j := 0 to 1
+	i := j*64
+	IF k[j]
+		dst[i+63:i] := tmp_dst[i+63:i]
+	ELSE
+		dst[i+63:i] := 0
+	FI
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vpermilpd"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128d" name="_mm_maskz_permutevar_pd">
+	<type>Floating Point</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128d"/>
+	<parameter varname="b" type="__m128i"/>
+	<description>Shuffle double-precision (64-bit) floating-point elements in "a" using the control in "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+	<operation>
+IF (b[1] == 0) tmp_dst[63:0] := a[63:0]
+IF (b[1] == 1) tmp_dst[63:0] := a[127:64]
+IF (b[65] == 0) tmp_dst[127:64] := a[63:0]
+IF (b[65] == 1) tmp_dst[127:64] := a[127:64]
+FOR j := 0 to 1
+	i := j*64
+	IF k[j]
+		dst[i+63:i] := tmp_dst[i+63:i]
+	ELSE
+		dst[i+63:i] := 0
+	FI
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vpermilpd"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256" name="_mm256_mask_permute_ps">
+	<type>Floating Point</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="src" type="__m256"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m256"/>
+	<parameter varname="imm8" type="const int"/>
+	<description>Shuffle single-precision (32-bit) floating-point elements in "a" within 128-bit lanes using the control in "imm8", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+	<operation>
+SELECT4(src, control){
+	CASE(control[1:0])
+	0:	tmp[31:0] := src[31:0]
+	1:	tmp[31:0] := src[63:32]
+	2:	tmp[31:0] := src[95:64]
+	3:	tmp[31:0] := src[127:96]
+	ESAC
+	RETURN tmp[31:0]
+}
+
+tmp_dst[31:0] := SELECT4(a[127:0], imm8[1:0])
+tmp_dst[63:32] := SELECT4(a[127:0], imm8[3:2])
+tmp_dst[95:64] := SELECT4(a[127:0], imm8[5:4])
+tmp_dst[127:96] := SELECT4(a[127:0], imm8[7:6])
+tmp_dst[159:128] := SELECT4(a[255:128], imm8[1:0])
+tmp_dst[191:160] := SELECT4(a[255:128], imm8[3:2])
+tmp_dst[223:192] := SELECT4(a[255:128], imm8[5:4])
+tmp_dst[255:224] := SELECT4(a[255:128], imm8[7:6])
+FOR j := 0 to 7
+	i := j*32
+	IF k[j]
+		dst[i+31:i] := tmp_dst[i+31:i]
+	ELSE
+		dst[i+31:i] := src[i+31:i]
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vpermilps"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256" name="_mm256_mask_permutevar_ps">
+	<type>Floating Point</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="src" type="__m256"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m256"/>
+	<parameter varname="b" type="__m256i"/>
+	<description>Shuffle single-precision (32-bit) floating-point elements in "a" within 128-bit lanes using the control in "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+	<operation>
+SELECT4(src, control){
+	CASE(control[1:0])
+	0:	tmp[31:0] := src[31:0]
+	1:	tmp[31:0] := src[63:32]
+	2:	tmp[31:0] := src[95:64]
+	3:	tmp[31:0] := src[127:96]
+	ESAC
+	RETURN tmp[31:0]
+}
+
+tmp_dst[31:0] := SELECT4(a[127:0], b[1:0])
+tmp_dst[63:32] := SELECT4(a[127:0], b[33:32])
+tmp_dst[95:64] := SELECT4(a[127:0], b[65:64])
+tmp_dst[127:96] := SELECT4(a[127:0], b[97:96])
+tmp_dst[159:128] := SELECT4(a[255:128], b[129:128])
+tmp_dst[191:160] := SELECT4(a[255:128], b[161:160])
+tmp_dst[223:192] := SELECT4(a[255:128], b[193:192])
+tmp_dst[255:224] := SELECT4(a[255:128], b[225:224])
+FOR j := 0 to 7
+	i := j*32
+	IF k[j]
+		dst[i+31:i] := tmp_dst[i+31:i]
+	ELSE
+		dst[i+31:i] := src[i+31:i]
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vpermilps"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256" name="_mm256_maskz_permute_ps">
+	<type>Floating Point</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m256"/>
+	<parameter varname="imm8" type="const int"/>
+	<description>Shuffle single-precision (32-bit) floating-point elements in "a" within 128-bit lanes using the control in "imm8", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+	<operation>
+SELECT4(src, control){
+	CASE(control[1:0])
+	0:	tmp[31:0] := src[31:0]
+	1:	tmp[31:0] := src[63:32]
+	2:	tmp[31:0] := src[95:64]
+	3:	tmp[31:0] := src[127:96]
+	ESAC
+	RETURN tmp[31:0]
+}
+
+tmp_dst[31:0] := SELECT4(a[127:0], imm8[1:0])
+tmp_dst[63:32] := SELECT4(a[127:0], imm8[3:2])
+tmp_dst[95:64] := SELECT4(a[127:0], imm8[5:4])
+tmp_dst[127:96] := SELECT4(a[127:0], imm8[7:6])
+tmp_dst[159:128] := SELECT4(a[255:128], imm8[1:0])
+tmp_dst[191:160] := SELECT4(a[255:128], imm8[3:2])
+tmp_dst[223:192] := SELECT4(a[255:128], imm8[5:4])
+tmp_dst[255:224] := SELECT4(a[255:128], imm8[7:6])
+FOR j := 0 to 7
+	i := j*32
+	IF k[j]
+		dst[i+31:i] := tmp_dst[i+31:i]
+	ELSE
+		dst[i+31:i] := 0
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vpermilps"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256" name="_mm256_maskz_permutevar_ps">
+	<type>Floating Point</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m256"/>
+	<parameter varname="b" type="__m256i"/>
+	<description>Shuffle single-precision (32-bit) floating-point elements in "a" within 128-bit lanes using the control in "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+	<operation>
+SELECT4(src, control){
+	CASE(control[1:0])
+	0:	tmp[31:0] := src[31:0]
+	1:	tmp[31:0] := src[63:32]
+	2:	tmp[31:0] := src[95:64]
+	3:	tmp[31:0] := src[127:96]
+	ESAC
+	RETURN tmp[31:0]
+}
+
+tmp_dst[31:0] := SELECT4(a[127:0], b[1:0])
+tmp_dst[63:32] := SELECT4(a[127:0], b[33:32])
+tmp_dst[95:64] := SELECT4(a[127:0], b[65:64])
+tmp_dst[127:96] := SELECT4(a[127:0], b[97:96])
+tmp_dst[159:128] := SELECT4(a[255:128], b[129:128])
+tmp_dst[191:160] := SELECT4(a[255:128], b[161:160])
+tmp_dst[223:192] := SELECT4(a[255:128], b[193:192])
+tmp_dst[255:224] := SELECT4(a[255:128], b[225:224])
+FOR j := 0 to 7
+	i := j*32
+	IF k[j]
+		dst[i+31:i] := tmp_dst[i+31:i]
+	ELSE
+		dst[i+31:i] := 0
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vpermilps"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128" name="_mm_mask_permute_ps">
+	<type>Floating Point</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="src" type="__m128"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128"/>
+	<parameter varname="imm8" type="const int"/>
+	<description>Shuffle single-precision (32-bit) floating-point elements in "a" using the control in "imm8", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+	<operation>
+SELECT4(src, control){
+	CASE(control[1:0])
+	0:	tmp[31:0] := src[31:0]
+	1:	tmp[31:0] := src[63:32]
+	2:	tmp[31:0] := src[95:64]
+	3:	tmp[31:0] := src[127:96]
+	ESAC
+	RETURN tmp[31:0]
+}
+
+tmp_dst[31:0] := SELECT4(a[127:0], imm8[1:0])
+tmp_dst[63:32] := SELECT4(a[127:0], imm8[3:2])
+tmp_dst[95:64] := SELECT4(a[127:0], imm8[5:4])
+tmp_dst[127:96] := SELECT4(a[127:0], imm8[7:6])
+FOR j := 0 to 3
+	i := j*32
+	IF k[j]
+		dst[i+31:i] := tmp_dst[i+31:i]
+	ELSE
+		dst[i+31:i] := src[i+31:i]
+	FI
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vpermilps"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128" name="_mm_mask_permutevar_ps">
+	<type>Floating Point</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="src" type="__m128"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128"/>
+	<parameter varname="b" type="__m128i"/>
+	<description>Shuffle single-precision (32-bit) floating-point elements in "a" using the control in "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+	<operation>
+SELECT4(src, control){
+	CASE(control[1:0])
+	0:	tmp[31:0] := src[31:0]
+	1:	tmp[31:0] := src[63:32]
+	2:	tmp[31:0] := src[95:64]
+	3:	tmp[31:0] := src[127:96]
+	ESAC
+	RETURN tmp[31:0]
+}
+
+tmp_dst[31:0] := SELECT4(a[127:0], b[1:0])
+tmp_dst[63:32] := SELECT4(a[127:0], b[33:32])
+tmp_dst[95:64] := SELECT4(a[127:0], b[65:64])
+tmp_dst[127:96] := SELECT4(a[127:0], b[97:96])
+FOR j := 0 to 3
+	i := j*32
+	IF k[j]
+		dst[i+31:i] := tmp_dst[i+31:i]
+	ELSE
+		dst[i+31:i] := src[i+31:i]
+	FI
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vpermilps"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128" name="_mm_maskz_permute_ps">
+	<type>Floating Point</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128"/>
+	<parameter varname="imm8" type="const int"/>
+	<description>Shuffle single-precision (32-bit) floating-point elements in "a" using the control in "imm8", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+	<operation>
+SELECT4(src, control){
+	CASE(control[1:0])
+	0:	tmp[31:0] := src[31:0]
+	1:	tmp[31:0] := src[63:32]
+	2:	tmp[31:0] := src[95:64]
+	3:	tmp[31:0] := src[127:96]
+	ESAC
+	RETURN tmp[31:0]
+}
+
+tmp_dst[31:0] := SELECT4(a[127:0], imm8[1:0])
+tmp_dst[63:32] := SELECT4(a[127:0], imm8[3:2])
+tmp_dst[95:64] := SELECT4(a[127:0], imm8[5:4])
+tmp_dst[127:96] := SELECT4(a[127:0], imm8[7:6])
+FOR j := 0 to 3
+	i := j*32
+	IF k[j]
+		dst[i+31:i] := tmp_dst[i+31:i]
+	ELSE
+		dst[i+31:i] := 0
+	FI
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vpermilps"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128" name="_mm_maskz_permutevar_ps">
+	<type>Floating Point</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128"/>
+	<parameter varname="b" type="__m128i"/>
+	<description>Shuffle single-precision (32-bit) floating-point elements in "a" using the control in "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+	<operation>
+SELECT4(src, control){
+	CASE(control[1:0])
+	0:	tmp[31:0] := src[31:0]
+	1:	tmp[31:0] := src[63:32]
+	2:	tmp[31:0] := src[95:64]
+	3:	tmp[31:0] := src[127:96]
+	ESAC
+	RETURN tmp[31:0]
+}
+
+tmp_dst[31:0] := SELECT4(a[127:0], b[1:0])
+tmp_dst[63:32] := SELECT4(a[127:0], b[33:32])
+tmp_dst[95:64] := SELECT4(a[127:0], b[65:64])
+tmp_dst[127:96] := SELECT4(a[127:0], b[97:96])
+FOR j := 0 to 3
+	i := j*32
+	IF k[j]
+		dst[i+31:i] := tmp_dst[i+31:i]
+	ELSE
+		dst[i+31:i] := 0
+	FI
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vpermilps"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256d" name="_mm256_mask_permutex_pd">
+	<type>Floating Point</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="src" type="__m256d"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m256d"/>
+	<parameter varname="imm8" type="int"/>
+	<description>Shuffle double-precision (64-bit) floating-point elements in "a" across lanes using the control in "imm8", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+	<operation>
+SELECT4(src, control){
+	CASE(control[1:0])
+	0:	tmp[63:0] := src[63:0]
+	1:	tmp[63:0] := src[127:64]
+	2:	tmp[63:0] := src[191:128]
+	3:	tmp[63:0] := src[255:192]
+	ESAC
+	RETURN tmp[63:0]
+}
+
+tmp_dst[63:0] := SELECT4(a[255:0], imm8[1:0])
+tmp_dst[127:64] := SELECT4(a[255:0], imm8[3:2])
+tmp_dst[191:128] := SELECT4(a[255:0], imm8[5:4])
+tmp_dst[255:192] := SELECT4(a[255:0], imm8[7:6])
+FOR j := 0 to 3
+	i := j*64
+	IF k[j]
+		dst[i+63:i] := tmp_dst[i+63:i]
+	ELSE
+		dst[i+63:i] := src[i+63:i]
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vpermpd"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256d" name="_mm256_mask_permutexvar_pd">
+	<type>Floating Point</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="src" type="__m256d"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="idx" type="__m256i"/>
+	<parameter varname="a" type="__m256d"/>
+	<description>Shuffle double-precision (64-bit) floating-point elements in "a" across lanes using the corresponding index in "idx", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 3
+	i := j*64
+	id := idx[i+1:i]*64
+	IF k[j]
+		dst[i+63:i] := a[id+63:id]
+	ELSE
+		dst[i+63:i] := src[i+63:i]
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vpermpd"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256d" name="_mm256_maskz_permutex_pd">
+	<type>Floating Point</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m256d"/>
+	<parameter varname="imm8" type="int"/>
+	<description>Shuffle double-precision (64-bit) floating-point elements in "a" across lanes using the control in "imm8", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+	<operation>
+SELECT4(src, control){
+	CASE(control[1:0])
+	0:	tmp[63:0] := src[63:0]
+	1:	tmp[63:0] := src[127:64]
+	2:	tmp[63:0] := src[191:128]
+	3:	tmp[63:0] := src[255:192]
+	ESAC
+	RETURN tmp[63:0]
+}
+
+tmp_dst[63:0] := SELECT4(a[255:0], imm8[1:0])
+tmp_dst[127:64] := SELECT4(a[255:0], imm8[3:2])
+tmp_dst[191:128] := SELECT4(a[255:0], imm8[5:4])
+tmp_dst[255:192] := SELECT4(a[255:0], imm8[7:6])
+FOR j := 0 to 3
+	i := j*64
+	IF k[j]
+		dst[i+63:i] := tmp_dst[i+63:i]
+	ELSE
+		dst[i+63:i] := 0
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vpermpd"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256d" name="_mm256_maskz_permutexvar_pd">
+	<type>Floating Point</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="idx" type="__m256i"/>
+	<parameter varname="a" type="__m256d"/>
+	<description>Shuffle double-precision (64-bit) floating-point elements in "a" across lanes using the corresponding index in "idx", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 3
+	i := j*64
+	id := idx[i+1:i]*64
+	IF k[j]
+		dst[i+63:i] := a[id+63:id]
+	ELSE
+		dst[i+63:i] := 0
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vpermpd"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256d" name="_mm256_permutex_pd">
+	<type>Floating Point</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="a" type="__m256d"/>
+	<parameter varname="imm8" type="int"/>
+	<description>Shuffle double-precision (64-bit) floating-point elements in "a" across lanes using the control in "imm8", and store the results in "dst".</description>
+	<operation>
+SELECT4(src, control){
+	CASE(control[1:0])
+	0:	tmp[63:0] := src[63:0]
+	1:	tmp[63:0] := src[127:64]
+	2:	tmp[63:0] := src[191:128]
+	3:	tmp[63:0] := src[255:192]
+	ESAC
+	RETURN tmp[63:0]
+}
+
+dst[63:0] := SELECT4(a[255:0], imm8[1:0])
+dst[127:64] := SELECT4(a[255:0], imm8[3:2])
+dst[191:128] := SELECT4(a[255:0], imm8[5:4])
+dst[255:192] := SELECT4(a[255:0], imm8[7:6])
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vpermpd"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256d" name="_mm256_permutexvar_pd">
+	<type>Floating Point</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="idx" type="__m256i"/>
+	<parameter varname="a" type="__m256d"/>
+	<description>Shuffle double-precision (64-bit) floating-point elements in "a" across lanes using the corresponding index in "idx", and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 3
+	i := j*64
+	id := idx[i+1:i]*64
+	dst[i+63:i] := a[id+63:id]
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vpermpd"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256" name="_mm256_mask_permutexvar_ps">
+	<type>Floating Point</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="src" type="__m256"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="idx" type="__m256i"/>
+	<parameter varname="a" type="__m256"/>
+	<description>Shuffle single-precision (32-bit) floating-point elements in "a" across lanes using the corresponding index in "idx", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*32
+	id := idx[i+2:i]*32
+	IF k[j]
+		dst[i+31:i] := a[id+31:id]
+	ELSE
+		dst[i+31:i] := src[i+31:i]
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vpermps"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256" name="_mm256_maskz_permutexvar_ps">
+	<type>Floating Point</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="idx" type="__m256i"/>
+	<parameter varname="a" type="__m256"/>
+	<description>Shuffle single-precision (32-bit) floating-point elements in "a" across lanes using the corresponding index in "idx", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*32
+	id := idx[i+2:i]*32
+	IF k[j]
+		dst[i+31:i] := a[id+31:id]
+	ELSE
+		dst[i+31:i] := 0
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vpermps"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256" name="_mm256_permutexvar_ps">
+	<type>Floating Point</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="idx" type="__m256i"/>
+	<parameter varname="a" type="__m256"/>
+	<description>Shuffle single-precision (32-bit) floating-point elements in "a" across lanes using the corresponding index in "idx", and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*32
+	id := idx[i+2:i]*32
+	dst[i+31:i] := a[id+31:id]
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vpermps"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256i" name="_mm256_mask_permutex_epi64">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="src" type="__m256i"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m256i"/>
+	<parameter varname="imm8" type="const int"/>
+	<description>Shuffle 64-bit integers in "a" across lanes using the control in "imm8", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+	<operation>
+SELECT4(src, control){
+	CASE(control[1:0])
+	0:	tmp[63:0] := src[63:0]
+	1:	tmp[63:0] := src[127:64]
+	2:	tmp[63:0] := src[191:128]
+	3:	tmp[63:0] := src[255:192]
+	ESAC
+	RETURN tmp[63:0]
+}
+
+tmp_dst[63:0] := SELECT4(a[255:0], imm8[1:0])
+tmp_dst[127:64] := SELECT4(a[255:0], imm8[3:2])
+tmp_dst[191:128] := SELECT4(a[255:0], imm8[5:4])
+tmp_dst[255:192] := SELECT4(a[255:0], imm8[7:6])
+FOR j := 0 to 3
+	i := j*64
+	IF k[j]
+		dst[i+63:i] := tmp_dst[i+63:i]
+	ELSE
+		dst[i+63:i] := src[i+63:i]
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vpermq"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256i" name="_mm256_mask_permutexvar_epi64">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="src" type="__m256i"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="idx" type="__m256i"/>
+	<parameter varname="a" type="__m256i"/>
+	<description>Shuffle 64-bit integers in "a" across lanes using the corresponding index in "idx", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 3
+	i := j*64
+	id := idx[i+1:i]*64
+	IF k[j]
+		dst[i+63:i] := a[id+63:id]
+	ELSE
+		dst[i+63:i] := src[i+63:i]
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vpermq"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256i" name="_mm256_maskz_permutex_epi64">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m256i"/>
+	<parameter varname="imm8" type="const int"/>
+	<description>Shuffle 64-bit integers in "a" across lanes using the control in "imm8", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+	<operation>
+SELECT4(src, control){
+	CASE(control[1:0])
+	0:	tmp[63:0] := src[63:0]
+	1:	tmp[63:0] := src[127:64]
+	2:	tmp[63:0] := src[191:128]
+	3:	tmp[63:0] := src[255:192]
+	ESAC
+	RETURN tmp[63:0]
+}
+
+tmp_dst[63:0] := SELECT4(a[255:0], imm8[1:0])
+tmp_dst[127:64] := SELECT4(a[255:0], imm8[3:2])
+tmp_dst[191:128] := SELECT4(a[255:0], imm8[5:4])
+tmp_dst[255:192] := SELECT4(a[255:0], imm8[7:6])
+FOR j := 0 to 3
+	i := j*64
+	IF k[j]
+		dst[i+63:i] := tmp_dst[i+63:i]
+	ELSE
+		dst[i+63:i] := 0
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vpermq"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256i" name="_mm256_maskz_permutexvar_epi64">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="idx" type="__m256i"/>
+	<parameter varname="a" type="__m256i"/>
+	<description>Shuffle 64-bit integers in "a" across lanes using the corresponding index in "idx", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 3
+	i := j*64
+	id := idx[i+1:i]*64
+	IF k[j]
+		dst[i+63:i] := a[id+63:id]
+	ELSE
+		dst[i+63:i] := 0
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vpermq"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256i" name="_mm256_permutex_epi64">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="a" type="__m256i"/>
+	<parameter varname="imm8" type="const int"/>
+	<description>Shuffle 64-bit integers in "a" across lanes using the control in "imm8", and store the results in "dst".</description>
+	<operation>
+SELECT4(src, control){
+	CASE(control[1:0])
+	0:	tmp[63:0] := src[63:0]
+	1:	tmp[63:0] := src[127:64]
+	2:	tmp[63:0] := src[191:128]
+	3:	tmp[63:0] := src[255:192]
+	ESAC
+	RETURN tmp[63:0]
+}
+
+dst[63:0] := SELECT4(a[255:0], imm8[1:0])
+dst[127:64] := SELECT4(a[255:0], imm8[3:2])
+dst[191:128] := SELECT4(a[255:0], imm8[5:4])
+dst[255:192] := SELECT4(a[255:0], imm8[7:6])
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vpermq"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256i" name="_mm256_permutexvar_epi64">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="idx" type="__m256i"/>
+	<parameter varname="a" type="__m256i"/>
+	<description>Shuffle 64-bit integers in "a" across lanes using the corresponding index in "idx", and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 3
+	i := j*64
+	id := idx[i+1:i]*64
+	dst[i+63:i] := a[id+63:id]
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vpermq"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256i" name="_mm256_mask_permutexvar_epi16">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512BW</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="src" type="__m256i"/>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="idx" type="__m256i"/>
+	<parameter varname="a" type="__m256i"/>
+	<description>Shuffle 16-bit integers in "a" across lanes using the corresponding index in "idx", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 15
+	i := j*16
+	id := idx[i+3:i]*16
+	IF k[j]
+		dst[i+15:i] := a[id+15:id]
+	ELSE
+		dst[i+15:i] := src[i+15:i]
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vpermw"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256i" name="_mm256_maskz_permutexvar_epi16">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512BW</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="idx" type="__m256i"/>
+	<parameter varname="a" type="__m256i"/>
+	<description>Shuffle 16-bit integers in "a" across lanes using the corresponding index in "idx", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 15
+	i := j*16
+	id := idx[i+3:i]*16
+	IF k[j]
+		dst[i+15:i] := a[id+15:id]
+	ELSE
+		dst[i+15:i] := 0
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vpermw"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256i" name="_mm256_permutexvar_epi16">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512BW</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="idx" type="__m256i"/>
+	<parameter varname="a" type="__m256i"/>
+	<description>Shuffle 16-bit integers in "a" across lanes using the corresponding index in "idx", and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 15
+	i := j*16
+	id := idx[i+3:i]*16
+	dst[i+15:i] := a[id+15:id]
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vpermw"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512i" name="_mm512_mask_permutexvar_epi16">
+	<type>Integer</type>
+	<CPUID>AVX512BW</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="src" type="__m512i"/>
+	<parameter varname="k" type="__mmask32"/>
+	<parameter varname="idx" type="__m512i"/>
+	<parameter varname="a" type="__m512i"/>
+	<description>Shuffle 16-bit integers in "a" across lanes using the corresponding index in "idx", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 31
+	i := j*16
+	id := idx[i+4:i]*16
+	IF k[j]
+		dst[i+15:i] := a[id+15:id]
+	ELSE
+		dst[i+15:i] := src[i+15:i]
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name="vpermw"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512i" name="_mm512_maskz_permutexvar_epi16">
+	<type>Integer</type>
+	<CPUID>AVX512BW</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="k" type="__mmask32"/>
+	<parameter varname="idx" type="__m512i"/>
+	<parameter varname="a" type="__m512i"/>
+	<description>Shuffle 16-bit integers in "a" across lanes using the corresponding index in "idx", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 31
+	i := j*16
+	id := idx[i+4:i]*16
+	IF k[j]
+		dst[i+15:i] := a[id+15:id]
+	ELSE
+		dst[i+15:i] := 0
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name="vpermw"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512i" name="_mm512_permutexvar_epi16">
+	<type>Integer</type>
+	<CPUID>AVX512BW</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="idx" type="__m512i"/>
+	<parameter varname="a" type="__m512i"/>
+	<description>Shuffle 16-bit integers in "a" across lanes using the corresponding index in "idx", and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 31
+	i := j*16
+	id := idx[i+4:i]*16
+	dst[i+15:i] := a[id+15:id]
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name="vpermw"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm_mask_permutexvar_epi16">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512BW</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="src" type="__m128i"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="idx" type="__m128i"/>
+	<parameter varname="a" type="__m128i"/>
+	<description>Shuffle 16-bit integers in "a" using the corresponding index in "idx", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*16
+	id := idx[i+2:i]*16
+	IF k[j]
+		dst[i+15:i] := a[id+15:id]
+	ELSE
+		dst[i+15:i] := src[i+15:i]
+	FI
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vpermw"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm_maskz_permutexvar_epi16">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512BW</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="idx" type="__m128i"/>
+	<parameter varname="a" type="__m128i"/>
+	<description>Shuffle 16-bit integers in "a" using the corresponding index in "idx", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*16
+	id := idx[i+2:i]*16
+	IF k[j]
+		dst[i+15:i] := a[id+15:id]
+	ELSE
+		dst[i+15:i] := 0
+	FI
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vpermw"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm_permutexvar_epi16">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512BW</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="idx" type="__m128i"/>
+	<parameter varname="a" type="__m128i"/>
+	<description>Shuffle 16-bit integers in "a" using the corresponding index in "idx", and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*16
+	id := idx[i+2:i]*16
+	dst[i+15:i] := a[id+15:id]
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vpermw"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256i" name="_mm256_mask_expand_epi32">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="src" type="__m256i"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m256i"/>
+	<description>Load contiguous active 32-bit integers from "a" (those with their respective bit set in mask "k"), and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+	<operation>
+m := 0
+FOR j := 0 to 7
+	i := j*32
+	IF k[j]
+		dst[i+31:i] := a[m+31:m]
+		m := m + 32
+	ELSE
+		dst[i+31:i] := src[i+31:i]
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vpexpandd"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256i" name="_mm256_mask_expandloadu_epi32">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Load</category>
+	<parameter varname="src" type="__m256i"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="mem_addr" type="void const*"/>
+	<description>Load contiguous active 32-bit integers from unaligned memory at "mem_addr" (those with their respective bit set in mask "k"), and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+	<operation>
+m := 0
+FOR j := 0 to 7
+	i := j*32
+	IF k[j]
+		dst[i+31:i] := MEM[mem_addr+m+31:mem_addr+m]
+		m := m + 32
+	ELSE
+		dst[i+31:i] := src[i+31:i]
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vpexpandd"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256i" name="_mm256_maskz_expand_epi32">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m256i"/>
+	<description>Load contiguous active 32-bit integers from "a" (those with their respective bit set in mask "k"), and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+	<operation>
+m := 0
+FOR j := 0 to 7
+	i := j*32
+	IF k[j]
+		dst[i+31:i] := a[m+31:m]
+		m := m + 32
+	ELSE
+		dst[i+31:i] := 0
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vpexpandd"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256i" name="_mm256_maskz_expandloadu_epi32">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Load</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="mem_addr" type="void const*"/>
+	<description>Load contiguous active 32-bit integers from unaligned memory at "mem_addr" (those with their respective bit set in mask "k"), and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+	<operation>
+m := 0
+FOR j := 0 to 7
+	i := j*32
+	IF k[j]
+		dst[i+31:i] := MEM[mem_addr+m+31:mem_addr+m]
+		m := m + 32
+	ELSE
+		dst[i+31:i] := 0
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vpexpandd"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm_mask_expand_epi32">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="src" type="__m128i"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128i"/>
+	<description>Load contiguous active 32-bit integers from "a" (those with their respective bit set in mask "k"), and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+	<operation>
+m := 0
+FOR j := 0 to 3
+	i := j*32
+	IF k[j]
+		dst[i+31:i] := a[m+31:m]
+		m := m + 32
+	ELSE
+		dst[i+31:i] := src[i+31:i]
+	FI
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vpexpandd"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm_mask_expandloadu_epi32">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Load</category>
+	<parameter varname="src" type="__m128i"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="mem_addr" type="void const*"/>
+	<description>Load contiguous active 32-bit integers from unaligned memory at "mem_addr" (those with their respective bit set in mask "k"), and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+	<operation>
+m := 0
+FOR j := 0 to 3
+	i := j*32
+	IF k[j]
+		dst[i+31:i] := MEM[mem_addr+m+31:mem_addr+m]
+		m := m + 32
+	ELSE
+		dst[i+31:i] := src[i+31:i]
+	FI
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vpexpandd"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm_maskz_expand_epi32">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128i"/>
+	<description>Load contiguous active 32-bit integers from "a" (those with their respective bit set in mask "k"), and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+	<operation>
+m := 0
+FOR j := 0 to 3
+	i := j*32
+	IF k[j]
+		dst[i+31:i] := a[m+31:m]
+		m := m + 32
+	ELSE
+		dst[i+31:i] := 0
+	FI
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vpexpandd"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm_maskz_expandloadu_epi32">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Load</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="mem_addr" type="void const*"/>
+	<description>Load contiguous active 32-bit integers from unaligned memory at "mem_addr" (those with their respective bit set in mask "k"), and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+	<operation>
+m := 0
+FOR j := 0 to 3
+	i := j*32
+	IF k[j]
+		dst[i+31:i] := MEM[mem_addr+m+31:mem_addr+m]
+		m := m + 32
+	ELSE
+		dst[i+31:i] := 0
+	FI
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vpexpandd"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256i" name="_mm256_mask_expand_epi64">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="src" type="__m256i"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m256i"/>
+	<description>Load contiguous active 64-bit integers from "a" (those with their respective bit set in mask "k"), and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+	<operation>
+m := 0
+FOR j := 0 to 3
+	i := j*64
+	IF k[j]
+		dst[i+63:i] := a[m+63:m]
+		m := m + 64
+	ELSE
+		dst[i+63:i] := src[i+63:i]
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vpexpandq"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256i" name="_mm256_mask_expandloadu_epi64">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Load</category>
+	<parameter varname="src" type="__m256i"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="mem_addr" type="void const*"/>
+	<description>Load contiguous active 64-bit integers from unaligned memory at "mem_addr" (those with their respective bit set in mask "k"), and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+	<operation>
+m := 0
+FOR j := 0 to 3
+	i := j*64
+	IF k[j]
+		dst[i+63:i] := MEM[mem_addr+m+63:mem_addr+m]
+		m := m + 64
+	ELSE
+		dst[i+63:i] := src[i+63:i]
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vpexpandq"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256i" name="_mm256_maskz_expand_epi64">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m256i"/>
+	<description>Load contiguous active 64-bit integers from "a" (those with their respective bit set in mask "k"), and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+	<operation>
+m := 0
+FOR j := 0 to 3
+	i := j*64
+	IF k[j]
+		dst[i+63:i] := a[m+63:m]
+		m := m + 64
+	ELSE
+		dst[i+63:i] := 0
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vpexpandq"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256i" name="_mm256_maskz_expandloadu_epi64">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Load</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="mem_addr" type="void const*"/>
+	<description>Load contiguous active 64-bit integers from unaligned memory at "mem_addr" (those with their respective bit set in mask "k"), and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+	<operation>
+m := 0
+FOR j := 0 to 3
+	i := j*64
+	IF k[j]
+		dst[i+63:i] := MEM[mem_addr+m+63:mem_addr+m]
+		m := m + 64
+	ELSE
+		dst[i+63:i] := 0
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vpexpandq"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm_mask_expand_epi64">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="src" type="__m128i"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128i"/>
+	<description>Load contiguous active 64-bit integers from "a" (those with their respective bit set in mask "k"), and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+	<operation>
+m := 0
+FOR j := 0 to 1
+	i := j*64
+	IF k[j]
+		dst[i+63:i] := a[m+63:m]
+		m := m + 64
+	ELSE
+		dst[i+63:i] := src[i+63:i]
+	FI
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vpexpandq"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm_mask_expandloadu_epi64">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Load</category>
+	<parameter varname="src" type="__m128i"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="mem_addr" type="void const*"/>
+	<description>Load contiguous active 64-bit integers from unaligned memory at "mem_addr" (those with their respective bit set in mask "k"), and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+	<operation>
+m := 0
+FOR j := 0 to 1
+	i := j*64
+	IF k[j]
+		dst[i+63:i] := MEM[mem_addr+m+63:mem_addr+m]
+		m := m + 64
+	ELSE
+		dst[i+63:i] := src[i+63:i]
+	FI
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vpexpandq"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm_maskz_expand_epi64">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128i"/>
+	<description>Load contiguous active 64-bit integers from "a" (those with their respective bit set in mask "k"), and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+	<operation>
+m := 0
+FOR j := 0 to 1
+	i := j*64
+	IF k[j]
+		dst[i+63:i] := a[m+63:m]
+		m := m + 64
+	ELSE
+		dst[i+63:i] := 0
+	FI
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vpexpandq"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm_maskz_expandloadu_epi64">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Load</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="mem_addr" type="void const*"/>
+	<description>Load contiguous active 64-bit integers from unaligned memory at "mem_addr" (those with their respective bit set in mask "k"), and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+	<operation>
+m := 0
+FOR j := 0 to 1
+	i := j*64
+	IF k[j]
+		dst[i+63:i] := MEM[mem_addr+m+63:mem_addr+m]
+		m := m + 64
+	ELSE
+		dst[i+63:i] := 0
+	FI
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vpexpandq"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256i" name="_mm256_mmask_i32gather_epi32">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Load</category>
+	<parameter varname="src" type="__m256i"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="vindex" type="__m256i"/>
+	<parameter varname="base_addr" type="void const*"/>
+	<parameter varname="scale" type="const int"/>
+	<description>
+	Gather 32-bit integers from memory using 32-bit indices. 32-bit elements are loaded from addresses starting at "base_addr" and offset by each 32-bit element in "vindex" (each index is scaled by the factor in "scale"). Gathered elements are merged into "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). "scale" should be 1, 2, 4 or 8.
+	</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*32
+	IF k[j]
+		dst[i+31:i] := MEM[base_addr + SignExtend(vindex[i+31:i])*scale]
+		k[j] := 0
+	ELSE
+		dst[i+31:i] := src[i+31:i]
+	FI
+ENDFOR
+k[MAX:8] := 0
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vpgatherdd"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm_mmask_i32gather_epi32">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Load</category>
+	<parameter varname="src" type="__m128i"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="vindex" type="__m128i"/>
+	<parameter varname="base_addr" type="void const*"/>
+	<parameter varname="scale" type="const int"/>
+	<description>
+	Gather 32-bit integers from memory using 32-bit indices. 32-bit elements are loaded from addresses starting at "base_addr" and offset by each 32-bit element in "vindex" (each index is scaled by the factor in "scale"). Gathered elements are merged into "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). "scale" should be 1, 2, 4 or 8.
+	</description>
+	<operation>
+FOR j := 0 to 3
+	i := j*32
+	IF k[j]
+		dst[i+31:i] := MEM[base_addr + SignExtend(vindex[i+31:i])*scale]
+		k[j] := 0
+	ELSE
+		dst[i+31:i] := src[i+31:i]
+	FI
+ENDFOR
+k[MAX:4] := 0
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vpgatherdd"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256i" name="_mm256_mmask_i32gather_epi64">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Load</category>
+	<parameter varname="src" type="__m256i"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="vindex" type="__m128i"/>
+	<parameter varname="base_addr" type="void const*"/>
+	<parameter varname="scale" type="const int"/>
+	<description>Gather 64-bit integers from memory using 32-bit indices. 64-bit elements are loaded from addresses starting at "base_addr" and offset by each 32-bit element in "vindex" (each index is scaled by the factor in "scale"). Gathered elements are merged into "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). "scale" should be 1, 2, 4 or 8.
+	</description>
+	<operation>
+FOR j := 0 to 3
+	i := j*64
+	m := j*32
+	IF k[j]
+		dst[i+63:i] := MEM[base_addr + SignExtend(vindex[m+31:m])*scale]
+		k[j] := 0
+	ELSE
+		dst[i+63:i] := src[i+63:i]
+	FI
+ENDFOR
+k[MAX:4] := 0
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vpgatherdq"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm_mmask_i32gather_epi64">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Load</category>
+	<parameter varname="src" type="__m128i"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="vindex" type="__m128i"/>
+	<parameter varname="base_addr" type="void const*"/>
+	<parameter varname="scale" type="const int"/>
+	<description>Gather 64-bit integers from memory using 32-bit indices. 64-bit elements are loaded from addresses starting at "base_addr" and offset by each 32-bit element in "vindex" (each index is scaled by the factor in "scale"). Gathered elements are merged into "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). "scale" should be 1, 2, 4 or 8.
+	</description>
+	<operation>
+FOR j := 0 to 1
+	i := j*64
+	m := j*32
+	IF k[j]
+		dst[i+63:i] := MEM[base_addr + SignExtend(vindex[m+31:m])*scale]
+		k[j] := 0
+	ELSE
+		dst[i+63:i] := src[i+63:i]
+	FI
+ENDFOR
+k[MAX:2] := 0
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vpgatherdq"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm256_mmask_i64gather_epi32">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Load</category>
+	<parameter varname="src" type="__m128i"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="vindex" type="__m256i"/>
+	<parameter varname="base_addr" type="void const*"/>
+	<parameter varname="scale" type="const int"/>
+	<description>Gather 32-bit integers from memory using 64-bit indices. 32-bit elements are loaded from addresses starting at "base_addr" and offset by each 64-bit element in "vindex" (each index is scaled by the factor in "scale"). Gathered elements are merged into "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). "scale" should be 1, 2, 4 or 8.
+	</description>
+	<operation>
+FOR j := 0 to 3
+	i := j*32
+	m := j*64
+	IF k[j]
+		dst[i+31:i] := MEM[base_addr + SignExtend(vindex[m+63:m])*scale]
+		k[j] := 0
+	ELSE
+		dst[i+31:i] := src[i+31:i]
+	FI
+ENDFOR
+k[MAX:4] := 0
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vpgatherqd"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm_mmask_i64gather_epi32">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Load</category>
+	<parameter varname="src" type="__m128i"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="vindex" type="__m128i"/>
+	<parameter varname="base_addr" type="void const*"/>
+	<parameter varname="scale" type="const int"/>
+	<description>Gather 32-bit integers from memory using 64-bit indices. 32-bit elements are loaded from addresses starting at "base_addr" and offset by each 64-bit element in "vindex" (each index is scaled by the factor in "scale"). Gathered elements are merged into "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). "scale" should be 1, 2, 4 or 8.
+	</description>
+	<operation>
+FOR j := 0 to 1
+	i := j*32
+	m := j*64
+	IF k[j]
+		dst[i+31:i] := MEM[base_addr + SignExtend(vindex[m+63:m])*scale]
+		k[j] := 0
+	ELSE
+		dst[i+31:i] := src[i+31:i]
+	FI
+ENDFOR
+k[MAX:2] := 0
+dst[MAX:64] := 0
+	</operation>
+	<instruction name="vpgatherqd"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256i" name="_mm256_mmask_i64gather_epi64">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Load</category>
+	<parameter varname="src" type="__m256i"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="vindex" type="__m256i"/>
+	<parameter varname="base_addr" type="void const*"/>
+	<parameter varname="scale" type="const int"/>
+	<description>Gather 64-bit integers from memory using 64-bit indices. 64-bit elements are loaded from addresses starting at "base_addr" and offset by each 64-bit element in "vindex" (each index is scaled by the factor in "scale"). Gathered elements are merged into "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). "scale" should be 1, 2, 4 or 8.
+	</description>
+	<operation>
+FOR j := 0 to 3
+	i := j*64
+	IF k[j]
+		dst[i+63:i] := MEM[base_addr + SignExtend(vindex[i+63:i])*scale]
+		k[j] := 0
+	ELSE
+		dst[i+63:i] := src[i+63:i]
+	FI
+ENDFOR
+k[MAX:4] := 0
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vpgatherqq"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm_mmask_i64gather_epi64">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Load</category>
+	<parameter varname="src" type="__m128i"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="vindex" type="__m128i"/>
+	<parameter varname="base_addr" type="void const*"/>
+	<parameter varname="scale" type="const int"/>
+	<description>Gather 64-bit integers from memory using 64-bit indices. 64-bit elements are loaded from addresses starting at "base_addr" and offset by each 64-bit element in "vindex" (each index is scaled by the factor in "scale"). Gathered elements are merged into "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). "scale" should be 1, 2, 4 or 8.
+	</description>
+	<operation>
+FOR j := 0 to 1
+	i := j*64
+	IF k[j]
+		dst[i+63:i] := MEM[base_addr + SignExtend(vindex[i+63:i])*scale]
+		k[j] := 0
+	ELSE
+		dst[i+63:i] := src[i+63:i]
+	FI
+ENDFOR
+k[MAX:2] := 0
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vpgatherqq"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256i" name="_mm256_lzcnt_epi32">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512CD</CPUID>
+	<category>Bit Manipulation</category>
+	<parameter varname="a" type="__m256i"/>
+	<description>Counts the number of leading zero bits in each packed 32-bit integer in "a", and store the results in "dst". </description>
+	<operation>
+FOR j := 0 to 7
+	i := j*32
+	tmp := 31
+	dst[i+31:i] := 0
+	DO WHILE (tmp &gt;= 0 AND a[i+tmp] == 0)
+		tmp := tmp - 1
+		dst[i+31:i] := dst[i+31:i] + 1
+	OD
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vplzcntd"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256i" name="_mm256_mask_lzcnt_epi32">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512CD</CPUID>
+	<category>Bit Manipulation</category>
+	<parameter varname="src" type="__m256i"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m256i"/>
+	<description>Counts the number of leading zero bits in each packed 32-bit integer in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 7
+	i := j*32
+	IF k[j]
+		tmp := 31
+		dst[i+31:i] := 0
+		DO WHILE (tmp &gt;= 0 AND a[i+tmp] == 0)
+			tmp := tmp - 1
+			dst[i+31:i] := dst[i+31:i] + 1
+		OD
+	ELSE
+		dst[i+31:i] := src[i+31:i]
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vplzcntd"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256i" name="_mm256_maskz_lzcnt_epi32">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512CD</CPUID>
+	<category>Bit Manipulation</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m256i"/>
+	<description>Counts the number of leading zero bits in each packed 32-bit integer in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 7
+	i := j*32
+	IF k[j]
+		tmp := 31
+		dst[i+31:i] := 0
+		DO WHILE (tmp &gt;= 0 AND a[i+tmp] == 0)
+			tmp := tmp - 1
+			dst[i+31:i] := dst[i+31:i] + 1
+		OD
+	ELSE
+		dst[i+31:i] := 0
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vplzcntd"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm_lzcnt_epi32">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512CD</CPUID>
+	<category>Bit Manipulation</category>
+	<parameter varname="a" type="__m128i"/>
+	<description>Counts the number of leading zero bits in each packed 32-bit integer in "a", and store the results in "dst". </description>
+	<operation>
+FOR j := 0 to 3
+	i := j*32
+	tmp := 31
+	dst[i+31:i] := 0
+	DO WHILE (tmp &gt;= 0 AND a[i+tmp] == 0)
+		tmp := tmp - 1
+		dst[i+31:i] := dst[i+31:i] + 1
+	OD
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vplzcntd"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm_mask_lzcnt_epi32">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512CD</CPUID>
+	<category>Bit Manipulation</category>
+	<parameter varname="src" type="__m128i"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128i"/>
+	<description>Counts the number of leading zero bits in each packed 32-bit integer in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 3
+	i := j*32
+	IF k[j]
+		tmp := 31
+		dst[i+31:i] := 0
+		DO WHILE (tmp &gt;= 0 AND a[i+tmp] == 0)
+			tmp := tmp - 1
+			dst[i+31:i] := dst[i+31:i] + 1
+		OD
+	ELSE
+		dst[i+31:i] := src[i+31:i]
+	FI
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vplzcntd"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm_maskz_lzcnt_epi32">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512CD</CPUID>
+	<category>Bit Manipulation</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128i"/>
+	<description>Counts the number of leading zero bits in each packed 32-bit integer in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 3
+	i := j*32
+	IF k[j]
+		tmp := 31
+		dst[i+31:i] := 0
+		DO WHILE (tmp &gt;= 0 AND a[i+tmp] == 0)
+			tmp := tmp - 1
+			dst[i+31:i] := dst[i+31:i] + 1
+		OD
+	ELSE
+		dst[i+31:i] := 0
+	FI
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vplzcntd"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256i" name="_mm256_lzcnt_epi64">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512CD</CPUID>
+	<category>Bit Manipulation</category>
+	<parameter varname="a" type="__m256i"/>
+	<description>Counts the number of leading zero bits in each packed 64-bit integer in "a", and store the results in "dst". </description>
+	<operation>
+FOR j := 0 to 3
+	i := j*64
+	tmp := 63
+	dst[i+63:i] := 0
+	DO WHILE (tmp &gt;= 0 AND a[i+tmp] == 0)
+		tmp := tmp - 1
+		dst[i+63:i] := dst[i+63:i] + 1
+	OD
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vplzcntq"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256i" name="_mm256_mask_lzcnt_epi64">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512CD</CPUID>
+	<category>Bit Manipulation</category>
+	<parameter varname="src" type="__m256i"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m256i"/>
+	<description>Counts the number of leading zero bits in each packed 64-bit integer in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 3
+	i := j*64
+	IF k[j]
+		tmp := 63
+		dst[i+63:i] := 0
+		DO WHILE (tmp &gt;= 0 AND a[i+tmp] == 0)
+			tmp := tmp - 1
+			dst[i+63:i] := dst[i+63:i] + 1
+		OD
+	ELSE
+		dst[i+63:i] := src[i+63:i]
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vplzcntq"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256i" name="_mm256_maskz_lzcnt_epi64">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512CD</CPUID>
+	<category>Bit Manipulation</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m256i"/>
+	<description>Counts the number of leading zero bits in each packed 64-bit integer in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 3
+	i := j*64
+	IF k[j]
+		tmp := 63
+		dst[i+63:i] := 0
+		DO WHILE (tmp &gt;= 0 AND a[i+tmp] == 0)
+			tmp := tmp - 1
+			dst[i+63:i] := dst[i+63:i] + 1
+		OD
+	ELSE
+		dst[i+63:i] := 0
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vplzcntq"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm_lzcnt_epi64">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512CD</CPUID>
+	<category>Bit Manipulation</category>
+	<parameter varname="a" type="__m128i"/>
+	<description>Counts the number of leading zero bits in each packed 64-bit integer in "a", and store the results in "dst". </description>
+	<operation>
+FOR j := 0 to 1
+	i := j*64
+	tmp := 63
+	dst[i+63:i] := 0
+	DO WHILE (tmp &gt;= 0 AND a[i+tmp] == 0)
+		tmp := tmp - 1
+		dst[i+63:i] := dst[i+63:i] + 1
+	OD
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vplzcntq"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm_mask_lzcnt_epi64">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512CD</CPUID>
+	<category>Bit Manipulation</category>
+	<parameter varname="src" type="__m128i"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128i"/>
+	<description>Counts the number of leading zero bits in each packed 64-bit integer in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 1
+	i := j*64
+	IF k[j]
+		tmp := 63
+		dst[i+63:i] := 0
+		DO WHILE (tmp &gt;= 0 AND a[i+tmp] == 0)
+			tmp := tmp - 1
+			dst[i+63:i] := dst[i+63:i] + 1
+		OD
+	ELSE
+		dst[i+63:i] := src[i+63:i]
+	FI
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vplzcntq"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm_maskz_lzcnt_epi64">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512CD</CPUID>
+	<category>Bit Manipulation</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128i"/>
+	<description>Counts the number of leading zero bits in each packed 64-bit integer in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 1
+	i := j*64
+	IF k[j]
+		tmp := 63
+		dst[i+63:i] := 0
+		DO WHILE (tmp &gt;= 0 AND a[i+tmp] == 0)
+			tmp := tmp - 1
+			dst[i+63:i] := dst[i+63:i] + 1
+		OD
+	ELSE
+		dst[i+63:i] := 0
+	FI
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vplzcntq"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256i" name="_mm256_mask_maddubs_epi16">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512BW</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="src" type="__m256i"/>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="a" type="__m256i"/>
+	<parameter varname="b" type="__m256i"/>
+	<description>Multiply packed unsigned 8-bit integers in "a" by packed signed 8-bit integers in "b", producing intermediate signed 16-bit integers. Horizontally add adjacent pairs of intermediate signed 16-bit integers, and pack the saturated results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).
+	</description>
+	<operation>
+FOR j := 0 to 15
+	i := j*16
+	IF k[j]
+		dst[i+15:i] := Saturate_To_Int16( a[i+15:i+8]*b[i+15:i+8] + a[i+7:i]*b[i+7:i] )
+	ELSE
+		dst[i+15:i] := src[i+15:i]
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vpmaddubsw"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256i" name="_mm256_maskz_maddubs_epi16">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512BW</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="a" type="__m256i"/>
+	<parameter varname="b" type="__m256i"/>
+	<description>Multiply packed unsigned 8-bit integers in "a" by packed signed 8-bit integers in "b", producing intermediate signed 16-bit integers. Horizontally add adjacent pairs of intermediate signed 16-bit integers, and pack the saturated results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).
+	</description>
+	<operation>
+FOR j := 0 to 15
+	i := j*16
+	IF k[j]
+		dst[i+15:i] := Saturate_To_Int16( a[i+15:i+8]*b[i+15:i+8] + a[i+7:i]*b[i+7:i] )
+	ELSE
+		dst[i+15:i] := 0
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vpmaddubsw"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512i" name="_mm512_maddubs_epi16">
+	<type>Integer</type>
+	<CPUID>AVX512BW</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="a" type="__m512i"/>
+	<parameter varname="b" type="__m512i"/>
+	<description>Vertically multiply each unsigned 8-bit integer from "a" with the corresponding signed 8-bit integer from "b", producing intermediate signed 16-bit integers. Horizontally add adjacent pairs of intermediate signed 16-bit integers, and pack the saturated results in "dst".
+	</description>
+	<operation>
+FOR j := 0 to 31
+	i := j*16
+	dst[i+15:i] := Saturate_To_Int16( a[i+15:i+8]*b[i+15:i+8] + a[i+7:i]*b[i+7:i] )
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name="vpmaddubsw"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512i" name="_mm512_mask_maddubs_epi16">
+	<type>Integer</type>
+	<CPUID>AVX512BW</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="src" type="__m512i"/>
+	<parameter varname="k" type="__mmask32"/>
+	<parameter varname="a" type="__m512i"/>
+	<parameter varname="b" type="__m512i"/>
+	<description>Multiply packed unsigned 8-bit integers in "a" by packed signed 8-bit integers in "b", producing intermediate signed 16-bit integers. Horizontally add adjacent pairs of intermediate signed 16-bit integers, and pack the saturated results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).
+	</description>
+	<operation>
+FOR j := 0 to 31
+	i := j*16
+	IF k[j]
+		dst[i+15:i] := Saturate_To_Int16( a[i+15:i+8]*b[i+15:i+8] + a[i+7:i]*b[i+7:i] )
+	ELSE
+		dst[i+15:i] := src[i+15:i]
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name="vpmaddubsw"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512i" name="_mm512_maskz_maddubs_epi16">
+	<type>Integer</type>
+	<CPUID>AVX512BW</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="k" type="__mmask32"/>
+	<parameter varname="a" type="__m512i"/>
+	<parameter varname="b" type="__m512i"/>
+	<description>Multiply packed unsigned 8-bit integers in "a" by packed signed 8-bit integers in "b", producing intermediate signed 16-bit integers. Horizontally add adjacent pairs of intermediate signed 16-bit integers, and pack the saturated results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).
+	</description>
+	<operation>
+FOR j := 0 to 31
+	i := j*16
+	IF k[j]
+		dst[i+15:i] := Saturate_To_Int16( a[i+15:i+8]*b[i+15:i+8] + a[i+7:i]*b[i+7:i] )
+	ELSE
+		dst[i+15:i] := 0
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name="vpmaddubsw"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm_mask_maddubs_epi16">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512BW</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="src" type="__m128i"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128i"/>
+	<parameter varname="b" type="__m128i"/>
+	<description>Multiply packed unsigned 8-bit integers in "a" by packed signed 8-bit integers in "b", producing intermediate signed 16-bit integers. Horizontally add adjacent pairs of intermediate signed 16-bit integers, and pack the saturated results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).
+	</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*16
+	IF k[j]
+		dst[i+15:i] := Saturate_To_Int16( a[i+15:i+8]*b[i+15:i+8] + a[i+7:i]*b[i+7:i] )
+	ELSE
+		dst[i+15:i] := src[i+15:i]
+	FI
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vpmaddubsw"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm_maskz_maddubs_epi16">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512BW</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128i"/>
+	<parameter varname="b" type="__m128i"/>
+	<description>Multiply packed unsigned 8-bit integers in "a" by packed signed 8-bit integers in "b", producing intermediate signed 16-bit integers. Horizontally add adjacent pairs of intermediate signed 16-bit integers, and pack the saturated results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).
+	</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*16
+	IF k[j]
+		dst[i+15:i] := Saturate_To_Int16( a[i+15:i+8]*b[i+15:i+8] + a[i+7:i]*b[i+7:i] )
+	ELSE
+		dst[i+15:i] := 0
+	FI
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vpmaddubsw"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256i" name="_mm256_mask_madd_epi16">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512BW</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="src" type="__m256i"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m256i"/>
+	<parameter varname="b" type="__m256i"/>
+	<description>Multiply packed 16-bit integers in "a" and "b", producing intermediate 32-bit integers. Horizontally add adjacent pairs of intermediate 32-bit integers, and pack the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).
+	</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*32
+	IF k[j]
+		dst[i+31:i] := a[i+31:i+16]*b[i+31:i+16] + a[i+15:i]*b[i+15:i]
+	ELSE
+		dst[i+31:i] := src[i+31:i]
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vpmaddwd"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256i" name="_mm256_maskz_madd_epi16">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512BW</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m256i"/>
+	<parameter varname="b" type="__m256i"/>
+	<description>Multiply packed 16-bit integers in "a" and "b", producing intermediate 32-bit integers. Horizontally add adjacent pairs of intermediate 32-bit integers, and pack the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).
+	</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*32
+	IF k[j]
+		dst[i+31:i] := a[i+31:i+16]*b[i+31:i+16] + a[i+15:i]*b[i+15:i]
+	ELSE
+		dst[i+31:i] := 0
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vpmaddwd"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512i" name="_mm512_madd_epi16">
+	<type>Integer</type>
+	<CPUID>AVX512BW</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="a" type="__m512i"/>
+	<parameter varname="b" type="__m512i"/>
+	<description>Multiply packed 16-bit integers in "a" and "b", producing intermediate 32-bit integers. Horizontally add adjacent pairs of intermediate 32-bit integers, and pack the results in "dst".
+	</description>
+	<operation>
+FOR j := 0 to 15
+	i := j*32
+	dst[i+31:i] := a[i+31:i+16]*b[i+31:i+16] + a[i+15:i]*b[i+15:i]
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name="vpmaddwd"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512i" name="_mm512_mask_madd_epi16">
+	<type>Integer</type>
+	<CPUID>AVX512BW</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="src" type="__m512i"/>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="a" type="__m512i"/>
+	<parameter varname="b" type="__m512i"/>
+	<description>Multiply packed 16-bit integers in "a" and "b", producing intermediate 32-bit integers. Horizontally add adjacent pairs of intermediate 32-bit integers, and pack the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).
+	</description>
+	<operation>
+FOR j := 0 to 15
+	i := j*32
+	IF k[j]
+		dst[i+31:i] := a[i+31:i+16]*b[i+31:i+16] + a[i+15:i]*b[i+15:i]
+	ELSE
+		dst[i+31:i] := src[i+31:i]
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name="vpmaddwd"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512i" name="_mm512_maskz_madd_epi16">
+	<type>Integer</type>
+	<CPUID>AVX512BW</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="a" type="__m512i"/>
+	<parameter varname="b" type="__m512i"/>
+	<description>Multiply packed 16-bit integers in "a" and "b", producing intermediate 32-bit integers. Horizontally add adjacent pairs of intermediate 32-bit integers, and pack the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).
+	</description>
+	<operation>
+FOR j := 0 to 15
+	i := j*32
+	IF k[j]
+		dst[i+31:i] := a[i+31:i+16]*b[i+31:i+16] + a[i+15:i]*b[i+15:i]
+	ELSE
+		dst[i+31:i] := 0
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name="vpmaddwd"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm_mask_madd_epi16">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512BW</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="src" type="__m128i"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128i"/>
+	<parameter varname="b" type="__m128i"/>
+	<description>Multiply packed 16-bit integers in "a" and "b", producing intermediate 32-bit integers. Horizontally add adjacent pairs of intermediate 32-bit integers, and pack the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).
+	</description>
+	<operation>
+FOR j := 0 to 3
+	i := j*32
+	IF k[j]
+		dst[i+31:i] := a[i+31:i+16]*b[i+31:i+16] + a[i+15:i]*b[i+15:i]
+	ELSE
+		dst[i+31:i] := src[i+31:i]
+	FI
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vpmaddwd"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm_maskz_madd_epi16">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512BW</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128i"/>
+	<parameter varname="b" type="__m128i"/>
+	<description>Multiply packed 16-bit integers in "a" and "b", producing intermediate 32-bit integers. Horizontally add adjacent pairs of intermediate 32-bit integers, and pack the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).
+	</description>
+	<operation>
+FOR j := 0 to 3
+	i := j*32
+	IF k[j]
+		dst[i+31:i] := a[i+31:i+16]*b[i+31:i+16] + a[i+15:i]*b[i+15:i]
+	ELSE
+		dst[i+31:i] := 0
+	FI
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vpmaddwd"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256i" name="_mm256_mask_max_epi8">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512BW</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="src" type="__m256i"/>
+	<parameter varname="k" type="__mmask32"/>
+	<parameter varname="a" type="__m256i"/>
+	<parameter varname="b" type="__m256i"/>
+	<description>Compare packed 8-bit integers in "a" and "b", and store packed maximum values in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 31
+	i := j*8
+	IF k[j]
+		IF a[i+7:i] &gt; b[i+7:i]
+			dst[i+7:i] := a[i+7:i]
+		ELSE
+			dst[i+7:i] := b[i+7:i]
+		FI
+	ELSE
+		dst[i+7:i] := src[i+7:i]
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vpmaxsb"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256i" name="_mm256_maskz_max_epi8">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512BW</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="k" type="__mmask32"/>
+	<parameter varname="a" type="__m256i"/>
+	<parameter varname="b" type="__m256i"/>
+	<description>Compare packed 8-bit integers in "a" and "b", and store packed maximum values in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 31
+	i := j*8
+	IF k[j]
+		IF a[i+7:i] &gt; b[i+7:i]
+			dst[i+7:i] := a[i+7:i]
+		ELSE
+			dst[i+7:i] := b[i+7:i]
+		FI
+	ELSE
+		dst[i+7:i] := 0
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vpmaxsb"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512i" name="_mm512_mask_max_epi8">
+	<type>Integer</type>
+	<CPUID>AVX512BW</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="src" type="__m512i"/>
+	<parameter varname="k" type="__mmask64"/>
+	<parameter varname="a" type="__m512i"/>
+	<parameter varname="b" type="__m512i"/>
+	<description>Compare packed 8-bit integers in "a" and "b", and store packed maximum values in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 63
+	i := j*8
+	IF k[j]
+		IF a[i+7:i] &gt; b[i+7:i]
+			dst[i+7:i] := a[i+7:i]
+		ELSE
+			dst[i+7:i] := b[i+7:i]
+		FI
+	ELSE
+		dst[i+7:i] := src[i+7:i]
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name="vpmaxsb"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512i" name="_mm512_maskz_max_epi8">
+	<type>Integer</type>
+	<CPUID>AVX512BW</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="k" type="__mmask64"/>
+	<parameter varname="a" type="__m512i"/>
+	<parameter varname="b" type="__m512i"/>
+	<description>Compare packed 8-bit integers in "a" and "b", and store packed maximum values in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 63
+	i := j*8
+	IF k[j]
+		IF a[i+7:i] &gt; b[i+7:i]
+			dst[i+7:i] := a[i+7:i]
+		ELSE
+			dst[i+7:i] := b[i+7:i]
+		FI
+	ELSE
+		dst[i+7:i] := 0
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name="vpmaxsb"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512i" name="_mm512_max_epi8">
+	<type>Integer</type>
+	<CPUID>AVX512BW</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="a" type="__m512i"/>
+	<parameter varname="b" type="__m512i"/>
+	<description>Compare packed 8-bit integers in "a" and "b", and store packed maximum values in "dst". </description>
+	<operation>
+FOR j := 0 to 63
+	i := j*8
+	IF a[i+7:i] &gt; b[i+7:i]
+		dst[i+7:i] := a[i+7:i]
+	ELSE
+		dst[i+7:i] := b[i+7:i]
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name="vpmaxsb"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm_mask_max_epi8">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512BW</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="src" type="__m128i"/>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="a" type="__m128i"/>
+	<parameter varname="b" type="__m128i"/>
+	<description>Compare packed 8-bit integers in "a" and "b", and store packed maximum values in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 15
+	i := j*8
+	IF k[j]
+		IF a[i+7:i] &gt; b[i+7:i]
+			dst[i+7:i] := a[i+7:i]
+		ELSE
+			dst[i+7:i] := b[i+7:i]
+		FI
+	ELSE
+		dst[i+7:i] := src[i+7:i]
+	FI
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vpmaxsb"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm_maskz_max_epi8">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512BW</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="a" type="__m128i"/>
+	<parameter varname="b" type="__m128i"/>
+	<description>Compare packed 8-bit integers in "a" and "b", and store packed maximum values in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 15
+	i := j*8
+	IF k[j]
+		IF a[i+7:i] &gt; b[i+7:i]
+			dst[i+7:i] := a[i+7:i]
+		ELSE
+			dst[i+7:i] := b[i+7:i]
+		FI
+	ELSE
+		dst[i+7:i] := 0
+	FI
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vpmaxsb"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256i" name="_mm256_mask_max_epi32">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="src" type="__m256i"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m256i"/>
+	<parameter varname="b" type="__m256i"/>
+	<description>Compare packed 32-bit integers in "a" and "b", and store packed maximum values in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 7
+	i := j*32
+	IF k[j]
+		IF a[i+31:i] &gt; b[i+31:i]
+			dst[i+31:i] := a[i+31:i]
+		ELSE
+			dst[i+31:i] := b[i+31:i]
+		FI
+	ELSE
+		dst[i+31:i] := src[i+31:i]
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vpmaxsd"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256i" name="_mm256_maskz_max_epi32">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m256i"/>
+	<parameter varname="b" type="__m256i"/>
+	<description>Compare packed 32-bit integers in "a" and "b", and store packed maximum values in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).
+   </description>
+	<operation>
+FOR j := 0 to 7
+	i := j*32
+	IF k[j]
+		IF a[i+31:i] &gt; b[i+31:i]
+			dst[i+31:i] := a[i+31:i]
+		ELSE
+			dst[i+31:i] := b[i+31:i]
+		FI
+	ELSE
+		dst[i+31:i] := 0
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vpmaxsd"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm_mask_max_epi32">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="src" type="__m128i"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128i"/>
+	<parameter varname="b" type="__m128i"/>
+	<description>Compare packed 32-bit integers in "a" and "b", and store packed maximum values in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 3
+	i := j*32
+	IF k[j]
+		IF a[i+31:i] &gt; b[i+31:i]
+			dst[i+31:i] := a[i+31:i]
+		ELSE
+			dst[i+31:i] := b[i+31:i]
+		FI
+	ELSE
+		dst[i+31:i] := src[i+31:i]
+	FI
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vpmaxsd"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm_maskz_max_epi32">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128i"/>
+	<parameter varname="b" type="__m128i"/>
+	<description>Compare packed 32-bit integers in "a" and "b", and store packed maximum values in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).
+   </description>
+	<operation>
+FOR j := 0 to 3
+	i := j*32
+	IF k[j]
+		IF a[i+31:i] &gt; b[i+31:i]
+			dst[i+31:i] := a[i+31:i]
+		ELSE
+			dst[i+31:i] := b[i+31:i]
+		FI
+	ELSE
+		dst[i+31:i] := 0
+	FI
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vpmaxsd"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256i" name="_mm256_mask_max_epi64">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="src" type="__m256i"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m256i"/>
+	<parameter varname="b" type="__m256i"/>
+	<description>Compare packed 64-bit integers in "a" and "b", and store packed maximum values in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 3
+	i := j*64
+	IF k[j]
+		IF a[i+63:i] &gt; b[i+63:i]
+			dst[i+63:i] := a[i+63:i]
+		ELSE
+			dst[i+63:i] := b[i+63:i]
+		FI
+	ELSE
+		dst[i+63:i] := src[i+63:i]
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vpmaxsq"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256i" name="_mm256_maskz_max_epi64">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m256i"/>
+	<parameter varname="b" type="__m256i"/>
+	<description>Compare packed 64-bit integers in "a" and "b", and store packed maximum values in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 3
+	i := j*64
+	IF k[j]
+		IF a[i+63:i] &gt; b[i+63:i]
+			dst[i+63:i] := a[i+63:i]
+		ELSE
+			dst[i+63:i] := b[i+63:i]
+		FI
+	ELSE
+		dst[i+63:i] := 0
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vpmaxsq"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256i" name="_mm256_max_epi64">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="a" type="__m256i"/>
+	<parameter varname="b" type="__m256i"/>
+	<description>Compare packed 64-bit integers in "a" and "b", and store packed maximum values in "dst".</description>
+	<operation>
+FOR j := 0 to 3
+	i := j*64
+	IF a[i+63:i] &gt; b[i+63:i]
+		dst[i+63:i] := a[i+63:i]
+	ELSE
+		dst[i+63:i] := b[i+63:i]
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vpmaxsq"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm_mask_max_epi64">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="src" type="__m128i"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128i"/>
+	<parameter varname="b" type="__m128i"/>
+	<description>Compare packed 64-bit integers in "a" and "b", and store packed maximum values in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 1
+	i := j*64
+	IF k[j]
+		IF a[i+63:i] &gt; b[i+63:i]
+			dst[i+63:i] := a[i+63:i]
+		ELSE
+			dst[i+63:i] := b[i+63:i]
+		FI
+	ELSE
+		dst[i+63:i] := src[i+63:i]
+	FI
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vpmaxsq"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm_maskz_max_epi64">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128i"/>
+	<parameter varname="b" type="__m128i"/>
+	<description>Compare packed 64-bit integers in "a" and "b", and store packed maximum values in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 1
+	i := j*64
+	IF k[j]
+		IF a[i+63:i] &gt; b[i+63:i]
+			dst[i+63:i] := a[i+63:i]
+		ELSE
+			dst[i+63:i] := b[i+63:i]
+		FI
+	ELSE
+		dst[i+63:i] := 0
+	FI
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vpmaxsq"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm_max_epi64">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="a" type="__m128i"/>
+	<parameter varname="b" type="__m128i"/>
+	<description>Compare packed 64-bit integers in "a" and "b", and store packed maximum values in "dst".</description>
+	<operation>
+FOR j := 0 to 1
+	i := j*64
+	IF a[i+63:i] &gt; b[i+63:i]
+		dst[i+63:i] := a[i+63:i]
+	ELSE
+		dst[i+63:i] := b[i+63:i]
+	FI
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vpmaxsq"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256i" name="_mm256_mask_max_epi16">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512BW</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="src" type="__m256i"/>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="a" type="__m256i"/>
+	<parameter varname="b" type="__m256i"/>
+	<description>Compare packed 16-bit integers in "a" and "b", and store packed maximum values in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 15
+	i := j*16
+	IF k[j]
+		IF a[i+15:i] &gt; b[i+15:i]
+			dst[i+15:i] := a[i+15:i]
+		ELSE
+			dst[i+15:i] := b[i+15:i]
+		FI
+	ELSE
+		dst[i+15:i] := src[i+15:i]
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vpmaxsw"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256i" name="_mm256_maskz_max_epi16">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512BW</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="a" type="__m256i"/>
+	<parameter varname="b" type="__m256i"/>
+	<description>Compare packed 16-bit integers in "a" and "b", and store packed maximum values in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).
+	</description>
+	<operation>
+FOR j := 0 to 15
+	i := j*16
+	IF k[j]
+		IF a[i+15:i] &gt; b[i+15:i]
+			dst[i+15:i] := a[i+15:i]
+		ELSE
+			dst[i+15:i] := b[i+15:i]
+		FI
+	ELSE
+		dst[i+15:i] := 0
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vpmaxsw"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512i" name="_mm512_mask_max_epi16">
+	<type>Integer</type>
+	<CPUID>AVX512BW</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="src" type="__m512i"/>
+	<parameter varname="k" type="__mmask32"/>
+	<parameter varname="a" type="__m512i"/>
+	<parameter varname="b" type="__m512i"/>
+	<description>Compare packed signed 16-bit integers in "a" and "b", and store packed maximum values in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 31
+	i := j*16
+	IF k[j]
+		IF a[i+15:i] &gt; b[i+15:i]
+			dst[i+15:i] := a[i+15:i]
+		ELSE
+			dst[i+15:i] := b[i+15:i]
+		FI
+	ELSE
+		dst[i+15:i] := src[i+15:i]
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name="vpmaxsw"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512i" name="_mm512_maskz_max_epi16">
+	<type>Integer</type>
+	<CPUID>AVX512BW</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="k" type="__mmask32"/>
+	<parameter varname="a" type="__m512i"/>
+	<parameter varname="b" type="__m512i"/>
+	<description>Compare packed signed 16-bit integers in "a" and "b", and store packed maximum values in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).
+	</description>
+	<operation>
+FOR j := 0 to 31
+	i := j*16
+	IF k[j]
+		IF a[i+15:i] &gt; b[i+15:i]
+			dst[i+15:i] := a[i+15:i]
+		ELSE
+			dst[i+15:i] := b[i+15:i]
+		FI
+	ELSE
+		dst[i+15:i] := 0
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name="vpmaxsw"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512i" name="_mm512_max_epi16">
+	<type>Integer</type>
+	<CPUID>AVX512BW</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="a" type="__m512i"/>
+	<parameter varname="b" type="__m512i"/>
+	<description>Compare packed signed 16-bit integers in "a" and "b", and store packed maximum values in "dst".
+	</description>
+	<operation>
+FOR j := 0 to 31
+	i := j*16
+	IF a[i+15:i] &gt; b[i+15:i]
+		dst[i+15:i] := a[i+15:i]
+	ELSE
+		dst[i+15:i] := b[i+15:i]
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name="vpmaxsw"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm_mask_max_epi16">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512BW</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="src" type="__m128i"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128i"/>
+	<parameter varname="b" type="__m128i"/>
+	<description>Compare packed signed 16-bit integers in "a" and "b", and store packed maximum values in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 7
+	i := j*16
+	IF k[j]
+		IF a[i+15:i] &gt; b[i+15:i]
+			dst[i+15:i] := a[i+15:i]
+		ELSE
+			dst[i+15:i] := b[i+15:i]
+		FI
+	ELSE
+		dst[i+15:i] := src[i+15:i]
+	FI
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vpmaxsw"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm_maskz_max_epi16">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512BW</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128i"/>
+	<parameter varname="b" type="__m128i"/>
+	<description>Compare packed signed 16-bit integers in "a" and "b", and store packed maximum values in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).
+	</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*16
+	IF k[j]
+		IF a[i+15:i] &gt; b[i+15:i]
+			dst[i+15:i] := a[i+15:i]
+		ELSE
+			dst[i+15:i] := b[i+15:i]
+		FI
+	ELSE
+		dst[i+15:i] := 0
+	FI
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vpmaxsw"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256i" name="_mm256_mask_max_epu8">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512BW</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="src" type="__m256i"/>
+	<parameter varname="k" type="__mmask32"/>
+	<parameter varname="a" type="__m256i"/>
+	<parameter varname="b" type="__m256i"/>
+	<description>Compare packed unsigned 8-bit integers in "a" and "b", and store packed maximum values in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 31
+	i := j*8
+	IF k[j]
+		IF a[i+7:i] &gt; b[i+7:i]
+			dst[i+7:i] := a[i+7:i]
+		ELSE
+			dst[i+7:i] := b[i+7:i]
+		FI
+	ELSE
+		dst[i+7:i] := src[i+7:i]
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vpmaxub"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256i" name="_mm256_maskz_max_epu8">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512BW</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="k" type="__mmask32"/>
+	<parameter varname="a" type="__m256i"/>
+	<parameter varname="b" type="__m256i"/>
+	<description>Compare packed unsigned 8-bit integers in "a" and "b", and store packed maximum values in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).
+	</description>
+	<operation>
+FOR j := 0 to 31
+	i := j*8
+	IF k[j]
+		IF a[i+7:i] &gt; b[i+7:i]
+			dst[i+7:i] := a[i+7:i]
+		ELSE
+			dst[i+7:i] := b[i+7:i]
+		FI
+	ELSE
+		dst[i+7:i] := 0
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vpmaxub"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512i" name="_mm512_mask_max_epu8">
+	<type>Integer</type>
+	<CPUID>AVX512BW</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="src" type="__m512i"/>
+	<parameter varname="k" type="__mmask64"/>
+	<parameter varname="a" type="__m512i"/>
+	<parameter varname="b" type="__m512i"/>
+	<description>Compare packed unsigned 8-bit integers in "a" and "b", and store packed maximum values in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 63
+	i := j*8
+	IF k[j]
+		IF a[i+7:i] &gt; b[i+7:i]
+			dst[i+7:i] := a[i+7:i]
+		ELSE
+			dst[i+7:i] := b[i+7:i]
+		FI
+	ELSE
+		dst[i+7:i] := src[i+7:i]
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name="vpmaxub"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512i" name="_mm512_maskz_max_epu8">
+	<type>Integer</type>
+	<CPUID>AVX512BW</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="k" type="__mmask64"/>
+	<parameter varname="a" type="__m512i"/>
+	<parameter varname="b" type="__m512i"/>
+	<description>Compare packed unsigned 8-bit integers in "a" and "b", and store packed maximum values in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).
+	</description>
+	<operation>
+FOR j := 0 to 63
+	i := j*8
+	IF k[j]
+		IF a[i+7:i] &gt; b[i+7:i]
+			dst[i+7:i] := a[i+7:i]
+		ELSE
+			dst[i+7:i] := b[i+7:i]
+		FI
+	ELSE
+		dst[i+7:i] := 0
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name="vpmaxub"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512i" name="_mm512_max_epu8">
+	<type>Integer</type>
+	<CPUID>AVX512BW</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="a" type="__m512i"/>
+	<parameter varname="b" type="__m512i"/>
+	<description>Compare packed unsigned 8-bit integers in "a" and "b", and store packed maximum values in "dst".
+	</description>
+	<operation>
+FOR j := 0 to 63
+	i := j*8
+	IF a[i+7:i] &gt; b[i+7:i]
+		dst[i+7:i] := a[i+7:i]
+	ELSE
+		dst[i+7:i] := b[i+7:i]
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name="vpmaxub"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm_mask_max_epu8">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512BW</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="src" type="__m128i"/>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="a" type="__m128i"/>
+	<parameter varname="b" type="__m128i"/>
+	<description>Compare packed unsigned 8-bit integers in "a" and "b", and store packed maximum values in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 15
+	i := j*8
+	IF k[j]
+		IF a[i+7:i] &gt; b[i+7:i]
+			dst[i+7:i] := a[i+7:i]
+		ELSE
+			dst[i+7:i] := b[i+7:i]
+		FI
+	ELSE
+		dst[i+7:i] := src[i+7:i]
+	FI
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vpmaxub"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm_maskz_max_epu8">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512BW</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="a" type="__m128i"/>
+	<parameter varname="b" type="__m128i"/>
+	<description>Compare packed unsigned 8-bit integers in "a" and "b", and store packed maximum values in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).
+	</description>
+	<operation>
+FOR j := 0 to 15
+	i := j*8
+	IF k[j]
+		IF a[i+7:i] &gt; b[i+7:i]
+			dst[i+7:i] := a[i+7:i]
+		ELSE
+			dst[i+7:i] := b[i+7:i]
+		FI
+	ELSE
+		dst[i+7:i] := 0
+	FI
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vpmaxub"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256i" name="_mm256_mask_max_epu32">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="src" type="__m256i"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m256i"/>
+	<parameter varname="b" type="__m256i"/>
+	<description>Compare packed unsigned 32-bit integers in "a" and "b", and store packed maximum values in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 7
+	i := j*32
+	IF k[j]
+		IF a[i+31:i] &gt; b[i+31:i]
+			dst[i+31:i] := a[i+31:i]
+		ELSE
+			dst[i+31:i] := b[i+31:i]
+		FI
+	ELSE
+		dst[i+31:i] := src[i+31:i]
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vpmaxud"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256i" name="_mm256_maskz_max_epu32">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m256i"/>
+	<parameter varname="b" type="__m256i"/>
+	<description>Compare packed unsigned 32-bit integers in "a" and "b", and store packed maximum values in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).
+   </description>
+	<operation>
+FOR j := 0 to 7
+	i := j*32
+	IF k[j]
+		IF a[i+31:i] &gt; b[i+31:i]
+			dst[i+31:i] := a[i+31:i]
+		ELSE
+			dst[i+31:i] := b[i+31:i]
+		FI
+	ELSE
+		dst[i+31:i] := 0
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vpmaxud"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm_mask_max_epu32">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="src" type="__m128i"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128i"/>
+	<parameter varname="b" type="__m128i"/>
+	<description>Compare packed unsigned 32-bit integers in "a" and "b", and store packed maximum values in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 3
+	i := j*32
+	IF k[j]
+		IF a[i+31:i] &gt; b[i+31:i]
+			dst[i+31:i] := a[i+31:i]
+		ELSE
+			dst[i+31:i] := b[i+31:i]
+		FI
+	ELSE
+		dst[i+31:i] := src[i+31:i]
+	FI
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vpmaxud"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm_maskz_max_epu32">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128i"/>
+	<parameter varname="b" type="__m128i"/>
+	<description>Compare packed unsigned 32-bit integers in "a" and "b", and store packed maximum values in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).
+   </description>
+	<operation>
+FOR j := 0 to 3
+	i := j*32
+	IF k[j]
+		IF a[i+31:i] &gt; b[i+31:i]
+			dst[i+31:i] := a[i+31:i]
+		ELSE
+			dst[i+31:i] := b[i+31:i]
+		FI
+	ELSE
+		dst[i+31:i] := 0
+	FI
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vpmaxud"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256i" name="_mm256_mask_max_epu64">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="src" type="__m256i"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m256i"/>
+	<parameter varname="b" type="__m256i"/>
+	<description>Compare packed unsigned 64-bit integers in "a" and "b", and store packed maximum values in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 3
+	i := j*64
+	IF k[j]
+		IF a[i+63:i] &gt; b[i+63:i]
+			dst[i+63:i] := a[i+63:i]
+		ELSE
+			dst[i+63:i] := b[i+63:i]
+		FI
+	ELSE
+		dst[i+63:i] := src[i+63:i]
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vpmaxuq"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256i" name="_mm256_maskz_max_epu64">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m256i"/>
+	<parameter varname="b" type="__m256i"/>
+	<description>Compare packed unsigned 64-bit integers in "a" and "b", and store packed maximum values in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).
+   </description>
+	<operation>
+FOR j := 0 to 3
+	i := j*64
+	IF k[j]
+		IF a[i+63:i] &gt; b[i+63:i]
+			dst[i+63:i] := a[i+63:i]
+		ELSE
+			dst[i+63:i] := b[i+63:i]
+		FI
+	ELSE
+		dst[i+63:i] := 0
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vpmaxuq"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256i" name="_mm256_max_epu64">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="a" type="__m256i"/>
+	<parameter varname="b" type="__m256i"/>
+	<description>Compare packed unsigned 64-bit integers in "a" and "b", and store packed maximum values in "dst". </description>
+	<operation>
+FOR j := 0 to 3
+	i := j*64
+	IF a[i+63:i] &gt; b[i+63:i]
+		dst[i+63:i] := a[i+63:i]
+	ELSE
+		dst[i+63:i] := b[i+63:i]
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vpmaxuq"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm_mask_max_epu64">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="src" type="__m128i"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128i"/>
+	<parameter varname="b" type="__m128i"/>
+	<description>Compare packed unsigned 64-bit integers in "a" and "b", and store packed maximum values in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 1
+	i := j*64
+	IF k[j]
+		IF a[i+63:i] &gt; b[i+63:i]
+			dst[i+63:i] := a[i+63:i]
+		ELSE
+			dst[i+63:i] := b[i+63:i]
+		FI
+	ELSE
+		dst[i+63:i] := src[i+63:i]
+	FI
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vpmaxuq"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm_maskz_max_epu64">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128i"/>
+	<parameter varname="b" type="__m128i"/>
+	<description>Compare packed unsigned 64-bit integers in "a" and "b", and store packed maximum values in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).
+   </description>
+	<operation>
+FOR j := 0 to 1
+	i := j*64
+	IF k[j]
+		IF a[i+63:i] &gt; b[i+63:i]
+			dst[i+63:i] := a[i+63:i]
+		ELSE
+			dst[i+63:i] := b[i+63:i]
+		FI
+	ELSE
+		dst[i+63:i] := 0
+	FI
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vpmaxuq"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm_max_epu64">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="a" type="__m128i"/>
+	<parameter varname="b" type="__m128i"/>
+	<description>Compare packed unsigned 64-bit integers in "a" and "b", and store packed maximum values in "dst". </description>
+	<operation>
+FOR j := 0 to 1
+	i := j*64
+	IF a[i+63:i] &gt; b[i+63:i]
+		dst[i+63:i] := a[i+63:i]
+	ELSE
+		dst[i+63:i] := b[i+63:i]
+	FI
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vpmaxuq"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256i" name="_mm256_mask_max_epu16">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512BW</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="src" type="__m256i"/>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="a" type="__m256i"/>
+	<parameter varname="b" type="__m256i"/>
+	<description>Compare packed unsigned 16-bit integers in "a" and "b", and store packed maximum values in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 15
+	i := j*16
+	IF k[j]
+		IF a[i+15:i] &gt; b[i+15:i]
+			dst[i+15:i] := a[i+15:i]
+		ELSE
+			dst[i+15:i] := b[i+15:i]
+		FI
+	ELSE
+		dst[i+15:i] := src[i+15:i]
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vpmaxuw"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256i" name="_mm256_maskz_max_epu16">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512BW</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="a" type="__m256i"/>
+	<parameter varname="b" type="__m256i"/>
+	<description>Compare packed unsigned 16-bit integers in "a" and "b", and store packed maximum values in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).
+	</description>
+	<operation>
+FOR j := 0 to 15
+	i := j*16
+	IF k[j]
+		IF a[i+15:i] &gt; b[i+15:i]
+			dst[i+15:i] := a[i+15:i]
+		ELSE
+			dst[i+15:i] := b[i+15:i]
+		FI
+	ELSE
+		dst[i+15:i] := 0
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vpmaxuw"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512i" name="_mm512_mask_max_epu16">
+	<type>Integer</type>
+	<CPUID>AVX512BW</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="src" type="__m512i"/>
+	<parameter varname="k" type="__mmask32"/>
+	<parameter varname="a" type="__m512i"/>
+	<parameter varname="b" type="__m512i"/>
+	<description>Compare packed unsigned 16-bit integers in "a" and "b", and store packed maximum values in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 31
+	i := j*16
+	IF k[j]
+		IF a[i+15:i] &gt; b[i+15:i]
+			dst[i+15:i] := a[i+15:i]
+		ELSE
+			dst[i+15:i] := b[i+15:i]
+		FI
+	ELSE
+		dst[i+15:i] := src[i+15:i]
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name="vpmaxuw"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512i" name="_mm512_maskz_max_epu16">
+	<type>Integer</type>
+	<CPUID>AVX512BW</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="k" type="__mmask32"/>
+	<parameter varname="a" type="__m512i"/>
+	<parameter varname="b" type="__m512i"/>
+	<description>Compare packed unsigned 16-bit integers in "a" and "b", and store packed maximum values in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).
+	</description>
+	<operation>
+FOR j := 0 to 31
+	i := j*16
+	IF k[j]
+		IF a[i+15:i] &gt; b[i+15:i]
+			dst[i+15:i] := a[i+15:i]
+		ELSE
+			dst[i+15:i] := b[i+15:i]
+		FI
+	ELSE
+		dst[i+15:i] := 0
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name="vpmaxuw"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512i" name="_mm512_max_epu16">
+	<type>Integer</type>
+	<CPUID>AVX512BW</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="a" type="__m512i"/>
+	<parameter varname="b" type="__m512i"/>
+	<description>Compare packed unsigned 16-bit integers in "a" and "b", and store packed maximum values in "dst".
+	</description>
+	<operation>
+FOR j := 0 to 31
+	i := j*16
+	IF a[i+15:i] &gt; b[i+15:i]
+		dst[i+15:i] := a[i+15:i]
+	ELSE
+		dst[i+15:i] := b[i+15:i]
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name="vpmaxuw"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm_mask_max_epu16">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512BW</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="src" type="__m128i"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128i"/>
+	<parameter varname="b" type="__m128i"/>
+	<description>Compare packed unsigned 16-bit integers in "a" and "b", and store packed maximum values in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 7
+	i := j*16
+	IF k[j]
+		IF a[i+15:i] &gt; b[i+15:i]
+			dst[i+15:i] := a[i+15:i]
+		ELSE
+			dst[i+15:i] := b[i+15:i]
+		FI
+	ELSE
+		dst[i+15:i] := src[i+15:i]
+	FI
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vpmaxuw"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm_maskz_max_epu16">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512BW</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128i"/>
+	<parameter varname="b" type="__m128i"/>
+	<description>Compare packed unsigned 16-bit integers in "a" and "b", and store packed maximum values in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).
+	</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*16
+	IF k[j]
+		IF a[i+15:i] &gt; b[i+15:i]
+			dst[i+15:i] := a[i+15:i]
+		ELSE
+			dst[i+15:i] := b[i+15:i]
+		FI
+	ELSE
+		dst[i+15:i] := 0
+	FI
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vpmaxuw"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256i" name="_mm256_mask_min_epi8">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512BW</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="src" type="__m256i"/>
+	<parameter varname="k" type="__mmask32"/>
+	<parameter varname="a" type="__m256i"/>
+	<parameter varname="b" type="__m256i"/>
+	<description>Compare packed signed 8-bit integers in "a" and "b", and store packed minimum values in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 31
+	i := j*8
+	IF k[j]
+		IF a[i+7:i] &lt; b[i+7:i]
+			dst[i+7:i] := a[i+7:i]
+		ELSE
+			dst[i+7:i] := b[i+7:i]
+		FI
+	ELSE
+		dst[i+7:i] := src[i+7:i]
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vpminsb"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256i" name="_mm256_maskz_min_epi8">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512BW</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="k" type="__mmask32"/>
+	<parameter varname="a" type="__m256i"/>
+	<parameter varname="b" type="__m256i"/>
+	<description>Compare packed signed 8-bit integers in "a" and "b", and store packed minimum values in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).
+	</description>
+	<operation>
+FOR j := 0 to 31
+	i := j*8
+	IF k[j]
+		IF a[i+7:i] &lt; b[i+7:i]
+			dst[i+7:i] := a[i+7:i]
+		ELSE
+			dst[i+7:i] := b[i+7:i]
+		FI
+	ELSE
+		dst[i+7:i] := 0
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vpminsb"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512i" name="_mm512_mask_min_epi8">
+	<type>Integer</type>
+	<CPUID>AVX512BW</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="src" type="__m512i"/>
+	<parameter varname="k" type="__mmask64"/>
+	<parameter varname="a" type="__m512i"/>
+	<parameter varname="b" type="__m512i"/>
+	<description>Compare packed signed 8-bit integers in "a" and "b", and store packed minimum values in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 63
+	i := j*8
+	IF k[j]
+		IF a[i+7:i] &lt; b[i+7:i]
+			dst[i+7:i] := a[i+7:i]
+		ELSE
+			dst[i+7:i] := b[i+7:i]
+		FI
+	ELSE
+		dst[i+7:i] := src[i+7:i]
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name="vpminsb"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512i" name="_mm512_maskz_min_epi8">
+	<type>Integer</type>
+	<CPUID>AVX512BW</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="k" type="__mmask64"/>
+	<parameter varname="a" type="__m512i"/>
+	<parameter varname="b" type="__m512i"/>
+	<description>Compare packed signed 8-bit integers in "a" and "b", and store packed minimum values in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).
+	</description>
+	<operation>
+FOR j := 0 to 63
+	i := j*8
+	IF k[j]
+		IF a[i+7:i] &lt; b[i+7:i]
+			dst[i+7:i] := a[i+7:i]
+		ELSE
+			dst[i+7:i] := b[i+7:i]
+		FI
+	ELSE
+		dst[i+7:i] := 0
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name="vpminsb"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512i" name="_mm512_min_epi8">
+	<type>Integer</type>
+	<CPUID>AVX512BW</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="a" type="__m512i"/>
+	<parameter varname="b" type="__m512i"/>
+	<description>Compare packed signed 8-bit integers in "a" and "b", and store packed minimum values in "dst".
+	</description>
+	<operation>
+FOR j := 0 to 63
+	i := j*8
+	IF a[i+7:i] &lt; b[i+7:i]
+		dst[i+7:i] := a[i+7:i]
+	ELSE
+		dst[i+7:i] := b[i+7:i]
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name="vpminsb"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm_mask_min_epi8">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512BW</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="src" type="__m128i"/>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="a" type="__m128i"/>
+	<parameter varname="b" type="__m128i"/>
+	<description>Compare packed signed 8-bit integers in "a" and "b", and store packed minimum values in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 15
+	i := j*8
+	IF k[j]
+		IF a[i+7:i] &lt; b[i+7:i]
+			dst[i+7:i] := a[i+7:i]
+		ELSE
+			dst[i+7:i] := b[i+7:i]
+		FI
+	ELSE
+		dst[i+7:i] := src[i+7:i]
+	FI
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vpminsb"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm_maskz_min_epi8">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512BW</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="a" type="__m128i"/>
+	<parameter varname="b" type="__m128i"/>
+	<description>Compare packed signed 8-bit integers in "a" and "b", and store packed minimum values in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).
+	</description>
+	<operation>
+FOR j := 0 to 15
+	i := j*8
+	IF k[j]
+		IF a[i+7:i] &lt; b[i+7:i]
+			dst[i+7:i] := a[i+7:i]
+		ELSE
+			dst[i+7:i] := b[i+7:i]
+		FI
+	ELSE
+		dst[i+7:i] := 0
+	FI
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vpminsb"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256i" name="_mm256_mask_min_epi32">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="src" type="__m256i"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m256i"/>
+	<parameter varname="b" type="__m256i"/>
+	<description>Compare packed signed 32-bit integers in "a" and "b", and store packed minimum values in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 7
+	i := j*32
+	IF k[j]
+		IF a[i+31:i] &lt; b[i+31:i]
+			dst[i+31:i] := a[i+31:i]
+		ELSE
+			dst[i+31:i] := b[i+31:i]
+		FI
+	ELSE
+		dst[i+31:i] := src[i+31:i]
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vpminsd"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256i" name="_mm256_maskz_min_epi32">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m256i"/>
+	<parameter varname="b" type="__m256i"/>
+	<description>Compare packed signed 32-bit integers in "a" and "b", and store packed minimum values in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).
+   </description>
+	<operation>
+FOR j := 0 to 7
+	i := j*32
+	IF k[j]
+		IF a[i+31:i] &lt; b[i+31:i]
+			dst[i+31:i] := a[i+31:i]
+		ELSE
+			dst[i+31:i] := b[i+31:i]
+		FI
+	ELSE
+		dst[i+31:i] := 0
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vpminsd"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm_mask_min_epi32">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="src" type="__m128i"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128i"/>
+	<parameter varname="b" type="__m128i"/>
+	<description>Compare packed signed 32-bit integers in "a" and "b", and store packed minimum values in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 3
+	i := j*32
+	IF k[j]
+		IF a[i+31:i] &lt; b[i+31:i]
+			dst[i+31:i] := a[i+31:i]
+		ELSE
+			dst[i+31:i] := b[i+31:i]
+		FI
+	ELSE
+		dst[i+31:i] := src[i+31:i]
+	FI
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vpminsd"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm_maskz_min_epi32">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128i"/>
+	<parameter varname="b" type="__m128i"/>
+	<description>Compare packed signed 32-bit integers in "a" and "b", and store packed minimum values in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).
+   </description>
+	<operation>
+FOR j := 0 to 3
+	i := j*32
+	IF k[j]
+		IF a[i+31:i] &lt; b[i+31:i]
+			dst[i+31:i] := a[i+31:i]
+		ELSE
+			dst[i+31:i] := b[i+31:i]
+		FI
+	ELSE
+		dst[i+31:i] := 0
+	FI
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vpminsd"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256i" name="_mm256_mask_min_epi64">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="src" type="__m256i"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m256i"/>
+	<parameter varname="b" type="__m256i"/>
+	<description>Compare packed signed 64-bit integers in "a" and "b", and store packed minimum values in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 3
+	i := j*64
+	IF k[j]
+		IF a[i+63:i] &lt; b[i+63:i]
+			dst[i+63:i] := a[i+63:i]
+		ELSE
+			dst[i+63:i] := b[i+63:i]
+		FI
+	ELSE
+		dst[i+63:i] := src[i+63:i]
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vpminsq"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256i" name="_mm256_maskz_min_epi64">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m256i"/>
+	<parameter varname="b" type="__m256i"/>
+	<description>Compare packed signed 64-bit integers in "a" and "b", and store packed minimum values in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).
+   </description>
+	<operation>
+FOR j := 0 to 3
+	i := j*64
+	IF k[j]
+		IF a[i+63:i] &lt; b[i+63:i]
+			dst[i+63:i] := a[i+63:i]
+		ELSE
+			dst[i+63:i] := b[i+63:i]
+		FI
+	ELSE
+		dst[i+63:i] := 0
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vpminsq"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256i" name="_mm256_min_epi64">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="a" type="__m256i"/>
+	<parameter varname="b" type="__m256i"/>
+	<description>Compare packed signed 64-bit integers in "a" and "b", and store packed minimum values in "dst".</description>
+	<operation>
+FOR j := 0 to 3
+	i := j*64
+	IF a[i+63:i] &lt; b[i+63:i]
+		dst[i+63:i] := a[i+63:i]
+	ELSE
+		dst[i+63:i] := b[i+63:i]
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vpminsq"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm_mask_min_epi64">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="src" type="__m128i"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128i"/>
+	<parameter varname="b" type="__m128i"/>
+	<description>Compare packed signed 64-bit integers in "a" and "b", and store packed minimum values in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 1
+	i := j*64
+	IF k[j]
+		IF a[i+63:i] &lt; b[i+63:i]
+			dst[i+63:i] := a[i+63:i]
+		ELSE
+			dst[i+63:i] := b[i+63:i]
+		FI
+	ELSE
+		dst[i+63:i] := src[i+63:i]
+	FI
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vpminsq"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm_maskz_min_epi64">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128i"/>
+	<parameter varname="b" type="__m128i"/>
+	<description>Compare packed signed 64-bit integers in "a" and "b", and store packed minimum values in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).
+   </description>
+	<operation>
+FOR j := 0 to 1
+	i := j*64
+	IF k[j]
+		IF a[i+63:i] &lt; b[i+63:i]
+			dst[i+63:i] := a[i+63:i]
+		ELSE
+			dst[i+63:i] := b[i+63:i]
+		FI
+	ELSE
+		dst[i+63:i] := 0
+	FI
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vpminsq"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm_min_epi64">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="a" type="__m128i"/>
+	<parameter varname="b" type="__m128i"/>
+	<description>Compare packed signed 64-bit integers in "a" and "b", and store packed minimum values in "dst".</description>
+	<operation>
+FOR j := 0 to 1
+	i := j*64
+	IF a[i+63:i] &lt; b[i+63:i]
+		dst[i+63:i] := a[i+63:i]
+	ELSE
+		dst[i+63:i] := b[i+63:i]
+	FI
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vpminsq"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256i" name="_mm256_mask_min_epi16">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512BW</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="src" type="__m256i"/>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="a" type="__m256i"/>
+	<parameter varname="b" type="__m256i"/>
+	<description>Compare packed signed 16-bit integers in "a" and "b", and store packed minimum values in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 15
+	i := j*16
+	IF k[j]
+		IF a[i+15:i] &lt; b[i+15:i]
+			dst[i+15:i] := a[i+15:i]
+		ELSE
+			dst[i+15:i] := b[i+15:i]
+		FI
+	ELSE
+		dst[i+15:i] := src[i+15:i]
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vpminsw"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256i" name="_mm256_maskz_min_epi16">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512BW</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="a" type="__m256i"/>
+	<parameter varname="b" type="__m256i"/>
+	<description>Compare packed signed 16-bit integers in "a" and "b", and store packed minimum values in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).
+	</description>
+	<operation>
+FOR j := 0 to 15
+	i := j*16
+	IF k[j]
+		IF a[i+15:i] &lt; b[i+15:i]
+			dst[i+15:i] := a[i+15:i]
+		ELSE
+			dst[i+15:i] := b[i+15:i]
+		FI
+	ELSE
+		dst[i+15:i] := 0
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vpminsw"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512i" name="_mm512_mask_min_epi16">
+	<type>Integer</type>
+	<CPUID>AVX512BW</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="src" type="__m512i"/>
+	<parameter varname="k" type="__mmask32"/>
+	<parameter varname="a" type="__m512i"/>
+	<parameter varname="b" type="__m512i"/>
+	<description>Compare packed signed 16-bit integers in "a" and "b", and store packed minimum values in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 31
+	i := j*16
+	IF k[j]
+		IF a[i+15:i] &lt; b[i+15:i]
+			dst[i+15:i] := a[i+15:i]
+		ELSE
+			dst[i+15:i] := b[i+15:i]
+		FI
+	ELSE
+		dst[i+15:i] := src[i+15:i]
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name="vpminsw"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512i" name="_mm512_maskz_min_epi16">
+	<type>Integer</type>
+	<CPUID>AVX512BW</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="k" type="__mmask32"/>
+	<parameter varname="a" type="__m512i"/>
+	<parameter varname="b" type="__m512i"/>
+	<description>Compare packed signed 16-bit integers in "a" and "b", and store packed minimum values in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).
+	</description>
+	<operation>
+FOR j := 0 to 31
+	i := j*16
+	IF k[j]
+		IF a[i+15:i] &lt; b[i+15:i]
+			dst[i+15:i] := a[i+15:i]
+		ELSE
+			dst[i+15:i] := b[i+15:i]
+		FI
+	ELSE
+		dst[i+15:i] := 0
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name="vpminsw"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512i" name="_mm512_min_epi16">
+	<type>Integer</type>
+	<CPUID>AVX512BW</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="a" type="__m512i"/>
+	<parameter varname="b" type="__m512i"/>
+	<description>Compare packed signed 16-bit integers in "a" and "b", and store packed minimum values in "dst".
+	</description>
+	<operation>
+FOR j := 0 to 31
+	i := j*16
+	IF a[i+15:i] &lt; b[i+15:i]
+		dst[i+15:i] := a[i+15:i]
+	ELSE
+		dst[i+15:i] := b[i+15:i]
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name="vpminsw"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm_mask_min_epi16">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512BW</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="src" type="__m128i"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128i"/>
+	<parameter varname="b" type="__m128i"/>
+	<description>Compare packed signed 16-bit integers in "a" and "b", and store packed minimum values in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 7
+	i := j*16
+	IF k[j]
+		IF a[i+15:i] &lt; b[i+15:i]
+			dst[i+15:i] := a[i+15:i]
+		ELSE
+			dst[i+15:i] := b[i+15:i]
+		FI
+	ELSE
+		dst[i+15:i] := src[i+15:i]
+	FI
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vpminsw"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm_maskz_min_epi16">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512BW</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128i"/>
+	<parameter varname="b" type="__m128i"/>
+	<description>Compare packed signed 16-bit integers in "a" and "b", and store packed minimum values in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).
+	</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*16
+	IF k[j]
+		IF a[i+15:i] &lt; b[i+15:i]
+			dst[i+15:i] := a[i+15:i]
+		ELSE
+			dst[i+15:i] := b[i+15:i]
+		FI
+	ELSE
+		dst[i+15:i] := 0
+	FI
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vpminsw"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256i" name="_mm256_mask_min_epu8">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512BW</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="src" type="__m256i"/>
+	<parameter varname="k" type="__mmask32"/>
+	<parameter varname="a" type="__m256i"/>
+	<parameter varname="b" type="__m256i"/>
+	<description>Compare packed unsigned 8-bit integers in "a" and "b", and store packed minimum values in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 31
+	i := j*8
+	IF k[j]
+		IF a[i+7:i] &lt; b[i+7:i]
+			dst[i+7:i] := a[i+7:i]
+		ELSE
+			dst[i+7:i] := b[i+7:i]
+		FI
+	ELSE
+		dst[i+7:i] := src[i+7:i]
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vpminub"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256i" name="_mm256_maskz_min_epu8">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512BW</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="k" type="__mmask32"/>
+	<parameter varname="a" type="__m256i"/>
+	<parameter varname="b" type="__m256i"/>
+	<description>Compare packed unsigned 8-bit integers in "a" and "b", and store packed minimum values in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).
+	</description>
+	<operation>
+FOR j := 0 to 31
+	i := j*8
+	IF k[j]
+		IF a[i+7:i] &lt; b[i+7:i]
+			dst[i+7:i] := a[i+7:i]
+		ELSE
+			dst[i+7:i] := b[i+7:i]
+		FI
+	ELSE
+		dst[i+7:i] := 0
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vpminub"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512i" name="_mm512_mask_min_epu8">
+	<type>Integer</type>
+	<CPUID>AVX512BW</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="src" type="__m512i"/>
+	<parameter varname="k" type="__mmask64"/>
+	<parameter varname="a" type="__m512i"/>
+	<parameter varname="b" type="__m512i"/>
+	<description>Compare packed unsigned 8-bit integers in "a" and "b", and store packed minimum values in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 63
+	i := j*8
+	IF k[j]
+		IF a[i+7:i] &lt; b[i+7:i]
+			dst[i+7:i] := a[i+7:i]
+		ELSE
+			dst[i+7:i] := b[i+7:i]
+		FI
+	ELSE
+		dst[i+7:i] := src[i+7:i]
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name="vpminub"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512i" name="_mm512_maskz_min_epu8">
+	<type>Integer</type>
+	<CPUID>AVX512BW</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="k" type="__mmask64"/>
+	<parameter varname="a" type="__m512i"/>
+	<parameter varname="b" type="__m512i"/>
+	<description>Compare packed unsigned 8-bit integers in "a" and "b", and store packed minimum values in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).
+	</description>
+	<operation>
+FOR j := 0 to 63
+	i := j*8
+	IF k[j]
+		IF a[i+7:i] &lt; b[i+7:i]
+			dst[i+7:i] := a[i+7:i]
+		ELSE
+			dst[i+7:i] := b[i+7:i]
+		FI
+	ELSE
+		dst[i+7:i] := 0
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name="vpminub"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512i" name="_mm512_min_epu8">
+	<type>Integer</type>
+	<CPUID>AVX512BW</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="a" type="__m512i"/>
+	<parameter varname="b" type="__m512i"/>
+	<description>Compare packed unsigned 8-bit integers in "a" and "b", and store packed minimum values in "dst".
+	</description>
+	<operation>
+FOR j := 0 to 63
+	i := j*8
+	IF a[i+7:i] &lt; b[i+7:i]
+		dst[i+7:i] := a[i+7:i]
+	ELSE
+		dst[i+7:i] := b[i+7:i]
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name="vpminub"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm_mask_min_epu8">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512BW</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="src" type="__m128i"/>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="a" type="__m128i"/>
+	<parameter varname="b" type="__m128i"/>
+	<description>Compare packed unsigned 8-bit integers in "a" and "b", and store packed minimum values in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 15
+	i := j*8
+	IF k[j]
+		IF a[i+7:i] &lt; b[i+7:i]
+			dst[i+7:i] := a[i+7:i]
+		ELSE
+			dst[i+7:i] := b[i+7:i]
+		FI
+	ELSE
+		dst[i+7:i] := src[i+7:i]
+	FI
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vpminub"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm_maskz_min_epu8">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512BW</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="a" type="__m128i"/>
+	<parameter varname="b" type="__m128i"/>
+	<description>Compare packed unsigned 8-bit integers in "a" and "b", and store packed minimum values in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).
+	</description>
+	<operation>
+FOR j := 0 to 15
+	i := j*8
+	IF k[j]
+		IF a[i+7:i] &lt; b[i+7:i]
+			dst[i+7:i] := a[i+7:i]
+		ELSE
+			dst[i+7:i] := b[i+7:i]
+		FI
+	ELSE
+		dst[i+7:i] := 0
+	FI
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vpminub"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256i" name="_mm256_mask_min_epu32">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="src" type="__m256i"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m256i"/>
+	<parameter varname="b" type="__m256i"/>
+	<description>Compare packed unsigned 32-bit integers in "a" and "b", and store packed minimum values in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 7
+	i := j*32
+	IF k[j]
+		IF a[i+31:i] &lt; b[i+31:i]
+			dst[i+31:i] := a[i+31:i]
+		ELSE
+			dst[i+31:i] := b[i+31:i]
+		FI
+	ELSE
+		dst[i+31:i] := src[i+31:i]
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vpminud"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256i" name="_mm256_maskz_min_epu32">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m256i"/>
+	<parameter varname="b" type="__m256i"/>
+	<description>Compare packed unsigned 32-bit integers in "a" and "b", and store packed minimum values in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).
+   </description>
+	<operation>
+FOR j := 0 to 7
+	i := j*32
+	IF k[j]
+		IF a[i+31:i] &lt; b[i+31:i]
+			dst[i+31:i] := a[i+31:i]
+		ELSE
+			dst[i+31:i] := b[i+31:i]
+		FI
+	ELSE
+		dst[i+31:i] := 0
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vpminud"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm_mask_min_epu32">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="src" type="__m128i"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128i"/>
+	<parameter varname="b" type="__m128i"/>
+	<description>Compare packed unsigned 32-bit integers in "a" and "b", and store packed minimum values in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 3
+	i := j*32
+	IF k[j]
+		IF a[i+31:i] &lt; b[i+31:i]
+			dst[i+31:i] := a[i+31:i]
+		ELSE
+			dst[i+31:i] := b[i+31:i]
+		FI
+	ELSE
+		dst[i+31:i] := src[i+31:i]
+	FI
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vpminud"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm_maskz_min_epu32">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128i"/>
+	<parameter varname="b" type="__m128i"/>
+	<description>Compare packed unsigned 32-bit integers in "a" and "b", and store packed minimum values in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).
+   </description>
+	<operation>
+FOR j := 0 to 3
+	i := j*32
+	IF k[j]
+		IF a[i+31:i] &lt; b[i+31:i]
+			dst[i+31:i] := a[i+31:i]
+		ELSE
+			dst[i+31:i] := b[i+31:i]
+		FI
+	ELSE
+		dst[i+31:i] := 0
+	FI
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vpminud"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256i" name="_mm256_mask_min_epu64">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="src" type="__m256i"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m256i"/>
+	<parameter varname="b" type="__m256i"/>
+	<description>Compare packed unsigned 64-bit integers in "a" and "b", and store packed minimum values in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 3
+	i := j*64
+	IF k[j]
+		IF a[i+63:i] &lt; b[i+63:i]
+			dst[i+63:i] := a[i+63:i]
+		ELSE
+			dst[i+63:i] := b[i+63:i]
+		FI
+	ELSE
+		dst[i+63:i] := src[i+63:i]
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vpminuq"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256i" name="_mm256_maskz_min_epu64">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m256i"/>
+	<parameter varname="b" type="__m256i"/>
+	<description>Compare packed unsigned 64-bit integers in "a" and "b", and store packed minimum values in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).
+   </description>
+	<operation>
+FOR j := 0 to 3
+	i := j*64
+	IF k[j]
+		IF a[i+63:i] &lt; b[i+63:i]
+			dst[i+63:i] := a[i+63:i]
+		ELSE
+			dst[i+63:i] := b[i+63:i]
+		FI
+	ELSE
+		dst[i+63:i] := 0
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vpminuq"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256i" name="_mm256_min_epu64">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="a" type="__m256i"/>
+	<parameter varname="b" type="__m256i"/>
+	<description>Compare packed unsigned 64-bit integers in "a" and "b", and store packed minimum values in "dst".</description>
+	<operation>
+FOR j := 0 to 3
+	i := j*64
+	IF a[i+63:i] &lt; b[i+63:i]
+		dst[i+63:i] := a[i+63:i]
+	ELSE
+		dst[i+63:i] := b[i+63:i]
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vpminuq"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm_mask_min_epu64">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="src" type="__m128i"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128i"/>
+	<parameter varname="b" type="__m128i"/>
+	<description>Compare packed unsigned 64-bit integers in "a" and "b", and store packed minimum values in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 1
+	i := j*64
+	IF k[j]
+		IF a[i+63:i] &lt; b[i+63:i]
+			dst[i+63:i] := a[i+63:i]
+		ELSE
+			dst[i+63:i] := b[i+63:i]
+		FI
+	ELSE
+		dst[i+63:i] := src[i+63:i]
+	FI
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vpminuq"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm_maskz_min_epu64">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128i"/>
+	<parameter varname="b" type="__m128i"/>
+	<description>Compare packed unsigned 64-bit integers in "a" and "b", and store packed minimum values in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).
+   </description>
+	<operation>
+FOR j := 0 to 1
+	i := j*64
+	IF k[j]
+		IF a[i+63:i] &lt; b[i+63:i]
+			dst[i+63:i] := a[i+63:i]
+		ELSE
+			dst[i+63:i] := b[i+63:i]
+		FI
+	ELSE
+		dst[i+63:i] := 0
+	FI
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vpminuq"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm_min_epu64">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="a" type="__m128i"/>
+	<parameter varname="b" type="__m128i"/>
+	<description>Compare packed unsigned 64-bit integers in "a" and "b", and store packed minimum values in "dst".</description>
+	<operation>
+FOR j := 0 to 1
+	i := j*64
+	IF a[i+63:i] &lt; b[i+63:i]
+		dst[i+63:i] := a[i+63:i]
+	ELSE
+		dst[i+63:i] := b[i+63:i]
+	FI
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vpminuq"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256i" name="_mm256_mask_min_epu16">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512BW</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="src" type="__m256i"/>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="a" type="__m256i"/>
+	<parameter varname="b" type="__m256i"/>
+	<description>Compare packed unsigned 16-bit integers in "a" and "b", and store packed minimum values in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 15
+	i := j*16
+	IF k[j]
+		IF a[i+15:i] &lt; b[i+15:i]
+			dst[i+15:i] := a[i+15:i]
+		ELSE
+			dst[i+15:i] := b[i+15:i]
+		FI
+	ELSE
+		dst[i+15:i] := src[i+15:i]
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vpminuw"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256i" name="_mm256_maskz_min_epu16">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512BW</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="a" type="__m256i"/>
+	<parameter varname="b" type="__m256i"/>
+	<description>Compare packed unsigned 16-bit integers in "a" and "b", and store packed minimum values in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).
+	</description>
+	<operation>
+FOR j := 0 to 15
+	i := j*16
+	IF k[j]
+		IF a[i+15:i] &lt; b[i+15:i]
+			dst[i+15:i] := a[i+15:i]
+		ELSE
+			dst[i+15:i] := b[i+15:i]
+		FI
+	ELSE
+		dst[i+15:i] := 0
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vpminuw"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512i" name="_mm512_mask_min_epu16">
+	<type>Integer</type>
+	<CPUID>AVX512BW</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="src" type="__m512i"/>
+	<parameter varname="k" type="__mmask32"/>
+	<parameter varname="a" type="__m512i"/>
+	<parameter varname="b" type="__m512i"/>
+	<description>Compare packed unsigned 16-bit integers in "a" and "b", and store packed minimum values in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 31
+	i := j*16
+	IF k[j]
+		IF a[i+15:i] &lt; b[i+15:i]
+			dst[i+15:i] := a[i+15:i]
+		ELSE
+			dst[i+15:i] := b[i+15:i]
+		FI
+	ELSE
+		dst[i+15:i] := src[i+15:i]
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name="vpminuw"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512i" name="_mm512_maskz_min_epu16">
+	<type>Integer</type>
+	<CPUID>AVX512BW</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="k" type="__mmask32"/>
+	<parameter varname="a" type="__m512i"/>
+	<parameter varname="b" type="__m512i"/>
+	<description>Compare packed unsigned 16-bit integers in "a" and "b", and store packed minimum values in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).
+	</description>
+	<operation>
+FOR j := 0 to 31
+	i := j*16
+	IF k[j]
+		IF a[i+15:i] &lt; b[i+15:i]
+			dst[i+15:i] := a[i+15:i]
+		ELSE
+			dst[i+15:i] := b[i+15:i]
+		FI
+	ELSE
+		dst[i+15:i] := 0
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name="vpminuw"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512i" name="_mm512_min_epu16">
+	<type>Integer</type>
+	<CPUID>AVX512BW</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="a" type="__m512i"/>
+	<parameter varname="b" type="__m512i"/>
+	<description>Compare packed unsigned 16-bit integers in "a" and "b", and store packed minimum values in "dst".
+	</description>
+	<operation>
+FOR j := 0 to 31
+	i := j*16
+	IF a[i+15:i] &lt; b[i+15:i]
+		dst[i+15:i] := a[i+15:i]
+	ELSE
+		dst[i+15:i] := b[i+15:i]
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name="vpminuw"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm_mask_min_epu16">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512BW</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="src" type="__m128i"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128i"/>
+	<parameter varname="b" type="__m128i"/>
+	<description>Compare packed unsigned 16-bit integers in "a" and "b", and store packed minimum values in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 7
+	i := j*16
+	IF k[j]
+		IF a[i+15:i] &lt; b[i+15:i]
+			dst[i+15:i] := a[i+15:i]
+		ELSE
+			dst[i+15:i] := b[i+15:i]
+		FI
+	ELSE
+		dst[i+15:i] := src[i+15:i]
+	FI
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vpminuw"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm_maskz_min_epu16">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512BW</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128i"/>
+	<parameter varname="b" type="__m128i"/>
+	<description>Compare packed unsigned 16-bit integers in "a" and "b", and store packed minimum values in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).
+	</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*16
+	IF k[j]
+		IF a[i+15:i] &lt; b[i+15:i]
+			dst[i+15:i] := a[i+15:i]
+		ELSE
+			dst[i+15:i] := b[i+15:i]
+		FI
+	ELSE
+		dst[i+15:i] := 0
+	FI
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vpminuw"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__mmask32" name="_mm256_movepi8_mask">
+	<type>Integer</type>
+	<type>Mask</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512BW</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="a" type="__m256i"/>
+	<description>Set each bit of mask register "k" based on the most significant bit of the corresponding packed 8-bit integer in "a".</description>
+	<operation>
+FOR j := 0 to 31
+	i := j*8
+	IF a[i+7]
+		k[j] := 1
+	ELSE
+		k[j] := 0
+	FI
+ENDFOR
+k[MAX:32] := 0
+	</operation>
+	<instruction name="vpmovb2m"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__mmask64" name="_mm512_movepi8_mask">
+	<type>Integer</type>
+	<type>Mask</type>
+	<CPUID>AVX512BW</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="a" type="__m512i"/>
+	<description>Set each bit of mask register "k" based on the most significant bit of the corresponding packed 8-bit integer in "a".</description>
+	<operation>
+FOR j := 0 to 63
+	i := j*8
+	IF a[i+7]
+		k[j] := 1
+	ELSE
+		k[j] := 0
+	FI
+ENDFOR
+k[MAX:64] := 0
+	</operation>
+	<instruction name="vpmovb2m"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__mmask16" name="_mm_movepi8_mask">
+	<type>Integer</type>
+	<type>Mask</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512BW</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="a" type="__m128i"/>
+	<description>Set each bit of mask register "k" based on the most significant bit of the corresponding packed 8-bit integer in "a".</description>
+	<operation>
+FOR j := 0 to 15
+	i := j*8
+	IF a[i+7]
+		k[j] := 1
+	ELSE
+		k[j] := 0
+	FI
+ENDFOR
+k[MAX:16] := 0
+	</operation>
+	<instruction name="vpmovb2m"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__mmask8" name="_mm256_movepi32_mask">
+	<type>Integer</type>
+	<type>Mask</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512DQ</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="a" type="__m256i"/>
+	<description>Set each bit of mask register "k" based on the most significant bit of the corresponding packed 32-bit integer in "a".</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*32
+	IF a[i+31]
+		k[j] := 1
+	ELSE
+		k[j] := 0
+	FI
+ENDFOR
+k[MAX:8] := 0
+	</operation>
+	<instruction name="vpmovd2m"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__mmask16" name="_mm512_movepi32_mask">
+	<type>Integer</type>
+	<type>Mask</type>
+	<CPUID>AVX512DQ</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="a" type="__m512i"/>
+	<description>Set each bit of mask register "k" based on the most significant bit of the corresponding packed 32-bit integer in "a".</description>
+	<operation>
+FOR j := 0 to 15
+	i := j*32
+	IF a[i+31]
+		k[j] := 1
+	ELSE
+		k[j] := 0
+	FI
+ENDFOR
+k[MAX:16] := 0
+	</operation>
+	<instruction name="vpmovd2m"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__mmask8" name="_mm_movepi32_mask">
+	<type>Integer</type>
+	<type>Mask</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512DQ</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="a" type="__m128i"/>
+	<description>Set each bit of mask register "k" based on the most significant bit of the corresponding packed 32-bit integer in "a".</description>
+	<operation>
+FOR j := 0 to 3
+	i := j*32
+	IF a[i+31]
+		k[j] := 1
+	ELSE
+		k[j] := 0
+	FI
+ENDFOR
+k[MAX:4] := 0
+	</operation>
+	<instruction name="vpmovd2m"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm256_cvtepi32_epi8">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Convert</category>
+	<parameter varname="a" type="__m256i"/>
+	<description>Convert packed 32-bit integers in "a" to packed 8-bit integers with truncation, and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 7
+	i := 32*j
+	k := 8*j
+	dst[k+7:k] := Truncate_Int32_To_Int8(a[i+31:i])
+ENDFOR
+dst[MAX:64] := 0
+	</operation>
+	<instruction name="vpmovdb"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm256_mask_cvtepi32_epi8">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Convert</category>
+	<parameter varname="src" type="__m128i"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m256i"/>
+	<description>Convert packed 32-bit integers in "a" to packed 8-bit integers with truncation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 7
+	i := 32*j
+	l := 8*j
+	IF k[j]
+		dst[l+7:l] := Truncate_Int32_To_Int8(a[i+31:i])
+	ELSE
+		dst[l+7:l] := src[l+7:l]
+	FI
+ENDFOR
+dst[MAX:64] := 0
+	</operation>
+	<instruction name="vpmovdb"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="void" name="_mm256_mask_cvtepi32_storeu_epi8">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Convert</category>
+	<category>Store</category>
+	<parameter varname="base_addr" type="void*"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m256i"/>
+	<description>Convert packed 32-bit integers in "a" to packed 8-bit integers with truncation, and store the active results (those with their respective bit set in writemask "k") to unaligned memory at "base_addr".</description>
+	<operation>
+FOR j := 0 to 7
+	i := 32*j
+	l := 8*j
+	IF k[j]
+		MEM[base_addr+l+7:base_addr+l] := Truncate_Int32_To_Int8(a[i+31:i])
+	FI
+ENDFOR
+dst[MAX:64] := 0
+	</operation>
+	<instruction name="vpmovdb"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm256_maskz_cvtepi32_epi8">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Convert</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m256i"/>
+	<description>Convert packed 32-bit integers in "a" to packed 8-bit integers with truncation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 7
+	i := 32*j
+	l := 8*j
+	IF k[j]
+		dst[l+7:l] := Truncate_Int32_To_Int8(a[i+31:i])
+	ELSE
+		dst[l+7:l] := 0
+	FI
+ENDFOR
+dst[MAX:64] := 0
+	</operation>
+	<instruction name="vpmovdb"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm_cvtepi32_epi8">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Convert</category>
+	<parameter varname="a" type="__m128i"/>
+	<description>Convert packed 32-bit integers in "a" to packed 8-bit integers with truncation, and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 3
+	i := 32*j
+	k := 8*j
+	dst[k+7:k] := Truncate_Int32_To_Int8(a[i+31:i])
+ENDFOR
+dst[MAX:32] := 0
+	</operation>
+	<instruction name="vpmovdb"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm_mask_cvtepi32_epi8">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Convert</category>
+	<parameter varname="src" type="__m128i"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128i"/>
+	<description>Convert packed 32-bit integers in "a" to packed 8-bit integers with truncation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 3
+	i := 32*j
+	l := 8*j
+	IF k[j]
+		dst[l+7:l] := Truncate_Int32_To_Int8(a[i+31:i])
+	ELSE
+		dst[l+7:l] := src[l+7:l]
+	FI
+ENDFOR
+dst[MAX:32] := 0
+	</operation>
+	<instruction name="vpmovdb"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="void" name="_mm_mask_cvtepi32_storeu_epi8">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Convert</category>
+	<category>Store</category>
+	<parameter varname="base_addr" type="void*"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128i"/>
+	<description>Convert packed 32-bit integers in "a" to packed 8-bit integers with truncation, and store the active results (those with their respective bit set in writemask "k") to unaligned memory at "base_addr".</description>
+	<operation>
+FOR j := 0 to 3
+	i := 32*j
+	l := 8*j
+	IF k[j]
+		MEM[base_addr+l+7:base_addr+l] := Truncate_Int32_To_Int8(a[i+31:i])
+	FI
+ENDFOR
+dst[MAX:32] := 0
+	</operation>
+	<instruction name="vpmovdb"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm_maskz_cvtepi32_epi8">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Convert</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128i"/>
+	<description>Convert packed 32-bit integers in "a" to packed 8-bit integers with truncation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 3
+	i := 32*j
+	l := 8*j
+	IF k[j]
+		dst[l+7:l] := Truncate_Int32_To_Int8(a[i+31:i])
+	ELSE
+		dst[l+7:l] := 0
+	FI
+ENDFOR
+dst[MAX:32] := 0
+	</operation>
+	<instruction name="vpmovdb"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm256_cvtepi32_epi16">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Convert</category>
+	<parameter varname="a" type="__m256i"/>
+	<description>Convert packed 32-bit integers in "a" to packed 16-bit integers with truncation, and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 7
+	i := 32*j
+	k := 16*j
+	dst[k+15:k] := Truncate_Int32_To_Int16(a[i+31:i])
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vpmovdw"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm256_mask_cvtepi32_epi16">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Convert</category>
+	<parameter varname="src" type="__m128i"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m256i"/>
+	<description>Convert packed 32-bit integers in "a" to packed 16-bit integers with truncation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 7
+	i := 32*j
+	l := 16*j
+	IF k[j]
+		dst[l+15:l] := Truncate_Int32_To_Int16(a[i+31:i])
+	ELSE
+		dst[l+15:l] := src[l+15:l]
+	FI
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vpmovdw"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="void" name="_mm256_mask_cvtepi32_storeu_epi16">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Convert</category>
+	<category>Store</category>
+	<parameter varname="base_addr" type="void*"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m256i"/>
+	<description>Convert packed 32-bit integers in "a" to packed 16-bit integers with truncation, and store the active results (those with their respective bit set in writemask "k") to unaligned memory at "base_addr".</description>
+	<operation>
+FOR j := 0 to 7
+	i := 32*j
+	l := 16*j
+	IF k[j]
+		MEM[base_addr+l+15:base_addr+l] := Truncate_Int32_To_Int16(a[i+31:i])
+	FI
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vpmovdw"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm256_maskz_cvtepi32_epi16">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Convert</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m256i"/>
+	<description>Convert packed 32-bit integers in "a" to packed 16-bit integers with truncation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 7
+	i := 32*j
+	l := 16*j
+	IF k[j]
+		dst[l+15:l] := Truncate_Int32_To_Int16(a[i+31:i])
+	ELSE
+		dst[l+15:l] := 0
+	FI
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vpmovdw"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm_cvtepi32_epi16">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Convert</category>
+	<parameter varname="a" type="__m128i"/>
+	<description>Convert packed 32-bit integers in "a" to packed 16-bit integers with truncation, and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 3
+	i := 32*j
+	k := 16*j
+	dst[k+15:k] := Truncate_Int32_To_Int16(a[i+31:i])
+ENDFOR
+dst[MAX:64] := 0
+	</operation>
+	<instruction name="vpmovdw"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm_mask_cvtepi32_epi16">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Convert</category>
+	<parameter varname="src" type="__m128i"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128i"/>
+	<description>Convert packed 32-bit integers in "a" to packed 16-bit integers with truncation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 3
+	i := 32*j
+	l := 16*j
+	IF k[j]
+		dst[l+15:l] := Truncate_Int32_To_Int16(a[i+31:i])
+	ELSE
+		dst[l+15:l] := src[l+15:l]
+	FI
+ENDFOR
+dst[MAX:64] := 0
+	</operation>
+	<instruction name="vpmovdw"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="void" name="_mm_mask_cvtepi32_storeu_epi16">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Convert</category>
+	<category>Store</category>
+	<parameter varname="base_addr" type="void*"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128i"/>
+	<description>Convert packed 32-bit integers in "a" to packed 16-bit integers with truncation, and store the active results (those with their respective bit set in writemask "k") to unaligned memory at "base_addr".</description>
+	<operation>
+FOR j := 0 to 3
+	i := 32*j
+	l := 16*j
+	IF k[j]
+		MEM[base_addr+l+15:base_addr+l] := Truncate_Int32_To_Int16(a[i+31:i])
+	FI
+ENDFOR
+dst[MAX:64] := 0
+	</operation>
+	<instruction name="vpmovdw"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm_maskz_cvtepi32_epi16">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Convert</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128i"/>
+	<description>Convert packed 32-bit integers in "a" to packed 16-bit integers with truncation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 3
+	i := 32*j
+	l := 16*j
+	IF k[j]
+		dst[l+15:l] := Truncate_Int32_To_Int16(a[i+31:i])
+	ELSE
+		dst[l+15:l] := 0
+	FI
+ENDFOR
+dst[MAX:64] := 0
+	</operation>
+	<instruction name="vpmovdw"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256i" name="_mm256_movm_epi8">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512BW</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="k" type="__mmask32"/>
+	<description>Set each packed 8-bit integer in "dst" to all ones or all zeros based on the value of the corresponding bit in "k".</description>
+	<operation>
+FOR j := 0 to 31
+	i := j*8
+	IF k[j]
+		dst[i+7:i] := 0xFF
+	ELSE
+		dst[i+7:i] := 0
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vpmovm2b"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512i" name="_mm512_movm_epi8">
+	<type>Integer</type>
+	<CPUID>AVX512BW</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="k" type="__mmask64"/>
+	<description>Set each packed 8-bit integer in "dst" to all ones or all zeros based on the value of the corresponding bit in "k".</description>
+	<operation>
+FOR j := 0 to 63
+	i := j*8
+	IF k[j]
+		dst[i+7:i] := 0xFF
+	ELSE
+		dst[i+7:i] := 0
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name="vpmovm2b"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm_movm_epi8">
+	<type>Integer</type>
+	<CPUID>AVX512BW</CPUID>
+	<CPUID>AVX512VL</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="k" type="__mmask16"/>
+	<description>Set each packed 8-bit integer in "dst" to all ones or all zeros based on the value of the corresponding bit in "k".</description>
+	<operation>
+FOR j := 0 to 15
+	i := j*8
+	IF k[j]
+		dst[i+7:i] := 0xFF
+	ELSE
+		dst[i+7:i] := 0
+	FI
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vpmovm2b"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256i" name="_mm256_movm_epi32">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512DQ</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="k" type="__mmask8"/>
+	<description>Set each packed 32-bit integer in "dst" to all ones or all zeros based on the value of the corresponding bit in "k".</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*32
+	IF k[j]
+		dst[i+31:i] := 0xFFFFFFFF
+	ELSE
+		dst[i+31:i] := 0
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vpmovm2d"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512i" name="_mm512_movm_epi32">
+	<type>Integer</type>
+	<CPUID>AVX512DQ</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="k" type="__mmask16"/>
+	<description>Set each packed 32-bit integer in "dst" to all ones or all zeros based on the value of the corresponding bit in "k".</description>
+	<operation>
+FOR j := 0 to 15
+	i := j*32
+	IF k[j]
+		dst[i+31:i] := 0xFFFFFFFF
+	ELSE
+		dst[i+31:i] := 0
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name="vpmovm2d"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm_movm_epi32">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512DQ</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="k" type="__mmask8"/>
+	<description>Set each packed 32-bit integer in "dst" to all ones or all zeros based on the value of the corresponding bit in "k".</description>
+	<operation>
+FOR j := 0 to 3
+	i := j*32
+	IF k[j]
+		dst[i+31:i] := 0xFFFFFFFF
+	ELSE
+		dst[i+31:i] := 0
+	FI
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vpmovm2d"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256i" name="_mm256_movm_epi64">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512DQ</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="k" type="__mmask8"/>
+	<description>Set each packed 64-bit integer in "dst" to all ones or all zeros based on the value of the corresponding bit in "k".</description>
+	<operation>
+FOR j := 0 to 3
+	i := j*64
+	IF k[j]
+		dst[i+63:i] := 0xFFFFFFFFFFFFFFFF
+	ELSE
+		dst[i+63:i] := 0
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vpmovm2q"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512i" name="_mm512_movm_epi64">
+	<type>Integer</type>
+	<CPUID>AVX512DQ</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="k" type="__mmask8"/>
+	<description>Set each packed 64-bit integer in "dst" to all ones or all zeros based on the value of the corresponding bit in "k".</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*64
+	IF k[j]
+		dst[i+63:i] := 0xFFFFFFFFFFFFFFFF
+	ELSE
+		dst[i+63:i] := 0
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name="vpmovm2q"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm_movm_epi64">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512DQ</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="k" type="__mmask8"/>
+	<description>Set each packed 64-bit integer in "dst" to all ones or all zeros based on the value of the corresponding bit in "k".</description>
+	<operation>
+FOR j := 0 to 1
+	i := j*64
+	IF k[j]
+		dst[i+63:i] := 0xFFFFFFFFFFFFFFFF
+	ELSE
+		dst[i+63:i] := 0
+	FI
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vpmovm2q"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256i" name="_mm256_movm_epi16">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512BW</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="k" type="__mmask16"/>
+	<description>Set each packed 16-bit integer in "dst" to all ones or all zeros based on the value of the corresponding bit in "k".</description>
+	<operation>
+FOR j := 0 to 15
+	i := j*16
+	IF k[j]
+		dst[i+15:i] := 0xFFFF
+	ELSE
+		dst[i+15:i] := 0
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vpmovm2w"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512i" name="_mm512_movm_epi16">
+	<type>Integer</type>
+	<CPUID>AVX512BW</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="k" type="__mmask32"/>
+	<description>Set each packed 16-bit integer in "dst" to all ones or all zeros based on the value of the corresponding bit in "k".</description>
+	<operation>
+FOR j := 0 to 31
+	i := j*16
+	IF k[j]
+		dst[i+15:i] := 0xFFFF
+	ELSE
+		dst[i+15:i] := 0
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name="vpmovm2w"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm_movm_epi16">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512BW</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="k" type="__mmask8"/>
+	<description>Set each packed 16-bit integer in "dst" to all ones or all zeros based on the value of the corresponding bit in "k".</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*16
+	IF k[j]
+		dst[i+15:i] := 0xFFFF
+	ELSE
+		dst[i+15:i] := 0
+	FI
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vpmovm2w"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__mmask8" name="_mm256_movepi64_mask">
+	<type>Integer</type>
+	<type>Mask</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512DQ</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="a" type="__m256i"/>
+	<description>Set each bit of mask register "k" based on the most significant bit of the corresponding packed 64-bit integer in "a".</description>
+	<operation>
+FOR j := 0 to 3
+	i := j*64
+	IF a[i+63]
+		k[j] := 1
+	ELSE
+		k[j] := 0
+	FI
+ENDFOR
+k[MAX:4] := 0
+	</operation>
+	<instruction name="vpmovq2m"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__mmask8" name="_mm512_movepi64_mask">
+	<type>Integer</type>
+	<type>Mask</type>
+	<CPUID>AVX512DQ</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="a" type="__m512i"/>
+	<description>Set each bit of mask register "k" based on the most significant bit of the corresponding packed 64-bit integer in "a".</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*64
+	IF a[i+63]
+		k[j] := 1
+	ELSE
+		k[j] := 0
+	FI
+ENDFOR
+k[MAX:8] := 0
+	</operation>
+	<instruction name="vpmovq2m"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__mmask8" name="_mm_movepi64_mask">
+	<type>Integer</type>
+	<type>Mask</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512DQ</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="a" type="__m128i"/>
+	<description>Set each bit of mask register "k" based on the most significant bit of the corresponding packed 64-bit integer in "a".</description>
+	<operation>
+FOR j := 0 to 1
+	i := j*64
+	IF a[i+63]
+		k[j] := 1
+	ELSE
+		k[j] := 0
+	FI
+ENDFOR
+k[MAX:2] := 0
+	</operation>
+	<instruction name="vpmovq2m"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm256_cvtepi64_epi8">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Convert</category>
+	<parameter varname="a" type="__m256i"/>
+	<description>Convert packed 64-bit integers in "a" to packed 8-bit integers with truncation, and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 3
+	i := 64*j
+	k := 8*j
+	dst[k+7:k] := Truncate_Int64_To_Int8(a[i+63:i])
+ENDFOR
+dst[MAX:32] := 0
+	</operation>
+	<instruction name="vpmovqb"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm256_mask_cvtepi64_epi8">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Convert</category>
+	<parameter varname="src" type="__m128i"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m256i"/>
+	<description>Convert packed 64-bit integers in "a" to packed 8-bit integers with truncation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 3
+	i := 64*j
+	l := 8*j
+	IF k[j]
+		dst[l+7:l] := Truncate_Int64_To_Int8(a[i+63:i])
+	ELSE
+		dst[l+7:l] := src[l+7:l]
+	FI
+ENDFOR
+dst[MAX:32] := 0
+	</operation>
+	<instruction name="vpmovqb"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="void" name="_mm256_mask_cvtepi64_storeu_epi8">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Convert</category>
+	<category>Store</category>
+	<parameter varname="base_addr" type="void*"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m256i"/>
+	<description>Convert packed 64-bit integers in "a" to packed 8-bit integers with truncation, and store the active results (those with their respective bit set in writemask "k") to unaligned memory at "base_addr".</description>
+	<operation>
+FOR j := 0 to 3
+	i := 64*j
+	l := 8*j
+	IF k[j]
+		MEM[base_addr+l+7:base_addr+l] := Truncate_Int64_To_Int8(a[i+63:i])
+	FI
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vpmovqb"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm256_maskz_cvtepi64_epi8">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Convert</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m256i"/>
+	<description>Convert packed 64-bit integers in "a" to packed 8-bit integers with truncation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 3
+	i := 64*j
+	l := 8*j
+	IF k[j]
+		dst[l+7:l] := Truncate_Int64_To_Int8(a[i+63:i])
+	ELSE
+		dst[l+7:l] := 0
+	FI
+ENDFOR
+dst[MAX:32] := 0
+	</operation>
+	<instruction name="vpmovqb"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm_cvtepi64_epi8">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Convert</category>
+	<parameter varname="a" type="__m128i"/>
+	<description>Convert packed 64-bit integers in "a" to packed 8-bit integers with truncation, and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 1
+	i := 64*j
+	k := 8*j
+	dst[k+7:k] := Truncate_Int64_To_Int8(a[i+63:i])
+ENDFOR
+dst[MAX:16] := 0
+	</operation>
+	<instruction name="vpmovqb"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm_mask_cvtepi64_epi8">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Convert</category>
+	<parameter varname="src" type="__m128i"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128i"/>
+	<description>Convert packed 64-bit integers in "a" to packed 8-bit integers with truncation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 1
+	i := 64*j
+	l := 8*j
+	IF k[j]
+		dst[l+7:l] := Truncate_Int64_To_Int8(a[i+63:i])
+	ELSE
+		dst[l+7:l] := src[l+7:l]
+	FI
+ENDFOR
+dst[MAX:16] := 0
+	</operation>
+	<instruction name="vpmovqb"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="void" name="_mm_mask_cvtepi64_storeu_epi8">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Convert</category>
+	<category>Store</category>
+	<parameter varname="base_addr" type="void*"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128i"/>
+	<description>Convert packed 64-bit integers in "a" to packed 8-bit integers with truncation, and store the active results (those with their respective bit set in writemask "k") to unaligned memory at "base_addr".</description>
+	<operation>
+FOR j := 0 to 1
+	i := 64*j
+	l := 8*j
+	IF k[j]
+		MEM[base_addr+l+7:base_addr+l] := Truncate_Int64_To_Int8(a[i+63:i])
+	FI
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vpmovqb"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm_maskz_cvtepi64_epi8">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Convert</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128i"/>
+	<description>Convert packed 64-bit integers in "a" to packed 8-bit integers with truncation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 1
+	i := 64*j
+	l := 8*j
+	IF k[j]
+		dst[l+7:l] := Truncate_Int64_To_Int8(a[i+63:i])
+	ELSE
+		dst[l+7:l] := 0
+	FI
+ENDFOR
+dst[MAX:16] := 0
+	</operation>
+	<instruction name="vpmovqb"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm256_cvtepi64_epi32">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Convert</category>
+	<parameter varname="a" type="__m256i"/>
+	<description>Convert packed 64-bit integers in "a" to packed 32-bit integers with truncation, and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 3
+	i := 64*j
+	k := 32*j
+	dst[k+31:k] := Truncate_Int64_To_Int32(a[i+63:i])
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vpmovqd"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm256_mask_cvtepi64_epi32">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Convert</category>
+	<parameter varname="src" type="__m128i"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m256i"/>
+	<description>Convert packed 64-bit integers in "a" to packed 32-bit integers with truncation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 3
+	i := 64*j
+	l := 32*j
+	IF k[j]
+		dst[l+31:l] := Truncate_Int64_To_Int32(a[i+63:i])
+	ELSE
+		dst[l+31:l] := src[l+31:l]
+	FI
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vpmovqd"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="void" name="_mm256_mask_cvtepi64_storeu_epi32">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Convert</category>
+	<category>Store</category>
+	<parameter varname="base_addr" type="void*"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m256i"/>
+	<description>Convert packed 64-bit integers in "a" to packed 32-bit integers with truncation, and store the active results (those with their respective bit set in writemask "k") to unaligned memory at "base_addr".</description>
+	<operation>
+FOR j := 0 to 3
+	i := 64*j
+	l := 32*j
+	IF k[j]
+		MEM[base_addr+l+31:base_addr+l] := Truncate_Int64_To_Int32(a[i+63:i])
+	FI
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vpmovqd"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm256_maskz_cvtepi64_epi32">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Convert</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m256i"/>
+	<description>Convert packed 64-bit integers in "a" to packed 32-bit integers with truncation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 3
+	i := 64*j
+	l := 32*j
+	IF k[j]
+		dst[l+31:l] := Truncate_Int64_To_Int32(a[i+63:i])
+	ELSE
+		dst[l+31:l] := 0
+	FI
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vpmovqd"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm_cvtepi64_epi32">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Convert</category>
+	<parameter varname="a" type="__m128i"/>
+	<description>Convert packed 64-bit integers in "a" to packed 32-bit integers with truncation, and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 1
+	i := 64*j
+	k := 32*j
+	dst[k+31:k] := Truncate_Int64_To_Int32(a[i+63:i])
+ENDFOR
+dst[MAX:64] := 0
+	</operation>
+	<instruction name="vpmovqd"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm_mask_cvtepi64_epi32">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Convert</category>
+	<parameter varname="src" type="__m128i"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128i"/>
+	<description>Convert packed 64-bit integers in "a" to packed 32-bit integers with truncation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 1
+	i := 64*j
+	l := 32*j
+	IF k[j]
+		dst[l+31:l] := Truncate_Int64_To_Int32(a[i+63:i])
+	ELSE
+		dst[l+31:l] := src[l+31:l]
+	FI
+ENDFOR
+dst[MAX:64] := 0
+	</operation>
+	<instruction name="vpmovqd"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="void" name="_mm_mask_cvtepi64_storeu_epi32">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Convert</category>
+	<category>Store</category>
+	<parameter varname="base_addr" type="void*"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128i"/>
+	<description>Convert packed 64-bit integers in "a" to packed 32-bit integers with truncation, and store the active results (those with their respective bit set in writemask "k") to unaligned memory at "base_addr".</description>
+	<operation>
+FOR j := 0 to 1
+	i := 64*j
+	l := 32*j
+	IF k[j]
+		MEM[base_addr+l+31:base_addr+l] := Truncate_Int64_To_Int32(a[i+63:i])
+	FI
+ENDFOR
+
+	</operation>
+	<instruction name="vpmovqd"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm_maskz_cvtepi64_epi32">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Convert</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128i"/>
+	<description>Convert packed 64-bit integers in "a" to packed 32-bit integers with truncation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 1
+	i := 64*j
+	l := 32*j
+	IF k[j]
+		dst[l+31:l] := Truncate_Int64_To_Int32(a[i+63:i])
+	ELSE
+		dst[l+31:l] := 0
+	FI
+ENDFOR
+dst[MAX:64] := 0
+	</operation>
+	<instruction name="vpmovqd"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm256_cvtepi64_epi16">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Convert</category>
+	<parameter varname="a" type="__m256i"/>
+	<description>Convert packed 64-bit integers in "a" to packed 16-bit integers with truncation, and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 3
+	i := 64*j
+	k := 16*j
+	dst[k+15:k] := Truncate_Int64_To_Int16(a[i+63:i])
+ENDFOR
+dst[MAX:64] := 0
+	</operation>
+	<instruction name="vpmovqw"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm256_mask_cvtepi64_epi16">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Convert</category>
+	<parameter varname="src" type="__m128i"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m256i"/>
+	<description>Convert packed 64-bit integers in "a" to packed 16-bit integers with truncation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 3
+	i := 64*j
+	l := 16*j
+	IF k[j]
+		dst[l+15:l] := Truncate_Int64_To_Int16(a[i+63:i])
+	ELSE
+		dst[l+15:l] := src[l+15:l]
+	FI
+ENDFOR
+dst[MAX:64] := 0
+	</operation>
+	<instruction name="vpmovqw"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="void" name="_mm256_mask_cvtepi64_storeu_epi16">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Convert</category>
+	<category>Store</category>
+	<parameter varname="base_addr" type="void*"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m256i"/>
+	<description>Convert packed 64-bit integers in "a" to packed 16-bit integers with truncation, and store the active results (those with their respective bit set in writemask "k") to unaligned memory at "base_addr".</description>
+	<operation>
+FOR j := 0 to 3
+	i := 64*j
+	l := 16*j
+	IF k[j]
+		MEM[base_addr+l+15:base_addr+l] := Truncate_Int64_To_Int16(a[i+63:i])
+	FI
+ENDFOR
+
+	</operation>
+	<instruction name="vpmovqw"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm256_maskz_cvtepi64_epi16">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Convert</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m256i"/>
+	<description>Convert packed 64-bit integers in "a" to packed 16-bit integers with truncation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 3
+	i := 64*j
+	l := 16*j
+	IF k[j]
+		dst[l+15:l] := Truncate_Int64_To_Int16(a[i+63:i])
+	ELSE
+		dst[l+15:l] := 0
+	FI
+ENDFOR
+dst[MAX:64] := 0
+	</operation>
+	<instruction name="vpmovqw"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm_cvtepi64_epi16">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Convert</category>
+	<parameter varname="a" type="__m128i"/>
+	<description>Convert packed 64-bit integers in "a" to packed 16-bit integers with truncation, and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 1
+	i := 64*j
+	k := 16*j
+	dst[k+15:k] := Truncate_Int64_To_Int16(a[i+63:i])
+ENDFOR
+dst[MAX:32] := 0
+	</operation>
+	<instruction name="vpmovqw"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm_mask_cvtepi64_epi16">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Convert</category>
+	<parameter varname="src" type="__m128i"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128i"/>
+	<description>Convert packed 64-bit integers in "a" to packed 16-bit integers with truncation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 1
+	i := 64*j
+	l := 16*j
+	IF k[j]
+		dst[l+15:l] := Truncate_Int64_To_Int16(a[i+63:i])
+	ELSE
+		dst[l+15:l] := src[l+15:l]
+	FI
+ENDFOR
+dst[MAX:32] := 0
+	</operation>
+	<instruction name="vpmovqw"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="void" name="_mm_mask_cvtepi64_storeu_epi16">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Convert</category>
+	<category>Store</category>
+	<parameter varname="base_addr" type="void*"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128i"/>
+	<description>Convert packed 64-bit integers in "a" to packed 16-bit integers with truncation, and store the active results (those with their respective bit set in writemask "k") to unaligned memory at "base_addr".</description>
+	<operation>
+FOR j := 0 to 1
+	i := 64*j
+	l := 16*j
+	IF k[j]
+		MEM[base_addr+l+15:base_addr+l] := Truncate_Int64_To_Int16(a[i+63:i])
+	FI
+ENDFOR
+
+	</operation>
+	<instruction name="vpmovqw"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm_maskz_cvtepi64_epi16">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Convert</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128i"/>
+	<description>Convert packed 64-bit integers in "a" to packed 16-bit integers with truncation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 1
+	i := 64*j
+	l := 16*j
+	IF k[j]
+		dst[l+15:l] := Truncate_Int64_To_Int16(a[i+63:i])
+	ELSE
+		dst[l+15:l] := 0
+	FI
+ENDFOR
+dst[MAX:32] := 0
+	</operation>
+	<instruction name="vpmovqw"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm256_cvtsepi32_epi8">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Convert</category>
+	<parameter varname="a" type="__m256i"/>
+	<description>Convert packed 32-bit integers in "a" to packed 8-bit integers with signed saturation, and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 7
+	i := 32*j
+	k := 8*j
+	dst[k+7:k] := Saturate_Int32_To_Int8(a[i+31:i])
+ENDFOR
+dst[MAX:64] := 0
+	</operation>
+	<instruction name="vpmovsdb"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm256_mask_cvtsepi32_epi8">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Convert</category>
+	<parameter varname="src" type="__m128i"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m256i"/>
+	<description>Convert packed 32-bit integers in "a" to packed 8-bit integers with signed saturation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 7
+	i := 32*j
+	l := 8*j
+	IF k[j]
+		dst[l+7:l] := Saturate_Int32_To_Int8(a[i+31:i])
+	ELSE
+		dst[l+7:l] := src[l+7:l]
+	FI
+ENDFOR
+dst[MAX:64] := 0
+	</operation>
+	<instruction name="vpmovsdb"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="void" name="_mm256_mask_cvtsepi32_storeu_epi8">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Convert</category>
+	<category>Store</category>
+	<parameter varname="base_addr" type="void*"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m256i"/>
+	<description>Convert packed 32-bit integers in "a" to packed 8-bit integers with signed saturation, and store the active results (those with their respective bit set in writemask "k") to unaligned memory at "base_addr".</description>
+	<operation>
+FOR j := 0 to 7
+	i := 32*j
+	l := 8*j
+	IF k[j]
+		MEM[base_addr+l+7:base_addr+l] := Saturate_Int32_To_Int8(a[i+31:i])
+	FI
+ENDFOR
+
+	</operation>
+	<instruction name="vpmovsdb"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm256_maskz_cvtsepi32_epi8">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Convert</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m256i"/>
+	<description>Convert packed 32-bit integers in "a" to packed 8-bit integers with signed saturation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 7
+	i := 32*j
+	l := 8*j
+	IF k[j]
+		dst[l+7:l] := Saturate_Int32_To_Int8(a[i+31:i])
+	ELSE
+		dst[l+7:l] := 0
+	FI
+ENDFOR
+dst[MAX:64] := 0
+	</operation>
+	<instruction name="vpmovsdb"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm_cvtsepi32_epi8">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Convert</category>
+	<parameter varname="a" type="__m128i"/>
+	<description>Convert packed 32-bit integers in "a" to packed 8-bit integers with signed saturation, and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 3
+	i := 32*j
+	k := 8*j
+	dst[k+7:k] := Saturate_Int32_To_Int8(a[i+31:i])
+ENDFOR
+dst[MAX:32] := 0
+	</operation>
+	<instruction name="vpmovsdb"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm_mask_cvtsepi32_epi8">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Convert</category>
+	<parameter varname="src" type="__m128i"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128i"/>
+	<description>Convert packed 32-bit integers in "a" to packed 8-bit integers with signed saturation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 3
+	i := 32*j
+	l := 8*j
+	IF k[j]
+		dst[l+7:l] := Saturate_Int32_To_Int8(a[i+31:i])
+	ELSE
+		dst[l+7:l] := src[l+7:l]
+	FI
+ENDFOR
+dst[MAX:32] := 0
+	</operation>
+	<instruction name="vpmovsdb"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="void" name="_mm_mask_cvtsepi32_storeu_epi8">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Convert</category>
+	<category>Store</category>
+	<parameter varname="base_addr" type="void*"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128i"/>
+	<description>Convert packed 32-bit integers in "a" to packed 8-bit integers with signed saturation, and store the active results (those with their respective bit set in writemask "k") to unaligned memory at "base_addr".</description>
+	<operation>
+FOR j := 0 to 3
+	i := 32*j
+	l := 8*j
+	IF k[j]
+		MEM[base_addr+l+7:base_addr+l] := Saturate_Int32_To_Int8(a[i+31:i])
+	FI
+ENDFOR
+
+	</operation>
+	<instruction name="vpmovsdb"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm_maskz_cvtsepi32_epi8">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Convert</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128i"/>
+	<description>Convert packed 32-bit integers in "a" to packed 8-bit integers with signed saturation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 3
+	i := 32*j
+	l := 8*j
+	IF k[j]
+		dst[l+7:l] := Saturate_Int32_To_Int8(a[i+31:i])
+	ELSE
+		dst[l+7:l] := 0
+	FI
+ENDFOR
+dst[MAX:32] := 0
+	</operation>
+	<instruction name="vpmovsdb"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm256_cvtsepi32_epi16">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Convert</category>
+	<parameter varname="a" type="__m256i"/>
+	<description>Convert packed 32-bit integers in "a" to packed 16-bit integers with signed saturation, and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 7
+	i := 32*j
+	k := 16*j
+	dst[k+15:k] := Saturate_Int32_To_Int16(a[i+31:i])
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vpmovsdw"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm256_mask_cvtsepi32_epi16">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Convert</category>
+	<parameter varname="src" type="__m128i"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m256i"/>
+	<description>Convert packed 32-bit integers in "a" to packed 16-bit integers with signed saturation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 7
+	i := 32*j
+	l := 16*j
+	IF k[j]
+		dst[l+15:l] := Saturate_Int32_To_Int16(a[i+31:i])
+	ELSE
+		dst[l+15:l] := src[l+15:l]
+	FI
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vpmovsdw"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="void" name="_mm256_mask_cvtsepi32_storeu_epi16">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Convert</category>
+	<category>Store</category>
+	<parameter varname="base_addr" type="void*"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m256i"/>
+	<description>Convert packed 32-bit integers in "a" to packed 16-bit integers with signed saturation, and store the active results (those with their respective bit set in writemask "k") to unaligned memory at "base_addr".</description>
+	<operation>
+FOR j := 0 to 7
+	i := 32*j
+	l := 16*j
+	IF k[j]
+		MEM[base_addr+l+15:base_addr+l] := Saturate_Int32_To_Int16(a[i+31:i])
+	FI
+ENDFOR
+
+	</operation>
+	<instruction name="vpmovsdw"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm256_maskz_cvtsepi32_epi16">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Convert</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m256i"/>
+	<description>Convert packed 32-bit integers in "a" to packed 16-bit integers with signed saturation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 7
+	i := 32*j
+	l := 16*j
+	IF k[j]
+		dst[l+15:l] := Saturate_Int32_To_Int16(a[i+31:i])
+	ELSE
+		dst[l+15:l] := 0
+	FI
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vpmovsdw"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm_cvtsepi32_epi16">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Convert</category>
+	<parameter varname="a" type="__m128i"/>
+	<description>Convert packed 32-bit integers in "a" to packed 16-bit integers with signed saturation, and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 3
+	i := 32*j
+	k := 16*j
+	dst[k+15:k] := Saturate_Int32_To_Int16(a[i+31:i])
+ENDFOR
+dst[MAX:64] := 0
+	</operation>
+	<instruction name="vpmovsdw"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm_mask_cvtsepi32_epi16">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Convert</category>
+	<parameter varname="src" type="__m128i"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128i"/>
+	<description>Convert packed 32-bit integers in "a" to packed 16-bit integers with signed saturation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 3
+	i := 32*j
+	l := 16*j
+	IF k[j]
+		dst[l+15:l] := Saturate_Int32_To_Int16(a[i+31:i])
+	ELSE
+		dst[l+15:l] := src[l+15:l]
+	FI
+ENDFOR
+dst[MAX:64] := 0
+	</operation>
+	<instruction name="vpmovsdw"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="void" name="_mm_mask_cvtsepi32_storeu_epi16">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Convert</category>
+	<category>Store</category>
+	<parameter varname="base_addr" type="void*"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128i"/>
+	<description>Convert packed 32-bit integers in "a" to packed 16-bit integers with signed saturation, and store the active results (those with their respective bit set in writemask "k") to unaligned memory at "base_addr".</description>
+	<operation>
+FOR j := 0 to 3
+	i := 32*j
+	l := 16*j
+	IF k[j]
+		MEM[base_addr+l+15:base_addr+l] := Saturate_Int32_To_Int16(a[i+31:i])
+	FI
+ENDFOR
+
+	</operation>
+	<instruction name="vpmovsdw"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm_maskz_cvtsepi32_epi16">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Convert</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128i"/>
+	<description>Convert packed 32-bit integers in "a" to packed 16-bit integers with signed saturation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 3
+	i := 32*j
+	l := 16*j
+	IF k[j]
+		dst[l+15:l] := Saturate_Int32_To_Int16(a[i+31:i])
+	ELSE
+		dst[l+15:l] := 0
+	FI
+ENDFOR
+dst[MAX:64] := 0
+	</operation>
+	<instruction name="vpmovsdw"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm256_cvtsepi64_epi8">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Convert</category>
+	<parameter varname="a" type="__m256i"/>
+	<description>Convert packed 64-bit integers in "a" to packed 8-bit integers with signed saturation, and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 3
+	i := 64*j
+	k := 8*j
+	dst[k+7:k] := Saturate_Int64_To_Int8(a[i+63:i])
+ENDFOR
+dst[MAX:32] := 0
+	</operation>
+	<instruction name="vpmovsqb"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm256_mask_cvtsepi64_epi8">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Convert</category>
+	<parameter varname="src" type="__m128i"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m256i"/>
+	<description>Convert packed 64-bit integers in "a" to packed 8-bit integers with signed saturation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 3
+	i := 64*j
+	l := 8*j
+	IF k[j]
+		dst[l+7:l] := Saturate_Int64_To_Int8(a[i+63:i])
+	ELSE
+		dst[l+7:l] := src[l+7:l]
+	FI
+ENDFOR
+dst[MAX:32] := 0
+	</operation>
+	<instruction name="vpmovsqb"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="void" name="_mm256_mask_cvtsepi64_storeu_epi8">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Convert</category>
+	<category>Store</category>
+	<parameter varname="base_addr" type="void*"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m256i"/>
+	<description>Convert packed 64-bit integers in "a" to packed 8-bit integers with signed saturation, and store the active results (those with their respective bit set in writemask "k") to unaligned memory at "base_addr".</description>
+	<operation>
+FOR j := 0 to 3
+	i := 64*j
+	l := 8*j
+	IF k[j]
+		MEM[base_addr+l+7:base_addr+l] := Saturate_Int64_To_Int8(a[i+63:i])
+	FI
+ENDFOR
+
+	</operation>
+	<instruction name="vpmovsqb"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm256_maskz_cvtsepi64_epi8">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Convert</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m256i"/>
+	<description>Convert packed 64-bit integers in "a" to packed 8-bit integers with signed saturation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 3
+	i := 64*j
+	l := 8*j
+	IF k[j]
+		dst[l+7:l] := Saturate_Int64_To_Int8(a[i+63:i])
+	ELSE
+		dst[l+7:l] := 0
+	FI
+ENDFOR
+dst[MAX:32] := 0
+	</operation>
+	<instruction name="vpmovsqb"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm_cvtsepi64_epi8">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Convert</category>
+	<parameter varname="a" type="__m128i"/>
+	<description>Convert packed 64-bit integers in "a" to packed 8-bit integers with signed saturation, and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 1
+	i := 64*j
+	k := 8*j
+	dst[k+7:k] := Saturate_Int64_To_Int8(a[i+63:i])
+ENDFOR
+dst[MAX:16] := 0
+	</operation>
+	<instruction name="vpmovsqb"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm_mask_cvtsepi64_epi8">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Convert</category>
+	<parameter varname="src" type="__m128i"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128i"/>
+	<description>Convert packed 64-bit integers in "a" to packed 8-bit integers with signed saturation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 1
+	i := 64*j
+	l := 8*j
+	IF k[j]
+		dst[l+7:l] := Saturate_Int64_To_Int8(a[i+63:i])
+	ELSE
+		dst[l+7:l] := src[l+7:l]
+	FI
+ENDFOR
+dst[MAX:16] := 0
+	</operation>
+	<instruction name="vpmovsqb"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="void" name="_mm_mask_cvtsepi64_storeu_epi8">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Convert</category>
+	<category>Store</category>
+	<parameter varname="base_addr" type="void*"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128i"/>
+	<description>Convert packed 64-bit integers in "a" to packed 8-bit integers with signed saturation, and store the active results (those with their respective bit set in writemask "k") to unaligned memory at "base_addr".</description>
+	<operation>
+FOR j := 0 to 1
+	i := 64*j
+	l := 8*j
+	IF k[j]
+		MEM[base_addr+l+7:base_addr+l] := Saturate_Int64_To_Int8(a[i+63:i])
+	FI
+ENDFOR
+
+	</operation>
+	<instruction name="vpmovsqb"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm_maskz_cvtsepi64_epi8">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Convert</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128i"/>
+	<description>Convert packed 64-bit integers in "a" to packed 8-bit integers with signed saturation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 1
+	i := 64*j
+	l := 8*j
+	IF k[j]
+		dst[l+7:l] := Saturate_Int64_To_Int8(a[i+63:i])
+	ELSE
+		dst[l+7:l] := 0
+	FI
+ENDFOR
+dst[MAX:16] := 0
+	</operation>
+	<instruction name="vpmovsqb"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm256_cvtsepi64_epi32">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Convert</category>
+	<parameter varname="a" type="__m256i"/>
+	<description>Convert packed 64-bit integers in "a" to packed 32-bit integers with signed saturation, and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 3
+	i := 64*j
+	k := 32*j
+	dst[k+31:k] := Saturate_Int64_To_Int32(a[i+63:i])
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vpmovsqd"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm256_mask_cvtsepi64_epi32">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Convert</category>
+	<parameter varname="src" type="__m128i"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m256i"/>
+	<description>Convert packed 64-bit integers in "a" to packed 32-bit integers with signed saturation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 3
+	i := 64*j
+	l := 32*j
+	IF k[j]
+		dst[l+31:l] := Saturate_Int64_To_Int32(a[i+63:i])
+	ELSE
+		dst[l+31:l] := src[l+31:l]
+	FI
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vpmovsqd"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="void" name="_mm256_mask_cvtsepi64_storeu_epi32">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Convert</category>
+	<category>Store</category>
+	<parameter varname="base_addr" type="void*"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m256i"/>
+	<description>Convert packed 64-bit integers in "a" to packed 32-bit integers with signed saturation, and store the active results (those with their respective bit set in writemask "k") to unaligned memory at "base_addr".</description>
+	<operation>
+FOR j := 0 to 3
+	i := 64*j
+	l := 32*j
+	IF k[j]
+		MEM[base_addr+l+31:base_addr+l] := Saturate_Int64_To_Int32(a[i+63:i])
+	FI
+ENDFOR
+
+	</operation>
+	<instruction name="vpmovsqd"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm256_maskz_cvtsepi64_epi32">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Convert</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m256i"/>
+	<description>Convert packed 64-bit integers in "a" to packed 32-bit integers with signed saturation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 3
+	i := 64*j
+	l := 32*j
+	IF k[j]
+		dst[l+31:l] := Saturate_Int64_To_Int32(a[i+63:i])
+	ELSE
+		dst[l+31:l] := 0
+	FI
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vpmovsqd"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm_cvtsepi64_epi32">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Convert</category>
+	<parameter varname="a" type="__m128i"/>
+	<description>Convert packed 64-bit integers in "a" to packed 32-bit integers with signed saturation, and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 1
+	i := 64*j
+	k := 32*j
+	dst[k+31:k] := Saturate_Int64_To_Int32(a[i+63:i])
+ENDFOR
+dst[MAX:64] := 0
+	</operation>
+	<instruction name="vpmovsqd"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm_mask_cvtsepi64_epi32">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Convert</category>
+	<parameter varname="src" type="__m128i"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128i"/>
+	<description>Convert packed 64-bit integers in "a" to packed 32-bit integers with signed saturation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 1
+	i := 64*j
+	l := 32*j
+	IF k[j]
+		dst[l+31:l] := Saturate_Int64_To_Int32(a[i+63:i])
+	ELSE
+		dst[l+31:l] := src[l+31:l]
+	FI
+ENDFOR
+dst[MAX:64] := 0
+	</operation>
+	<instruction name="vpmovsqd"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="void" name="_mm_mask_cvtsepi64_storeu_epi32">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Convert</category>
+	<category>Store</category>
+	<parameter varname="base_addr" type="void*"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128i"/>
+	<description>Convert packed 64-bit integers in "a" to packed 32-bit integers with signed saturation, and store the active results (those with their respective bit set in writemask "k") to unaligned memory at "base_addr".</description>
+	<operation>
+FOR j := 0 to 1
+	i := 64*j
+	l := 32*j
+	IF k[j]
+		MEM[base_addr+l+31:base_addr+l] := Saturate_Int64_To_Int32(a[i+63:i])
+	FI
+ENDFOR
+dst[MAX:64] := 0
+	</operation>
+	<instruction name="vpmovsqd"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm_maskz_cvtsepi64_epi32">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Convert</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128i"/>
+	<description>Convert packed 64-bit integers in "a" to packed 32-bit integers with signed saturation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 1
+	i := 64*j
+	l := 32*j
+	IF k[j]
+		dst[l+31:l] := Saturate_Int64_To_Int32(a[i+63:i])
+	ELSE
+		dst[l+31:l] := 0
+	FI
+ENDFOR
+dst[MAX:64] := 0
+	</operation>
+	<instruction name="vpmovsqd"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm256_cvtsepi64_epi16">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Convert</category>
+	<parameter varname="a" type="__m256i"/>
+	<description>Convert packed 64-bit integers in "a" to packed 16-bit integers with signed saturation, and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 3
+	i := 64*j
+	k := 16*j
+	dst[k+15:k] := Saturate_Int64_To_Int16(a[i+63:i])
+ENDFOR
+dst[MAX:64] := 0
+	</operation>
+	<instruction name="vpmovsqw"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm256_mask_cvtsepi64_epi16">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Convert</category>
+	<parameter varname="src" type="__m128i"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m256i"/>
+	<description>Convert packed 64-bit integers in "a" to packed 16-bit integers with signed saturation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 3
+	i := 64*j
+	l := 16*j
+	IF k[j]
+		dst[l+15:l] := Saturate_Int64_To_Int16(a[i+63:i])
+	ELSE
+		dst[l+15:l] := src[l+15:l]
+	FI
+ENDFOR
+dst[MAX:64] := 0
+	</operation>
+	<instruction name="vpmovsqw"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="void" name="_mm256_mask_cvtsepi64_storeu_epi16">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Convert</category>
+	<category>Store</category>
+	<parameter varname="base_addr" type="void*"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m256i"/>
+	<description>Convert packed 64-bit integers in "a" to packed 16-bit integers with signed saturation, and store the active results (those with their respective bit set in writemask "k") to unaligned memory at "base_addr".</description>
+	<operation>
+FOR j := 0 to 3
+	i := 64*j
+	l := 16*j
+	IF k[j]
+		MEM[base_addr+l+15:base_addr+l] := Saturate_Int64_To_Int16(a[i+63:i])
+	FI
+ENDFOR
+dst[MAX:64] := 0
+	</operation>
+	<instruction name="vpmovsqw"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm256_maskz_cvtsepi64_epi16">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Convert</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m256i"/>
+	<description>Convert packed 64-bit integers in "a" to packed 16-bit integers with signed saturation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 3
+	i := 64*j
+	l := 16*j
+	IF k[j]
+		dst[l+15:l] := Saturate_Int64_To_Int16(a[i+63:i])
+	ELSE
+		dst[l+15:l] := 0
+	FI
+ENDFOR
+dst[MAX:64] := 0
+	</operation>
+	<instruction name="vpmovsqw"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm_cvtsepi64_epi16">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Convert</category>
+	<parameter varname="a" type="__m128i"/>
+	<description>Convert packed 64-bit integers in "a" to packed 16-bit integers with signed saturation, and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 1
+	i := 64*j
+	k := 16*j
+	dst[k+15:k] := Saturate_Int64_To_Int16(a[i+63:i])
+ENDFOR
+dst[MAX:32] := 0
+	</operation>
+	<instruction name="vpmovsqw"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm_mask_cvtsepi64_epi16">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Convert</category>
+	<parameter varname="src" type="__m128i"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128i"/>
+	<description>Convert packed 64-bit integers in "a" to packed 16-bit integers with signed saturation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 1
+	i := 64*j
+	l := 16*j
+	IF k[j]
+		dst[l+15:l] := Saturate_Int64_To_Int16(a[i+63:i])
+	ELSE
+		dst[l+15:l] := src[l+15:l]
+	FI
+ENDFOR
+dst[MAX:32] := 0
+	</operation>
+	<instruction name="vpmovsqw"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="void" name="_mm_mask_cvtsepi64_storeu_epi16">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Convert</category>
+	<category>Store</category>
+	<parameter varname="base_addr" type="void*"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128i"/>
+	<description>Convert packed 64-bit integers in "a" to packed 16-bit integers with signed saturation, and store the active results (those with their respective bit set in writemask "k") to unaligned memory at "base_addr".</description>
+	<operation>
+FOR j := 0 to 1
+	i := 64*j
+	l := 16*j
+	IF k[j]
+		MEM[base_addr+l+15:base_addr+l] := Saturate_Int64_To_Int16(a[i+63:i])
+	FI
+ENDFOR
+dst[MAX:32] := 0
+	</operation>
+	<instruction name="vpmovsqw"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm_maskz_cvtsepi64_epi16">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Convert</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128i"/>
+	<description>Convert packed 64-bit integers in "a" to packed 16-bit integers with signed saturation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 1
+	i := 64*j
+	l := 16*j
+	IF k[j]
+		dst[l+15:l] := Saturate_Int64_To_Int16(a[i+63:i])
+	ELSE
+		dst[l+15:l] := 0
+	FI
+ENDFOR
+dst[MAX:32] := 0
+	</operation>
+	<instruction name="vpmovsqw"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm256_cvtsepi16_epi8">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512BW</CPUID>
+	<category>Convert</category>
+	<parameter varname="a" type="__m256i"/>
+	<description>Convert packed 16-bit integers in "a" to packed 8-bit integers with signed saturation, and store the results in "dst". </description>
+	<operation>
+FOR j := 0 to 15
+	i := 16*j
+	l := 8*j
+	dst[l+7:l] := Saturate_Int16_To_Int8(a[i+15:i])
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vpmovswb"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm256_mask_cvtsepi16_epi8">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512BW</CPUID>
+	<category>Convert</category>
+	<parameter varname="src" type="__m128i"/>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="a" type="__m256i"/>
+	<description>Convert packed 16-bit integers in "a" to packed 8-bit integers with signed saturation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 15
+	i := 16*j
+	l := 8*j
+	IF k[j]
+		dst[l+7:l] := Saturate_Int16_To_Int8(a[i+15:i])
+	ELSE
+		dst[l+7:l] := src[l+7:l]
+	FI
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vpmovswb"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="void" name="_mm256_mask_cvtsepi16_storeu_epi8">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512BW</CPUID>
+	<category>Convert</category>
+	<category>Store</category>
+	<parameter varname="base_addr" type="void*"/>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="a" type="__m256i"/>
+	<description>Convert packed 16-bit integers in "a" to packed 8-bit integers with signed saturation, and store the active results (those with their respective bit set in writemask "k") to unaligned memory at "base_addr".</description>
+	<operation>
+FOR j := 0 to 15
+	i := 16*j
+	l := 8*j
+	IF k[j]
+		MEM[base_addr+l+7:base_addr+l] := Saturate_Int16_To_Int8(a[i+15:i])
+	FI
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vpmovswb"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm256_maskz_cvtsepi16_epi8">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512BW</CPUID>
+	<category>Convert</category>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="a" type="__m256i"/>
+	<description>Convert packed 16-bit integers in "a" to packed 8-bit integers with signed saturation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 15
+	i := 16*j
+	l := 8*j
+	IF k[j]
+		dst[l+7:l] := Saturate_Int16_To_Int8(a[i+15:i])
+	ELSE
+		dst[l+7:l] := 0
+	FI
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vpmovswb"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256i" name="_mm512_cvtsepi16_epi8">
+	<type>Integer</type>
+	<CPUID>AVX512BW</CPUID>
+	<category>Convert</category>
+	<parameter varname="a" type="__m512i"/>
+	<description>Convert packed 16-bit integers in "a" to packed 8-bit integers with signed saturation, and store the results in "dst". </description>
+	<operation>
+FOR j := 0 to 31
+	i := 16*j
+	l := 8*j
+	dst[l+7:l] := Saturate_Int16_To_Int8(a[i+15:i])
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vpmovswb"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256i" name="_mm512_mask_cvtsepi16_epi8">
+	<type>Integer</type>
+	<CPUID>AVX512BW</CPUID>
+	<category>Convert</category>
+	<parameter varname="src" type="__m256i"/>
+	<parameter varname="k" type="__mmask32"/>
+	<parameter varname="a" type="__m512i"/>
+	<description>Convert packed 16-bit integers in "a" to packed 8-bit integers with signed saturation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 31
+	i := 16*j
+	l := 8*j
+	IF k[j]
+		dst[l+7:l] := Saturate_Int16_To_Int8(a[i+15:i])
+	ELSE
+		dst[l+7:l] := src[l+7:l]
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vpmovswb"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="void" name="_mm512_mask_cvtsepi16_storeu_epi8">
+	<type>Integer</type>
+	<CPUID>AVX512BW</CPUID>
+	<category>Convert</category>
+	<category>Store</category>
+	<parameter varname="base_addr" type="void*"/>
+	<parameter varname="k" type="__mmask32"/>
+	<parameter varname="a" type="__m512i"/>
+	<description>Convert packed 16-bit integers in "a" to packed 8-bit integers with signed saturation, and store the active results (those with their respective bit set in writemask "k") to unaligned memory at "base_addr".</description>
+	<operation>
+FOR j := 0 to 31
+	i := 16*j
+	l := 8*j
+	IF k[j]
+		MEM[base_addr+l+7:base_addr+l] := Saturate_Int16_To_Int8(a[i+15:i])
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vpmovswb"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256i" name="_mm512_maskz_cvtsepi16_epi8">
+	<type>Integer</type>
+	<CPUID>AVX512BW</CPUID>
+	<category>Convert</category>
+	<parameter varname="k" type="__mmask32"/>
+	<parameter varname="a" type="__m512i"/>
+	<description>Convert packed 16-bit integers in "a" to packed 8-bit integers with signed saturation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 31
+	i := 16*j
+	l := 8*j
+	IF k[j]
+		dst[l+7:l] := Saturate_Int16_To_Int8(a[i+15:i])
+	ELSE
+		dst[l+7:l] := 0
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vpmovswb"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm_cvtsepi16_epi8">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512BW</CPUID>
+	<category>Convert</category>
+	<parameter varname="a" type="__m128i"/>
+	<description>Convert packed 16-bit integers in "a" to packed 8-bit integers with signed saturation, and store the results in "dst". </description>
+	<operation>
+FOR j := 0 to 7
+	i := 16*j
+	l := 8*j
+	dst[l+7:l] := Saturate_Int16_To_Int8(a[i+15:i])
+ENDFOR
+dst[MAX:64] := 0
+	</operation>
+	<instruction name="vpmovswb"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm_mask_cvtsepi16_epi8">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512BW</CPUID>
+	<category>Convert</category>
+	<parameter varname="src" type="__m128i"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128i"/>
+	<description>Convert packed 16-bit integers in "a" to packed 8-bit integers with signed saturation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 7
+	i := 16*j
+	l := 8*j
+	IF k[j]
+		dst[l+7:l] := Saturate_Int16_To_Int8(a[i+15:i])
+	ELSE
+		dst[l+7:l] := src[l+7:l]
+	FI
+ENDFOR
+dst[MAX:64] := 0
+	</operation>
+	<instruction name="vpmovswb"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="void" name="_mm_mask_cvtsepi16_storeu_epi8">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512BW</CPUID>
+	<category>Convert</category>
+	<category>Store</category>
+	<parameter varname="base_addr" type="void*"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128i"/>
+	<description>Convert packed 16-bit integers in "a" to packed 8-bit integers with signed saturation, and store the active results (those with their respective bit set in writemask "k") to unaligned memory at "base_addr".</description>
+	<operation>
+FOR j := 0 to 7
+	i := 16*j
+	l := 8*j
+	IF k[j]
+		MEM[base_addr+l+7:base_addr+l] := Saturate_Int16_To_Int8(a[i+15:i])
+	FI
+ENDFOR
+dst[MAX:64] := 0
+	</operation>
+	<instruction name="vpmovswb"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm_maskz_cvtsepi16_epi8">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512BW</CPUID>
+	<category>Convert</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128i"/>
+	<description>Convert packed 16-bit integers in "a" to packed 8-bit integers with signed saturation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 7
+	i := 16*j
+	l := 8*j
+	IF k[j]
+		dst[l+7:l] := Saturate_Int16_To_Int8(a[i+15:i])
+	ELSE
+		dst[l+7:l] := 0
+	FI
+ENDFOR
+dst[MAX:64] := 0
+	</operation>
+	<instruction name="vpmovswb"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256i" name="_mm256_mask_cvtepi8_epi32">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Convert</category>
+	<parameter varname="src" type="__m256i"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128i"/>
+	<description>Sign extend packed 8-bit integers in the low 8 bytes of "a" to packed 32-bit integers, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 7
+	i := 32*j
+	l := 8*j
+	IF k[j]
+		dst[i+31:i] := SignExtend(a[l+7:l])
+	ELSE
+		dst[i+31:i] := src[i+31:i]
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vpmovsxbd"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256i" name="_mm256_maskz_cvtepi8_epi32">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Convert</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128i"/>
+	<description>Sign extend packed 8-bit integers in the low 8 bytes of "a" to packed 32-bit integers, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 7
+	i := 32*j
+	l := 8*j
+	IF k[j]
+		dst[i+31:i] := SignExtend(a[l+7:l])
+	ELSE
+		dst[i+31:i] := 0
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vpmovsxbd"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm_mask_cvtepi8_epi32">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Convert</category>
+	<parameter varname="src" type="__m128i"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128i"/>
+	<description>Sign extend packed 8-bit integers in the low 4 bytes of "a" to packed 32-bit integers, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 3
+	i := 32*j
+	l := 8*j
+	IF k[j]
+		dst[i+31:i] := SignExtend(a[l+7:l])
+	ELSE
+		dst[i+31:i] := src[i+31:i]
+	FI
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vpmovsxbd"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm_maskz_cvtepi8_epi32">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Convert</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128i"/>
+	<description>Sign extend packed 8-bit integers in the low 4 bytes of "a" to packed 32-bit integers, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 3
+	i := 32*j
+	l := 8*j
+	IF k[j]
+		dst[i+31:i] := SignExtend(a[l+7:l])
+	ELSE
+		dst[i+31:i] := 0
+	FI
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vpmovsxbd"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256i" name="_mm256_mask_cvtepi8_epi64">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Convert</category>
+	<parameter varname="src" type="__m256i"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128i"/>
+	<description>Sign extend packed 8-bit integers in the low 4 bytes of "a" to packed 64-bit integers, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 3
+	i := 64*j
+	l := 8*j
+	IF k[j]
+		dst[i+63:i] := SignExtend(a[l+7:l])
+	ELSE
+		dst[i+63:i] := src[i+63:i]
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vpmovsxbq"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256i" name="_mm256_maskz_cvtepi8_epi64">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Convert</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128i"/>
+	<description>Sign extend packed 8-bit integers in the low 4 bytes of "a" to packed 64-bit integers, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 3
+	i := 64*j
+	l := 8*j
+	IF k[j]
+		dst[i+63:i] := SignExtend(a[l+7:l])
+	ELSE
+		dst[i+63:i] := 0
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vpmovsxbq"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm_mask_cvtepi8_epi64">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Convert</category>
+	<parameter varname="src" type="__m128i"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128i"/>
+	<description>Sign extend packed 8-bit integers in the low 2 bytes of "a" to packed 64-bit integers, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 1
+	i := 64*j
+	l := 8*j
+	IF k[j]
+		dst[i+63:i] := SignExtend(a[l+7:l])
+	ELSE
+		dst[i+63:i] := src[i+63:i]
+	FI
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vpmovsxbq"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm_maskz_cvtepi8_epi64">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Convert</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128i"/>
+	<description>Sign extend packed 8-bit integers in the low 2 bytes of "a" to packed 64-bit integers, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 1
+	i := 64*j
+	l := 8*j
+	IF k[j]
+		dst[i+63:i] := SignExtend(a[l+7:l])
+	ELSE
+		dst[i+63:i] := 0
+	FI
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vpmovsxbq"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256i" name="_mm256_mask_cvtepi8_epi16">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512BW</CPUID>
+	<category>Convert</category>
+	<parameter varname="src" type="__m256i"/>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="a" type="__m128i"/>
+	<description>Sign extend packed 8-bit integers in "a" to packed 16-bit integers, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 15
+	i := j*8
+	l := j*16
+	IF k[j]
+		dst[l+15:l] := SignExtend(a[i+7:i])
+	ELSE
+		dst[l+15:l] := src[l+15:l]
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vpmovsxbw"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256i" name="_mm256_maskz_cvtepi8_epi16">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512BW</CPUID>
+	<category>Convert</category>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="a" type="__m128i"/>
+	<description>Sign extend packed 8-bit integers in "a" to packed 16-bit integers, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).
+	</description>
+	<operation>
+FOR j := 0 to 15
+	i := j*8
+	l := j*16
+	IF k[j]
+		dst[l+15:l] := SignExtend(a[i+7:i])
+	ELSE
+		dst[l+15:l] := 0
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vpmovsxbw"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512i" name="_mm512_cvtepi8_epi16">
+	<type>Integer</type>
+	<CPUID>AVX512BW</CPUID>
+	<category>Convert</category>
+	<parameter varname="a" type="__m256i"/>
+	<description>Sign extend packed 8-bit integers in "a" to packed 16-bit integers, and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 31
+	i := j*8
+	l := j*16
+	dst[l+15:l] := SignExtend(a[i+7:i])
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name="vpmovsxbw"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512i" name="_mm512_mask_cvtepi8_epi16">
+	<type>Integer</type>
+	<CPUID>AVX512BW</CPUID>
+	<category>Convert</category>
+	<parameter varname="src" type="__m512i"/>
+	<parameter varname="k" type="__mmask32"/>
+	<parameter varname="a" type="__m256i"/>
+	<description>Sign extend packed 8-bit integers in "a" to packed 16-bit integers, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 31
+	i := j*8
+	l := j*16
+	IF k[j]
+		dst[l+15:l] := SignExtend(a[i+7:i])
+	ELSE
+		dst[l+15:l] := src[l+15:l]
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name="vpmovsxbw"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512i" name="_mm512_maskz_cvtepi8_epi16">
+	<type>Integer</type>
+	<CPUID>AVX512BW</CPUID>
+	<category>Convert</category>
+	<parameter varname="k" type="__mmask32"/>
+	<parameter varname="a" type="__m256i"/>
+	<description>Sign extend packed 8-bit integers in "a" to packed 16-bit integers, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).
+	</description>
+	<operation>
+FOR j := 0 to 31
+	i := j*8
+	l := j*16
+	IF k[j]
+		dst[l+15:l] := SignExtend(a[i+7:i])
+	ELSE
+		dst[l+15:l] := 0
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name="vpmovsxbw"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm_mask_cvtepi8_epi16">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512BW</CPUID>
+	<category>Convert</category>
+	<parameter varname="src" type="__m128i"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128i"/>
+	<description>Sign extend packed 8-bit integers in "a" to packed 16-bit integers, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 7
+	i := j*8
+	l := j*16
+	IF k[j]
+		dst[l+15:l] := SignExtend(a[i+7:i])
+	ELSE
+		dst[l+15:l] := src[l+15:l]
+	FI
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vpmovsxbw"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm_maskz_cvtepi8_epi16">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512BW</CPUID>
+	<category>Convert</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128i"/>
+	<description>Sign extend packed 8-bit integers in "a" to packed 16-bit integers, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).
+	</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*8
+	l := j*16
+	IF k[j]
+		dst[l+15:l] := SignExtend(a[i+7:i])
+	ELSE
+		dst[l+15:l] := 0
+	FI
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vpmovsxbw"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256i" name="_mm256_mask_cvtepi32_epi64">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Convert</category>
+	<parameter varname="src" type="__m256i"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128i"/>
+	<description>Sign extend packed 32-bit integers in "a" to packed 64-bit integers, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 3
+	i := 64*j
+	l := 32*j
+	IF k[j]
+		dst[i+63:i] := SignExtend(a[l+31:l])
+	ELSE
+		dst[i+63:i] := src[i+63:i]
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vpmovsxdq"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256i" name="_mm256_maskz_cvtepi32_epi64">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Convert</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128i"/>
+	<description>Sign extend packed 32-bit integers in "a" to packed 64-bit integers, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 3
+	i := 64*j
+	l := 32*j
+	IF k[j]
+		dst[i+63:i] := SignExtend(a[l+31:l])
+	ELSE
+		dst[i+63:i] := 0
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vpmovsxdq"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm_mask_cvtepi32_epi64">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Convert</category>
+	<parameter varname="src" type="__m128i"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128i"/>
+	<description>Sign extend packed 32-bit integers in "a" to packed 64-bit integers, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 1
+	i := 64*j
+	l := 32*j
+	IF k[j]
+		dst[i+63:i] := SignExtend(a[l+31:l])
+	ELSE
+		dst[i+63:i] := src[i+63:i]
+	FI
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vpmovsxdq"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm_maskz_cvtepi32_epi64">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Convert</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128i"/>
+	<description>Sign extend packed 32-bit integers in "a" to packed 64-bit integers, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 1
+	i := 64*j
+	l := 32*j
+	IF k[j]
+		dst[i+63:i] := SignExtend(a[l+31:l])
+	ELSE
+		dst[i+63:i] := 0
+	FI
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vpmovsxdq"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256i" name="_mm256_mask_cvtepi16_epi32">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Convert</category>
+	<parameter varname="src" type="__m256i"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128i"/>
+	<description>Sign extend packed 16-bit integers in "a" to packed 32-bit integers, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 7
+	i := j*32
+	l := j*16
+	IF k[j]
+		dst[i+31:i] := SignExtend(a[l+15:l])
+	ELSE
+		dst[i+31:i] := src[i+31:i]
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vpmovsxwd"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256i" name="_mm256_maskz_cvtepi16_epi32">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Convert</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128i"/>
+	<description>Sign extend packed 16-bit integers in "a" to packed 32-bit integers, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 7
+	i := 32*j
+	l := 16*j
+	IF k[j]
+		dst[i+31:i] := SignExtend(a[l+15:l])
+	ELSE
+		dst[i+31:i] := 0
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vpmovsxwd"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm_mask_cvtepi16_epi32">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Convert</category>
+	<parameter varname="src" type="__m128i"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128i"/>
+	<description>Sign extend packed 16-bit integers in "a" to packed 32-bit integers, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 3
+	i := j*32
+	l := j*16
+	IF k[j]
+		dst[i+31:i] := SignExtend(a[l+15:l])
+	ELSE
+		dst[i+31:i] := src[i+31:i]
+	FI
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vpmovsxwd"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm_maskz_cvtepi16_epi32">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Convert</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128i"/>
+	<description>Sign extend packed 16-bit integers in "a" to packed 32-bit integers, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 3
+	i := 32*j
+	l := 16*j
+	IF k[j]
+		dst[i+31:i] := SignExtend(a[l+15:l])
+	ELSE
+		dst[i+31:i] := 0
+	FI
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vpmovsxwd"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256i" name="_mm256_mask_cvtepi16_epi64">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Convert</category>
+	<parameter varname="src" type="__m256i"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128i"/>
+	<description>Sign extend packed 16-bit integers in the low 8 bytes of "a" to packed 64-bit integers, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 3
+	i := 64*j
+	l := 16*j
+	IF k[j]
+		dst[i+63:i] := SignExtend(a[l+15:l])
+	ELSE
+		dst[i+63:i] := src[i+63:i]
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vpmovsxwq"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256i" name="_mm256_maskz_cvtepi16_epi64">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Convert</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128i"/>
+	<description>Sign extend packed 16-bit integers in the low 8 bytes of "a" to packed 64-bit integers, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 3
+	i := 64*j
+	l := 16*j
+	IF k[j]
+		dst[i+63:i] := SignExtend(a[l+15:l])
+	ELSE
+		dst[i+63:i] := 0
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vpmovsxwq"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm_mask_cvtepi16_epi64">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Convert</category>
+	<parameter varname="src" type="__m128i"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128i"/>
+	<description>Sign extend packed 16-bit integers in the low 4 bytes of "a" to packed 64-bit integers, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 1
+	i := 64*j
+	l := 16*j
+	IF k[j]
+		dst[i+63:i] := SignExtend(a[l+15:l])
+	ELSE
+		dst[i+63:i] := src[i+63:i]
+	FI
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vpmovsxwq"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm_maskz_cvtepi16_epi64">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Convert</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128i"/>
+	<description>Sign extend packed 16-bit integers in the low 4 bytes of "a" to packed 64-bit integers, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 1
+	i := 64*j
+	l := 16*j
+	IF k[j]
+		dst[i+63:i] := SignExtend(a[l+15:l])
+	ELSE
+		dst[i+63:i] := 0
+	FI
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vpmovsxwq"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm256_cvtusepi32_epi8">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Convert</category>
+	<parameter varname="a" type="__m256i"/>
+	<description>Convert packed unsigned 32-bit integers in "a" to packed unsigned 8-bit integers with unsigned saturation, and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 7
+	i := 32*j
+	k := 8*j
+	dst[k+7:k] := Saturate_UnsignedInt32_To_Int8(a[i+31:i])
+ENDFOR
+dst[MAX:64] := 0
+	</operation>
+	<instruction name="vpmovusdb"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm256_mask_cvtusepi32_epi8">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Convert</category>
+	<parameter varname="src" type="__m128i"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m256i"/>
+	<description>Convert packed unsigned 32-bit integers in "a" to packed unsigned 8-bit integers with unsigned saturation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 7
+	i := 32*j
+	l := 8*j
+	IF k[j]
+		dst[l+7:l] := Saturate_UnsignedInt32_To_Int8(a[i+31:i])
+	ELSE
+		dst[l+7:l] := src[l+7:l]
+	FI
+ENDFOR
+dst[MAX:64] := 0
+	</operation>
+	<instruction name="vpmovusdb"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="void" name="_mm256_mask_cvtusepi32_storeu_epi8">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Convert</category>
+	<category>Store</category>
+	<parameter varname="base_addr" type="void*"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m256i"/>
+	<description>Convert packed unsigned 32-bit integers in "a" to packed unsigned 8-bit integers with unsigned saturation, and store the active results (those with their respective bit set in writemask "k") to unaligned memory at "base_addr".</description>
+	<operation>
+FOR j := 0 to 7
+	i := 32*j
+	l := 8*j
+	IF k[j]
+		MEM[base_addr+l+7:base_addr+l] := Saturate_UnsignedInt32_To_Int8(a[i+31:i])
+	FI
+ENDFOR
+dst[MAX:64] := 0
+	</operation>
+	<instruction name="vpmovusdb"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm256_maskz_cvtusepi32_epi8">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Convert</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m256i"/>
+	<description>Convert packed unsigned 32-bit integers in "a" to packed unsigned 8-bit integers with unsigned saturation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 7
+	i := 32*j
+	l := 8*j
+	IF k[j]
+		dst[l+7:l] := Saturate_UnsignedInt32_To_Int8(a[i+31:i])
+	ELSE
+		dst[l+7:l] := 0
+	FI
+ENDFOR
+dst[MAX:64] := 0
+	</operation>
+	<instruction name="vpmovusdb"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm_cvtusepi32_epi8">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Convert</category>
+	<parameter varname="a" type="__m128i"/>
+	<description>Convert packed unsigned 32-bit integers in "a" to packed unsigned 8-bit integers with unsigned saturation, and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 3
+	i := 32*j
+	k := 8*j
+	dst[k+7:k] := Saturate_UnsignedInt32_To_Int8(a[i+31:i])
+ENDFOR
+dst[MAX:32] := 0
+	</operation>
+	<instruction name="vpmovusdb"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm_mask_cvtusepi32_epi8">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Convert</category>
+	<parameter varname="src" type="__m128i"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128i"/>
+	<description>Convert packed unsigned 32-bit integers in "a" to packed unsigned 8-bit integers with unsigned saturation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 3
+	i := 32*j
+	l := 8*j
+	IF k[j]
+		dst[l+7:l] := Saturate_UnsignedInt32_To_Int8(a[i+31:i])
+	ELSE
+		dst[l+7:l] := src[l+7:l]
+	FI
+ENDFOR
+dst[MAX:32] := 0
+	</operation>
+	<instruction name="vpmovusdb"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="void" name="_mm_mask_cvtusepi32_storeu_epi8">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Convert</category>
+	<category>Store</category>
+	<parameter varname="base_addr" type="void*"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128i"/>
+	<description>Convert packed unsigned 32-bit integers in "a" to packed unsigned 8-bit integers with unsigned saturation, and store the active results (those with their respective bit set in writemask "k") to unaligned memory at "base_addr".</description>
+	<operation>
+FOR j := 0 to 3
+	i := 32*j
+	l := 8*j
+	IF k[j]
+		MEM[base_addr+l+7:base_addr+l] := Saturate_UnsignedInt32_To_Int8(a[i+31:i])
+	FI
+ENDFOR
+dst[MAX:32] := 0
+	</operation>
+	<instruction name="vpmovusdb"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm_maskz_cvtusepi32_epi8">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Convert</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128i"/>
+	<description>Convert packed unsigned 32-bit integers in "a" to packed unsigned 8-bit integers with unsigned saturation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 3
+	i := 32*j
+	l := 8*j
+	IF k[j]
+		dst[l+7:l] := Saturate_UnsignedInt32_To_Int8(a[i+31:i])
+	ELSE
+		dst[l+7:l] := 0
+	FI
+ENDFOR
+dst[MAX:32] := 0
+	</operation>
+	<instruction name="vpmovusdb"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm256_cvtusepi32_epi16">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Convert</category>
+	<parameter varname="a" type="__m256i"/>
+	<description>Convert packed unsigned 32-bit integers in "a" to packed unsigned 16-bit integers with unsigned saturation, and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 7
+	i := 32*j
+	k := 16*j
+	dst[k+15:k] := Saturate_UnsignedInt32_To_Int16(a[i+31:i])
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vpmovusdw"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm256_mask_cvtusepi32_epi16">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Convert</category>
+	<parameter varname="src" type="__m128i"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m256i"/>
+	<description>Convert packed unsigned 32-bit integers in "a" to packed unsigned 16-bit integers with unsigned saturation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 7
+	i := 32*j
+	l := 16*j
+	IF k[j]
+		dst[l+15:l] := Saturate_UnsignedInt32_To_Int16(a[i+31:i])
+	ELSE
+		dst[l+15:l] := src[l+15:l]
+	FI
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vpmovusdw"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="void" name="_mm256_mask_cvtusepi32_storeu_epi16">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Convert</category>
+	<category>Store</category>
+	<parameter varname="base_addr" type="void*"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m256i"/>
+	<description>Convert packed unsigned 32-bit integers in "a" to packed unsigned 16-bit integers with unsigned saturation, and store the active results (those with their respective bit set in writemask "k") to unaligned memory at "base_addr".</description>
+	<operation>
+FOR j := 0 to 7
+	i := 32*j
+	l := 16*j
+	IF k[j]
+		MEM[base_addr+l+15:base_addr+l] := Saturate_UnsignedInt32_To_Int16(a[i+31:i])
+	FI
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vpmovusdw"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm256_maskz_cvtusepi32_epi16">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Convert</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m256i"/>
+	<description>Convert packed unsigned 32-bit integers in "a" to packed unsigned 16-bit integers with unsigned saturation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 7
+	i := 32*j
+	l := 16*j
+	IF k[j]
+		dst[l+15:l] := Saturate_UnsignedInt32_To_Int16(a[i+31:i])
+	ELSE
+		dst[l+15:l] := 0
+	FI
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vpmovusdw"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm_cvtusepi32_epi16">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Convert</category>
+	<parameter varname="a" type="__m128i"/>
+	<description>Convert packed unsigned 32-bit integers in "a" to packed unsigned 16-bit integers with unsigned saturation, and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 3
+	i := 32*j
+	k := 16*j
+	dst[k+15:k] := Saturate_UnsignedInt32_To_Int16(a[i+31:i])
+ENDFOR
+dst[MAX:64] := 0
+	</operation>
+	<instruction name="vpmovusdw"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm_mask_cvtusepi32_epi16">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Convert</category>
+	<parameter varname="src" type="__m128i"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128i"/>
+	<description>Convert packed unsigned 32-bit integers in "a" to packed unsigned 16-bit integers with unsigned saturation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 3
+	i := 32*j
+	l := 16*j
+	IF k[j]
+		dst[l+15:l] := Saturate_UnsignedInt32_To_Int16(a[i+31:i])
+	ELSE
+		dst[l+15:l] := src[l+15:l]
+	FI
+ENDFOR
+dst[MAX:64] := 0
+	</operation>
+	<instruction name="vpmovusdw"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="void" name="_mm_mask_cvtusepi32_storeu_epi16">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Convert</category>
+	<category>Store</category>
+	<parameter varname="base_addr" type="void*"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128i"/>
+	<description>Convert packed unsigned 32-bit integers in "a" to packed unsigned 16-bit integers with unsigned saturation, and store the active results (those with their respective bit set in writemask "k") to unaligned memory at "base_addr".</description>
+	<operation>
+FOR j := 0 to 3
+	i := 32*j
+	l := 16*j
+	IF k[j]
+		MEM[base_addr+l+15:base_addr+l] := Saturate_UnsignedInt32_To_Int16(a[i+31:i])
+	FI
+ENDFOR
+dst[MAX:64] := 0
+	</operation>
+	<instruction name="vpmovusdw"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm_maskz_cvtusepi32_epi16">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Convert</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128i"/>
+	<description>Convert packed unsigned 32-bit integers in "a" to packed unsigned 16-bit integers with unsigned saturation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 3
+	i := 32*j
+	l := 16*j
+	IF k[j]
+		dst[l+15:l] := Saturate_UnsignedInt32_To_Int16(a[i+31:i])
+	ELSE
+		dst[l+15:l] := 0
+	FI
+ENDFOR
+dst[MAX:64] := 0
+	</operation>
+	<instruction name="vpmovusdw"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm256_cvtusepi64_epi8">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Convert</category>
+	<parameter varname="a" type="__m256i"/>
+	<description>Convert packed unsigned 64-bit integers in "a" to packed unsigned 8-bit integers with unsigned saturation, and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 3
+	i := 64*j
+	k := 8*j
+	dst[k+7:k] := Saturate_UnsignedInt64_To_Int8(a[i+63:i])
+ENDFOR
+dst[MAX:32] := 0
+	</operation>
+	<instruction name="vpmovusqb"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm256_mask_cvtusepi64_epi8">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Convert</category>
+	<parameter varname="src" type="__m128i"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m256i"/>
+	<description>Convert packed unsigned 64-bit integers in "a" to packed unsigned 8-bit integers with unsigned saturation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 3
+	i := 64*j
+	l := 8*j
+	IF k[j]
+		dst[l+7:l] := Saturate_UnsignedInt64_To_Int8(a[i+63:i])
+	ELSE
+		dst[l+7:l] := src[l+7:l]
+	FI
+ENDFOR
+dst[MAX:32] := 0
+	</operation>
+	<instruction name="vpmovusqb"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="void" name="_mm256_mask_cvtusepi64_storeu_epi8">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Convert</category>
+	<category>Store</category>
+	<parameter varname="base_addr" type="void*"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m256i"/>
+	<description>Convert packed unsigned 64-bit integers in "a" to packed unsigned 8-bit integers with unsigned saturation, and store the active results (those with their respective bit set in writemask "k") to unaligned memory at "base_addr".</description>
+	<operation>
+FOR j := 0 to 3
+	i := 64*j
+	l := 8*j
+	IF k[j]
+		MEM[base_addr+l+7:base_addr+l] := Saturate_UnsignedInt64_To_Int8(a[i+63:i])
+	FI
+ENDFOR
+dst[MAX:32] := 0
+	</operation>
+	<instruction name="vpmovusqb"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm256_maskz_cvtusepi64_epi8">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Convert</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m256i"/>
+	<description>Convert packed unsigned 64-bit integers in "a" to packed unsigned 8-bit integers with unsigned saturation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 3
+	i := 64*j
+	l := 8*j
+	IF k[j]
+		dst[l+7:l] := Saturate_UnsignedInt64_To_Int8(a[i+63:i])
+	ELSE
+		dst[l+7:l] := 0
+	FI
+ENDFOR
+dst[MAX:32] := 0
+	</operation>
+	<instruction name="vpmovusqb"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm_cvtusepi64_epi8">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Convert</category>
+	<parameter varname="a" type="__m128i"/>
+	<description>Convert packed unsigned 64-bit integers in "a" to packed unsigned 8-bit integers with unsigned saturation, and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 1
+	i := 64*j
+	k := 8*j
+	dst[k+7:k] := Saturate_UnsignedInt64_To_Int8(a[i+63:i])
+ENDFOR
+dst[MAX:16] := 0
+	</operation>
+	<instruction name="vpmovusqb"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm_mask_cvtusepi64_epi8">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Convert</category>
+	<parameter varname="src" type="__m128i"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128i"/>
+	<description>Convert packed unsigned 64-bit integers in "a" to packed unsigned 8-bit integers with unsigned saturation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 1
+	i := 64*j
+	l := 8*j
+	IF k[j]
+		dst[l+7:l] := Saturate_UnsignedInt64_To_Int8(a[i+63:i])
+	ELSE
+		dst[l+7:l] := src[l+7:l]
+	FI
+ENDFOR
+dst[MAX:16] := 0
+	</operation>
+	<instruction name="vpmovusqb"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="void" name="_mm_mask_cvtusepi64_storeu_epi8">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Convert</category>
+	<category>Store</category>
+	<parameter varname="base_addr" type="void*"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128i"/>
+	<description>Convert packed unsigned 64-bit integers in "a" to packed unsigned 8-bit integers with unsigned saturation, and store the active results (those with their respective bit set in writemask "k") to unaligned memory at "base_addr".</description>
+	<operation>
+FOR j := 0 to 1
+	i := 64*j
+	l := 8*j
+	IF k[j]
+		MEM[base_addr+l+7:base_addr+l] := Saturate_UnsignedInt64_To_Int8(a[i+63:i])
+	FI
+ENDFOR
+dst[MAX:16] := 0
+	</operation>
+	<instruction name="vpmovusqb"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm_maskz_cvtusepi64_epi8">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Convert</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128i"/>
+	<description>Convert packed unsigned 64-bit integers in "a" to packed unsigned 8-bit integers with unsigned saturation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 1
+	i := 64*j
+	l := 8*j
+	IF k[j]
+		dst[l+7:l] := Saturate_UnsignedInt64_To_Int8(a[i+63:i])
+	ELSE
+		dst[l+7:l] := 0
+	FI
+ENDFOR
+dst[MAX:16] := 0
+	</operation>
+	<instruction name="vpmovusqb"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm256_cvtusepi64_epi32">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Convert</category>
+	<parameter varname="a" type="__m256i"/>
+	<description>Convert packed unsigned 64-bit integers in "a" to packed unsigned 32-bit integers with unsigned saturation, and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 3
+	i := 64*j
+	k := 32*j
+	dst[k+31:k] := Saturate_UnsignedInt64_To_Int32(a[i+63:i])
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vpmovusqd"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm256_mask_cvtusepi64_epi32">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Convert</category>
+	<parameter varname="src" type="__m128i"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m256i"/>
+	<description>Convert packed unsigned 64-bit integers in "a" to packed unsigned 32-bit integers with unsigned saturation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 3
+	i := 64*j
+	l := 32*j
+	IF k[j]
+		dst[l+31:l] := Saturate_UnsignedInt64_To_Int32(a[i+63:i])
+	ELSE
+		dst[l+31:l] := src[l+31:l]
+	FI
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vpmovusqd"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="void" name="_mm256_mask_cvtusepi64_storeu_epi32">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Convert</category>
+	<category>Store</category>
+	<parameter varname="base_addr" type="void*"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m256i"/>
+	<description>Convert packed unsigned 64-bit integers in "a" to packed unsigned 32-bit integers with unsigned saturation, and store the active results (those with their respective bit set in writemask "k") to unaligned memory at "base_addr".</description>
+	<operation>
+FOR j := 0 to 3
+	i := 64*j
+	l := 32*j
+	IF k[j]
+		MEM[base_addr+l+31:base_addr+l] := Saturate_UnsignedInt64_To_Int32(a[i+63:i])
+	FI
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vpmovusqd"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm256_maskz_cvtusepi64_epi32">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Convert</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m256i"/>
+	<description>Convert packed unsigned 64-bit integers in "a" to packed unsigned 32-bit integers with unsigned saturation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 3
+	i := 64*j
+	l := 32*j
+	IF k[j]
+		dst[l+31:l] := Saturate_UnsignedInt64_To_Int32(a[i+63:i])
+	ELSE
+		dst[l+31:l] := 0
+	FI
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vpmovusqd"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm_cvtusepi64_epi32">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Convert</category>
+	<parameter varname="a" type="__m128i"/>
+	<description>Convert packed unsigned 64-bit integers in "a" to packed unsigned 32-bit integers with unsigned saturation, and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 1
+	i := 64*j
+	k := 32*j
+	dst[k+31:k] := Saturate_UnsignedInt64_To_Int32(a[i+63:i])
+ENDFOR
+dst[MAX:64] := 0
+	</operation>
+	<instruction name="vpmovusqd"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm_mask_cvtusepi64_epi32">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Convert</category>
+	<parameter varname="src" type="__m128i"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128i"/>
+	<description>Convert packed unsigned 64-bit integers in "a" to packed unsigned 32-bit integers with unsigned saturation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 1
+	i := 64*j
+	l := 32*j
+	IF k[j]
+		dst[l+31:l] := Saturate_UnsignedInt64_To_Int32(a[i+63:i])
+	ELSE
+		dst[l+31:l] := src[l+31:l]
+	FI
+ENDFOR
+dst[MAX:64] := 0
+	</operation>
+	<instruction name="vpmovusqd"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="void" name="_mm_mask_cvtusepi64_storeu_epi32">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Convert</category>
+	<category>Store</category>
+	<parameter varname="base_addr" type="void*"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128i"/>
+	<description>Convert packed unsigned 64-bit integers in "a" to packed unsigned 32-bit integers with unsigned saturation, and store the active results (those with their respective bit set in writemask "k") to unaligned memory at "base_addr".</description>
+	<operation>
+FOR j := 0 to 1
+	i := 64*j
+	l := 32*j
+	IF k[j]
+		MEM[base_addr+l+31:base_addr+l] := Saturate_UnsignedInt64_To_Int32(a[i+63:i])
+	FI
+ENDFOR
+dst[MAX:64] := 0
+	</operation>
+	<instruction name="vpmovusqd"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm_maskz_cvtusepi64_epi32">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Convert</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128i"/>
+	<description>Convert packed unsigned 64-bit integers in "a" to packed unsigned 32-bit integers with unsigned saturation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 1
+	i := 64*j
+	l := 32*j
+	IF k[j]
+		dst[l+31:l] := Saturate_UnsignedInt64_To_Int32(a[i+63:i])
+	ELSE
+		dst[l+31:l] := 0
+	FI
+ENDFOR
+dst[MAX:64] := 0
+	</operation>
+	<instruction name="vpmovusqd"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm256_cvtusepi64_epi16">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Convert</category>
+	<parameter varname="a" type="__m256i"/>
+	<description>Convert packed unsigned 64-bit integers in "a" to packed unsigned 16-bit integers with unsigned saturation, and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 3
+	i := 64*j
+	k := 16*j
+	dst[k+15:k] := Saturate_UnsignedInt64_To_Int16(a[i+63:i])
+ENDFOR
+dst[MAX:64] := 0
+	</operation>
+	<instruction name="vpmovusqw"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm256_mask_cvtusepi64_epi16">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Convert</category>
+	<parameter varname="src" type="__m128i"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m256i"/>
+	<description>Convert packed unsigned 64-bit integers in "a" to packed unsigned 16-bit integers with unsigned saturation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 3
+	i := 64*j
+	l := 16*j
+	IF k[j]
+		dst[l+15:l] := Saturate_UnsignedInt64_To_Int16(a[i+63:i])
+	ELSE
+		dst[l+15:l] := src[l+15:l]
+	FI
+ENDFOR
+dst[MAX:64] := 0
+	</operation>
+	<instruction name="vpmovusqw"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="void" name="_mm256_mask_cvtusepi64_storeu_epi16">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Convert</category>
+	<category>Store</category>
+	<parameter varname="base_addr" type="void*"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m256i"/>
+	<description>Convert packed unsigned 64-bit integers in "a" to packed unsigned 16-bit integers with unsigned saturation, and store the active results (those with their respective bit set in writemask "k") to unaligned memory at "base_addr".</description>
+	<operation>
+FOR j := 0 to 3
+	i := 64*j
+	l := 16*j
+	IF k[j]
+		MEM[base_addr+l+15:base_addr+l] := Saturate_UnsignedInt64_To_Int16(a[i+63:i])
+	FI
+ENDFOR
+dst[MAX:64] := 0
+	</operation>
+	<instruction name="vpmovusqw"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm256_maskz_cvtusepi64_epi16">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Convert</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m256i"/>
+	<description>Convert packed unsigned 64-bit integers in "a" to packed unsigned 16-bit integers with unsigned saturation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 3
+	i := 64*j
+	l := 16*j
+	IF k[j]
+		dst[l+15:l] := Saturate_UnsignedInt64_To_Int16(a[i+63:i])
+	ELSE
+		dst[l+15:l] := 0
+	FI
+ENDFOR
+dst[MAX:64] := 0
+	</operation>
+	<instruction name="vpmovusqw"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm_cvtusepi64_epi16">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Convert</category>
+	<parameter varname="a" type="__m128i"/>
+	<description>Convert packed unsigned 64-bit integers in "a" to packed unsigned 16-bit integers with unsigned saturation, and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 1
+	i := 64*j
+	k := 16*j
+	dst[k+15:k] := Saturate_UnsignedInt64_To_Int16(a[i+63:i])
+ENDFOR
+dst[MAX:32] := 0
+	</operation>
+	<instruction name="vpmovusqw"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm_mask_cvtusepi64_epi16">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Convert</category>
+	<parameter varname="src" type="__m128i"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128i"/>
+	<description>Convert packed unsigned 64-bit integers in "a" to packed unsigned 16-bit integers with unsigned saturation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 1
+	i := 64*j
+	l := 16*j
+	IF k[j]
+		dst[l+15:l] := Saturate_UnsignedInt64_To_Int16(a[i+63:i])
+	ELSE
+		dst[l+15:l] := src[l+15:l]
+	FI
+ENDFOR
+dst[MAX:32] := 0
+	</operation>
+	<instruction name="vpmovusqw"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="void" name="_mm_mask_cvtusepi64_storeu_epi16">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Convert</category>
+	<category>Store</category>
+	<parameter varname="base_addr" type="void*"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128i"/>
+	<description>Convert packed unsigned 64-bit integers in "a" to packed unsigned 16-bit integers with unsigned saturation, and store the active results (those with their respective bit set in writemask "k") to unaligned memory at "base_addr".</description>
+	<operation>
+FOR j := 0 to 1
+	i := 64*j
+	l := 16*j
+	IF k[j]
+		MEM[base_addr+l+15:base_addr+l] := Saturate_UnsignedInt64_To_Int16(a[i+63:i])
+	FI
+ENDFOR
+dst[MAX:32] := 0
+	</operation>
+	<instruction name="vpmovusqw"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm_maskz_cvtusepi64_epi16">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Convert</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128i"/>
+	<description>Convert packed unsigned 64-bit integers in "a" to packed unsigned 16-bit integers with unsigned saturation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 1
+	i := 64*j
+	l := 16*j
+	IF k[j]
+		dst[l+15:l] := Saturate_UnsignedInt64_To_Int16(a[i+63:i])
+	ELSE
+		dst[l+15:l] := 0
+	FI
+ENDFOR
+dst[MAX:32] := 0
+	</operation>
+	<instruction name="vpmovusqw"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm256_cvtusepi16_epi8">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512BW</CPUID>
+	<category>Convert</category>
+	<parameter varname="a" type="__m256i"/>
+	<description>Convert packed unsigned 16-bit integers in "a" to packed unsigned 8-bit integers with unsigned saturation, and store the results in "dst". </description>
+	<operation>
+FOR j := 0 to 15
+	i := 16*j
+	l := 8*j
+	dst[l+7:l] := Saturate_UnsignedInt16_To_Int8(a[i+15:i])
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vpmovuswb"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm256_mask_cvtusepi16_epi8">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512BW</CPUID>
+	<category>Convert</category>
+	<parameter varname="src" type="__m128i"/>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="a" type="__m256i"/>
+	<description>Convert packed unsigned 16-bit integers in "a" to packed unsigned 8-bit integers with unsigned saturation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 15
+	i := 16*j
+	l := 8*j
+	IF k[j]
+		dst[l+7:l] := Saturate_UnsignedInt16_To_Int8(a[i+15:i])
+	ELSE
+		dst[l+7:l] := src[l+7:l]
+	FI
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vpmovuswb"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="void" name="_mm256_mask_cvtusepi16_storeu_epi8">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512BW</CPUID>
+	<category>Convert</category>
+	<category>Store</category>
+	<parameter varname="base_addr" type="void*"/>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="a" type="__m256i"/>
+	<description>Convert packed unsigned 16-bit integers in "a" to packed unsigned 8-bit integers with unsigned saturation, and store the active results (those with their respective bit set in writemask "k") to unaligned memory at "base_addr".</description>
+	<operation>
+FOR j := 0 to 15
+	i := 16*j
+	l := 8*j
+	IF k[j]
+		MEM[base_addr+l+7:base_addr+l] := Saturate_UnsignedInt16_To_Int8(a[i+15:i])
+	FI
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vpmovuswb"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm256_maskz_cvtusepi16_epi8">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512BW</CPUID>
+	<category>Convert</category>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="a" type="__m256i"/>
+	<description>Convert packed unsigned 16-bit integers in "a" to packed unsigned 8-bit integers with unsigned saturation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 15
+	i := 16*j
+	l := 8*j
+	IF k[j]
+		dst[l+7:l] := Saturate_UnsignedInt16_To_Int8(a[i+15:i])
+	ELSE
+		dst[l+7:l] := 0
+	FI
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vpmovuswb"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256i" name="_mm512_cvtusepi16_epi8">
+	<type>Integer</type>
+	<CPUID>AVX512BW</CPUID>
+	<category>Convert</category>
+	<parameter varname="a" type="__m512i"/>
+	<description>Convert packed unsigned 16-bit integers in "a" to packed unsigned 8-bit integers with unsigned saturation, and store the results in "dst". </description>
+	<operation>
+FOR j := 0 to 31
+	i := 16*j
+	l := 8*j
+	dst[l+7:l] := Saturate_UnsignedInt16_To_Int8(a[i+15:i])
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vpmovuswb"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256i" name="_mm512_mask_cvtusepi16_epi8">
+	<type>Integer</type>
+	<CPUID>AVX512BW</CPUID>
+	<category>Convert</category>
+	<parameter varname="src" type="__m256i"/>
+	<parameter varname="k" type="__mmask32"/>
+	<parameter varname="a" type="__m512i"/>
+	<description>Convert packed unsigned 16-bit integers in "a" to packed unsigned 8-bit integers with unsigned saturation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 31
+	i := 16*j
+	l := 8*j
+	IF k[j]
+		dst[l+7:l] := Saturate_UnsignedInt16_To_Int8(a[i+15:i])
+	ELSE
+		dst[l+7:l] := src[l+7:l]
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vpmovuswb"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="void" name="_mm512_mask_cvtusepi16_storeu_epi8">
+	<type>Integer</type>
+	<CPUID>AVX512BW</CPUID>
+	<category>Convert</category>
+	<category>Store</category>
+	<parameter varname="base_addr" type="void*"/>
+	<parameter varname="k" type="__mmask32"/>
+	<parameter varname="a" type="__m512i"/>
+	<description>Convert packed unsigned 16-bit integers in "a" to packed unsigned 8-bit integers with unsigned saturation, and store the active results (those with their respective bit set in writemask "k") to unaligned memory at "base_addr".</description>
+	<operation>
+FOR j := 0 to 31
+	i := 16*j
+	l := 8*j
+	IF k[j]
+		MEM[base_addr+l+7:base_addr+l] := Saturate_UnsignedInt16_To_Int8(a[i+15:i])
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vpmovuswb"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256i" name="_mm512_maskz_cvtusepi16_epi8">
+	<type>Integer</type>
+	<CPUID>AVX512BW</CPUID>
+	<category>Convert</category>
+	<parameter varname="k" type="__mmask32"/>
+	<parameter varname="a" type="__m512i"/>
+	<description>Convert packed unsigned 16-bit integers in "a" to packed unsigned 8-bit integers with unsigned saturation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 31
+	i := 16*j
+	l := 8*j
+	IF k[j]
+		dst[l+7:l] := Saturate_UnsignedInt16_To_Int8(a[i+15:i])
+	ELSE
+		dst[l+7:l] := 0
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vpmovuswb"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm_cvtusepi16_epi8">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512BW</CPUID>
+	<category>Convert</category>
+	<parameter varname="a" type="__m128i"/>
+	<description>Convert packed unsigned 16-bit integers in "a" to packed unsigned 8-bit integers with unsigned saturation, and store the results in "dst". </description>
+	<operation>
+FOR j := 0 to 7
+	i := 16*j
+	l := 8*j
+	dst[l+7:l] := Saturate_UnsignedInt16_To_Int8(a[i+15:i])
+ENDFOR
+dst[MAX:64] := 0
+	</operation>
+	<instruction name="vpmovuswb"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm_mask_cvtusepi16_epi8">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512BW</CPUID>
+	<category>Convert</category>
+	<parameter varname="src" type="__m128i"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128i"/>
+	<description>Convert packed unsigned 16-bit integers in "a" to packed unsigned 8-bit integers with unsigned saturation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 7
+	i := 16*j
+	l := 8*j
+	IF k[j]
+		dst[l+7:l] := Saturate_UnsignedInt16_To_Int8(a[i+15:i])
+	ELSE
+		dst[l+7:l] := src[l+7:l]
+	FI
+ENDFOR
+dst[MAX:64] := 0
+	</operation>
+	<instruction name="vpmovuswb"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="void" name="_mm_mask_cvtusepi16_storeu_epi8">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512BW</CPUID>
+	<category>Convert</category>
+	<category>Store</category>
+	<parameter varname="base_addr" type="void*"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128i"/>
+	<description>Convert packed unsigned 16-bit integers in "a" to packed unsigned 8-bit integers with unsigned saturation, and store the active results (those with their respective bit set in writemask "k") to unaligned memory at "base_addr".</description>
+	<operation>
+FOR j := 0 to 7
+	i := 16*j
+	l := 8*j
+	IF k[j]
+		MEM[base_addr+l+7:base_addr+l] := Saturate_UnsignedInt16_To_Int8(a[i+15:i])
+	FI
+ENDFOR
+dst[MAX:64] := 0
+	</operation>
+	<instruction name="vpmovuswb"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm_maskz_cvtusepi16_epi8">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512BW</CPUID>
+	<category>Convert</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128i"/>
+	<description>Convert packed unsigned 16-bit integers in "a" to packed unsigned 8-bit integers with unsigned saturation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 7
+	i := 16*j
+	l := 8*j
+	IF k[j]
+		dst[l+7:l] := Saturate_UnsignedInt16_To_Int8(a[i+15:i])
+	ELSE
+		dst[l+7:l] := 0
+	FI
+ENDFOR
+dst[MAX:64] := 0
+	</operation>
+	<instruction name="vpmovuswb"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__mmask16" name="_mm256_movepi16_mask">
+	<type>Integer</type>
+	<type>Mask</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512BW</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="a" type="__m256i"/>
+	<description>Set each bit of mask register "k" based on the most significant bit of the corresponding packed 16-bit integer in "a".</description>
+	<operation>
+FOR j := 0 to 15
+	i := j*16
+	IF a[i+15]
+		k[j] := 1
+	ELSE
+		k[j] := 0
+	FI
+ENDFOR
+k[MAX:16] := 0
+	</operation>
+	<instruction name="vpmovw2m"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__mmask32" name="_mm512_movepi16_mask">
+	<type>Integer</type>
+	<type>Mask</type>
+	<CPUID>AVX512BW</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="a" type="__m512i"/>
+	<description>Set each bit of mask register "k" based on the most significant bit of the corresponding packed 16-bit integer in "a".</description>
+	<operation>
+FOR j := 0 to 31
+	i := j*16
+	IF a[i+15]
+		k[j] := 1
+	ELSE
+		k[j] := 0
+	FI
+ENDFOR
+k[MAX:32] := 0
+	</operation>
+	<instruction name="vpmovw2m"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__mmask8" name="_mm_movepi16_mask">
+	<type>Integer</type>
+	<type>Mask</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512BW</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="a" type="__m128i"/>
+	<description>Set each bit of mask register "k" based on the most significant bit of the corresponding packed 16-bit integer in "a".</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*16
+	IF a[i+15]
+		k[j] := 1
+	ELSE
+		k[j] := 0
+	FI
+ENDFOR
+k[MAX:8] := 0
+	</operation>
+	<instruction name="vpmovw2m"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm256_cvtepi16_epi8">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512BW</CPUID>
+	<category>Convert</category>
+	<parameter varname="a" type="__m256i"/>
+	<description>Convert packed 16-bit integers in "a" to packed 8-bit integers with truncation, and store the results in "dst". </description>
+	<operation>
+FOR j := 0 to 15
+	i := 16*j
+	l := 8*j
+	dst[l+7:l] := Truncate_Int16_To_Int8(a[i+15:i])
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vpmovwb"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm256_mask_cvtepi16_epi8">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512BW</CPUID>
+	<category>Convert</category>
+	<parameter varname="src" type="__m128i"/>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="a" type="__m256i"/>
+	<description>Convert packed 16-bit integers in "a" to packed 8-bit integers with truncation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 15
+	i := 16*j
+	l := 8*j
+	IF k[j]
+		dst[l+7:l] := Truncate_Int16_To_Int8(a[i+15:i])
+	ELSE
+		dst[l+7:l] := src[l+7:l]
+	FI
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vpmovwb"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="void" name="_mm256_mask_cvtepi16_storeu_epi8">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512BW</CPUID>
+	<category>Convert</category>
+	<category>Store</category>
+	<parameter varname="base_addr" type="void*"/>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="a" type="__m256i"/>
+	<description>Convert packed 16-bit integers in "a" to packed 8-bit integers with truncation, and store the active results (those with their respective bit set in writemask "k") to unaligned memory at "base_addr".</description>
+	<operation>
+FOR j := 0 to 15
+	i := 16*j
+	l := 8*j
+	IF k[j]
+		MEM[base_addr+l+7:base_addr+l] := Truncate_Int16_To_Int8(a[i+15:i])
+	FI
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vpmovwb"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm256_maskz_cvtepi16_epi8">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512BW</CPUID>
+	<category>Convert</category>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="a" type="__m256i"/>
+	<description>Convert packed 16-bit integers in "a" to packed 8-bit integers with truncation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 15
+	i := 16*j
+	l := 8*j
+	IF k[j]
+		dst[l+7:l] := Truncate_Int16_To_Int8(a[i+15:i])
+	ELSE
+		dst[l+7:l] := 0
+	FI
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vpmovwb"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256i" name="_mm512_cvtepi16_epi8">
+	<type>Integer</type>
+	<CPUID>AVX512BW</CPUID>
+	<category>Convert</category>
+	<parameter varname="a" type="__m512i"/>
+	<description>Convert packed 16-bit integers in "a" to packed 8-bit integers with truncation, and store the results in "dst". </description>
+	<operation>
+FOR j := 0 to 31
+	i := 16*j
+	l := 8*j
+	dst[l+7:l] := Truncate_Int16_To_Int8(a[i+15:i])
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vpmovwb"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256i" name="_mm512_mask_cvtepi16_epi8">
+	<type>Integer</type>
+	<CPUID>AVX512BW</CPUID>
+	<category>Convert</category>
+	<parameter varname="src" type="__m256i"/>
+	<parameter varname="k" type="__mmask32"/>
+	<parameter varname="a" type="__m512i"/>
+	<description>Convert packed 16-bit integers in "a" to packed 8-bit integers with truncation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 31
+	i := 16*j
+	l := 8*j
+	IF k[j]
+		dst[l+7:l] := Truncate_Int16_To_Int8(a[i+15:i])
+	ELSE
+		dst[l+7:l] := src[l+7:l]
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vpmovwb"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="void" name="_mm512_mask_cvtepi16_storeu_epi8">
+	<type>Integer</type>
+	<CPUID>AVX512BW</CPUID>
+	<category>Convert</category>
+	<category>Store</category>
+	<parameter varname="base_addr" type="void*"/>
+	<parameter varname="k" type="__mmask32"/>
+	<parameter varname="a" type="__m512i"/>
+	<description>Convert packed 16-bit integers in "a" to packed 8-bit integers with truncation, and store the active results (those with their respective bit set in writemask "k") to unaligned memory at "base_addr".</description>
+	<operation>
+FOR j := 0 to 31
+	i := 16*j
+	l := 8*j
+	IF k[j]
+		MEM[base_addr+l+7:base_addr+l] := Truncate_Int16_To_Int8(a[i+15:i])
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vpmovwb"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256i" name="_mm512_maskz_cvtepi16_epi8">
+	<type>Integer</type>
+	<CPUID>AVX512BW</CPUID>
+	<category>Convert</category>
+	<parameter varname="k" type="__mmask32"/>
+	<parameter varname="a" type="__m512i"/>
+	<description>Convert packed 16-bit integers in "a" to packed 8-bit integers with truncation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 31
+	i := 16*j
+	l := 8*j
+	IF k[j]
+		dst[l+7:l] := Truncate_Int16_To_Int8(a[i+15:i])
+	ELSE
+		dst[l+7:l] := 0
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vpmovwb"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm_cvtepi16_epi8">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512BW</CPUID>
+	<category>Convert</category>
+	<parameter varname="a" type="__m128i"/>
+	<description>Convert packed 16-bit integers in "a" to packed 8-bit integers with truncation, and store the results in "dst". </description>
+	<operation>
+FOR j := 0 to 7
+	i := 16*j
+	l := 8*j
+	dst[l+7:l] := Truncate_Int16_To_Int8(a[i+15:i])
+ENDFOR
+dst[MAX:64] := 0
+	</operation>
+	<instruction name="vpmovwb"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm_mask_cvtepi16_epi8">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512BW</CPUID>
+	<category>Convert</category>
+	<parameter varname="src" type="__m128i"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128i"/>
+	<description>Convert packed 16-bit integers in "a" to packed 8-bit integers with truncation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 7
+	i := 16*j
+	l := 8*j
+	IF k[j]
+		dst[l+7:l] := Truncate_Int16_To_Int8(a[i+15:i])
+	ELSE
+		dst[l+7:l] := src[l+7:l]
+	FI
+ENDFOR
+dst[MAX:64] := 0
+	</operation>
+	<instruction name="vpmovwb"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="void" name="_mm_mask_cvtepi16_storeu_epi8">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512BW</CPUID>
+	<category>Convert</category>
+	<category>Store</category>
+	<parameter varname="base_addr" type="void*"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128i"/>
+	<description>Convert packed 16-bit integers in "a" to packed 8-bit integers with truncation, and store the active results (those with their respective bit set in writemask "k") to unaligned memory at "base_addr".</description>
+	<operation>
+FOR j := 0 to 7
+	i := 16*j
+	l := 8*j
+	IF k[j]
+		MEM[base_addr+l+7:base_addr+l] := Truncate_Int16_To_Int8(a[i+15:i])
+	FI
+ENDFOR
+dst[MAX:64] := 0
+	</operation>
+	<instruction name="vpmovwb"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm_maskz_cvtepi16_epi8">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512BW</CPUID>
+	<category>Convert</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128i"/>
+	<description>Convert packed 16-bit integers in "a" to packed 8-bit integers with truncation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 7
+	i := 16*j
+	l := 8*j
+	IF k[j]
+		dst[l+7:l] := Truncate_Int16_To_Int8(a[i+15:i])
+	ELSE
+		dst[l+7:l] := 0
+	FI
+ENDFOR
+dst[MAX:64] := 0
+	</operation>
+	<instruction name="vpmovwb"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256i" name="_mm256_mask_cvtepu8_epi32">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Convert</category>
+	<parameter varname="src" type="__m256i"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128i"/>
+	<description>Zero extend packed unsigned 8-bit integers in the low 8 bytes of "a" to packed 32-bit integers, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 7
+	i := 32*j
+	l := 8*j
+	IF k[j]
+		dst[i+31:i] := ZeroExtend(a[l+7:l])
+	ELSE
+		dst[i+31:i] := src[i+31:i]
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vpmovzxbd"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256i" name="_mm256_maskz_cvtepu8_epi32">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Convert</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128i"/>
+	<description>Zero extend packed unsigned 8-bit integers in the low 8 bytes of "a" to packed 32-bit integers, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 7
+	i := 32*j
+	l := 8*j
+	IF k[j]
+		dst[i+31:i] := ZeroExtend(a[l+7:l])
+	ELSE
+		dst[i+31:i] := 0
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vpmovzxbd"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm_mask_cvtepu8_epi32">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Convert</category>
+	<parameter varname="src" type="__m128i"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128i"/>
+	<description>Zero extend packed unsigned 8-bit integers in the low 4 bytes of "a" to packed 32-bit integers, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 3
+	i := 32*j
+	l := 8*j
+	IF k[j]
+		dst[i+31:i] := ZeroExtend(a[l+7:l])
+	ELSE
+		dst[i+31:i] := src[i+31:i]
+	FI
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vpmovzxbd"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm_maskz_cvtepu8_epi32">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Convert</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128i"/>
+	<description>Zero extend packed unsigned 8-bit integers in the low 4 bytes of "a" to packed 32-bit integers, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 3
+	i := 32*j
+	l := 8*j
+	IF k[j]
+		dst[i+31:i] := ZeroExtend(a[l+7:l])
+	ELSE
+		dst[i+31:i] := 0
+	FI
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vpmovzxbd"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256i" name="_mm256_mask_cvtepu8_epi64">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Convert</category>
+	<parameter varname="src" type="__m256i"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128i"/>
+	<description>Zero extend packed unsigned 8-bit integers in the low 4 bytes of "a" to packed 64-bit integers, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 3
+	i := 64*j
+	l := 8*j
+	IF k[j]
+		dst[i+63:i] := ZeroExtend(a[l+7:l])
+	ELSE
+		dst[i+63:i] := src[i+63:i]
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vpmovzxbq"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256i" name="_mm256_maskz_cvtepu8_epi64">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Convert</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128i"/>
+	<description>Zero extend packed unsigned 8-bit integers in the low 4 bytes of "a" to packed 64-bit integers, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 3
+	i := 64*j
+	l := 8*j
+	IF k[j]
+		dst[i+63:i] := ZeroExtend(a[l+7:l])
+	ELSE
+		dst[i+63:i] := 0
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vpmovzxbq"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm_mask_cvtepu8_epi64">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Convert</category>
+	<parameter varname="src" type="__m128i"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128i"/>
+	<description>Zero extend packed unsigned 8-bit integers in the low 2 bytes of "a" to packed 64-bit integers, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 1
+	i := 64*j
+	l := 8*j
+	IF k[j]
+		dst[i+63:i] := ZeroExtend(a[l+7:l])
+	ELSE
+		dst[i+63:i] := src[i+63:i]
+	FI
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vpmovzxbq"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm_maskz_cvtepu8_epi64">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Convert</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128i"/>
+	<description>Zero extend packed unsigned 8-bit integers in the low 2 bytes of "a" to packed 64-bit integers, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 1
+	i := 64*j
+	l := 8*j
+	IF k[j]
+		dst[i+63:i] := ZeroExtend(a[l+7:l])
+	ELSE
+		dst[i+63:i] := 0
+	FI
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vpmovzxbq"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256i" name="_mm256_mask_cvtepu8_epi16">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512BW</CPUID>
+	<category>Convert</category>
+	<parameter varname="src" type="__m256i"/>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="a" type="__m128i"/>
+	<description>Zero extend packed unsigned 8-bit integers in "a" to packed 16-bit integers, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 15
+	i := j*8
+	l := j*16
+	IF k[j]
+		dst[l+15:l] := ZeroExtend(a[i+7:i])
+	ELSE
+		dst[l+15:l] := src[l+15:l]
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vpmovzxbw"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256i" name="_mm256_maskz_cvtepu8_epi16">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512BW</CPUID>
+	<category>Convert</category>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="a" type="__m128i"/>
+	<description>Zero extend packed unsigned 8-bit integers in "a" to packed 16-bit integers, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).
+	</description>
+	<operation>
+FOR j := 0 to 15
+	i := j*8
+	l := j*16
+	IF k[j]
+		dst[l+15:l] := ZeroExtend(a[i+7:i])
+	ELSE
+		dst[l+15:l] := 0
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vpmovzxbw"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512i" name="_mm512_cvtepu8_epi16">
+	<type>Integer</type>
+	<CPUID>AVX512BW</CPUID>
+	<category>Convert</category>
+	<parameter varname="a" type="__m256i"/>
+	<description>Zero extend packed unsigned 8-bit integers in "a" to packed 16-bit integers, and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 31
+	i := j*8
+	l := j*16
+	dst[l+15:l] := ZeroExtend(a[i+7:i])
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name="vpmovzxbw"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512i" name="_mm512_mask_cvtepu8_epi16">
+	<type>Integer</type>
+	<CPUID>AVX512BW</CPUID>
+	<category>Convert</category>
+	<parameter varname="src" type="__m512i"/>
+	<parameter varname="k" type="__mmask32"/>
+	<parameter varname="a" type="__m256i"/>
+	<description>Zero extend packed unsigned 8-bit integers in "a" to packed 16-bit integers, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 31
+	i := j*8
+	l := j*16
+	IF k[j]
+		dst[l+15:l] := ZeroExtend(a[i+7:i])
+	ELSE
+		dst[l+15:l] := src[l+15:l]
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name="vpmovzxbw"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512i" name="_mm512_maskz_cvtepu8_epi16">
+	<type>Integer</type>
+	<CPUID>AVX512BW</CPUID>
+	<category>Convert</category>
+	<parameter varname="k" type="__mmask32"/>
+	<parameter varname="a" type="__m256i"/>
+	<description>Zero extend packed unsigned 8-bit integers in "a" to packed 16-bit integers, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).
+	</description>
+	<operation>
+FOR j := 0 to 31
+	i := j*8
+	l := j*16
+	IF k[j]
+		dst[l+15:l] := ZeroExtend(a[i+7:i])
+	ELSE
+		dst[l+15:l] := 0
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name="vpmovzxbw"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm_mask_cvtepu8_epi16">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512BW</CPUID>
+	<category>Convert</category>
+	<parameter varname="src" type="__m128i"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128i"/>
+	<description>Zero extend packed unsigned 8-bit integers in "a" to packed 16-bit integers, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 7
+	i := j*8
+	l := j*16
+	IF k[j]
+		dst[l+15:l] := ZeroExtend(a[i+7:i])
+	ELSE
+		dst[l+15:l] := src[l+15:l]
+	FI
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vpmovzxbw"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm_maskz_cvtepu8_epi16">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512BW</CPUID>
+	<category>Convert</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128i"/>
+	<description>Zero extend packed unsigned 8-bit integers in "a" to packed 16-bit integers, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).
+	</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*8
+	l := j*16
+	IF k[j]
+		dst[l+15:l] := ZeroExtend(a[i+7:i])
+	ELSE
+		dst[l+15:l] := 0
+	FI
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vpmovzxbw"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256i" name="_mm256_mask_cvtepu32_epi64">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Convert</category>
+	<parameter varname="src" type="__m256i"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128i"/>
+	<description>Zero extend packed unsigned 32-bit integers in "a" to packed 64-bit integers, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 3
+	i := 64*j
+	l := 32*j
+	IF k[j]
+		dst[i+63:i] := ZeroExtend(a[l+31:l])
+	ELSE
+		dst[i+63:i] := src[i+63:i]
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vpmovzxdq"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256i" name="_mm256_maskz_cvtepu32_epi64">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Convert</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128i"/>
+	<description>Zero extend packed unsigned 32-bit integers in "a" to packed 64-bit integers, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 3
+	i := 64*j
+	l := 32*j
+	IF k[j]
+		dst[i+63:i] := ZeroExtend(a[l+31:l])
+	ELSE 
+		dst[i+63:i] := 0
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vpmovzxdq"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm_mask_cvtepu32_epi64">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Convert</category>
+	<parameter varname="src" type="__m128i"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128i"/>
+	<description>Zero extend packed unsigned 32-bit integers in "a" to packed 64-bit integers, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 1
+	i := 64*j
+	l := 32*j
+	IF k[j]
+		dst[i+63:i] := ZeroExtend(a[l+31:l])
+	ELSE
+		dst[i+63:i] := src[i+63:i]
+	FI
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vpmovzxdq"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm_maskz_cvtepu32_epi64">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Convert</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128i"/>
+	<description>Zero extend packed unsigned 32-bit integers in "a" to packed 64-bit integers, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 1
+	i := 64*j
+	l := 32*j
+	IF k[j]
+		dst[i+63:i] := ZeroExtend(a[l+31:l])
+	ELSE 
+		dst[i+63:i] := 0
+	FI
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vpmovzxdq"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256i" name="_mm256_mask_cvtepu16_epi32">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Convert</category>
+	<parameter varname="src" type="__m256i"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128i"/>
+	<description>Zero extend packed unsigned 16-bit integers in "a" to packed 32-bit integers, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 7
+	i := 32*j
+	l := 16*j
+	IF k[j]
+		dst[i+31:i] := ZeroExtend(a[l+15:l])
+	ELSE
+		dst[i+31:i] := src[i+31:i]
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vpmovzxwd"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256i" name="_mm256_maskz_cvtepu16_epi32">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Convert</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128i"/>
+	<description>Zero extend packed unsigned 16-bit integers in "a" to packed 32-bit integers, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 7
+	i := 32*j
+	l := 16*j
+	IF k[j]
+		dst[i+31:i] := ZeroExtend(a[l+15:l])
+	ELSE
+		dst[i+31:i] := 0
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vpmovzxwd"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm_mask_cvtepu16_epi32">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Convert</category>
+	<parameter varname="src" type="__m128i"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128i"/>
+	<description>Zero extend packed unsigned 16-bit integers in "a" to packed 32-bit integers, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 3
+	i := 32*j
+	l := 16*j
+	IF k[j]
+		dst[i+31:i] := ZeroExtend(a[l+15:l])
+	ELSE
+		dst[i+31:i] := src[i+31:i]
+	FI
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vpmovzxwd"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm_maskz_cvtepu16_epi32">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Convert</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128i"/>
+	<description>Zero extend packed unsigned 16-bit integers in "a" to packed 32-bit integers, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 3
+	i := 32*j
+	l := 16*j
+	IF k[j]
+		dst[i+31:i] := ZeroExtend(a[l+15:l])
+	ELSE
+		dst[i+31:i] := 0
+	FI
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vpmovzxwd"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256i" name="_mm256_mask_cvtepu16_epi64">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Convert</category>
+	<parameter varname="src" type="__m256i"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128i"/>
+	<description>Zero extend packed unsigned 16-bit integers in the low 8 bytes of "a" to packed 64-bit integers, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 3
+	i := 64*j
+	l := 16*j
+	IF k[j]
+		dst[i+63:i] := ZeroExtend(a[l+15:l])
+	ELSE
+		dst[i+63:i] := src[i+63:i]
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vpmovzxwq"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256i" name="_mm256_maskz_cvtepu16_epi64">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Convert</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128i"/>
+	<description>Zero extend packed unsigned 16-bit integers in the low 8 bytes of "a" to packed 64-bit integers, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 3
+	i := 64*j
+	l := 16*j
+	IF k[j]
+		dst[i+63:i] := ZeroExtend(a[l+15:l])
+	ELSE
+		dst[i+63:i] := 0
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vpmovzxwq"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm_mask_cvtepu16_epi64">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Convert</category>
+	<parameter varname="src" type="__m128i"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128i"/>
+	<description>Zero extend packed unsigned 16-bit integers in the low 4 bytes of "a" to packed 64-bit integers, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 1
+	i := 64*j
+	l := 16*j
+	IF k[j]
+		dst[i+63:i] := ZeroExtend(a[l+15:l])
+	ELSE
+		dst[i+63:i] := src[i+63:i]
+	FI
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vpmovzxwq"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm_maskz_cvtepu16_epi64">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Convert</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128i"/>
+	<description>Zero extend packed unsigned 16-bit integers in the low 4 bytes of "a" to packed 64-bit integers, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 1
+	i := 64*j
+	l := 16*j
+	IF k[j]
+		dst[i+63:i] := ZeroExtend(a[l+15:l])
+	ELSE
+		dst[i+63:i] := 0
+	FI
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vpmovzxwq"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256i" name="_mm256_mask_mul_epi32">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="src" type="__m256i"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m256i"/>
+	<parameter varname="b" type="__m256i"/>
+	<description>Multiply the low 32-bit integers from each packed 64-bit element in "a" and "b", and store the signed 64-bit results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 3
+	i := j*64
+	IF k[j]
+		dst[i+63:i] := a[i+31:i] * b[i+31:i]
+	ELSE
+		dst[i+63:i] := src[i+63:i]
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vpmuldq"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256i" name="_mm256_maskz_mul_epi32">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m256i"/>
+	<parameter varname="b" type="__m256i"/>
+	<description>Multiply the low 32-bit integers from each packed 64-bit element in "a" and "b", and store the signed 64-bit results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 3
+	i := j*64
+	IF k[j]
+		dst[i+63:i] := a[i+31:i] * b[i+31:i]
+	ELSE
+		dst[i+63:i] := 0
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vpmuldq"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm_mask_mul_epi32">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="src" type="__m128i"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128i"/>
+	<parameter varname="b" type="__m128i"/>
+	<description>Multiply the low 32-bit integers from each packed 64-bit element in "a" and "b", and store the signed 64-bit results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 1
+	i := j*64
+	IF k[j]
+		dst[i+63:i] := a[i+31:i] * b[i+31:i]
+	ELSE
+		dst[i+63:i] := src[i+63:i]
+	FI
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vpmuldq"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm_maskz_mul_epi32">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128i"/>
+	<parameter varname="b" type="__m128i"/>
+	<description>Multiply the low 32-bit integers from each packed 64-bit element in "a" and "b", and store the signed 64-bit results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 1
+	i := j*64
+	IF k[j]
+		dst[i+63:i] := a[i+31:i] * b[i+31:i]
+	ELSE
+		dst[i+63:i] := 0
+	FI
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vpmuldq"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256i" name="_mm256_mask_mulhrs_epi16">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512BW</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="src" type="__m256i"/>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="a" type="__m256i"/>
+	<parameter varname="b" type="__m256i"/>
+	<description>Multiply packed 16-bit integers in "a" and "b", producing intermediate signed 32-bit integers. Truncate each intermediate integer to the 18 most significant bits, round by adding 1, and store bits [16:1] to "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 15
+	i := j*16
+	IF k[j]
+		tmp[31:0] := ((a[i+15:i] * b[i+15:i]) &gt;&gt; 14) + 1
+		dst[i+15:i] := tmp[16:1]
+	ELSE
+		dst[i+15:i] := src[i+15:i]
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vpmulhrsw"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256i" name="_mm256_maskz_mulhrs_epi16">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512BW</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="a" type="__m256i"/>
+	<parameter varname="b" type="__m256i"/>
+	<description>Multiply packed 16-bit integers in "a" and "b", producing intermediate signed 32-bit integers. Truncate each intermediate integer to the 18 most significant bits, round by adding 1, and store bits [16:1] to "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 15
+	i := j*16
+	IF k[j]
+		tmp[31:0] := ((a[i+15:i] * b[i+15:i]) &gt;&gt; 14) + 1
+		dst[i+15:i] := tmp[16:1]
+	ELSE
+		dst[i+15:i] := 0
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vpmulhrsw"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512i" name="_mm512_mask_mulhrs_epi16">
+	<type>Integer</type>
+	<CPUID>AVX512BW</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="src" type="__m512i"/>
+	<parameter varname="k" type="__mmask32"/>
+	<parameter varname="a" type="__m512i"/>
+	<parameter varname="b" type="__m512i"/>
+	<description>Multiply packed 16-bit integers in "a" and "b", producing intermediate signed 32-bit integers. Truncate each intermediate integer to the 18 most significant bits, round by adding 1, and store bits [16:1] to "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 31
+	i := j*16
+	IF k[j]
+		tmp[31:0] := ((a[i+15:i] * b[i+15:i]) &gt;&gt; 14) + 1
+		dst[i+15:i] := tmp[16:1]
+	ELSE
+		dst[i+15:i] := src[i+15:i]
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name="vpmulhrsw"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512i" name="_mm512_maskz_mulhrs_epi16">
+	<type>Integer</type>
+	<CPUID>AVX512BW</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="k" type="__mmask32"/>
+	<parameter varname="a" type="__m512i"/>
+	<parameter varname="b" type="__m512i"/>
+	<description>Multiply packed 16-bit integers in "a" and "b", producing intermediate signed 32-bit integers. Truncate each intermediate integer to the 18 most significant bits, round by adding 1, and store bits [16:1] to "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 31
+	i := j*16
+	IF k[j]
+		tmp[31:0] := ((a[i+15:i] * b[i+15:i]) &gt;&gt; 14) + 1
+		dst[i+15:i] := tmp[16:1]
+	ELSE
+		dst[i+15:i] := 0
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name="vpmulhrsw"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512i" name="_mm512_mulhrs_epi16">
+	<type>Integer</type>
+	<CPUID>AVX512BW</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="a" type="__m512i"/>
+	<parameter varname="b" type="__m512i"/>
+	<description>Multiply packed 16-bit integers in "a" and "b", producing intermediate signed 32-bit integers. Truncate each intermediate integer to the 18 most significant bits, round by adding 1, and store bits [16:1] to "dst". </description>
+	<operation>
+FOR j := 0 to 31
+	i := j*16
+	tmp[31:0] := ((a[i+15:i] * b[i+15:i]) &gt;&gt; 14) + 1
+	dst[i+15:i] := tmp[16:1]
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name="vpmulhrsw"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm_mask_mulhrs_epi16">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512BW</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="src" type="__m128i"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128i"/>
+	<parameter varname="b" type="__m128i"/>
+	<description>Multiply packed 16-bit integers in "a" and "b", producing intermediate signed 32-bit integers. Truncate each intermediate integer to the 18 most significant bits, round by adding 1, and store bits [16:1] to "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 7
+	i := j*16
+	IF k[j]
+		tmp[31:0] := ((a[i+15:i] * b[i+15:i]) &gt;&gt; 14) + 1
+		dst[i+15:i] := tmp[16:1]
+	ELSE
+		dst[i+15:i] := src[i+15:i]
+	FI
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vpmulhrsw"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm_maskz_mulhrs_epi16">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512BW</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128i"/>
+	<parameter varname="b" type="__m128i"/>
+	<description>Multiply packed 16-bit integers in "a" and "b", producing intermediate signed 32-bit integers. Truncate each intermediate integer to the 18 most significant bits, round by adding 1, and store bits [16:1] to "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 7
+	i := j*16
+	IF k[j]
+		tmp[31:0] := ((a[i+15:i] * b[i+15:i]) &gt;&gt; 14) + 1
+		dst[i+15:i] := tmp[16:1]
+	ELSE
+		dst[i+15:i] := 0
+	FI
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vpmulhrsw"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256i" name="_mm256_mask_mulhi_epu16">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512BW</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="src" type="__m256i"/>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="a" type="__m256i"/>
+	<parameter varname="b" type="__m256i"/>
+	<description>Multiply the packed unsigned 16-bit integers in "a" and "b", producing intermediate 32-bit integers, and store the high 16 bits of the intermediate integers in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 15
+	i := j*16
+	IF k[j]
+		tmp[31:0] := a[i+15:i] * b[i+15:i]
+		dst[i+15:i] := tmp[31:16]
+	ELSE
+		dst[i+15:i] := src[i+15:i]
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vpmulhuw"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256i" name="_mm256_maskz_mulhi_epu16">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512BW</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="a" type="__m256i"/>
+	<parameter varname="b" type="__m256i"/>
+	<description>Multiply the packed unsigned 16-bit integers in "a" and "b", producing intermediate 32-bit integers, and store the high 16 bits of the intermediate integers in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 15
+	i := j*16
+	IF k[j]
+		tmp[31:0] := a[i+15:i] * b[i+15:i]
+		dst[i+15:i] := tmp[31:16]
+	ELSE
+		dst[i+15:i] := 0
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vpmulhuw"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512i" name="_mm512_mask_mulhi_epu16">
+	<type>Integer</type>
+	<CPUID>AVX512BW</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="src" type="__m512i"/>
+	<parameter varname="k" type="__mmask32"/>
+	<parameter varname="a" type="__m512i"/>
+	<parameter varname="b" type="__m512i"/>
+	<description>Multiply the packed unsigned 16-bit integers in "a" and "b", producing intermediate 32-bit integers, and store the high 16 bits of the intermediate integers in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 31
+	i := j*16
+	IF k[j]
+		tmp[31:0] := a[i+15:i] * b[i+15:i]
+		dst[i+15:i] := tmp[31:16]
+	ELSE
+		dst[i+15:i] := src[i+15:i]
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name="vpmulhuw"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512i" name="_mm512_maskz_mulhi_epu16">
+	<type>Integer</type>
+	<CPUID>AVX512BW</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="k" type="__mmask32"/>
+	<parameter varname="a" type="__m512i"/>
+	<parameter varname="b" type="__m512i"/>
+	<description>Multiply the packed unsigned 16-bit integers in "a" and "b", producing intermediate 32-bit integers, and store the high 16 bits of the intermediate integers in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 31
+	i := j*16
+	IF k[j]
+		tmp[31:0] := a[i+15:i] * b[i+15:i]
+		dst[i+15:i] := tmp[31:16]
+	ELSE
+		dst[i+15:i] := 0
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name="vpmulhuw"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512i" name="_mm512_mulhi_epu16">
+	<type>Integer</type>
+	<CPUID>AVX512BW</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="a" type="__m512i"/>
+	<parameter varname="b" type="__m512i"/>
+	<description>Multiply the packed unsigned 16-bit integers in "a" and "b", producing intermediate 32-bit integers, and store the high 16 bits of the intermediate integers in "dst". </description>
+	<operation>
+FOR j := 0 to 31
+	i := j*16
+	tmp[31:0] := a[i+15:i] * b[i+15:i]
+	dst[i+15:i] := tmp[31:16]
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name="vpmulhuw"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm_mask_mulhi_epu16">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512BW</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="src" type="__m128i"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128i"/>
+	<parameter varname="b" type="__m128i"/>
+	<description>Multiply the packed unsigned 16-bit integers in "a" and "b", producing intermediate 32-bit integers, and store the high 16 bits of the intermediate integers in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 7
+	i := j*16
+	IF k[j]
+		tmp[31:0] := a[i+15:i] * b[i+15:i]
+		dst[i+15:i] := tmp[31:16]
+	ELSE
+		dst[i+15:i] := src[i+15:i]
+	FI
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vpmulhuw"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm_maskz_mulhi_epu16">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512BW</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128i"/>
+	<parameter varname="b" type="__m128i"/>
+	<description>Multiply the packed unsigned 16-bit integers in "a" and "b", producing intermediate 32-bit integers, and store the high 16 bits of the intermediate integers in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 7
+	i := j*16
+	IF k[j]
+		tmp[31:0] := a[i+15:i] * b[i+15:i]
+		dst[i+15:i] := tmp[31:16]
+	ELSE
+		dst[i+15:i] := 0
+	FI
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vpmulhuw"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256i" name="_mm256_mask_mulhi_epi16">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512BW</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="src" type="__m256i"/>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="a" type="__m256i"/>
+	<parameter varname="b" type="__m256i"/>
+	<description>Multiply the packed 16-bit integers in "a" and "b", producing intermediate 32-bit integers, and store the high 16 bits of the intermediate integers in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 15
+	i := j*16
+	IF k[j]
+		tmp[31:0] := a[i+15:i] * b[i+15:i]
+		dst[i+15:i] := tmp[31:16]
+	ELSE
+		dst[i+15:i] := src[i+15:i]
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vpmulhw"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256i" name="_mm256_maskz_mulhi_epi16">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512BW</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="a" type="__m256i"/>
+	<parameter varname="b" type="__m256i"/>
+	<description>Multiply the packed 16-bit integers in "a" and "b", producing intermediate 32-bit integers, and store the high 16 bits of the intermediate integers in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 15
+	i := j*16
+	IF k[j]
+		tmp[31:0] := a[i+15:i] * b[i+15:i]
+		dst[i+15:i] := tmp[31:16]
+	ELSE
+		dst[i+15:i] := 0
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vpmulhw"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512i" name="_mm512_mask_mulhi_epi16">
+	<type>Integer</type>
+	<CPUID>AVX512BW</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="src" type="__m512i"/>
+	<parameter varname="k" type="__mmask32"/>
+	<parameter varname="a" type="__m512i"/>
+	<parameter varname="b" type="__m512i"/>	
+	<description>Multiply the packed 16-bit integers in "a" and "b", producing intermediate 32-bit integers, and store the high 16 bits of the intermediate integers in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 31
+	i := j*16
+	IF k[j]
+		tmp[31:0] := a[i+15:i] * b[i+15:i]
+		dst[i+15:i] := tmp[31:16]
+	ELSE
+		dst[i+15:i] := src[i+15:i]
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name="vpmulhw"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512i" name="_mm512_maskz_mulhi_epi16">
+	<type>Integer</type>
+	<CPUID>AVX512BW</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="k" type="__mmask32"/>
+	<parameter varname="a" type="__m512i"/>
+	<parameter varname="b" type="__m512i"/>
+	<description>Multiply the packed 16-bit integers in "a" and "b", producing intermediate 32-bit integers, and store the high 16 bits of the intermediate integers in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 31
+	i := j*16
+	IF k[j]
+		tmp[31:0] := a[i+15:i] * b[i+15:i]
+		dst[i+15:i] := tmp[31:16]
+	ELSE
+		dst[i+15:i] := 0
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name="vpmulhw"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512i" name="_mm512_mulhi_epi16">
+	<type>Integer</type>
+	<CPUID>AVX512BW</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="a" type="__m512i"/>
+	<parameter varname="b" type="__m512i"/>
+	<description>Multiply the packed 16-bit integers in "a" and "b", producing intermediate 32-bit integers, and store the high 16 bits of the intermediate integers in "dst". </description>
+	<operation>
+FOR j := 0 to 31
+	i := j*16
+	tmp[31:0] := a[i+15:i] * b[i+15:i]
+	dst[i+15:i] := tmp[31:16]
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name="vpmulhw"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm_mask_mulhi_epi16">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512BW</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="src" type="__m128i"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128i"/>
+	<parameter varname="b" type="__m128i"/>
+	<description>Multiply the packed 16-bit integers in "a" and "b", producing intermediate 32-bit integers, and store the high 16 bits of the intermediate integers in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 7
+	i := j*16
+	IF k[j]
+		tmp[31:0] := a[i+15:i] * b[i+15:i]
+		dst[i+15:i] := tmp[31:16]
+	ELSE
+		dst[i+15:i] := src[i+15:i]
+	FI
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vpmulhw"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm_maskz_mulhi_epi16">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512BW</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128i"/>
+	<parameter varname="b" type="__m128i"/>
+	<description>Multiply the packed 16-bit integers in "a" and "b", producing intermediate 32-bit integers, and store the high 16 bits of the intermediate integers in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 7
+	i := j*16
+	IF k[j]
+		tmp[31:0] := a[i+15:i] * b[i+15:i]
+		dst[i+15:i] := tmp[31:16]
+	ELSE
+		dst[i+15:i] := 0
+	FI
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vpmulhw"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256i" name="_mm256_mask_mullo_epi32">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="src" type="__m256i"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m256i"/>
+	<parameter varname="b" type="__m256i"/>
+	<description>Multiply the packed 32-bit integers in "a" and "b", producing intermediate 64-bit integers, and store the low 32 bits of the intermediate integers in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 7
+	i := j*32
+	IF k[j]
+		tmp[63:0] := a[i+31:i] * b[i+31:i]
+		dst[i+31:i] := tmp[31:0]
+	ELSE
+		dst[i+31:i] := src[i+31:i]
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vpmulld"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256i" name="_mm256_maskz_mullo_epi32">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m256i"/>
+	<parameter varname="b" type="__m256i"/>
+	<description>Multiply the packed 32-bit integers in "a" and "b", producing intermediate 64-bit integers, and store the low 32 bits of the intermediate integers in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 7
+	i := j*32
+	IF k[j]
+		tmp[63:0] := a[i+31:i] * b[i+31:i]
+		dst[i+31:i] := tmp[31:0]
+	ELSE
+		dst[i+31:i] := 0
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vpmulld"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512i" name="_mm512_maskz_mullo_epi32">
+	<type>Integer</type>
+	<CPUID>AVX512F</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="a" type="__m512i"/>
+	<parameter varname="b" type="__m512i"/>
+	<description>Multiply the packed 32-bit integers in "a" and "b", producing intermediate 64-bit integers, and store the low 32 bits of the intermediate integers in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 15
+	i := j*32
+	IF k[j]
+		tmp[63:0] := a[i+31:i] * b[i+31:i]
+		dst[i+31:i] := tmp[31:0]
+	ELSE
+		dst[i+31:i] := 0
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name="vpmulld"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm_mask_mullo_epi32">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="src" type="__m128i"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128i"/>
+	<parameter varname="b" type="__m128i"/>
+	<description>Multiply the packed 32-bit integers in "a" and "b", producing intermediate 64-bit integers, and store the low 32 bits of the intermediate integers in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 3
+	i := j*32
+	IF k[j]
+		tmp[63:0] := a[i+31:i] * b[i+31:i]
+		dst[i+31:i] := tmp[31:0]
+	ELSE
+		dst[i+31:i] := src[i+31:i]
+	FI
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vpmulld"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm_maskz_mullo_epi32">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128i"/>
+	<parameter varname="b" type="__m128i"/>
+	<description>Multiply the packed 32-bit integers in "a" and "b", producing intermediate 64-bit integers, and store the low 32 bits of the intermediate integers in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 3
+	i := j*32
+	IF k[j]
+		tmp[63:0] := a[i+31:i] * b[i+31:i]
+		dst[i+31:i] := tmp[31:0]
+	ELSE
+		dst[i+31:i] := 0
+	FI
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vpmulld"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256i" name="_mm256_mask_mullo_epi64">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512DQ</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="src" type="__m256i"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m256i"/>
+	<parameter varname="b" type="__m256i"/>
+	<description>Multiply the packed 64-bit integers in "a" and "b", producing intermediate 128-bit integers, and store the low 64 bits of the intermediate integers in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 3
+	i := j*64
+	IF k[j]
+		tmp[127:0] := a[i+63:i] * b[i+63:i]
+		dst[i+63:i] := tmp[63:0]
+	ELSE
+		dst[i+63:i] := src[i+63:i]
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vpmullq"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256i" name="_mm256_maskz_mullo_epi64">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512DQ</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m256i"/>
+	<parameter varname="b" type="__m256i"/>
+	<description>Multiply the packed 64-bit integers in "a" and "b", producing intermediate 128-bit integers, and store the low 64 bits of the intermediate integers in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 3
+	i := j*64
+	IF k[j]
+		tmp[127:0] := a[i+63:i] * b[i+63:i]
+		dst[i+63:i] := tmp[63:0]
+	ELSE
+		dst[i+63:i] := 0
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vpmullq"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256i" name="_mm256_mullo_epi64">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512DQ</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="a" type="__m256i"/>
+	<parameter varname="b" type="__m256i"/>
+	<description>Multiply the packed 64-bit integers in "a" and "b", producing intermediate 128-bit integers, and store the low 64 bits of the intermediate integers in "dst". </description>
+	<operation>
+FOR j := 0 to 3
+	i := j*64
+	tmp[127:0] := a[i+63:i] * b[i+63:i]
+	dst[i+63:i] := tmp[63:0]
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vpmullq"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512i" name="_mm512_mask_mullo_epi64">
+	<type>Integer</type>
+	<CPUID>AVX512DQ</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="src" type="__m512i"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m512i"/>
+	<parameter varname="b" type="__m512i"/>
+	<description>Multiply the packed 64-bit integers in "a" and "b", producing intermediate 128-bit integers, and store the low 64 bits of the intermediate integers in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 7
+	i := j*64
+	IF k[j]
+		tmp[127:0] := a[i+63:i] * b[i+63:i]
+		dst[i+63:i] := tmp[63:0]
+	ELSE
+		dst[i+63:i] := src[i+63:i]
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name="vpmullq"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512i" name="_mm512_maskz_mullo_epi64">
+	<type>Integer</type>
+	<CPUID>AVX512DQ</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m512i"/>
+	<parameter varname="b" type="__m512i"/>
+	<description>Multiply the packed 64-bit integers in "a" and "b", producing intermediate 128-bit integers, and store the low 64 bits of the intermediate integers in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 7
+	i := j*64
+	IF k[j]
+		tmp[127:0] := a[i+63:i] * b[i+63:i]
+		dst[i+63:i] := tmp[63:0]
+	ELSE
+		dst[i+63:i] := 0
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name="vpmullq"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512i" name="_mm512_mullo_epi64">
+	<type>Integer</type>
+	<CPUID>AVX512DQ</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="a" type="__m512i"/>
+	<parameter varname="b" type="__m512i"/>
+	<description>Multiply the packed 64-bit integers in "a" and "b", producing intermediate 128-bit integers, and store the low 64 bits of the intermediate integers in "dst". </description>
+	<operation>
+FOR j := 0 to 7
+	i := j*64
+	tmp[127:0] := a[i+63:i] * b[i+63:i]
+	dst[i+63:i] := tmp[63:0]
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name="vpmullq"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm_mask_mullo_epi64">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512DQ</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="src" type="__m128i"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128i"/>
+	<parameter varname="b" type="__m128i"/>
+	<description>Multiply the packed 64-bit integers in "a" and "b", producing intermediate 128-bit integers, and store the low 64 bits of the intermediate integers in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 1
+	i := j*64
+	IF k[j]
+		tmp[127:0] := a[i+63:i] * b[i+63:i]
+		dst[i+63:i] := tmp[63:0]
+	ELSE
+		dst[i+63:i] := src[i+63:i]
+	FI
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vpmullq"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm_maskz_mullo_epi64">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512DQ</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128i"/>
+	<parameter varname="b" type="__m128i"/>
+	<description>Multiply the packed 64-bit integers in "a" and "b", producing intermediate 128-bit integers, and store the low 64 bits of the intermediate integers in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 1
+	i := j*64
+	IF k[j]
+		tmp[127:0] := a[i+63:i] * b[i+63:i]
+		dst[i+63:i] := tmp[63:0]
+	ELSE
+		dst[i+63:i] := 0
+	FI
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vpmullq"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm_mullo_epi64">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512DQ</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="a" type="__m128i"/>
+	<parameter varname="b" type="__m128i"/>
+	<description>Multiply the packed 64-bit integers in "a" and "b", producing intermediate 128-bit integers, and store the low 64 bits of the intermediate integers in "dst". </description>
+	<operation>
+FOR j := 0 to 1
+	i := j*64
+	tmp[127:0] := a[i+63:i] * b[i+63:i]
+	dst[i+63:i] := tmp[63:0]
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vpmullq"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256i" name="_mm256_mask_mullo_epi16">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512BW</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="src" type="__m256i"/>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="a" type="__m256i"/>
+	<parameter varname="b" type="__m256i"/>
+	<description>Multiply the packed 16-bit integers in "a" and "b", producing intermediate 32-bit integers, and store the low 16 bits of the intermediate integers in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 15
+	i := j*16
+	IF k[j]
+		tmp[31:0] := a[i+15:i] * b[i+15:i]
+		dst[i+15:i] := tmp[15:0]
+	ELSE
+		dst[i+15:i] := src[i+15:i]
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vpmullw"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256i" name="_mm256_maskz_mullo_epi16">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512BW</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="a" type="__m256i"/>
+	<parameter varname="b" type="__m256i"/>
+	<description>Multiply the packed 16-bit integers in "a" and "b", producing intermediate 32-bit integers, and store the low 16 bits of the intermediate integers in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 15
+	i := j*16
+	IF k[j]
+		tmp[31:0] := a[i+15:i] * b[i+15:i]
+		dst[i+15:i] := tmp[15:0]
+	ELSE
+		dst[i+15:i] := 0
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vpmullw"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512i" name="_mm512_mask_mullo_epi16">
+	<type>Integer</type>
+	<CPUID>AVX512BW</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="src" type="__m512i"/>
+	<parameter varname="k" type="__mmask32"/>
+	<parameter varname="a" type="__m512i"/>
+	<parameter varname="b" type="__m512i"/>
+	<description>Multiply the packed 16-bit integers in "a" and "b", producing intermediate 32-bit integers, and store the low 16 bits of the intermediate integers in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 31
+	i := j*16
+	IF k[j]
+		tmp[31:0] := a[i+15:i] * b[i+15:i]
+		dst[i+15:i] := tmp[15:0]
+	ELSE
+		dst[i+15:i] := src[i+15:i]
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name="vpmullw"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512i" name="_mm512_maskz_mullo_epi16">
+	<type>Integer</type>
+	<CPUID>AVX512BW</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="k" type="__mmask32"/>
+	<parameter varname="a" type="__m512i"/>
+	<parameter varname="b" type="__m512i"/>
+	<description>Multiply the packed 16-bit integers in "a" and "b", producing intermediate 32-bit integers, and store the low 16 bits of the intermediate integers in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 31
+	i := j*16
+	IF k[j]
+		tmp[31:0] := a[i+15:i] * b[i+15:i]
+		dst[i+15:i] := tmp[15:0]
+	ELSE
+		dst[i+15:i] := 0
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name="vpmullw"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512i" name="_mm512_mullo_epi16">
+	<type>Integer</type>
+	<CPUID>AVX512BW</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="a" type="__m512i"/>
+	<parameter varname="b" type="__m512i"/>
+	<description>Multiply the packed 16-bit integers in "a" and "b", producing intermediate 32-bit integers, and store the low 16 bits of the intermediate integers in "dst". </description>
+	<operation>
+FOR j := 0 to 31
+	i := j*16
+	tmp[31:0] := a[i+15:i] * b[i+15:i]
+	dst[i+15:i] := tmp[15:0]
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name="vpmullw"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm_mask_mullo_epi16">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512BW</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="src" type="__m128i"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128i"/>
+	<parameter varname="b" type="__m128i"/>
+	<description>Multiply the packed 16-bit integers in "a" and "b", producing intermediate 32-bit integers, and store the low 16 bits of the intermediate integers in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 7
+	i := j*16
+	IF k[j]
+		tmp[31:0] := a[i+15:i] * b[i+15:i]
+		dst[i+15:i] := tmp[15:0]
+	ELSE
+		dst[i+15:i] := src[i+15:i]
+	FI
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vpmullw"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm_maskz_mullo_epi16">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512BW</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128i"/>
+	<parameter varname="b" type="__m128i"/>
+	<description>Multiply the packed 16-bit integers in "a" and "b", producing intermediate 32-bit integers, and store the low 16 bits of the intermediate integers in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 7
+	i := j*16
+	IF k[j]
+		tmp[31:0] := a[i+15:i] * b[i+15:i]
+		dst[i+15:i] := tmp[15:0]
+	ELSE
+		dst[i+15:i] := 0
+	FI
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vpmullw"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256i" name="_mm256_mask_mul_epu32">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="src" type="__m256i"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m256i"/>
+	<parameter varname="b" type="__m256i"/>
+	<description>Multiply the low unsigned 32-bit integers from each packed 64-bit element in "a" and "b", and store the unsigned 64-bit results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 3
+	i := j*64
+	IF k[j]
+		dst[i+63:i] := a[i+31:i] * b[i+31:i]
+	ELSE
+		dst[i+63:i] := src[i+63:i]
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vpmuludq"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256i" name="_mm256_maskz_mul_epu32">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m256i"/>
+	<parameter varname="b" type="__m256i"/>
+	<description>Multiply the low unsigned 32-bit integers from each packed 64-bit element in "a" and "b", and store the unsigned 64-bit results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 3
+	i := j*64
+	IF k[j]
+		dst[i+63:i] := a[i+31:i] * b[i+31:i]
+	ELSE
+		dst[i+63:i] := 0
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vpmuludq"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm_mask_mul_epu32">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="src" type="__m128i"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128i"/>
+	<parameter varname="b" type="__m128i"/>
+	<description>Multiply the low unsigned 32-bit integers from each packed 64-bit element in "a" and "b", and store the unsigned 64-bit results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 1
+	i := j*64
+	IF k[j]
+		dst[i+63:i] := a[i+31:i] * b[i+31:i]
+	ELSE
+		dst[i+63:i] := src[i+63:i]
+	FI
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vpmuludq"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm_maskz_mul_epu32">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128i"/>
+	<parameter varname="b" type="__m128i"/>
+	<description>Multiply the low unsigned 32-bit integers from each packed 64-bit element in "a" and "b", and store the unsigned 64-bit results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 1
+	i := j*64
+	IF k[j]
+		dst[i+63:i] := a[i+31:i] * b[i+31:i]
+	ELSE
+		dst[i+63:i] := 0
+	FI
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vpmuludq"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256i" name="_mm256_mask_or_epi32">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Logical</category>
+	<parameter varname="src" type="__m256i"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m256i"/>
+	<parameter varname="b" type="__m256i"/>
+	<description>Compute the bitwise OR of packed 32-bit integers in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 7
+	i := j*32
+	IF k[j]
+		dst[i+31:i] := a[i+31:i] OR b[i+31:i]
+	ELSE
+		dst[i+31:i] := src[i+31:i]
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vpord"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256i" name="_mm256_maskz_or_epi32">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Logical</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m256i"/>
+	<parameter varname="b" type="__m256i"/>
+	<description>Compute the bitwise OR of packed 32-bit integers in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).
+	</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*32
+	IF k[j]
+		dst[i+31:i] := a[i+31:i] OR b[i+31:i]
+	ELSE
+		dst[i+31:i] := 0
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vpord"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm_mask_or_epi32">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Logical</category>
+	<parameter varname="src" type="__m128i"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128i"/>
+	<parameter varname="b" type="__m128i"/>
+	<description>Compute the bitwise OR of packed 32-bit integers in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 3
+	i := j*32
+	IF k[j]
+		dst[i+31:i] := a[i+31:i] OR b[i+31:i]
+	ELSE
+		dst[i+31:i] := src[i+31:i]
+	FI
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vpord"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm_maskz_or_epi32">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Logical</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128i"/>
+	<parameter varname="b" type="__m128i"/>
+	<description>Compute the bitwise OR of packed 32-bit integers in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).
+	</description>
+	<operation>
+FOR j := 0 to 3
+	i := j*32
+	IF k[j]
+		dst[i+31:i] := a[i+31:i] OR b[i+31:i]
+	ELSE
+		dst[i+31:i] := 0
+	FI
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vpord"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256i" name="_mm256_mask_or_epi64">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Logical</category>
+	<parameter varname="src" type="__m256i"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m256i"/>
+	<parameter varname="b" type="__m256i"/>
+	<description>Compute the bitwise OR of packed 64-bit integers in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 3
+	i := j*64
+	IF k[j]
+		dst[i+63:i] := a[i+63:i] OR b[i+63:i]
+	ELSE
+		dst[i+63:i] := src[i+63:i]
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vporq"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256i" name="_mm256_maskz_or_epi64">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Logical</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m256i"/>
+	<parameter varname="b" type="__m256i"/>
+	<description>Compute the bitwise OR of packed 64-bit integers in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).
+	</description>
+	<operation>
+FOR j := 0 to 3
+	i := j*64
+	IF k[j]
+		dst[i+63:i] := a[i+63:i] OR b[i+63:i]
+	ELSE
+		dst[i+63:i] := 0
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vporq"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm_mask_or_epi64">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Logical</category>
+	<parameter varname="src" type="__m128i"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128i"/>
+	<parameter varname="b" type="__m128i"/>
+	<description>Compute the bitwise OR of packed 64-bit integers in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 1
+	i := j*64
+	IF k[j]
+		dst[i+63:i] := a[i+63:i] OR b[i+63:i]
+	ELSE
+		dst[i+63:i] := src[i+63:i]
+	FI
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vporq"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm_maskz_or_epi64">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Logical</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128i"/>
+	<parameter varname="b" type="__m128i"/>
+	<description>Compute the bitwise OR of packed 64-bit integers in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).
+	</description>
+	<operation>
+FOR j := 0 to 1
+	i := j*64
+	IF k[j]
+		dst[i+63:i] := a[i+63:i] OR b[i+63:i]
+	ELSE
+		dst[i+63:i] := 0
+	FI
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vporq"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256i" name="_mm256_mask_rol_epi32">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Shift</category>
+	<parameter varname="src" type="__m256i"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m256i"/>
+	<parameter varname="imm8" type="const int"/>
+	<description>Rotate the bits in each packed 32-bit integer in "a" to the left by the number of bits specified in "imm8", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). </description>
+	<operation>
+LEFT_ROTATE_DWORDS(src, count_src){
+	count := count_src modulo 32
+	RETURN (src &lt;&lt; count) OR (src &gt;&gt; (32 - count))
+}
+FOR j := 0 to 7
+	i := j*32
+	IF k[j]
+		dst[i+31:i] := LEFT_ROTATE_DWORDS(a[i+31:i], imm8[7:0])
+	ELSE
+		dst[i+31:i] := src[i+31:i]
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vprold"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256i" name="_mm256_maskz_rol_epi32">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Shift</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m256i"/>
+	<parameter varname="imm8" type="const int"/>
+	<description>Rotate the bits in each packed 32-bit integer in "a" to the left by the number of bits specified in "imm8", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). </description>
+	<operation>
+LEFT_ROTATE_DWORDS(src, count_src){
+	count := count_src modulo 32
+	RETURN (src &lt;&lt; count) OR (src &gt;&gt; (32 - count))
+}
+FOR j := 0 to 7
+	i := j*32
+	IF k[j]
+		dst[i+31:i] := LEFT_ROTATE_DWORDS(a[i+31:i], imm8[7:0])
+	ELSE
+		dst[i+31:i] := 0
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vprold"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256i" name="_mm256_rol_epi32">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Shift</category>
+	<parameter varname="a" type="__m256i"/>
+	<parameter varname="imm8" type="const int"/>
+	<description>Rotate the bits in each packed 32-bit integer in "a" to the left by the number of bits specified in "imm8", and store the results in "dst". </description>
+	<operation>
+LEFT_ROTATE_DWORDS(src, count_src){
+	count := count_src modulo 32
+	RETURN (src &lt;&lt; count) OR (src &gt;&gt; (32 - count))
+}
+FOR j := 0 to 7
+	i := j*32
+	dst[i+31:i] := LEFT_ROTATE_DWORDS(a[i+31:i], imm8[7:0])
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vprold"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm_mask_rol_epi32">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Shift</category>
+	<parameter varname="src" type="__m128i"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128i"/>
+	<parameter varname="imm8" type="const int"/>
+	<description>Rotate the bits in each packed 32-bit integer in "a" to the left by the number of bits specified in "imm8", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). </description>
+	<operation>
+LEFT_ROTATE_DWORDS(src, count_src){
+	count := count_src modulo 32
+	RETURN (src &lt;&lt; count) OR (src &gt;&gt; (32 - count))
+}
+FOR j := 0 to 3
+	i := j*32
+	IF k[j]
+		dst[i+31:i] := LEFT_ROTATE_DWORDS(a[i+31:i], imm8[7:0])
+	ELSE
+		dst[i+31:i] := src[i+31:i]
+	FI
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vprold"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm_maskz_rol_epi32">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Shift</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128i"/>
+	<parameter varname="imm8" type="const int"/>
+	<description>Rotate the bits in each packed 32-bit integer in "a" to the left by the number of bits specified in "imm8", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). </description>
+	<operation>
+LEFT_ROTATE_DWORDS(src, count_src){
+	count := count_src modulo 32
+	RETURN (src &lt;&lt; count) OR (src &gt;&gt; (32 - count))
+}
+FOR j := 0 to 3
+	i := j*32
+	IF k[j]
+		dst[i+31:i] := LEFT_ROTATE_DWORDS(a[i+31:i], imm8[7:0])
+	ELSE
+		dst[i+31:i] := 0
+	FI
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vprold"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm_rol_epi32">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Shift</category>
+	<parameter varname="a" type="__m128i"/>
+	<parameter varname="imm8" type="const int"/>
+	<description>Rotate the bits in each packed 32-bit integer in "a" to the left by the number of bits specified in "imm8", and store the results in "dst". </description>
+	<operation>
+LEFT_ROTATE_DWORDS(src, count_src){
+	count := count_src modulo 32
+	RETURN (src &lt;&lt; count) OR (src &gt;&gt; (32 - count))
+}
+FOR j := 0 to 3
+	i := j*32
+	dst[i+31:i] := LEFT_ROTATE_DWORDS(a[i+31:i], imm8[7:0])
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vprold"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256i" name="_mm256_mask_rol_epi64">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Shift</category>
+	<parameter varname="src" type="__m256i"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m256i"/>
+	<parameter varname="imm8" type="const int"/>
+	<description>Rotate the bits in each packed 64-bit integer in "a" to the left by the number of bits specified in "imm8", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). </description>
+	<operation>
+LEFT_ROTATE_QWORDS(src, count_src){
+	count := count_src modulo 64
+	RETURN (src &lt;&lt; count) OR (src &gt;&gt; (64 - count))
+}
+FOR j := 0 to 3
+	i := j*64
+	IF k[j]
+		dst[i+63:i] := LEFT_ROTATE_QWORDS(a[i+63:i], imm8[7:0])
+	ELSE
+		dst[i+63:i] := src[i+63:i]
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vprolq"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256i" name="_mm256_maskz_rol_epi64">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Shift</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m256i"/>
+	<parameter varname="imm8" type="const int"/>
+	<description>Rotate the bits in each packed 64-bit integer in "a" to the left by the number of bits specified in "imm8", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). </description>
+	<operation>
+LEFT_ROTATE_QWORDS(src, count_src){
+	count := count_src modulo 64
+	RETURN (src &lt;&lt; count) OR (src &gt;&gt; (64 - count))
+}
+FOR j := 0 to 3
+	i := j*64
+	IF k[j]
+		dst[i+63:i] := LEFT_ROTATE_QWORDS(a[i+63:i], imm8[7:0])
+	ELSE
+		dst[i+63:i] := 0
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vprolq"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256i" name="_mm256_rol_epi64">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Shift</category>
+	<parameter varname="a" type="__m256i"/>
+	<parameter varname="imm8" type="const int"/>
+	<description>Rotate the bits in each packed 64-bit integer in "a" to the left by the number of bits specified in "imm8", and store the results in "dst". </description>
+	<operation>
+LEFT_ROTATE_QWORDS(src, count_src){
+	count := count_src modulo 64
+	RETURN (src &lt;&lt; count) OR (src &gt;&gt; (64 - count))
+}
+FOR j := 0 to 3
+	i := j*64
+	dst[i+63:i] := LEFT_ROTATE_QWORDS(a[i+63:i], imm8[7:0])
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vprolq"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm_mask_rol_epi64">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Shift</category>
+	<parameter varname="src" type="__m128i"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128i"/>
+	<parameter varname="imm8" type="const int"/>
+	<description>Rotate the bits in each packed 64-bit integer in "a" to the left by the number of bits specified in "imm8", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). </description>
+	<operation>
+LEFT_ROTATE_QWORDS(src, count_src){
+	count := count_src modulo 64
+	RETURN (src &lt;&lt; count) OR (src &gt;&gt; (64 - count))
+}
+FOR j := 0 to 1
+	i := j*64
+	IF k[j]
+		dst[i+63:i] := LEFT_ROTATE_QWORDS(a[i+63:i], imm8[7:0])
+	ELSE
+		dst[i+63:i] := src[i+63:i]
+	FI
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vprolq"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm_maskz_rol_epi64">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Shift</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128i"/>
+	<parameter varname="imm8" type="const int"/>
+	<description>Rotate the bits in each packed 64-bit integer in "a" to the left by the number of bits specified in "imm8", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). </description>
+	<operation>
+LEFT_ROTATE_QWORDS(src, count_src){
+	count := count_src modulo 64
+	RETURN (src &lt;&lt; count) OR (src &gt;&gt; (64 - count))
+}
+FOR j := 0 to 1
+	i := j*64
+	IF k[j]
+		dst[i+63:i] := LEFT_ROTATE_QWORDS(a[i+63:i], imm8[7:0])
+	ELSE
+		dst[i+63:i] := 0
+	FI
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vprolq"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm_rol_epi64">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Shift</category>
+	<parameter varname="a" type="__m128i"/>
+	<parameter varname="imm8" type="const int"/>
+	<description>Rotate the bits in each packed 64-bit integer in "a" to the left by the number of bits specified in "imm8", and store the results in "dst". </description>
+	<operation>
+LEFT_ROTATE_QWORDS(src, count_src){
+	count := count_src modulo 64
+	RETURN (src &lt;&lt; count) OR (src &gt;&gt; (64 - count))
+}
+FOR j := 0 to 1
+	i := j*64
+	dst[i+63:i] := LEFT_ROTATE_QWORDS(a[i+63:i], imm8[7:0])
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vprolq"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256i" name="_mm256_mask_rolv_epi32">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Shift</category>
+	<parameter varname="src" type="__m256i"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m256i"/>
+	<parameter varname="b" type="__m256i"/>
+	<description>Rotate the bits in each packed 32-bit integer in "a" to the left by the number of bits specified in the corresponding element of "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). </description>
+	<operation>
+LEFT_ROTATE_DWORDS(src, count_src){
+	count := count_src modulo 32
+	RETURN (src &lt;&lt; count) OR (src &gt;&gt; (32 - count))
+}
+FOR j := 0 to 7
+	i := j*32
+	IF k[j]
+		dst[i+31:i] := LEFT_ROTATE_DWORDS(a[i+31:i], b[i+31:i])
+	ELSE
+		dst[i+31:i] := src[i+31:i]
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vprolvd"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256i" name="_mm256_maskz_rolv_epi32">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Shift</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m256i"/>
+	<parameter varname="b" type="__m256i"/>
+	<description>Rotate the bits in each packed 32-bit integer in "a" to the left by the number of bits specified in the corresponding element of "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). </description>
+	<operation>
+LEFT_ROTATE_DWORDS(src, count_src){
+	count := count_src modulo 32
+	RETURN (src &lt;&lt; count) OR (src &gt;&gt; (32 - count))
+}
+FOR j := 0 to 7
+	i := j*32
+	IF k[j]
+		dst[i+31:i] := LEFT_ROTATE_DWORDS(a[i+31:i], b[i+31:i])
+	ELSE
+		dst[i+31:i] := 0
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vprolvd"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256i" name="_mm256_rolv_epi32">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Shift</category>
+	<parameter varname="a" type="__m256i"/>
+	<parameter varname="b" type="__m256i"/>
+	<description>Rotate the bits in each packed 32-bit integer in "a" to the left by the number of bits specified in the corresponding element of "b", and store the results in "dst". </description>
+	<operation>
+LEFT_ROTATE_DWORDS(src, count_src){
+	count := count_src modulo 32
+	RETURN (src &lt;&lt; count) OR (src &gt;&gt; (32 - count))
+}
+FOR j := 0 to 7
+	i := j*32
+	dst[i+31:i] := LEFT_ROTATE_DWORDS(a[i+31:i], b[i+31:i])
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vprolvd"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm_mask_rolv_epi32">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Shift</category>
+	<parameter varname="src" type="__m128i"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128i"/>
+	<parameter varname="b" type="__m128i"/>
+	<description>Rotate the bits in each packed 32-bit integer in "a" to the left by the number of bits specified in the corresponding element of "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). </description>
+	<operation>
+LEFT_ROTATE_DWORDS(src, count_src){
+	count := count_src modulo 32
+	RETURN (src &lt;&lt; count) OR (src &gt;&gt; (32 - count))
+}
+FOR j := 0 to 3
+	i := j*32
+	IF k[j]
+		dst[i+31:i] := LEFT_ROTATE_DWORDS(a[i+31:i], b[i+31:i])
+	ELSE
+		dst[i+31:i] := src[i+31:i]
+	FI
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vprolvd"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm_maskz_rolv_epi32">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Shift</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128i"/>
+	<parameter varname="b" type="__m128i"/>
+	<description>Rotate the bits in each packed 32-bit integer in "a" to the left by the number of bits specified in the corresponding element of "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). </description>
+	<operation>
+LEFT_ROTATE_DWORDS(src, count_src){
+	count := count_src modulo 32
+	RETURN (src &lt;&lt; count) OR (src &gt;&gt; (32 - count))
+}
+FOR j := 0 to 3
+	i := j*32
+	IF k[j]
+		dst[i+31:i] := LEFT_ROTATE_DWORDS(a[i+31:i], b[i+31:i])
+	ELSE
+		dst[i+31:i] := 0
+	FI
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vprolvd"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm_rolv_epi32">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Shift</category>
+	<parameter varname="a" type="__m128i"/>
+	<parameter varname="b" type="__m128i"/>
+	<description>Rotate the bits in each packed 32-bit integer in "a" to the left by the number of bits specified in the corresponding element of "b", and store the results in "dst". </description>
+	<operation>
+LEFT_ROTATE_DWORDS(src, count_src){
+	count := count_src modulo 32
+	RETURN (src &lt;&lt; count) OR (src &gt;&gt; (32 - count))
+}
+FOR j := 0 to 3
+	i := j*32
+	dst[i+31:i] := LEFT_ROTATE_DWORDS(a[i+31:i], b[i+31:i])
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vprolvd"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256i" name="_mm256_mask_rolv_epi64">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Shift</category>
+	<parameter varname="src" type="__m256i"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m256i"/>
+	<parameter varname="b" type="__m256i"/>
+	<description>Rotate the bits in each packed 64-bit integer in "a" to the left by the number of bits specified in the corresponding element of "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). </description>
+	<operation>
+LEFT_ROTATE_QWORDS(src, count_src){
+	count := count_src modulo 64
+	RETURN (src &lt;&lt; count) OR (src &gt;&gt; (64 - count))
+}
+FOR j := 0 to 3
+	i := j*64
+	IF k[j]
+		dst[i+63:i] := LEFT_ROTATE_QWORDS(a[i+63:i], b[i+63:i])
+	ELSE
+		dst[i+63:i] := src[i+63:i]
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vprolvq"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256i" name="_mm256_maskz_rolv_epi64">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Shift</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m256i"/>
+	<parameter varname="b" type="__m256i"/>
+	<description>Rotate the bits in each packed 64-bit integer in "a" to the left by the number of bits specified in the corresponding element of "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). </description>
+	<operation>
+LEFT_ROTATE_QWORDS(src, count_src){
+	count := count_src modulo 64
+	RETURN (src &lt;&lt; count) OR (src &gt;&gt; (64 - count))
+}
+FOR j := 0 to 3
+	i := j*64
+	IF k[j]
+		dst[i+63:i] := LEFT_ROTATE_QWORDS(a[i+63:i], b[i+63:i])
+	ELSE
+		dst[i+63:i] := 0
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vprolvq"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256i" name="_mm256_rolv_epi64">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Shift</category>
+	<parameter varname="a" type="__m256i"/>
+	<parameter varname="b" type="__m256i"/>
+	<description>Rotate the bits in each packed 64-bit integer in "a" to the left by the number of bits specified in the corresponding element of "b", and store the results in "dst". </description>
+	<operation>
+LEFT_ROTATE_QWORDS(src, count_src){
+	count := count_src modulo 64
+	RETURN (src &lt;&lt; count) OR (src &gt;&gt; (64 - count))
+}
+FOR j := 0 to 3
+	i := j*64
+	dst[i+63:i] := LEFT_ROTATE_QWORDS(a[i+63:i], b[i+63:i])
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vprolvq"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm_mask_rolv_epi64">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Shift</category>
+	<parameter varname="src" type="__m128i"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128i"/>
+	<parameter varname="b" type="__m128i"/>
+	<description>Rotate the bits in each packed 64-bit integer in "a" to the left by the number of bits specified in the corresponding element of "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). </description>
+	<operation>
+LEFT_ROTATE_QWORDS(src, count_src){
+	count := count_src modulo 64
+	RETURN (src &lt;&lt; count) OR (src &gt;&gt; (64 - count))
+}
+FOR j := 0 to 1
+	i := j*64
+	IF k[j]
+		dst[i+63:i] := LEFT_ROTATE_QWORDS(a[i+63:i], b[i+63:i])
+	ELSE
+		dst[i+63:i] := src[i+63:i]
+	FI
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vprolvq"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm_maskz_rolv_epi64">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Shift</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128i"/>
+	<parameter varname="b" type="__m128i"/>
+	<description>Rotate the bits in each packed 64-bit integer in "a" to the left by the number of bits specified in the corresponding element of "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). </description>
+	<operation>
+LEFT_ROTATE_QWORDS(src, count_src){
+	count := count_src modulo 64
+	RETURN (src &lt;&lt; count) OR (src &gt;&gt; (64 - count))
+}
+FOR j := 0 to 1
+	i := j*64
+	IF k[j]
+		dst[i+63:i] := LEFT_ROTATE_QWORDS(a[i+63:i], b[i+63:i])
+	ELSE
+		dst[i+63:i] := 0
+	FI
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vprolvq"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm_rolv_epi64">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Shift</category>
+	<parameter varname="a" type="__m128i"/>
+	<parameter varname="b" type="__m128i"/>
+	<description>Rotate the bits in each packed 64-bit integer in "a" to the left by the number of bits specified in the corresponding element of "b", and store the results in "dst". </description>
+	<operation>
+LEFT_ROTATE_QWORDS(src, count_src){
+	count := count_src modulo 64
+	RETURN (src &lt;&lt; count) OR (src &gt;&gt; (64 - count))
+}
+FOR j := 0 to 1
+	i := j*64
+	dst[i+63:i] := LEFT_ROTATE_QWORDS(a[i+63:i], b[i+63:i])
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vprolvq"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256i" name="_mm256_mask_ror_epi32">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Shift</category>
+	<parameter varname="src" type="__m256i"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m256i"/>
+	<parameter varname="imm8" type="const int"/>
+	<description>Rotate the bits in each packed 32-bit integer in "a" to the right by the number of bits specified in "imm8", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). </description>
+	<operation>
+RIGHT_ROTATE_DWORDS(src, count_src){
+	count := count_src modulo 32
+	RETURN (src &gt;&gt;count) OR (src &lt;&lt; (32 - count))
+}
+FOR j := 0 to 7
+	i := j*32
+	IF k[j]
+		dst[i+31:i] := RIGHT_ROTATE_DWORDS(a[i+31:i], imm8[7:0])
+	ELSE
+		dst[i+31:i] := src[i+31:i]
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vprord"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256i" name="_mm256_maskz_ror_epi32">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Shift</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m256i"/>
+	<parameter varname="imm8" type="const int"/>
+	<description>Rotate the bits in each packed 32-bit integer in "a" to the right by the number of bits specified in "imm8", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). </description>
+	<operation>
+RIGHT_ROTATE_DWORDS(src, count_src){
+	count := count_src modulo 32
+	RETURN (src &gt;&gt;count) OR (src &lt;&lt; (32 - count))
+}
+FOR j := 0 to 7
+	i := j*32
+	IF k[j]
+		dst[i+31:i] := RIGHT_ROTATE_DWORDS(a[i+31:i], imm8[7:0])
+	ELSE
+		dst[i+31:i] := 0
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vprord"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256i" name="_mm256_ror_epi32">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Shift</category>
+	<parameter varname="a" type="__m256i"/>
+	<parameter varname="imm8" type="const int"/>
+	<description>Rotate the bits in each packed 32-bit integer in "a" to the right by the number of bits specified in "imm8", and store the results in "dst". </description>
+	<operation>
+RIGHT_ROTATE_DWORDS(src, count_src){
+	count := count_src modulo 32
+	RETURN (src &gt;&gt;count) OR (src &lt;&lt; (32 - count))
+}
+FOR j := 0 to 7
+	i := j*32
+	dst[i+31:i] := RIGHT_ROTATE_DWORDS(a[i+31:i], imm8[7:0])
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vprord"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm_mask_ror_epi32">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Shift</category>
+	<parameter varname="src" type="__m128i"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128i"/>
+	<parameter varname="imm8" type="const int"/>
+	<description>Rotate the bits in each packed 32-bit integer in "a" to the right by the number of bits specified in "imm8", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). </description>
+	<operation>
+RIGHT_ROTATE_DWORDS(src, count_src){
+	count := count_src modulo 32
+	RETURN (src &gt;&gt;count) OR (src &lt;&lt; (32 - count))
+}
+FOR j := 0 to 3
+	i := j*32
+	IF k[j]
+		dst[i+31:i] := RIGHT_ROTATE_DWORDS(a[i+31:i], imm8[7:0])
+	ELSE
+		dst[i+31:i] := src[i+31:i]
+	FI
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vprord"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm_maskz_ror_epi32">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Shift</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128i"/>
+	<parameter varname="imm8" type="const int"/>
+	<description>Rotate the bits in each packed 32-bit integer in "a" to the right by the number of bits specified in "imm8", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). </description>
+	<operation>
+RIGHT_ROTATE_DWORDS(src, count_src){
+	count := count_src modulo 32
+	RETURN (src &gt;&gt;count) OR (src &lt;&lt; (32 - count))
+}
+FOR j := 0 to 3
+	i := j*32
+	IF k[j]
+		dst[i+31:i] := RIGHT_ROTATE_DWORDS(a[i+31:i], imm8[7:0])
+	ELSE
+		dst[i+31:i] := 0
+	FI
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vprord"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm_ror_epi32">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Shift</category>
+	<parameter varname="a" type="__m128i"/>
+	<parameter varname="imm8" type="const int"/>
+	<description>Rotate the bits in each packed 32-bit integer in "a" to the right by the number of bits specified in "imm8", and store the results in "dst". </description>
+	<operation>
+RIGHT_ROTATE_DWORDS(src, count_src){
+	count := count_src modulo 32
+	RETURN (src &gt;&gt;count) OR (src &lt;&lt; (32 - count))
+}
+FOR j := 0 to 3
+	i := j*32
+	dst[i+31:i] := RIGHT_ROTATE_DWORDS(a[i+31:i], imm8[7:0])
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vprord"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256i" name="_mm256_mask_ror_epi64">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Shift</category>
+	<parameter varname="src" type="__m256i"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m256i"/>
+	<parameter varname="imm8" type="const int"/>
+	<description>Rotate the bits in each packed 64-bit integer in "a" to the right by the number of bits specified in "imm8", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). </description>
+	<operation>
+RIGHT_ROTATE_QWORDS(src, count_src){
+	count := count_src modulo 64
+	RETURN (src &gt;&gt; count) OR (src &lt;&lt; (64 - count))
+}
+FOR j := 0 to 3
+	i := j*64
+	IF k[j]
+		dst[i+63:i] := RIGHT_ROTATE_QWORDS(a[i+63:i], imm8[7:0])
+	ELSE
+		dst[i+63:i] := src[i+63:i]
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vprorq"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256i" name="_mm256_maskz_ror_epi64">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Shift</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m256i"/>
+	<parameter varname="imm8" type="const int"/>
+	<description>Rotate the bits in each packed 64-bit integer in "a" to the right by the number of bits specified in "imm8", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). </description>
+	<operation>
+RIGHT_ROTATE_QWORDS(src, count_src){
+	count := count_src modulo 64
+	RETURN (src &gt;&gt; count) OR (src &lt;&lt; (64 - count))
+}
+FOR j := 0 to 3
+	i := j*64
+	IF k[j]
+		dst[i+63:i] := RIGHT_ROTATE_QWORDS(a[i+63:i], imm8[7:0])
+	ELSE
+		dst[i+63:i] := 0
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vprorq"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256i" name="_mm256_ror_epi64">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Shift</category>
+	<parameter varname="a" type="__m256i"/>
+	<parameter varname="imm8" type="const int"/>
+	<description>Rotate the bits in each packed 64-bit integer in "a" to the right by the number of bits specified in "imm8", and store the results in "dst". </description>
+	<operation>
+RIGHT_ROTATE_QWORDS(src, count_src){
+	count := count_src modulo 64
+	RETURN (src &gt;&gt; count) OR (src &lt;&lt; (64 - count))
+}
+FOR j := 0 to 3
+	i := j*64
+	dst[i+63:i] := RIGHT_ROTATE_QWORDS(a[i+63:i], imm8[7:0])
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vprorq"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm_mask_ror_epi64">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Shift</category>
+	<parameter varname="src" type="__m128i"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128i"/>
+	<parameter varname="imm8" type="const int"/>
+	<description>Rotate the bits in each packed 64-bit integer in "a" to the right by the number of bits specified in "imm8", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). </description>
+	<operation>
+RIGHT_ROTATE_QWORDS(src, count_src){
+	count := count_src modulo 64
+	RETURN (src &gt;&gt; count) OR (src &lt;&lt; (64 - count))
+}
+FOR j := 0 to 1
+	i := j*64
+	IF k[j]
+		dst[i+63:i] := RIGHT_ROTATE_QWORDS(a[i+63:i], imm8[7:0])
+	ELSE
+		dst[i+63:i] := src[i+63:i]
+	FI
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vprorq"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm_maskz_ror_epi64">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Shift</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128i"/>
+	<parameter varname="imm8" type="const int"/>
+	<description>Rotate the bits in each packed 64-bit integer in "a" to the right by the number of bits specified in "imm8", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). </description>
+	<operation>
+RIGHT_ROTATE_QWORDS(src, count_src){
+	count := count_src modulo 64
+	RETURN (src &gt;&gt; count) OR (src &lt;&lt; (64 - count))
+}
+FOR j := 0 to 1
+	i := j*64
+	IF k[j]
+		dst[i+63:i] := RIGHT_ROTATE_QWORDS(a[i+63:i], imm8[7:0])
+	ELSE
+		dst[i+63:i] := 0
+	FI
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vprorq"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm_ror_epi64">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Shift</category>
+	<parameter varname="a" type="__m128i"/>
+	<parameter varname="imm8" type="const int"/>
+	<description>Rotate the bits in each packed 64-bit integer in "a" to the right by the number of bits specified in "imm8", and store the results in "dst". </description>
+	<operation>
+RIGHT_ROTATE_QWORDS(src, count_src){
+	count := count_src modulo 64
+	RETURN (src &gt;&gt; count) OR (src &lt;&lt; (64 - count))
+}
+FOR j := 0 to 1
+	i := j*64
+	dst[i+63:i] := RIGHT_ROTATE_QWORDS(a[i+63:i], imm8[7:0])
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vprorq"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256i" name="_mm256_mask_rorv_epi32">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Shift</category>
+	<parameter varname="src" type="__m256i"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m256i"/>
+	<parameter varname="b" type="__m256i"/>
+	<description>Rotate the bits in each packed 32-bit integer in "a" to the right by the number of bits specified in the corresponding element of "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). </description>
+	<operation>
+RIGHT_ROTATE_DWORDS(src, count_src){
+	count := count_src modulo 32
+	RETURN (src &gt;&gt;count) OR (src &lt;&lt; (32 - count))
+}
+FOR j := 0 to 7
+	i := j*32
+	IF k[j]
+		dst[i+31:i] := RIGHT_ROTATE_DWORDS(a[i+31:i], b[i+31:i])
+	ELSE
+		dst[i+31:i] := src[i+31:i]
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vprorvd"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256i" name="_mm256_maskz_rorv_epi32">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Shift</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m256i"/>
+	<parameter varname="b" type="__m256i"/>
+	<description>Rotate the bits in each packed 32-bit integer in "a" to the right by the number of bits specified in the corresponding element of "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). </description>
+	<operation>
+RIGHT_ROTATE_DWORDS(src, count_src){
+	count := count_src modulo 32
+	RETURN (src &gt;&gt;count) OR (src &lt;&lt; (32 - count))
+}
+FOR j := 0 to 7
+	i := j*32
+	IF k[j]
+		dst[i+31:i] := RIGHT_ROTATE_DWORDS(a[i+31:i], b[i+31:i])
+	ELSE
+		dst[i+31:i] := 0
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vprorvd"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256i" name="_mm256_rorv_epi32">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Shift</category>
+	<parameter varname="a" type="__m256i"/>
+	<parameter varname="b" type="__m256i"/>
+	<description>Rotate the bits in each packed 32-bit integer in "a" to the right by the number of bits specified in the corresponding element of "b", and store the results in "dst". </description>
+	<operation>
+RIGHT_ROTATE_DWORDS(src, count_src){
+	count := count_src modulo 32
+	RETURN (src &gt;&gt;count) OR (src &lt;&lt; (32 - count))
+}
+FOR j := 0 to 7
+	i := j*32
+	dst[i+31:i] := RIGHT_ROTATE_DWORDS(a[i+31:i], b[i+31:i])
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vprorvd"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm_mask_rorv_epi32">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Shift</category>
+	<parameter varname="src" type="__m128i"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128i"/>
+	<parameter varname="b" type="__m128i"/>
+	<description>Rotate the bits in each packed 32-bit integer in "a" to the right by the number of bits specified in the corresponding element of "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). </description>
+	<operation>
+RIGHT_ROTATE_DWORDS(src, count_src){
+	count := count_src modulo 32
+	RETURN (src &gt;&gt;count) OR (src &lt;&lt; (32 - count))
+}
+FOR j := 0 to 3
+	i := j*32
+	IF k[j]
+		dst[i+31:i] := RIGHT_ROTATE_DWORDS(a[i+31:i], b[i+31:i])
+	ELSE
+		dst[i+31:i] := src[i+31:i]
+	FI
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vprorvd"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm_maskz_rorv_epi32">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Shift</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128i"/>
+	<parameter varname="b" type="__m128i"/>
+	<description>Rotate the bits in each packed 32-bit integer in "a" to the right by the number of bits specified in the corresponding element of "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). </description>
+	<operation>
+RIGHT_ROTATE_DWORDS(src, count_src){
+	count := count_src modulo 32
+	RETURN (src &gt;&gt;count) OR (src &lt;&lt; (32 - count))
+}
+FOR j := 0 to 3
+	i := j*32
+	IF k[j]
+		dst[i+31:i] := RIGHT_ROTATE_DWORDS(a[i+31:i], b[i+31:i])
+	ELSE
+		dst[i+31:i] := 0
+	FI
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vprorvd"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm_rorv_epi32">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Shift</category>
+	<parameter varname="a" type="__m128i"/>
+	<parameter varname="b" type="__m128i"/>
+	<description>Rotate the bits in each packed 32-bit integer in "a" to the right by the number of bits specified in the corresponding element of "b", and store the results in "dst". </description>
+	<operation>
+RIGHT_ROTATE_DWORDS(src, count_src){
+	count := count_src modulo 32
+	RETURN (src &gt;&gt;count) OR (src &lt;&lt; (32 - count))
+}
+FOR j := 0 to 3
+	i := j*32
+	dst[i+31:i] := RIGHT_ROTATE_DWORDS(a[i+31:i], b[i+31:i])
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vprorvd"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256i" name="_mm256_mask_rorv_epi64">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Shift</category>
+	<parameter varname="src" type="__m256i"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m256i"/>
+	<parameter varname="b" type="__m256i"/>
+	<description>Rotate the bits in each packed 64-bit integer in "a" to the right by the number of bits specified in the corresponding element of "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). </description>
+	<operation>
+RIGHT_ROTATE_QWORDS(src, count_src){
+	count := count_src modulo 64
+	RETURN (src &gt;&gt; count) OR (src &lt;&lt; (64 - count))
+}
+FOR j := 0 to 3
+	i := j*64
+	IF k[j]
+		dst[i+63:i] := RIGHT_ROTATE_QWORDS(a[i+63:i], b[i+63:i])
+	ELSE
+		dst[i+63:i] := src[i+63:i]
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vprorvq"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256i" name="_mm256_maskz_rorv_epi64">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Shift</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m256i"/>
+	<parameter varname="b" type="__m256i"/>
+	<description>Rotate the bits in each packed 64-bit integer in "a" to the right by the number of bits specified in the corresponding element of "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). </description>
+	<operation>
+RIGHT_ROTATE_QWORDS(src, count_src){
+	count := count_src modulo 64
+	RETURN (src &gt;&gt; count) OR (src &lt;&lt; (64 - count))
+}
+FOR j := 0 to 3
+	i := j*64
+	IF k[j]
+		dst[i+63:i] := RIGHT_ROTATE_QWORDS(a[i+63:i], b[i+63:i])
+	ELSE
+		dst[i+63:i] := 0
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vprorvq"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256i" name="_mm256_rorv_epi64">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Shift</category>
+	<parameter varname="a" type="__m256i"/>
+	<parameter varname="b" type="__m256i"/>
+	<description>Rotate the bits in each packed 64-bit integer in "a" to the right by the number of bits specified in the corresponding element of "b", and store the results in "dst". </description>
+	<operation>
+RIGHT_ROTATE_QWORDS(src, count_src){
+	count := count_src modulo 64
+	RETURN (src &gt;&gt; count) OR (src &lt;&lt; (64 - count))
+}
+FOR j := 0 to 3
+	i := j*64
+	dst[i+63:i] := RIGHT_ROTATE_QWORDS(a[i+63:i], b[i+63:i])
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vprorvq"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm_mask_rorv_epi64">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Shift</category>
+	<parameter varname="src" type="__m128i"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128i"/>
+	<parameter varname="b" type="__m128i"/>
+	<description>Rotate the bits in each packed 64-bit integer in "a" to the right by the number of bits specified in the corresponding element of "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). </description>
+	<operation>
+RIGHT_ROTATE_QWORDS(src, count_src){
+	count := count_src modulo 64
+	RETURN (src &gt;&gt; count) OR (src &lt;&lt; (64 - count))
+}
+FOR j := 0 to 1
+	i := j*64
+	IF k[j]
+		dst[i+63:i] := RIGHT_ROTATE_QWORDS(a[i+63:i], b[i+63:i])
+	ELSE
+		dst[i+63:i] := src[i+63:i]
+	FI
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vprorvq"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm_maskz_rorv_epi64">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Shift</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128i"/>
+	<parameter varname="b" type="__m128i"/>
+	<description>Rotate the bits in each packed 64-bit integer in "a" to the right by the number of bits specified in the corresponding element of "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). </description>
+	<operation>
+RIGHT_ROTATE_QWORDS(src, count_src){
+	count := count_src modulo 64
+	RETURN (src &gt;&gt; count) OR (src &lt;&lt; (64 - count))
+}
+FOR j := 0 to 1
+	i := j*64
+	IF k[j]
+		dst[i+63:i] := RIGHT_ROTATE_QWORDS(a[i+63:i], b[i+63:i])
+	ELSE
+		dst[i+63:i] := 0
+	FI
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vprorvq"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm_rorv_epi64">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Shift</category>
+	<parameter varname="a" type="__m128i"/>
+	<parameter varname="b" type="__m128i"/>
+	<description>Rotate the bits in each packed 64-bit integer in "a" to the right by the number of bits specified in the corresponding element of "b", and store the results in "dst". </description>
+	<operation>
+RIGHT_ROTATE_QWORDS(src, count_src){
+	count := count_src modulo 64
+	RETURN (src &gt;&gt; count) OR (src &lt;&lt; (64 - count))
+}
+FOR j := 0 to 1
+	i := j*64
+	dst[i+63:i] := RIGHT_ROTATE_QWORDS(a[i+63:i], b[i+63:i])
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vprorvq"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512i" name="_mm512_sad_epu8">
+	<type>Integer</type>
+	<CPUID>AVX512BW</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="a" type="__m512i"/>
+	<parameter varname="b" type="__m512i"/>
+	<description>Compute the absolute differences of packed unsigned 8-bit integers in "a" and "b", then horizontally sum each consecutive 8 differences to produce eight unsigned 16-bit integers, and pack these unsigned 16-bit integers in the low 16 bits of 64-bit elements in "dst".</description>
+	<operation>
+FOR j := 0 to 63
+	i := j*8
+	tmp[i+7:i] := ABS(a[i+7:i] - b[i+7:i])
+ENDFOR
+FOR j := 0 to 7
+	i := j*64
+	dst[i+15:i] := tmp[i+7:i] + tmp[i+15:i+8] + tmp[i+23:i+16] + tmp[i+31:i+24] + tmp[i+39:i+32] + tmp[i+47:i+40] + tmp[i+55:i+48] + tmp[i+63:i+56]
+	dst[i+63:i+16] := 0
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name="vpsadbw"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="void" name="_mm256_i32scatter_epi32">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Store</category>
+	<parameter varname="base_addr" type="void*"/>
+	<parameter varname="vindex" type="__m256i"/>
+	<parameter varname="a" type="__m256i"/>
+	<parameter varname="scale" type="const int"/>
+	<description>Scatter 32-bit integers from "a" into memory using 32-bit indices. 32-bit elements are stored at addresses starting at "base_addr" and offset by each 32-bit element in "vindex" (each index is scaled by the factor in "scale"). "scale" should be 1, 2, 4 or 8.</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*32
+	MEM[base_addr + SignExtend(vindex[i+31:i])*scale] := a[i+31:i]
+ENDFOR
+	</operation>
+	<instruction name="vpscatterdd"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="void" name="_mm256_mask_i32scatter_epi32">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Store</category>
+	<parameter varname="base_addr" type="void*"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="vindex" type="__m256i"/>
+	<parameter varname="a" type="__m256i"/>
+	<parameter varname="scale" type="const int"/>
+	<description>Scatter 32-bit integers from "a" into memory using 32-bit indices. 32-bit elements are stored at addresses starting at "base_addr" and offset by each 32-bit element in "vindex" (each index is scaled by the factor in "scale") subject to mask "k" (elements are not stored when the corresponding mask bit is not set). "scale" should be 1, 2, 4 or 8.
+	</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*32
+	IF k[j]
+		MEM[base_addr + SignExtend(vindex[i+31:i])*scale] := a[i+31:i]
+		k[j] := 0
+	FI
+ENDFOR
+k[MAX:8] := 0
+	</operation>
+	<instruction name="vpscatterdd"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="void" name="_mm_i32scatter_epi32">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Store</category>
+	<parameter varname="base_addr" type="void*"/>
+	<parameter varname="vindex" type="__m128i"/>
+	<parameter varname="a" type="__m128i"/>
+	<parameter varname="scale" type="const int"/>
+	<description>Scatter 32-bit integers from "a" into memory using 32-bit indices. 32-bit elements are stored at addresses starting at "base_addr" and offset by each 32-bit element in "vindex" (each index is scaled by the factor in "scale"). "scale" should be 1, 2, 4 or 8.</description>
+	<operation>
+FOR j := 0 to 3
+	i := j*32
+	MEM[base_addr + SignExtend(vindex[i+31:i])*scale] := a[i+31:i]
+ENDFOR
+	</operation>
+	<instruction name="vpscatterdd"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="void" name="_mm_mask_i32scatter_epi32">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Store</category>
+	<parameter varname="base_addr" type="void*"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="vindex" type="__m128i"/>
+	<parameter varname="a" type="__m128i"/>
+	<parameter varname="scale" type="const int"/>
+	<description>Scatter 32-bit integers from "a" into memory using 32-bit indices. 32-bit elements are stored at addresses starting at "base_addr" and offset by each 32-bit element in "vindex" (each index is scaled by the factor in "scale") subject to mask "k" (elements are not stored when the corresponding mask bit is not set). "scale" should be 1, 2, 4 or 8.
+	</description>
+	<operation>
+FOR j := 0 to 3
+	i := j*32
+	IF k[j]
+		MEM[base_addr + SignExtend(vindex[i+31:i])*scale] := a[i+31:i]
+		k[j] := 0
+	FI
+ENDFOR
+k[MAX:4] := 0
+	</operation>
+	<instruction name="vpscatterdd"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="void" name="_mm256_i32scatter_epi64">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Store</category>
+	<parameter varname="base_addr" type="void*"/>
+	<parameter varname="vindex" type="__m128i"/>
+	<parameter varname="a" type="__m256i"/>
+	<parameter varname="scale" type="const int"/>
+	<description>Scatter 64-bit integers from "a" into memory using 32-bit indices. 64-bit elements are stored at addresses starting at "base_addr" and offset by each 32-bit element in "vindex" (each index is scaled by the factor in "scale"). "scale" should be 1, 2, 4 or 8.
+	</description>
+	<operation>
+FOR j := 0 to 3
+	i := j*64
+	l := j*32
+	MEM[base_addr + SignExtend(vindex[l+31:l])*scale] := a[i+63:i]
+ENDFOR
+	</operation>
+	<instruction name="vpscatterdq"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="void" name="_mm256_mask_i32scatter_epi64">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Store</category>
+	<parameter varname="base_addr" type="void*"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="vindex" type="__m128i"/>
+	<parameter varname="a" type="__m256i"/>
+	<parameter varname="scale" type="const int"/>
+	<description>Scatter 64-bit integers from "a" into memory using 32-bit indices. 64-bit elements are stored at addresses starting at "base_addr" and offset by each 32-bit element in "vindex" (each index is scaled by the factor in "scale") subject to mask "k" (elements are not stored when the corresponding mask bit is not set). "scale" should be 1, 2, 4 or 8.
+	</description>
+	<operation>
+FOR j := 0 to 3
+	i := j*64
+	l := j*32
+	IF k[j]
+		MEM[base_addr + SignExtend(vindex[l+31:l])*scale] := a[i+63:i]
+		k[j] := 0
+	FI
+ENDFOR
+k[MAX:4] := 0
+	</operation>
+	<instruction name="vpscatterdq"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="void" name="_mm_i32scatter_epi64">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Store</category>
+	<parameter varname="base_addr" type="void*"/>
+	<parameter varname="vindex" type="__m128i"/>
+	<parameter varname="a" type="__m128i"/>
+	<parameter varname="scale" type="const int"/>
+	<description>Scatter 64-bit integers from "a" into memory using 32-bit indices. 64-bit elements are stored at addresses starting at "base_addr" and offset by each 32-bit element in "vindex" (each index is scaled by the factor in "scale"). "scale" should be 1, 2, 4 or 8.
+	</description>
+	<operation>
+FOR j := 0 to 1
+	i := j*64
+	l := j*32
+	MEM[base_addr + SignExtend(vindex[l+31:l])*scale] := a[i+63:i]
+ENDFOR
+	</operation>
+	<instruction name="vpscatterdq"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="void" name="_mm_mask_i32scatter_epi64">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Store</category>
+	<parameter varname="base_addr" type="void*"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="vindex" type="__m128i"/>
+	<parameter varname="a" type="__m128i"/>
+	<parameter varname="scale" type="const int"/>
+	<description>Scatter 64-bit integers from "a" into memory using 32-bit indices. 64-bit elements are stored at addresses starting at "base_addr" and offset by each 32-bit element in "vindex" (each index is scaled by the factor in "scale") subject to mask "k" (elements are not stored when the corresponding mask bit is not set). "scale" should be 1, 2, 4 or 8.
+	</description>
+	<operation>
+FOR j := 0 to 1
+	i := j*64
+	l := j*32
+	IF k[j]
+		MEM[base_addr + SignExtend(vindex[l+31:l])*scale] := a[i+63:i]
+		k[j] := 0
+	FI
+ENDFOR
+k[MAX:2] := 0
+	</operation>
+	<instruction name="vpscatterdq"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="void" name="_mm256_i64scatter_epi32">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Store</category>
+	<parameter varname="base_addr" type="void*"/>
+	<parameter varname="vindex" type="__m256i"/>
+	<parameter varname="a" type="__m128i"/>
+	<parameter varname="scale" type="const int"/>
+	<description>Scatter 32-bit integers from "a" into memory using 64-bit indices. 32-bit elements are stored at addresses starting at "base_addr" and offset by each 64-bit element in "vindex" (each index is scaled by the factor in "scale"). "scale" should be 1, 2, 4 or 8.
+	</description>
+	<operation>
+FOR j := 0 to 3
+	i := j*32
+	l := j*64
+	MEM[base_addr + SignExtend(vindex[l+63:l])*scale] := a[i+31:i]
+ENDFOR
+	</operation>
+	<instruction name="vpscatterqd"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="void" name="_mm256_mask_i64scatter_epi32">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Store</category>
+	<parameter varname="base_addr" type="void*"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="vindex" type="__m256i"/>
+	<parameter varname="a" type="__m128i"/>
+	<parameter varname="scale" type="const int"/>
+	<description>Scatter 32-bit integers from "a" into memory using 64-bit indices. 32-bit elements are stored at addresses starting at "base_addr" and offset by each 64-bit element in "vindex" (each index is scaled by the factor in "scale") subject to mask "k" (elements are not stored when the corresponding mask bit is not set). "scale" should be 1, 2, 4 or 8.
+	</description>
+	<operation>
+FOR j := 0 to 3
+	i := j*32
+	l := j*64
+	IF k[j]
+		MEM[base_addr + SignExtend(vindex[l+63:l])*scale] := a[i+31:i]
+		k[j] := 0
+	FI
+ENDFOR
+k[MAX:4] := 0
+	</operation>
+	<instruction name="vpscatterqd"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="void" name="_mm_i64scatter_epi32">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Store</category>
+	<parameter varname="base_addr" type="void*"/>
+	<parameter varname="vindex" type="__m128i"/>
+	<parameter varname="a" type="__m128i"/>
+	<parameter varname="scale" type="const int"/>
+	<description>Scatter 32-bit integers from "a" into memory using 64-bit indices. 32-bit elements are stored at addresses starting at "base_addr" and offset by each 64-bit element in "vindex" (each index is scaled by the factor in "scale"). "scale" should be 1, 2, 4 or 8.
+	</description>
+	<operation>
+FOR j := 0 to 1
+	i := j*32
+	l := j*64
+	MEM[base_addr + SignExtend(vindex[l+63:l])*scale] := a[i+31:i]
+ENDFOR
+	</operation>
+	<instruction name="vpscatterqd"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="void" name="_mm_mask_i64scatter_epi32">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Store</category>
+	<parameter varname="base_addr" type="void*"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="vindex" type="__m128i"/>
+	<parameter varname="a" type="__m128i"/>
+	<parameter varname="scale" type="const int"/>
+	<description>Scatter 32-bit integers from "a" into memory using 64-bit indices. 32-bit elements are stored at addresses starting at "base_addr" and offset by each 64-bit element in "vindex" (each index is scaled by the factor in "scale") subject to mask "k" (elements are not stored when the corresponding mask bit is not set). "scale" should be 1, 2, 4 or 8.
+	</description>
+	<operation>
+FOR j := 0 to 1
+	i := j*32
+	l := j*64
+	IF k[j]
+		MEM[base_addr + SignExtend(vindex[l+63:l])*scale] := a[i+31:i]
+		k[j] := 0
+	FI
+ENDFOR
+k[MAX:2] := 0
+	</operation>
+	<instruction name="vpscatterqd"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="void" name="_mm256_i64scatter_epi64">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Store</category>
+	<parameter varname="base_addr" type="void*"/>
+	<parameter varname="vindex" type="__m256i"/>
+	<parameter varname="a" type="__m256i"/>
+	<parameter varname="scale" type="const int"/>
+	<description>Scatter 64-bit integers from "a" into memory using 64-bit indices. 64-bit elements are stored at addresses starting at "base_addr" and offset by each 64-bit element in "vindex" (each index is scaled by the factor in "scale"). "scale" should be 1, 2, 4 or 8.
+	</description>
+	<operation>
+FOR j := 0 to 3
+	i := j*64
+	MEM[base_addr + SignExtend(vindex[i+63:i])*scale] := a[i+63:i]
+ENDFOR
+	</operation>
+	<instruction name="vpscatterqq"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="void" name="_mm256_mask_i64scatter_epi64">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Store</category>
+	<parameter varname="base_addr" type="void*"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="vindex" type="__m256i"/>
+	<parameter varname="a" type="__m256i"/>
+	<parameter varname="scale" type="const int"/>
+	<description>Scatter 64-bit integers from "a" into memory using 64-bit indices. 64-bit elements are stored at addresses starting at "base_addr" and offset by each 64-bit element in "vindex" (each index is scaled by the factor in "scale") subject to mask "k" (elements are not stored when the corresponding mask bit is not set). "scale" should be 1, 2, 4 or 8.
+	</description>
+	<operation>
+FOR j := 0 to 3
+	i := j*64
+	IF k[j]
+		MEM[base_addr + SignExtend(vindex[i+63:i])*scale] := a[i+63:i]
+		k[j] := 0
+	FI
+ENDFOR
+k[MAX:4] := 0
+	</operation>
+	<instruction name="vpscatterqq"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="void" name="_mm_i64scatter_epi64">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Store</category>
+	<parameter varname="base_addr" type="void*"/>
+	<parameter varname="vindex" type="__m128i"/>
+	<parameter varname="a" type="__m128i"/>
+	<parameter varname="scale" type="const int"/>
+	<description>Scatter 64-bit integers from "a" into memory using 64-bit indices. 64-bit elements are stored at addresses starting at "base_addr" and offset by each 64-bit element in "vindex" (each index is scaled by the factor in "scale"). "scale" should be 1, 2, 4 or 8.
+	</description>
+	<operation>
+FOR j := 0 to 1
+	i := j*64
+	MEM[base_addr + SignExtend(vindex[i+63:i])*scale] := a[i+63:i]
+ENDFOR
+	</operation>
+	<instruction name="vpscatterqq"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="void" name="_mm_mask_i64scatter_epi64">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Store</category>
+	<parameter varname="base_addr" type="void*"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="vindex" type="__m128i"/>
+	<parameter varname="a" type="__m128i"/>
+	<parameter varname="scale" type="const int"/>
+	<description>Scatter 64-bit integers from "a" into memory using 64-bit indices. 64-bit elements are stored at addresses starting at "base_addr" and offset by each 64-bit element in "vindex" (each index is scaled by the factor in "scale") subject to mask "k" (elements are not stored when the corresponding mask bit is not set). "scale" should be 1, 2, 4 or 8.
+	</description>
+	<operation>
+FOR j := 0 to 1
+	i := j*64
+	IF k[j]
+		MEM[base_addr + SignExtend(vindex[i+63:i])*scale] := a[i+63:i]
+		k[j] := 0
+	FI
+ENDFOR
+k[MAX:2] := 0
+	</operation>
+	<instruction name="vpscatterqq"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256i" name="_mm256_mask_shuffle_epi8">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512BW</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="src" type="__m256i"/>
+	<parameter varname="k" type="__mmask32"/>
+	<parameter varname="a" type="__m256i"/>
+	<parameter varname="b" type="__m256i"/>
+	<description>Shuffle packed 8-bit integers in "a" within 128-bit lanes according to shuffle control mask in the corresponding 8-bit element of "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 31
+	i := j*8
+	IF k[j]
+		IF b[i+7] == 1
+			dst[i+7:i] := 0
+		ELSE
+			index[4:0] := b[i+3:i] + (j &amp; 0x10)
+			dst[i+7:i] := a[index*8+7:index*8]
+		FI
+	ELSE
+		dst[i+7:i] := src[i+7:i]
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vpshufb"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256i" name="_mm256_maskz_shuffle_epi8">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512BW</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="k" type="__mmask32"/>
+	<parameter varname="a" type="__m256i"/>
+	<parameter varname="b" type="__m256i"/>
+	<description>Shuffle packed 8-bit integers in "a" within 128-bit lanes according to shuffle control mask in the corresponding 8-bit element of "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 31
+	i := j*8
+	IF k[j]
+		IF b[i+7] == 1
+			dst[i+7:i] := 0
+		ELSE
+			index[4:0] := b[i+3:i] + (j &amp; 0x10)
+			dst[i+7:i] := a[index*8+7:index*8]
+		FI
+	ELSE
+		dst[i+7:i] := 0
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vpshufb"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512i" name="_mm512_mask_shuffle_epi8">
+	<type>Integer</type>
+	<CPUID>AVX512BW</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="src" type="__m512i"/>
+	<parameter varname="k" type="__mmask64"/>
+	<parameter varname="a" type="__m512i"/>
+	<parameter varname="b" type="__m512i"/>
+	<description>Shuffle 8-bit integers in "a" within 128-bit lanes using the control in the corresponding 8-bit element of "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 63
+	i := j*8
+	IF k[j]
+		IF b[i+7] == 1
+			dst[i+7:i] := 0
+		ELSE
+			index[5:0] := b[i+3:i] + (j &amp; 0x30)
+			dst[i+7:i] := a[index*8+7:index*8]
+		FI
+	ELSE
+		dst[i+7:i] := src[i+7:i]
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name="vpshufb"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512i" name="_mm512_maskz_shuffle_epi8">
+	<type>Integer</type>
+	<CPUID>AVX512BW</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="k" type="__mmask64"/>
+	<parameter varname="a" type="__m512i"/>
+	<parameter varname="b" type="__m512i"/>
+	<description>Shuffle packed 8-bit integers in "a" within 128-bit lanes according to shuffle control mask in the corresponding 8-bit element of "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 63
+	i := j*8
+	IF k[j]
+		IF b[i+7] == 1
+			dst[i+7:i] := 0
+		ELSE
+			index[5:0] := b[i+3:i] + (j &amp; 0x30)
+			dst[i+7:i] := a[index*8+7:index*8]
+		FI
+	ELSE
+		dst[i+7:i] := 0
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name="vpshufb"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512i" name="_mm512_shuffle_epi8">
+	<type>Integer</type>
+	<CPUID>AVX512BW</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="a" type="__m512i"/>
+	<parameter varname="b" type="__m512i"/>
+	<description>Shuffle packed 8-bit integers in "a" according to shuffle control mask in the corresponding 8-bit element of "b", and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 63
+	i := j*8
+	IF b[i+7] == 1
+		dst[i+7:i] := 0
+	ELSE
+		index[5:0] := b[i+3:i] + (j &amp; 0x30)
+		dst[i+7:i] := a[index*8+7:index*8]
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name="vpshufb"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm_mask_shuffle_epi8">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512BW</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="src" type="__m128i"/>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="a" type="__m128i"/>
+	<parameter varname="b" type="__m128i"/>
+	<description>Shuffle packed 8-bit integers in "a" according to shuffle control mask in the corresponding 8-bit element of "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 15
+	i := j*8
+	IF k[j]
+		IF b[i+7] == 1
+			dst[i+7:i] := 0
+		ELSE
+			index[3:0] := b[i+3:i]
+			dst[i+7:i] := a[index*8+7:index*8]
+		FI
+	ELSE
+		dst[i+7:i] := src[i+7:i]
+	FI
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vpshufb"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm_maskz_shuffle_epi8">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512BW</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="a" type="__m128i"/>
+	<parameter varname="b" type="__m128i"/>
+	<description>Shuffle packed 8-bit integers in "a" according to shuffle control mask in the corresponding 8-bit element of "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 15
+	i := j*8
+	IF k[j]
+		IF b[i+7] == 1
+			dst[i+7:i] := 0
+		ELSE
+			index[3:0] := b[i+3:i]
+			dst[i+7:i] := a[index*8+7:index*8]
+		FI
+	ELSE
+		dst[i+7:i] := 0
+	FI
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vpshufb"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256i" name="_mm256_mask_shuffle_epi32">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="src" type="__m256i"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m256i"/>
+	<parameter varname="imm8" type="_MM_PERM_ENUM"/>
+	<description>Shuffle 32-bit integers in "a" within 128-bit lanes using the control in "imm8", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+	<operation>
+SELECT4(src, control){
+	CASE(control[1:0])
+	0:	tmp[31:0] := src[31:0]
+	1:	tmp[31:0] := src[63:32]
+	2:	tmp[31:0] := src[95:64]
+	3:	tmp[31:0] := src[127:96]
+	ESAC
+	RETURN tmp[31:0]
+}
+
+tmp_dst[31:0] := SELECT4(a[127:0], imm8[1:0])
+tmp_dst[63:32] := SELECT4(a[127:0], imm8[3:2])
+tmp_dst[95:64] := SELECT4(a[127:0], imm8[5:4])
+tmp_dst[127:96] := SELECT4(a[127:0], imm8[7:6])
+tmp_dst[159:128] := SELECT4(a[255:128], imm8[1:0])
+tmp_dst[191:160] := SELECT4(a[255:128], imm8[3:2])
+tmp_dst[223:192] := SELECT4(a[255:128], imm8[5:4])
+tmp_dst[255:224] := SELECT4(a[255:128], imm8[7:6])
+FOR j := 0 to 7
+	i := j*32
+	IF k[j]
+		dst[i+31:i] := tmp_dst[i+31:i]
+	ELSE
+		dst[i+31:i] := src[i+31:i]
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vpshufd"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256i" name="_mm256_maskz_shuffle_epi32">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m256i"/>
+	<parameter varname="imm8" type="_MM_PERM_ENUM"/>
+	<description>Shuffle 32-bit integers in "a" within 128-bit lanes using the control in "imm8", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+	<operation>
+SELECT4(src, control){
+	CASE(control[1:0])
+	0:	tmp[31:0] := src[31:0]
+	1:	tmp[31:0] := src[63:32]
+	2:	tmp[31:0] := src[95:64]
+	3:	tmp[31:0] := src[127:96]
+	ESAC
+	RETURN tmp[31:0]
+}
+
+tmp_dst[31:0] := SELECT4(a[127:0], imm8[1:0])
+tmp_dst[63:32] := SELECT4(a[127:0], imm8[3:2])
+tmp_dst[95:64] := SELECT4(a[127:0], imm8[5:4])
+tmp_dst[127:96] := SELECT4(a[127:0], imm8[7:6])
+tmp_dst[159:128] := SELECT4(a[255:128], imm8[1:0])
+tmp_dst[191:160] := SELECT4(a[255:128], imm8[3:2])
+tmp_dst[223:192] := SELECT4(a[255:128], imm8[5:4])
+tmp_dst[255:224] := SELECT4(a[255:128], imm8[7:6])
+FOR j := 0 to 7
+	i := j*32
+	IF k[j]
+		dst[i+31:i] := tmp_dst[i+31:i]
+	ELSE
+		dst[i+31:i] := 0
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vpshufd"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm_mask_shuffle_epi32">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="src" type="__m128i"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128i"/>
+	<parameter varname="imm8" type="_MM_PERM_ENUM"/>
+	<description>Shuffle 32-bit integers in "a" using the control in "imm8", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+	<operation>
+SELECT4(src, control){
+	CASE(control[1:0])
+	0:	tmp[31:0] := src[31:0]
+	1:	tmp[31:0] := src[63:32]
+	2:	tmp[31:0] := src[95:64]
+	3:	tmp[31:0] := src[127:96]
+	ESAC
+	RETURN tmp[31:0]
+}
+
+tmp_dst[31:0] := SELECT4(a[127:0], imm8[1:0])
+tmp_dst[63:32] := SELECT4(a[127:0], imm8[3:2])
+tmp_dst[95:64] := SELECT4(a[127:0], imm8[5:4])
+tmp_dst[127:96] := SELECT4(a[127:0], imm8[7:6])
+FOR j := 0 to 3
+	i := j*32
+	IF k[j]
+		dst[i+31:i] := tmp_dst[i+31:i]
+	ELSE
+		dst[i+31:i] := src[i+31:i]
+	FI
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vpshufd"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm_maskz_shuffle_epi32">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128i"/>
+	<parameter varname="imm8" type="_MM_PERM_ENUM"/>
+	<description>Shuffle 32-bit integers in "a" using the control in "imm8", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+	<operation>
+SELECT4(src, control){
+	CASE(control[1:0])
+	0:	tmp[31:0] := src[31:0]
+	1:	tmp[31:0] := src[63:32]
+	2:	tmp[31:0] := src[95:64]
+	3:	tmp[31:0] := src[127:96]
+	ESAC
+	RETURN tmp[31:0]
+}
+
+tmp_dst[31:0] := SELECT4(a[127:0], imm8[1:0])
+tmp_dst[63:32] := SELECT4(a[127:0], imm8[3:2])
+tmp_dst[95:64] := SELECT4(a[127:0], imm8[5:4])
+tmp_dst[127:96] := SELECT4(a[127:0], imm8[7:6])
+FOR j := 0 to 3
+	i := j*32
+	IF k[j]
+		dst[i+31:i] := tmp_dst[i+31:i]
+	ELSE
+		dst[i+31:i] := 0
+	FI
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vpshufd"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256i" name="_mm256_mask_shufflehi_epi16">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512BW</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="src" type="__m256i"/>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="a" type="__m256i"/>
+	<parameter varname="imm8" type="int"/>
+	<description>Shuffle 16-bit integers in the high 64 bits of 128-bit lanes of "a" using the control in "imm8". Store the results in the high 64 bits of 128-bit lanes of "dst", with the low 64 bits of 128-bit lanes being copied from "a" to "dst", using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+	<operation>
+tmp_dst[63:0] := a[63:0]
+tmp_dst[79:64] := (a &gt;&gt; (imm8[1:0] * 16))[79:64]
+tmp_dst[95:80] := (a &gt;&gt; (imm8[3:2] * 16))[79:64]
+tmp_dst[111:96] := (a &gt;&gt; (imm8[5:4] * 16))[79:64]
+tmp_dst[127:112] := (a &gt;&gt; (imm8[7:6] * 16))[79:64]
+tmp_dst[191:128] := a[191:128]
+tmp_dst[207:192] := (a &gt;&gt; (imm8[1:0] * 16))[207:192]
+tmp_dst[223:208] := (a &gt;&gt; (imm8[3:2] * 16))[207:192]
+tmp_dst[239:224] := (a &gt;&gt; (imm8[5:4] * 16))[207:192]
+tmp_dst[255:240] := (a &gt;&gt; (imm8[7:6] * 16))[207:192]
+
+FOR j := 0 to 15
+	i := j*16
+	IF k[j]
+		dst[i+15:i] := tmp_dst[i+15:i]
+	ELSE
+		dst[i+15:i] := src[i+15:i]
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vpshufhw"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256i" name="_mm256_maskz_shufflehi_epi16">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512BW</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="a" type="__m256i"/>
+	<parameter varname="imm8" type="int"/>
+	<description>Shuffle 16-bit integers in the high 64 bits of 128-bit lanes of "a" using the control in "imm8". Store the results in the high 64 bits of 128-bit lanes of "dst", with the low 64 bits of 128-bit lanes being copied from "a" to "dst", using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+	<operation>
+tmp_dst[63:0] := a[63:0]
+tmp_dst[79:64] := (a &gt;&gt; (imm8[1:0] * 16))[79:64]
+tmp_dst[95:80] := (a &gt;&gt; (imm8[3:2] * 16))[79:64]
+tmp_dst[111:96] := (a &gt;&gt; (imm8[5:4] * 16))[79:64]
+tmp_dst[127:112] := (a &gt;&gt; (imm8[7:6] * 16))[79:64]
+tmp_dst[191:128] := a[191:128]
+tmp_dst[207:192] := (a &gt;&gt; (imm8[1:0] * 16))[207:192]
+tmp_dst[223:208] := (a &gt;&gt; (imm8[3:2] * 16))[207:192]
+tmp_dst[239:224] := (a &gt;&gt; (imm8[5:4] * 16))[207:192]
+tmp_dst[255:240] := (a &gt;&gt; (imm8[7:6] * 16))[207:192]
+
+FOR j := 0 to 15
+	i := j*16
+	IF k[j]
+		dst[i+15:i] := tmp_dst[i+15:i]
+	ELSE
+		dst[i+15:i] := 0
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vpshufhw"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512i" name="_mm512_mask_shufflehi_epi16">
+	<type>Integer</type>
+	<CPUID>AVX512BW</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="src" type="__m512i"/>
+	<parameter varname="k" type="__mmask32"/>
+	<parameter varname="a" type="__m512i"/>
+	<parameter varname="imm8" type="int"/>
+	<description>Shuffle 16-bit integers in the high 64 bits of 128-bit lanes of "a" using the control in "imm8". Store the results in the high 64 bits of 128-bit lanes of "dst", with the low 64 bits of 128-bit lanes being copied from "a" to "dst", using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+	<operation>
+tmp_dst[63:0] := a[63:0]
+tmp_dst[79:64] := (a &gt;&gt; (imm8[1:0] * 16))[79:64]
+tmp_dst[95:80] := (a &gt;&gt; (imm8[3:2] * 16))[79:64]
+tmp_dst[111:96] := (a &gt;&gt; (imm8[5:4] * 16))[79:64]
+tmp_dst[127:112] := (a &gt;&gt; (imm8[7:6] * 16))[79:64]
+tmp_dst[191:128] := a[191:128]
+tmp_dst[207:192] := (a &gt;&gt; (imm8[1:0] * 16))[207:192]
+tmp_dst[223:208] := (a &gt;&gt; (imm8[3:2] * 16))[207:192]
+tmp_dst[239:224] := (a &gt;&gt; (imm8[5:4] * 16))[207:192]
+tmp_dst[255:240] := (a &gt;&gt; (imm8[7:6] * 16))[207:192]
+tmp_dst[319:256] := a[319:256]
+tmp_dst[335:320] := (a &gt;&gt; (imm8[1:0] * 16))[335:320]
+tmp_dst[351:336] := (a &gt;&gt; (imm8[3:2] * 16))[335:320]
+tmp_dst[367:352] := (a &gt;&gt; (imm8[5:4] * 16))[335:320]
+tmp_dst[383:368] := (a &gt;&gt; (imm8[7:6] * 16))[335:320]
+tmp_dst[447:384] := a[447:384]
+tmp_dst[463:448] := (a &gt;&gt; (imm8[1:0] * 16))[463:448]
+tmp_dst[479:464] := (a &gt;&gt; (imm8[3:2] * 16))[463:448]
+tmp_dst[495:480] := (a &gt;&gt; (imm8[5:4] * 16))[463:448]
+tmp_dst[511:496] := (a &gt;&gt; (imm8[7:6] * 16))[463:448]
+
+FOR j := 0 to 31
+	i := j*16
+	IF k[j]
+		dst[i+15:i] := tmp_dst[i+15:i]
+	ELSE
+		dst[i+15:i] := src[i+15:i]
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name="vpshufhw"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512i" name="_mm512_maskz_shufflehi_epi16">
+	<type>Integer</type>
+	<CPUID>AVX512BW</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="k" type="__mmask32"/>
+	<parameter varname="a" type="__m512i"/>
+	<parameter varname="imm8" type="int"/>
+	<description>Shuffle 16-bit integers in the high 64 bits of 128-bit lanes of "a" using the control in "imm8". Store the results in the high 64 bits of 128-bit lanes of "dst", with the low 64 bits of 128-bit lanes being copied from "a" to "dst", using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+	<operation>
+tmp_dst[63:0] := a[63:0]
+tmp_dst[79:64] := (a &gt;&gt; (imm8[1:0] * 16))[79:64]
+tmp_dst[95:80] := (a &gt;&gt; (imm8[3:2] * 16))[79:64]
+tmp_dst[111:96] := (a &gt;&gt; (imm8[5:4] * 16))[79:64]
+tmp_dst[127:112] := (a &gt;&gt; (imm8[7:6] * 16))[79:64]
+tmp_dst[191:128] := a[191:128]
+tmp_dst[207:192] := (a &gt;&gt; (imm8[1:0] * 16))[207:192]
+tmp_dst[223:208] := (a &gt;&gt; (imm8[3:2] * 16))[207:192]
+tmp_dst[239:224] := (a &gt;&gt; (imm8[5:4] * 16))[207:192]
+tmp_dst[255:240] := (a &gt;&gt; (imm8[7:6] * 16))[207:192]
+tmp_dst[319:256] := a[319:256]
+tmp_dst[335:320] := (a &gt;&gt; (imm8[1:0] * 16))[335:320]
+tmp_dst[351:336] := (a &gt;&gt; (imm8[3:2] * 16))[335:320]
+tmp_dst[367:352] := (a &gt;&gt; (imm8[5:4] * 16))[335:320]
+tmp_dst[383:368] := (a &gt;&gt; (imm8[7:6] * 16))[335:320]
+tmp_dst[447:384] := a[447:384]
+tmp_dst[463:448] := (a &gt;&gt; (imm8[1:0] * 16))[463:448]
+tmp_dst[479:464] := (a &gt;&gt; (imm8[3:2] * 16))[463:448]
+tmp_dst[495:480] := (a &gt;&gt; (imm8[5:4] * 16))[463:448]
+tmp_dst[511:496] := (a &gt;&gt; (imm8[7:6] * 16))[463:448]
+
+FOR j := 0 to 31
+	i := j*16
+	IF k[j]
+		dst[i+15:i] := tmp_dst[i+15:i]
+	ELSE
+		dst[i+15:i] := 0
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name="vpshufhw"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512i" name="_mm512_shufflehi_epi16">
+	<type>Integer</type>
+	<CPUID>AVX512BW</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="a" type="__m512i"/>
+	<parameter varname="imm8" type="int"/>
+	<description>Shuffle 16-bit integers in the high 64 bits of 128-bit lanes of "a" using the control in "imm8". Store the results in the high 64 bits of 128-bit lanes of "dst", with the low 64 bits of 128-bit lanes being copied from "a" to "dst".</description>
+	<operation>
+dst[63:0] := a[63:0]
+dst[79:64] := (a &gt;&gt; (imm8[1:0] * 16))[79:64]
+dst[95:80] := (a &gt;&gt; (imm8[3:2] * 16))[79:64]
+dst[111:96] := (a &gt;&gt; (imm8[5:4] * 16))[79:64]
+dst[127:112] := (a &gt;&gt; (imm8[7:6] * 16))[79:64]
+dst[191:128] := a[191:128]
+dst[207:192] := (a &gt;&gt; (imm8[1:0] * 16))[207:192]
+dst[223:208] := (a &gt;&gt; (imm8[3:2] * 16))[207:192]
+dst[239:224] := (a &gt;&gt; (imm8[5:4] * 16))[207:192]
+dst[255:240] := (a &gt;&gt; (imm8[7:6] * 16))[207:192]
+dst[319:256] := a[319:256]
+dst[335:320] := (a &gt;&gt; (imm8[1:0] * 16))[335:320]
+dst[351:336] := (a &gt;&gt; (imm8[3:2] * 16))[335:320]
+dst[367:352] := (a &gt;&gt; (imm8[5:4] * 16))[335:320]
+dst[383:368] := (a &gt;&gt; (imm8[7:6] * 16))[335:320]
+dst[447:384] := a[447:384]
+dst[463:448] := (a &gt;&gt; (imm8[1:0] * 16))[463:448]
+dst[479:464] := (a &gt;&gt; (imm8[3:2] * 16))[463:448]
+dst[495:480] := (a &gt;&gt; (imm8[5:4] * 16))[463:448]
+dst[511:496] := (a &gt;&gt; (imm8[7:6] * 16))[463:448]
+dst[MAX:512] := 0
+	</operation>
+	<instruction name="vpshufhw"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm_mask_shufflehi_epi16">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512BW</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="src" type="__m128i"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128i"/>
+	<parameter varname="imm8" type="int"/>
+	<description>Shuffle 16-bit integers in the high 64 bits of "a" using the control in "imm8". Store the results in the high 64 bits of "dst", with the low 64 bits being copied from "a" to "dst", using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+	<operation>
+tmp_dst[63:0] := a[63:0]
+tmp_dst[79:64] := (a &gt;&gt; (imm8[1:0] * 16))[79:64]
+tmp_dst[95:80] := (a &gt;&gt; (imm8[3:2] * 16))[79:64]
+tmp_dst[111:96] := (a &gt;&gt; (imm8[5:4] * 16))[79:64]
+tmp_dst[127:112] := (a &gt;&gt; (imm8[7:6] * 16))[79:64]
+
+FOR j := 0 to 7
+	i := j*16
+	IF k[j]
+		dst[i+15:i] := tmp_dst[i+15:i]
+	ELSE
+		dst[i+15:i] := src[i+15:i]
+	FI
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vpshufhw"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm_maskz_shufflehi_epi16">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512BW</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128i"/>
+	<parameter varname="imm8" type="int"/>
+	<description>Shuffle 16-bit integers in the high 64 bits of "a" using the control in "imm8". Store the results in the high 64 bits of "dst", with the low 64 bits being copied from "a" to "dst", using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+	<operation>
+tmp_dst[63:0] := a[63:0]
+tmp_dst[79:64] := (a &gt;&gt; (imm8[1:0] * 16))[79:64]
+tmp_dst[95:80] := (a &gt;&gt; (imm8[3:2] * 16))[79:64]
+tmp_dst[111:96] := (a &gt;&gt; (imm8[5:4] * 16))[79:64]
+tmp_dst[127:112] := (a &gt;&gt; (imm8[7:6] * 16))[79:64]
+
+FOR j := 0 to 7
+	i := j*16
+	IF k[j]
+		dst[i+15:i] := tmp_dst[i+15:i]
+	ELSE
+		dst[i+15:i] := 0
+	FI
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vpshufhw"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256i" name="_mm256_mask_shufflelo_epi16">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512BW</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="src" type="__m256i"/>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="a" type="__m256i"/>
+	<parameter varname="imm8" type="int"/>
+	<description>Shuffle 16-bit integers in the low 64 bits of 128-bit lanes of "a" using the control in "imm8". Store the results in the low 64 bits of 128-bit lanes of "dst", with the high 64 bits of 128-bit lanes being copied from "a" to "dst", using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+	<operation>
+tmp_dst[15:0] := (a &gt;&gt; (imm8[1:0] * 16))[15:0]
+tmp_dst[31:16] := (a &gt;&gt; (imm8[3:2] * 16))[15:0]
+tmp_dst[47:32] := (a &gt;&gt; (imm8[5:4] * 16))[15:0]
+tmp_dst[63:48] := (a &gt;&gt; (imm8[7:6] * 16))[15:0]
+tmp_dst[127:64] := a[127:64]
+tmp_dst[143:128] := (a &gt;&gt; (imm8[1:0] * 16))[143:128]
+tmp_dst[159:144] := (a &gt;&gt; (imm8[3:2] * 16))[143:128]
+tmp_dst[175:160] := (a &gt;&gt; (imm8[5:4] * 16))[143:128]
+tmp_dst[191:176] := (a &gt;&gt; (imm8[7:6] * 16))[143:128]
+tmp_dst[255:192] := a[255:192]
+
+FOR j := 0 to 15
+	i := j*16
+	IF k[j]
+		dst[i+15:i] := tmp_dst[i+15:i]
+	ELSE
+		dst[i+15:i] := src[i+15:i]
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vpshuflw"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256i" name="_mm256_maskz_shufflelo_epi16">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512BW</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="a" type="__m256i"/>
+	<parameter varname="imm8" type="int"/>
+	<description>Shuffle 16-bit integers in the low 64 bits of 128-bit lanes of "a" using the control in "imm8". Store the results in the low 64 bits of 128-bit lanes of "dst", with the high 64 bits of 128-bit lanes being copied from "a" to "dst", using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+	<operation>
+tmp_dst[15:0] := (a &gt;&gt; (imm8[1:0] * 16))[15:0]
+tmp_dst[31:16] := (a &gt;&gt; (imm8[3:2] * 16))[15:0]
+tmp_dst[47:32] := (a &gt;&gt; (imm8[5:4] * 16))[15:0]
+tmp_dst[63:48] := (a &gt;&gt; (imm8[7:6] * 16))[15:0]
+tmp_dst[127:64] := a[127:64]
+tmp_dst[143:128] := (a &gt;&gt; (imm8[1:0] * 16))[143:128]
+tmp_dst[159:144] := (a &gt;&gt; (imm8[3:2] * 16))[143:128]
+tmp_dst[175:160] := (a &gt;&gt; (imm8[5:4] * 16))[143:128]
+tmp_dst[191:176] := (a &gt;&gt; (imm8[7:6] * 16))[143:128]
+tmp_dst[255:192] := a[255:192]
+
+FOR j := 0 to 15
+	i := j*16
+	IF k[j]
+		dst[i+15:i] := tmp_dst[i+15:i]
+	ELSE
+		dst[i+15:i] := 0
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vpshuflw"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512i" name="_mm512_mask_shufflelo_epi16">
+	<type>Integer</type>
+	<CPUID>AVX512BW</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="src" type="__m512i"/>
+	<parameter varname="k" type="__mmask32"/>
+	<parameter varname="a" type="__m512i"/>
+	<parameter varname="imm8" type="int"/>
+	<description>Shuffle 16-bit integers in the low 64 bits of 128-bit lanes of "a" using the control in "imm8". Store the results in the low 64 bits of 128-bit lanes of "dst", with the high 64 bits of 128-bit lanes being copied from "a" to "dst", using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+	<operation>
+tmp_dst[15:0] := (a &gt;&gt; (imm8[1:0] * 16))[15:0]
+tmp_dst[31:16] := (a &gt;&gt; (imm8[3:2] * 16))[15:0]
+tmp_dst[47:32] := (a &gt;&gt; (imm8[5:4] * 16))[15:0]
+tmp_dst[63:48] := (a &gt;&gt; (imm8[7:6] * 16))[15:0]
+tmp_dst[127:64] := a[127:64]
+tmp_dst[143:128] := (a &gt;&gt; (imm8[1:0] * 16))[143:128]
+tmp_dst[159:144] := (a &gt;&gt; (imm8[3:2] * 16))[143:128]
+tmp_dst[175:160] := (a &gt;&gt; (imm8[5:4] * 16))[143:128]
+tmp_dst[191:176] := (a &gt;&gt; (imm8[7:6] * 16))[143:128]
+tmp_dst[255:192] := a[255:192]
+tmp_dst[271:256] := (a &gt;&gt; (imm8[1:0] * 16))[271:256]
+tmp_dst[287:272] := (a &gt;&gt; (imm8[3:2] * 16))[271:256]
+tmp_dst[303:288] := (a &gt;&gt; (imm8[5:4] * 16))[271:256]
+tmp_dst[319:304] := (a &gt;&gt; (imm8[7:6] * 16))[271:256]
+tmp_dst[383:320] := a[383:320]
+tmp_dst[399:384] := (a &gt;&gt; (imm8[1:0] * 16))[399:384]
+tmp_dst[415:400] := (a &gt;&gt; (imm8[3:2] * 16))[399:384]
+tmp_dst[431:416] := (a &gt;&gt; (imm8[5:4] * 16))[399:384]
+tmp_dst[447:432] := (a &gt;&gt; (imm8[7:6] * 16))[399:384]
+tmp_dst[511:448] := a[511:448]
+
+FOR j := 0 to 31
+	i := j*16
+	IF k[j]
+		dst[i+15:i] := tmp_dst[i+15:i]
+	ELSE
+		dst[i+15:i] := src[i+15:i]
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name="vpshuflw"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512i" name="_mm512_maskz_shufflelo_epi16">
+	<type>Integer</type>
+	<CPUID>AVX512BW</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="k" type="__mmask32"/>
+	<parameter varname="a" type="__m512i"/>
+	<parameter varname="imm8" type="int"/>
+	<description>Shuffle 16-bit integers in the low 64 bits of 128-bit lanes of "a" using the control in "imm8". Store the results in the low 64 bits of 128-bit lanes of "dst", with the high 64 bits of 128-bit lanes being copied from "a" to "dst", using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+	<operation>
+tmp_dst[15:0] := (a &gt;&gt; (imm8[1:0] * 16))[15:0]
+tmp_dst[31:16] := (a &gt;&gt; (imm8[3:2] * 16))[15:0]
+tmp_dst[47:32] := (a &gt;&gt; (imm8[5:4] * 16))[15:0]
+tmp_dst[63:48] := (a &gt;&gt; (imm8[7:6] * 16))[15:0]
+tmp_dst[127:64] := a[127:64]
+tmp_dst[143:128] := (a &gt;&gt; (imm8[1:0] * 16))[143:128]
+tmp_dst[159:144] := (a &gt;&gt; (imm8[3:2] * 16))[143:128]
+tmp_dst[175:160] := (a &gt;&gt; (imm8[5:4] * 16))[143:128]
+tmp_dst[191:176] := (a &gt;&gt; (imm8[7:6] * 16))[143:128]
+tmp_dst[255:192] := a[255:192]
+tmp_dst[271:256] := (a &gt;&gt; (imm8[1:0] * 16))[271:256]
+tmp_dst[287:272] := (a &gt;&gt; (imm8[3:2] * 16))[271:256]
+tmp_dst[303:288] := (a &gt;&gt; (imm8[5:4] * 16))[271:256]
+tmp_dst[319:304] := (a &gt;&gt; (imm8[7:6] * 16))[271:256]
+tmp_dst[383:320] := a[383:320]
+tmp_dst[399:384] := (a &gt;&gt; (imm8[1:0] * 16))[399:384]
+tmp_dst[415:400] := (a &gt;&gt; (imm8[3:2] * 16))[399:384]
+tmp_dst[431:416] := (a &gt;&gt; (imm8[5:4] * 16))[399:384]
+tmp_dst[447:432] := (a &gt;&gt; (imm8[7:6] * 16))[399:384]
+tmp_dst[511:448] := a[511:448]
+
+FOR j := 0 to 31
+	i := j*16
+	IF k[j]
+		dst[i+15:i] := tmp_dst[i+15:i]
+	ELSE
+		dst[i+15:i] := 0
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name="vpshuflw"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512i" name="_mm512_shufflelo_epi16">
+	<type>Integer</type>
+	<CPUID>AVX512BW</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="a" type="__m512i"/>
+	<parameter varname="imm8" type="int"/>
+	<description>Shuffle 16-bit integers in the low 64 bits of 128-bit lanes of "a" using the control in "imm8". Store the results in the low 64 bits of 128-bit lanes of "dst", with the high 64 bits of 128-bit lanes being copied from "a" to "dst".</description>
+	<operation>
+dst[15:0] := (a &gt;&gt; (imm8[1:0] * 16))[15:0]
+dst[31:16] := (a &gt;&gt; (imm8[3:2] * 16))[15:0]
+dst[47:32] := (a &gt;&gt; (imm8[5:4] * 16))[15:0]
+dst[63:48] := (a &gt;&gt; (imm8[7:6] * 16))[15:0]
+dst[127:64] := a[127:64]
+dst[143:128] := (a &gt;&gt; (imm8[1:0] * 16))[143:128]
+dst[159:144] := (a &gt;&gt; (imm8[3:2] * 16))[143:128]
+dst[175:160] := (a &gt;&gt; (imm8[5:4] * 16))[143:128]
+dst[191:176] := (a &gt;&gt; (imm8[7:6] * 16))[143:128]
+dst[255:192] := a[255:192]
+dst[271:256] := (a &gt;&gt; (imm8[1:0] * 16))[271:256]
+dst[287:272] := (a &gt;&gt; (imm8[3:2] * 16))[271:256]
+dst[303:288] := (a &gt;&gt; (imm8[5:4] * 16))[271:256]
+dst[319:304] := (a &gt;&gt; (imm8[7:6] * 16))[271:256]
+dst[383:320] := a[383:320]
+dst[399:384] := (a &gt;&gt; (imm8[1:0] * 16))[399:384]
+dst[415:400] := (a &gt;&gt; (imm8[3:2] * 16))[399:384]
+dst[431:416] := (a &gt;&gt; (imm8[5:4] * 16))[399:384]
+dst[447:432] := (a &gt;&gt; (imm8[7:6] * 16))[399:384]
+dst[511:448] := a[511:448]
+dst[MAX:512] := 0
+	</operation>
+	<instruction name="vpshuflw"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm_mask_shufflelo_epi16">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512BW</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="src" type="__m128i"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128i"/>
+	<parameter varname="imm8" type="int"/>
+	<description>Shuffle 16-bit integers in the low 64 bits of "a" using the control in "imm8". Store the results in the low 64 bits of "dst", with the high 64 bits being copied from "a" to "dst", using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+	<operation>
+tmp_dst[15:0] := (a &gt;&gt; (imm8[1:0] * 16))[15:0]
+tmp_dst[31:16] := (a &gt;&gt; (imm8[3:2] * 16))[15:0]
+tmp_dst[47:32] := (a &gt;&gt; (imm8[5:4] * 16))[15:0]
+tmp_dst[63:48] := (a &gt;&gt; (imm8[7:6] * 16))[15:0]
+tmp_dst[127:64] := a[127:64]
+
+FOR j := 0 to 7
+	i := j*16
+	IF k[j]
+		dst[i+15:i] := tmp_dst[i+15:i]
+	ELSE
+		dst[i+15:i] := src[i+15:i]
+	FI
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vpshuflw"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm_maskz_shufflelo_epi16">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512BW</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128i"/>
+	<parameter varname="imm8" type="int"/>
+	<description>Shuffle 16-bit integers in the low 64 bits of "a" using the control in "imm8". Store the results in the low 64 bits of "dst", with the high 64 bits being copied from "a" to "dst", using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+	<operation>
+tmp_dst[15:0] := (a &gt;&gt; (imm8[1:0] * 16))[15:0]
+tmp_dst[31:16] := (a &gt;&gt; (imm8[3:2] * 16))[15:0]
+tmp_dst[47:32] := (a &gt;&gt; (imm8[5:4] * 16))[15:0]
+tmp_dst[63:48] := (a &gt;&gt; (imm8[7:6] * 16))[15:0]
+tmp_dst[127:64] := a[127:64]
+
+FOR j := 0 to 7
+	i := j*16
+	IF k[j]
+		dst[i+15:i] := tmp_dst[i+15:i]
+	ELSE
+		dst[i+15:i] := 0
+	FI
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vpshuflw"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256i" name="_mm256_mask_sll_epi32">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Shift</category>
+	<parameter varname="src" type="__m256i"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m256i"/>
+	<parameter varname="count" type="__m128i"/>
+	<description>Shift packed 32-bit integers in "a" left by "count" while shifting in zeros, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 7
+	i := j*32
+	IF k[j]
+		IF count[63:0] &gt; 31
+			dst[i+31:i] := 0
+		ELSE
+			dst[i+31:i] := ZeroExtend(a[i+31:i] &lt;&lt; count[63:0])
+		FI
+	ELSE
+		dst[i+31:i] := src[i+31:i]
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vpslld"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256i" name="_mm256_mask_slli_epi32">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Shift</category>
+	<parameter varname="src" type="__m256i"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m256i"/>
+	<parameter varname="imm8" type="unsigned int"/>
+	<description>Shift packed 32-bit integers in "a" left by "imm8" while shifting in zeros, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 7
+	i := j*32
+	IF k[j]
+		IF imm8[7:0] &gt; 31
+			dst[i+31:i] := 0
+		ELSE
+			dst[i+31:i] := ZeroExtend(a[i+31:i] &lt;&lt; imm8[7:0])
+		FI
+	ELSE
+		dst[i+31:i] := src[i+31:i]
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vpslld"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256i" name="_mm256_maskz_sll_epi32">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Shift</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m256i"/>
+	<parameter varname="count" type="__m128i"/>
+	<description>Shift packed 32-bit integers in "a" left by "count" while shifting in zeros, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 7
+	i := j*32
+	IF k[j]
+		IF count[63:0] &gt; 31
+			dst[i+31:i] := 0
+		ELSE
+			dst[i+31:i] := ZeroExtend(a[i+31:i] &lt;&lt; count[63:0])
+		FI
+	ELSE
+		dst[i+31:i] := 0
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vpslld"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256i" name="_mm256_maskz_slli_epi32">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Shift</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m256i"/>
+	<parameter varname="imm8" type="unsigned int"/>
+	<description>Shift packed 32-bit integers in "a" left by "imm8" while shifting in zeros, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*32
+	IF k[j]
+		IF imm8[7:0] &gt; 31
+			dst[i+31:i] := 0
+		ELSE
+			dst[i+31:i] := ZeroExtend(a[i+31:i] &lt;&lt; imm8[7:0])
+		FI
+	ELSE
+		dst[i+31:i] := 0
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vpslld"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm_mask_sll_epi32">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Shift</category>
+	<parameter varname="src" type="__m128i"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128i"/>
+	<parameter varname="count" type="__m128i"/>
+	<description>Shift packed 32-bit integers in "a" left by "count" while shifting in zeros, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 3
+	i := j*32
+	IF k[j]
+		IF count[63:0] &gt; 31
+			dst[i+31:i] := 0
+		ELSE
+			dst[i+31:i] := ZeroExtend(a[i+31:i] &lt;&lt; count[63:0])
+		FI
+	ELSE
+		dst[i+31:i] := src[i+31:i]
+	FI
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vpslld"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm_mask_slli_epi32">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Shift</category>
+	<parameter varname="src" type="__m128i"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128i"/>
+	<parameter varname="imm8" type="unsigned int"/>
+	<description>Shift packed 32-bit integers in "a" left by "imm8" while shifting in zeros, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 3
+	i := j*32
+	IF k[j]
+		IF imm8[7:0] &gt; 31
+			dst[i+31:i] := 0
+		ELSE
+			dst[i+31:i] := ZeroExtend(a[i+31:i] &lt;&lt; imm8[7:0])
+		FI
+	ELSE
+		dst[i+31:i] := src[i+31:i]
+	FI
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vpslld"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm_maskz_sll_epi32">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Shift</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128i"/>
+	<parameter varname="count" type="__m128i"/>
+	<description>Shift packed 32-bit integers in "a" left by "count" while shifting in zeros, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 3
+	i := j*32
+	IF k[j]
+		IF count[63:0] &gt; 31
+			dst[i+31:i] := 0
+		ELSE
+			dst[i+31:i] := ZeroExtend(a[i+31:i] &lt;&lt; count[63:0])
+		FI
+	ELSE
+		dst[i+31:i] := 0
+	FI
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vpslld"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm_maskz_slli_epi32">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Shift</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128i"/>
+	<parameter varname="imm8" type="unsigned int"/>
+	<description>Shift packed 32-bit integers in "a" left by "imm8" while shifting in zeros, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 3
+	i := j*32
+	IF k[j]
+		IF imm8[7:0] &gt; 31
+			dst[i+31:i] := 0
+		ELSE
+			dst[i+31:i] := ZeroExtend(a[i+31:i] &lt;&lt; imm8[7:0])
+		FI
+	ELSE
+		dst[i+31:i] := 0
+	FI
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vpslld"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512i" name="_mm512_bslli_epi128">
+	<type>Integer</type>
+	<CPUID>AVX512BW</CPUID>
+	<category>Shift</category>
+	<parameter varname="a" type="__m512i"/>
+	<parameter varname="imm8" type="int"/>
+	<description>Shift 128-bit lanes in "a" left by "imm8" bytes while shifting in zeros, and store the results in "dst".</description>
+	<operation>
+tmp := imm8[7:0]
+IF tmp &gt; 15
+	tmp := 16
+FI
+dst[127:0] := a[127:0] &lt;&lt; (tmp*8)
+dst[255:128] := a[255:128] &lt;&lt; (tmp*8)
+dst[383:256] := a[383:256] &lt;&lt; (tmp*8)
+dst[511:384] := a[511:384] &lt;&lt; (tmp*8)
+dst[MAX:512] := 0
+	</operation>
+	<instruction name="vpslldq"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256i" name="_mm256_mask_sll_epi64">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Shift</category>
+	<parameter varname="src" type="__m256i"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m256i"/>
+	<parameter varname="count" type="__m128i"/>
+	<description>Shift packed 64-bit integers in "a" left by "count" while shifting in zeros, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 3
+	i := j*64
+	IF k[j]
+		IF count[63:0] &gt; 63
+			dst[i+63:i] := 0
+		ELSE
+			dst[i+63:i] := ZeroExtend(a[i+63:i] &lt;&lt; count[63:0])
+		FI
+	ELSE
+		dst[i+63:i] := src[i+63:i]
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vpsllq"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256i" name="_mm256_mask_slli_epi64">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Shift</category>
+	<parameter varname="src" type="__m256i"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m256i"/>
+	<parameter varname="imm8" type="unsigned int"/>
+	<description>Shift packed 64-bit integers in "a" left by "imm8" while shifting in zeros, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 3
+	i := j*64
+	IF k[j]
+		IF imm8[7:0] &gt; 63
+			dst[i+63:i] := 0
+		ELSE
+			dst[i+63:i] := ZeroExtend(a[i+63:i] &lt;&lt; imm8[7:0])
+		FI
+	ELSE
+		dst[i+63:i] := src[i+63:i]
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vpsllq"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256i" name="_mm256_maskz_sll_epi64">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Shift</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m256i"/>
+	<parameter varname="count" type="__m128i"/>
+	<description>Shift packed 64-bit integers in "a" left by "count" while shifting in zeros, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 3
+	i := j*64
+	IF k[j]
+		IF count[63:0] &gt; 63
+			dst[i+63:i] := 0
+		ELSE
+			dst[i+63:i] := ZeroExtend(a[i+63:i] &lt;&lt; count[63:0])
+		FI
+	ELSE
+		dst[i+63:i] := 0
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vpsllq"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256i" name="_mm256_maskz_slli_epi64">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Shift</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m256i"/>
+	<parameter varname="imm8" type="unsigned int"/>
+	<description>Shift packed 64-bit integers in "a" left by "imm8" while shifting in zeros, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 3
+	i := j*64
+	IF k[j]
+		IF imm8[7:0] &gt; 63
+			dst[i+63:i] := 0
+		ELSE
+			dst[i+63:i] := ZeroExtend(a[i+63:i] &lt;&lt; imm8[7:0])
+		FI
+	ELSE
+		dst[i+63:i] := 0
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vpsllq"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm_mask_sll_epi64">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Shift</category>
+	<parameter varname="src" type="__m128i"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128i"/>
+	<parameter varname="count" type="__m128i"/>
+	<description>Shift packed 64-bit integers in "a" left by "count" while shifting in zeros, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 1
+	i := j*64
+	IF k[j]
+		IF count[63:0] &gt; 63
+			dst[i+63:i] := 0
+		ELSE
+			dst[i+63:i] := ZeroExtend(a[i+63:i] &lt;&lt; count[63:0])
+		FI
+	ELSE
+		dst[i+63:i] := src[i+63:i]
+	FI
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vpsllq"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm_mask_slli_epi64">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Shift</category>
+	<parameter varname="src" type="__m128i"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128i"/>
+	<parameter varname="imm8" type="unsigned int"/>
+	<description>Shift packed 64-bit integers in "a" left by "imm8" while shifting in zeros, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 1
+	i := j*64
+	IF k[j]
+		IF imm8[7:0] &gt; 63
+			dst[i+63:i] := 0
+		ELSE
+			dst[i+63:i] := ZeroExtend(a[i+63:i] &lt;&lt; imm8[7:0])
+		FI
+	ELSE
+		dst[i+63:i] := src[i+63:i]
+	FI
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vpsllq"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm_maskz_sll_epi64">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Shift</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128i"/>
+	<parameter varname="count" type="__m128i"/>
+	<description>Shift packed 64-bit integers in "a" left by "count" while shifting in zeros, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 1
+	i := j*64
+	IF k[j]
+		IF count[63:0] &gt; 63
+			dst[i+63:i] := 0
+		ELSE
+			dst[i+63:i] := ZeroExtend(a[i+63:i] &lt;&lt; count[63:0])
+		FI
+	ELSE
+		dst[i+63:i] := 0
+	FI
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vpsllq"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm_maskz_slli_epi64">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Shift</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128i"/>
+	<parameter varname="imm8" type="unsigned int"/>
+	<description>Shift packed 64-bit integers in "a" left by "imm8" while shifting in zeros, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 1
+	i := j*64
+	IF k[j]
+		IF imm8[7:0] &gt; 63
+			dst[i+63:i] := 0
+		ELSE
+			dst[i+63:i] := ZeroExtend(a[i+63:i] &lt;&lt; imm8[7:0])
+		FI
+	ELSE
+		dst[i+63:i] := 0
+	FI
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vpsllq"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256i" name="_mm256_mask_sllv_epi32">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Shift</category>
+	<parameter varname="src" type="__m256i"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m256i"/>
+	<parameter varname="count" type="__m256i"/>
+	<description>Shift packed 32-bit integers in "a" left by the amount specified by the corresponding element in "count" while shifting in zeros, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 7
+	i := j*32
+	IF k[j]
+		dst[i+31:i] := ZeroExtend(a[i+31:i] &lt;&lt; count[i+31:i])
+	ELSE
+		dst[i+31:i] := src[i+31:i]
+	FI	
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vpsllvd"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256i" name="_mm256_maskz_sllv_epi32">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Shift</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m256i"/>
+	<parameter varname="count" type="__m256i"/>
+	<description>Shift packed 32-bit integers in "a" left by the amount specified by the corresponding element in "count" while shifting in zeros, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 7
+	i := j*32
+	IF k[j]
+		dst[i+31:i] := ZeroExtend(a[i+31:i] &lt;&lt; count[i+31:i])
+	ELSE
+		dst[i+31:i] := 0
+	FI	
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vpsllvd"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm_mask_sllv_epi32">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Shift</category>
+	<parameter varname="src" type="__m128i"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128i"/>
+	<parameter varname="count" type="__m128i"/>
+	<description>Shift packed 32-bit integers in "a" left by the amount specified by the corresponding element in "count" while shifting in zeros, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 3
+	i := j*32
+	IF k[j]
+		dst[i+31:i] := ZeroExtend(a[i+31:i] &lt;&lt; count[i+31:i])
+	ELSE
+		dst[i+31:i] := src[i+31:i]
+	FI	
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vpsllvd"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm_maskz_sllv_epi32">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Shift</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128i"/>
+	<parameter varname="count" type="__m128i"/>
+	<description>Shift packed 32-bit integers in "a" left by the amount specified by the corresponding element in "count" while shifting in zeros, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 3
+	i := j*32
+	IF k[j]
+		dst[i+31:i] := ZeroExtend(a[i+31:i] &lt;&lt; count[i+31:i])
+	ELSE
+		dst[i+31:i] := 0
+	FI	
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vpsllvd"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256i" name="_mm256_mask_sllv_epi64">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Shift</category>
+	<parameter varname="src" type="__m256i"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m256i"/>
+	<parameter varname="count" type="__m256i"/>
+	<description>Shift packed 64-bit integers in "a" left by the amount specified by the corresponding element in "count" while shifting in zeros, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 3
+	i := j*64
+	IF k[j]
+		dst[i+63:i] := ZeroExtend(a[i+63:i] &lt;&lt; count[i+63:i])
+	ELSE
+		dst[i+63:i] := src[i+63:i]
+	FI	
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vpsllvq"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256i" name="_mm256_maskz_sllv_epi64">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Shift</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m256i"/>
+	<parameter varname="count" type="__m256i"/>
+	<description>Shift packed 64-bit integers in "a" left by the amount specified by the corresponding element in "count" while shifting in zeros, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 3
+	i := j*64
+	IF k[j]
+		dst[i+63:i] := ZeroExtend(a[i+63:i] &lt;&lt; count[i+63:i])
+	ELSE
+		dst[i+63:i] := 0
+	FI	
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vpsllvq"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm_mask_sllv_epi64">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Shift</category>
+	<parameter varname="src" type="__m128i"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128i"/>
+	<parameter varname="count" type="__m128i"/>
+	<description>Shift packed 64-bit integers in "a" left by the amount specified by the corresponding element in "count" while shifting in zeros, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 1
+	i := j*64
+	IF k[j]
+		dst[i+63:i] := ZeroExtend(a[i+63:i] &lt;&lt; count[i+63:i])
+	ELSE
+		dst[i+63:i] := src[i+63:i]
+	FI	
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vpsllvq"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm_maskz_sllv_epi64">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Shift</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128i"/>
+	<parameter varname="count" type="__m128i"/>
+	<description>Shift packed 64-bit integers in "a" left by the amount specified by the corresponding element in "count" while shifting in zeros, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 1
+	i := j*64
+	IF k[j]
+		dst[i+63:i] := ZeroExtend(a[i+63:i] &lt;&lt; count[i+63:i])
+	ELSE
+		dst[i+63:i] := 0
+	FI	
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vpsllvq"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256i" name="_mm256_mask_sllv_epi16">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512BW</CPUID>
+	<category>Shift</category>
+	<parameter varname="src" type="__m256i"/>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="a" type="__m256i"/>
+	<parameter varname="count" type="__m256i"/>
+	<description>Shift packed 16-bit integers in "a" left by the amount specified by the corresponding element in "count" while shifting in zeros, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 15
+	i := j*16
+	IF k[j]
+		dst[i+15:i] := ZeroExtend(a[i+15:i] &lt;&lt; count[i+15:i])
+	ELSE
+		dst[i+15:i] := src[i+15:i]
+	FI	
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vpsllvw"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256i" name="_mm256_maskz_sllv_epi16">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512BW</CPUID>
+	<category>Shift</category>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="a" type="__m256i"/>
+	<parameter varname="count" type="__m256i"/>
+	<description>Shift packed 16-bit integers in "a" left by the amount specified by the corresponding element in "count" while shifting in zeros, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 15
+	i := j*16
+	IF k[j]
+		dst[i+15:i] := ZeroExtend(a[i+15:i] &lt;&lt; count[i+15:i])
+	ELSE
+		dst[i+15:i] := 0
+	FI	
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vpsllvw"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256i" name="_mm256_sllv_epi16">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512BW</CPUID>
+	<category>Shift</category>
+	<parameter varname="a" type="__m256i"/>
+	<parameter varname="count" type="__m256i"/>
+	<description>Shift packed 16-bit integers in "a" left by the amount specified by the corresponding element in "count" while shifting in zeros, and store the results in "dst". </description>
+	<operation>
+FOR j := 0 to 15
+	i := j*16
+	IF count[i+15:i] &lt; 16
+		dst[i+15:i] := ZeroExtend(a[i+15:i] &lt;&lt; count[i+15:i])
+	ELSE
+		dst[i+15:i] := 0
+	FI	
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vpsllvw"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512i" name="_mm512_mask_sllv_epi16">
+	<type>Integer</type>
+	<CPUID>AVX512BW</CPUID>
+	<category>Shift</category>
+	<parameter varname="src" type="__m512i"/>
+	<parameter varname="k" type="__mmask32"/>
+	<parameter varname="a" type="__m512i"/>
+	<parameter varname="count" type="__m512i"/>
+	<description>Shift packed 16-bit integers in "a" left by the amount specified by the corresponding element in "count" while shifting in zeros, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 31
+	i := j*16
+	IF k[j]
+		dst[i+15:i] := ZeroExtend(a[i+15:i] &lt;&lt; count[i+15:i])
+	ELSE
+		dst[i+15:i] := src[i+15:i]
+	FI	
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name="vpsllvw"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512i" name="_mm512_maskz_sllv_epi16">
+	<type>Integer</type>
+	<CPUID>AVX512BW</CPUID>
+	<category>Shift</category>
+	<parameter varname="k" type="__mmask32"/>
+	<parameter varname="a" type="__m512i"/>
+	<parameter varname="count" type="__m512i"/>
+	<description>Shift packed 16-bit integers in "a" left by the amount specified by the corresponding element in "count" while shifting in zeros, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 31
+	i := j*16
+	IF k[j]
+		dst[i+15:i] := ZeroExtend(a[i+15:i] &lt;&lt; count[i+15:i])
+	ELSE
+		dst[i+15:i] := 0
+	FI	
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name="vpsllvw"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512i" name="_mm512_sllv_epi16">
+	<type>Integer</type>
+	<CPUID>AVX512BW</CPUID>
+	<category>Shift</category>
+	<parameter varname="a" type="__m512i"/>
+	<parameter varname="count" type="__m512i"/>
+	<description>Shift packed 16-bit integers in "a" left by the amount specified by the corresponding element in "count" while shifting in zeros, and store the results in "dst". </description>
+	<operation>
+FOR j := 0 to 31
+	i := j*16
+	IF count[i+15:i] &lt; 16
+		dst[i+15:i] := ZeroExtend(a[i+15:i] &lt;&lt; count[i+15:i])
+	ELSE
+		dst[i+15:i] := 0
+	FI	
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name="vpsllvw"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm_mask_sllv_epi16">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512BW</CPUID>
+	<category>Shift</category>
+	<parameter varname="src" type="__m128i"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128i"/>
+	<parameter varname="count" type="__m128i"/>
+	<description>Shift packed 16-bit integers in "a" left by the amount specified by the corresponding element in "count" while shifting in zeros, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 7
+	i := j*16
+	IF k[j]
+		dst[i+15:i] := ZeroExtend(a[i+15:i] &lt;&lt; count[i+15:i])
+	ELSE
+		dst[i+15:i] := src[i+15:i]
+	FI	
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vpsllvw"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm_maskz_sllv_epi16">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512BW</CPUID>
+	<category>Shift</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128i"/>
+	<parameter varname="count" type="__m128i"/>
+	<description>Shift packed 16-bit integers in "a" left by the amount specified by the corresponding element in "count" while shifting in zeros, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 7
+	i := j*16
+	IF k[j]
+		dst[i+15:i] := ZeroExtend(a[i+15:i] &lt;&lt; count[i+15:i])
+	ELSE
+		dst[i+15:i] := 0
+	FI	
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vpsllvw"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm_sllv_epi16">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512BW</CPUID>
+	<category>Shift</category>
+	<parameter varname="a" type="__m128i"/>
+	<parameter varname="count" type="__m128i"/>
+	<description>Shift packed 16-bit integers in "a" left by the amount specified by the corresponding element in "count" while shifting in zeros, and store the results in "dst". </description>
+	<operation>
+FOR j := 0 to 7
+	i := j*16
+	IF count[i+15:i] &lt; 16
+		dst[i+15:i] := ZeroExtend(a[i+15:i] &lt;&lt; count[i+15:i])
+	ELSE
+		dst[i+15:i] := 0
+	FI	
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vpsllvw"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256i" name="_mm256_mask_sll_epi16">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512BW</CPUID>
+	<category>Shift</category>
+	<parameter varname="src" type="__m256i"/>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="a" type="__m256i"/>
+	<parameter varname="count" type="__m128i"/>
+	<description>Shift packed 16-bit integers in "a" left by "count" while shifting in zeros, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 15
+	i := j*16
+	IF k[j]
+		IF count[63:0] &gt; 15
+			dst[i+15:i] := 0
+		ELSE
+			dst[i+15:i] := ZeroExtend(a[i+15:i] &lt;&lt; count[63:0])
+		FI
+	ELSE
+		dst[i+15:i] := src[i+15:i]
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vpsllw"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256i" name="_mm256_mask_slli_epi16">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512BW</CPUID>
+	<category>Shift</category>
+	<parameter varname="src" type="__m256i"/>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="a" type="__m256i"/>
+	<parameter varname="imm8" type="unsigned int"/>
+	<description>Shift packed 16-bit integers in "a" left by "imm8" while shifting in zeros, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 15
+	i := j*16
+	IF k[j]
+		IF imm8[7:0] &gt; 15
+			dst[i+15:i] := 0
+		ELSE
+			dst[i+15:i] := ZeroExtend(a[i+15:i] &lt;&lt; imm8[7:0])
+		FI
+	ELSE
+		dst[i+15:i] := src[i+15:i]
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vpsllw"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256i" name="_mm256_maskz_sll_epi16">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512BW</CPUID>
+	<category>Shift</category>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="a" type="__m256i"/>
+	<parameter varname="count" type="__m128i"/>
+	<description>Shift packed 16-bit integers in "a" left by "count" while shifting in zeros, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 15
+	i := j*16
+	IF k[j]
+		IF count[63:0] &gt; 15
+			dst[i+15:i] := 0
+		ELSE
+			dst[i+15:i] := ZeroExtend(a[i+15:i] &lt;&lt; count[63:0])
+		FI
+	ELSE
+		dst[i+15:i] := 0
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vpsllw"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256i" name="_mm256_maskz_slli_epi16">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512BW</CPUID>
+	<category>Shift</category>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="a" type="__m256i"/>
+	<parameter varname="imm8" type="unsigned int"/>
+	<description>Shift packed 16-bit integers in "a" left by "imm8" while shifting in zeros, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 15
+	i := j*16
+	IF k[j]
+		IF imm8[7:0] &gt; 15
+			dst[i+15:i] := 0
+		ELSE
+			dst[i+15:i] := ZeroExtend(a[i+15:i] &lt;&lt; imm8[7:0])
+		FI
+	ELSE
+		dst[i+15:i] := 0
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vpsllw"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512i" name="_mm512_mask_sll_epi16">
+	<type>Integer</type>
+	<CPUID>AVX512BW</CPUID>
+	<category>Shift</category>
+	<parameter varname="src" type="__m512i"/>
+	<parameter varname="k" type="__mmask32"/>
+	<parameter varname="a" type="__m512i"/>
+	<parameter varname="count" type="__m128i"/>
+	<description>Shift packed 16-bit integers in "a" left by "count" while shifting in zeros, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 31
+	i := j*16
+	IF k[j]
+		IF count[63:0] &gt; 15
+			dst[i+15:i] := 0
+		ELSE
+			dst[i+15:i] := ZeroExtend(a[i+15:i] &lt;&lt; count[63:0])
+		FI
+	ELSE
+		dst[i+15:i] := src[i+15:i]
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name="vpsllw"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512i" name="_mm512_mask_slli_epi16">
+	<type>Integer</type>
+	<CPUID>AVX512BW</CPUID>
+	<category>Shift</category>
+	<parameter varname="src" type="__m512i"/>
+	<parameter varname="k" type="__mmask32"/>
+	<parameter varname="a" type="__m512i"/>
+	<parameter varname="imm8" type="unsigned int"/>
+	<description>Shift packed 16-bit integers in "a" left by "imm8" while shifting in zeros, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 31
+	i := j*16
+	IF k[j]
+		IF imm8[7:0] &gt; 15
+			dst[i+15:i] := 0
+		ELSE
+			dst[i+15:i] := ZeroExtend(a[i+15:i] &lt;&lt; imm8[7:0])
+		FI
+	ELSE
+		dst[i+15:i] := src[i+15:i]
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name="vpsllw"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512i" name="_mm512_maskz_sll_epi16">
+	<type>Integer</type>
+	<CPUID>AVX512BW</CPUID>
+	<category>Shift</category>
+	<parameter varname="k" type="__mmask32"/>
+	<parameter varname="a" type="__m512i"/>
+	<parameter varname="count" type="__m128i"/>
+	<description>Shift packed 16-bit integers in "a" left by "count" while shifting in zeros, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 31
+	i := j*16
+	IF k[j]
+		IF count[63:0] &gt; 15
+			dst[i+15:i] := 0
+		ELSE
+			dst[i+15:i] := ZeroExtend(a[i+15:i] &lt;&lt; count[63:0])
+		FI
+	ELSE
+		dst[i+15:i] := 0
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name="vpsllw"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512i" name="_mm512_maskz_slli_epi16">
+	<type>Integer</type>
+	<CPUID>AVX512BW</CPUID>
+	<category>Shift</category>
+	<parameter varname="k" type="__mmask32"/>
+	<parameter varname="a" type="__m512i"/>
+	<parameter varname="imm8" type="unsigned int"/>
+	<description>Shift packed 16-bit integers in "a" left by "imm8" while shifting in zeros, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 31
+	i := j*16
+	IF k[j]
+		IF imm8[7:0] &gt; 15
+			dst[i+15:i] := 0
+		ELSE
+			dst[i+15:i] := ZeroExtend(a[i+15:i] &lt;&lt; imm8[7:0])
+		FI
+	ELSE
+		dst[i+15:i] := 0
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name="vpsllw"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512i" name="_mm512_sll_epi16">
+	<type>Integer</type>
+	<CPUID>AVX512BW</CPUID>
+	<category>Shift</category>
+	<parameter varname="a" type="__m512i"/>
+	<parameter varname="count" type="__m128i"/>
+	<description>Shift packed 16-bit integers in "a" left by "count" while shifting in zeros, and store the results in "dst". </description>
+	<operation>
+FOR j := 0 to 31
+	i := j*16
+	IF count[63:0] &gt; 15
+		dst[i+15:i] := 0
+	ELSE
+		dst[i+15:i] := ZeroExtend(a[i+15:i] &lt;&lt; count[63:0])
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name="vpsllw"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512i" name="_mm512_slli_epi16">
+	<type>Integer</type>
+	<CPUID>AVX512BW</CPUID>
+	<category>Shift</category>
+	<parameter varname="a" type="__m512i"/>
+	<parameter varname="imm8" type="unsigned int"/>
+	<description>Shift packed 16-bit integers in "a" left by "imm8" while shifting in zeros, and store the results in "dst". </description>
+	<operation>
+FOR j := 0 to 31
+	i := j*16
+	IF imm8[7:0] &gt; 15
+		dst[i+15:i] := 0
+	ELSE
+		dst[i+15:i] := ZeroExtend(a[i+15:i] &lt;&lt; imm8[7:0])
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name="vpsllw"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm_mask_sll_epi16">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512BW</CPUID>
+	<category>Shift</category>
+	<parameter varname="src" type="__m128i"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128i"/>
+	<parameter varname="count" type="__m128i"/>
+	<description>Shift packed 16-bit integers in "a" left by "count" while shifting in zeros, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 7
+	i := j*16
+	IF k[j]
+		IF count[63:0] &gt; 15
+			dst[i+15:i] := 0
+		ELSE
+			dst[i+15:i] := ZeroExtend(a[i+15:i] &lt;&lt; count[63:0])
+		FI
+	ELSE
+		dst[i+15:i] := src[i+15:i]
+	FI
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vpsllw"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm_mask_slli_epi16">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512BW</CPUID>
+	<category>Shift</category>
+	<parameter varname="src" type="__m128i"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128i"/>
+	<parameter varname="imm8" type="unsigned int"/>
+	<description>Shift packed 16-bit integers in "a" left by "imm8" while shifting in zeros, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 7
+	i := j*16
+	IF k[j]
+		IF imm8[7:0] &gt; 15
+			dst[i+15:i] := 0
+		ELSE
+			dst[i+15:i] := ZeroExtend(a[i+15:i] &lt;&lt; imm8[7:0])
+		FI
+	ELSE
+		dst[i+15:i] := src[i+15:i]
+	FI
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vpsllw"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm_maskz_sll_epi16">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512BW</CPUID>
+	<category>Shift</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128i"/>
+	<parameter varname="count" type="__m128i"/>
+	<description>Shift packed 16-bit integers in "a" left by "count" while shifting in zeros, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 7
+	i := j*16
+	IF k[j]
+		IF count[63:0] &gt; 15
+			dst[i+15:i] := 0
+		ELSE
+			dst[i+15:i] := ZeroExtend(a[i+15:i] &lt;&lt; count[63:0])
+		FI
+	ELSE
+		dst[i+15:i] := 0
+	FI
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vpsllw"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm_maskz_slli_epi16">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512BW</CPUID>
+	<category>Shift</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128i"/>
+	<parameter varname="imm8" type="unsigned int"/>
+	<description>Shift packed 16-bit integers in "a" left by "imm8" while shifting in zeros, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 7
+	i := j*16
+	IF k[j]
+		IF imm8[7:0] &gt; 15
+			dst[i+15:i] := 0
+		ELSE
+			dst[i+15:i] := ZeroExtend(a[i+15:i] &lt;&lt; imm8[7:0])
+		FI
+	ELSE
+		dst[i+15:i] := 0
+	FI
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vpsllw"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256i" name="_mm256_mask_sra_epi32">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Shift</category>
+	<parameter varname="src" type="__m256i"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m256i"/>
+	<parameter varname="count" type="__m128i"/>
+	<description>Shift packed 32-bit integers in "a" right by "count" while shifting in sign bits, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 7
+	i := j*32
+	IF k[j]
+		IF count[63:0] &gt; 31
+			dst[i+31:i] := SignBit
+		ELSE
+			dst[i+31:i] := SignExtend(a[i+31:i] &gt;&gt; count[63:0])
+		FI
+	ELSE
+		dst[i+31:i] := src[i+31:i]
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vpsrad"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256i" name="_mm256_mask_srai_epi32">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Shift</category>
+	<parameter varname="src" type="__m256i"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m256i"/>
+	<parameter varname="imm8" type="unsigned int"/>
+	<description>Shift packed 32-bit integers in "a" right by "imm8" while shifting in sign bits, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 7
+	i := j*32
+	IF k[j]
+		IF imm8[7:0] &gt; 31
+			dst[i+31:i] := SignBit
+		ELSE
+			dst[i+31:i] := SignExtend(a[i+31:i] &gt;&gt; imm8[7:0])
+		FI
+	ELSE
+		dst[i+31:i] := src[i+31:i]
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vpsrad"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256i" name="_mm256_maskz_sra_epi32">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Shift</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m256i"/>
+	<parameter varname="count" type="__m128i"/>
+	<description>Shift packed 32-bit integers in "a" right by "count" while shifting in sign bits, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 7
+	i := j*32
+	IF k[j]
+		IF count[63:0] &gt; 31
+			dst[i+31:i] := SignBit
+		ELSE
+			dst[i+31:i] := SignExtend(a[i+31:i] &gt;&gt; count[63:0])
+		FI
+	ELSE
+		dst[i+31:i] := 0
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vpsrad"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256i" name="_mm256_maskz_srai_epi32">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Shift</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m256i"/>
+	<parameter varname="imm8" type="unsigned int"/>
+	<description>Shift packed 32-bit integers in "a" right by "imm8" while shifting in sign bits, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 7
+	i := j*32
+	IF k[j]
+		IF imm8[7:0] &gt; 31
+			dst[i+31:i] := SignBit
+		ELSE
+			dst[i+31:i] := SignExtend(a[i+31:i] &gt;&gt; imm8[7:0])
+		FI
+	ELSE
+		dst[i+31:i] := 0
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vpsrad"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm_mask_sra_epi32">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Shift</category>
+	<parameter varname="src" type="__m128i"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128i"/>
+	<parameter varname="count" type="__m128i"/>
+	<description>Shift packed 32-bit integers in "a" right by "count" while shifting in sign bits, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 3
+	i := j*32
+	IF k[j]
+		IF count[63:0] &gt; 31
+			dst[i+31:i] := SignBit
+		ELSE
+			dst[i+31:i] := SignExtend(a[i+31:i] &gt;&gt; count[63:0])
+		FI
+	ELSE
+		dst[i+31:i] := src[i+31:i]
+	FI
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vpsrad"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm_mask_srai_epi32">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Shift</category>
+	<parameter varname="src" type="__m128i"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128i"/>
+	<parameter varname="imm8" type="unsigned int"/>
+	<description>Shift packed 32-bit integers in "a" right by "imm8" while shifting in sign bits, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 3
+	i := j*32
+	IF k[j]
+		IF imm8[7:0] &gt; 31
+			dst[i+31:i] := SignBit
+		ELSE
+			dst[i+31:i] := SignExtend(a[i+31:i] &gt;&gt; imm8[7:0])
+		FI
+	ELSE
+		dst[i+31:i] := src[i+31:i]
+	FI
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vpsrad"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm_maskz_sra_epi32">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Shift</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128i"/>
+	<parameter varname="count" type="__m128i"/>
+	<description>Shift packed 32-bit integers in "a" right by "count" while shifting in sign bits, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 3
+	i := j*32
+	IF k[j]
+		IF count[63:0] &gt; 31
+			dst[i+31:i] := SignBit
+		ELSE
+			dst[i+31:i] := SignExtend(a[i+31:i] &gt;&gt; count[63:0])
+		FI
+	ELSE
+		dst[i+31:i] := 0
+	FI
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vpsrad"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm_maskz_srai_epi32">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Shift</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128i"/>
+	<parameter varname="imm8" type="unsigned int"/>
+	<description>Shift packed 32-bit integers in "a" right by "imm8" while shifting in sign bits, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 3
+	i := j*32
+	IF k[j]
+		IF imm8[7:0] &gt; 31
+			dst[i+31:i] := SignBit
+		ELSE
+			dst[i+31:i] := SignExtend(a[i+31:i] &gt;&gt; imm8[7:0])
+		FI
+	ELSE
+		dst[i+31:i] := 0
+	FI
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vpsrad"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256i" name="_mm256_mask_sra_epi64">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Shift</category>
+	<parameter varname="src" type="__m256i"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m256i"/>
+	<parameter varname="count" type="__m128i"/>
+	<description>Shift packed 64-bit integers in "a" right by "count" while shifting in sign bits, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 3
+	i := j*64
+	IF k[j]
+		IF count[63:0] &gt; 63
+			dst[i+63:i] := SignBit
+		ELSE
+			dst[i+63:i] := SignExtend(a[i+63:i] &gt;&gt; count[63:0])
+		FI
+	ELSE
+		dst[i+63:i] := src[i+63:i]
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vpsraq"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256i" name="_mm256_mask_srai_epi64">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Shift</category>
+	<parameter varname="src" type="__m256i"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m256i"/>
+	<parameter varname="imm8" type="unsigned int"/>
+	<description>Shift packed 64-bit integers in "a" right by "imm8" while shifting in sign bits, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 3
+	i := j*64
+	IF k[j]
+		IF imm8[7:0] &gt; 63
+			dst[i+63:i] := SignBit
+		ELSE
+			dst[i+63:i] := SignExtend(a[i+63:i] &gt;&gt; imm8[7:0])
+		FI
+	ELSE
+		dst[i+63:i] := src[i+63:i]
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vpsraq"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256i" name="_mm256_maskz_sra_epi64">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Shift</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m256i"/>
+	<parameter varname="count" type="__m128i"/>
+	<description>Shift packed 64-bit integers in "a" right by "count" while shifting in sign bits, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 3
+	i := j*64
+	IF k[j]
+		IF count[63:0] &gt; 63
+			dst[i+63:i] := SignBit
+		ELSE
+			dst[i+63:i] := SignExtend(a[i+63:i] &gt;&gt; count[63:0])
+		FI
+	ELSE
+		dst[i+63:i] := 0
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vpsraq"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256i" name="_mm256_maskz_srai_epi64">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Shift</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m256i"/>
+	<parameter varname="imm8" type="unsigned int"/>
+	<description>Shift packed 64-bit integers in "a" right by "imm8" while shifting in sign bits, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 3
+	i := j*64
+	IF k[j]
+		IF imm8[7:0] &gt; 63
+			dst[i+63:i] := SignBit
+		ELSE
+			dst[i+63:i] := SignExtend(a[i+63:i] &gt;&gt; imm8[7:0])
+		FI
+	ELSE
+		dst[i+63:i] := 0
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vpsraq"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256i" name="_mm256_sra_epi64">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Shift</category>
+	<parameter varname="a" type="__m256i"/>
+	<parameter varname="count" type="__m128i"/>
+	<description>Shift packed 64-bit integers in "a" right by "count" while shifting in sign bits, and store the results in "dst". </description>
+	<operation>
+FOR j := 0 to 3
+	i := j*64
+	IF count[63:0] &gt; 63
+		dst[i+63:i] := SignBit
+	ELSE
+		dst[i+63:i] := SignExtend(a[i+63:i] &gt;&gt; count[63:0])
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vpsraq"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256i" name="_mm256_srai_epi64">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Shift</category>
+	<parameter varname="a" type="__m256i"/>
+	<parameter varname="imm8" type="unsigned int"/>
+	<description>Shift packed 64-bit integers in "a" right by "imm8" while shifting in sign bits, and store the results in "dst". </description>
+	<operation>
+FOR j := 0 to 3
+	i := j*64
+	IF imm8[7:0] &gt; 63
+		dst[i+63:i] := SignBit
+	ELSE
+		dst[i+63:i] := SignExtend(a[i+63:i] &gt;&gt; imm8[7:0])
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vpsraq"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm_mask_sra_epi64">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Shift</category>
+	<parameter varname="src" type="__m128i"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128i"/>
+	<parameter varname="count" type="__m128i"/>
+	<description>Shift packed 64-bit integers in "a" right by "count" while shifting in sign bits, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 1
+	i := j*64
+	IF k[j]
+		IF count[63:0] &gt; 63
+			dst[i+63:i] := SignBit
+		ELSE
+			dst[i+63:i] := SignExtend(a[i+63:i] &gt;&gt; count[63:0])
+		FI
+	ELSE
+		dst[i+63:i] := src[i+63:i]
+	FI
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vpsraq"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm_mask_srai_epi64">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Shift</category>
+	<parameter varname="src" type="__m128i"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128i"/>
+	<parameter varname="imm8" type="unsigned int"/>
+	<description>Shift packed 64-bit integers in "a" right by "imm8" while shifting in sign bits, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 1
+	i := j*64
+	IF k[j]
+		IF imm8[7:0] &gt; 63
+			dst[i+63:i] := SignBit
+		ELSE
+			dst[i+63:i] := SignExtend(a[i+63:i] &gt;&gt; imm8[7:0])
+		FI
+	ELSE
+		dst[i+63:i] := src[i+63:i]
+	FI
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vpsraq"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm_maskz_sra_epi64">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Shift</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128i"/>
+	<parameter varname="count" type="__m128i"/>
+	<description>Shift packed 64-bit integers in "a" right by "count" while shifting in sign bits, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 1
+	i := j*64
+	IF k[j]
+		IF count[63:0] &gt; 63
+			dst[i+63:i] := SignBit
+		ELSE
+			dst[i+63:i] := SignExtend(a[i+63:i] &gt;&gt; count[63:0])
+		FI
+	ELSE
+		dst[i+63:i] := 0
+	FI
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vpsraq"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm_maskz_srai_epi64">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Shift</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128i"/>
+	<parameter varname="imm8" type="unsigned int"/>
+	<description>Shift packed 64-bit integers in "a" right by "imm8" while shifting in sign bits, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 1
+	i := j*64
+	IF k[j]
+		IF imm8[7:0] &gt; 63
+			dst[i+63:i] := SignBit
+		ELSE
+			dst[i+63:i] := SignExtend(a[i+63:i] &gt;&gt; imm8[7:0])
+		FI
+	ELSE
+		dst[i+63:i] := 0
+	FI
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vpsraq"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm_sra_epi64">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Shift</category>
+	<parameter varname="a" type="__m128i"/>
+	<parameter varname="count" type="__m128i"/>
+	<description>Shift packed 64-bit integers in "a" right by "count" while shifting in sign bits, and store the results in "dst". </description>
+	<operation>
+FOR j := 0 to 1
+	i := j*64
+	IF count[63:0] &gt; 63
+		dst[i+63:i] := SignBit
+	ELSE
+		dst[i+63:i] := SignExtend(a[i+63:i] &gt;&gt; count[63:0])
+	FI
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vpsraq"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm_srai_epi64">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Shift</category>
+	<parameter varname="a" type="__m128i"/>
+	<parameter varname="imm8" type="unsigned int"/>
+	<description>Shift packed 64-bit integers in "a" right by "imm8" while shifting in sign bits, and store the results in "dst". </description>
+	<operation>
+FOR j := 0 to 1
+	i := j*64
+	IF imm8[7:0] &gt; 63
+		dst[i+63:i] := SignBit
+	ELSE
+		dst[i+63:i] := SignExtend(a[i+63:i] &gt;&gt; imm8[7:0])
+	FI
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vpsraq"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256i" name="_mm256_mask_srav_epi32">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Shift</category>
+	<parameter varname="src" type="__m256i"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m256i"/>
+	<parameter varname="count" type="__m256i"/>
+	<description>Shift packed 32-bit integers in "a" right by the amount specified by the corresponding element in "count" while shifting in sign bits, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 7
+	i := j*32
+	IF k[j]
+		dst[i+31:i] := SignExtend(a[i+31:i] &gt;&gt; count[i+31:i])
+	ELSE
+		dst[i+31:i] := src[i+31:i]
+	FI	
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vpsravd"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256i" name="_mm256_maskz_srav_epi32">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Shift</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m256i"/>
+	<parameter varname="count" type="__m256i"/>
+	<description>Shift packed 32-bit integers in "a" right by the amount specified by the corresponding element in "count" while shifting in sign bits, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 7
+	i := j*32
+	IF k[j]
+		dst[i+31:i] := SignExtend(a[i+31:i] &gt;&gt; count[i+31:i])
+	ELSE
+		dst[i+31:i] := 0
+	FI	
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vpsravd"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm_mask_srav_epi32">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Shift</category>
+	<parameter varname="src" type="__m128i"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128i"/>
+	<parameter varname="count" type="__m128i"/>
+	<description>Shift packed 32-bit integers in "a" right by the amount specified by the corresponding element in "count" while shifting in sign bits, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 3
+	i := j*32
+	IF k[j]
+		dst[i+31:i] := SignExtend(a[i+31:i] &gt;&gt; count[i+31:i])
+	ELSE
+		dst[i+31:i] := src[i+31:i]
+	FI	
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vpsravd"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm_maskz_srav_epi32">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Shift</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128i"/>
+	<parameter varname="count" type="__m128i"/>
+	<description>Shift packed 32-bit integers in "a" right by the amount specified by the corresponding element in "count" while shifting in sign bits, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 3
+	i := j*32
+	IF k[j]
+		dst[i+31:i] := SignExtend(a[i+31:i] &gt;&gt; count[i+31:i])
+	ELSE
+		dst[i+31:i] := 0
+	FI	
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vpsravd"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256i" name="_mm256_mask_srav_epi64">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Shift</category>
+	<parameter varname="src" type="__m256i"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m256i"/>
+	<parameter varname="count" type="__m256i"/>
+	<description>Shift packed 64-bit integers in "a" right by the amount specified by the corresponding element in "count" while shifting in sign bits, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 3
+	i := j*64
+	IF k[j]
+		dst[i+63:i] := SignExtend(a[i+63:i] &gt;&gt; count[i+63:i])
+	ELSE
+		dst[i+63:i] := src[i+63:i]
+	FI	
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vpsravq"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256i" name="_mm256_maskz_srav_epi64">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Shift</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m256i"/>
+	<parameter varname="count" type="__m256i"/>
+	<description>Shift packed 64-bit integers in "a" right by the amount specified by the corresponding element in "count" while shifting in sign bits, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 3
+	i := j*64
+	IF k[j]
+		dst[i+63:i] := SignExtend(a[i+63:i] &gt;&gt; count[i+63:i])
+	ELSE
+		dst[i+63:i] := 0
+	FI	
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vpsravq"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256i" name="_mm256_srav_epi64">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Shift</category>
+	<parameter varname="a" type="__m256i"/>
+	<parameter varname="count" type="__m256i"/>
+	<description>Shift packed 64-bit integers in "a" right by the amount specified by the corresponding element in "count" while shifting in sign bits, and store the results in "dst". </description>
+	<operation>
+FOR j := 0 to 3
+	i := j*64
+	dst[i+63:i] := SignExtend(a[i+63:i] &gt;&gt; count[i+63:i])
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vpsravq"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm_mask_srav_epi64">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Shift</category>
+	<parameter varname="src" type="__m128i"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128i"/>
+	<parameter varname="count" type="__m128i"/>
+	<description>Shift packed 64-bit integers in "a" right by the amount specified by the corresponding element in "count" while shifting in sign bits, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 1
+	i := j*64
+	IF k[j]
+		dst[i+63:i] := SignExtend(a[i+63:i] &gt;&gt; count[i+63:i])
+	ELSE
+		dst[i+63:i] := src[i+63:i]
+	FI	
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vpsravq"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm_maskz_srav_epi64">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Shift</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128i"/>
+	<parameter varname="count" type="__m128i"/>
+	<description>Shift packed 64-bit integers in "a" right by the amount specified by the corresponding element in "count" while shifting in sign bits, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 1
+	i := j*64
+	IF k[j]
+		dst[i+63:i] := SignExtend(a[i+63:i] &gt;&gt; count[i+63:i])
+	ELSE
+		dst[i+63:i] := 0
+	FI	
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vpsravq"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm_srav_epi64">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Shift</category>
+	<parameter varname="a" type="__m128i"/>
+	<parameter varname="count" type="__m128i"/>
+	<description>Shift packed 64-bit integers in "a" right by the amount specified by the corresponding element in "count" while shifting in sign bits, and store the results in "dst". </description>
+	<operation>
+FOR j := 0 to 1
+	i := j*64
+	dst[i+63:i] := SignExtend(a[i+63:i] &gt;&gt; count[i+63:i])
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vpsravq"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256i" name="_mm256_mask_srav_epi16">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512BW</CPUID>
+	<category>Shift</category>
+	<parameter varname="src" type="__m256i"/>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="a" type="__m256i"/>
+	<parameter varname="count" type="__m256i"/>
+	<description>Shift packed 16-bit integers in "a" right by the amount specified by the corresponding element in "count" while shifting in sign bits, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 15
+	i := j*16
+	IF k[j]
+		dst[i+15:i] := SignExtend(a[i+15:i] &gt;&gt; count[i+15:i])
+	ELSE
+		dst[i+15:i] := src[i+15:i]
+	FI	
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vpsravw"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256i" name="_mm256_maskz_srav_epi16">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512BW</CPUID>
+	<category>Shift</category>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="a" type="__m256i"/>
+	<parameter varname="count" type="__m256i"/>
+	<description>Shift packed 16-bit integers in "a" right by the amount specified by the corresponding element in "count" while shifting in sign bits, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 15
+	i := j*16
+	IF k[j]
+		dst[i+15:i] := SignExtend(a[i+15:i] &gt;&gt; count[i+15:i])
+	ELSE
+		dst[i+15:i] := 0
+	FI	
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vpsravw"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256i" name="_mm256_srav_epi16">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512BW</CPUID>
+	<category>Shift</category>
+	<parameter varname="a" type="__m256i"/>
+	<parameter varname="count" type="__m256i"/>
+	<description>Shift packed 16-bit integers in "a" right by the amount specified by the corresponding element in "count" while shifting in sign bits, and store the results in "dst". </description>
+	<operation>
+FOR j := 0 to 15
+	i := j*16
+	dst[i+15:i] := SignExtend(a[i+15:i] &gt;&gt; count[i+15:i])	
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vpsravw"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512i" name="_mm512_mask_srav_epi16">
+	<type>Integer</type>
+	<CPUID>AVX512BW</CPUID>
+	<category>Shift</category>
+	<parameter varname="src" type="__m512i"/>
+	<parameter varname="k" type="__mmask32"/>
+	<parameter varname="a" type="__m512i"/>
+	<parameter varname="count" type="__m512i"/>
+	<description>Shift packed 16-bit integers in "a" right by the amount specified by the corresponding element in "count" while shifting in sign bits, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 31
+	i := j*16
+	IF k[j]
+		dst[i+15:i] := SignExtend(a[i+15:i] &gt;&gt; count[i+15:i])
+	ELSE
+		dst[i+15:i] := src[i+15:i]
+	FI	
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name="vpsravw"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512i" name="_mm512_maskz_srav_epi16">
+	<type>Integer</type>
+	<CPUID>AVX512BW</CPUID>
+	<category>Shift</category>
+	<parameter varname="k" type="__mmask32"/>
+	<parameter varname="a" type="__m512i"/>
+	<parameter varname="count" type="__m512i"/>
+	<description>Shift packed 16-bit integers in "a" right by the amount specified by the corresponding element in "count" while shifting in sign bits, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 31
+	i := j*16
+	IF k[j]
+		dst[i+15:i] := SignExtend(a[i+15:i] &gt;&gt; count[i+15:i])
+	ELSE
+		dst[i+15:i] := 0
+	FI	
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name="vpsravw"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512i" name="_mm512_srav_epi16">
+	<type>Integer</type>
+	<CPUID>AVX512BW</CPUID>
+	<category>Shift</category>
+	<parameter varname="a" type="__m512i"/>
+	<parameter varname="count" type="__m512i"/>
+	<description>Shift packed 16-bit integers in "a" right by the amount specified by the corresponding element in "count" while shifting in sign bits, and store the results in "dst". </description>
+	<operation>
+FOR j := 0 to 31
+	i := j*16
+	dst[i+15:i] := SignExtend(a[i+15:i] &gt;&gt; count[i+15:i])	
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name="vpsravw"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm_mask_srav_epi16">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512BW</CPUID>
+	<category>Shift</category>
+	<parameter varname="src" type="__m128i"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128i"/>
+	<parameter varname="count" type="__m128i"/>
+	<description>Shift packed 16-bit integers in "a" right by the amount specified by the corresponding element in "count" while shifting in sign bits, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 7
+	i := j*16
+	IF k[j]
+		dst[i+15:i] := SignExtend(a[i+15:i] &gt;&gt; count[i+15:i])
+	ELSE
+		dst[i+15:i] := src[i+15:i]
+	FI	
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vpsravw"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm_maskz_srav_epi16">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512BW</CPUID>
+	<category>Shift</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128i"/>
+	<parameter varname="count" type="__m128i"/>
+	<description>Shift packed 16-bit integers in "a" right by the amount specified by the corresponding element in "count" while shifting in sign bits, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 7
+	i := j*16
+	IF k[j]
+		dst[i+15:i] := SignExtend(a[i+15:i] &gt;&gt; count[i+15:i])
+	ELSE
+		dst[i+15:i] := 0
+	FI	
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vpsravw"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm_srav_epi16">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512BW</CPUID>
+	<category>Shift</category>
+	<parameter varname="a" type="__m128i"/>
+	<parameter varname="count" type="__m128i"/>
+	<description>Shift packed 16-bit integers in "a" right by the amount specified by the corresponding element in "count" while shifting in sign bits, and store the results in "dst". </description>
+	<operation>
+FOR j := 0 to 7
+	i := j*16
+	dst[i+15:i] := SignExtend(a[i+15:i] &gt;&gt; count[i+15:i])	
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vpsravw"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256i" name="_mm256_mask_sra_epi16">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512BW</CPUID>
+	<category>Shift</category>
+	<parameter varname="src" type="__m256i"/>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="a" type="__m256i"/>
+	<parameter varname="count" type="__m128i"/>
+	<description>Shift packed 16-bit integers in "a" right by "count" while shifting in sign bits, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 15
+	i := j*16
+	IF k[j]
+		IF count[63:0] &gt; 15
+			dst[i+15:i] := SignBit
+		ELSE
+			dst[i+15:i] := SignExtend(a[i+15:i] &gt;&gt; count[63:0])
+		FI
+	ELSE
+		dst[i+15:i] := src[i+15:i]
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vpsraw"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256i" name="_mm256_mask_srai_epi16">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512BW</CPUID>
+	<category>Shift</category>
+	<parameter varname="src" type="__m256i"/>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="a" type="__m256i"/>
+	<parameter varname="imm8" type="unsigned int"/>
+	<description>Shift packed 16-bit integers in "a" right by "imm8" while shifting in sign bits, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 15
+	i := j*16
+	IF k[j]
+		IF imm8[7:0] &gt; 15
+			dst[i+15:i] := SignBit
+		ELSE
+			dst[i+15:i] := SignExtend(a[i+15:i] &gt;&gt; imm8[7:0])
+		FI
+	ELSE
+		dst[i+15:i] := src[i+15:i]
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vpsraw"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256i" name="_mm256_maskz_sra_epi16">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512BW</CPUID>
+	<category>Shift</category>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="a" type="__m256i"/>
+	<parameter varname="count" type="__m128i"/>
+	<description>Shift packed 16-bit integers in "a" right by "count" while shifting in sign bits, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 15
+	i := j*16
+	IF k[j]
+		IF count[63:0] &gt; 15
+			dst[i+15:i] := SignBit
+		ELSE
+			dst[i+15:i] := SignExtend(a[i+15:i] &gt;&gt; count[63:0])
+		FI
+	ELSE
+		dst[i+15:i] := 0
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vpsraw"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256i" name="_mm256_maskz_srai_epi16">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512BW</CPUID>
+	<category>Shift</category>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="a" type="__m256i"/>
+	<parameter varname="imm8" type="unsigned int"/>
+	<description>Shift packed 16-bit integers in "a" right by "imm8" while shifting in sign bits, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 15
+	i := j*16
+	IF k[j]
+		IF imm8[7:0] &gt; 15
+			dst[i+15:i] := SignBit
+		ELSE
+			dst[i+15:i] := SignExtend(a[i+15:i] &gt;&gt; imm8[7:0])
+		FI
+	ELSE
+		dst[i+15:i] := 0
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vpsraw"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512i" name="_mm512_mask_sra_epi16">
+	<type>Integer</type>
+	<CPUID>AVX512BW</CPUID>
+	<category>Shift</category>
+	<parameter varname="src" type="__m512i"/>
+	<parameter varname="k" type="__mmask32"/>
+	<parameter varname="a" type="__m512i"/>
+	<parameter varname="count" type="__m128i"/>
+	<description>Shift packed 16-bit integers in "a" right by "count" while shifting in sign bits, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 31
+	i := j*16
+	IF k[j]
+		IF count[63:0] &gt; 15
+			dst[i+15:i] := SignBit
+		ELSE
+			dst[i+15:i] := SignExtend(a[i+15:i] &gt;&gt; count[63:0])
+		FI
+	ELSE
+		dst[i+15:i] := src[i+15:i]
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name="vpsraw"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512i" name="_mm512_mask_srai_epi16">
+	<type>Integer</type>
+	<CPUID>AVX512BW</CPUID>
+	<category>Shift</category>
+	<parameter varname="src" type="__m512i"/>
+	<parameter varname="k" type="__mmask32"/>
+	<parameter varname="a" type="__m512i"/>
+	<parameter varname="imm8" type="unsigned int"/>	
+	<description>Shift packed 16-bit integers in "a" right by "imm8" while shifting in sign bits, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 31
+	i := j*16
+	IF k[j]
+		IF imm8[7:0] &gt; 15
+			dst[i+15:i] := SignBit
+		ELSE
+			dst[i+15:i] := SignExtend(a[i+15:i] &gt;&gt; imm8[7:0])
+		FI
+	ELSE
+		dst[i+15:i] := src[i+15:i]
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name="vpsraw"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512i" name="_mm512_maskz_sra_epi16">
+	<type>Integer</type>
+	<CPUID>AVX512BW</CPUID>
+	<category>Shift</category>
+	<parameter varname="k" type="__mmask32"/>
+	<parameter varname="a" type="__m512i"/>
+	<parameter varname="count" type="__m128i"/>
+	<description>Shift packed 16-bit integers in "a" right by "count" while shifting in sign bits, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 31
+	i := j*16
+	IF k[j]
+		IF count[63:0] &gt; 15
+			dst[i+15:i] := SignBit
+		ELSE
+			dst[i+15:i] := SignExtend(a[i+15:i] &gt;&gt; count[63:0])
+		FI
+	ELSE
+		dst[i+15:i] := 0
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name="vpsraw"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512i" name="_mm512_maskz_srai_epi16">
+	<type>Integer</type>
+	<CPUID>AVX512BW</CPUID>
+	<category>Shift</category>
+	<parameter varname="k" type="__mmask32"/>
+	<parameter varname="a" type="__m512i"/>
+	<parameter varname="imm8" type="unsigned int"/>
+	<description>Shift packed 16-bit integers in "a" right by "imm8" while shifting in sign bits, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 31
+	i := j*16
+	IF k[j]
+		IF imm8[7:0] &gt; 15
+			dst[i+15:i] := SignBit
+		ELSE
+			dst[i+15:i] := SignExtend(a[i+15:i] &gt;&gt; imm8[7:0])
+		FI
+	ELSE
+		dst[i+15:i] := 0
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name="vpsraw"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512i" name="_mm512_sra_epi16">
+	<type>Integer</type>
+	<CPUID>AVX512BW</CPUID>
+	<category>Shift</category>
+	<parameter varname="a" type="__m512i"/>
+	<parameter varname="count" type="__m128i"/>
+	<description>Shift packed 16-bit integers in "a" right by "count" while shifting in sign bits, and store the results in "dst". </description>
+	<operation>
+FOR j := 0 to 31
+	i := j*16
+	IF count[63:0] &gt; 15
+		dst[i+15:i] := SignBit
+	ELSE
+		dst[i+15:i] := SignExtend(a[i+15:i] &gt;&gt; count[63:0])
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name="vpsraw"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512i" name="_mm512_srai_epi16">
+	<type>Integer</type>
+	<CPUID>AVX512BW</CPUID>
+	<category>Shift</category>
+	<parameter varname="a" type="__m512i"/>
+	<parameter varname="imm8" type="unsigned int"/>
+	<description>Shift packed 16-bit integers in "a" right by "imm8" while shifting in sign bits, and store the results in "dst". </description>
+	<operation>
+FOR j := 0 to 31
+	i := j*16
+	IF imm8[7:0] &gt; 15
+		dst[i+15:i] := SignBit
+	ELSE
+		dst[i+15:i] := SignExtend(a[i+15:i] &gt;&gt; imm8[7:0])
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name="vpsraw"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm_mask_sra_epi16">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512BW</CPUID>
+	<category>Shift</category>
+	<parameter varname="src" type="__m128i"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128i"/>
+	<parameter varname="count" type="__m128i"/>
+	<description>Shift packed 16-bit integers in "a" right by "count" while shifting in sign bits, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 7
+	i := j*16
+	IF k[j]
+		IF count[63:0] &gt; 15
+			dst[i+15:i] := SignBit
+		ELSE
+			dst[i+15:i] := SignExtend(a[i+15:i] &gt;&gt; count[63:0])
+		FI
+	ELSE
+		dst[i+15:i] := src[i+15:i]
+	FI
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vpsraw"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm_mask_srai_epi16">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512BW</CPUID>
+	<category>Shift</category>
+	<parameter varname="src" type="__m128i"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128i"/>
+	<parameter varname="imm8" type="unsigned int"/>
+	<description>Shift packed 16-bit integers in "a" right by "imm8" while shifting in sign bits, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 7
+	i := j*16
+	IF k[j]
+		IF imm8[7:0] &gt; 15
+			dst[i+15:i] := SignBit
+		ELSE
+			dst[i+15:i] := SignExtend(a[i+15:i] &gt;&gt; imm8[7:0])
+		FI
+	ELSE
+		dst[i+15:i] := src[i+15:i]
+	FI
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vpsraw"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm_maskz_sra_epi16">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512BW</CPUID>
+	<category>Shift</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128i"/>
+	<parameter varname="count" type="__m128i"/>
+	<description>Shift packed 16-bit integers in "a" right by "count" while shifting in sign bits, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 7
+	i := j*16
+	IF k[j]
+		IF count[63:0] &gt; 15
+			dst[i+15:i] := SignBit
+		ELSE
+			dst[i+15:i] := SignExtend(a[i+15:i] &gt;&gt; count[63:0])
+		FI
+	ELSE
+		dst[i+15:i] := 0
+	FI
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vpsraw"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm_maskz_srai_epi16">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512BW</CPUID>
+	<category>Shift</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128i"/>
+	<parameter varname="imm8" type="unsigned int"/>
+	<description>Shift packed 16-bit integers in "a" right by "imm8" while shifting in sign bits, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 7
+	i := j*16
+	IF k[j]
+		IF imm8[7:0] &gt; 15
+			dst[i+15:i] := SignBit
+		ELSE
+			dst[i+15:i] := SignExtend(a[i+15:i] &gt;&gt; imm8[7:0])
+		FI
+	ELSE
+		dst[i+15:i] := 0
+	FI
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vpsraw"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256i" name="_mm256_mask_srl_epi32">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Shift</category>
+	<parameter varname="src" type="__m256i"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m256i"/>
+	<parameter varname="count" type="__m128i"/>
+	<description>Shift packed 32-bit integers in "a" right by "count" while shifting in zeros, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 7
+	i := j*32
+	IF k[j]
+		IF count[63:0] &gt; 31
+			dst[i+31:i] := 0
+		ELSE
+			dst[i+31:i] := ZeroExtend(a[i+31:i] &gt;&gt; count[63:0])
+		FI
+	ELSE
+		dst[i+31:i] := src[i+31:i]
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vpsrld"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256i" name="_mm256_mask_srli_epi32">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Shift</category>
+	<parameter varname="src" type="__m256i"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m256i"/>
+	<parameter varname="imm8" type="unsigned int"/>
+	<description>Shift packed 32-bit integers in "a" right by "imm8" while shifting in zeros, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 7
+	i := j*32
+	IF k[j]
+		IF imm8[7:0] &gt; 31
+			dst[i+31:i] := 0
+		ELSE
+			dst[i+31:i] := ZeroExtend(a[i+31:i] &gt;&gt; imm8[7:0])
+		FI
+	ELSE
+		dst[i+31:i] := src[i+31:i]
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vpsrld"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256i" name="_mm256_maskz_srl_epi32">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Shift</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m256i"/>
+	<parameter varname="count" type="__m128i"/>
+	<description>Shift packed 32-bit integers in "a" right by "count" while shifting in zeros, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 7
+	i := j*32
+	IF k[j]
+		IF count[63:0] &gt; 31
+			dst[i+31:i] := 0
+		ELSE
+			dst[i+31:i] := ZeroExtend(a[i+31:i] &gt;&gt; count[63:0])
+		FI
+	ELSE
+		dst[i+31:i] := 0
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vpsrld"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256i" name="_mm256_maskz_srli_epi32">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Shift</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m256i"/>
+	<parameter varname="imm8" type="unsigned int"/>
+	<description>Shift packed 32-bit integers in "a" right by "imm8" while shifting in zeros, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*32
+	IF k[j]
+		IF imm8[7:0] &gt; 31
+			dst[i+31:i] := 0
+		ELSE
+			dst[i+31:i] := ZeroExtend(a[i+31:i] &gt;&gt; imm8[7:0])
+		FI
+	ELSE
+		dst[i+31:i] := 0
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vpsrld"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm_mask_srl_epi32">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Shift</category>
+	<parameter varname="src" type="__m128i"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128i"/>
+	<parameter varname="count" type="__m128i"/>
+	<description>Shift packed 32-bit integers in "a" right by "count" while shifting in zeros, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 3
+	i := j*32
+	IF k[j]
+		IF count[63:0] &gt; 31
+			dst[i+31:i] := 0
+		ELSE
+			dst[i+31:i] := ZeroExtend(a[i+31:i] &gt;&gt; count[63:0])
+		FI
+	ELSE
+		dst[i+31:i] := src[i+31:i]
+	FI
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vpsrld"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm_mask_srli_epi32">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Shift</category>
+	<parameter varname="src" type="__m128i"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128i"/>
+	<parameter varname="imm8" type="unsigned int"/>
+	<description>Shift packed 32-bit integers in "a" right by "imm8" while shifting in zeros, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 3
+	i := j*32
+	IF k[j]
+		IF imm8[7:0] &gt; 31
+			dst[i+31:i] := 0
+		ELSE
+			dst[i+31:i] := ZeroExtend(a[i+31:i] &gt;&gt; imm8[7:0])
+		FI
+	ELSE
+		dst[i+31:i] := src[i+31:i]
+	FI
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vpsrld"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm_maskz_srl_epi32">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Shift</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128i"/>
+	<parameter varname="count" type="__m128i"/>
+	<description>Shift packed 32-bit integers in "a" right by "count" while shifting in zeros, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 3
+	i := j*32
+	IF k[j]
+		IF count[63:0] &gt; 31
+			dst[i+31:i] := 0
+		ELSE
+			dst[i+31:i] := ZeroExtend(a[i+31:i] &gt;&gt; count[63:0])
+		FI
+	ELSE
+		dst[i+31:i] := 0
+	FI
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vpsrld"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm_maskz_srli_epi32">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Shift</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128i"/>
+	<parameter varname="imm8" type="unsigned int"/>
+	<description>Shift packed 32-bit integers in "a" right by "imm8" while shifting in zeros, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 3
+	i := j*32
+	IF k[j]
+		IF imm8[7:0] &gt; 31
+			dst[i+31:i] := 0
+		ELSE
+			dst[i+31:i] := ZeroExtend(a[i+31:i] &gt;&gt; imm8[7:0])
+		FI
+	ELSE
+		dst[i+31:i] := 0
+	FI
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vpsrld"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512i" name="_mm512_bsrli_epi128">
+	<type>Integer</type>
+	<CPUID>AVX512BW</CPUID>
+	<category>Shift</category>
+	<parameter varname="a" type="__m512i"/>
+	<parameter varname="imm8" type="int"/>
+	<description>Shift 128-bit lanes in "a" right by "imm8" bytes while shifting in zeros, and store the results in "dst".</description>
+	<operation>
+tmp := imm8[7:0]
+IF tmp &gt; 15
+	tmp := 16
+FI
+dst[127:0] := a[127:0] &gt;&gt; (tmp*8)
+dst[255:128] := a[255:128] &gt;&gt; (tmp*8)
+dst[383:256] := a[383:256] &gt;&gt; (tmp*8)
+dst[511:384] := a[511:384] &gt;&gt; (tmp*8)
+dst[MAX:512] := 0
+	</operation>
+	<instruction name="vpsrldq"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256i" name="_mm256_mask_srl_epi64">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Shift</category>
+	<parameter varname="src" type="__m256i"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m256i"/>
+	<parameter varname="count" type="__m128i"/>
+	<description>Shift packed 64-bit integers in "a" right by "count" while shifting in zeros, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 3
+	i := j*64
+	IF k[j]
+		IF count[63:0] &gt; 63
+			dst[i+63:i] := 0
+		ELSE
+			dst[i+63:i] := ZeroExtend(a[i+63:i] &gt;&gt; count[63:0])
+		FI
+	ELSE
+		dst[i+63:i] := src[i+63:i]
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vpsrlq"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256i" name="_mm256_mask_srli_epi64">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Shift</category>
+	<parameter varname="src" type="__m256i"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m256i"/>
+	<parameter varname="imm8" type="unsigned int"/>
+	<description>Shift packed 64-bit integers in "a" right by "imm8" while shifting in zeros, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 3
+	i := j*64
+	IF k[j]
+		IF imm8[7:0] &gt; 63
+			dst[i+63:i] := 0
+		ELSE
+			dst[i+63:i] := ZeroExtend(a[i+63:i] &gt;&gt; imm8[7:0])
+		FI
+	ELSE
+		dst[i+63:i] := src[i+63:i]
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vpsrlq"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256i" name="_mm256_maskz_srl_epi64">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Shift</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m256i"/>
+	<parameter varname="count" type="__m128i"/>
+	<description>Shift packed 64-bit integers in "a" right by "count" while shifting in zeros, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 3
+	i := j*64
+	IF k[j]
+		IF count[63:0] &gt; 63
+			dst[i+63:i] := 0
+		ELSE
+			dst[i+63:i] := ZeroExtend(a[i+63:i] &gt;&gt; count[63:0])
+		FI
+	ELSE
+		dst[i+63:i] := 0
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vpsrlq"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256i" name="_mm256_maskz_srli_epi64">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Shift</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m256i"/>
+	<parameter varname="imm8" type="unsigned int"/>
+	<description>Shift packed 64-bit integers in "a" right by "imm8" while shifting in zeros, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 3
+	i := j*64
+	IF k[j]
+		IF imm8[7:0] &gt; 63
+			dst[i+63:i] := 0
+		ELSE
+			dst[i+63:i] := ZeroExtend(a[i+63:i] &gt;&gt; imm8[7:0])
+		FI
+	ELSE
+		dst[i+63:i] := 0
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vpsrlq"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm_mask_srl_epi64">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Shift</category>
+	<parameter varname="src" type="__m128i"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128i"/>
+	<parameter varname="count" type="__m128i"/>
+	<description>Shift packed 64-bit integers in "a" right by "count" while shifting in zeros, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 1
+	i := j*64
+	IF k[j]
+		IF count[63:0] &gt; 63
+			dst[i+63:i] := 0
+		ELSE
+			dst[i+63:i] := ZeroExtend(a[i+63:i] &gt;&gt; count[63:0])
+		FI
+	ELSE
+		dst[i+63:i] := src[i+63:i]
+	FI
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vpsrlq"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm_mask_srli_epi64">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Shift</category>
+	<parameter varname="src" type="__m128i"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128i"/>
+	<parameter varname="imm8" type="unsigned int"/>
+	<description>Shift packed 64-bit integers in "a" right by "imm8" while shifting in zeros, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 1
+	i := j*64
+	IF k[j]
+		IF imm8[7:0] &gt; 63
+			dst[i+63:i] := 0
+		ELSE
+			dst[i+63:i] := ZeroExtend(a[i+63:i] &gt;&gt; imm8[7:0])
+		FI
+	ELSE
+		dst[i+63:i] := src[i+63:i]
+	FI
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vpsrlq"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm_maskz_srl_epi64">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Shift</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128i"/>
+	<parameter varname="count" type="__m128i"/>
+	<description>Shift packed 64-bit integers in "a" right by "count" while shifting in zeros, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 1
+	i := j*64
+	IF k[j]
+		IF count[63:0] &gt; 63
+			dst[i+63:i] := 0
+		ELSE
+			dst[i+63:i] := ZeroExtend(a[i+63:i] &gt;&gt; count[63:0])
+		FI
+	ELSE
+		dst[i+63:i] := 0
+	FI
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vpsrlq"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm_maskz_srli_epi64">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Shift</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128i"/>
+	<parameter varname="imm8" type="unsigned int"/>
+	<description>Shift packed 64-bit integers in "a" right by "imm8" while shifting in zeros, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 1
+	i := j*64
+	IF k[j]
+		IF imm8[7:0] &gt; 63
+			dst[i+63:i] := 0
+		ELSE
+			dst[i+63:i] := ZeroExtend(a[i+63:i] &gt;&gt; imm8[7:0])
+		FI
+	ELSE
+		dst[i+63:i] := 0
+	FI
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vpsrlq"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256i" name="_mm256_mask_srlv_epi32">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Shift</category>
+	<parameter varname="src" type="__m256i"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m256i"/>
+	<parameter varname="count" type="__m256i"/>
+	<description>Shift packed 32-bit integers in "a" right by the amount specified by the corresponding element in "count" while shifting in zeros, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 7
+	i := j*32
+	IF k[j]
+		dst[i+31:i] := ZeroExtend(a[i+31:i] &gt;&gt; count[i+31:i])
+	ELSE
+		dst[i+31:i] := src[i+31:i]
+	FI	
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vpsrlvd"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256i" name="_mm256_maskz_srlv_epi32">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Shift</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m256i"/>
+	<parameter varname="count" type="__m256i"/>
+	<description>Shift packed 32-bit integers in "a" right by the amount specified by the corresponding element in "count" while shifting in zeros, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 7
+	i := j*32
+	IF k[j]
+		dst[i+31:i] := ZeroExtend(a[i+31:i] &gt;&gt; count[i+31:i])
+	ELSE
+		dst[i+31:i] := 0
+	FI	
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vpsrlvd"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm_mask_srlv_epi32">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Shift</category>
+	<parameter varname="src" type="__m128i"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128i"/>
+	<parameter varname="count" type="__m128i"/>
+	<description>Shift packed 32-bit integers in "a" right by the amount specified by the corresponding element in "count" while shifting in zeros, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 3
+	i := j*32
+	IF k[j]
+		dst[i+31:i] := ZeroExtend(a[i+31:i] &gt;&gt; count[i+31:i])
+	ELSE
+		dst[i+31:i] := src[i+31:i]
+	FI	
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vpsrlvd"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm_maskz_srlv_epi32">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Shift</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128i"/>
+	<parameter varname="count" type="__m128i"/>
+	<description>Shift packed 32-bit integers in "a" right by the amount specified by the corresponding element in "count" while shifting in zeros, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 3
+	i := j*32
+	IF k[j]
+		dst[i+31:i] := ZeroExtend(a[i+31:i] &gt;&gt; count[i+31:i])
+	ELSE
+		dst[i+31:i] := 0
+	FI	
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vpsrlvd"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256i" name="_mm256_mask_srlv_epi64">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Shift</category>
+	<parameter varname="src" type="__m256i"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m256i"/>
+	<parameter varname="count" type="__m256i"/>
+	<description>Shift packed 64-bit integers in "a" right by the amount specified by the corresponding element in "count" while shifting in zeros, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 3
+	i := j*64
+	IF k[j]
+		dst[i+63:i] := ZeroExtend(a[i+63:i] &gt;&gt; count[i+63:i])
+	ELSE
+		dst[i+63:i] := src[i+63:i]
+	FI	
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vpsrlvq"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256i" name="_mm256_maskz_srlv_epi64">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Shift</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m256i"/>
+	<parameter varname="count" type="__m256i"/>
+	<description>Shift packed 64-bit integers in "a" right by the amount specified by the corresponding element in "count" while shifting in zeros, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 3
+	i := j*64
+	IF k[j]
+		dst[i+63:i] := ZeroExtend(a[i+63:i] &gt;&gt; count[i+63:i])
+	ELSE
+		dst[i+63:i] := 0
+	FI	
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vpsrlvq"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm_mask_srlv_epi64">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Shift</category>
+	<parameter varname="src" type="__m128i"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128i"/>
+	<parameter varname="count" type="__m128i"/>
+	<description>Shift packed 64-bit integers in "a" right by the amount specified by the corresponding element in "count" while shifting in zeros, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 1
+	i := j*64
+	IF k[j]
+		dst[i+63:i] := ZeroExtend(a[i+63:i] &gt;&gt; count[i+63:i])
+	ELSE
+		dst[i+63:i] := src[i+63:i]
+	FI	
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vpsrlvq"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm_maskz_srlv_epi64">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Shift</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128i"/>
+	<parameter varname="count" type="__m128i"/>
+	<description>Shift packed 64-bit integers in "a" right by the amount specified by the corresponding element in "count" while shifting in zeros, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 1
+	i := j*64
+	IF k[j]
+		dst[i+63:i] := ZeroExtend(a[i+63:i] &gt;&gt; count[i+63:i])
+	ELSE
+		dst[i+63:i] := 0
+	FI	
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vpsrlvq"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256i" name="_mm256_mask_srlv_epi16">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512BW</CPUID>
+	<category>Shift</category>
+	<parameter varname="src" type="__m256i"/>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="a" type="__m256i"/>
+	<parameter varname="count" type="__m256i"/>
+	<description>Shift packed 16-bit integers in "a" right by the amount specified by the corresponding element in "count" while shifting in zeros, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 15
+	i := j*16
+	IF k[j]
+		dst[i+15:i] := ZeroExtend(a[i+15:i] &gt;&gt; count[i+15:i])
+	ELSE
+		dst[i+15:i] := src[i+15:i]
+	FI	
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vpsrlvw"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256i" name="_mm256_maskz_srlv_epi16">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512BW</CPUID>
+	<category>Shift</category>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="a" type="__m256i"/>
+	<parameter varname="count" type="__m256i"/>
+	<description>Shift packed 16-bit integers in "a" right by the amount specified by the corresponding element in "count" while shifting in zeros, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 15
+	i := j*16
+	IF k[j]
+		dst[i+15:i] := ZeroExtend(a[i+15:i] &gt;&gt; count[i+15:i])
+	ELSE
+		dst[i+15:i] := 0
+	FI	
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vpsrlvw"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256i" name="_mm256_srlv_epi16">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512BW</CPUID>
+	<category>Shift</category>
+	<parameter varname="a" type="__m256i"/>
+	<parameter varname="count" type="__m256i"/>
+	<description>Shift packed 16-bit integers in "a" right by the amount specified by the corresponding element in "count" while shifting in zeros, and store the results in "dst". </description>
+	<operation>
+FOR j := 0 to 15
+	i := j*16
+	IF count[i+15:i] &lt; 16
+		dst[i+15:i] := ZeroExtend(a[i+15:i] &gt;&gt; count[i+15:i])
+	ELSE
+		dst[i+15:i] := 0
+	FI	
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vpsrlvw"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512i" name="_mm512_mask_srlv_epi16">
+	<type>Integer</type>
+	<CPUID>AVX512BW</CPUID>
+	<category>Shift</category>
+	<parameter varname="src" type="__m512i"/>
+	<parameter varname="k" type="__mmask32"/>
+	<parameter varname="a" type="__m512i"/>
+	<parameter varname="count" type="__m512i"/>
+	<description>Shift packed 16-bit integers in "a" right by the amount specified by the corresponding element in "count" while shifting in zeros, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 31
+	i := j*16
+	IF k[j]
+		dst[i+15:i] := ZeroExtend(a[i+15:i] &gt;&gt; count[i+15:i])
+	ELSE
+		dst[i+15:i] := src[i+15:i]
+	FI	
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name="vpsrlvw"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512i" name="_mm512_maskz_srlv_epi16">
+	<type>Integer</type>
+	<CPUID>AVX512BW</CPUID>
+	<category>Shift</category>
+	<parameter varname="k" type="__mmask32"/>
+	<parameter varname="a" type="__m512i"/>
+	<parameter varname="count" type="__m512i"/>
+	<description>Shift packed 16-bit integers in "a" right by the amount specified by the corresponding element in "count" while shifting in zeros, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 31
+	i := j*16
+	IF k[j]
+		dst[i+15:i] := ZeroExtend(a[i+15:i] &gt;&gt; count[i+15:i])
+	ELSE
+		dst[i+15:i] := 0
+	FI	
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name="vpsrlvw"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512i" name="_mm512_srlv_epi16">
+	<type>Integer</type>
+	<CPUID>AVX512BW</CPUID>
+	<category>Shift</category>
+	<parameter varname="a" type="__m512i"/>
+	<parameter varname="count" type="__m512i"/>
+	<description>Shift packed 16-bit integers in "a" right by the amount specified by the corresponding element in "count" while shifting in zeros, and store the results in "dst". </description>
+	<operation>
+FOR j := 0 to 31
+	i := j*16
+	IF count[i+15:i] &lt; 16
+		dst[i+15:i] := ZeroExtend(a[i+15:i] &gt;&gt; count[i+15:i])
+	ELSE
+		dst[i+15:i] := 0
+	FI	
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name="vpsrlvw"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm_mask_srlv_epi16">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512BW</CPUID>
+	<category>Shift</category>
+	<parameter varname="src" type="__m128i"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128i"/>
+	<parameter varname="count" type="__m128i"/>
+	<description>Shift packed 16-bit integers in "a" right by the amount specified by the corresponding element in "count" while shifting in zeros, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 7
+	i := j*16
+	IF k[j]
+		dst[i+15:i] := ZeroExtend(a[i+15:i] &gt;&gt; count[i+15:i])
+	ELSE
+		dst[i+15:i] := src[i+15:i]
+	FI	
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vpsrlvw"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm_maskz_srlv_epi16">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512BW</CPUID>
+	<category>Shift</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128i"/>
+	<parameter varname="count" type="__m128i"/>
+	<description>Shift packed 16-bit integers in "a" right by the amount specified by the corresponding element in "count" while shifting in zeros, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 7
+	i := j*16
+	IF k[j]
+		dst[i+15:i] := ZeroExtend(a[i+15:i] &gt;&gt; count[i+15:i])
+	ELSE
+		dst[i+15:i] := 0
+	FI	
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vpsrlvw"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm_srlv_epi16">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512BW</CPUID>
+	<category>Shift</category>
+	<parameter varname="a" type="__m128i"/>
+	<parameter varname="count" type="__m128i"/>
+	<description>Shift packed 16-bit integers in "a" right by the amount specified by the corresponding element in "count" while shifting in zeros, and store the results in "dst". </description>
+	<operation>
+FOR j := 0 to 7
+	i := j*16
+	IF count[i+15:i] &lt; 16
+		dst[i+15:i] := ZeroExtend(a[i+15:i] &gt;&gt; count[i+15:i])
+	ELSE
+		dst[i+15:i] := 0
+	FI	
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vpsrlvw"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256i" name="_mm256_mask_srl_epi16">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512BW</CPUID>
+	<category>Shift</category>
+	<parameter varname="src" type="__m256i"/>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="a" type="__m256i"/>
+	<parameter varname="count" type="__m128i"/>
+	<description>Shift packed 16-bit integers in "a" right by "count" while shifting in zeros, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 15
+	i := j*16
+	IF k[j]
+		IF count[63:0] &gt; 15
+			dst[i+15:i] := 0
+		ELSE
+			dst[i+15:i] := ZeroExtend(a[i+15:i] &gt;&gt; count[63:0])
+		FI
+	ELSE
+		dst[i+15:i] := src[i+15:i]
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vpsrlw"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256i" name="_mm256_mask_srli_epi16">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512BW</CPUID>
+	<category>Shift</category>
+	<parameter varname="src" type="__m256i"/>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="a" type="__m256i"/>
+	<parameter varname="imm8" type="int"/>
+	<description>Shift packed 16-bit integers in "a" right by "imm8" while shifting in zeros, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 15
+	i := j*16
+	IF k[j]
+		IF imm8[7:0] &gt; 15
+			dst[i+15:i] := 0
+		ELSE
+			dst[i+15:i] := ZeroExtend(a[i+15:i] &gt;&gt; imm8[7:0])
+		FI
+	ELSE
+		dst[i+15:i] := src[i+15:i]
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vpsrlw"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256i" name="_mm256_maskz_srl_epi16">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512BW</CPUID>
+	<category>Shift</category>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="a" type="__m256i"/>
+	<parameter varname="count" type="__m128i"/>
+	<description>Shift packed 16-bit integers in "a" right by "count" while shifting in zeros, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 15
+	i := j*16
+	IF k[j]
+		IF count[63:0] &gt; 15
+			dst[i+15:i] := 0
+		ELSE
+			dst[i+15:i] := ZeroExtend(a[i+15:i] &gt;&gt; count[63:0])
+		FI
+	ELSE
+		dst[i+15:i] := 0
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vpsrlw"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256i" name="_mm256_maskz_srli_epi16">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512BW</CPUID>
+	<category>Shift</category>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="a" type="__m256i"/>
+	<parameter varname="imm8" type="int"/>
+	<description>Shift packed 16-bit integers in "a" right by "imm8" while shifting in zeros, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 15
+	i := j*16
+	IF k[j]
+		IF imm8[7:0] &gt; 15
+			dst[i+15:i] := 0
+		ELSE
+			dst[i+15:i] := ZeroExtend(a[i+15:i] &gt;&gt; imm8[7:0])
+		FI
+	ELSE
+		dst[i+15:i] := 0
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vpsrlw"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512i" name="_mm512_mask_srl_epi16">
+	<type>Integer</type>
+	<CPUID>AVX512BW</CPUID>
+	<category>Shift</category>
+	<parameter varname="src" type="__m512i"/>
+	<parameter varname="k" type="__mmask32"/>
+	<parameter varname="a" type="__m512i"/>
+	<parameter varname="count" type="__m128i"/>
+	<description>Shift packed 16-bit integers in "a" right by "count" while shifting in zeros, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 31
+	i := j*16
+	IF k[j]
+		IF count[63:0] &gt; 15
+			dst[i+15:i] := 0
+		ELSE
+			dst[i+15:i] := ZeroExtend(a[i+15:i] &gt;&gt; count[63:0])
+		FI
+	ELSE
+		dst[i+15:i] := src[i+15:i]
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name="vpsrlw"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512i" name="_mm512_mask_srli_epi16">
+	<type>Integer</type>
+	<CPUID>AVX512BW</CPUID>
+	<category>Shift</category>
+	<parameter varname="src" type="__m512i"/>
+	<parameter varname="k" type="__mmask32"/>
+	<parameter varname="a" type="__m512i"/>
+	<parameter varname="imm8" type="unsigned int"/>
+	<description>Shift packed 16-bit integers in "a" right by "imm8" while shifting in zeros, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 31
+	i := j*16
+	IF k[j]
+		IF imm8[7:0] &gt; 15
+			dst[i+15:i] := 0
+		ELSE
+			dst[i+15:i] := ZeroExtend(a[i+15:i] &gt;&gt; imm8[7:0])
+		FI
+	ELSE
+		dst[i+15:i] := src[i+15:i]
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name="vpsrlw"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512i" name="_mm512_maskz_srl_epi16">
+	<type>Integer</type>
+	<CPUID>AVX512BW</CPUID>
+	<category>Shift</category>
+	<parameter varname="k" type="__mmask32"/>
+	<parameter varname="a" type="__m512i"/>
+	<parameter varname="count" type="__m128i"/>
+	<description>Shift packed 16-bit integers in "a" right by "count" while shifting in zeros, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 31
+	i := j*16
+	IF k[j]
+		IF count[63:0] &gt; 15
+			dst[i+15:i] := 0
+		ELSE
+			dst[i+15:i] := ZeroExtend(a[i+15:i] &gt;&gt; count[63:0])
+		FI
+	ELSE
+		dst[i+15:i] := 0
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name="vpsrlw"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512i" name="_mm512_maskz_srli_epi16">
+	<type>Integer</type>
+	<CPUID>AVX512BW</CPUID>
+	<category>Shift</category>
+	<parameter varname="k" type="__mmask32"/>
+	<parameter varname="a" type="__m512i"/>
+	<parameter varname="imm8" type="int"/>
+	<description>Shift packed 16-bit integers in "a" right by "imm8" while shifting in zeros, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 31
+	i := j*16
+	IF k[j]
+		IF imm8[7:0] &gt; 15
+			dst[i+15:i] := 0
+		ELSE
+			dst[i+15:i] := ZeroExtend(a[i+15:i] &gt;&gt; imm8[7:0])
+		FI
+	ELSE
+		dst[i+15:i] := 0
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name="vpsrlw"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512i" name="_mm512_srl_epi16">
+	<type>Integer</type>
+	<CPUID>AVX512BW</CPUID>
+	<category>Shift</category>
+	<parameter varname="a" type="__m512i"/>
+	<parameter varname="count" type="__m128i"/>
+	<description>Shift packed 16-bit integers in "a" right by "count" while shifting in zeros, and store the results in "dst". </description>
+	<operation>
+FOR j := 0 to 31
+	i := j*16
+	IF count[63:0] &gt; 15
+		dst[i+15:i] := 0
+	ELSE
+		dst[i+15:i] := ZeroExtend(a[i+15:i] &gt;&gt; count[63:0])
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name="vpsrlw"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512i" name="_mm512_srli_epi16">
+	<type>Integer</type>
+	<CPUID>AVX512BW</CPUID>
+	<category>Shift</category>
+	<parameter varname="a" type="__m512i"/>
+	<parameter varname="imm8" type="unsigned int"/>
+	<description>Shift packed 16-bit integers in "a" right by "imm8" while shifting in zeros, and store the results in "dst". </description>
+	<operation>
+FOR j := 0 to 31
+	i := j*16
+	IF imm8[7:0] &gt; 15
+		dst[i+15:i] := 0
+	ELSE
+		dst[i+15:i] := ZeroExtend(a[i+15:i] &gt;&gt; imm8[7:0])
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name="vpsrlw"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm_mask_srl_epi16">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512BW</CPUID>
+	<category>Shift</category>
+	<parameter varname="src" type="__m128i"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128i"/>
+	<parameter varname="count" type="__m128i"/>
+	<description>Shift packed 16-bit integers in "a" right by "count" while shifting in zeros, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 7
+	i := j*16
+	IF k[j]
+		IF count[63:0] &gt; 15
+			dst[i+15:i] := 0
+		ELSE
+			dst[i+15:i] := ZeroExtend(a[i+15:i] &gt;&gt; count[63:0])
+		FI
+	ELSE
+		dst[i+15:i] := src[i+15:i]
+	FI
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vpsrlw"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm_mask_srli_epi16">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512BW</CPUID>
+	<category>Shift</category>
+	<parameter varname="src" type="__m128i"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128i"/>
+	<parameter varname="imm8" type="int"/>
+	<description>Shift packed 16-bit integers in "a" right by "imm8" while shifting in zeros, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 7
+	i := j*16
+	IF k[j]
+		IF imm8[7:0] &gt; 15
+			dst[i+15:i] := 0
+		ELSE
+			dst[i+15:i] := ZeroExtend(a[i+15:i] &gt;&gt; imm8[7:0])
+		FI
+	ELSE
+		dst[i+15:i] := src[i+15:i]
+	FI
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vpsrlw"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm_maskz_srl_epi16">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512BW</CPUID>
+	<category>Shift</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128i"/>
+	<parameter varname="count" type="__m128i"/>
+	<description>Shift packed 16-bit integers in "a" right by "count" while shifting in zeros, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 7
+	i := j*16
+	IF k[j]
+		IF count[63:0] &gt; 15
+			dst[i+15:i] := 0
+		ELSE
+			dst[i+15:i] := ZeroExtend(a[i+15:i] &gt;&gt; count[63:0])
+		FI
+	ELSE
+		dst[i+15:i] := 0
+	FI
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vpsrlw"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm_maskz_srli_epi16">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512BW</CPUID>
+	<category>Shift</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128i"/>
+	<parameter varname="imm8" type="int"/>
+	<description>Shift packed 16-bit integers in "a" right by "imm8" while shifting in zeros, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 7
+	i := j*16
+	IF k[j]
+		IF imm8[7:0] &gt; 15
+			dst[i+15:i] := 0
+		ELSE
+			dst[i+15:i] := ZeroExtend(a[i+15:i] &gt;&gt; imm8[7:0])
+		FI
+	ELSE
+		dst[i+15:i] := 0
+	FI
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vpsrlw"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256i" name="_mm256_mask_sub_epi8">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512BW</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="src" type="__m256i"/>
+	<parameter varname="k" type="__mmask32"/>
+	<parameter varname="a" type="__m256i"/>
+	<parameter varname="b" type="__m256i"/>
+	<description>Subtract packed 8-bit integers in "b" from packed 8-bit integers in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 31
+	i := j*8
+	IF k[j]
+		dst[i+7:i] := a[i+7:i] - b[i+7:i]
+	ELSE
+		dst[i+7:i] := src[i+7:i]
+	FI	
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vpsubb"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256i" name="_mm256_maskz_sub_epi8">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512BW</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="k" type="__mmask32"/>
+	<parameter varname="a" type="__m256i"/>
+	<parameter varname="b" type="__m256i"/>
+	<description>Subtract packed 8-bit integers in "b" from packed 8-bit integers in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 31
+	i := j*8
+	IF k[j]
+		dst[i+7:i] := a[i+7:i] - b[i+7:i]
+	ELSE
+		dst[i+7:i] := 0
+	FI	
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vpsubb"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512i" name="_mm512_mask_sub_epi8">
+	<type>Integer</type>
+	<CPUID>AVX512BW</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="src" type="__m512i"/>
+	<parameter varname="k" type="__mmask64"/>
+	<parameter varname="a" type="__m512i"/>
+	<parameter varname="b" type="__m512i"/>
+	<description>Subtract packed 8-bit integers in "b" from packed 8-bit integers in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 63
+	i := j*8
+	IF k[j]
+		dst[i+7:i] := a[i+7:i] - b[i+7:i]
+	ELSE
+		dst[i+7:i] := src[i+7:i]
+	FI	
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name="vpsubb"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512i" name="_mm512_maskz_sub_epi8">
+	<type>Integer</type>
+	<CPUID>AVX512BW</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="k" type="__mmask64"/>
+	<parameter varname="a" type="__m512i"/>
+	<parameter varname="b" type="__m512i"/>
+	<description>Subtract packed 8-bit integers in "b" from packed 8-bit integers in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 63
+	i := j*8
+	IF k[j]
+		dst[i+7:i] := a[i+7:i] - b[i+7:i]
+	ELSE
+		dst[i+7:i] := 0
+	FI	
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name="vpsubb"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512i" name="_mm512_sub_epi8">
+	<type>Integer</type>
+	<CPUID>AVX512BW</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="a" type="__m512i"/>
+	<parameter varname="b" type="__m512i"/>
+	<description>Subtract packed 8-bit integers in "b" from packed 8-bit integers in "a", and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 63
+	i := j*8
+	dst[i+7:i] := a[i+7:i] - b[i+7:i]
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name="vpsubb"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm_mask_sub_epi8">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512BW</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="src" type="__m128i"/>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="a" type="__m128i"/>
+	<parameter varname="b" type="__m128i"/>
+	<description>Subtract packed 8-bit integers in "b" from packed 8-bit integers in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 15
+	i := j*8
+	IF k[j]
+		dst[i+7:i] := a[i+7:i] - b[i+7:i]
+	ELSE
+		dst[i+7:i] := src[i+7:i]
+	FI	
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vpsubb"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm_maskz_sub_epi8">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512BW</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="a" type="__m128i"/>
+	<parameter varname="b" type="__m128i"/>
+	<description>Subtract packed 8-bit integers in "b" from packed 8-bit integers in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 15
+	i := j*8
+	IF k[j]
+		dst[i+7:i] := a[i+7:i] - b[i+7:i]
+	ELSE
+		dst[i+7:i] := 0
+	FI	
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vpsubb"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256i" name="_mm256_mask_sub_epi32">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="src" type="__m256i"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m256i"/>
+	<parameter varname="b" type="__m256i"/>
+	<description>Subtract packed 32-bit integers in "b" from packed 32-bit integers in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*32
+	IF k[j]
+		dst[i+31:i] := a[i+31:i] - b[i+31:i]
+	ELSE
+		dst[i+31:i] := src[i+31:i]
+	FI	
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vpsubd"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256i" name="_mm256_maskz_sub_epi32">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m256i"/>
+	<parameter varname="b" type="__m256i"/>
+	<description>Subtract packed 32-bit integers in "b" from packed 32-bit integers in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*32
+	IF k[j]
+		dst[i+31:i] := a[i+31:i] - b[i+31:i]
+	ELSE
+		dst[i+31:i] := 0
+	FI	
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vpsubd"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm_mask_sub_epi32">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="src" type="__m128i"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128i"/>
+	<parameter varname="b" type="__m128i"/>
+	<description>Subtract packed 32-bit integers in "b" from packed 32-bit integers in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 3
+	i := j*32
+	IF k[j]
+		dst[i+31:i] := a[i+31:i] - b[i+31:i]
+	ELSE
+		dst[i+31:i] := src[i+31:i]
+	FI	
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vpsubd"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm_maskz_sub_epi32">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128i"/>
+	<parameter varname="b" type="__m128i"/>
+	<description>Subtract packed 32-bit integers in "b" from packed 32-bit integers in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 3
+	i := j*32
+	IF k[j]
+		dst[i+31:i] := a[i+31:i] - b[i+31:i]
+	ELSE
+		dst[i+31:i] := 0
+	FI	
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vpsubd"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256i" name="_mm256_mask_sub_epi64">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="src" type="__m256i"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m256i"/>
+	<parameter varname="b" type="__m256i"/>
+	<description>Subtract packed 64-bit integers in "b" from packed 64-bit integers in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 3
+	i := j*64
+	IF k[j]
+		dst[i+63:i] := a[i+63:i] - b[i+63:i]
+	ELSE
+		dst[i+63:i] := src[i+63:i]
+	FI	
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vpsubq"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256i" name="_mm256_maskz_sub_epi64">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m256i"/>
+	<parameter varname="b" type="__m256i"/>
+	<description>Subtract packed 64-bit integers in "b" from packed 64-bit integers in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 3
+	i := j*64
+	IF k[j]
+		dst[i+63:i] := a[i+63:i] - b[i+63:i]
+	ELSE
+		dst[i+63:i] := 0
+	FI	
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vpsubq"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm_mask_sub_epi64">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="src" type="__m128i"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128i"/>
+	<parameter varname="b" type="__m128i"/>
+	<description>Subtract packed 64-bit integers in "b" from packed 64-bit integers in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 1
+	i := j*64
+	IF k[j]
+		dst[i+63:i] := a[i+63:i] - b[i+63:i]
+	ELSE
+		dst[i+63:i] := src[i+63:i]
+	FI	
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vpsubq"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm_maskz_sub_epi64">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128i"/>
+	<parameter varname="b" type="__m128i"/>
+	<description>Subtract packed 64-bit integers in "b" from packed 64-bit integers in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 1
+	i := j*64
+	IF k[j]
+		dst[i+63:i] := a[i+63:i] - b[i+63:i]
+	ELSE
+		dst[i+63:i] := 0
+	FI	
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vpsubq"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256i" name="_mm256_mask_subs_epi8">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512BW</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="src" type="__m256i"/>
+	<parameter varname="k" type="__mmask32"/>
+	<parameter varname="a" type="__m256i"/>
+	<parameter varname="b" type="__m256i"/>
+	<description>Subtract packed 8-bit integers in "b" from packed 8-bit integers in "a" using saturation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 31
+	i := j*8
+	IF k[j]
+		dst[i+7:i] := Saturate_To_Int8(a[i+7:i] - b[i+7:i])
+	ELSE
+		dst[i+7:i] := src[i+7:i]
+	FI	
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vpsubsb"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256i" name="_mm256_maskz_subs_epi8">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512BW</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="k" type="__mmask32"/>
+	<parameter varname="a" type="__m256i"/>
+	<parameter varname="b" type="__m256i"/>
+	<description>Subtract packed 8-bit integers in "b" from packed 8-bit integers in "a" using saturation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 31
+	i := j*8
+	IF k[j]
+		dst[i+7:i] := Saturate_To_Int8(a[i+7:i] - b[i+7:i])
+	ELSE
+		dst[i+7:i] := 0
+	FI	
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vpsubsb"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512i" name="_mm512_mask_subs_epi8">
+	<type>Integer</type>
+	<CPUID>AVX512BW</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="src" type="__m512i"/>
+	<parameter varname="k" type="__mmask64"/>
+	<parameter varname="a" type="__m512i"/>
+	<parameter varname="b" type="__m512i"/>
+	<description>Subtract packed 8-bit integers in "b" from packed 8-bit integers in "a" using saturation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 63
+	i := j*8
+	IF k[j]
+		dst[i+7:i] := Saturate_To_Int8(a[i+7:i] - b[i+7:i])
+	ELSE
+		dst[i+7:i] := src[i+7:i]
+	FI	
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name="vpsubsb"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512i" name="_mm512_maskz_subs_epi8">
+	<type>Integer</type>
+	<CPUID>AVX512BW</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="k" type="__mmask64"/>
+	<parameter varname="a" type="__m512i"/>
+	<parameter varname="b" type="__m512i"/>
+	<description>Subtract packed 8-bit integers in "b" from packed 8-bit integers in "a" using saturation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 63
+	i := j*8
+	IF k[j]
+		dst[i+7:i] := Saturate_To_Int8(a[i+7:i] - b[i+7:i])
+	ELSE
+		dst[i+7:i] := 0
+	FI	
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name="vpsubsb"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512i" name="_mm512_subs_epi8">
+	<type>Integer</type>
+	<CPUID>AVX512BW</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="a" type="__m512i"/>
+	<parameter varname="b" type="__m512i"/>
+	<description>Subtract packed 8-bit integers in "b" from packed 8-bit integers in "a" using saturation, and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 63
+	i := j*8
+	dst[i+7:i] := Saturate_To_Int8(a[i+7:i] - b[i+7:i])	
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name="vpsubsb"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm_mask_subs_epi8">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512BW</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="src" type="__m128i"/>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="a" type="__m128i"/>
+	<parameter varname="b" type="__m128i"/>
+	<description>Subtract packed 8-bit integers in "b" from packed 8-bit integers in "a" using saturation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 15
+	i := j*8
+	IF k[j]
+		dst[i+7:i] := Saturate_To_Int8(a[i+7:i] - b[i+7:i])
+	ELSE
+		dst[i+7:i] := src[i+7:i]
+	FI	
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vpsubsb"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm_maskz_subs_epi8">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512BW</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="a" type="__m128i"/>
+	<parameter varname="b" type="__m128i"/>
+	<description>Subtract packed 8-bit integers in "b" from packed 8-bit integers in "a" using saturation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 15
+	i := j*8
+	IF k[j]
+		dst[i+7:i] := Saturate_To_Int8(a[i+7:i] - b[i+7:i])
+	ELSE
+		dst[i+7:i] := 0
+	FI	
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vpsubsb"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256i" name="_mm256_mask_subs_epi16">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512BW</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="src" type="__m256i"/>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="a" type="__m256i"/>
+	<parameter varname="b" type="__m256i"/>
+	<description>Subtract packed 16-bit integers in "b" from packed 16-bit integers in "a" using saturation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 15
+	i := j*16
+	IF k[j]
+		dst[i+15:i] := Saturate_To_Int16(a[i+15:i] - b[i+15:i])
+	ELSE
+		dst[i+15:i] := src[i+15:i]
+	FI	
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vpsubsw"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256i" name="_mm256_maskz_subs_epi16">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512BW</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="a" type="__m256i"/>
+	<parameter varname="b" type="__m256i"/>
+	<description>Subtract packed 16-bit integers in "b" from packed 16-bit integers in "a" using saturation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 15
+	i := j*16
+	IF k[j]
+		dst[i+15:i] := Saturate_To_Int16(a[i+15:i] - b[i+15:i])
+	ELSE
+		dst[i+15:i] := 0
+	FI	
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vpsubsw"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512i" name="_mm512_mask_subs_epi16">
+	<type>Integer</type>
+	<CPUID>AVX512BW</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="src" type="__m512i"/>
+	<parameter varname="k" type="__mmask32"/>
+	<parameter varname="a" type="__m512i"/>
+	<parameter varname="b" type="__m512i"/>
+	<description>Subtract packed 16-bit integers in "b" from packed 16-bit integers in "a" using saturation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 31
+	i := j*16
+	IF k[j]
+		dst[i+15:i] := Saturate_To_Int16(a[i+15:i] - b[i+15:i])
+	ELSE
+		dst[i+15:i] := src[i+15:i]
+	FI	
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name="vpsubsw"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512i" name="_mm512_maskz_subs_epi16">
+	<type>Integer</type>
+	<CPUID>AVX512BW</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="k" type="__mmask32"/>
+	<parameter varname="a" type="__m512i"/>
+	<parameter varname="b" type="__m512i"/>
+	<description>Subtract packed 16-bit integers in "b" from packed 16-bit integers in "a" using saturation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 31
+	i := j*16
+	IF k[j]
+		dst[i+15:i] := Saturate_To_Int16(a[i+15:i] - b[i+15:i])
+	ELSE
+		dst[i+15:i] := 0
+	FI	
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name="vpsubsw"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512i" name="_mm512_subs_epi16">
+	<type>Integer</type>
+	<CPUID>AVX512BW</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="a" type="__m512i"/>
+	<parameter varname="b" type="__m512i"/>
+	<description>Subtract packed 16-bit integers in "b" from packed 16-bit integers in "a" using saturation, and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 31
+	i := j*16
+	dst[i+15:i] := Saturate_To_Int16(a[i+15:i] - b[i+15:i])
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name="vpsubsw"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm_mask_subs_epi16">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512BW</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="src" type="__m128i"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128i"/>
+	<parameter varname="b" type="__m128i"/>
+	<description>Subtract packed 16-bit integers in "b" from packed 16-bit integers in "a" using saturation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*16
+	IF k[j]
+		dst[i+15:i] := Saturate_To_Int16(a[i+15:i] - b[i+15:i])
+	ELSE
+		dst[i+15:i] := src[i+15:i]
+	FI	
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vpsubsw"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm_maskz_subs_epi16">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512BW</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128i"/>
+	<parameter varname="b" type="__m128i"/>
+	<description>Subtract packed 16-bit integers in "b" from packed 16-bit integers in "a" using saturation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*16
+	IF k[j]
+		dst[i+15:i] := Saturate_To_Int16(a[i+15:i] - b[i+15:i])
+	ELSE
+		dst[i+15:i] := 0
+	FI	
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vpsubsw"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256i" name="_mm256_mask_subs_epu8">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512BW</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="src" type="__m256i"/>
+	<parameter varname="k" type="__mmask32"/>
+	<parameter varname="a" type="__m256i"/>
+	<parameter varname="b" type="__m256i"/>
+	<description>Subtract packed unsigned 8-bit integers in "b" from packed unsigned 8-bit integers in "a" using saturation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 31
+	i := j*8
+	IF k[j]
+		dst[i+7:i] := Saturate_To_UnsignedInt8(a[i+7:i] - b[i+7:i])
+	ELSE
+		dst[i+7:i] := src[i+7:i]
+	FI	
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vpsubusb"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256i" name="_mm256_maskz_subs_epu8">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512BW</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="k" type="__mmask32"/>
+	<parameter varname="a" type="__m256i"/>
+	<parameter varname="b" type="__m256i"/>
+	<description>Subtract packed unsigned 8-bit integers in "b" from packed unsigned 8-bit integers in "a" using saturation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 31
+	i := j*8
+	IF k[j]
+		dst[i+7:i] := Saturate_To_UnsignedInt8(a[i+7:i] - b[i+7:i])
+	ELSE
+		dst[i+7:i] := 0
+	FI	
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vpsubusb"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512i" name="_mm512_mask_subs_epu8">
+	<type>Integer</type>
+	<CPUID>AVX512BW</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="src" type="__m512i"/>
+	<parameter varname="k" type="__mmask64"/>
+	<parameter varname="a" type="__m512i"/>
+	<parameter varname="b" type="__m512i"/>
+	<description>Subtract packed unsigned 8-bit integers in "b" from packed unsigned 8-bit integers in "a" using saturation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 63
+	i := j*8
+	IF k[j]
+		dst[i+7:i] := Saturate_To_UnsignedInt8(a[i+7:i] - b[i+7:i])
+	ELSE
+		dst[i+7:i] := src[i+7:i]
+	FI	
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name="vpsubusb"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512i" name="_mm512_maskz_subs_epu8">
+	<type>Integer</type>
+	<CPUID>AVX512BW</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="k" type="__mmask64"/>
+	<parameter varname="a" type="__m512i"/>
+	<parameter varname="b" type="__m512i"/>
+	<description>Subtract packed unsigned 8-bit integers in "b" from packed unsigned 8-bit integers in "a" using saturation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 63
+	i := j*8
+	IF k[j]
+		dst[i+7:i] := Saturate_To_UnsignedInt8(a[i+7:i] - b[i+7:i])
+	ELSE
+		dst[i+7:i] := 0
+	FI	
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name="vpsubusb"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512i" name="_mm512_subs_epu8">
+	<type>Integer</type>
+	<CPUID>AVX512BW</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="a" type="__m512i"/>
+	<parameter varname="b" type="__m512i"/>
+	<description>Subtract packed unsigned 8-bit integers in "b" from packed unsigned 8-bit integers in "a" using saturation, and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 63
+	i := j*8
+	dst[i+7:i] := Saturate_To_UnsignedInt8(a[i+7:i] - b[i+7:i])	
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name="vpsubusb"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm_mask_subs_epu8">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512BW</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="src" type="__m128i"/>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="a" type="__m128i"/>
+	<parameter varname="b" type="__m128i"/>
+	<description>Subtract packed unsigned 8-bit integers in "b" from packed unsigned 8-bit integers in "a" using saturation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 15
+	i := j*8
+	IF k[j]
+		dst[i+7:i] := Saturate_To_UnsignedInt8(a[i+7:i] - b[i+7:i])
+	ELSE
+		dst[i+7:i] := src[i+7:i]
+	FI	
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vpsubusb"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm_maskz_subs_epu8">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512BW</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="a" type="__m128i"/>
+	<parameter varname="b" type="__m128i"/>
+	<description>Subtract packed unsigned 8-bit integers in "b" from packed unsigned 8-bit integers in "a" using saturation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 15
+	i := j*8
+	IF k[j]
+		dst[i+7:i] := Saturate_To_UnsignedInt8(a[i+7:i] - b[i+7:i])
+	ELSE
+		dst[i+7:i] := 0
+	FI	
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vpsubusb"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256i" name="_mm256_mask_subs_epu16">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512BW</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="src" type="__m256i"/>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="a" type="__m256i"/>
+	<parameter varname="b" type="__m256i"/>
+	<description>Subtract packed unsigned 16-bit integers in "b" from packed unsigned 16-bit integers in "a" using saturation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 15
+	i := j*16
+	IF k[j]
+		dst[i+15:i] := Saturate_To_UnsignedInt16(a[i+15:i] - b[i+15:i])
+	ELSE
+		dst[i+15:i] := src[i+15:i]
+	FI	
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vpsubusw"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256i" name="_mm256_maskz_subs_epu16">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512BW</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="a" type="__m256i"/>
+	<parameter varname="b" type="__m256i"/>
+	<description>Subtract packed unsigned 16-bit integers in "b" from packed unsigned 16-bit integers in "a" using saturation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 15
+	i := j*16
+	IF k[j]
+		dst[i+15:i] := Saturate_To_UnsignedInt16(a[i+15:i] - b[i+15:i])
+	ELSE
+		dst[i+15:i] := 0
+	FI	
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vpsubusw"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512i" name="_mm512_mask_subs_epu16">
+	<type>Integer</type>
+	<CPUID>AVX512BW</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="src" type="__m512i"/>
+	<parameter varname="k" type="__mmask32"/>
+	<parameter varname="a" type="__m512i"/>
+	<parameter varname="b" type="__m512i"/>
+	<description>Subtract packed unsigned 16-bit integers in "b" from packed unsigned 16-bit integers in "a" using saturation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 31
+	i := j*16
+	IF k[j]
+		dst[i+15:i] := Saturate_To_UnsignedInt16(a[i+15:i] - b[i+15:i])
+	ELSE
+		dst[i+15:i] := src[i+15:i]
+	FI	
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name="vpsubusw"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512i" name="_mm512_maskz_subs_epu16">
+	<type>Integer</type>
+	<CPUID>AVX512BW</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="k" type="__mmask32"/>
+	<parameter varname="a" type="__m512i"/>
+	<parameter varname="b" type="__m512i"/>
+	<description>Subtract packed unsigned 16-bit integers in "b" from packed unsigned 16-bit integers in "a" using saturation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 31
+	i := j*16
+	IF k[j]
+		dst[i+15:i] := Saturate_To_UnsignedInt16(a[i+15:i] - b[i+15:i])
+	ELSE
+		dst[i+15:i] := 0
+	FI	
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name="vpsubusw"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512i" name="_mm512_subs_epu16">
+	<type>Integer</type>
+	<CPUID>AVX512BW</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="a" type="__m512i"/>
+	<parameter varname="b" type="__m512i"/>
+	<description>Subtract packed unsigned 16-bit integers in "b" from packed unsigned 16-bit integers in "a" using saturation, and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 31
+	i := j*16
+	dst[i+15:i] := Saturate_To_UnsignedInt16(a[i+15:i] - b[i+15:i])	
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name="vpsubusw"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm_mask_subs_epu16">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512BW</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="src" type="__m128i"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128i"/>
+	<parameter varname="b" type="__m128i"/>
+	<description>Subtract packed unsigned 16-bit integers in "b" from packed unsigned 16-bit integers in "a" using saturation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*16
+	IF k[j]
+		dst[i+15:i] := Saturate_To_UnsignedInt16(a[i+15:i] - b[i+15:i])
+	ELSE
+		dst[i+15:i] := src[i+15:i]
+	FI	
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vpsubusw"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm_maskz_subs_epu16">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512BW</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128i"/>
+	<parameter varname="b" type="__m128i"/>
+	<description>Subtract packed unsigned 16-bit integers in "b" from packed unsigned 16-bit integers in "a" using saturation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*16
+	IF k[j]
+		dst[i+15:i] := Saturate_To_UnsignedInt16(a[i+15:i] - b[i+15:i])
+	ELSE
+		dst[i+15:i] := 0
+	FI	
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vpsubusw"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256i" name="_mm256_mask_sub_epi16">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512BW</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="src" type="__m256i"/>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="a" type="__m256i"/>
+	<parameter varname="b" type="__m256i"/>
+	<description>Subtract packed 16-bit integers in "b" from packed 16-bit integers in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 15
+	i := j*16
+	IF k[j]
+		dst[i+15:i] := a[i+15:i] - b[i+15:i]
+	ELSE
+		dst[i+15:i] := src[i+15:i]
+	FI	
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vpsubw"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256i" name="_mm256_maskz_sub_epi16">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512BW</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="a" type="__m256i"/>
+	<parameter varname="b" type="__m256i"/>
+	<description>Subtract packed 16-bit integers in "b" from packed 16-bit integers in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 15
+	i := j*16
+	IF k[j]
+		dst[i+15:i] := a[i+15:i] - b[i+15:i]
+	ELSE
+		dst[i+15:i] := 0
+	FI	
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vpsubw"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512i" name="_mm512_mask_sub_epi16">
+	<type>Integer</type>
+	<CPUID>AVX512BW</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="src" type="__m512i"/>
+	<parameter varname="k" type="__mmask32"/>
+	<parameter varname="a" type="__m512i"/>
+	<parameter varname="b" type="__m512i"/>
+	<description>Subtract packed 16-bit integers in "b" from packed 16-bit integers in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 31
+	i := j*16
+	IF k[j]
+		dst[i+15:i] := a[i+15:i] - b[i+15:i]
+	ELSE
+		dst[i+15:i] := src[i+15:i]
+	FI	
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name="vpsubw"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512i" name="_mm512_maskz_sub_epi16">
+	<type>Integer</type>
+	<CPUID>AVX512BW</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="k" type="__mmask32"/>
+	<parameter varname="a" type="__m512i"/>
+	<parameter varname="b" type="__m512i"/>
+	<description>Subtract packed 16-bit integers in "b" from packed 16-bit integers in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 31
+	i := j*16
+	IF k[j]
+		dst[i+15:i] := a[i+15:i] - b[i+15:i]
+	ELSE
+		dst[i+15:i] := 0
+	FI	
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name="vpsubw"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512i" name="_mm512_sub_epi16">
+	<type>Integer</type>
+	<CPUID>AVX512BW</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="a" type="__m512i"/>
+	<parameter varname="b" type="__m512i"/>
+	<description>Subtract packed 16-bit integers in "b" from packed 16-bit integers in "a", and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 31
+	i := j*16
+	dst[i+15:i] := a[i+15:i] - b[i+15:i]
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name="vpsubw"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm_mask_sub_epi16">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512BW</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="src" type="__m128i"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128i"/>
+	<parameter varname="b" type="__m128i"/>
+	<description>Subtract packed 16-bit integers in "b" from packed 16-bit integers in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*16
+	IF k[j]
+		dst[i+15:i] := a[i+15:i] - b[i+15:i]
+	ELSE
+		dst[i+15:i] := src[i+15:i]
+	FI	
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vpsubw"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm_maskz_sub_epi16">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512BW</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128i"/>
+	<parameter varname="b" type="__m128i"/>
+	<description>Subtract packed 16-bit integers in "b" from packed 16-bit integers in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*16
+	IF k[j]
+		dst[i+15:i] := a[i+15:i] - b[i+15:i]
+	ELSE
+		dst[i+15:i] := 0
+	FI	
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vpsubw"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256i" name="_mm256_mask_ternarylogic_epi32">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Logical</category>
+	<parameter varname="src" type="__m256i"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m256i"/>
+	<parameter varname="b" type="__m256i"/>
+	<parameter varname="imm8" type="int"/>
+	<description>Bitwise ternary logic that provides the capability to implement any three-operand binary function; the specific binary function is specified by value in "imm8". For each bit in each packed 32-bit integer, the corresponding bit from "src", "a", and "b" are used to form a 3 bit index into "imm8", and the value at that bit in "imm8" is written to the corresponding bit in "dst" using writemask "k" at 32-bit granularity (32-bit elements are copied from "src" when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*32
+	IF k[j]
+		FOR h := 0 to 31
+			index[2:0] := (src[i+h] &lt;&lt; 2) OR (a[i+h] &lt;&lt; 1) OR b[i+h]
+			dst[i+h] := imm8[index[2:0]]
+		ENDFOR
+	ELSE
+		dst[i+31:i] := src[i+31:i]
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vpternlogd"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256i" name="_mm256_maskz_ternarylogic_epi32">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Logical</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m256i"/>
+	<parameter varname="b" type="__m256i"/>
+	<parameter varname="c" type="__m256i"/>
+	<parameter varname="imm8" type="int"/>
+	<description>Bitwise ternary logic that provides the capability to implement any three-operand binary function; the specific binary function is specified by value in "imm8". For each bit in each packed 32-bit integer, the corresponding bit from "a", "b", and "c" are used to form a 3 bit index into "imm8", and the value at that bit in "imm8" is written to the corresponding bit in "dst" using zeromask "k" at 32-bit granularity (32-bit elements are zeroed out when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*32
+	IF k[j]
+		FOR h := 0 to 31
+			index[2:0] := (a[i+h] &lt;&lt; 2) OR (b[i+h] &lt;&lt; 1) OR c[i+h]
+			dst[i+h] := imm8[index[2:0]]
+		ENDFOR
+	ELSE
+		dst[i+31:i] := 0
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vpternlogd"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256i" name="_mm256_ternarylogic_epi32">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Logical</category>
+	<parameter varname="a" type="__m256i"/>
+	<parameter varname="b" type="__m256i"/>
+	<parameter varname="c" type="__m256i"/>
+	<parameter varname="imm8" type="int"/>
+	<description>Bitwise ternary logic that provides the capability to implement any three-operand binary function; the specific binary function is specified by value in "imm8". For each bit in each packed 32-bit integer, the corresponding bit from "a", "b", and "c" are used to form a 3 bit index into "imm8", and the value at that bit in "imm8" is written to the corresponding bit in "dst".</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*32
+	FOR h := 0 to 31
+		index[2:0] := (a[i+h] &lt;&lt; 2) OR (b[i+h] &lt;&lt; 1) OR c[i+h]
+		dst[i+h] := imm8[index[2:0]]
+	ENDFOR
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vpternlogd"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm_mask_ternarylogic_epi32">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Logical</category>
+	<parameter varname="src" type="__m128i"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128i"/>
+	<parameter varname="b" type="__m128i"/>
+	<parameter varname="imm8" type="int"/>
+	<description>Bitwise ternary logic that provides the capability to implement any three-operand binary function; the specific binary function is specified by value in "imm8". For each bit in each packed 32-bit integer, the corresponding bit from "src", "a", and "b" are used to form a 3 bit index into "imm8", and the value at that bit in "imm8" is written to the corresponding bit in "dst" using writemask "k" at 32-bit granularity (32-bit elements are copied from "src" when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 3
+	i := j*32
+	IF k[j]
+		FOR h := 0 to 31
+			index[2:0] := (src[i+h] &lt;&lt; 2) OR (a[i+h] &lt;&lt; 1) OR b[i+h]
+			dst[i+h] := imm8[index[2:0]]
+		ENDFOR
+	ELSE
+		dst[i+31:i] := src[i+31:i]
+	FI
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vpternlogd"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm_maskz_ternarylogic_epi32">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Logical</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128i"/>
+	<parameter varname="b" type="__m128i"/>
+	<parameter varname="c" type="__m128i"/>
+	<parameter varname="imm8" type="int"/>
+	<description>Bitwise ternary logic that provides the capability to implement any three-operand binary function; the specific binary function is specified by value in "imm8". For each bit in each packed 32-bit integer, the corresponding bit from "a", "b", and "c" are used to form a 3 bit index into "imm8", and the value at that bit in "imm8" is written to the corresponding bit in "dst" using zeromask "k" at 32-bit granularity (32-bit elements are zeroed out when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 3
+	i := j*32
+	IF k[j]
+		FOR h := 0 to 31
+			index[2:0] := (a[i+h] &lt;&lt; 2) OR (b[i+h] &lt;&lt; 1) OR c[i+h]
+			dst[i+h] := imm8[index[2:0]]
+		ENDFOR
+	ELSE
+		dst[i+31:i] := 0
+	FI
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vpternlogd"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm_ternarylogic_epi32">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Logical</category>
+	<parameter varname="a" type="__m128i"/>
+	<parameter varname="b" type="__m128i"/>
+	<parameter varname="c" type="__m128i"/>
+	<parameter varname="imm8" type="int"/>
+	<description>Bitwise ternary logic that provides the capability to implement any three-operand binary function; the specific binary function is specified by value in "imm8". For each bit in each packed 32-bit integer, the corresponding bit from "a", "b", and "c" are used to form a 3 bit index into "imm8", and the value at that bit in "imm8" is written to the corresponding bit in "dst".</description>
+	<operation>
+FOR j := 0 to 3
+	i := j*32
+	FOR h := 0 to 31
+		index[2:0] := (a[i+h] &lt;&lt; 2) OR (b[i+h] &lt;&lt; 1) OR c[i+h]
+		dst[i+h] := imm8[index[2:0]]
+	ENDFOR
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vpternlogd"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256i" name="_mm256_mask_ternarylogic_epi64">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Logical</category>
+	<parameter varname="src" type="__m256i"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m256i"/>
+	<parameter varname="b" type="__m256i"/>
+	<parameter varname="imm8" type="int"/>
+	<description>Bitwise ternary logic that provides the capability to implement any three-operand binary function; the specific binary function is specified by value in "imm8". For each bit in each packed 64-bit integer, the corresponding bit from "src", "a", and "b" are used to form a 3 bit index into "imm8", and the value at that bit in "imm8" is written to the corresponding bit in "dst" using writemask "k" at 64-bit granularity (64-bit elements are copied from "src" when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 3
+	i := j*64
+	IF k[j]
+		FOR h := 0 to 63
+			index[2:0] := (src[i+h] &lt;&lt; 2) OR (a[i+h] &lt;&lt; 1) OR b[i+h]
+			dst[i+h] := imm8[index[2:0]]
+		ENDFOR
+	ELSE
+		dst[i+63:i] := src[i+63:i]
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vpternlogq"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256i" name="_mm256_maskz_ternarylogic_epi64">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Logical</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m256i"/>
+	<parameter varname="b" type="__m256i"/>
+	<parameter varname="c" type="__m256i"/>
+	<parameter varname="imm8" type="int"/>
+	<description>Bitwise ternary logic that provides the capability to implement any three-operand binary function; the specific binary function is specified by value in "imm8". For each bit in each packed 64-bit integer, the corresponding bit from "a", "b", and "c" are used to form a 3 bit index into "imm8", and the value at that bit in "imm8" is written to the corresponding bit in "dst" using zeromask "k" at 64-bit granularity (64-bit elements are zeroed out when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 3
+	i := j*64
+	IF k[j]
+		FOR h := 0 to 63
+			index[2:0] := (a[i+h] &lt;&lt; 2) OR (b[i+h] &lt;&lt; 1) OR c[i+h]
+			dst[i+h] := imm8[index[2:0]]
+		ENDFOR
+	ELSE
+		dst[i+63:i] := 0
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vpternlogq"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256i" name="_mm256_ternarylogic_epi64">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Logical</category>
+	<parameter varname="a" type="__m256i"/>
+	<parameter varname="b" type="__m256i"/>
+	<parameter varname="c" type="__m256i"/>
+	<parameter varname="imm8" type="int"/>
+	<description>Bitwise ternary logic that provides the capability to implement any three-operand binary function; the specific binary function is specified by value in "imm8". For each bit in each packed 64-bit integer, the corresponding bit from "a", "b", and "c" are used to form a 3 bit index into "imm8", and the value at that bit in "imm8" is written to the corresponding bit in "dst".</description>
+	<operation>
+FOR j := 0 to 3
+	i := j*64
+	FOR h := 0 to 63
+		index[2:0] := (a[i+h] &lt;&lt; 2) OR (b[i+h] &lt;&lt; 1) OR c[i+h]
+		dst[i+h] := imm8[index[2:0]]
+	ENDFOR
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vpternlogq"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm_mask_ternarylogic_epi64">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Logical</category>
+	<parameter varname="src" type="__m128i"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128i"/>
+	<parameter varname="b" type="__m128i"/>
+	<parameter varname="imm8" type="int"/>
+	<description>Bitwise ternary logic that provides the capability to implement any three-operand binary function; the specific binary function is specified by value in "imm8". For each bit in each packed 64-bit integer, the corresponding bit from "src", "a", and "b" are used to form a 3 bit index into "imm8", and the value at that bit in "imm8" is written to the corresponding bit in "dst" using writemask "k" at 64-bit granularity (64-bit elements are copied from "src" when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 1
+	i := j*64
+	IF k[j]
+		FOR h := 0 to 63
+			index[2:0] := (src[i+h] &lt;&lt; 2) OR (a[i+h] &lt;&lt; 1) OR b[i+h]
+			dst[i+h] := imm8[index[2:0]]
+		ENDFOR
+	ELSE
+		dst[i+63:i] := src[i+63:i]
+	FI
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vpternlogq"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm_maskz_ternarylogic_epi64">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Logical</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128i"/>
+	<parameter varname="b" type="__m128i"/>
+	<parameter varname="c" type="__m128i"/>
+	<parameter varname="imm8" type="int"/>
+	<description>Bitwise ternary logic that provides the capability to implement any three-operand binary function; the specific binary function is specified by value in "imm8". For each bit in each packed 64-bit integer, the corresponding bit from "a", "b", and "c" are used to form a 3 bit index into "imm8", and the value at that bit in "imm8" is written to the corresponding bit in "dst" using zeromask "k" at 64-bit granularity (64-bit elements are zeroed out when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 1
+	i := j*64
+	IF k[j]
+		FOR h := 0 to 63
+			index[2:0] := (a[i+h] &lt;&lt; 2) OR (b[i+h] &lt;&lt; 1) OR c[i+h]
+			dst[i+h] := imm8[index[2:0]]
+		ENDFOR
+	ELSE
+		dst[i+63:i] := 0
+	FI
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vpternlogq"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm_ternarylogic_epi64">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Logical</category>
+	<parameter varname="a" type="__m128i"/>
+	<parameter varname="b" type="__m128i"/>
+	<parameter varname="c" type="__m128i"/>
+	<parameter varname="imm8" type="int"/>
+	<description>Bitwise ternary logic that provides the capability to implement any three-operand binary function; the specific binary function is specified by value in "imm8". For each bit in each packed 64-bit integer, the corresponding bit from "a", "b", and "c" are used to form a 3 bit index into "imm8", and the value at that bit in "imm8" is written to the corresponding bit in "dst".</description>
+	<operation>
+FOR j := 0 to 1
+	i := j*64
+	FOR h := 0 to 63
+		index[2:0] := (a[i+h] &lt;&lt; 2) OR (b[i+h] &lt;&lt; 1) OR c[i+h]
+		dst[i+h] := imm8[index[2:0]]
+	ENDFOR
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vpternlogq"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__mmask32" name="_mm256_mask_test_epi8_mask">
+	<type>Integer</type>
+	<type>Mask</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512BW</CPUID>
+	<category>Compare</category>
+	<parameter varname="k1" type="__mmask32"/>
+	<parameter varname="a" type="__m256i"/>
+	<parameter varname="b" type="__m256i"/>
+	<description>Compute the bitwise AND of packed 8-bit integers in "a" and "b", producing intermediate 8-bit values, and set the corresponding bit in result mask "k" (subject to writemask "k1") if the intermediate value is non-zero.</description>
+	<operation>
+FOR j := 0 to 31
+	i := j*8
+	IF k1[j]
+		k[j] := ((a[i+7:i] AND b[i+7:i]) != 0) ? 1 : 0
+	ELSE
+		k[j] := 0
+	FI
+ENDFOR
+k[MAX:32] := 0
+	</operation>
+	<instruction name="vptestmb"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__mmask32" name="_mm256_test_epi8_mask">
+	<type>Integer</type>
+	<type>Mask</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512BW</CPUID>
+	<category>Compare</category>
+	<parameter varname="a" type="__m256i"/>
+	<parameter varname="b" type="__m256i"/>
+	<description>Compute the bitwise AND of packed 8-bit integers in "a" and "b", producing intermediate 8-bit values, and set the corresponding bit in result mask "k" if the intermediate value is non-zero.</description>
+	<operation>
+FOR j := 0 to 31
+	i := j*8
+	k[j] := ((a[i+7:i] AND b[i+7:i]) != 0) ? 1 : 0
+ENDFOR
+k[MAX:32] := 0
+	</operation>
+	<instruction name="vptestmb"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__mmask64" name="_mm512_mask_test_epi8_mask">
+	<type>Integer</type>
+	<type>Mask</type>
+	<CPUID>AVX512BW</CPUID>
+	<category>Compare</category>
+	<parameter varname="k1" type="__mmask64"/>
+	<parameter varname="a" type="__m512i"/>
+	<parameter varname="b" type="__m512i"/>
+	<description>Compute the bitwise AND of packed 8-bit integers in "a" and "b", producing intermediate 8-bit values, and set the corresponding bit in result mask "k" (subject to writemask "k1") if the intermediate value is non-zero.</description>
+	<operation>
+FOR j := 0 to 63
+	i := j*8
+	IF k1[j]
+		k[j] := ((a[i+7:i] AND b[i+7:i]) != 0) ? 1 : 0
+	ELSE
+		k[j] := 0
+	FI
+ENDFOR
+k[MAX:64] := 0
+	</operation>
+	<instruction name="vptestmb"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__mmask64" name="_mm512_test_epi8_mask">
+	<type>Integer</type>
+	<type>Mask</type>
+	<CPUID>AVX512BW</CPUID>
+	<category>Compare</category>
+	<parameter varname="a" type="__m512i"/>
+	<parameter varname="b" type="__m512i"/>
+	<description>Compute the bitwise AND of packed 8-bit integers in "a" and "b", producing intermediate 8-bit values, and set the corresponding bit in result mask "k" if the intermediate value is non-zero.</description>
+	<operation>
+FOR j := 0 to 63
+	i := j*8
+	k[j] := ((a[i+7:i] AND b[i+7:i]) != 0) ? 1 : 0
+ENDFOR
+k[MAX:64] := 0
+	</operation>
+	<instruction name="vptestmb"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__mmask16" name="_mm_mask_test_epi8_mask">
+	<type>Integer</type>
+	<type>Mask</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512BW</CPUID>
+	<category>Compare</category>
+	<parameter varname="k1" type="__mmask16"/>
+	<parameter varname="a" type="__m128i"/>
+	<parameter varname="b" type="__m128i"/>
+	<description>Compute the bitwise AND of packed 8-bit integers in "a" and "b", producing intermediate 8-bit values, and set the corresponding bit in result mask "k" (subject to writemask "k1") if the intermediate value is non-zero.</description>
+	<operation>
+FOR j := 0 to 15
+	i := j*8
+	IF k1[j]
+		k[j] := ((a[i+7:i] AND b[i+7:i]) != 0) ? 1 : 0
+	ELSE
+		k[j] := 0
+	FI
+ENDFOR
+k[MAX:16] := 0
+	</operation>
+	<instruction name="vptestmb"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__mmask16" name="_mm_test_epi8_mask">
+	<type>Integer</type>
+	<type>Mask</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512BW</CPUID>
+	<category>Compare</category>
+	<parameter varname="a" type="__m128i"/>
+	<parameter varname="b" type="__m128i"/>
+	<description>Compute the bitwise AND of packed 8-bit integers in "a" and "b", producing intermediate 8-bit values, and set the corresponding bit in result mask "k" if the intermediate value is non-zero.</description>
+	<operation>
+FOR j := 0 to 15
+	i := j*8
+	k[j] := ((a[i+7:i] AND b[i+7:i]) != 0) ? 1 : 0
+ENDFOR
+k[MAX:16] := 0
+	</operation>
+	<instruction name="vptestmb"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__mmask8" name="_mm256_mask_test_epi32_mask">
+	<type>Integer</type>
+	<type>Mask</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Compare</category>
+	<parameter varname="k1" type="__mmask8"/>
+	<parameter varname="a" type="__m256i"/>
+	<parameter varname="b" type="__m256i"/>
+	<description>Compute the bitwise AND of packed 32-bit integers in "a" and "b", producing intermediate 32-bit values, and set the corresponding bit in result mask "k" (subject to writemask "k1") if the intermediate value is non-zero.</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*32
+	IF k1[j]
+		k[j] := ((a[i+31:i] AND b[i+31:i]) != 0) ? 1 : 0
+	ELSE
+		k[j] := 0
+	FI
+ENDFOR
+k[MAX:8] := 0
+	</operation>
+	<instruction name="vptestmd"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__mmask8" name="_mm256_test_epi32_mask">
+	<type>Integer</type>
+	<type>Mask</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Compare</category>
+	<parameter varname="a" type="__m256i"/>
+	<parameter varname="b" type="__m256i"/>
+	<description>Compute the bitwise AND of packed 32-bit integers in "a" and "b", producing intermediate 32-bit values, and set the corresponding bit in result mask "k" if the intermediate value is non-zero.</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*32
+	k[j] := ((a[i+31:i] AND b[i+31:i]) != 0) ? 1 : 0
+ENDFOR
+k[MAX:8] := 0
+	</operation>
+	<instruction name="vptestmd"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__mmask8" name="_mm_mask_test_epi32_mask">
+	<type>Integer</type>
+	<type>Mask</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Compare</category>
+	<parameter varname="k1" type="__mmask8"/>
+	<parameter varname="a" type="__m128i"/>
+	<parameter varname="b" type="__m128i"/>
+	<description>Compute the bitwise AND of packed 32-bit integers in "a" and "b", producing intermediate 32-bit values, and set the corresponding bit in result mask "k" (subject to writemask "k1") if the intermediate value is non-zero.</description>
+	<operation>
+FOR j := 0 to 3
+	i := j*32
+	IF k1[j]
+		k[j] := ((a[i+31:i] AND b[i+31:i]) != 0) ? 1 : 0
+	ELSE
+		k[j] := 0
+	FI
+ENDFOR
+k[MAX:4] := 0
+	</operation>
+	<instruction name="vptestmd"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__mmask8" name="_mm_test_epi32_mask">
+	<type>Integer</type>
+	<type>Mask</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Compare</category>
+	<parameter varname="a" type="__m128i"/>
+	<parameter varname="b" type="__m128i"/>
+	<description>Compute the bitwise AND of packed 32-bit integers in "a" and "b", producing intermediate 32-bit values, and set the corresponding bit in result mask "k" if the intermediate value is non-zero.</description>
+	<operation>
+FOR j := 0 to 3
+	i := j*32
+	k[j] := ((a[i+31:i] AND b[i+31:i]) != 0) ? 1 : 0
+ENDFOR
+k[MAX:4] := 0
+	</operation>
+	<instruction name="vptestmd"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__mmask8" name="_mm256_mask_test_epi64_mask">
+	<type>Integer</type>
+	<type>Mask</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Compare</category>
+	<parameter varname="k1" type="__mmask8"/>
+	<parameter varname="a" type="__m256i"/>
+	<parameter varname="b" type="__m256i"/>
+	<description>Compute the bitwise AND of packed 64-bit integers in "a" and "b", producing intermediate 64-bit values, and set the corresponding bit in result mask "k" (subject to writemask "k1") if the intermediate value is non-zero.</description>
+	<operation>
+FOR j := 0 to 3
+	i := j*64
+	IF k1[j]
+		k[j] := ((a[i+63:i] AND b[i+63:i]) != 0) ? 1 : 0
+	ELSE
+		k[j] := 0
+	FI
+ENDFOR
+k[MAX:4] := 0
+	</operation>
+	<instruction name="vptestmq"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__mmask8" name="_mm256_test_epi64_mask">
+	<type>Integer</type>
+	<type>Mask</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Compare</category>
+	<parameter varname="a" type="__m256i"/>
+	<parameter varname="b" type="__m256i"/>
+	<description>Compute the bitwise AND of packed 64-bit integers in "a" and "b", producing intermediate 64-bit values, and set the corresponding bit in result mask "k" if the intermediate value is non-zero.</description>
+	<operation>
+FOR j := 0 to 3
+	i := j*64
+	k[j] := ((a[i+63:i] AND b[i+63:i]) != 0) ? 1 : 0
+ENDFOR
+k[MAX:4] := 0
+	</operation>
+	<instruction name="vptestmq"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__mmask8" name="_mm_mask_test_epi64_mask">
+	<type>Integer</type>
+	<type>Mask</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Compare</category>
+	<parameter varname="k1" type="__mmask8"/>
+	<parameter varname="a" type="__m128i"/>
+	<parameter varname="b" type="__m128i"/>
+	<description>Compute the bitwise AND of packed 64-bit integers in "a" and "b", producing intermediate 64-bit values, and set the corresponding bit in result mask "k" (subject to writemask "k1") if the intermediate value is non-zero.</description>
+	<operation>
+FOR j := 0 to 1
+	i := j*64
+	IF k1[j]
+		k[j] := ((a[i+63:i] AND b[i+63:i]) != 0) ? 1 : 0
+	ELSE
+		k[j] := 0
+	FI
+ENDFOR
+k[MAX:2] := 0
+	</operation>
+	<instruction name="vptestmq"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__mmask8" name="_mm_test_epi64_mask">
+	<type>Integer</type>
+	<type>Mask</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Compare</category>
+	<parameter varname="a" type="__m128i"/>
+	<parameter varname="b" type="__m128i"/>
+	<description>Compute the bitwise AND of packed 64-bit integers in "a" and "b", producing intermediate 64-bit values, and set the corresponding bit in result mask "k" if the intermediate value is non-zero.</description>
+	<operation>
+FOR j := 0 to 1
+	i := j*64
+	k[j] := ((a[i+63:i] AND b[i+63:i]) != 0) ? 1 : 0
+ENDFOR
+k[MAX:2] := 0
+	</operation>
+	<instruction name="vptestmq"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__mmask16" name="_mm256_mask_test_epi16_mask">
+	<type>Integer</type>
+	<type>Mask</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512BW</CPUID>
+	<category>Compare</category>
+	<parameter varname="k1" type="__mmask16"/>
+	<parameter varname="a" type="__m256i"/>
+	<parameter varname="b" type="__m256i"/>
+	<description>Compute the bitwise AND of packed 16-bit integers in "a" and "b", producing intermediate 16-bit values, and set the corresponding bit in result mask "k" (subject to writemask "k1") if the intermediate value is non-zero.</description>
+	<operation>
+FOR j := 0 to 15
+	i := j*16
+	IF k1[j]
+		k[j] := ((a[i+15:i] AND b[i+15:i]) != 0) ? 1 : 0
+	ELSE
+		k[j] := 0
+	FI
+ENDFOR
+k[MAX:16] := 0
+	</operation>
+	<instruction name="vptestmw"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__mmask16" name="_mm256_test_epi16_mask">
+	<type>Integer</type>
+	<type>Mask</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512BW</CPUID>
+	<category>Compare</category>
+	<parameter varname="a" type="__m256i"/>
+	<parameter varname="b" type="__m256i"/>
+	<description>Compute the bitwise AND of packed 16-bit integers in "a" and "b", producing intermediate 16-bit values, and set the corresponding bit in result mask "k" if the intermediate value is non-zero.</description>
+	<operation>
+FOR j := 0 to 15
+	i := j*16
+	k[j] := ((a[i+15:i] AND b[i+15:i]) != 0) ? 1 : 0
+ENDFOR
+k[MAX:16] := 0
+	</operation>
+	<instruction name="vptestmw"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__mmask32" name="_mm512_mask_test_epi16_mask">
+	<type>Integer</type>
+	<type>Mask</type>
+	<CPUID>AVX512BW</CPUID>
+	<category>Compare</category>
+	<parameter varname="k1" type="__mmask32"/>
+	<parameter varname="a" type="__m512i"/>
+	<parameter varname="b" type="__m512i"/>
+	<description>Compute the bitwise AND of packed 16-bit integers in "a" and "b", producing intermediate 16-bit values, and set the corresponding bit in result mask "k" (subject to writemask "k1") if the intermediate value is non-zero.</description>
+	<operation>
+FOR j := 0 to 31
+	i := j*16
+	IF k1[j]
+		k[j] := ((a[i+15:i] AND b[i+15:i]) != 0) ? 1 : 0
+	ELSE
+		k[j] := 0
+	FI
+ENDFOR
+k[MAX:32] := 0
+	</operation>
+	<instruction name="vptestmw"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__mmask32" name="_mm512_test_epi16_mask">
+	<type>Integer</type>
+	<type>Mask</type>
+	<CPUID>AVX512BW</CPUID>
+	<category>Compare</category>
+	<parameter varname="a" type="__m512i"/>
+	<parameter varname="b" type="__m512i"/>
+	<description>Compute the bitwise AND of packed 16-bit integers in "a" and "b", producing intermediate 16-bit values, and set the corresponding bit in result mask "k" if the intermediate value is non-zero.</description>
+	<operation>
+FOR j := 0 to 31
+	i := j*16
+	k[j] := ((a[i+15:i] AND b[i+15:i]) != 0) ? 1 : 0
+ENDFOR
+k[MAX:32] := 0
+	</operation>
+	<instruction name="vptestmw"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__mmask8" name="_mm_mask_test_epi16_mask">
+	<type>Integer</type>
+	<type>Mask</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512BW</CPUID>
+	<category>Compare</category>
+	<parameter varname="k1" type="__mmask8"/>
+	<parameter varname="a" type="__m128i"/>
+	<parameter varname="b" type="__m128i"/>
+	<description>Compute the bitwise AND of packed 16-bit integers in "a" and "b", producing intermediate 16-bit values, and set the corresponding bit in result mask "k" (subject to writemask "k1") if the intermediate value is non-zero.</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*16
+	IF k1[j]
+		k[j] := ((a[i+15:i] AND b[i+15:i]) != 0) ? 1 : 0
+	ELSE
+		k[j] := 0
+	FI
+ENDFOR
+k[MAX:8] := 0
+	</operation>
+	<instruction name="vptestmw"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__mmask8" name="_mm_test_epi16_mask">
+	<type>Integer</type>
+	<type>Mask</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512BW</CPUID>
+	<category>Compare</category>
+	<parameter varname="a" type="__m128i"/>
+	<parameter varname="b" type="__m128i"/>
+	<description>Compute the bitwise AND of packed 16-bit integers in "a" and "b", producing intermediate 16-bit values, and set the corresponding bit in result mask "k" if the intermediate value is non-zero.</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*16
+	k[j] := ((a[i+15:i] AND b[i+15:i]) != 0) ? 1 : 0
+ENDFOR
+k[MAX:8] := 0
+	</operation>
+	<instruction name="vptestmw"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__mmask32" name="_mm256_mask_testn_epi8_mask">
+	<type>Integer</type>
+	<type>Mask</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512BW</CPUID>
+	<category>Compare</category>
+	<parameter varname="k1" type="__mmask32"/>
+	<parameter varname="a" type="__m256i"/>
+	<parameter varname="b" type="__m256i"/>
+	<description>Compute the bitwise AND of packed 8-bit integers in "a" and "b", producing intermediate 8-bit values, and set the corresponding bit in result mask "k" (subject to writemask "k1") if the intermediate value is zero.</description>
+	<operation>
+FOR j := 0 to 31
+	i := j*8
+	IF k1[j]
+		k[j] := ((a[i+7:i] AND b[i+7:i]) == 0) ? 1 : 0
+	ELSE
+		k[j] := 0
+	FI
+ENDFOR
+k[MAX:32] := 0
+	</operation>
+	<instruction name="vptestnmb"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__mmask32" name="_mm256_testn_epi8_mask">
+	<type>Integer</type>
+	<type>Mask</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512BW</CPUID>
+	<category>Compare</category>
+	<parameter varname="a" type="__m256i"/>
+	<parameter varname="b" type="__m256i"/>
+	<description>Compute the bitwise AND of packed 8-bit integers in "a" and "b", producing intermediate 8-bit values, and set the corresponding bit in result mask "k" if the intermediate value is zero.</description>
+	<operation>
+FOR j := 0 to 31
+	i := j*8
+	k[j] := ((a[i+7:i] AND b[i+7:i]) == 0) ? 1 : 0
+ENDFOR
+k[MAX:32] := 0
+	</operation>
+	<instruction name="vptestnmb"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__mmask64" name="_mm512_mask_testn_epi8_mask">
+	<type>Integer</type>
+	<type>Mask</type>
+	<CPUID>AVX512BW</CPUID>
+	<category>Compare</category>
+	<parameter varname="k1" type="__mmask64"/>
+	<parameter varname="a" type="__m512i"/>
+	<parameter varname="b" type="__m512i"/>
+	<description>Compute the bitwise AND of packed 8-bit integers in "a" and "b", producing intermediate 8-bit values, and set the corresponding bit in result mask "k" (subject to writemask "k1") if the intermediate value is zero.</description>
+	<operation>
+FOR j := 0 to 63
+	i := j*8
+	IF k1[j]
+		k[j] := ((a[i+7:i] AND b[i+7:i]) == 0) ? 1 : 0
+	ELSE
+		k[j] := 0
+	FI
+ENDFOR
+k[MAX:64] := 0
+	</operation>
+	<instruction name="vptestnmb"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__mmask64" name="_mm512_testn_epi8_mask">
+	<type>Integer</type>
+	<type>Mask</type>
+	<CPUID>AVX512BW</CPUID>
+	<category>Compare</category>
+	<parameter varname="a" type="__m512i"/>
+	<parameter varname="b" type="__m512i"/>
+	<description>Compute the bitwise AND of packed 8-bit integers in "a" and "b", producing intermediate 8-bit values, and set the corresponding bit in result mask "k" if the intermediate value is zero.</description>
+	<operation>
+FOR j := 0 to 63
+	i := j*8
+	k[j] := ((a[i+7:i] AND b[i+7:i]) == 0) ? 1 : 0
+ENDFOR
+k[MAX:64] := 0
+	</operation>
+	<instruction name="vptestnmb"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__mmask16" name="_mm_mask_testn_epi8_mask">
+	<type>Integer</type>
+	<type>Mask</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512BW</CPUID>
+	<category>Compare</category>
+	<parameter varname="k1" type="__mmask16"/>
+	<parameter varname="a" type="__m128i"/>
+	<parameter varname="b" type="__m128i"/>
+	<description>Compute the bitwise AND of packed 8-bit integers in "a" and "b", producing intermediate 8-bit values, and set the corresponding bit in result mask "k" (subject to writemask "k1") if the intermediate value is zero.</description>
+	<operation>
+FOR j := 0 to 15
+	i := j*8
+	IF k1[j]
+		k[j] := ((a[i+7:i] AND b[i+7:i]) == 0) ? 1 : 0
+	ELSE
+		k[j] := 0
+	FI
+ENDFOR
+k[MAX:16] := 0
+	</operation>
+	<instruction name="vptestnmb"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__mmask16" name="_mm_testn_epi8_mask">
+	<type>Integer</type>
+	<type>Mask</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512BW</CPUID>
+	<category>Compare</category>
+	<parameter varname="a" type="__m128i"/>
+	<parameter varname="b" type="__m128i"/>
+	<description>Compute the bitwise AND of packed 8-bit integers in "a" and "b", producing intermediate 8-bit values, and set the corresponding bit in result mask "k" if the intermediate value is zero.</description>
+	<operation>
+FOR j := 0 to 15
+	i := j*8
+	k[j] := ((a[i+7:i] AND b[i+7:i]) == 0) ? 1 : 0
+ENDFOR
+k[MAX:16] := 0
+	</operation>
+	<instruction name="vptestnmb"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__mmask8" name="_mm256_mask_testn_epi32_mask">
+	<type>Integer</type>
+	<type>Mask</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Compare</category>
+	<parameter varname="k1" type="__mmask8"/>
+	<parameter varname="a" type="__m256i"/>
+	<parameter varname="b" type="__m256i"/>
+	<description>Compute the bitwise AND of packed 32-bit integers in "a" and "b", producing intermediate 32-bit values, and set the corresponding bit in result mask "k" (subject to writemask "k1") if the intermediate value is zero.</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*32
+	IF k1[j]
+		k[j] := ((a[i+31:i] AND b[i+31:i]) == 0) ? 1 : 0
+	ELSE
+		k[j] := 0
+	FI
+ENDFOR
+k[MAX:8] := 0
+	</operation>
+	<instruction name="vptestnmd"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__mmask8" name="_mm256_testn_epi32_mask">
+	<type>Integer</type>
+	<type>Mask</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Compare</category>
+	<parameter varname="a" type="__m256i"/>
+	<parameter varname="b" type="__m256i"/>
+	<description>Compute the bitwise AND of packed 32-bit integers in "a" and "b", producing intermediate 32-bit values, and set the corresponding bit in result mask "k" if the intermediate value is zero.</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*32
+	k[j] := ((a[i+31:i] AND b[i+31:i]) == 0) ? 1 : 0
+ENDFOR
+k[MAX:8] := 0
+	</operation>
+	<instruction name="vptestnmd"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__mmask8" name="_mm_mask_testn_epi32_mask">
+	<type>Integer</type>
+	<type>Mask</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Compare</category>
+	<parameter varname="k1" type="__mmask8"/>
+	<parameter varname="a" type="__m128i"/>
+	<parameter varname="b" type="__m128i"/>
+	<description>Compute the bitwise AND of packed 32-bit integers in "a" and "b", producing intermediate 32-bit values, and set the corresponding bit in result mask "k" (subject to writemask "k1") if the intermediate value is zero.</description>
+	<operation>
+FOR j := 0 to 3
+	i := j*32
+	IF k1[j]
+		k[j] := ((a[i+31:i] AND b[i+31:i]) == 0) ? 1 : 0
+	ELSE
+		k[j] := 0
+	FI
+ENDFOR
+k[MAX:4] := 0
+	</operation>
+	<instruction name="vptestnmd"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__mmask8" name="_mm_testn_epi32_mask">
+	<type>Integer</type>
+	<type>Mask</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Compare</category>
+	<parameter varname="a" type="__m128i"/>
+	<parameter varname="b" type="__m128i"/>
+	<description>Compute the bitwise AND of packed 32-bit integers in "a" and "b", producing intermediate 32-bit values, and set the corresponding bit in result mask "k" if the intermediate value is zero.</description>
+	<operation>
+FOR j := 0 to 3
+	i := j*32
+	k[j] := ((a[i+31:i] AND b[i+31:i]) == 0) ? 1 : 0
+ENDFOR
+k[MAX:4] := 0
+	</operation>
+	<instruction name="vptestnmd"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__mmask8" name="_mm256_mask_testn_epi64_mask">
+	<type>Integer</type>
+	<type>Mask</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Compare</category>
+	<parameter varname="k1" type="__mmask8"/>
+	<parameter varname="a" type="__m256i"/>
+	<parameter varname="b" type="__m256i"/>
+	<description>Compute the bitwise AND of packed 64-bit integers in "a" and "b", producing intermediate 64-bit values, and set the corresponding bit in result mask "k" (subject to writemask "k1") if the intermediate value is zero.</description>
+	<operation>
+FOR j := 0 to 3
+	i := j*64
+	IF k1[j]
+		k[j] := ((a[i+63:i] AND b[i+63:i]) == 0) ? 1 : 0
+	ELSE
+		k[j] := 0
+	FI
+ENDFOR
+k[MAX:4] := 0
+	</operation>
+	<instruction name="vptestnmq"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__mmask8" name="_mm256_testn_epi64_mask">
+	<type>Integer</type>
+	<type>Mask</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Compare</category>
+	<parameter varname="a" type="__m256i"/>
+	<parameter varname="b" type="__m256i"/>
+	<description>Compute the bitwise AND of packed 64-bit integers in "a" and "b", producing intermediate 64-bit values, and set the corresponding bit in result mask "k" if the intermediate value is zero.</description>
+	<operation>
+FOR j := 0 to 3
+	i := j*64
+	k[j] := ((a[i+63:i] AND b[i+63:i]) == 0) ? 1 : 0
+ENDFOR
+k[MAX:4] := 0
+	</operation>
+	<instruction name="vptestnmq"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__mmask8" name="_mm_mask_testn_epi64_mask">
+	<type>Integer</type>
+	<type>Mask</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Compare</category>
+	<parameter varname="k1" type="__mmask8"/>
+	<parameter varname="a" type="__m128i"/>
+	<parameter varname="b" type="__m128i"/>
+	<description>Compute the bitwise AND of packed 64-bit integers in "a" and "b", producing intermediate 64-bit values, and set the corresponding bit in result mask "k" (subject to writemask "k1") if the intermediate value is zero.</description>
+	<operation>
+FOR j := 0 to 1
+	i := j*64
+	IF k1[j]
+		k[j] := ((a[i+63:i] AND b[i+63:i]) == 0) ? 1 : 0
+	ELSE
+		k[j] := 0
+	FI
+ENDFOR
+k[MAX:2] := 0
+	</operation>
+	<instruction name="vptestnmq"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__mmask8" name="_mm_testn_epi64_mask">
+	<type>Integer</type>
+	<type>Mask</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Compare</category>
+	<parameter varname="a" type="__m128i"/>
+	<parameter varname="b" type="__m128i"/>
+	<description>Compute the bitwise AND of packed 64-bit integers in "a" and "b", producing intermediate 64-bit values, and set the corresponding bit in result mask "k" if the intermediate value is zero.</description>
+	<operation>
+FOR j := 0 to 1
+	i := j*64
+	k[j] := ((a[i+63:i] AND b[i+63:i]) == 0) ? 1 : 0
+ENDFOR
+k[MAX:2] := 0
+	</operation>
+	<instruction name="vptestnmq"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__mmask16" name="_mm256_mask_testn_epi16_mask">
+	<type>Integer</type>
+	<type>Mask</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512BW</CPUID>
+	<category>Compare</category>
+	<parameter varname="k1" type="__mmask16"/>
+	<parameter varname="a" type="__m256i"/>
+	<parameter varname="b" type="__m256i"/>
+	<description>Compute the bitwise AND of packed 16-bit integers in "a" and "b", producing intermediate 16-bit values, and set the corresponding bit in result mask "k" (subject to writemask "k1") if the intermediate value is zero.</description>
+	<operation>
+FOR j := 0 to 15
+	i := j*16
+	IF k1[j]
+		k[j] := ((a[i+15:i] AND b[i+15:i]) == 0) ? 1 : 0
+	ELSE
+		k[j] := 0
+	FI
+ENDFOR
+k[MAX:16] := 0
+	</operation>
+	<instruction name="vptestnmw"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__mmask16" name="_mm256_testn_epi16_mask">
+	<type>Integer</type>
+	<type>Mask</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512BW</CPUID>
+	<category>Compare</category>
+	<parameter varname="a" type="__m256i"/>
+	<parameter varname="b" type="__m256i"/>
+	<description>Compute the bitwise AND of packed 16-bit integers in "a" and "b", producing intermediate 16-bit values, and set the corresponding bit in result mask "k" if the intermediate value is zero.</description>
+	<operation>
+FOR j := 0 to 15
+	i := j*16
+	k[j] := ((a[i+15:i] AND b[i+15:i]) == 0) ? 1 : 0
+ENDFOR
+k[MAX:16] := 0
+	</operation>
+	<instruction name="vptestnmw"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__mmask32" name="_mm512_mask_testn_epi16_mask">
+	<type>Integer</type>
+	<type>Mask</type>
+	<CPUID>AVX512BW</CPUID>
+	<category>Compare</category>
+	<parameter varname="k1" type="__mmask32"/>
+	<parameter varname="a" type="__m512i"/>
+	<parameter varname="b" type="__m512i"/>
+	<description>Compute the bitwise AND of packed 16-bit integers in "a" and "b", producing intermediate 16-bit values, and set the corresponding bit in result mask "k" (subject to writemask "k1") if the intermediate value is zero.</description>
+	<operation>
+FOR j := 0 to 31
+	i := j*16
+	IF k1[j]
+		k[j] := ((a[i+15:i] AND b[i+15:i]) == 0) ? 1 : 0
+	ELSE
+		k[j] := 0
+	FI
+ENDFOR
+k[MAX:32] := 0
+	</operation>
+	<instruction name="vptestnmw"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__mmask32" name="_mm512_testn_epi16_mask">
+	<type>Integer</type>
+	<type>Mask</type>
+	<CPUID>AVX512BW</CPUID>
+	<category>Compare</category>
+	<parameter varname="a" type="__m512i"/>
+	<parameter varname="b" type="__m512i"/>
+	<description>Compute the bitwise AND of packed 16-bit integers in "a" and "b", producing intermediate 16-bit values, and set the corresponding bit in result mask "k" if the intermediate value is zero.</description>
+	<operation>
+FOR j := 0 to 31
+	i := j*16
+	k[j] := ((a[i+15:i] AND b[i+15:i]) == 0) ? 1 : 0
+ENDFOR
+k[MAX:32] := 0
+	</operation>
+	<instruction name="vptestnmw"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__mmask8" name="_mm_mask_testn_epi16_mask">
+	<type>Integer</type>
+	<type>Mask</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512BW</CPUID>
+	<category>Compare</category>
+	<parameter varname="k1" type="__mmask8"/>
+	<parameter varname="a" type="__m128i"/>
+	<parameter varname="b" type="__m128i"/>
+	<description>Compute the bitwise AND of packed 16-bit integers in "a" and "b", producing intermediate 16-bit values, and set the corresponding bit in result mask "k" (subject to writemask "k1") if the intermediate value is zero.</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*16
+	IF k1[j]
+		k[j] := ((a[i+15:i] AND b[i+15:i]) == 0) ? 1 : 0
+	ELSE
+		k[j] := 0
+	FI
+ENDFOR
+k[MAX:8] := 0
+	</operation>
+	<instruction name="vptestnmw"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__mmask8" name="_mm_testn_epi16_mask">
+	<type>Integer</type>
+	<type>Mask</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512BW</CPUID>
+	<category>Compare</category>
+	<parameter varname="a" type="__m128i"/>
+	<parameter varname="b" type="__m128i"/>
+	<description>Compute the bitwise AND of packed 16-bit integers in "a" and "b", producing intermediate 16-bit values, and set the corresponding bit in result mask "k" if the intermediate value is zero.</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*16
+	k[j] := ((a[i+15:i] AND b[i+15:i]) == 0) ? 1 : 0
+ENDFOR
+k[MAX:8] := 0
+	</operation>
+	<instruction name="vptestnmw"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256i" name="_mm256_mask_unpackhi_epi8">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512BW</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="src" type="__m256i"/>
+	<parameter varname="k" type="__mmask32"/>
+	<parameter varname="a" type="__m256i"/>
+	<parameter varname="b" type="__m256i"/>
+	<description>Unpack and interleave 8-bit integers from the high half of each 128-bit lane in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). </description>
+	<operation>
+INTERLEAVE_HIGH_BYTES(src1[127:0], src2[127:0]){
+	dst[7:0] := src1[71:64] 
+	dst[15:8] := src2[71:64] 
+	dst[23:16] := src1[79:72] 
+	dst[31:24] := src2[79:72] 
+	dst[39:32] := src1[87:80] 
+	dst[47:40] := src2[87:80] 
+	dst[55:48] := src1[95:88] 
+	dst[63:56] := src2[95:88] 
+	dst[71:64] := src1[103:96] 
+	dst[79:72] := src2[103:96] 
+	dst[87:80] := src1[111:104] 
+	dst[95:88] := src2[111:104] 
+	dst[103:96] := src1[119:112] 
+	dst[111:104] := src2[119:112] 
+	dst[119:112] := src1[127:120] 
+	dst[127:120] := src2[127:120] 
+	RETURN dst[127:0]
+}	
+
+tmp_dst[127:0] := INTERLEAVE_HIGH_BYTES(a[127:0], b[127:0])
+tmp_dst[255:128] := INTERLEAVE_HIGH_BYTES(a[255:128], b[255:128])
+
+FOR j := 0 to 31
+	i := j*8
+	IF k[j]
+		dst[i+7:i] := tmp_dst[i+7:i]
+	ELSE
+		dst[i+7:i] := src[i+7:i]
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vpunpckhbw"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256i" name="_mm256_maskz_unpackhi_epi8">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512BW</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="k" type="__mmask32"/>
+	<parameter varname="a" type="__m256i"/>
+	<parameter varname="b" type="__m256i"/>
+	<description>Unpack and interleave 8-bit integers from the high half of each 128-bit lane in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). </description>
+	<operation>
+INTERLEAVE_HIGH_BYTES(src1[127:0], src2[127:0]){
+	dst[7:0] := src1[71:64] 
+	dst[15:8] := src2[71:64] 
+	dst[23:16] := src1[79:72] 
+	dst[31:24] := src2[79:72] 
+	dst[39:32] := src1[87:80] 
+	dst[47:40] := src2[87:80] 
+	dst[55:48] := src1[95:88] 
+	dst[63:56] := src2[95:88] 
+	dst[71:64] := src1[103:96] 
+	dst[79:72] := src2[103:96] 
+	dst[87:80] := src1[111:104] 
+	dst[95:88] := src2[111:104] 
+	dst[103:96] := src1[119:112] 
+	dst[111:104] := src2[119:112] 
+	dst[119:112] := src1[127:120] 
+	dst[127:120] := src2[127:120] 
+	RETURN dst[127:0]
+}	
+
+tmp_dst[127:0] := INTERLEAVE_HIGH_BYTES(a[127:0], b[127:0])
+tmp_dst[255:128] := INTERLEAVE_HIGH_BYTES(a[255:128], b[255:128])
+
+FOR j := 0 to 31
+	i := j*8
+	IF k[j]
+		dst[i+7:i] := tmp_dst[i+7:i]
+	ELSE
+		dst[i+7:i] := 0
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vpunpckhbw"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512i" name="_mm512_mask_unpackhi_epi8">
+	<type>Integer</type>
+	<CPUID>AVX512BW</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="src" type="__m512i"/>
+	<parameter varname="k" type="__mmask64"/>
+	<parameter varname="a" type="__m512i"/>
+	<parameter varname="b" type="__m512i"/>
+	<description>Unpack and interleave 8-bit integers from the high half of each 128-bit lane in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). </description>
+	<operation>
+INTERLEAVE_HIGH_BYTES(src1[127:0], src2[127:0]){
+	dst[7:0] := src1[71:64] 
+	dst[15:8] := src2[71:64] 
+	dst[23:16] := src1[79:72] 
+	dst[31:24] := src2[79:72] 
+	dst[39:32] := src1[87:80] 
+	dst[47:40] := src2[87:80] 
+	dst[55:48] := src1[95:88] 
+	dst[63:56] := src2[95:88] 
+	dst[71:64] := src1[103:96] 
+	dst[79:72] := src2[103:96] 
+	dst[87:80] := src1[111:104] 
+	dst[95:88] := src2[111:104] 
+	dst[103:96] := src1[119:112] 
+	dst[111:104] := src2[119:112] 
+	dst[119:112] := src1[127:120] 
+	dst[127:120] := src2[127:120] 
+	RETURN dst[127:0]
+}	
+
+tmp_dst[127:0] := INTERLEAVE_HIGH_BYTES(a[127:0], b[127:0])
+tmp_dst[255:128] := INTERLEAVE_HIGH_BYTES(a[255:128], b[255:128])
+tmp_dst[383:256] := INTERLEAVE_HIGH_BYTES(a[383:256], b[383:256])
+tmp_dst[511:384] := INTERLEAVE_HIGH_BYTES(a[511:384], b[511:384])
+
+FOR j := 0 to 63
+	i := j*8
+	IF k[j]
+		dst[i+7:i] := tmp_dst[i+7:i]
+	ELSE
+		dst[i+7:i] := src[i+7:i]
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name="vpunpckhbw"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512i" name="_mm512_maskz_unpackhi_epi8">
+	<type>Integer</type>
+	<CPUID>AVX512BW</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="k" type="__mmask64"/>
+	<parameter varname="a" type="__m512i"/>
+	<parameter varname="b" type="__m512i"/>
+	<description>Unpack and interleave 8-bit integers from the high half of each 128-bit lane in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). </description>
+	<operation>
+INTERLEAVE_HIGH_BYTES(src1[127:0], src2[127:0]){
+	dst[7:0] := src1[71:64] 
+	dst[15:8] := src2[71:64] 
+	dst[23:16] := src1[79:72] 
+	dst[31:24] := src2[79:72] 
+	dst[39:32] := src1[87:80] 
+	dst[47:40] := src2[87:80] 
+	dst[55:48] := src1[95:88] 
+	dst[63:56] := src2[95:88] 
+	dst[71:64] := src1[103:96] 
+	dst[79:72] := src2[103:96] 
+	dst[87:80] := src1[111:104] 
+	dst[95:88] := src2[111:104] 
+	dst[103:96] := src1[119:112] 
+	dst[111:104] := src2[119:112] 
+	dst[119:112] := src1[127:120] 
+	dst[127:120] := src2[127:120] 
+	RETURN dst[127:0]
+}	
+
+tmp_dst[127:0] := INTERLEAVE_HIGH_BYTES(a[127:0], b[127:0])
+tmp_dst[255:128] := INTERLEAVE_HIGH_BYTES(a[255:128], b[255:128])
+tmp_dst[383:256] := INTERLEAVE_HIGH_BYTES(a[383:256], b[383:256])
+tmp_dst[511:384] := INTERLEAVE_HIGH_BYTES(a[511:384], b[511:384])
+
+FOR j := 0 to 63
+	i := j*8
+	IF k[j]
+		dst[i+7:i] := tmp_dst[i+7:i]
+	ELSE
+		dst[i+7:i] := 0
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name="vpunpckhbw"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512i" name="_mm512_unpackhi_epi8">
+	<type>Integer</type>
+	<CPUID>AVX512BW</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="a" type="__m512i"/>
+	<parameter varname="b" type="__m512i"/>
+	<description>Unpack and interleave 8-bit integers from the high half of each 128-bit lane in "a" and "b", and store the results in "dst". </description>
+	<operation>
+INTERLEAVE_HIGH_BYTES(src1[127:0], src2[127:0]){
+	dst[7:0] := src1[71:64] 
+	dst[15:8] := src2[71:64] 
+	dst[23:16] := src1[79:72] 
+	dst[31:24] := src2[79:72] 
+	dst[39:32] := src1[87:80] 
+	dst[47:40] := src2[87:80] 
+	dst[55:48] := src1[95:88] 
+	dst[63:56] := src2[95:88] 
+	dst[71:64] := src1[103:96] 
+	dst[79:72] := src2[103:96] 
+	dst[87:80] := src1[111:104] 
+	dst[95:88] := src2[111:104] 
+	dst[103:96] := src1[119:112] 
+	dst[111:104] := src2[119:112] 
+	dst[119:112] := src1[127:120] 
+	dst[127:120] := src2[127:120] 
+	RETURN dst[127:0]
+}	
+
+dst[127:0] := INTERLEAVE_HIGH_BYTES(a[127:0], b[127:0])
+dst[255:128] := INTERLEAVE_HIGH_BYTES(a[255:128], b[255:128])
+dst[383:256] := INTERLEAVE_HIGH_BYTES(a[383:256], b[383:256])
+dst[511:384] := INTERLEAVE_HIGH_BYTES(a[511:384], b[511:384])
+dst[MAX:512] := 0
+	</operation>
+	<instruction name="vpunpckhbw"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm_mask_unpackhi_epi8">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512BW</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="src" type="__m128i"/>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="a" type="__m128i"/>
+	<parameter varname="b" type="__m128i"/>
+	<description>Unpack and interleave 8-bit integers from the high half of "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). </description>
+	<operation>
+INTERLEAVE_HIGH_BYTES(src1[127:0], src2[127:0]){
+	dst[7:0] := src1[71:64] 
+	dst[15:8] := src2[71:64] 
+	dst[23:16] := src1[79:72] 
+	dst[31:24] := src2[79:72] 
+	dst[39:32] := src1[87:80] 
+	dst[47:40] := src2[87:80] 
+	dst[55:48] := src1[95:88] 
+	dst[63:56] := src2[95:88] 
+	dst[71:64] := src1[103:96] 
+	dst[79:72] := src2[103:96] 
+	dst[87:80] := src1[111:104] 
+	dst[95:88] := src2[111:104] 
+	dst[103:96] := src1[119:112] 
+	dst[111:104] := src2[119:112] 
+	dst[119:112] := src1[127:120] 
+	dst[127:120] := src2[127:120] 
+	RETURN dst[127:0]
+}	
+
+tmp_dst[127:0] := INTERLEAVE_HIGH_BYTES(a[127:0], b[127:0])
+
+FOR j := 0 to 15
+	i := j*8
+	IF k[j]
+		dst[i+7:i] := tmp_dst[i+7:i]
+	ELSE
+		dst[i+7:i] := src[i+7:i]
+	FI
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vpunpckhbw"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm_maskz_unpackhi_epi8">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512BW</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="a" type="__m128i"/>
+	<parameter varname="b" type="__m128i"/>
+	<description>Unpack and interleave 8-bit integers from the high half of "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). </description>
+	<operation>
+INTERLEAVE_HIGH_BYTES(src1[127:0], src2[127:0]){
+	dst[7:0] := src1[71:64] 
+	dst[15:8] := src2[71:64] 
+	dst[23:16] := src1[79:72] 
+	dst[31:24] := src2[79:72] 
+	dst[39:32] := src1[87:80] 
+	dst[47:40] := src2[87:80] 
+	dst[55:48] := src1[95:88] 
+	dst[63:56] := src2[95:88] 
+	dst[71:64] := src1[103:96] 
+	dst[79:72] := src2[103:96] 
+	dst[87:80] := src1[111:104] 
+	dst[95:88] := src2[111:104] 
+	dst[103:96] := src1[119:112] 
+	dst[111:104] := src2[119:112] 
+	dst[119:112] := src1[127:120] 
+	dst[127:120] := src2[127:120] 
+	RETURN dst[127:0]
+}	
+
+tmp_dst[127:0] := INTERLEAVE_HIGH_BYTES(a[127:0], b[127:0])
+
+FOR j := 0 to 15
+	i := j*8
+	IF k[j]
+		dst[i+7:i] := tmp_dst[i+7:i]
+	ELSE
+		dst[i+7:i] := 0
+	FI
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vpunpckhbw"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256i" name="_mm256_mask_unpackhi_epi32">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="src" type="__m256i"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m256i"/>
+	<parameter varname="b" type="__m256i"/>
+	<description>Unpack and interleave 32-bit integers from the high half of each 128-bit lane in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). </description>
+	<operation>
+INTERLEAVE_HIGH_DWORDS(src1[127:0], src2[127:0]){
+	dst[31:0] := src1[95:64] 
+	dst[63:32] := src2[95:64] 
+	dst[95:64] := src1[127:96] 
+	dst[127:96] := src2[127:96] 
+	RETURN dst[127:0]
+}	
+
+tmp_dst[127:0] := INTERLEAVE_HIGH_DWORDS(a[127:0], b[127:0])
+tmp_dst[255:128] := INTERLEAVE_HIGH_DWORDS(a[255:128], b[255:128])
+
+FOR j := 0 to 7
+	i := j*32
+	IF k[j]
+		dst[i+31:i] := tmp_dst[i+31:i]
+	ELSE
+		dst[i+31:i] := src[i+31:i]
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vpunpckhdq"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256i" name="_mm256_maskz_unpackhi_epi32">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m256i"/>
+	<parameter varname="b" type="__m256i"/>
+	<description>Unpack and interleave 32-bit integers from the high half of each 128-bit lane in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). </description>
+	<operation>
+INTERLEAVE_HIGH_DWORDS(src1[127:0], src2[127:0]){
+	dst[31:0] := src1[95:64] 
+	dst[63:32] := src2[95:64] 
+	dst[95:64] := src1[127:96] 
+	dst[127:96] := src2[127:96] 
+	RETURN dst[127:0]
+}	
+
+tmp_dst[127:0] := INTERLEAVE_HIGH_DWORDS(a[127:0], b[127:0])
+tmp_dst[255:128] := INTERLEAVE_HIGH_DWORDS(a[255:128], b[255:128])
+
+FOR j := 0 to 7
+	i := j*32
+	IF k[j]
+		dst[i+31:i] := tmp_dst[i+31:i]
+	ELSE
+		dst[i+31:i] := 0
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vpunpckhdq"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm_mask_unpackhi_epi32">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="src" type="__m128i"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128i"/>
+	<parameter varname="b" type="__m128i"/>
+	<description>Unpack and interleave 32-bit integers from the high half of "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). </description>
+	<operation>
+INTERLEAVE_HIGH_DWORDS(src1[127:0], src2[127:0]){
+	dst[31:0] := src1[95:64] 
+	dst[63:32] := src2[95:64] 
+	dst[95:64] := src1[127:96] 
+	dst[127:96] := src2[127:96] 
+	RETURN dst[127:0]
+}	
+
+tmp_dst[127:0] := INTERLEAVE_HIGH_DWORDS(a[127:0], b[127:0])
+
+FOR j := 0 to 3
+	i := j*32
+	IF k[j]
+		dst[i+31:i] := tmp_dst[i+31:i]
+	ELSE
+		dst[i+31:i] := src[i+31:i]
+	FI
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vpunpckhdq"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm_maskz_unpackhi_epi32">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128i"/>
+	<parameter varname="b" type="__m128i"/>
+	<description>Unpack and interleave 32-bit integers from the high half of "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). </description>
+	<operation>
+INTERLEAVE_HIGH_DWORDS(src1[127:0], src2[127:0]){
+	dst[31:0] := src1[95:64] 
+	dst[63:32] := src2[95:64] 
+	dst[95:64] := src1[127:96] 
+	dst[127:96] := src2[127:96] 
+	RETURN dst[127:0]
+}	
+
+tmp_dst[127:0] := INTERLEAVE_HIGH_DWORDS(a[127:0], b[127:0])
+
+FOR j := 0 to 3
+	i := j*32
+	IF k[j]
+		dst[i+31:i] := tmp_dst[i+31:i]
+	ELSE
+		dst[i+31:i] := 0
+	FI
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vpunpckhdq"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256i" name="_mm256_mask_unpackhi_epi64">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="src" type="__m256i"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m256i"/>
+	<parameter varname="b" type="__m256i"/>
+	<description>Unpack and interleave 64-bit integers from the high half of each 128-bit lane in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). </description>
+	<operation>
+INTERLEAVE_HIGH_QWORDS(src1[127:0], src2[127:0]){
+	dst[63:0] := src1[127:64] 
+	dst[127:64] := src2[127:64] 
+	RETURN dst[127:0]
+}	
+
+tmp_dst[127:0] := INTERLEAVE_HIGH_QWORDS(a[127:0], b[127:0])
+tmp_dst[255:128] := INTERLEAVE_HIGH_QWORDS(a[255:128], b[255:128])
+
+FOR j := 0 to 3
+	i := j*64
+	IF k[j]
+		dst[i+63:i] := tmp_dst[i+63:i]
+	ELSE
+		dst[i+63:i] := src[i+63:i]
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vpunpckhqdq"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256i" name="_mm256_maskz_unpackhi_epi64">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m256i"/>
+	<parameter varname="b" type="__m256i"/>
+	<description>Unpack and interleave 64-bit integers from the high half of each 128-bit lane in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). </description>
+	<operation>
+INTERLEAVE_HIGH_QWORDS(src1[127:0], src2[127:0]){
+	dst[63:0] := src1[127:64] 
+	dst[127:64] := src2[127:64] 
+	RETURN dst[127:0]
+}	
+
+tmp_dst[127:0] := INTERLEAVE_HIGH_QWORDS(a[127:0], b[127:0])
+tmp_dst[255:128] := INTERLEAVE_HIGH_QWORDS(a[255:128], b[255:128])
+
+FOR j := 0 to 3
+	i := j*64
+	IF k[j]
+		dst[i+63:i] := tmp_dst[i+63:i]
+	ELSE
+		dst[i+63:i] := 0
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vpunpckhqdq"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm_mask_unpackhi_epi64">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="src" type="__m128i"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128i"/>
+	<parameter varname="b" type="__m128i"/>
+	<description>Unpack and interleave 64-bit integers from the high half of "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). </description>
+	<operation>
+INTERLEAVE_HIGH_QWORDS(src1[127:0], src2[127:0]){
+	dst[63:0] := src1[127:64] 
+	dst[127:64] := src2[127:64] 
+	RETURN dst[127:0]
+}	
+
+tmp_dst[127:0] := INTERLEAVE_HIGH_QWORDS(a[127:0], b[127:0])
+
+FOR j := 0 to 1
+	i := j*64
+	IF k[j]
+		dst[i+63:i] := tmp_dst[i+63:i]
+	ELSE
+		dst[i+63:i] := src[i+63:i]
+	FI
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vpunpckhqdq"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm_maskz_unpackhi_epi64">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128i"/>
+	<parameter varname="b" type="__m128i"/>
+	<description>Unpack and interleave 64-bit integers from the high half of "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). </description>
+	<operation>
+INTERLEAVE_HIGH_QWORDS(src1[127:0], src2[127:0]){
+	dst[63:0] := src1[127:64] 
+	dst[127:64] := src2[127:64] 
+	RETURN dst[127:0]
+}	
+
+tmp_dst[127:0] := INTERLEAVE_HIGH_QWORDS(a[127:0], b[127:0])
+
+FOR j := 0 to 1
+	i := j*64
+	IF k[j]
+		dst[i+63:i] := tmp_dst[i+63:i]
+	ELSE
+		dst[i+63:i] := 0
+	FI
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vpunpckhqdq"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256i" name="_mm256_mask_unpackhi_epi16">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512BW</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="src" type="__m256i"/>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="a" type="__m256i"/>
+	<parameter varname="b" type="__m256i"/>
+	<description>Unpack and interleave 16-bit integers from the high half of each 128-bit lane in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). </description>
+	<operation>
+INTERLEAVE_HIGH_WORDS(src1[127:0], src2[127:0]){
+	dst[15:0] := src1[79:64]
+	dst[31:16] := src2[79:64] 
+	dst[47:32] := src1[95:80] 
+	dst[63:48] := src2[95:80] 
+	dst[79:64] := src1[111:96] 
+	dst[95:80] := src2[111:96] 
+	dst[111:96] := src1[127:112] 
+	dst[127:112] := src2[127:112] 
+	RETURN dst[127:0]
+}
+
+tmp_dst[127:0] := INTERLEAVE_HIGH_WORDS(a[127:0], b[127:0])
+tmp_dst[255:128] := INTERLEAVE_HIGH_WORDS(a[255:128], b[255:128])
+
+FOR j := 0 to 15
+	i := j*16
+	IF k[j]
+		dst[i+15:i] := tmp_dst[i+15:i]
+	ELSE
+		dst[i+15:i] := src[i+15:i]
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vpunpckhwd"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256i" name="_mm256_maskz_unpackhi_epi16">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512BW</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="a" type="__m256i"/>
+	<parameter varname="b" type="__m256i"/>
+	<description>Unpack and interleave 16-bit integers from the high half of each 128-bit lane in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). </description>
+	<operation>
+INTERLEAVE_HIGH_WORDS(src1[127:0], src2[127:0]){
+	dst[15:0] := src1[79:64]
+	dst[31:16] := src2[79:64] 
+	dst[47:32] := src1[95:80] 
+	dst[63:48] := src2[95:80] 
+	dst[79:64] := src1[111:96] 
+	dst[95:80] := src2[111:96] 
+	dst[111:96] := src1[127:112] 
+	dst[127:112] := src2[127:112] 
+	RETURN dst[127:0]
+}
+
+tmp_dst[127:0] := INTERLEAVE_HIGH_WORDS(a[127:0], b[127:0])
+tmp_dst[255:128] := INTERLEAVE_HIGH_WORDS(a[255:128], b[255:128])
+
+FOR j := 0 to 15
+	i := j*16
+	IF k[j]
+		dst[i+15:i] := tmp_dst[i+15:i]
+	ELSE
+		dst[i+15:i] := 0
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vpunpckhwd"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512i" name="_mm512_mask_unpackhi_epi16">
+	<type>Integer</type>
+	<CPUID>AVX512BW</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="src" type="__m512i"/>
+	<parameter varname="k" type="__mmask32"/>
+	<parameter varname="a" type="__m512i"/>
+	<parameter varname="b" type="__m512i"/>
+	<description>Unpack and interleave 16-bit integers from the high half of each 128-bit lane in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). </description>
+	<operation>
+INTERLEAVE_HIGH_WORDS(src1[127:0], src2[127:0]){
+	dst[15:0] := src1[79:64]
+	dst[31:16] := src2[79:64] 
+	dst[47:32] := src1[95:80] 
+	dst[63:48] := src2[95:80] 
+	dst[79:64] := src1[111:96] 
+	dst[95:80] := src2[111:96] 
+	dst[111:96] := src1[127:112] 
+	dst[127:112] := src2[127:112] 
+	RETURN dst[127:0]
+}
+
+tmp_dst[127:0] := INTERLEAVE_HIGH_WORDS(a[127:0], b[127:0])
+tmp_dst[255:128] := INTERLEAVE_HIGH_WORDS(a[255:128], b[255:128])
+tmp_dst[383:256] := INTERLEAVE_HIGH_WORDS(a[383:256], b[383:256])
+tmp_dst[511:384] := INTERLEAVE_HIGH_WORDS(a[511:384], b[511:384])
+
+FOR j := 0 to 31
+	i := j*16
+	IF k[j]
+		dst[i+15:i] := tmp_dst[i+15:i]
+	ELSE
+		dst[i+15:i] := src[i+15:i]
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name="vpunpckhwd"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512i" name="_mm512_maskz_unpackhi_epi16">
+	<type>Integer</type>
+	<CPUID>AVX512BW</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="k" type="__mmask32"/>
+	<parameter varname="a" type="__m512i"/>
+	<parameter varname="b" type="__m512i"/>
+	<description>Unpack and interleave 16-bit integers from the high half of each 128-bit lane in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). </description>
+	<operation>
+INTERLEAVE_HIGH_WORDS(src1[127:0], src2[127:0]){
+	dst[15:0] := src1[79:64]
+	dst[31:16] := src2[79:64] 
+	dst[47:32] := src1[95:80] 
+	dst[63:48] := src2[95:80] 
+	dst[79:64] := src1[111:96] 
+	dst[95:80] := src2[111:96] 
+	dst[111:96] := src1[127:112] 
+	dst[127:112] := src2[127:112] 
+	RETURN dst[127:0]
+}
+
+tmp_dst[127:0] := INTERLEAVE_HIGH_WORDS(a[127:0], b[127:0])
+tmp_dst[255:128] := INTERLEAVE_HIGH_WORDS(a[255:128], b[255:128])
+tmp_dst[383:256] := INTERLEAVE_HIGH_WORDS(a[383:256], b[383:256])
+tmp_dst[511:384] := INTERLEAVE_HIGH_WORDS(a[511:384], b[511:384])
+
+FOR j := 0 to 31
+	i := j*16
+	IF k[j]
+		dst[i+15:i] := tmp_dst[i+15:i]
+	ELSE
+		dst[i+15:i] := 0
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name="vpunpckhwd"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512i" name="_mm512_unpackhi_epi16">
+	<type>Integer</type>
+	<CPUID>AVX512BW</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="a" type="__m512i"/>
+	<parameter varname="b" type="__m512i"/>
+	<description>Unpack and interleave 16-bit integers from the high half of each 128-bit lane in "a" and "b", and store the results in "dst". </description>
+	<operation>
+INTERLEAVE_HIGH_WORDS(src1[127:0], src2[127:0]){
+	dst[15:0] := src1[79:64]
+	dst[31:16] := src2[79:64] 
+	dst[47:32] := src1[95:80] 
+	dst[63:48] := src2[95:80] 
+	dst[79:64] := src1[111:96] 
+	dst[95:80] := src2[111:96] 
+	dst[111:96] := src1[127:112] 
+	dst[127:112] := src2[127:112] 
+	RETURN dst[127:0]
+}
+
+dst[127:0] := INTERLEAVE_HIGH_WORDS(a[127:0], b[127:0])
+dst[255:128] := INTERLEAVE_HIGH_WORDS(a[255:128], b[255:128])
+dst[383:256] := INTERLEAVE_HIGH_WORDS(a[383:256], b[383:256])
+dst[511:384] := INTERLEAVE_HIGH_WORDS(a[511:384], b[511:384])
+dst[MAX:512] := 0
+	</operation>
+	<instruction name="vpunpckhwd"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm_mask_unpackhi_epi16">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512BW</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="src" type="__m128i"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128i"/>
+	<parameter varname="b" type="__m128i"/>
+	<description>Unpack and interleave 16-bit integers from the high half of "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). </description>
+	<operation>
+INTERLEAVE_HIGH_WORDS(src1[127:0], src2[127:0]){
+	dst[15:0] := src1[79:64]
+	dst[31:16] := src2[79:64] 
+	dst[47:32] := src1[95:80] 
+	dst[63:48] := src2[95:80] 
+	dst[79:64] := src1[111:96] 
+	dst[95:80] := src2[111:96] 
+	dst[111:96] := src1[127:112] 
+	dst[127:112] := src2[127:112] 
+	RETURN dst[127:0]
+}
+
+tmp_dst[127:0] := INTERLEAVE_HIGH_WORDS(a[127:0], b[127:0])
+
+FOR j := 0 to 7
+	i := j*16
+	IF k[j]
+		dst[i+15:i] := tmp_dst[i+15:i]
+	ELSE
+		dst[i+15:i] := src[i+15:i]
+	FI
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vpunpckhwd"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm_maskz_unpackhi_epi16">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512BW</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128i"/>
+	<parameter varname="b" type="__m128i"/>
+	<description>Unpack and interleave 16-bit integers from the high half of "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). </description>
+	<operation>
+INTERLEAVE_HIGH_WORDS(src1[127:0], src2[127:0]){
+	dst[15:0] := src1[79:64]
+	dst[31:16] := src2[79:64] 
+	dst[47:32] := src1[95:80] 
+	dst[63:48] := src2[95:80] 
+	dst[79:64] := src1[111:96] 
+	dst[95:80] := src2[111:96] 
+	dst[111:96] := src1[127:112] 
+	dst[127:112] := src2[127:112] 
+	RETURN dst[127:0]
+}
+
+tmp_dst[127:0] := INTERLEAVE_HIGH_WORDS(a[127:0], b[127:0])
+
+FOR j := 0 to 7
+	i := j*16
+	IF k[j]
+		dst[i+15:i] := tmp_dst[i+15:i]
+	ELSE
+		dst[i+15:i] := 0
+	FI
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vpunpckhwd"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256i" name="_mm256_mask_unpacklo_epi8">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512BW</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="src" type="__m256i"/>
+	<parameter varname="k" type="__mmask32"/>
+	<parameter varname="a" type="__m256i"/>
+	<parameter varname="b" type="__m256i"/>
+	<description>Unpack and interleave 8-bit integers from the low half of each 128-bit lane in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). </description>
+	<operation>
+INTERLEAVE_BYTES(src1[127:0], src2[127:0]){
+	dst[7:0] := src1[7:0] 
+	dst[15:8] := src2[7:0] 
+	dst[23:16] := src1[15:8] 
+	dst[31:24] := src2[15:8] 
+	dst[39:32] := src1[23:16] 
+	dst[47:40] := src2[23:16] 
+	dst[55:48] := src1[31:24] 
+	dst[63:56] := src2[31:24] 
+	dst[71:64] := src1[39:32]
+	dst[79:72] := src2[39:32] 
+	dst[87:80] := src1[47:40] 
+	dst[95:88] := src2[47:40] 
+	dst[103:96] := src1[55:48] 
+	dst[111:104] := src2[55:48] 
+	dst[119:112] := src1[63:56] 
+	dst[127:120] := src2[63:56] 
+	RETURN dst[127:0]
+}	
+
+tmp_dst[127:0] := INTERLEAVE_BYTES(a[127:0], b[127:0])
+tmp_dst[255:128] := INTERLEAVE_BYTES(a[255:128], b[255:128])
+
+FOR j := 0 to 31
+	i := j*8
+	IF k[j]
+		dst[i+7:i] := tmp_dst[i+7:i]
+	ELSE
+		dst[i+7:i] := src[i+7:i]
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vpunpcklbw"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256i" name="_mm256_maskz_unpacklo_epi8">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512BW</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="k" type="__mmask32"/>
+	<parameter varname="a" type="__m256i"/>
+	<parameter varname="b" type="__m256i"/>
+	<description>Unpack and interleave 8-bit integers from the low half of each 128-bit lane in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). </description>
+	<operation>
+INTERLEAVE_BYTES(src1[127:0], src2[127:0]){
+	dst[7:0] := src1[7:0] 
+	dst[15:8] := src2[7:0] 
+	dst[23:16] := src1[15:8] 
+	dst[31:24] := src2[15:8] 
+	dst[39:32] := src1[23:16] 
+	dst[47:40] := src2[23:16] 
+	dst[55:48] := src1[31:24] 
+	dst[63:56] := src2[31:24] 
+	dst[71:64] := src1[39:32]
+	dst[79:72] := src2[39:32] 
+	dst[87:80] := src1[47:40] 
+	dst[95:88] := src2[47:40] 
+	dst[103:96] := src1[55:48] 
+	dst[111:104] := src2[55:48] 
+	dst[119:112] := src1[63:56] 
+	dst[127:120] := src2[63:56] 
+	RETURN dst[127:0]
+}	
+
+tmp_dst[127:0] := INTERLEAVE_BYTES(a[127:0], b[127:0])
+tmp_dst[255:128] := INTERLEAVE_BYTES(a[255:128], b[255:128])
+
+FOR j := 0 to 31
+	i := j*8
+	IF k[j]
+		dst[i+7:i] := tmp_dst[i+7:i]
+	ELSE
+		dst[i+7:i] := 0
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vpunpcklbw"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512i" name="_mm512_mask_unpacklo_epi8">
+	<type>Integer</type>
+	<CPUID>AVX512BW</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="src" type="__m512i"/>
+	<parameter varname="k" type="__mmask64"/>
+	<parameter varname="a" type="__m512i"/>
+	<parameter varname="b" type="__m512i"/>
+	<description>Unpack and interleave 8-bit integers from the low half of each 128-bit lane in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). </description>
+	<operation>
+INTERLEAVE_BYTES(src1[127:0], src2[127:0]){
+	dst[7:0] := src1[7:0] 
+	dst[15:8] := src2[7:0] 
+	dst[23:16] := src1[15:8] 
+	dst[31:24] := src2[15:8] 
+	dst[39:32] := src1[23:16] 
+	dst[47:40] := src2[23:16] 
+	dst[55:48] := src1[31:24] 
+	dst[63:56] := src2[31:24] 
+	dst[71:64] := src1[39:32]
+	dst[79:72] := src2[39:32] 
+	dst[87:80] := src1[47:40] 
+	dst[95:88] := src2[47:40] 
+	dst[103:96] := src1[55:48] 
+	dst[111:104] := src2[55:48] 
+	dst[119:112] := src1[63:56] 
+	dst[127:120] := src2[63:56] 
+	RETURN dst[127:0]
+}	
+
+tmp_dst[127:0] := INTERLEAVE_BYTES(a[127:0], b[127:0])
+tmp_dst[255:128] := INTERLEAVE_BYTES(a[255:128], b[255:128])
+tmp_dst[383:256] := INTERLEAVE_BYTES(a[383:256], b[383:256])
+tmp_dst[511:384] := INTERLEAVE_BYTES(a[511:384], b[511:384])
+
+FOR j := 0 to 63
+	i := j*8
+	IF k[j]
+		dst[i+7:i] := tmp_dst[i+7:i]
+	ELSE
+		dst[i+7:i] := src[i+7:i]
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name="vpunpcklbw"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512i" name="_mm512_maskz_unpacklo_epi8">
+	<type>Integer</type>
+	<CPUID>AVX512BW</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="k" type="__mmask64"/>
+	<parameter varname="a" type="__m512i"/>
+	<parameter varname="b" type="__m512i"/>
+	<description>Unpack and interleave 8-bit integers from the low half of each 128-bit lane in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). </description>
+	<operation>
+INTERLEAVE_BYTES(src1[127:0], src2[127:0]){
+	dst[7:0] := src1[7:0] 
+	dst[15:8] := src2[7:0] 
+	dst[23:16] := src1[15:8] 
+	dst[31:24] := src2[15:8] 
+	dst[39:32] := src1[23:16] 
+	dst[47:40] := src2[23:16] 
+	dst[55:48] := src1[31:24] 
+	dst[63:56] := src2[31:24] 
+	dst[71:64] := src1[39:32]
+	dst[79:72] := src2[39:32] 
+	dst[87:80] := src1[47:40] 
+	dst[95:88] := src2[47:40] 
+	dst[103:96] := src1[55:48] 
+	dst[111:104] := src2[55:48] 
+	dst[119:112] := src1[63:56] 
+	dst[127:120] := src2[63:56] 
+	RETURN dst[127:0]
+}	
+
+tmp_dst[127:0] := INTERLEAVE_BYTES(a[127:0], b[127:0])
+tmp_dst[255:128] := INTERLEAVE_BYTES(a[255:128], b[255:128])
+tmp_dst[383:256] := INTERLEAVE_BYTES(a[383:256], b[383:256])
+tmp_dst[511:384] := INTERLEAVE_BYTES(a[511:384], b[511:384])
+
+FOR j := 0 to 63
+	i := j*8
+	IF k[j]
+		dst[i+7:i] := tmp_dst[i+7:i]
+	ELSE
+		dst[i+7:i] := 0
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name="vpunpcklbw"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512i" name="_mm512_unpacklo_epi8">
+	<type>Integer</type>
+	<CPUID>AVX512BW</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="a" type="__m512i"/>
+	<parameter varname="b" type="__m512i"/>
+	<description>Unpack and interleave 8-bit integers from the low half of each 128-bit lane in "a" and "b", and store the results in "dst". </description>
+	<operation>
+INTERLEAVE_BYTES(src1[127:0], src2[127:0]){
+	dst[7:0] := src1[7:0] 
+	dst[15:8] := src2[7:0] 
+	dst[23:16] := src1[15:8] 
+	dst[31:24] := src2[15:8] 
+	dst[39:32] := src1[23:16] 
+	dst[47:40] := src2[23:16] 
+	dst[55:48] := src1[31:24] 
+	dst[63:56] := src2[31:24] 
+	dst[71:64] := src1[39:32]
+	dst[79:72] := src2[39:32] 
+	dst[87:80] := src1[47:40] 
+	dst[95:88] := src2[47:40] 
+	dst[103:96] := src1[55:48] 
+	dst[111:104] := src2[55:48] 
+	dst[119:112] := src1[63:56] 
+	dst[127:120] := src2[63:56] 
+	RETURN dst[127:0]
+}	
+
+dst[127:0] := INTERLEAVE_BYTES(a[127:0], b[127:0])
+dst[255:128] := INTERLEAVE_BYTES(a[255:128], b[255:128])
+dst[383:256] := INTERLEAVE_BYTES(a[383:256], b[383:256])
+dst[511:384] := INTERLEAVE_BYTES(a[511:384], b[511:384])
+dst[MAX:512] := 0
+	</operation>
+	<instruction name="vpunpcklbw"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm_mask_unpacklo_epi8">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512BW</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="src" type="__m128i"/>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="a" type="__m128i"/>
+	<parameter varname="b" type="__m128i"/>
+	<description>Unpack and interleave 8-bit integers from the low half of "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). </description>
+	<operation>
+INTERLEAVE_BYTES(src1[127:0], src2[127:0]){
+	dst[7:0] := src1[7:0] 
+	dst[15:8] := src2[7:0] 
+	dst[23:16] := src1[15:8] 
+	dst[31:24] := src2[15:8] 
+	dst[39:32] := src1[23:16] 
+	dst[47:40] := src2[23:16] 
+	dst[55:48] := src1[31:24] 
+	dst[63:56] := src2[31:24] 
+	dst[71:64] := src1[39:32]
+	dst[79:72] := src2[39:32] 
+	dst[87:80] := src1[47:40] 
+	dst[95:88] := src2[47:40] 
+	dst[103:96] := src1[55:48] 
+	dst[111:104] := src2[55:48] 
+	dst[119:112] := src1[63:56] 
+	dst[127:120] := src2[63:56] 
+	RETURN dst[127:0]
+}	
+
+tmp_dst[127:0] := INTERLEAVE_BYTES(a[127:0], b[127:0])
+
+FOR j := 0 to 15
+	i := j*8
+	IF k[j]
+		dst[i+7:i] := tmp_dst[i+7:i]
+	ELSE
+		dst[i+7:i] := src[i+7:i]
+	FI
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vpunpcklbw"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm_maskz_unpacklo_epi8">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512BW</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="a" type="__m128i"/>
+	<parameter varname="b" type="__m128i"/>
+	<description>Unpack and interleave 8-bit integers from the low half of "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). </description>
+	<operation>
+INTERLEAVE_BYTES(src1[127:0], src2[127:0]){
+	dst[7:0] := src1[7:0] 
+	dst[15:8] := src2[7:0] 
+	dst[23:16] := src1[15:8] 
+	dst[31:24] := src2[15:8] 
+	dst[39:32] := src1[23:16] 
+	dst[47:40] := src2[23:16] 
+	dst[55:48] := src1[31:24] 
+	dst[63:56] := src2[31:24] 
+	dst[71:64] := src1[39:32]
+	dst[79:72] := src2[39:32] 
+	dst[87:80] := src1[47:40] 
+	dst[95:88] := src2[47:40] 
+	dst[103:96] := src1[55:48] 
+	dst[111:104] := src2[55:48] 
+	dst[119:112] := src1[63:56] 
+	dst[127:120] := src2[63:56] 
+	RETURN dst[127:0]
+}	
+
+tmp_dst[127:0] := INTERLEAVE_BYTES(a[127:0], b[127:0])
+
+FOR j := 0 to 15
+	i := j*8
+	IF k[j]
+		dst[i+7:i] := tmp_dst[i+7:i]
+	ELSE
+		dst[i+7:i] := 0
+	FI
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vpunpcklbw"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256i" name="_mm256_mask_unpacklo_epi32">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="src" type="__m256i"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m256i"/>
+	<parameter varname="b" type="__m256i"/>
+	<description>Unpack and interleave 32-bit integers from the low half of each 128-bit lane in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). </description>
+	<operation>
+INTERLEAVE_DWORDS(src1[127:0], src2[127:0]){
+	dst[31:0] := src1[31:0] 
+	dst[63:32] := src2[31:0] 
+	dst[95:64] := src1[63:32] 
+	dst[127:96] := src2[63:32] 
+	RETURN dst[127:0]
+}	
+
+tmp_dst[127:0] := INTERLEAVE_DWORDS(a[127:0], b[127:0])
+tmp_dst[255:128] := INTERLEAVE_DWORDS(a[255:128], b[255:128])
+
+FOR j := 0 to 7
+	i := j*32
+	IF k[j]
+		dst[i+31:i] := tmp_dst[i+31:i]
+	ELSE
+		dst[i+31:i] := src[i+31:i]
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vpunpckldq"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256i" name="_mm256_maskz_unpacklo_epi32">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m256i"/>
+	<parameter varname="b" type="__m256i"/>
+	<description>Unpack and interleave 32-bit integers from the low half of each 128-bit lane in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). </description>
+	<operation>
+INTERLEAVE_DWORDS(src1[127:0], src2[127:0]){
+	dst[31:0] := src1[31:0] 
+	dst[63:32] := src2[31:0] 
+	dst[95:64] := src1[63:32] 
+	dst[127:96] := src2[63:32] 
+	RETURN dst[127:0]
+}	
+
+tmp_dst[127:0] := INTERLEAVE_DWORDS(a[127:0], b[127:0])
+tmp_dst[255:128] := INTERLEAVE_DWORDS(a[255:128], b[255:128])
+
+FOR j := 0 to 7
+	i := j*32
+	IF k[j]
+		dst[i+31:i] := tmp_dst[i+31:i]
+	ELSE
+		dst[i+31:i] := 0
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vpunpckldq"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm_mask_unpacklo_epi32">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="src" type="__m128i"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128i"/>
+	<parameter varname="b" type="__m128i"/>
+	<description>Unpack and interleave 32-bit integers from the low half of "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). </description>
+	<operation>
+INTERLEAVE_DWORDS(src1[127:0], src2[127:0]){
+	dst[31:0] := src1[31:0] 
+	dst[63:32] := src2[31:0] 
+	dst[95:64] := src1[63:32] 
+	dst[127:96] := src2[63:32] 
+	RETURN dst[127:0]
+}	
+
+tmp_dst[127:0] := INTERLEAVE_DWORDS(a[127:0], b[127:0])
+
+FOR j := 0 to 3
+	i := j*32
+	IF k[j]
+		dst[i+31:i] := tmp_dst[i+31:i]
+	ELSE
+		dst[i+31:i] := src[i+31:i]
+	FI
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vpunpckldq"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm_maskz_unpacklo_epi32">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128i"/>
+	<parameter varname="b" type="__m128i"/>
+	<description>Unpack and interleave 32-bit integers from the low half of "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). </description>
+	<operation>
+INTERLEAVE_DWORDS(src1[127:0], src2[127:0]){
+	dst[31:0] := src1[31:0] 
+	dst[63:32] := src2[31:0] 
+	dst[95:64] := src1[63:32] 
+	dst[127:96] := src2[63:32] 
+	RETURN dst[127:0]
+}	
+
+tmp_dst[127:0] := INTERLEAVE_DWORDS(a[127:0], b[127:0])
+
+FOR j := 0 to 3
+	i := j*32
+	IF k[j]
+		dst[i+31:i] := tmp_dst[i+31:i]
+	ELSE
+		dst[i+31:i] := 0
+	FI
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vpunpckldq"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256i" name="_mm256_mask_unpacklo_epi64">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="src" type="__m256i"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m256i"/>
+	<parameter varname="b" type="__m256i"/>
+	<description>Unpack and interleave 64-bit integers from the low half of each 128-bit lane in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). </description>
+	<operation>
+INTERLEAVE_QWORDS(src1[127:0], src2[127:0]){
+	dst[63:0] := src1[63:0] 
+	dst[127:64] := src2[63:0] 
+	RETURN dst[127:0]
+}
+
+tmp_dst[127:0] := INTERLEAVE_QWORDS(a[127:0], b[127:0])
+tmp_dst[255:128] := INTERLEAVE_QWORDS(a[255:128], b[255:128])
+
+FOR j := 0 to 3
+	i := j*64
+	IF k[j]
+		dst[i+63:i] := tmp_dst[i+63:i]
+	ELSE
+		dst[i+63:i] := src[i+63:i]
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vpunpcklqdq"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256i" name="_mm256_maskz_unpacklo_epi64">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m256i"/>
+	<parameter varname="b" type="__m256i"/>
+	<description>Unpack and interleave 64-bit integers from the low half of each 128-bit lane in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). </description>
+	<operation>
+INTERLEAVE_QWORDS(src1[127:0], src2[127:0]){
+	dst[63:0] := src1[63:0] 
+	dst[127:64] := src2[63:0] 
+	RETURN dst[127:0]
+}
+
+tmp_dst[127:0] := INTERLEAVE_QWORDS(a[127:0], b[127:0])
+tmp_dst[255:128] := INTERLEAVE_QWORDS(a[255:128], b[255:128])
+
+FOR j := 0 to 3
+	i := j*64
+	IF k[j]
+		dst[i+63:i] := tmp_dst[i+63:i]
+	ELSE
+		dst[i+63:i] := 0
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vpunpcklqdq"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm_mask_unpacklo_epi64">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="src" type="__m128i"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128i"/>
+	<parameter varname="b" type="__m128i"/>
+	<description>Unpack and interleave 64-bit integers from the low half of "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). </description>
+	<operation>
+INTERLEAVE_QWORDS(src1[127:0], src2[127:0]){
+	dst[63:0] := src1[63:0] 
+	dst[127:64] := src2[63:0] 
+	RETURN dst[127:0]
+}
+
+tmp_dst[127:0] := INTERLEAVE_QWORDS(a[127:0], b[127:0])
+
+FOR j := 0 to 1
+	i := j*64
+	IF k[j]
+		dst[i+63:i] := tmp_dst[i+63:i]
+	ELSE
+		dst[i+63:i] := src[i+63:i]
+	FI
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vpunpcklqdq"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm_maskz_unpacklo_epi64">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128i"/>
+	<parameter varname="b" type="__m128i"/>
+	<description>Unpack and interleave 64-bit integers from the low half of "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). </description>
+	<operation>
+INTERLEAVE_QWORDS(src1[127:0], src2[127:0]){
+	dst[63:0] := src1[63:0] 
+	dst[127:64] := src2[63:0] 
+	RETURN dst[127:0]
+}
+
+tmp_dst[127:0] := INTERLEAVE_QWORDS(a[127:0], b[127:0])
+
+FOR j := 0 to 1
+	i := j*64
+	IF k[j]
+		dst[i+63:i] := tmp_dst[i+63:i]
+	ELSE
+		dst[i+63:i] := 0
+	FI
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vpunpcklqdq"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256i" name="_mm256_mask_unpacklo_epi16">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512BW</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="src" type="__m256i"/>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="a" type="__m256i"/>
+	<parameter varname="b" type="__m256i"/>
+	<description>Unpack and interleave 16-bit integers from the low half of each 128-bit lane in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). </description>
+	<operation>
+INTERLEAVE_WORDS(src1[127:0], src2[127:0]){
+	dst[15:0] := src1[15:0] 
+	dst[31:16] := src2[15:0] 
+	dst[47:32] := src1[31:16] 
+	dst[63:48] := src2[31:16] 
+	dst[79:64] := src1[47:32] 
+	dst[95:80] := src2[47:32] 
+	dst[111:96] := src1[63:48] 
+	dst[127:112] := src2[63:48] 
+	RETURN dst[127:0]
+}	
+
+tmp_dst[127:0] := INTERLEAVE_WORDS(a[127:0], b[127:0])
+tmp_dst[255:128] := INTERLEAVE_WORDS(a[255:128], b[255:128])
+
+FOR j := 0 to 15
+	i := j*16
+	IF k[j]
+		dst[i+15:i] := tmp_dst[i+15:i]
+	ELSE
+		dst[i+15:i] := src[i+15:i]
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vpunpcklwd"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256i" name="_mm256_maskz_unpacklo_epi16">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512BW</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="a" type="__m256i"/>
+	<parameter varname="b" type="__m256i"/>
+	<description>Unpack and interleave 16-bit integers from the low half of each 128-bit lane in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). </description>
+	<operation>
+INTERLEAVE_WORDS(src1[127:0], src2[127:0]){
+	dst[15:0] := src1[15:0] 
+	dst[31:16] := src2[15:0] 
+	dst[47:32] := src1[31:16] 
+	dst[63:48] := src2[31:16] 
+	dst[79:64] := src1[47:32] 
+	dst[95:80] := src2[47:32] 
+	dst[111:96] := src1[63:48] 
+	dst[127:112] := src2[63:48] 
+	RETURN dst[127:0]
+}	
+
+tmp_dst[127:0] := INTERLEAVE_WORDS(a[127:0], b[127:0])
+tmp_dst[255:128] := INTERLEAVE_WORDS(a[255:128], b[255:128])
+
+FOR j := 0 to 15
+	i := j*16
+	IF k[j]
+		dst[i+15:i] := tmp_dst[i+15:i]
+	ELSE
+		dst[i+15:i] := 0
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vpunpcklwd"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512i" name="_mm512_mask_unpacklo_epi16">
+	<type>Integer</type>
+	<CPUID>AVX512BW</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="src" type="__m512i"/>
+	<parameter varname="k" type="__mmask32"/>
+	<parameter varname="a" type="__m512i"/>
+	<parameter varname="b" type="__m512i"/>
+	<description>Unpack and interleave 16-bit integers from the low half of each 128-bit lane in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). </description>
+	<operation>
+INTERLEAVE_WORDS(src1[127:0], src2[127:0]){
+	dst[15:0] := src1[15:0] 
+	dst[31:16] := src2[15:0] 
+	dst[47:32] := src1[31:16] 
+	dst[63:48] := src2[31:16] 
+	dst[79:64] := src1[47:32] 
+	dst[95:80] := src2[47:32] 
+	dst[111:96] := src1[63:48] 
+	dst[127:112] := src2[63:48] 
+	RETURN dst[127:0]
+}	
+
+tmp_dst[127:0] := INTERLEAVE_WORDS(a[127:0], b[127:0])
+tmp_dst[255:128] := INTERLEAVE_WORDS(a[255:128], b[255:128])
+tmp_dst[383:256] := INTERLEAVE_WORDS(a[383:256], b[383:256])
+tmp_dst[511:384] := INTERLEAVE_WORDS(a[511:384], b[511:384])
+
+FOR j := 0 to 31
+	i := j*16
+	IF k[j]
+		dst[i+15:i] := tmp_dst[i+15:i]
+	ELSE
+		dst[i+15:i] := src[i+15:i]
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name="vpunpcklwd"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512i" name="_mm512_maskz_unpacklo_epi16">
+	<type>Integer</type>
+	<CPUID>AVX512BW</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="k" type="__mmask32"/>
+	<parameter varname="a" type="__m512i"/>
+	<parameter varname="b" type="__m512i"/>
+	<description>Unpack and interleave 16-bit integers from the low half of each 128-bit lane in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). </description>
+	<operation>
+INTERLEAVE_WORDS(src1[127:0], src2[127:0]){
+	dst[15:0] := src1[15:0] 
+	dst[31:16] := src2[15:0] 
+	dst[47:32] := src1[31:16] 
+	dst[63:48] := src2[31:16] 
+	dst[79:64] := src1[47:32] 
+	dst[95:80] := src2[47:32] 
+	dst[111:96] := src1[63:48] 
+	dst[127:112] := src2[63:48] 
+	RETURN dst[127:0]
+}	
+
+tmp_dst[127:0] := INTERLEAVE_WORDS(a[127:0], b[127:0])
+tmp_dst[255:128] := INTERLEAVE_WORDS(a[255:128], b[255:128])
+tmp_dst[383:256] := INTERLEAVE_WORDS(a[383:256], b[383:256])
+tmp_dst[511:384] := INTERLEAVE_WORDS(a[511:384], b[511:384])
+
+FOR j := 0 to 31
+	i := j*16
+	IF k[j]
+		dst[i+15:i] := tmp_dst[i+15:i]
+	ELSE
+		dst[i+15:i] := 0
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name="vpunpcklwd"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512i" name="_mm512_unpacklo_epi16">
+	<type>Integer</type>
+	<CPUID>AVX512BW</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="a" type="__m512i"/>
+	<parameter varname="b" type="__m512i"/>
+	<description>Unpack and interleave 16-bit integers from the low half of each 128-bit lane in "a" and "b", and store the results in "dst". </description>
+	<operation>
+INTERLEAVE_WORDS(src1[127:0], src2[127:0]){
+	dst[15:0] := src1[15:0] 
+	dst[31:16] := src2[15:0] 
+	dst[47:32] := src1[31:16] 
+	dst[63:48] := src2[31:16] 
+	dst[79:64] := src1[47:32] 
+	dst[95:80] := src2[47:32] 
+	dst[111:96] := src1[63:48] 
+	dst[127:112] := src2[63:48] 
+	RETURN dst[127:0]
+}	
+
+dst[127:0] := INTERLEAVE_WORDS(a[127:0], b[127:0])
+dst[255:128] := INTERLEAVE_WORDS(a[255:128], b[255:128])
+dst[383:256] := INTERLEAVE_WORDS(a[383:256], b[383:256])
+dst[511:384] := INTERLEAVE_WORDS(a[511:384], b[511:384])
+dst[MAX:512] := 0
+	</operation>
+	<instruction name="vpunpcklwd"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm_mask_unpacklo_epi16">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512BW</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="src" type="__m128i"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128i"/>
+	<parameter varname="b" type="__m128i"/>
+	<description>Unpack and interleave 16-bit integers from the low half of "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). </description>
+	<operation>
+INTERLEAVE_WORDS(src1[127:0], src2[127:0]){
+	dst[15:0] := src1[15:0] 
+	dst[31:16] := src2[15:0] 
+	dst[47:32] := src1[31:16] 
+	dst[63:48] := src2[31:16] 
+	dst[79:64] := src1[47:32] 
+	dst[95:80] := src2[47:32] 
+	dst[111:96] := src1[63:48] 
+	dst[127:112] := src2[63:48] 
+	RETURN dst[127:0]
+}	
+
+tmp_dst[127:0] := INTERLEAVE_WORDS(a[127:0], b[127:0])
+
+FOR j := 0 to 7
+	i := j*16
+	IF k[j]
+		dst[i+15:i] := tmp_dst[i+15:i]
+	ELSE
+		dst[i+15:i] := src[i+15:i]
+	FI
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vpunpcklwd"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm_maskz_unpacklo_epi16">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512BW</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128i"/>
+	<parameter varname="b" type="__m128i"/>
+	<description>Unpack and interleave 16-bit integers from the low half of "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). </description>
+	<operation>
+INTERLEAVE_WORDS(src1[127:0], src2[127:0]){
+	dst[15:0] := src1[15:0] 
+	dst[31:16] := src2[15:0] 
+	dst[47:32] := src1[31:16] 
+	dst[63:48] := src2[31:16] 
+	dst[79:64] := src1[47:32] 
+	dst[95:80] := src2[47:32] 
+	dst[111:96] := src1[63:48] 
+	dst[127:112] := src2[63:48] 
+	RETURN dst[127:0]
+}	
+
+tmp_dst[127:0] := INTERLEAVE_WORDS(a[127:0], b[127:0])
+
+FOR j := 0 to 7
+	i := j*16
+	IF k[j]
+		dst[i+15:i] := tmp_dst[i+15:i]
+	ELSE
+		dst[i+15:i] := 0
+	FI
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vpunpcklwd"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256i" name="_mm256_mask_xor_epi32">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Logical</category>
+	<parameter varname="src" type="__m256i"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m256i"/>
+	<parameter varname="b" type="__m256i"/>
+	<description>Compute the bitwise XOR of packed 32-bit integers in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 7
+	i := j*32
+	IF k[j]
+		dst[i+31:i] := a[i+31:i] XOR b[i+31:i]
+	ELSE
+		dst[i+31:i] := src[i+31:i]
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vpxord"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256i" name="_mm256_maskz_xor_epi32">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Logical</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m256i"/>
+	<parameter varname="b" type="__m256i"/>
+	<description>Compute the bitwise XOR of packed 32-bit integers in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).
+	</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*32
+	IF k[j]
+		dst[i+31:i] := a[i+31:i] XOR b[i+31:i]
+	ELSE
+		dst[i+31:i] := 0
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vpxord"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm_mask_xor_epi32">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Logical</category>
+	<parameter varname="src" type="__m128i"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128i"/>
+	<parameter varname="b" type="__m128i"/>
+	<description>Compute the bitwise XOR of packed 32-bit integers in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 3
+	i := j*32
+	IF k[j]
+		dst[i+31:i] := a[i+31:i] XOR b[i+31:i]
+	ELSE
+		dst[i+31:i] := src[i+31:i]
+	FI
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vpxord"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm_maskz_xor_epi32">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Logical</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128i"/>
+	<parameter varname="b" type="__m128i"/>
+	<description>Compute the bitwise XOR of packed 32-bit integers in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).
+	</description>
+	<operation>
+FOR j := 0 to 3
+	i := j*32
+	IF k[j]
+		dst[i+31:i] := a[i+31:i] XOR b[i+31:i]
+	ELSE
+		dst[i+31:i] := 0
+	FI
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vpxord"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256i" name="_mm256_mask_xor_epi64">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Logical</category>
+	<parameter varname="src" type="__m256i"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m256i"/>
+	<parameter varname="b" type="__m256i"/>
+	<description>Compute the bitwise XOR of packed 64-bit integers in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 3
+	i := j*64
+	IF k[j]
+		dst[i+63:i] := a[i+63:i] XOR b[i+63:i]
+	ELSE
+		dst[i+63:i] := src[i+63:i]
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vpxorq"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256i" name="_mm256_maskz_xor_epi64">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Logical</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m256i"/>
+	<parameter varname="b" type="__m256i"/>
+	<description>Compute the bitwise XOR of packed 64-bit integers in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).
+	</description>
+	<operation>
+FOR j := 0 to 3
+	i := j*64
+	IF k[j]
+		dst[i+63:i] := a[i+63:i] XOR b[i+63:i]
+	ELSE
+		dst[i+63:i] := 0
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vpxorq"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm_mask_xor_epi64">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Logical</category>
+	<parameter varname="src" type="__m128i"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128i"/>
+	<parameter varname="b" type="__m128i"/>
+	<description>Compute the bitwise XOR of packed 64-bit integers in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 1
+	i := j*64
+	IF k[j]
+		dst[i+63:i] := a[i+63:i] XOR b[i+63:i]
+	ELSE
+		dst[i+63:i] := src[i+63:i]
+	FI
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vpxorq"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm_maskz_xor_epi64">
+	<type>Integer</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Logical</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128i"/>
+	<parameter varname="b" type="__m128i"/>
+	<description>Compute the bitwise XOR of packed 64-bit integers in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).
+	</description>
+	<operation>
+FOR j := 0 to 1
+	i := j*64
+	IF k[j]
+		dst[i+63:i] := a[i+63:i] XOR b[i+63:i]
+	ELSE
+		dst[i+63:i] := 0
+	FI
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vpxorq"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256d" name="_mm256_mask_range_pd">
+	<type>Floating Point</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512DQ</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="src" type="__m256d"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m256d"/>
+	<parameter varname="b" type="__m256d"/>
+	<parameter varname="imm8" type="int"/>
+	<description>Calculate the max, min, absolute max, or absolute min (depending on control in "imm8") for packed double-precision (64-bit) floating-point elements in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).
+	imm8[1:0] specifies the operation control: 00 = min, 01 = max, 10 = absolute min, 11 = absolute max.
+	imm8[3:2] specifies the sign control: 00 = sign from a, 01 = sign from compare result, 10 = clear sign bit, 11 = set sign bit.
+	</description>
+	<operation>
+RANGE(src1[63:0], src2[63:0], opCtl[1:0], signSelCtl[1:0])
+{
+	CASE opCtl[1:0]
+	0: tmp[63:0] := (src1[63:0] &lt;= src2[63:0]) ? src1[63:0] : src2[63:0]
+	1: tmp[63:0] := (src1[63:0] &lt;= src2[63:0]) ? src2[63:0] : src1[63:0]
+	2: tmp[63:0] := (ABS(src1[63:0]) &lt;= ABS(src2[63:0])) ? src1[63:0] : src2[63:0]
+	3: tmp[63:0] := (ABS(src1[63:0]) &lt;= ABS(src2[63:0])) ? src2[63:0] : src1[63:0]
+	ESAC
+	
+	CASE signSelCtl[1:0]
+	0: dst[63:0] := (src1[63] &lt;&lt; 63) OR (tmp[62:0])
+	1: dst[63:0] := tmp[63:0]
+	2: dst[63:0] := (0 &lt;&lt; 63) OR (tmp[62:0])
+	3: dst[63:0] := (1 &lt;&lt; 63) OR (tmp[62:0])
+	ESAC
+	
+	RETURN dst
+}
+
+FOR j := 0 to 3
+	i := j*64
+	IF k[j]
+		dst[i+63:i] := RANGE(a[i+63:i], b[i+63:i], imm8[1:0], imm8[3:2])
+	ELSE
+		dst[i+63:i] := src[i+63:i]
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vrangepd"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256d" name="_mm256_maskz_range_pd">
+	<type>Floating Point</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512DQ</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m256d"/>
+	<parameter varname="b" type="__m256d"/>
+	<parameter varname="imm8" type="int"/>
+	<description>Calculate the max, min, absolute max, or absolute min (depending on control in "imm8") for packed double-precision (64-bit) floating-point elements in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).
+	imm8[1:0] specifies the operation control: 00 = min, 01 = max, 10 = absolute min, 11 = absolute max.
+	imm8[3:2] specifies the sign control: 00 = sign from a, 01 = sign from compare result, 10 = clear sign bit, 11 = set sign bit.
+	</description>
+	<operation>
+RANGE(src1[63:0], src2[63:0], opCtl[1:0], signSelCtl[1:0])
+{
+	CASE opCtl[1:0]
+	0: tmp[63:0] := (src1[63:0] &lt;= src2[63:0]) ? src1[63:0] : src2[63:0]
+	1: tmp[63:0] := (src1[63:0] &lt;= src2[63:0]) ? src2[63:0] : src1[63:0]
+	2: tmp[63:0] := (ABS(src1[63:0]) &lt;= ABS(src2[63:0])) ? src1[63:0] : src2[63:0]
+	3: tmp[63:0] := (ABS(src1[63:0]) &lt;= ABS(src2[63:0])) ? src2[63:0] : src1[63:0]
+	ESAC
+	
+	CASE signSelCtl[1:0]
+	0: dst[63:0] := (src1[63] &lt;&lt; 63) OR (tmp[62:0])
+	1: dst[63:0] := tmp[63:0]
+	2: dst[63:0] := (0 &lt;&lt; 63) OR (tmp[62:0])
+	3: dst[63:0] := (1 &lt;&lt; 63) OR (tmp[62:0])
+	ESAC
+	
+	RETURN dst
+}
+
+FOR j := 0 to 3
+	i := j*64
+	IF k[j]
+		dst[i+63:i] := RANGE(a[i+63:i], b[i+63:i], imm8[1:0], imm8[3:2])
+	ELSE
+		dst[i+63:i] := 0
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vrangepd"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256d" name="_mm256_range_pd">
+	<type>Floating Point</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512DQ</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="a" type="__m256d"/>
+	<parameter varname="b" type="__m256d"/>
+	<parameter varname="imm8" type="int"/>
+	<description>Calculate the max, min, absolute max, or absolute min (depending on control in "imm8") for packed double-precision (64-bit) floating-point elements in "a" and "b", and store the results in "dst".
+	imm8[1:0] specifies the operation control: 00 = min, 01 = max, 10 = absolute min, 11 = absolute max.
+	imm8[3:2] specifies the sign control: 00 = sign from a, 01 = sign from compare result, 10 = clear sign bit, 11 = set sign bit.
+	</description>
+	<operation>
+RANGE(src1[63:0], src2[63:0], opCtl[1:0], signSelCtl[1:0])
+{
+	CASE opCtl[1:0]
+	0: tmp[63:0] := (src1[63:0] &lt;= src2[63:0]) ? src1[63:0] : src2[63:0]
+	1: tmp[63:0] := (src1[63:0] &lt;= src2[63:0]) ? src2[63:0] : src1[63:0]
+	2: tmp[63:0] := (ABS(src1[63:0]) &lt;= ABS(src2[63:0])) ? src1[63:0] : src2[63:0]
+	3: tmp[63:0] := (ABS(src1[63:0]) &lt;= ABS(src2[63:0])) ? src2[63:0] : src1[63:0]
+	ESAC
+	
+	CASE signSelCtl[1:0]
+	0: dst[63:0] := (src1[63] &lt;&lt; 63) OR (tmp[62:0])
+	1: dst[63:0] := tmp[63:0]
+	2: dst[63:0] := (0 &lt;&lt; 63) OR (tmp[62:0])
+	3: dst[63:0] := (1 &lt;&lt; 63) OR (tmp[62:0])
+	ESAC
+	
+	RETURN dst
+}
+
+FOR j := 0 to 3
+	i := j*64
+	dst[i+63:i] := RANGE(a[i+63:i], b[i+63:i], imm8[1:0], imm8[3:2])
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vrangepd"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512d" name="_mm512_mask_range_pd">
+	<type>Floating Point</type>
+	<CPUID>AVX512DQ</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="src" type="__m512d"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m512d"/>
+	<parameter varname="b" type="__m512d"/>
+	<parameter varname="imm8" type="int"/>
+	<description>Calculate the max, min, absolute max, or absolute min (depending on control in "imm8") for packed double-precision (64-bit) floating-point elements in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).
+	imm8[1:0] specifies the operation control: 00 = min, 01 = max, 10 = absolute min, 11 = absolute max.
+	imm8[3:2] specifies the sign control: 00 = sign from a, 01 = sign from compare result, 10 = clear sign bit, 11 = set sign bit.
+	</description>
+	<operation>
+RANGE(src1[63:0], src2[63:0], opCtl[1:0], signSelCtl[1:0])
+{
+	CASE opCtl[1:0]
+	0: tmp[63:0] := (src1[63:0] &lt;= src2[63:0]) ? src1[63:0] : src2[63:0]
+	1: tmp[63:0] := (src1[63:0] &lt;= src2[63:0]) ? src2[63:0] : src1[63:0]
+	2: tmp[63:0] := (ABS(src1[63:0]) &lt;= ABS(src2[63:0])) ? src1[63:0] : src2[63:0]
+	3: tmp[63:0] := (ABS(src1[63:0]) &lt;= ABS(src2[63:0])) ? src2[63:0] : src1[63:0]
+	ESAC
+	
+	CASE signSelCtl[1:0]
+	0: dst[63:0] := (src1[63] &lt;&lt; 63) OR (tmp[62:0])
+	1: dst[63:0] := tmp[63:0]
+	2: dst[63:0] := (0 &lt;&lt; 63) OR (tmp[62:0])
+	3: dst[63:0] := (1 &lt;&lt; 63) OR (tmp[62:0])
+	ESAC
+	
+	RETURN dst
+}
+
+FOR j := 0 to 7
+	i := j*64
+	IF k[j]
+		dst[i+63:i] := RANGE(a[i+63:i], b[i+63:i], imm8[1:0], imm8[3:2])
+	ELSE
+		dst[i+63:i] := src[i+63:i]
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name="vrangepd"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512d" name="_mm512_mask_range_round_pd">
+	<type>Floating Point</type>
+	<CPUID>AVX512DQ</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="src" type="__m512d"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m512d"/>
+	<parameter varname="b" type="__m512d"/>
+	<parameter varname="imm8" type="int"/>
+	<parameter varname="rounding" type="int"/>
+	<description>Calculate the max, min, absolute max, or absolute min (depending on control in "imm8") for packed double-precision (64-bit) floating-point elements in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).
+	imm8[1:0] specifies the operation control: 00 = min, 01 = max, 10 = absolute min, 11 = absolute max.
+	imm8[3:2] specifies the sign control: 00 = sign from a, 01 = sign from compare result, 10 = clear sign bit, 11 = set sign bit.
+	[round_note]
+	</description>
+	<operation>
+RANGE(src1[63:0], src2[63:0], opCtl[1:0], signSelCtl[1:0])
+{
+	CASE opCtl[1:0]
+	0: tmp[63:0] := (src1[63:0] &lt;= src2[63:0]) ? src1[63:0] : src2[63:0]
+	1: tmp[63:0] := (src1[63:0] &lt;= src2[63:0]) ? src2[63:0] : src1[63:0]
+	2: tmp[63:0] := (ABS(src1[63:0]) &lt;= ABS(src2[63:0])) ? src1[63:0] : src2[63:0]
+	3: tmp[63:0] := (ABS(src1[63:0]) &lt;= ABS(src2[63:0])) ? src2[63:0] : src1[63:0]
+	ESAC
+	
+	CASE signSelCtl[1:0]
+	0: dst[63:0] := (src1[63] &lt;&lt; 63) OR (tmp[62:0])
+	1: dst[63:0] := tmp[63:0]
+	2: dst[63:0] := (0 &lt;&lt; 63) OR (tmp[62:0])
+	3: dst[63:0] := (1 &lt;&lt; 63) OR (tmp[62:0])
+	ESAC
+	
+	RETURN dst
+}
+
+FOR j := 0 to 7
+	i := j*64
+	IF k[j]
+		dst[i+63:i] := RANGE(a[i+63:i], b[i+63:i], imm8[1:0], imm8[3:2])
+	ELSE
+		dst[i+63:i] := src[i+63:i]
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name="vrangepd"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512d" name="_mm512_maskz_range_pd">
+	<type>Floating Point</type>
+	<CPUID>AVX512DQ</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m512d"/>
+	<parameter varname="b" type="__m512d"/>
+	<parameter varname="imm8" type="int"/>
+	<description>Calculate the max, min, absolute max, or absolute min (depending on control in "imm8") for packed double-precision (64-bit) floating-point elements in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).
+	imm8[1:0] specifies the operation control: 00 = min, 01 = max, 10 = absolute min, 11 = absolute max.
+	imm8[3:2] specifies the sign control: 00 = sign from a, 01 = sign from compare result, 10 = clear sign bit, 11 = set sign bit.
+	</description>
+	<operation>
+RANGE(src1[63:0], src2[63:0], opCtl[1:0], signSelCtl[1:0])
+{
+	CASE opCtl[1:0]
+	0: tmp[63:0] := (src1[63:0] &lt;= src2[63:0]) ? src1[63:0] : src2[63:0]
+	1: tmp[63:0] := (src1[63:0] &lt;= src2[63:0]) ? src2[63:0] : src1[63:0]
+	2: tmp[63:0] := (ABS(src1[63:0]) &lt;= ABS(src2[63:0])) ? src1[63:0] : src2[63:0]
+	3: tmp[63:0] := (ABS(src1[63:0]) &lt;= ABS(src2[63:0])) ? src2[63:0] : src1[63:0]
+	ESAC
+	
+	CASE signSelCtl[1:0]
+	0: dst[63:0] := (src1[63] &lt;&lt; 63) OR (tmp[62:0])
+	1: dst[63:0] := tmp[63:0]
+	2: dst[63:0] := (0 &lt;&lt; 63) OR (tmp[62:0])
+	3: dst[63:0] := (1 &lt;&lt; 63) OR (tmp[62:0])
+	ESAC
+	
+	RETURN dst
+}
+
+FOR j := 0 to 7
+	i := j*64
+	IF k[j]
+		dst[i+63:i] := RANGE(a[i+63:i], b[i+63:i], imm8[1:0], imm8[3:2])
+	ELSE
+		dst[i+63:i] := 0
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name="vrangepd"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512d" name="_mm512_maskz_range_round_pd">
+	<type>Floating Point</type>
+	<CPUID>AVX512DQ</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m512d"/>
+	<parameter varname="b" type="__m512d"/>
+	<parameter varname="imm8" type="int"/>
+	<parameter varname="rounding" type="int"/>
+	<description>Calculate the max, min, absolute max, or absolute min (depending on control in "imm8") for packed double-precision (64-bit) floating-point elements in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).
+	imm8[1:0] specifies the operation control: 00 = min, 01 = max, 10 = absolute min, 11 = absolute max.
+	imm8[3:2] specifies the sign control: 00 = sign from a, 01 = sign from compare result, 10 = clear sign bit, 11 = set sign bit.
+	[round_note]
+	</description>
+	<operation>
+RANGE(src1[63:0], src2[63:0], opCtl[1:0], signSelCtl[1:0])
+{
+	CASE opCtl[1:0]
+	0: tmp[63:0] := (src1[63:0] &lt;= src2[63:0]) ? src1[63:0] : src2[63:0]
+	1: tmp[63:0] := (src1[63:0] &lt;= src2[63:0]) ? src2[63:0] : src1[63:0]
+	2: tmp[63:0] := (ABS(src1[63:0]) &lt;= ABS(src2[63:0])) ? src1[63:0] : src2[63:0]
+	3: tmp[63:0] := (ABS(src1[63:0]) &lt;= ABS(src2[63:0])) ? src2[63:0] : src1[63:0]
+	ESAC
+	
+	CASE signSelCtl[1:0]
+	0: dst[63:0] := (src1[63] &lt;&lt; 63) OR (tmp[62:0])
+	1: dst[63:0] := tmp[63:0]
+	2: dst[63:0] := (0 &lt;&lt; 63) OR (tmp[62:0])
+	3: dst[63:0] := (1 &lt;&lt; 63) OR (tmp[62:0])
+	ESAC
+	
+	RETURN dst
+}
+
+FOR j := 0 to 7
+	i := j*64
+	IF k[j]
+		dst[i+63:i] := RANGE(a[i+63:i], b[i+63:i], imm8[1:0], imm8[3:2])
+	ELSE
+		dst[i+63:i] := 0
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name="vrangepd"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512d" name="_mm512_range_pd">
+	<type>Floating Point</type>
+	<CPUID>AVX512DQ</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="a" type="__m512d"/>
+	<parameter varname="b" type="__m512d"/>
+	<parameter varname="imm8" type="int"/>
+	<description>Calculate the max, min, absolute max, or absolute min (depending on control in "imm8") for packed double-precision (64-bit) floating-point elements in "a" and "b", and store the results in "dst".
+	imm8[1:0] specifies the operation control: 00 = min, 01 = max, 10 = absolute min, 11 = absolute max.
+	imm8[3:2] specifies the sign control: 00 = sign from a, 01 = sign from compare result, 10 = clear sign bit, 11 = set sign bit.
+	</description>
+	<operation>
+RANGE(src1[63:0], src2[63:0], opCtl[1:0], signSelCtl[1:0])
+{
+	CASE opCtl[1:0]
+	0: tmp[63:0] := (src1[63:0] &lt;= src2[63:0]) ? src1[63:0] : src2[63:0]
+	1: tmp[63:0] := (src1[63:0] &lt;= src2[63:0]) ? src2[63:0] : src1[63:0]
+	2: tmp[63:0] := (ABS(src1[63:0]) &lt;= ABS(src2[63:0])) ? src1[63:0] : src2[63:0]
+	3: tmp[63:0] := (ABS(src1[63:0]) &lt;= ABS(src2[63:0])) ? src2[63:0] : src1[63:0]
+	ESAC
+	
+	CASE signSelCtl[1:0]
+	0: dst[63:0] := (src1[63] &lt;&lt; 63) OR (tmp[62:0])
+	1: dst[63:0] := tmp[63:0]
+	2: dst[63:0] := (0 &lt;&lt; 63) OR (tmp[62:0])
+	3: dst[63:0] := (1 &lt;&lt; 63) OR (tmp[62:0])
+	ESAC
+	
+	RETURN dst
+}
+
+FOR j := 0 to 7
+	i := j*64
+	dst[i+63:i] := RANGE(a[i+63:i], b[i+63:i], imm8[1:0], imm8[3:2])
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name="vrangepd"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512d" name="_mm512_range_round_pd">
+	<type>Floating Point</type>
+	<CPUID>AVX512DQ</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="a" type="__m512d"/>
+	<parameter varname="b" type="__m512d"/>
+	<parameter varname="imm8" type="int"/>
+	<parameter varname="rounding" type="int"/>
+	<description>Calculate the max, min, absolute max, or absolute min (depending on control in "imm8") for packed double-precision (64-bit) floating-point elements in "a" and "b", and store the results in "dst".
+	imm8[1:0] specifies the operation control: 00 = min, 01 = max, 10 = absolute min, 11 = absolute max.
+	imm8[3:2] specifies the sign control: 00 = sign from a, 01 = sign from compare result, 10 = clear sign bit, 11 = set sign bit.
+	[round_note]
+	</description>
+	<operation>
+RANGE(src1[63:0], src2[63:0], opCtl[1:0], signSelCtl[1:0])
+{
+	CASE opCtl[1:0]
+	0: tmp[63:0] := (src1[63:0] &lt;= src2[63:0]) ? src1[63:0] : src2[63:0]
+	1: tmp[63:0] := (src1[63:0] &lt;= src2[63:0]) ? src2[63:0] : src1[63:0]
+	2: tmp[63:0] := (ABS(src1[63:0]) &lt;= ABS(src2[63:0])) ? src1[63:0] : src2[63:0]
+	3: tmp[63:0] := (ABS(src1[63:0]) &lt;= ABS(src2[63:0])) ? src2[63:0] : src1[63:0]
+	ESAC
+	
+	CASE signSelCtl[1:0]
+	0: dst[63:0] := (src1[63] &lt;&lt; 63) OR (tmp[62:0])
+	1: dst[63:0] := tmp[63:0]
+	2: dst[63:0] := (0 &lt;&lt; 63) OR (tmp[62:0])
+	3: dst[63:0] := (1 &lt;&lt; 63) OR (tmp[62:0])
+	ESAC
+	
+	RETURN dst
+}
+
+FOR j := 0 to 7
+	i := j*64
+	dst[i+63:i] := RANGE(a[i+63:i], b[i+63:i], imm8[1:0], imm8[3:2])
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name="vrangepd"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128d" name="_mm_mask_range_pd">
+	<type>Floating Point</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512DQ</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="src" type="__m128d"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128d"/>
+	<parameter varname="b" type="__m128d"/>
+	<parameter varname="imm8" type="int"/>
+	<description>Calculate the max, min, absolute max, or absolute min (depending on control in "imm8") for packed double-precision (64-bit) floating-point elements in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).
+	imm8[1:0] specifies the operation control: 00 = min, 01 = max, 10 = absolute min, 11 = absolute max.
+	imm8[3:2] specifies the sign control: 00 = sign from a, 01 = sign from compare result, 10 = clear sign bit, 11 = set sign bit.
+	</description>
+	<operation>
+RANGE(src1[63:0], src2[63:0], opCtl[1:0], signSelCtl[1:0])
+{
+	CASE opCtl[1:0]
+	0: tmp[63:0] := (src1[63:0] &lt;= src2[63:0]) ? src1[63:0] : src2[63:0]
+	1: tmp[63:0] := (src1[63:0] &lt;= src2[63:0]) ? src2[63:0] : src1[63:0]
+	2: tmp[63:0] := (ABS(src1[63:0]) &lt;= ABS(src2[63:0])) ? src1[63:0] : src2[63:0]
+	3: tmp[63:0] := (ABS(src1[63:0]) &lt;= ABS(src2[63:0])) ? src2[63:0] : src1[63:0]
+	ESAC
+	
+	CASE signSelCtl[1:0]
+	0: dst[63:0] := (src1[63] &lt;&lt; 63) OR (tmp[62:0])
+	1: dst[63:0] := tmp[63:0]
+	2: dst[63:0] := (0 &lt;&lt; 63) OR (tmp[62:0])
+	3: dst[63:0] := (1 &lt;&lt; 63) OR (tmp[62:0])
+	ESAC
+	
+	RETURN dst
+}
+
+FOR j := 0 to 1
+	i := j*64
+	IF k[j]
+		dst[i+63:i] := RANGE(a[i+63:i], b[i+63:i], imm8[1:0], imm8[3:2])
+	ELSE
+		dst[i+63:i] := src[i+63:i]
+	FI
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vrangepd"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128d" name="_mm_maskz_range_pd">
+	<type>Floating Point</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512DQ</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128d"/>
+	<parameter varname="b" type="__m128d"/>
+	<parameter varname="imm8" type="int"/>
+	<description>Calculate the max, min, absolute max, or absolute min (depending on control in "imm8") for packed double-precision (64-bit) floating-point elements in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).
+	imm8[1:0] specifies the operation control: 00 = min, 01 = max, 10 = absolute min, 11 = absolute max.
+	imm8[3:2] specifies the sign control: 00 = sign from a, 01 = sign from compare result, 10 = clear sign bit, 11 = set sign bit.
+	</description>
+	<operation>
+RANGE(src1[63:0], src2[63:0], opCtl[1:0], signSelCtl[1:0])
+{
+	CASE opCtl[1:0]
+	0: tmp[63:0] := (src1[63:0] &lt;= src2[63:0]) ? src1[63:0] : src2[63:0]
+	1: tmp[63:0] := (src1[63:0] &lt;= src2[63:0]) ? src2[63:0] : src1[63:0]
+	2: tmp[63:0] := (ABS(src1[63:0]) &lt;= ABS(src2[63:0])) ? src1[63:0] : src2[63:0]
+	3: tmp[63:0] := (ABS(src1[63:0]) &lt;= ABS(src2[63:0])) ? src2[63:0] : src1[63:0]
+	ESAC
+	
+	CASE signSelCtl[1:0]
+	0: dst[63:0] := (src1[63] &lt;&lt; 63) OR (tmp[62:0])
+	1: dst[63:0] := tmp[63:0]
+	2: dst[63:0] := (0 &lt;&lt; 63) OR (tmp[62:0])
+	3: dst[63:0] := (1 &lt;&lt; 63) OR (tmp[62:0])
+	ESAC
+	
+	RETURN dst
+}
+
+FOR j := 0 to 1
+	i := j*64
+	IF k[j]
+		dst[i+63:i] := RANGE(a[i+63:i], b[i+63:i], imm8[1:0], imm8[3:2])
+	ELSE
+		dst[i+63:i] := 0
+	FI
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vrangepd"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128d" name="_mm_range_pd">
+	<type>Floating Point</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512DQ</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="a" type="__m128d"/>
+	<parameter varname="b" type="__m128d"/>
+	<parameter varname="imm8" type="int"/>
+	<description>Calculate the max, min, absolute max, or absolute min (depending on control in "imm8") for packed double-precision (64-bit) floating-point elements in "a" and "b", and store the results in "dst".
+	imm8[1:0] specifies the operation control: 00 = min, 01 = max, 10 = absolute min, 11 = absolute max.
+	imm8[3:2] specifies the sign control: 00 = sign from a, 01 = sign from compare result, 10 = clear sign bit, 11 = set sign bit.
+	</description>
+	<operation>
+RANGE(src1[63:0], src2[63:0], opCtl[1:0], signSelCtl[1:0])
+{
+	CASE opCtl[1:0]
+	0: tmp[63:0] := (src1[63:0] &lt;= src2[63:0]) ? src1[63:0] : src2[63:0]
+	1: tmp[63:0] := (src1[63:0] &lt;= src2[63:0]) ? src2[63:0] : src1[63:0]
+	2: tmp[63:0] := (ABS(src1[63:0]) &lt;= ABS(src2[63:0])) ? src1[63:0] : src2[63:0]
+	3: tmp[63:0] := (ABS(src1[63:0]) &lt;= ABS(src2[63:0])) ? src2[63:0] : src1[63:0]
+	ESAC
+	
+	CASE signSelCtl[1:0]
+	0: dst[63:0] := (src1[63] &lt;&lt; 63) OR (tmp[62:0])
+	1: dst[63:0] := tmp[63:0]
+	2: dst[63:0] := (0 &lt;&lt; 63) OR (tmp[62:0])
+	3: dst[63:0] := (1 &lt;&lt; 63) OR (tmp[62:0])
+	ESAC
+	
+	RETURN dst
+}
+
+FOR j := 0 to 1
+	i := j*64
+	dst[i+63:i] := RANGE(a[i+63:i], b[i+63:i], imm8[1:0], imm8[3:2])
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vrangepd"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256" name="_mm256_mask_range_ps">
+	<type>Floating Point</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512DQ</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="src" type="__m256"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m256"/>
+	<parameter varname="b" type="__m256"/>
+	<parameter varname="imm8" type="int"/>
+	<description>Calculate the max, min, absolute max, or absolute min (depending on control in "imm8") for packed single-precision (32-bit) floating-point elements in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).
+	imm8[1:0] specifies the operation control: 00 = min, 01 = max, 10 = absolute min, 11 = absolute max.
+	imm8[3:2] specifies the sign control: 00 = sign from a, 01 = sign from compare result, 10 = clear sign bit, 11 = set sign bit.
+	</description>
+	<operation>
+RANGE(src1[31:0], src2[31:0], opCtl[1:0], signSelCtl[1:0])
+{
+	CASE opCtl[1:0]
+	0: tmp[31:0] := (src1[31:0] &lt;= src2[31:0]) ? src1[31:0] : src2[31:0]
+	1: tmp[31:0] := (src1[31:0] &lt;= src2[31:0]) ? src2[31:0] : src1[31:0]
+	2: tmp[31:0] := (ABS(src1[31:0]) &lt;= ABS(src2[31:0])) ? src1[31:0] : src2[31:0]
+	3: tmp[31:0] := (ABS(src1[31:0]) &lt;= ABS(src2[31:0])) ? src2[31:0] : src1[31:0]
+	ESAC
+	
+	CASE signSelCtl[1:0]
+	0: dst[31:0] := (src1[31] &lt;&lt; 31) OR (tmp[30:0])
+	1: dst[31:0] := tmp[31:0]
+	2: dst[31:0] := (0 &lt;&lt; 31) OR (tmp[30:0])
+	3: dst[31:0] := (1 &lt;&lt; 31) OR (tmp[30:0])
+	ESAC
+	
+	RETURN dst
+}
+
+FOR j := 0 to 7
+	i := j*32
+	IF k[j]
+		dst[i+31:i] := RANGE(a[i+31:i], b[i+31:i], imm8[1:0], imm8[3:2])
+	ELSE
+		dst[i+31:i] := src[i+31:i]
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vrangeps"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256" name="_mm256_maskz_range_ps">
+	<type>Floating Point</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512DQ</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m256"/>
+	<parameter varname="b" type="__m256"/>
+	<parameter varname="imm8" type="int"/>
+	<description>Calculate the max, min, absolute max, or absolute min (depending on control in "imm8") for packed single-precision (32-bit) floating-point elements in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).
+	imm8[1:0] specifies the operation control: 00 = min, 01 = max, 10 = absolute min, 11 = absolute max.
+	imm8[3:2] specifies the sign control: 00 = sign from a, 01 = sign from compare result, 10 = clear sign bit, 11 = set sign bit.
+	</description>
+	<operation>
+RANGE(src1[31:0], src2[31:0], opCtl[1:0], signSelCtl[1:0])
+{
+	CASE opCtl[1:0]
+	0: tmp[31:0] := (src1[31:0] &lt;= src2[31:0]) ? src1[31:0] : src2[31:0]
+	1: tmp[31:0] := (src1[31:0] &lt;= src2[31:0]) ? src2[31:0] : src1[31:0]
+	2: tmp[31:0] := (ABS(src1[31:0]) &lt;= ABS(src2[31:0])) ? src1[31:0] : src2[31:0]
+	3: tmp[31:0] := (ABS(src1[31:0]) &lt;= ABS(src2[31:0])) ? src2[31:0] : src1[31:0]
+	ESAC
+	
+	CASE signSelCtl[1:0]
+	0: dst[31:0] := (src1[31] &lt;&lt; 31) OR (tmp[30:0])
+	1: dst[31:0] := tmp[31:0]
+	2: dst[31:0] := (0 &lt;&lt; 31) OR (tmp[30:0])
+	3: dst[31:0] := (1 &lt;&lt; 31) OR (tmp[30:0])
+	ESAC
+	
+	RETURN dst
+}
+
+FOR j := 0 to 7
+	i := j*32
+	IF k[j]
+		dst[i+31:i] := RANGE(a[i+31:i], b[i+31:i], imm8[1:0], imm8[3:2])
+	ELSE
+		dst[i+31:i] := 0
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vrangeps"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256" name="_mm256_range_ps">
+	<type>Floating Point</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512DQ</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="a" type="__m256"/>
+	<parameter varname="b" type="__m256"/>
+	<parameter varname="imm8" type="int"/>
+	<description>Calculate the max, min, absolute max, or absolute min (depending on control in "imm8") for packed single-precision (32-bit) floating-point elements in "a" and "b", and store the results in "dst".
+	imm8[1:0] specifies the operation control: 00 = min, 01 = max, 10 = absolute min, 11 = absolute max.
+	imm8[3:2] specifies the sign control: 00 = sign from a, 01 = sign from compare result, 10 = clear sign bit, 11 = set sign bit.
+	</description>
+	<operation>
+RANGE(src1[31:0], src2[31:0], opCtl[1:0], signSelCtl[1:0])
+{
+	CASE opCtl[1:0]
+	0: tmp[31:0] := (src1[31:0] &lt;= src2[31:0]) ? src1[31:0] : src2[31:0]
+	1: tmp[31:0] := (src1[31:0] &lt;= src2[31:0]) ? src2[31:0] : src1[31:0]
+	2: tmp[31:0] := (ABS(src1[31:0]) &lt;= ABS(src2[31:0])) ? src1[31:0] : src2[31:0]
+	3: tmp[31:0] := (ABS(src1[31:0]) &lt;= ABS(src2[31:0])) ? src2[31:0] : src1[31:0]
+	ESAC
+	
+	CASE signSelCtl[1:0]
+	0: dst[31:0] := (src1[31] &lt;&lt; 31) OR (tmp[30:0])
+	1: dst[31:0] := tmp[31:0]
+	2: dst[31:0] := (0 &lt;&lt; 31) OR (tmp[30:0])
+	3: dst[31:0] := (1 &lt;&lt; 31) OR (tmp[30:0])
+	ESAC
+	
+	RETURN dst
+}
+
+FOR j := 0 to 7
+	i := j*32
+	dst[i+31:i] := RANGE(a[i+31:i], b[i+31:i], imm8[1:0], imm8[3:2])
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vrangeps"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512" name="_mm512_mask_range_ps">
+	<type>Floating Point</type>
+	<CPUID>AVX512DQ</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="src" type="__m512"/>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="a" type="__m512"/>
+	<parameter varname="b" type="__m512"/>
+	<parameter varname="imm8" type="int"/>
+	<description>Calculate the max, min, absolute max, or absolute min (depending on control in "imm8") for packed single-precision (32-bit) floating-point elements in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).
+	imm8[1:0] specifies the operation control: 00 = min, 01 = max, 10 = absolute min, 11 = absolute max.
+	imm8[3:2] specifies the sign control: 00 = sign from a, 01 = sign from compare result, 10 = clear sign bit, 11 = set sign bit.
+	</description>
+	<operation>
+RANGE(src1[31:0], src2[31:0], opCtl[1:0], signSelCtl[1:0])
+{
+	CASE opCtl[1:0]
+	0: tmp[31:0] := (src1[31:0] &lt;= src2[31:0]) ? src1[31:0] : src2[31:0]
+	1: tmp[31:0] := (src1[31:0] &lt;= src2[31:0]) ? src2[31:0] : src1[31:0]
+	2: tmp[31:0] := (ABS(src1[31:0]) &lt;= ABS(src2[31:0])) ? src1[31:0] : src2[31:0]
+	3: tmp[31:0] := (ABS(src1[31:0]) &lt;= ABS(src2[31:0])) ? src2[31:0] : src1[31:0]
+	ESAC
+	
+	CASE signSelCtl[1:0]
+	0: dst[31:0] := (src1[31] &lt;&lt; 31) OR (tmp[30:0])
+	1: dst[31:0] := tmp[31:0]
+	2: dst[31:0] := (0 &lt;&lt; 31) OR (tmp[30:0])
+	3: dst[31:0] := (1 &lt;&lt; 31) OR (tmp[30:0])
+	ESAC
+	
+	RETURN dst
+}
+
+FOR j := 0 to 15
+	i := j*32
+	IF k[j]
+		dst[i+31:i] := RANGE(a[i+31:i], b[i+31:i], imm8[1:0], imm8[3:2])
+	ELSE
+		dst[i+31:i] := src[i+31:i]
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name="vrangeps"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512" name="_mm512_mask_range_round_ps">
+	<type>Floating Point</type>
+	<CPUID>AVX512DQ</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="src" type="__m512"/>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="a" type="__m512"/>
+	<parameter varname="b" type="__m512"/>
+	<parameter varname="imm8" type="int"/>
+	<parameter varname="rounding" type="int"/>
+	<description>Calculate the max, min, absolute max, or absolute min (depending on control in "imm8") for packed single-precision (32-bit) floating-point elements in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).
+	imm8[1:0] specifies the operation control: 00 = min, 01 = max, 10 = absolute min, 11 = absolute max.
+	imm8[3:2] specifies the sign control: 00 = sign from a, 01 = sign from compare result, 10 = clear sign bit, 11 = set sign bit.
+	[round_note]
+	</description>
+	<operation>
+RANGE(src1[31:0], src2[31:0], opCtl[1:0], signSelCtl[1:0])
+{
+	CASE opCtl[1:0]
+	0: tmp[31:0] := (src1[31:0] &lt;= src2[31:0]) ? src1[31:0] : src2[31:0]
+	1: tmp[31:0] := (src1[31:0] &lt;= src2[31:0]) ? src2[31:0] : src1[31:0]
+	2: tmp[31:0] := (ABS(src1[31:0]) &lt;= ABS(src2[31:0])) ? src1[31:0] : src2[31:0]
+	3: tmp[31:0] := (ABS(src1[31:0]) &lt;= ABS(src2[31:0])) ? src2[31:0] : src1[31:0]
+	ESAC
+	
+	CASE signSelCtl[1:0]
+	0: dst[31:0] := (src1[31] &lt;&lt; 31) OR (tmp[30:0])
+	1: dst[31:0] := tmp[31:0]
+	2: dst[31:0] := (0 &lt;&lt; 31) OR (tmp[30:0])
+	3: dst[31:0] := (1 &lt;&lt; 31) OR (tmp[30:0])
+	ESAC
+	
+	RETURN dst
+}
+
+FOR j := 0 to 15
+	i := j*32
+	IF k[j]
+		dst[i+31:i] := RANGE(a[i+31:i], b[i+31:i], imm8[1:0], imm8[3:2])
+	ELSE
+		dst[i+31:i] := src[i+31:i]
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name="vrangeps"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512" name="_mm512_maskz_range_ps">
+	<type>Floating Point</type>
+	<CPUID>AVX512DQ</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="a" type="__m512"/>
+	<parameter varname="b" type="__m512"/>
+	<parameter varname="imm8" type="int"/>
+	<description>Calculate the max, min, absolute max, or absolute min (depending on control in "imm8") for packed single-precision (32-bit) floating-point elements in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).
+	imm8[1:0] specifies the operation control: 00 = min, 01 = max, 10 = absolute min, 11 = absolute max.
+	imm8[3:2] specifies the sign control: 00 = sign from a, 01 = sign from compare result, 10 = clear sign bit, 11 = set sign bit.
+	</description>
+	<operation>
+RANGE(src1[31:0], src2[31:0], opCtl[1:0], signSelCtl[1:0])
+{
+	CASE opCtl[1:0]
+	0: tmp[31:0] := (src1[31:0] &lt;= src2[31:0]) ? src1[31:0] : src2[31:0]
+	1: tmp[31:0] := (src1[31:0] &lt;= src2[31:0]) ? src2[31:0] : src1[31:0]
+	2: tmp[31:0] := (ABS(src1[31:0]) &lt;= ABS(src2[31:0])) ? src1[31:0] : src2[31:0]
+	3: tmp[31:0] := (ABS(src1[31:0]) &lt;= ABS(src2[31:0])) ? src2[31:0] : src1[31:0]
+	ESAC
+	
+	CASE signSelCtl[1:0]
+	0: dst[31:0] := (src1[31] &lt;&lt; 31) OR (tmp[30:0])
+	1: dst[31:0] := tmp[31:0]
+	2: dst[31:0] := (0 &lt;&lt; 31) OR (tmp[30:0])
+	3: dst[31:0] := (1 &lt;&lt; 31) OR (tmp[30:0])
+	ESAC
+	
+	RETURN dst
+}
+
+FOR j := 0 to 15
+	i := j*32
+	IF k[j]
+		dst[i+31:i] := RANGE(a[i+31:i], b[i+31:i], imm8[1:0], imm8[3:2])
+	ELSE
+		dst[i+31:i] := 0
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name="vrangeps"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512" name="_mm512_maskz_range_round_ps">
+	<type>Floating Point</type>
+	<CPUID>AVX512DQ</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="a" type="__m512"/>
+	<parameter varname="b" type="__m512"/>
+	<parameter varname="imm8" type="int"/>
+	<parameter varname="rounding" type="int"/>
+	<description>Calculate the max, min, absolute max, or absolute min (depending on control in "imm8") for packed single-precision (32-bit) floating-point elements in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).
+	imm8[1:0] specifies the operation control: 00 = min, 01 = max, 10 = absolute min, 11 = absolute max.
+	imm8[3:2] specifies the sign control: 00 = sign from a, 01 = sign from compare result, 10 = clear sign bit, 11 = set sign bit.
+	[round_note]
+	</description>
+	<operation>
+RANGE(src1[31:0], src2[31:0], opCtl[1:0], signSelCtl[1:0])
+{
+	CASE opCtl[1:0]
+	0: tmp[31:0] := (src1[31:0] &lt;= src2[31:0]) ? src1[31:0] : src2[31:0]
+	1: tmp[31:0] := (src1[31:0] &lt;= src2[31:0]) ? src2[31:0] : src1[31:0]
+	2: tmp[31:0] := (ABS(src1[31:0]) &lt;= ABS(src2[31:0])) ? src1[31:0] : src2[31:0]
+	3: tmp[31:0] := (ABS(src1[31:0]) &lt;= ABS(src2[31:0])) ? src2[31:0] : src1[31:0]
+	ESAC
+	
+	CASE signSelCtl[1:0]
+	0: dst[31:0] := (src1[31] &lt;&lt; 31) OR (tmp[30:0])
+	1: dst[31:0] := tmp[31:0]
+	2: dst[31:0] := (0 &lt;&lt; 31) OR (tmp[30:0])
+	3: dst[31:0] := (1 &lt;&lt; 31) OR (tmp[30:0])
+	ESAC
+	
+	RETURN dst
+}
+
+FOR j := 0 to 15
+	i := j*32
+	IF k[j]
+		dst[i+31:i] := RANGE(a[i+31:i], b[i+31:i], imm8[1:0], imm8[3:2])
+	ELSE
+		dst[i+31:i] := 0
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name="vrangeps"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512" name="_mm512_range_ps">
+	<type>Floating Point</type>
+	<CPUID>AVX512DQ</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="a" type="__m512"/>
+	<parameter varname="b" type="__m512"/>
+	<parameter varname="imm8" type="int"/>
+	<description>Calculate the max, min, absolute max, or absolute min (depending on control in "imm8") for packed single-precision (32-bit) floating-point elements in "a" and "b", and store the results in "dst".
+	imm8[1:0] specifies the operation control: 00 = min, 01 = max, 10 = absolute min, 11 = absolute max.
+	imm8[3:2] specifies the sign control: 00 = sign from a, 01 = sign from compare result, 10 = clear sign bit, 11 = set sign bit.
+	</description>
+	<operation>
+RANGE(src1[31:0], src2[31:0], opCtl[1:0], signSelCtl[1:0])
+{
+	CASE opCtl[1:0]
+	0: tmp[31:0] := (src1[31:0] &lt;= src2[31:0]) ? src1[31:0] : src2[31:0]
+	1: tmp[31:0] := (src1[31:0] &lt;= src2[31:0]) ? src2[31:0] : src1[31:0]
+	2: tmp[31:0] := (ABS(src1[31:0]) &lt;= ABS(src2[31:0])) ? src1[31:0] : src2[31:0]
+	3: tmp[31:0] := (ABS(src1[31:0]) &lt;= ABS(src2[31:0])) ? src2[31:0] : src1[31:0]
+	ESAC
+	
+	CASE signSelCtl[1:0]
+	0: dst[31:0] := (src1[31] &lt;&lt; 31) OR (tmp[30:0])
+	1: dst[31:0] := tmp[31:0]
+	2: dst[31:0] := (0 &lt;&lt; 31) OR (tmp[30:0])
+	3: dst[31:0] := (1 &lt;&lt; 31) OR (tmp[30:0])
+	ESAC
+	
+	RETURN dst
+}
+
+FOR j := 0 to 15
+	i := j*32
+	dst[i+31:i] := RANGE(a[i+31:i], b[i+31:i], imm8[1:0], imm8[3:2])
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name="vrangeps"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512" name="_mm512_range_round_ps">
+	<type>Floating Point</type>
+	<CPUID>AVX512DQ</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="a" type="__m512"/>
+	<parameter varname="b" type="__m512"/>
+	<parameter varname="imm8" type="int"/>
+	<parameter varname="rounding" type="int"/>
+	<description>Calculate the max, min, absolute max, or absolute min (depending on control in "imm8") for packed single-precision (32-bit) floating-point elements in "a" and "b", and store the results in "dst".
+	imm8[1:0] specifies the operation control: 00 = min, 01 = max, 10 = absolute min, 11 = absolute max.
+	imm8[3:2] specifies the sign control: 00 = sign from a, 01 = sign from compare result, 10 = clear sign bit, 11 = set sign bit.
+	[round_note]
+	</description>
+	<operation>
+RANGE(src1[31:0], src2[31:0], opCtl[1:0], signSelCtl[1:0])
+{
+	CASE opCtl[1:0]
+	0: tmp[31:0] := (src1[31:0] &lt;= src2[31:0]) ? src1[31:0] : src2[31:0]
+	1: tmp[31:0] := (src1[31:0] &lt;= src2[31:0]) ? src2[31:0] : src1[31:0]
+	2: tmp[31:0] := (ABS(src1[31:0]) &lt;= ABS(src2[31:0])) ? src1[31:0] : src2[31:0]
+	3: tmp[31:0] := (ABS(src1[31:0]) &lt;= ABS(src2[31:0])) ? src2[31:0] : src1[31:0]
+	ESAC
+	
+	CASE signSelCtl[1:0]
+	0: dst[31:0] := (src1[31] &lt;&lt; 31) OR (tmp[30:0])
+	1: dst[31:0] := tmp[31:0]
+	2: dst[31:0] := (0 &lt;&lt; 31) OR (tmp[30:0])
+	3: dst[31:0] := (1 &lt;&lt; 31) OR (tmp[30:0])
+	ESAC
+	
+	RETURN dst
+}
+
+FOR j := 0 to 15
+	i := j*32
+	dst[i+31:i] := RANGE(a[i+31:i], b[i+31:i], imm8[1:0], imm8[3:2])
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name="vrangeps"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128" name="_mm_mask_range_ps">
+	<type>Floating Point</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512DQ</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="src" type="__m128"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128"/>
+	<parameter varname="b" type="__m128"/>
+	<parameter varname="imm8" type="int"/>
+	<description>Calculate the max, min, absolute max, or absolute min (depending on control in "imm8") for packed single-precision (32-bit) floating-point elements in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).
+	imm8[1:0] specifies the operation control: 00 = min, 01 = max, 10 = absolute min, 11 = absolute max.
+	imm8[3:2] specifies the sign control: 00 = sign from a, 01 = sign from compare result, 10 = clear sign bit, 11 = set sign bit.
+	</description>
+	<operation>
+RANGE(src1[31:0], src2[31:0], opCtl[1:0], signSelCtl[1:0])
+{
+	CASE opCtl[1:0]
+	0: tmp[31:0] := (src1[31:0] &lt;= src2[31:0]) ? src1[31:0] : src2[31:0]
+	1: tmp[31:0] := (src1[31:0] &lt;= src2[31:0]) ? src2[31:0] : src1[31:0]
+	2: tmp[31:0] := (ABS(src1[31:0]) &lt;= ABS(src2[31:0])) ? src1[31:0] : src2[31:0]
+	3: tmp[31:0] := (ABS(src1[31:0]) &lt;= ABS(src2[31:0])) ? src2[31:0] : src1[31:0]
+	ESAC
+	
+	CASE signSelCtl[1:0]
+	0: dst[31:0] := (src1[31] &lt;&lt; 31) OR (tmp[30:0])
+	1: dst[31:0] := tmp[31:0]
+	2: dst[31:0] := (0 &lt;&lt; 31) OR (tmp[30:0])
+	3: dst[31:0] := (1 &lt;&lt; 31) OR (tmp[30:0])
+	ESAC
+	
+	RETURN dst
+}
+
+FOR j := 0 to 3
+	i := j*32
+	IF k[j]
+		dst[i+31:i] := RANGE(a[i+31:i], b[i+31:i], imm8[1:0], imm8[3:2])
+	ELSE
+		dst[i+31:i] := src[i+31:i]
+	FI
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vrangeps"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128" name="_mm_maskz_range_ps">
+	<type>Floating Point</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512DQ</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128"/>
+	<parameter varname="b" type="__m128"/>
+	<parameter varname="imm8" type="int"/>
+	<description>Calculate the max, min, absolute max, or absolute min (depending on control in "imm8") for packed single-precision (32-bit) floating-point elements in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).
+	imm8[1:0] specifies the operation control: 00 = min, 01 = max, 10 = absolute min, 11 = absolute max.
+	imm8[3:2] specifies the sign control: 00 = sign from a, 01 = sign from compare result, 10 = clear sign bit, 11 = set sign bit.
+	</description>
+	<operation>
+RANGE(src1[31:0], src2[31:0], opCtl[1:0], signSelCtl[1:0])
+{
+	CASE opCtl[1:0]
+	0: tmp[31:0] := (src1[31:0] &lt;= src2[31:0]) ? src1[31:0] : src2[31:0]
+	1: tmp[31:0] := (src1[31:0] &lt;= src2[31:0]) ? src2[31:0] : src1[31:0]
+	2: tmp[31:0] := (ABS(src1[31:0]) &lt;= ABS(src2[31:0])) ? src1[31:0] : src2[31:0]
+	3: tmp[31:0] := (ABS(src1[31:0]) &lt;= ABS(src2[31:0])) ? src2[31:0] : src1[31:0]
+	ESAC
+	
+	CASE signSelCtl[1:0]
+	0: dst[31:0] := (src1[31] &lt;&lt; 31) OR (tmp[30:0])
+	1: dst[31:0] := tmp[31:0]
+	2: dst[31:0] := (0 &lt;&lt; 31) OR (tmp[30:0])
+	3: dst[31:0] := (1 &lt;&lt; 31) OR (tmp[30:0])
+	ESAC
+	
+	RETURN dst
+}
+
+FOR j := 0 to 3
+	i := j*32
+	IF k[j]
+		dst[i+31:i] := RANGE(a[i+31:i], b[i+31:i], imm8[1:0], imm8[3:2])
+	ELSE
+		dst[i+31:i] := 0
+	FI
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vrangeps"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128" name="_mm_range_ps">
+	<type>Floating Point</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512DQ</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="a" type="__m128"/>
+	<parameter varname="b" type="__m128"/>
+	<parameter varname="imm8" type="int"/>
+	<description>Calculate the max, min, absolute max, or absolute min (depending on control in "imm8") for packed single-precision (32-bit) floating-point elements in "a" and "b", and store the results in "dst".
+	imm8[1:0] specifies the operation control: 00 = min, 01 = max, 10 = absolute min, 11 = absolute max.
+	imm8[3:2] specifies the sign control: 00 = sign from a, 01 = sign from compare result, 10 = clear sign bit, 11 = set sign bit.
+	</description>
+	<operation>
+RANGE(src1[31:0], src2[31:0], opCtl[1:0], signSelCtl[1:0])
+{
+	CASE opCtl[1:0]
+	0: tmp[31:0] := (src1[31:0] &lt;= src2[31:0]) ? src1[31:0] : src2[31:0]
+	1: tmp[31:0] := (src1[31:0] &lt;= src2[31:0]) ? src2[31:0] : src1[31:0]
+	2: tmp[31:0] := (ABS(src1[31:0]) &lt;= ABS(src2[31:0])) ? src1[31:0] : src2[31:0]
+	3: tmp[31:0] := (ABS(src1[31:0]) &lt;= ABS(src2[31:0])) ? src2[31:0] : src1[31:0]
+	ESAC
+	
+	CASE signSelCtl[1:0]
+	0: dst[31:0] := (src1[31] &lt;&lt; 31) OR (tmp[30:0])
+	1: dst[31:0] := tmp[31:0]
+	2: dst[31:0] := (0 &lt;&lt; 31) OR (tmp[30:0])
+	3: dst[31:0] := (1 &lt;&lt; 31) OR (tmp[30:0])
+	ESAC
+	
+	RETURN dst
+}
+
+FOR j := 0 to 3
+	i := j*32
+	dst[i+31:i] := RANGE(a[i+31:i], b[i+31:i], imm8[1:0], imm8[3:2])
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vrangeps"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128d" name="_mm_mask_range_round_sd">
+	<type>Floating Point</type>
+	<CPUID>AVX512DQ</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="src" type="__m128d"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128d"/>
+	<parameter varname="b" type="__m128d"/>
+	<parameter varname="imm8" type="int"/>
+	<parameter varname="rounding" type="int"/>
+	<description>Calculate the max, min, absolute max, or absolute min (depending on control in "imm8") for the lower double-precision (64-bit) floating-point element in "a" and "b", store the result in the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst".
+	imm8[1:0] specifies the operation control: 00 = min, 01 = max, 10 = absolute min, 11 = absolute max.
+	imm8[3:2] specifies the sign control: 00 = sign from a, 01 = sign from compare result, 10 = clear sign bit, 11 = set sign bit.
+	[round_note]
+	</description>
+	<operation>
+RANGE(src1[63:0], src2[63:0], opCtl[1:0], signSelCtl[1:0])
+{
+	CASE opCtl[1:0]
+	0: tmp[63:0] := (src1[63:0] &lt;= src2[63:0]) ? src1[63:0] : src2[63:0]
+	1: tmp[63:0] := (src1[63:0] &lt;= src2[63:0]) ? src2[63:0] : src1[63:0]
+	2: tmp[63:0] := (ABS(src1[63:0]) &lt;= ABS(src2[63:0])) ? src1[63:0] : src2[63:0]
+	3: tmp[63:0] := (ABS(src1[63:0]) &lt;= ABS(src2[63:0])) ? src2[63:0] : src1[63:0]
+	ESAC
+	
+	CASE signSelCtl[1:0]
+	0: dst[63:0] := (src1[63] &lt;&lt; 63) OR (tmp[62:0])
+	1: dst[63:0] := tmp[63:0]
+	2: dst[63:0] := (0 &lt;&lt; 63) OR (tmp[62:0])
+	3: dst[63:0] := (1 &lt;&lt; 63) OR (tmp[62:0])
+	ESAC
+	
+	RETURN dst
+}
+
+IF k[0]
+	dst[63:0] := RANGE(a[63:0], b[63:0], imm8[1:0], imm8[3:2])
+ELSE
+	dst[63:0] := src[63:0]
+FI
+dst[127:64] := a[127:64]
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vrangesd"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128d" name="_mm_mask_range_sd">
+	<type>Floating Point</type>
+	<CPUID>AVX512DQ</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="src" type="__m128d"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128d"/>
+	<parameter varname="b" type="__m128d"/>
+	<parameter varname="imm8" type="int"/>
+	<description>Calculate the max, min, absolute max, or absolute min (depending on control in "imm8") for the lower double-precision (64-bit) floating-point element in "a" and "b", store the result in the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst".
+	imm8[1:0] specifies the operation control: 00 = min, 01 = max, 10 = absolute min, 11 = absolute max.
+	imm8[3:2] specifies the sign control: 00 = sign from a, 01 = sign from compare result, 10 = clear sign bit, 11 = set sign bit.
+	</description>
+	<operation>
+RANGE(src1[63:0], src2[63:0], opCtl[1:0], signSelCtl[1:0])
+{
+	CASE opCtl[1:0]
+	0: tmp[63:0] := (src1[63:0] &lt;= src2[63:0]) ? src1[63:0] : src2[63:0]
+	1: tmp[63:0] := (src1[63:0] &lt;= src2[63:0]) ? src2[63:0] : src1[63:0]
+	2: tmp[63:0] := (ABS(src1[63:0]) &lt;= ABS(src2[63:0])) ? src1[63:0] : src2[63:0]
+	3: tmp[63:0] := (ABS(src1[63:0]) &lt;= ABS(src2[63:0])) ? src2[63:0] : src1[63:0]
+	ESAC
+	
+	CASE signSelCtl[1:0]
+	0: dst[63:0] := (src1[63] &lt;&lt; 63) OR (tmp[62:0])
+	1: dst[63:0] := tmp[63:0]
+	2: dst[63:0] := (0 &lt;&lt; 63) OR (tmp[62:0])
+	3: dst[63:0] := (1 &lt;&lt; 63) OR (tmp[62:0])
+	ESAC
+	
+	RETURN dst
+}
+
+IF k[0]
+	dst[63:0] := RANGE(a[63:0], b[63:0], imm8[1:0], imm8[3:2])
+ELSE
+	dst[63:0] := src[63:0]
+FI
+dst[127:64] := a[127:64]
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vrangesd"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128d" name="_mm_maskz_range_round_sd">
+	<type>Floating Point</type>
+	<CPUID>AVX512DQ</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128d"/>
+	<parameter varname="b" type="__m128d"/>
+	<parameter varname="imm8" type="int"/>
+	<parameter varname="rounding" type="int"/>
+	<description>Calculate the max, min, absolute max, or absolute min (depending on control in "imm8") for the lower double-precision (64-bit) floating-point element in "a" and "b", store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst".
+	imm8[1:0] specifies the operation control: 00 = min, 01 = max, 10 = absolute min, 11 = absolute max.
+	imm8[3:2] specifies the sign control: 00 = sign from a, 01 = sign from compare result, 10 = clear sign bit, 11 = set sign bit.
+	[round_note]
+	</description>
+	<operation>
+RANGE(src1[63:0], src2[63:0], opCtl[1:0], signSelCtl[1:0])
+{
+	CASE opCtl[1:0]
+	0: tmp[63:0] := (src1[63:0] &lt;= src2[63:0]) ? src1[63:0] : src2[63:0]
+	1: tmp[63:0] := (src1[63:0] &lt;= src2[63:0]) ? src2[63:0] : src1[63:0]
+	2: tmp[63:0] := (ABS(src1[63:0]) &lt;= ABS(src2[63:0])) ? src1[63:0] : src2[63:0]
+	3: tmp[63:0] := (ABS(src1[63:0]) &lt;= ABS(src2[63:0])) ? src2[63:0] : src1[63:0]
+	ESAC
+	
+	CASE signSelCtl[1:0]
+	0: dst[63:0] := (src1[63] &lt;&lt; 63) OR (tmp[62:0])
+	1: dst[63:0] := tmp[63:0]
+	2: dst[63:0] := (0 &lt;&lt; 63) OR (tmp[62:0])
+	3: dst[63:0] := (1 &lt;&lt; 63) OR (tmp[62:0])
+	ESAC
+	
+	RETURN dst
+}
+
+IF k[0]
+	dst[63:0] := RANGE(a[63:0], b[63:0], imm8[1:0], imm8[3:2])
+ELSE
+	dst[63:0] := 0
+FI
+dst[127:64] := a[127:64]
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vrangesd"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128d" name="_mm_maskz_range_sd">
+	<type>Floating Point</type>
+	<CPUID>AVX512DQ</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128d"/>
+	<parameter varname="b" type="__m128d"/>
+	<parameter varname="imm8" type="int"/>
+	<description>Calculate the max, min, absolute max, or absolute min (depending on control in "imm8") for the lower double-precision (64-bit) floating-point element in "a" and "b", store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst".
+	imm8[1:0] specifies the operation control: 00 = min, 01 = max, 10 = absolute min, 11 = absolute max.
+	imm8[3:2] specifies the sign control: 00 = sign from a, 01 = sign from compare result, 10 = clear sign bit, 11 = set sign bit.
+	</description>
+	<operation>
+RANGE(src1[63:0], src2[63:0], opCtl[1:0], signSelCtl[1:0])
+{
+	CASE opCtl[1:0]
+	0: tmp[63:0] := (src1[63:0] &lt;= src2[63:0]) ? src1[63:0] : src2[63:0]
+	1: tmp[63:0] := (src1[63:0] &lt;= src2[63:0]) ? src2[63:0] : src1[63:0]
+	2: tmp[63:0] := (ABS(src1[63:0]) &lt;= ABS(src2[63:0])) ? src1[63:0] : src2[63:0]
+	3: tmp[63:0] := (ABS(src1[63:0]) &lt;= ABS(src2[63:0])) ? src2[63:0] : src1[63:0]
+	ESAC
+	
+	CASE signSelCtl[1:0]
+	0: dst[63:0] := (src1[63] &lt;&lt; 63) OR (tmp[62:0])
+	1: dst[63:0] := tmp[63:0]
+	2: dst[63:0] := (0 &lt;&lt; 63) OR (tmp[62:0])
+	3: dst[63:0] := (1 &lt;&lt; 63) OR (tmp[62:0])
+	ESAC
+	
+	RETURN dst
+}
+
+IF k[0]
+	dst[63:0] := RANGE(a[63:0], b[63:0], imm8[1:0], imm8[3:2])
+ELSE
+	dst[63:0] := 0
+FI
+dst[127:64] := a[127:64]
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vrangesd"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128d" name="_mm_range_round_sd">
+	<type>Floating Point</type>
+	<CPUID>AVX512DQ</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="a" type="__m128d"/>
+	<parameter varname="b" type="__m128d"/>
+	<parameter varname="imm8" type="int"/>
+	<parameter varname="rounding" type="int"/>
+	<description>Calculate the max, min, absolute max, or absolute min (depending on control in "imm8") for the lower double-precision (64-bit) floating-point element in "a" and "b", store the result in the lower element of "dst", and copy the upper element from "a" to the upper element of "dst".
+	imm8[1:0] specifies the operation control: 00 = min, 01 = max, 10 = absolute min, 11 = absolute max.
+	imm8[3:2] specifies the sign control: 00 = sign from a, 01 = sign from compare result, 10 = clear sign bit, 11 = set sign bit.
+	[round_note]
+	</description>
+	<operation>
+RANGE(src1[63:0], src2[63:0], opCtl[1:0], signSelCtl[1:0])
+{
+	CASE opCtl[1:0]
+	0: tmp[63:0] := (src1[63:0] &lt;= src2[63:0]) ? src1[63:0] : src2[63:0]
+	1: tmp[63:0] := (src1[63:0] &lt;= src2[63:0]) ? src2[63:0] : src1[63:0]
+	2: tmp[63:0] := (ABS(src1[63:0]) &lt;= ABS(src2[63:0])) ? src1[63:0] : src2[63:0]
+	3: tmp[63:0] := (ABS(src1[63:0]) &lt;= ABS(src2[63:0])) ? src2[63:0] : src1[63:0]
+	ESAC
+	
+	CASE signSelCtl[1:0]
+	0: dst[63:0] := (src1[63] &lt;&lt; 63) OR (tmp[62:0])
+	1: dst[63:0] := tmp[63:0]
+	2: dst[63:0] := (0 &lt;&lt; 63) OR (tmp[62:0])
+	3: dst[63:0] := (1 &lt;&lt; 63) OR (tmp[62:0])
+	ESAC
+	
+	RETURN dst
+}
+
+dst[63:0] := RANGE(a[63:0], b[63:0], imm8[1:0], imm8[3:2])
+dst[127:64] := a[127:64]
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vrangesd"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128" name="_mm_mask_range_round_ss">
+	<type>Floating Point</type>
+	<CPUID>AVX512DQ</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="src" type="__m128"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128"/>
+	<parameter varname="b" type="__m128"/>
+	<parameter varname="imm8" type="int"/>
+	<parameter varname="rounding" type="int"/>
+	<description>Calculate the max, min, absolute max, or absolute min (depending on control in "imm8") for the lower single-precision (32-bit) floating-point element in "a" and "b", store the result in the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper 3 packed elements from "a" to the upper elements of "dst".
+	imm8[1:0] specifies the operation control: 00 = min, 01 = max, 10 = absolute min, 11 = absolute max.
+	imm8[3:2] specifies the sign control: 00 = sign from a, 01 = sign from compare result, 10 = clear sign bit, 11 = set sign bit.
+	[round_note]
+	</description>
+	<operation>
+RANGE(src1[31:0], src2[31:0], opCtl[1:0], signSelCtl[1:0])
+{
+	CASE opCtl[1:0]
+	0: tmp[31:0] := (src1[31:0] &lt;= src2[31:0]) ? src1[31:0] : src2[31:0]
+	1: tmp[31:0] := (src1[31:0] &lt;= src2[31:0]) ? src2[31:0] : src1[31:0]
+	2: tmp[31:0] := (ABS(src1[31:0]) &lt;= ABS(src2[31:0])) ? src1[31:0] : src2[31:0]
+	3: tmp[31:0] := (ABS(src1[31:0]) &lt;= ABS(src2[31:0])) ? src2[31:0] : src1[31:0]
+	ESAC
+	
+	CASE signSelCtl[1:0]
+	0: dst[31:0] := (src1[31] &lt;&lt; 31) OR (tmp[30:0])
+	1: dst[31:0] := tmp[31:0]
+	2: dst[31:0] := (0 &lt;&lt; 31) OR (tmp[30:0])
+	3: dst[31:0] := (1 &lt;&lt; 31) OR (tmp[30:0])
+	ESAC
+	
+	RETURN dst
+}
+
+IF k[0]
+	dst[31:0] := RANGE(a[31:0], b[31:0], imm8[1:0], imm8[3:2])
+ELSE
+	dst[31:0] := src[31:0]
+FI
+dst[127:32] := a[127:32]
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vrangess"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128" name="_mm_mask_range_ss">
+	<type>Floating Point</type>
+	<CPUID>AVX512DQ</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="src" type="__m128"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128"/>
+	<parameter varname="b" type="__m128"/>
+	<parameter varname="imm8" type="int"/>
+	<description>Calculate the max, min, absolute max, or absolute min (depending on control in "imm8") for the lower single-precision (32-bit) floating-point element in "a" and "b", store the result in the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper 3 packed elements from "a" to the upper elements of "dst".
+	imm8[1:0] specifies the operation control: 00 = min, 01 = max, 10 = absolute min, 11 = absolute max.
+	imm8[3:2] specifies the sign control: 00 = sign from a, 01 = sign from compare result, 10 = clear sign bit, 11 = set sign bit.
+	</description>
+	<operation>
+RANGE(src1[31:0], src2[31:0], opCtl[1:0], signSelCtl[1:0])
+{
+	CASE opCtl[1:0]
+	0: tmp[31:0] := (src1[31:0] &lt;= src2[31:0]) ? src1[31:0] : src2[31:0]
+	1: tmp[31:0] := (src1[31:0] &lt;= src2[31:0]) ? src2[31:0] : src1[31:0]
+	2: tmp[31:0] := (ABS(src1[31:0]) &lt;= ABS(src2[31:0])) ? src1[31:0] : src2[31:0]
+	3: tmp[31:0] := (ABS(src1[31:0]) &lt;= ABS(src2[31:0])) ? src2[31:0] : src1[31:0]
+	ESAC
+	
+	CASE signSelCtl[1:0]
+	0: dst[31:0] := (src1[31] &lt;&lt; 31) OR (tmp[30:0])
+	1: dst[31:0] := tmp[31:0]
+	2: dst[31:0] := (0 &lt;&lt; 31) OR (tmp[30:0])
+	3: dst[31:0] := (1 &lt;&lt; 31) OR (tmp[30:0])
+	ESAC
+	
+	RETURN dst
+}
+
+IF k[0]
+	dst[31:0] := RANGE(a[31:0], b[31:0], imm8[1:0], imm8[3:2])
+ELSE
+	dst[31:0] := src[31:0]
+FI
+dst[127:32] := a[127:32]
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vrangess"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128" name="_mm_maskz_range_round_ss">
+	<type>Floating Point</type>
+	<CPUID>AVX512DQ</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128"/>
+	<parameter varname="b" type="__m128"/>
+	<parameter varname="imm8" type="int"/>
+	<parameter varname="rounding" type="int"/>
+	<description>Calculate the max, min, absolute max, or absolute min (depending on control in "imm8") for the lower single-precision (32-bit) floating-point element in "a" and "b", store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from "a" to the upper elements of "dst".
+	imm8[1:0] specifies the operation control: 00 = min, 01 = max, 10 = absolute min, 11 = absolute max.
+	imm8[3:2] specifies the sign control: 00 = sign from a, 01 = sign from compare result, 10 = clear sign bit, 11 = set sign bit.
+	[round_note]
+	</description>
+	<operation>
+RANGE(src1[31:0], src2[31:0], opCtl[1:0], signSelCtl[1:0])
+{
+	CASE opCtl[1:0]
+	0: tmp[31:0] := (src1[31:0] &lt;= src2[31:0]) ? src1[31:0] : src2[31:0]
+	1: tmp[31:0] := (src1[31:0] &lt;= src2[31:0]) ? src2[31:0] : src1[31:0]
+	2: tmp[31:0] := (ABS(src1[31:0]) &lt;= ABS(src2[31:0])) ? src1[31:0] : src2[31:0]
+	3: tmp[31:0] := (ABS(src1[31:0]) &lt;= ABS(src2[31:0])) ? src2[31:0] : src1[31:0]
+	ESAC
+	
+	CASE signSelCtl[1:0]
+	0: dst[31:0] := (src1[31] &lt;&lt; 31) OR (tmp[30:0])
+	1: dst[31:0] := tmp[31:0]
+	2: dst[31:0] := (0 &lt;&lt; 31) OR (tmp[30:0])
+	3: dst[31:0] := (1 &lt;&lt; 31) OR (tmp[30:0])
+	ESAC
+	
+	RETURN dst
+}
+
+IF k[0]
+	dst[31:0] := RANGE(a[31:0], b[31:0], imm8[1:0], imm8[3:2])
+ELSE
+	dst[31:0] := 0
+FI
+dst[127:32] := a[127:32]
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vrangess"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128" name="_mm_maskz_range_ss">
+	<type>Floating Point</type>
+	<CPUID>AVX512DQ</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128"/>
+	<parameter varname="b" type="__m128"/>
+	<parameter varname="imm8" type="int"/>
+	<description>Calculate the max, min, absolute max, or absolute min (depending on control in "imm8") for the lower single-precision (32-bit) floating-point element in "a" and "b", store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from "a" to the upper elements of "dst".
+	imm8[1:0] specifies the operation control: 00 = min, 01 = max, 10 = absolute min, 11 = absolute max.
+	imm8[3:2] specifies the sign control: 00 = sign from a, 01 = sign from compare result, 10 = clear sign bit, 11 = set sign bit.
+	</description>
+	<operation>
+RANGE(src1[31:0], src2[31:0], opCtl[1:0], signSelCtl[1:0])
+{
+	CASE opCtl[1:0]
+	0: tmp[31:0] := (src1[31:0] &lt;= src2[31:0]) ? src1[31:0] : src2[31:0]
+	1: tmp[31:0] := (src1[31:0] &lt;= src2[31:0]) ? src2[31:0] : src1[31:0]
+	2: tmp[31:0] := (ABS(src1[31:0]) &lt;= ABS(src2[31:0])) ? src1[31:0] : src2[31:0]
+	3: tmp[31:0] := (ABS(src1[31:0]) &lt;= ABS(src2[31:0])) ? src2[31:0] : src1[31:0]
+	ESAC
+	
+	CASE signSelCtl[1:0]
+	0: dst[31:0] := (src1[31] &lt;&lt; 31) OR (tmp[30:0])
+	1: dst[31:0] := tmp[31:0]
+	2: dst[31:0] := (0 &lt;&lt; 31) OR (tmp[30:0])
+	3: dst[31:0] := (1 &lt;&lt; 31) OR (tmp[30:0])
+	ESAC
+	
+	RETURN dst
+}
+
+IF k[0]
+	dst[31:0] := RANGE(a[31:0], b[31:0], imm8[1:0], imm8[3:2])
+ELSE
+	dst[31:0] := 0
+FI
+dst[127:32] := a[127:32]
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vrangess"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128" name="_mm_range_round_ss">
+	<type>Floating Point</type>
+	<CPUID>AVX512DQ</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="a" type="__m128"/>
+	<parameter varname="b" type="__m128"/>
+	<parameter varname="imm8" type="int"/>
+	<parameter varname="rounding" type="int"/>
+	<description>Calculate the max, min, absolute max, or absolute min (depending on control in "imm8") for the lower single-precision (32-bit) floating-point element in "a" and "b", store the result in the lower element of "dst", and copy the upper 3 packed elements from "a" to the upper elements of "dst".
+	imm8[1:0] specifies the operation control: 00 = min, 01 = max, 10 = absolute min, 11 = absolute max.
+	imm8[3:2] specifies the sign control: 00 = sign from a, 01 = sign from compare result, 10 = clear sign bit, 11 = set sign bit.
+	[round_note]
+	</description>
+	<operation>
+RANGE(src1[31:0], src2[31:0], opCtl[1:0], signSelCtl[1:0])
+{
+	CASE opCtl[1:0]
+	0: tmp[31:0] := (src1[31:0] &lt;= src2[31:0]) ? src1[31:0] : src2[31:0]
+	1: tmp[31:0] := (src1[31:0] &lt;= src2[31:0]) ? src2[31:0] : src1[31:0]
+	2: tmp[31:0] := (ABS(src1[31:0]) &lt;= ABS(src2[31:0])) ? src1[31:0] : src2[31:0]
+	3: tmp[31:0] := (ABS(src1[31:0]) &lt;= ABS(src2[31:0])) ? src2[31:0] : src1[31:0]
+	ESAC
+	
+	CASE signSelCtl[1:0]
+	0: dst[31:0] := (src1[31] &lt;&lt; 31) OR (tmp[30:0])
+	1: dst[31:0] := tmp[31:0]
+	2: dst[31:0] := (0 &lt;&lt; 31) OR (tmp[30:0])
+	3: dst[31:0] := (1 &lt;&lt; 31) OR (tmp[30:0])
+	ESAC
+	
+	RETURN dst
+}
+
+dst[31:0] := RANGE(a[31:0], b[31:0], imm8[1:0], imm8[3:2])
+dst[127:32] := a[127:32]
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vrangess"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256d" name="_mm256_mask_rcp14_pd">
+	<type>Floating Point</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="src" type="__m256d"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m256d"/>
+	<description>Compute the approximate reciprocal of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 2^-14.</description>
+	<operation>
+FOR j := 0 to 3
+	i := j*64
+	IF k[j]
+		dst[i+63:i] := APPROXIMATE(1.0/a[i+63:i])
+	ELSE
+		dst[i+63:i] := src[i+63:i]
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vrcp14pd"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256d" name="_mm256_maskz_rcp14_pd">
+	<type>Floating Point</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m256d"/>
+	<description>Compute the approximate reciprocal of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 2^-14.</description>
+	<operation>
+FOR j := 0 to 3
+	i := j*64
+	IF k[j]
+		dst[i+63:i] := APPROXIMATE(1.0/a[i+63:i])
+	ELSE
+		dst[i+63:i] := 0
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vrcp14pd"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256d" name="_mm256_rcp14_pd">
+	<type>Floating Point</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="a" type="__m256d"/>
+	<description>Compute the approximate reciprocal of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst". The maximum relative error for this approximation is less than 2^-14.</description>
+	<operation>
+FOR j := 0 to 3
+	i := j*64
+	dst[i+63:i] := APPROXIMATE(1.0/a[i+63:i])
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vrcp14pd"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128d" name="_mm_mask_rcp14_pd">
+	<type>Floating Point</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="src" type="__m128d"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128d"/>
+	<description>Compute the approximate reciprocal of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 2^-14.</description>
+	<operation>
+FOR j := 0 to 1
+	i := j*64
+	IF k[j]
+		dst[i+63:i] := APPROXIMATE(1.0/a[i+63:i])
+	ELSE
+		dst[i+63:i] := src[i+63:i]
+	FI
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vrcp14pd"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128d" name="_mm_maskz_rcp14_pd">
+	<type>Floating Point</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128d"/>
+	<description>Compute the approximate reciprocal of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 2^-14.</description>
+	<operation>
+FOR j := 0 to 1
+	i := j*64
+	IF k[j]
+		dst[i+63:i] := APPROXIMATE(1.0/a[i+63:i])
+	ELSE
+		dst[i+63:i] := 0
+	FI
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vrcp14pd"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128d" name="_mm_rcp14_pd">
+	<type>Floating Point</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="a" type="__m128d"/>
+	<description>Compute the approximate reciprocal of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst". The maximum relative error for this approximation is less than 2^-14.</description>
+	<operation>
+FOR j := 0 to 1
+	i := j*64
+	dst[i+63:i] := APPROXIMATE(1.0/a[i+63:i])
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vrcp14pd"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256" name="_mm256_mask_rcp14_ps">
+	<type>Floating Point</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="src" type="__m256"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m256"/>
+	<description>Compute the approximate reciprocal of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 2^-14.</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*32
+	IF k[j]
+		dst[i+31:i] := APPROXIMATE(1.0/a[i+31:i])
+	ELSE
+		dst[i+31:i] := src[i+31:i]
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vrcp14ps"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256" name="_mm256_maskz_rcp14_ps">
+	<type>Floating Point</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m256"/>
+	<description>Compute the approximate reciprocal of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 2^-14.</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*32
+	IF k[j]
+		dst[i+31:i] := APPROXIMATE(1.0/a[i+31:i])
+	ELSE
+		dst[i+31:i] := 0
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vrcp14ps"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256" name="_mm256_rcp14_ps">
+	<type>Floating Point</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="a" type="__m256"/>
+	<description>Compute the approximate reciprocal of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst". The maximum relative error for this approximation is less than 2^-14.</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*32
+	dst[i+31:i] := APPROXIMATE(1.0/a[i+31:i])
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vrcp14ps"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128" name="_mm_mask_rcp14_ps">
+	<type>Floating Point</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="src" type="__m128"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128"/>
+	<description>Compute the approximate reciprocal of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 2^-14.</description>
+	<operation>
+FOR j := 0 to 3
+	i := j*32
+	IF k[j]
+		dst[i+31:i] := APPROXIMATE(1.0/a[i+31:i])
+	ELSE
+		dst[i+31:i] := src[i+31:i]
+	FI
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vrcp14ps"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128" name="_mm_maskz_rcp14_ps">
+	<type>Floating Point</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128"/>
+	<description>Compute the approximate reciprocal of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 2^-14.</description>
+	<operation>
+FOR j := 0 to 3
+	i := j*32
+	IF k[j]
+		dst[i+31:i] := APPROXIMATE(1.0/a[i+31:i])
+	ELSE
+		dst[i+31:i] := 0
+	FI
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vrcp14ps"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128" name="_mm_rcp14_ps">
+	<type>Floating Point</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="a" type="__m128"/>
+	<description>Compute the approximate reciprocal of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst". The maximum relative error for this approximation is less than 2^-14.</description>
+	<operation>
+FOR j := 0 to 3
+	i := j*32
+	dst[i+31:i] := APPROXIMATE(1.0/a[i+31:i])
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vrcp14ps"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256d" name="_mm256_mask_reduce_pd">
+	<type>Floating Point</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512DQ</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="src" type="__m256d"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m256d"/>
+	<parameter varname="imm8" type="int"/>
+	<description>Extract the reduced argument of packed double-precision (64-bit) floating-point elements in "a" by the number of bits specified by "imm8", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). </description>
+	<operation>
+ReduceArgumentPD(src1[63:0], imm8[7:0])
+{
+	m := imm8[7:4] // number of fraction bits after the binary point to be preserved
+	rc := imm8[1:0] // round control
+	rc_src := imm8[2] // round control source
+	spe := 0
+	tmp[63:0] := pow(2, -m) * ROUND(pow(2, m) * src1[63:0], spe, rc_src, rc)
+	tmp[63:0] := src1[63:0] - tmp[63:0]
+	RETURN tmp[63:0]
+}
+
+FOR j := 0 to 3
+	i := j*64
+	IF k[j]
+		dst[i+63:i] := ReduceArgumentPD(a[i+63:i], imm8[7:0])
+	ELSE
+		dst[i+63:i] := src[i+63:i]
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vreducepd"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256d" name="_mm256_maskz_reduce_pd">
+	<type>Floating Point</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512DQ</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m256d"/>
+	<parameter varname="imm8" type="int"/>
+	<description>Extract the reduced argument of packed double-precision (64-bit) floating-point elements in "a" by the number of bits specified by "imm8", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). </description>
+	<operation>
+ReduceArgumentPD(src1[63:0], imm8[7:0])
+{
+	m := imm8[7:4] // number of fraction bits after the binary point to be preserved
+	rc := imm8[1:0] // round control
+	rc_src := imm8[2] // round control source
+	spe := 0
+	tmp[63:0] := pow(2, -m) * ROUND(pow(2, m) * src1[63:0], spe, rc_src, rc)
+	tmp[63:0] := src1[63:0] - tmp[63:0]
+	RETURN tmp[63:0]
+}
+
+FOR j := 0 to 3
+	i := j*64
+	IF k[j]
+		dst[i+63:i] := ReduceArgumentPD(a[i+63:i], imm8[7:0])
+	ELSE
+		dst[i+63:i] := 0
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vreducepd"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256d" name="_mm256_reduce_pd">
+	<type>Floating Point</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512DQ</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="a" type="__m256d"/>
+	<parameter varname="imm8" type="int"/>
+	<description>Extract the reduced argument of packed double-precision (64-bit) floating-point elements in "a" by the number of bits specified by "imm8", and store the results in "dst". </description>
+	<operation>
+ReduceArgumentPD(src1[63:0], imm8[7:0])
+{
+	m := imm8[7:4] // number of fraction bits after the binary point to be preserved
+	rc := imm8[1:0] // round control
+	rc_src := imm8[2] // round control source
+	spe := 0
+	tmp[63:0] := pow(2, -m) * ROUND(pow(2, m) * src1[63:0], spe, rc_src, rc)
+	tmp[63:0] := src1[63:0] - tmp[63:0]
+	RETURN tmp[63:0]
+}
+
+FOR j := 0 to 3
+	i := j*64
+	dst[i+63:i] := ReduceArgumentPD(a[i+63:i], imm8[7:0])
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vreducepd"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512d" name="_mm512_mask_reduce_pd">
+	<type>Floating Point</type>
+	<CPUID>AVX512DQ</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="src" type="__m512d"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m512d"/>
+	<parameter varname="imm8" type="int"/>
+	<description>Extract the reduced argument of packed double-precision (64-bit) floating-point elements in "a" by the number of bits specified by "imm8", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). </description>
+	<operation>
+ReduceArgumentPD(src1[63:0], imm8[7:0])
+{
+	m := imm8[7:4] // number of fraction bits after the binary point to be preserved
+	rc := imm8[1:0] // round control
+	rc_src := imm8[2] // round control source
+	spe := 0
+	tmp[63:0] := pow(2, -m) * ROUND(pow(2, m) * src1[63:0], spe, rc_src, rc)
+	tmp[63:0] := src1[63:0] - tmp[63:0]
+	RETURN tmp[63:0]
+}
+
+FOR j := 0 to 7
+	i := j*64
+	IF k[j]
+		dst[i+63:i] := ReduceArgumentPD(a[i+63:i], imm8[7:0])
+	ELSE
+		dst[i+63:i] := src[i+63:i]
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name="vreducepd"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512d" name="_mm512_mask_reduce_round_pd">
+	<type>Floating Point</type>
+	<CPUID>AVX512DQ</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="src" type="__m512d"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m512d"/>
+	<parameter varname="imm8" type="int"/>
+	<parameter varname="rounding" type="int"/>
+	<description>Extract the reduced argument of packed double-precision (64-bit) floating-point elements in "a" by the number of bits specified by "imm8", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).
+	[round_note]</description>
+	<operation>
+ReduceArgumentPD(src1[63:0], imm8[7:0])
+{
+	m := imm8[7:4] // number of fraction bits after the binary point to be preserved
+	rc := imm8[1:0] // round control
+	rc_src := imm8[2] // round control source
+	spe := 0
+	tmp[63:0] := pow(2, -m) * ROUND(pow(2, m) * src1[63:0], spe, rc_src, rc)
+	tmp[63:0] := src1[63:0] - tmp[63:0]
+	RETURN tmp[63:0]
+}
+
+FOR j := 0 to 7
+	i := j*64
+	IF k[j]
+		dst[i+63:i] := ReduceArgumentPD(a[i+63:i], imm8[7:0])
+	ELSE
+		dst[i+63:i] := src[i+63:i]
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name="vreducepd"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512d" name="_mm512_maskz_reduce_pd">
+	<type>Floating Point</type>
+	<CPUID>AVX512DQ</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m512d"/>
+	<parameter varname="imm8" type="int"/>
+	<description>Extract the reduced argument of packed double-precision (64-bit) floating-point elements in "a" by the number of bits specified by "imm8", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). </description>
+	<operation>
+ReduceArgumentPD(src1[63:0], imm8[7:0])
+{
+	m := imm8[7:4] // number of fraction bits after the binary point to be preserved
+	rc := imm8[1:0] // round control
+	rc_src := imm8[2] // round control source
+	spe := 0
+	tmp[63:0] := pow(2, -m) * ROUND(pow(2, m) * src1[63:0], spe, rc_src, rc)
+	tmp[63:0] := src1[63:0] - tmp[63:0]
+	RETURN tmp[63:0]
+}
+
+FOR j := 0 to 7
+	i := j*64
+	IF k[j]
+		dst[i+63:i] := ReduceArgumentPD(a[i+63:i], imm8[7:0])
+	ELSE
+		dst[i+63:i] := 0
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name="vreducepd"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512d" name="_mm512_maskz_reduce_round_pd">
+	<type>Floating Point</type>
+	<CPUID>AVX512DQ</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m512d"/>
+	<parameter varname="imm8" type="int"/>
+	<parameter varname="rounding" type="int"/>
+	<description>Extract the reduced argument of packed double-precision (64-bit) floating-point elements in "a" by the number of bits specified by "imm8", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).
+	[round_note]</description>
+	<operation>
+ReduceArgumentPD(src1[63:0], imm8[7:0])
+{
+	m := imm8[7:4] // number of fraction bits after the binary point to be preserved
+	rc := imm8[1:0] // round control
+	rc_src := imm8[2] // round control source
+	spe := 0
+	tmp[63:0] := pow(2, -m) * ROUND(pow(2, m) * src1[63:0], spe, rc_src, rc)
+	tmp[63:0] := src1[63:0] - tmp[63:0]
+	RETURN tmp[63:0]
+}
+
+FOR j := 0 to 7
+	i := j*64
+	IF k[j]
+		dst[i+63:i] := ReduceArgumentPD(a[i+63:i], imm8[7:0])
+	ELSE
+		dst[i+63:i] := 0
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name="vreducepd"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512d" name="_mm512_reduce_pd">
+	<type>Floating Point</type>
+	<CPUID>AVX512DQ</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="a" type="__m512d"/>
+	<parameter varname="imm8" type="int"/>
+	<description>Extract the reduced argument of packed double-precision (64-bit) floating-point elements in "a" by the number of bits specified by "imm8", and store the results in "dst". </description>
+	<operation>
+ReduceArgumentPD(src1[63:0], imm8[7:0])
+{
+	m := imm8[7:4] // number of fraction bits after the binary point to be preserved
+	rc := imm8[1:0] // round control
+	rc_src := imm8[2] // round control source
+	spe := 0
+	tmp[63:0] := pow(2, -m) * ROUND(pow(2, m) * src1[63:0], spe, rc_src, rc)
+	tmp[63:0] := src1[63:0] - tmp[63:0]
+	RETURN tmp[63:0]
+}
+
+FOR j := 0 to 7
+	i := j*64
+	dst[i+63:i] := ReduceArgumentPD(a[i+63:i], imm8[7:0])
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name="vreducepd"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512d" name="_mm512_reduce_round_pd">
+	<type>Floating Point</type>
+	<CPUID>AVX512DQ</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="a" type="__m512d"/>
+	<parameter varname="imm8" type="int"/>
+	<parameter varname="rounding" type="int"/>
+	<description>Extract the reduced argument of packed double-precision (64-bit) floating-point elements in "a" by the number of bits specified by "imm8", and store the results in "dst".
+	[round_note]</description>
+	<operation>
+ReduceArgumentPD(src1[63:0], imm8[7:0])
+{
+	m := imm8[7:4] // number of fraction bits after the binary point to be preserved
+	rc := imm8[1:0] // round control
+	rc_src := imm8[2] // round control source
+	spe := 0
+	tmp[63:0] := pow(2, -m) * ROUND(pow(2, m) * src1[63:0], spe, rc_src, rc)
+	tmp[63:0] := src1[63:0] - tmp[63:0]
+	RETURN tmp[63:0]
+}
+
+FOR j := 0 to 7
+	i := j*64
+	dst[i+63:i] := ReduceArgumentPD(a[i+63:i], imm8[7:0])
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name="vreducepd"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128d" name="_mm_mask_reduce_pd">
+	<type>Floating Point</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512DQ</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="src" type="__m128d"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128d"/>
+	<parameter varname="imm8" type="int"/>
+	<description>Extract the reduced argument of packed double-precision (64-bit) floating-point elements in "a" by the number of bits specified by "imm8", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). </description>
+	<operation>
+ReduceArgumentPD(src1[63:0], imm8[7:0])
+{
+	m := imm8[7:4] // number of fraction bits after the binary point to be preserved
+	rc := imm8[1:0] // round control
+	rc_src := imm8[2] // round control source
+	spe := 0
+	tmp[63:0] := pow(2, -m) * ROUND(pow(2, m) * src1[63:0], spe, rc_src, rc)
+	tmp[63:0] := src1[63:0] - tmp[63:0]
+	RETURN tmp[63:0]
+}
+
+FOR j := 0 to 1
+	i := j*64
+	IF k[j]
+		dst[i+63:i] := ReduceArgumentPD(a[i+63:i], imm8[7:0])
+	ELSE
+		dst[i+63:i] := src[i+63:i]
+	FI
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vreducepd"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128d" name="_mm_maskz_reduce_pd">
+	<type>Floating Point</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512DQ</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128d"/>
+	<parameter varname="imm8" type="int"/>
+	<description>Extract the reduced argument of packed double-precision (64-bit) floating-point elements in "a" by the number of bits specified by "imm8", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). </description>
+	<operation>
+ReduceArgumentPD(src1[63:0], imm8[7:0])
+{
+	m := imm8[7:4] // number of fraction bits after the binary point to be preserved
+	rc := imm8[1:0] // round control
+	rc_src := imm8[2] // round control source
+	spe := 0
+	tmp[63:0] := pow(2, -m) * ROUND(pow(2, m) * src1[63:0], spe, rc_src, rc)
+	tmp[63:0] := src1[63:0] - tmp[63:0]
+	RETURN tmp[63:0]
+}
+
+FOR j := 0 to 1
+	i := j*64
+	IF k[j]
+		dst[i+63:i] := ReduceArgumentPD(a[i+63:i], imm8[7:0])
+	ELSE
+		dst[i+63:i] := 0
+	FI
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vreducepd"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128d" name="_mm_reduce_pd">
+	<type>Floating Point</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512DQ</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="a" type="__m128d"/>
+	<parameter varname="imm8" type="int"/>
+	<description>Extract the reduced argument of packed double-precision (64-bit) floating-point elements in "a" by the number of bits specified by "imm8", and store the results in "dst". </description>
+	<operation>
+ReduceArgumentPD(src1[63:0], imm8[7:0])
+{
+	m := imm8[7:4] // number of fraction bits after the binary point to be preserved
+	rc := imm8[1:0] // round control
+	rc_src := imm8[2] // round control source
+	spe := 0
+	tmp[63:0] := pow(2, -m) * ROUND(pow(2, m) * src1[63:0], spe, rc_src, rc)
+	tmp[63:0] := src1[63:0] - tmp[63:0]
+	RETURN tmp[63:0]
+}
+
+FOR j := 0 to 1
+	i := j*64
+	dst[i+63:i] := ReduceArgumentPD(a[i+63:i], imm8[7:0])
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vreducepd"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256" name="_mm256_mask_reduce_ps">
+	<type>Floating Point</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512DQ</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="src" type="__m256"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m256"/>
+	<parameter varname="imm8" type="int"/>
+	<description>Extract the reduced argument of packed single-precision (32-bit) floating-point elements in "a" by the number of bits specified by "imm8", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). </description>
+	<operation>
+ReduceArgumentPS(src1[31:0], imm8[7:0])
+{
+	IF src1[31:0] == NAN
+		RETURN (convert src1[31:0] to QNaN)
+	FI
+	
+	m := imm8[7:4] // number of fraction bits after the binary point to be preserved
+	rc := imm8[1:0] // round control
+	rc_src := imm8[2] // round control source
+	spe := 0
+	tmp[31:0] := pow(2, -m)*ROUND(pow(2, m)*src1[31:0], spe, rc_src, rc)
+	tmp[31:0] := src1[31:0] - tmp[31:0]
+	RETURN tmp[31:0]
+}
+FOR j := 0 to 7
+	i := j*32
+	IF k[j]
+		dst[i+31:i] := ReduceArgumentPS(a[i+31:i], imm8[7:0])
+	ELSE
+		dst[i+31:i] := src[i+31:i]
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vreduceps"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256" name="_mm256_maskz_reduce_ps">
+	<type>Floating Point</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512DQ</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m256"/>
+	<parameter varname="imm8" type="int"/>
+	<description>Extract the reduced argument of packed single-precision (32-bit) floating-point elements in "a" by the number of bits specified by "imm8", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+	<operation>
+ReduceArgumentPS(src1[31:0], imm8[7:0])
+{
+	IF src1[31:0] == NAN
+		RETURN (convert src1[31:0] to QNaN)
+	FI
+	
+	m := imm8[7:4] // number of fraction bits after the binary point to be preserved
+	rc := imm8[1:0] // round control
+	rc_src := imm8[2] // round control source
+	spe := 0
+	tmp[31:0] := pow(2, -m)*ROUND(pow(2, m)*src1[31:0], spe, rc_src, rc)
+	tmp[31:0] := src1[31:0] - tmp[31:0]
+	RETURN tmp[31:0]
+}
+FOR j := 0 to 7
+	i := j*32
+	IF k[j]
+		dst[i+31:i] := ReduceArgumentPS(a[i+31:i], imm8[7:0])
+	ELSE
+		dst[i+31:i] := 0
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vreduceps"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256" name="_mm256_reduce_ps">
+	<type>Floating Point</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512DQ</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="a" type="__m256"/>
+	<parameter varname="imm8" type="int"/>
+	<description>Extract the reduced argument of packed single-precision (32-bit) floating-point elements in "a" by the number of bits specified by "imm8", and store the results in "dst".</description>
+	<operation>
+ReduceArgumentPS(src1[31:0], imm8[7:0])
+{
+	IF src1[31:0] == NAN
+		RETURN (convert src1[31:0] to QNaN)
+	FI
+	
+	m := imm8[7:4] // number of fraction bits after the binary point to be preserved
+	rc := imm8[1:0] // round control
+	rc_src := imm8[2] // round control source
+	spe := 0
+	tmp[31:0] := pow(2, -m)*ROUND(pow(2, m)*src1[31:0], spe, rc_src, rc)
+	tmp[31:0] := src1[31:0] - tmp[31:0]
+	RETURN tmp[31:0]
+}
+FOR j := 0 to 7
+	i := j*32
+	dst[i+31:i] := ReduceArgumentPS(a[i+31:i], imm8[7:0])
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vreduceps"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512" name="_mm512_mask_reduce_ps">
+	<type>Floating Point</type>
+	<CPUID>AVX512DQ</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="src" type="__m512"/>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="a" type="__m512"/>
+	<parameter varname="imm8" type="int"/>	
+	<description>Extract the reduced argument of packed single-precision (32-bit) floating-point elements in "a" by the number of bits specified by "imm8", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). </description>
+	<operation>
+ReduceArgumentPS(src1[31:0], imm8[7:0])
+{
+	IF src1[31:0] == NAN
+		RETURN (convert src1[31:0] to QNaN)
+	FI
+	
+	m := imm8[7:4] // number of fraction bits after the binary point to be preserved
+	rc := imm8[1:0] // round control
+	rc_src := imm8[2] // round control source
+	spe := 0
+	tmp[31:0] := pow(2, -m)*ROUND(pow(2, m)*src1[31:0], spe, rc_src, rc)
+	tmp[31:0] := src1[31:0] - tmp[31:0]
+	RETURN tmp[31:0]
+}
+FOR j := 0 to 15
+	i := j*32
+	IF k[j]
+		dst[i+31:i] := ReduceArgumentPS(a[i+31:i], imm8[7:0])
+	ELSE
+		dst[i+31:i] := src[i+31:i]
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name="vreduceps"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512" name="_mm512_mask_reduce_round_ps">
+	<type>Floating Point</type>
+	<CPUID>AVX512DQ</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="src" type="__m512"/>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="a" type="__m512"/>
+	<parameter varname="imm8" type="int"/>
+	<parameter varname="rounding" type="int"/>
+	<description>Extract the reduced argument of packed single-precision (32-bit) floating-point elements in "a" by the number of bits specified by "imm8", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).
+	[round_note]</description>
+	<operation>
+ReduceArgumentPS(src1[31:0], imm8[7:0])
+{
+	IF src1[31:0] == NAN
+		RETURN (convert src1[31:0] to QNaN)
+	FI
+	
+	m := imm8[7:4] // number of fraction bits after the binary point to be preserved
+	rc := imm8[1:0] // round control
+	rc_src := imm8[2] // round control source
+	spe := 0
+	tmp[31:0] := pow(2, -m)*ROUND(pow(2, m)*src1[31:0], spe, rc_src, rc)
+	tmp[31:0] := src1[31:0] - tmp[31:0]
+	RETURN tmp[31:0]
+}
+FOR j := 0 to 15
+	i := j*32
+	IF k[j]
+		dst[i+31:i] := ReduceArgumentPS(a[i+31:i], imm8[7:0])
+	ELSE
+		dst[i+31:i] := src[i+31:i]
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name="vreduceps"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512" name="_mm512_maskz_reduce_ps">
+	<type>Floating Point</type>
+	<CPUID>AVX512DQ</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="a" type="__m512"/>
+	<parameter varname="imm8" type="int"/>
+	<description>Extract the reduced argument of packed single-precision (32-bit) floating-point elements in "a" by the number of bits specified by "imm8", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+	<operation>
+ReduceArgumentPS(src1[31:0], imm8[7:0])
+{
+	IF src1[31:0] == NAN
+		RETURN (convert src1[31:0] to QNaN)
+	FI
+	
+	m := imm8[7:4] // number of fraction bits after the binary point to be preserved
+	rc := imm8[1:0] // round control
+	rc_src := imm8[2] // round control source
+	spe := 0
+	tmp[31:0] := pow(2, -m)*ROUND(pow(2, m)*src1[31:0], spe, rc_src, rc)
+	tmp[31:0] := src1[31:0] - tmp[31:0]
+	RETURN tmp[31:0]
+}
+FOR j := 0 to 15
+	i := j*32
+	IF k[j]
+		dst[i+31:i] := ReduceArgumentPS(a[i+31:i], imm8[7:0])
+	ELSE
+		dst[i+31:i] := 0
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name="vreduceps"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512" name="_mm512_maskz_reduce_round_ps">
+	<type>Floating Point</type>
+	<CPUID>AVX512DQ</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="a" type="__m512"/>
+	<parameter varname="imm8" type="int"/>
+	<parameter varname="rounding" type="int"/>
+	<description>Extract the reduced argument of packed single-precision (32-bit) floating-point elements in "a" by the number of bits specified by "imm8", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).
+	[round_note]</description>
+	<operation>
+ReduceArgumentPS(src1[31:0], imm8[7:0])
+{
+	IF src1[31:0] == NAN
+		RETURN (convert src1[31:0] to QNaN)
+	FI
+	
+	m := imm8[7:4] // number of fraction bits after the binary point to be preserved
+	rc := imm8[1:0] // round control
+	rc_src := imm8[2] // round control source
+	spe := 0
+	tmp[31:0] := pow(2, -m)*ROUND(pow(2, m)*src1[31:0], spe, rc_src, rc)
+	tmp[31:0] := src1[31:0] - tmp[31:0]
+	RETURN tmp[31:0]
+}
+FOR j := 0 to 15
+	i := j*32
+	IF k[j]
+		dst[i+31:i] := ReduceArgumentPS(a[i+31:i], imm8[7:0])
+	ELSE
+		dst[i+31:i] := 0
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name="vreduceps"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512" name="_mm512_reduce_ps">
+	<type>Floating Point</type>
+	<CPUID>AVX512DQ</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="a" type="__m512"/>
+	<parameter varname="imm8" type="int"/>
+	<description>Extract the reduced argument of packed single-precision (32-bit) floating-point elements in "a" by the number of bits specified by "imm8", and store the results in "dst".</description>
+	<operation>
+ReduceArgumentPS(src1[31:0], imm8[7:0])
+{
+	IF src1[31:0] == NAN
+		RETURN (convert src1[31:0] to QNaN)
+	FI
+	
+	m := imm8[7:4] // number of fraction bits after the binary point to be preserved
+	rc := imm8[1:0] // round control
+	rc_src := imm8[2] // round control source
+	spe := 0
+	tmp[31:0] := pow(2, -m)*ROUND(pow(2, m)*src1[31:0], spe, rc_src, rc)
+	tmp[31:0] := src1[31:0] - tmp[31:0]
+	RETURN tmp[31:0]
+}
+FOR j := 0 to 15
+	i := j*32
+	dst[i+31:i] := ReduceArgumentPS(a[i+31:i], imm8[7:0])
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name="vreduceps"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512" name="_mm512_reduce_round_ps">
+	<type>Floating Point</type>
+	<CPUID>AVX512DQ</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="a" type="__m512"/>
+	<parameter varname="imm8" type="int"/>
+	<parameter varname="rounding" type="int"/>
+	<description>Extract the reduced argument of packed single-precision (32-bit) floating-point elements in "a" by the number of bits specified by "imm8", and store the results in "dst".
+	[round_note]</description>
+	<operation>
+ReduceArgumentPS(src1[31:0], imm8[7:0])
+{
+	IF src1[31:0] == NAN
+		RETURN (convert src1[31:0] to QNaN)
+	FI
+	
+	m := imm8[7:4] // number of fraction bits after the binary point to be preserved
+	rc := imm8[1:0] // round control
+	rc_src := imm8[2] // round control source
+	spe := 0
+	tmp[31:0] := pow(2, -m)*ROUND(pow(2, m)*src1[31:0], spe, rc_src, rc)
+	tmp[31:0] := src1[31:0] - tmp[31:0]
+	RETURN tmp[31:0]
+}
+FOR j := 0 to 15
+	i := j*32
+	dst[i+31:i] := ReduceArgumentPS(a[i+31:i], imm8[7:0])
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name="vreduceps"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128" name="_mm_mask_reduce_ps">
+	<type>Floating Point</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512DQ</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="src" type="__m128"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128"/>
+	<parameter varname="imm8" type="int"/>
+	<description>Extract the reduced argument of packed single-precision (32-bit) floating-point elements in "a" by the number of bits specified by "imm8", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). </description>
+	<operation>
+ReduceArgumentPS(src1[31:0], imm8[7:0])
+{
+	IF src1[31:0] == NAN
+		RETURN (convert src1[31:0] to QNaN)
+	FI
+	
+	m := imm8[7:4] // number of fraction bits after the binary point to be preserved
+	rc := imm8[1:0] // round control
+	rc_src := imm8[2] // round control source
+	spe := 0
+	tmp[31:0] := pow(2, -m)*ROUND(pow(2, m)*src1[31:0], spe, rc_src, rc)
+	tmp[31:0] := src1[31:0] - tmp[31:0]
+	RETURN tmp[31:0]
+}
+FOR j := 0 to 3
+	i := j*32
+	IF k[j]
+		dst[i+31:i] := ReduceArgumentPS(a[i+31:i], imm8[7:0])
+	ELSE
+		dst[i+31:i] := src[i+31:i]
+	FI
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vreduceps"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128" name="_mm_maskz_reduce_ps">
+	<type>Floating Point</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512DQ</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128"/>
+	<parameter varname="imm8" type="int"/>
+	<description>Extract the reduced argument of packed single-precision (32-bit) floating-point elements in "a" by the number of bits specified by "imm8", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+	<operation>
+ReduceArgumentPS(src1[31:0], imm8[7:0])
+{
+	IF src1[31:0] == NAN
+		RETURN (convert src1[31:0] to QNaN)
+	FI
+	
+	m := imm8[7:4] // number of fraction bits after the binary point to be preserved
+	rc := imm8[1:0] // round control
+	rc_src := imm8[2] // round control source
+	spe := 0
+	tmp[31:0] := pow(2, -m)*ROUND(pow(2, m)*src1[31:0], spe, rc_src, rc)
+	tmp[31:0] := src1[31:0] - tmp[31:0]
+	RETURN tmp[31:0]
+}
+FOR j := 0 to 3
+	i := j*32
+	IF k[j]
+		dst[i+31:i] := ReduceArgumentPS(a[i+31:i], imm8[7:0])
+	ELSE
+		dst[i+31:i] := 0
+	FI
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vreduceps"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128" name="_mm_reduce_ps">
+	<type>Floating Point</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512DQ</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="a" type="__m128"/>
+	<parameter varname="imm8" type="int"/>
+	<description>Extract the reduced argument of packed single-precision (32-bit) floating-point elements in "a" by the number of bits specified by "imm8", and store the results in "dst".</description>
+	<operation>
+ReduceArgumentPS(src1[31:0], imm8[7:0])
+{
+	IF src1[31:0] == NAN
+		RETURN (convert src1[31:0] to QNaN)
+	FI
+	
+	m := imm8[7:4] // number of fraction bits after the binary point to be preserved
+	rc := imm8[1:0] // round control
+	rc_src := imm8[2] // round control source
+	spe := 0
+	tmp[31:0] := pow(2, -m)*ROUND(pow(2, m)*src1[31:0], spe, rc_src, rc)
+	tmp[31:0] := src1[31:0] - tmp[31:0]
+	RETURN tmp[31:0]
+}
+FOR j := 0 to 3
+	i := j*32
+	dst[i+31:i] := ReduceArgumentPS(a[i+31:i], imm8[7:0])
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vreduceps"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128d" name="_mm_mask_reduce_sd">
+	<type>Floating Point</type>
+	<CPUID>AVX512DQ</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="src" type="__m128d"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128d"/>
+	<parameter varname="b" type="__m128d"/>
+	<parameter varname="imm8" type="int"/>
+	<description>Extract the reduced argument of the lower double-precision (64-bit) floating-point element in "a" by the number of bits specified by "imm8", store the result in the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper element from "b" to the upper element of "dst".</description>
+	<operation>
+ReduceArgumentPD(src1[63:0], imm8[7:0])
+{
+	m := imm8[7:4] // number of fraction bits after the binary point to be preserved
+	rc := imm8[1:0] // round control
+	rc_src := imm8[2] // round control source
+	spe := 0
+	tmp[63:0] := pow(2, -m) * ROUND(pow(2, m) * src1[63:0], spe, rc_src, rc)
+	tmp[63:0] := src1[63:0] - tmp[63:0]
+	RETURN tmp[63:0]
+}
+
+IF k[0]
+	dst[63:0] := ReduceArgumentPD(a[63:0], imm8[7:0])
+ELSE
+	dst[63:0] := src[63:0]
+FI
+dst[127:64] := b[127:64]
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vreducesd"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128d" name="_mm_mask_reduce_round_sd">
+	<type>Floating Point</type>
+	<CPUID>AVX512DQ</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="src" type="__m128d"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128d"/>
+	<parameter varname="b" type="__m128d"/>
+	<parameter varname="imm8" type="int"/>
+	<parameter varname="rounding" type="int"/>
+	<description>Extract the reduced argument of the lower double-precision (64-bit) floating-point element in "a" by the number of bits specified by "imm8", store the result in the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper element from "b" to the upper element of "dst".
+	[round_note]</description>
+	<operation>
+ReduceArgumentPD(src1[63:0], imm8[7:0])
+{
+	m := imm8[7:4] // number of fraction bits after the binary point to be preserved
+	rc := imm8[1:0] // round control
+	rc_src := imm8[2] // round control source
+	spe := 0
+	tmp[63:0] := pow(2, -m) * ROUND(pow(2, m) * src1[63:0], spe, rc_src, rc)
+	tmp[63:0] := src1[63:0] - tmp[63:0]
+	RETURN tmp[63:0]
+}
+
+IF k[0]
+	dst[63:0] := ReduceArgumentPD(a[63:0], imm8[7:0])
+ELSE
+	dst[63:0] := src[63:0]
+FI
+dst[127:64] := b[127:64]
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vreducesd"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128d" name="_mm_maskz_reduce_sd">
+	<type>Floating Point</type>
+	<CPUID>AVX512DQ</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128d"/>
+	<parameter varname="b" type="__m128d"/>
+	<parameter varname="imm8" type="int"/>
+	<description>Extract the reduced argument of the lower double-precision (64-bit) floating-point element in "a" by the number of bits specified by "imm8", store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper element from "b" to the upper element of "dst".</description>
+	<operation>
+ReduceArgumentPD(src1[63:0], imm8[7:0])
+{
+	m := imm8[7:4] // number of fraction bits after the binary point to be preserved
+	rc := imm8[1:0] // round control
+	rc_src := imm8[2] // round control source
+	spe := 0
+	tmp[63:0] := pow(2, -m) * ROUND(pow(2, m) * src1[63:0], spe, rc_src, rc)
+	tmp[63:0] := src1[63:0] - tmp[63:0]
+	RETURN tmp[63:0]
+}
+
+IF k[0]
+	dst[63:0] := ReduceArgumentPD(a[63:0], imm8[7:0])
+ELSE
+	dst[63:0] := 0
+FI
+dst[127:64] := b[127:64]
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vreducesd"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128d" name="_mm_maskz_reduce_round_sd">
+	<type>Floating Point</type>
+	<CPUID>AVX512DQ</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128d"/>
+	<parameter varname="b" type="__m128d"/>
+	<parameter varname="imm8" type="int"/>
+	<parameter varname="rounding" type="int"/>
+	<description>Extract the reduced argument of the lower double-precision (64-bit) floating-point element in "a" by the number of bits specified by "imm8", store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper element from "b" to the upper element of "dst".
+	[round_note]</description>
+	<operation>
+ReduceArgumentPD(src1[63:0], imm8[7:0])
+{
+	m := imm8[7:4] // number of fraction bits after the binary point to be preserved
+	rc := imm8[1:0] // round control
+	rc_src := imm8[2] // round control source
+	spe := 0
+	tmp[63:0] := pow(2, -m) * ROUND(pow(2, m) * src1[63:0], spe, rc_src, rc)
+	tmp[63:0] := src1[63:0] - tmp[63:0]
+	RETURN tmp[63:0]
+}
+
+IF k[0]
+	dst[63:0] := ReduceArgumentPD(a[63:0], imm8[7:0])
+ELSE
+	dst[63:0] := 0
+FI
+dst[127:64] := b[127:64]
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vreducesd"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128d" name="_mm_reduce_sd">
+	<type>Floating Point</type>
+	<CPUID>AVX512DQ</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="a" type="__m128d"/>
+	<parameter varname="b" type="__m128d"/>
+	<parameter varname="imm8" type="int"/>
+	<description>Extract the reduced argument of the lower double-precision (64-bit) floating-point element in "a" by the number of bits specified by "imm8", store the result in the lower element of "dst", and copy the upper element from "b" to the upper element of "dst".</description>
+	<operation>
+ReduceArgumentPD(src1[63:0], imm8[7:0])
+{
+	m := imm8[7:4] // number of fraction bits after the binary point to be preserved
+	rc := imm8[1:0] // round control
+	rc_src := imm8[2] // round control source
+	spe := 0
+	tmp[63:0] := pow(2, -m) * ROUND(pow(2, m) * src1[63:0], spe, rc_src, rc)
+	tmp[63:0] := src1[63:0] - tmp[63:0]
+	RETURN tmp[63:0]
+}
+
+dst[63:0] := ReduceArgumentPD(a[63:0], imm8[7:0])
+dst[127:64] := b[127:64]
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vreducesd"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128d" name="_mm_reduce_round_sd">
+	<type>Floating Point</type>
+	<CPUID>AVX512DQ</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="a" type="__m128d"/>
+	<parameter varname="b" type="__m128d"/>
+	<parameter varname="imm8" type="int"/>
+	<parameter varname="rounding" type="int"/>
+	<description>Extract the reduced argument of the lower double-precision (64-bit) floating-point element in "a" by the number of bits specified by "imm8", store the result in the lower element of "dst", and copy the upper element from "b" to the upper element of "dst".
+	[round_note]</description>
+	<operation>
+ReduceArgumentPD(src1[63:0], imm8[7:0])
+{
+	m := imm8[7:4] // number of fraction bits after the binary point to be preserved
+	rc := imm8[1:0] // round control
+	rc_src := imm8[2] // round control source
+	spe := 0
+	tmp[63:0] := pow(2, -m) * ROUND(pow(2, m) * src1[63:0], spe, rc_src, rc)
+	tmp[63:0] := src1[63:0] - tmp[63:0]
+	RETURN tmp[63:0]
+}
+
+dst[63:0] := ReduceArgumentPD(a[63:0], imm8[7:0])
+dst[127:64] := b[127:64]
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vreducesd"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128" name="_mm_mask_reduce_ss">
+	<type>Floating Point</type>
+	<CPUID>AVX512DQ</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="src" type="__m128"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128"/>
+	<parameter varname="b" type="__m128"/>
+	<parameter varname="imm8" type="int"/>
+	<description>Extract the reduced argument of the lower single-precision (32-bit) floating-point element in "a" by the number of bits specified by "imm8", store the result in the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper 3 packed elements from "b" to the upper elements of "dst".</description>
+	<operation>
+ReduceArgumentPS(src1[31:0], imm8[7:0])
+{
+	IF src1[31:0] == NAN
+		RETURN (convert src1[31:0] to QNaN)
+	FI
+	
+	m := imm8[7:4] // number of fraction bits after the binary point to be preserved
+	rc := imm8[1:0] // round control
+	rc_src := imm8[2] // round control source
+	spe := 0
+	tmp[31:0] := pow(2, -m)*ROUND(pow(2, m)*src1[31:0], spe, rc_src, rc)
+	tmp[31:0] := src1[31:0] - tmp[31:0]
+	RETURN tmp[31:0]
+}
+
+IF k[0]
+	dst[31:0] := ReduceArgumentPS(a[31:0], imm8[7:0])
+ELSE
+	dst[31:0] := src[31:0]
+FI
+dst[127:32] := b[127:32]
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vreducess"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128" name="_mm_mask_reduce_round_ss">
+	<type>Floating Point</type>
+	<CPUID>AVX512DQ</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="src" type="__m128"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128"/>
+	<parameter varname="b" type="__m128"/>
+	<parameter varname="imm8" type="int"/>
+	<parameter varname="rounding" type="int"/>
+	<description>Extract the reduced argument of the lower single-precision (32-bit) floating-point element in "a" by the number of bits specified by "imm8", store the result in the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper 3 packed elements from "b" to the upper elements of "dst".
+	[round_note]</description>
+	<operation>
+ReduceArgumentPS(src1[31:0], imm8[7:0])
+{
+	IF src1[31:0] == NAN
+		RETURN (convert src1[31:0] to QNaN)
+	FI
+	
+	m := imm8[7:4] // number of fraction bits after the binary point to be preserved
+	rc := imm8[1:0] // round control
+	rc_src := imm8[2] // round control source
+	spe := 0
+	tmp[31:0] := pow(2, -m)*ROUND(pow(2, m)*src1[31:0], spe, rc_src, rc)
+	tmp[31:0] := src1[31:0] - tmp[31:0]
+	RETURN tmp[31:0]
+}
+
+IF k[0]
+	dst[31:0] := ReduceArgumentPS(a[31:0], imm8[7:0])
+ELSE
+	dst[31:0] := src[31:0]
+FI
+dst[127:32] := b[127:32]
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vreducess"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128" name="_mm_maskz_reduce_ss">
+	<type>Floating Point</type>
+	<CPUID>AVX512DQ</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128"/>
+	<parameter varname="b" type="__m128"/>
+	<parameter varname="imm8" type="int"/>
+	<description>Extract the reduced argument of the lower single-precision (32-bit) floating-point element in "a" by the number of bits specified by "imm8", store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from "b" to the upper elements of "dst".</description>
+	<operation>
+ReduceArgumentPS(src1[31:0], imm8[7:0])
+{
+	IF src1[31:0] == NAN
+		RETURN (convert src1[31:0] to QNaN)
+	FI
+	
+	m := imm8[7:4] // number of fraction bits after the binary point to be preserved
+	rc := imm8[1:0] // round control
+	rc_src := imm8[2] // round control source
+	spe := 0
+	tmp[31:0] := pow(2, -m)*ROUND(pow(2, m)*src1[31:0], spe, rc_src, rc)
+	tmp[31:0] := src1[31:0] - tmp[31:0]
+	RETURN tmp[31:0]
+}
+
+IF k[0]
+	dst[31:0] := ReduceArgumentPS(a[31:0], imm8[7:0])
+ELSE
+	dst[31:0] := 0
+FI
+dst[127:32] := b[127:32]
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vreducess"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128" name="_mm_maskz_reduce_round_ss">
+	<type>Floating Point</type>
+	<CPUID>AVX512DQ</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128"/>
+	<parameter varname="b" type="__m128"/>
+	<parameter varname="imm8" type="int"/>
+	<parameter varname="rounding" type="int"/>
+	<description>Extract the reduced argument of the lower single-precision (32-bit) floating-point element in "a" by the number of bits specified by "imm8", store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from "b" to the upper elements of "dst".
+	[round_note]</description>
+	<operation>
+ReduceArgumentPS(src1[31:0], imm8[7:0])
+{
+	IF src1[31:0] == NAN
+		RETURN (convert src1[31:0] to QNaN)
+	FI
+	
+	m := imm8[7:4] // number of fraction bits after the binary point to be preserved
+	rc := imm8[1:0] // round control
+	rc_src := imm8[2] // round control source
+	spe := 0
+	tmp[31:0] := pow(2, -m)*ROUND(pow(2, m)*src1[31:0], spe, rc_src, rc)
+	tmp[31:0] := src1[31:0] - tmp[31:0]
+	RETURN tmp[31:0]
+}
+
+IF k[0]
+	dst[31:0] := ReduceArgumentPS(a[31:0], imm8[7:0])
+ELSE
+	dst[31:0] := 0
+FI
+dst[127:32] := b[127:32]
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vreducess"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128" name="_mm_reduce_ss">
+	<type>Floating Point</type>
+	<CPUID>AVX512DQ</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="a" type="__m128"/>
+	<parameter varname="b" type="__m128"/>
+	<parameter varname="imm8" type="int"/>
+	<description>Extract the reduced argument of the lower single-precision (32-bit) floating-point element in "a" by the number of bits specified by "imm8", store the result in the lower element of "dst", and copy the upper 3 packed elements from "b" to the upper elements of "dst".</description>
+	<operation>
+ReduceArgumentPS(src1[31:0], imm8[7:0])
+{
+	IF src1[31:0] == NAN
+		RETURN (convert src1[31:0] to QNaN)
+	FI
+	
+	m := imm8[7:4] // number of fraction bits after the binary point to be preserved
+	rc := imm8[1:0] // round control
+	rc_src := imm8[2] // round control source
+	spe := 0
+	tmp[31:0] := pow(2, -m)*ROUND(pow(2, m)*src1[31:0], spe, rc_src, rc)
+	tmp[31:0] := src1[31:0] - tmp[31:0]
+	RETURN tmp[31:0]
+}
+
+dst[31:0] := ReduceArgumentPS(a[31:0], imm8[7:0])
+dst[127:32] := b[127:32]
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vreducess"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128" name="_mm_reduce_round_ss">
+	<type>Floating Point</type>
+	<CPUID>AVX512DQ</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="a" type="__m128"/>
+	<parameter varname="b" type="__m128"/>
+	<parameter varname="imm8" type="int"/>
+	<parameter varname="rounding" type="int"/>
+	<description>Extract the reduced argument of the lower single-precision (32-bit) floating-point element in "a" by the number of bits specified by "imm8", store the result in the lower element of "dst", and copy the upper 3 packed elements from "b" to the upper elements of "dst".
+	[round_note]</description>
+	<operation>
+ReduceArgumentPS(src1[31:0], imm8[7:0])
+{
+	IF src1[31:0] == NAN
+		RETURN (convert src1[31:0] to QNaN)
+	FI
+	
+	m := imm8[7:4] // number of fraction bits after the binary point to be preserved
+	rc := imm8[1:0] // round control
+	rc_src := imm8[2] // round control source
+	spe := 0
+	tmp[31:0] := pow(2, -m)*ROUND(pow(2, m)*src1[31:0], spe, rc_src, rc)
+	tmp[31:0] := src1[31:0] - tmp[31:0]
+	RETURN tmp[31:0]
+}
+
+dst[31:0] := ReduceArgumentPS(a[31:0], imm8[7:0])
+dst[127:32] := b[127:32]
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vreducess"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256d" name="_mm256_mask_roundscale_pd">
+	<type>Floating Point</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="src" type="__m256d"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m256d"/>
+	<parameter varname="imm8" type="int"/>
+	<description>Round packed double-precision (64-bit) floating-point elements in "a" to the number of fraction bits specified by "imm8", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). </description>
+	<operation>
+RoundTo_IntegerPD(src[63:0], imm8[7:0]){
+	IF(imm8[2] == 1)
+		rounding_direction := MXCSR.RC //Use the rounding mode specified by MXCSR.RC
+	ELSE
+		rounding_direction := imm8[1:0] //Use the rounding mode specified by imm8[1:0]
+	FI
+	
+	M := imm8[7:4] // The scaling factor (number of fraction bits to round to)
+	
+	CASE(rounding_direction)
+	0: tmp[63:0] := round_to_nearest_even_integer(2^M * src[63:0])
+	1: tmp[63:0] := round_to_equal_or_smaller_integer(2^M * src[63:0])
+	2: tmp[63:0] := round_to_equal_or_larger_integer(2^M * src[63:0])
+	3: tmp[63:0] := round_to_nearest_smallest_magnitude_integer(2^M * src[63:0])
+	ESAC
+	
+	dst[63:0] := 2^-M * tmp[63:0] // scale back down
+	
+	IF imm8[3] == 0 //check SPE
+		IF src[63:0] != dst[63:0] //check if precision has been lost
+			set_precision() //set #PE
+		FI
+	FI
+	RETURN dst[63:0]
+}	
+
+FOR j := 0 to 3
+	i := j*64
+	IF k[j]
+		dst[i+63:i] := RoundTo_IntegerPD(a[i+63:i], imm8[7:0])
+	ELSE
+		dst[i+63:i] := src[i+63:i]
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vrndscalepd"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256d" name="_mm256_maskz_roundscale_pd">
+	<type>Floating Point</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m256d"/>
+	<parameter varname="imm8" type="int"/>
+	<description>Round packed double-precision (64-bit) floating-point elements in "a" to the number of fraction bits specified by "imm8", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+	<operation>
+RoundTo_IntegerPD(src[63:0], imm8[7:0]){
+	IF(imm8[2] == 1)
+		rounding_direction := MXCSR.RC //Use the rounding mode specified by MXCSR.RC
+	ELSE
+		rounding_direction := imm8[1:0] //Use the rounding mode specified by imm8[1:0]
+	FI
+	
+	M := imm8[7:4] // The scaling factor (number of fraction bits to round to)
+	
+	CASE(rounding_direction)
+	0: tmp[63:0] := round_to_nearest_even_integer(2^M * src[63:0])
+	1: tmp[63:0] := round_to_equal_or_smaller_integer(2^M * src[63:0])
+	2: tmp[63:0] := round_to_equal_or_larger_integer(2^M * src[63:0])
+	3: tmp[63:0] := round_to_nearest_smallest_magnitude_integer(2^M * src[63:0])
+	ESAC
+	
+	dst[63:0] := 2^-M * tmp[63:0] // scale back down
+	
+	IF imm8[3] == 0 //check SPE
+		IF src[63:0] != dst[63:0] //check if precision has been lost
+			set_precision() //set #PE
+		FI
+	FI
+	RETURN dst[63:0]
+}	
+
+FOR j := 0 to 3
+	i := j*64
+	IF k[j]
+		dst[i+63:i] := RoundTo_IntegerPD(a[i+63:i], imm8[7:0])
+	ELSE
+		dst[i+63:i] := 0
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vrndscalepd"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256d" name="_mm256_roundscale_pd">
+	<type>Floating Point</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="a" type="__m256d"/>
+	<parameter varname="imm8" type="int"/>
+	<description>Round packed double-precision (64-bit) floating-point elements in "a" to the number of fraction bits specified by "imm8", and store the results in "dst".
+	</description>
+	<operation>
+RoundTo_IntegerPD(src[63:0], imm8[7:0]){
+	IF(imm8[2] == 1)
+		rounding_direction := MXCSR.RC //Use the rounding mode specified by MXCSR.RC
+	ELSE
+		rounding_direction := imm8[1:0] //Use the rounding mode specified by imm8[1:0]
+	FI
+	
+	M := imm8[7:4] // The scaling factor (number of fraction bits to round to)
+	
+	CASE(rounding_direction)
+	0: tmp[63:0] := round_to_nearest_even_integer(2^M * src[63:0])
+	1: tmp[63:0] := round_to_equal_or_smaller_integer(2^M * src[63:0])
+	2: tmp[63:0] := round_to_equal_or_larger_integer(2^M * src[63:0])
+	3: tmp[63:0] := round_to_nearest_smallest_magnitude_integer(2^M * src[63:0])
+	ESAC
+	
+	dst[63:0] := 2^-M * tmp[63:0] // scale back down
+	
+	IF imm8[3] == 0 //check SPE
+		IF src[63:0] != dst[63:0] //check if precision has been lost
+			set_precision() //set #PE
+		FI
+	FI
+	RETURN dst[63:0]
+}	
+
+FOR j := 0 to 3
+	i := j*64
+	dst[i+63:i] := RoundTo_IntegerPD(a[i+63:i], imm8[7:0])
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vrndscalepd"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128d" name="_mm_mask_roundscale_pd">
+	<type>Floating Point</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="src" type="__m128d"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128d"/>
+	<parameter varname="imm8" type="int"/>
+	<description>Round packed double-precision (64-bit) floating-point elements in "a" to the number of fraction bits specified by "imm8", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). </description>
+	<operation>
+RoundTo_IntegerPD(src[63:0], imm8[7:0]){
+	IF(imm8[2] == 1)
+		rounding_direction := MXCSR.RC //Use the rounding mode specified by MXCSR.RC
+	ELSE
+		rounding_direction := imm8[1:0] //Use the rounding mode specified by imm8[1:0]
+	FI
+	
+	M := imm8[7:4] // The scaling factor (number of fraction bits to round to)
+	
+	CASE(rounding_direction)
+	0: tmp[63:0] := round_to_nearest_even_integer(2^M * src[63:0])
+	1: tmp[63:0] := round_to_equal_or_smaller_integer(2^M * src[63:0])
+	2: tmp[63:0] := round_to_equal_or_larger_integer(2^M * src[63:0])
+	3: tmp[63:0] := round_to_nearest_smallest_magnitude_integer(2^M * src[63:0])
+	ESAC
+	
+	dst[63:0] := 2^-M * tmp[63:0] // scale back down
+	
+	IF imm8[3] == 0 //check SPE
+		IF src[63:0] != dst[63:0] //check if precision has been lost
+			set_precision() //set #PE
+		FI
+	FI
+	RETURN dst[63:0]
+}	
+
+FOR j := 0 to 1
+	i := j*64
+	IF k[j]
+		dst[i+63:i] := RoundTo_IntegerPD(a[i+63:i], imm8[7:0])
+	ELSE
+		dst[i+63:i] := src[i+63:i]
+	FI
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vrndscalepd"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128d" name="_mm_maskz_roundscale_pd">
+	<type>Floating Point</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128d"/>
+	<parameter varname="imm8" type="int"/>
+	<description>Round packed double-precision (64-bit) floating-point elements in "a" to the number of fraction bits specified by "imm8", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+	<operation>
+RoundTo_IntegerPD(src[63:0], imm8[7:0]){
+	IF(imm8[2] == 1)
+		rounding_direction := MXCSR.RC //Use the rounding mode specified by MXCSR.RC
+	ELSE
+		rounding_direction := imm8[1:0] //Use the rounding mode specified by imm8[1:0]
+	FI
+	
+	M := imm8[7:4] // The scaling factor (number of fraction bits to round to)
+	
+	CASE(rounding_direction)
+	0: tmp[63:0] := round_to_nearest_even_integer(2^M * src[63:0])
+	1: tmp[63:0] := round_to_equal_or_smaller_integer(2^M * src[63:0])
+	2: tmp[63:0] := round_to_equal_or_larger_integer(2^M * src[63:0])
+	3: tmp[63:0] := round_to_nearest_smallest_magnitude_integer(2^M * src[63:0])
+	ESAC
+	
+	dst[63:0] := 2^-M * tmp[63:0] // scale back down
+	
+	IF imm8[3] == 0 //check SPE
+		IF src[63:0] != dst[63:0] //check if precision has been lost
+			set_precision() //set #PE
+		FI
+	FI
+	RETURN dst[63:0]
+}	
+
+FOR j := 0 to 1
+	i := j*64
+	IF k[j]
+		dst[i+63:i] := RoundTo_IntegerPD(a[i+63:i], imm8[7:0])
+	ELSE
+		dst[i+63:i] := 0
+	FI
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vrndscalepd"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128d" name="_mm_roundscale_pd">
+	<type>Floating Point</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="a" type="__m128d"/>
+	<parameter varname="imm8" type="int"/>
+	<description>Round packed double-precision (64-bit) floating-point elements in "a" to the number of fraction bits specified by "imm8", and store the results in "dst".
+	</description>
+	<operation>
+RoundTo_IntegerPD(src[63:0], imm8[7:0]){
+	IF(imm8[2] == 1)
+		rounding_direction := MXCSR.RC //Use the rounding mode specified by MXCSR.RC
+	ELSE
+		rounding_direction := imm8[1:0] //Use the rounding mode specified by imm8[1:0]
+	FI
+	
+	M := imm8[7:4] // The scaling factor (number of fraction bits to round to)
+	
+	CASE(rounding_direction)
+	0: tmp[63:0] := round_to_nearest_even_integer(2^M * src[63:0])
+	1: tmp[63:0] := round_to_equal_or_smaller_integer(2^M * src[63:0])
+	2: tmp[63:0] := round_to_equal_or_larger_integer(2^M * src[63:0])
+	3: tmp[63:0] := round_to_nearest_smallest_magnitude_integer(2^M * src[63:0])
+	ESAC
+	
+	dst[63:0] := 2^-M * tmp[63:0] // scale back down
+	
+	IF imm8[3] == 0 //check SPE
+		IF src[63:0] != dst[63:0] //check if precision has been lost
+			set_precision() //set #PE
+		FI
+	FI
+	RETURN dst[63:0]
+}	
+
+FOR j := 0 to 1
+	i := j*64
+	dst[i+63:i] := RoundTo_IntegerPD(a[i+63:i], imm8[7:0])
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vrndscalepd"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256" name="_mm256_mask_roundscale_ps">
+	<type>Floating Point</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="src" type="__m256"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m256"/>
+	<parameter varname="imm8" type="int"/>
+	<description>Round packed single-precision (32-bit) floating-point elements in "a" to the number of fraction bits specified by "imm8", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). </description>
+	<operation>
+RoundTo_IntegerPS(src[31:0], imm8[7:0]){
+	IF(imm8[2] == 1)
+		rounding_direction := MXCSR.RC //Use the rounding mode specified by MXCSR.RC
+	ELSE
+		rounding_direction := imm8[1:0] //Use the rounding mode specified by imm8[1:0]
+	FI
+	
+	M := imm8[7:4] // The scaling factor (number of fraction bits to round to)
+	
+	CASE(rounding_direction)
+	0: tmp[31:0] := round_to_nearest_even_integer(2^M * src[31:0])
+	1: tmp[31:0] := round_to_equal_or_smaller_integer(2^M * src[31:0])
+	2: tmp[31:0] := round_to_equal_or_larger_integer(2^M * src[31:0])
+	3: tmp[31:0] := round_to_nearest_smallest_magnitude_integer(2^M * src[31:0])
+	ESAC
+	
+	dst[31:0] := 2^-M * tmp[31:0] // scale back down
+	
+	IF imm8[3] == 0 //check SPE
+		IF src[31:0] != dst[31:0] //check if precision has been lost
+			set_precision() //set #PE
+		FI
+	FI
+	RETURN dst[31:0]
+}	
+
+FOR j := 0 to 7
+	i := j*32
+	IF k[j]
+		dst[i+31:i] := RoundTo_IntegerPS(a[i+31:i], imm8[7:0])
+	ELSE
+		dst[i+31:i] := src[i+31:i]
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vrndscaleps"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256" name="_mm256_maskz_roundscale_ps">
+	<type>Floating Point</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m256"/>
+	<parameter varname="imm8" type="int"/>
+	<description>Round packed single-precision (32-bit) floating-point elements in "a" to the number of fraction bits specified by "imm8", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+	<operation>
+RoundTo_IntegerPS(src[31:0], imm8[7:0]){
+	IF(imm8[2] == 1)
+		rounding_direction := MXCSR.RC //Use the rounding mode specified by MXCSR.RC
+	ELSE
+		rounding_direction := imm8[1:0] //Use the rounding mode specified by imm8[1:0]
+	FI
+	
+	M := imm8[7:4] // The scaling factor (number of fraction bits to round to)
+	
+	CASE(rounding_direction)
+	0: tmp[31:0] := round_to_nearest_even_integer(2^M * src[31:0])
+	1: tmp[31:0] := round_to_equal_or_smaller_integer(2^M * src[31:0])
+	2: tmp[31:0] := round_to_equal_or_larger_integer(2^M * src[31:0])
+	3: tmp[31:0] := round_to_nearest_smallest_magnitude_integer(2^M * src[31:0])
+	ESAC
+	
+	dst[31:0] := 2^-M * tmp[31:0] // scale back down
+	
+	IF imm8[3] == 0 //check SPE
+		IF src[31:0] != dst[31:0] //check if precision has been lost
+			set_precision() //set #PE
+		FI
+	FI
+	RETURN dst[31:0]
+}	
+
+FOR j := 0 to 7
+	i := j*32
+	IF k[j]
+		dst[i+31:i] := RoundTo_IntegerPS(a[i+31:i], imm8[7:0])
+	ELSE
+		dst[i+31:i] := 0
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vrndscaleps"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256" name="_mm256_roundscale_ps">
+	<type>Floating Point</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="a" type="__m256"/>
+	<parameter varname="imm8" type="int"/>
+	<description>Round packed single-precision (32-bit) floating-point elements in "a" to the number of fraction bits specified by "imm8", and store the results in "dst".
+	</description>
+	<operation>
+RoundTo_IntegerPS(src[31:0], imm8[7:0]){
+	IF(imm8[2] == 1)
+		rounding_direction := MXCSR.RC //Use the rounding mode specified by MXCSR.RC
+	ELSE
+		rounding_direction := imm8[1:0] //Use the rounding mode specified by imm8[1:0]
+	FI
+	
+	M := imm8[7:4] // The scaling factor (number of fraction bits to round to)
+	
+	CASE(rounding_direction)
+	0: tmp[31:0] := round_to_nearest_even_integer(2^M * src[31:0])
+	1: tmp[31:0] := round_to_equal_or_smaller_integer(2^M * src[31:0])
+	2: tmp[31:0] := round_to_equal_or_larger_integer(2^M * src[31:0])
+	3: tmp[31:0] := round_to_nearest_smallest_magnitude_integer(2^M * src[31:0])
+	ESAC
+	
+	dst[31:0] := 2^-M * tmp[31:0] // scale back down
+	
+	IF imm8[3] == 0 //check SPE
+		IF src[31:0] != dst[31:0] //check if precision has been lost
+			set_precision() //set #PE
+		FI
+	FI
+	RETURN dst[31:0]
+}	
+
+FOR j := 0 to 7
+	i := j*32
+	dst[i+31:i] := RoundTo_IntegerPS(a[i+31:i], imm8[7:0])
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vrndscaleps"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128" name="_mm_mask_roundscale_ps">
+	<type>Floating Point</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="src" type="__m128"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128"/>
+	<parameter varname="imm8" type="int"/>
+	<description>Round packed single-precision (32-bit) floating-point elements in "a" to the number of fraction bits specified by "imm8", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). </description>
+	<operation>
+RoundTo_IntegerPS(src[31:0], imm8[7:0]){
+	IF(imm8[2] == 1)
+		rounding_direction := MXCSR.RC //Use the rounding mode specified by MXCSR.RC
+	ELSE
+		rounding_direction := imm8[1:0] //Use the rounding mode specified by imm8[1:0]
+	FI
+	
+	M := imm8[7:4] // The scaling factor (number of fraction bits to round to)
+	
+	CASE(rounding_direction)
+	0: tmp[31:0] := round_to_nearest_even_integer(2^M * src[31:0])
+	1: tmp[31:0] := round_to_equal_or_smaller_integer(2^M * src[31:0])
+	2: tmp[31:0] := round_to_equal_or_larger_integer(2^M * src[31:0])
+	3: tmp[31:0] := round_to_nearest_smallest_magnitude_integer(2^M * src[31:0])
+	ESAC
+	
+	dst[31:0] := 2^-M * tmp[31:0] // scale back down
+	
+	IF imm8[3] == 0 //check SPE
+		IF src[31:0] != dst[31:0] //check if precision has been lost
+			set_precision() //set #PE
+		FI
+	FI
+	RETURN dst[31:0]
+}	
+
+FOR j := 0 to 3
+	i := j*32
+	IF k[j]
+		dst[i+31:i] := RoundTo_IntegerPS(a[i+31:i], imm8[7:0])
+	ELSE
+		dst[i+31:i] := src[i+31:i]
+	FI
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vrndscaleps"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128" name="_mm_maskz_roundscale_ps">
+	<type>Floating Point</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128"/>
+	<parameter varname="imm8" type="int"/>
+	<description>Round packed single-precision (32-bit) floating-point elements in "a" to the number of fraction bits specified by "imm8", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+	<operation>
+RoundTo_IntegerPS(src[31:0], imm8[7:0]){
+	IF(imm8[2] == 1)
+		rounding_direction := MXCSR.RC //Use the rounding mode specified by MXCSR.RC
+	ELSE
+		rounding_direction := imm8[1:0] //Use the rounding mode specified by imm8[1:0]
+	FI
+	
+	M := imm8[7:4] // The scaling factor (number of fraction bits to round to)
+	
+	CASE(rounding_direction)
+	0: tmp[31:0] := round_to_nearest_even_integer(2^M * src[31:0])
+	1: tmp[31:0] := round_to_equal_or_smaller_integer(2^M * src[31:0])
+	2: tmp[31:0] := round_to_equal_or_larger_integer(2^M * src[31:0])
+	3: tmp[31:0] := round_to_nearest_smallest_magnitude_integer(2^M * src[31:0])
+	ESAC
+	
+	dst[31:0] := 2^-M * tmp[31:0] // scale back down
+	
+	IF imm8[3] == 0 //check SPE
+		IF src[31:0] != dst[31:0] //check if precision has been lost
+			set_precision() //set #PE
+		FI
+	FI
+	RETURN dst[31:0]
+}	
+
+FOR j := 0 to 3
+	i := j*32
+	IF k[j]
+		dst[i+31:i] := RoundTo_IntegerPS(a[i+31:i], imm8[7:0])
+	ELSE
+		dst[i+31:i] := 0
+	FI
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vrndscaleps"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128" name="_mm_roundscale_ps">
+	<type>Floating Point</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="a" type="__m128"/>
+	<parameter varname="imm8" type="int"/>
+	<description>Round packed single-precision (32-bit) floating-point elements in "a" to the number of fraction bits specified by "imm8", and store the results in "dst".
+	</description>
+	<operation>
+RoundTo_IntegerPS(src[31:0], imm8[7:0]){
+	IF(imm8[2] == 1)
+		rounding_direction := MXCSR.RC //Use the rounding mode specified by MXCSR.RC
+	ELSE
+		rounding_direction := imm8[1:0] //Use the rounding mode specified by imm8[1:0]
+	FI
+	
+	M := imm8[7:4] // The scaling factor (number of fraction bits to round to)
+	
+	CASE(rounding_direction)
+	0: tmp[31:0] := round_to_nearest_even_integer(2^M * src[31:0])
+	1: tmp[31:0] := round_to_equal_or_smaller_integer(2^M * src[31:0])
+	2: tmp[31:0] := round_to_equal_or_larger_integer(2^M * src[31:0])
+	3: tmp[31:0] := round_to_nearest_smallest_magnitude_integer(2^M * src[31:0])
+	ESAC
+	
+	dst[31:0] := 2^-M * tmp[31:0] // scale back down
+	
+	IF imm8[3] == 0 //check SPE
+		IF src[31:0] != dst[31:0] //check if precision has been lost
+			set_precision() //set #PE
+		FI
+	FI
+	RETURN dst[31:0]
+}	
+
+FOR j := 0 to 3
+	i := j*32
+	dst[i+31:i] := RoundTo_IntegerPS(a[i+31:i], imm8[7:0])
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vrndscaleps"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256d" name="_mm256_mask_rsqrt14_pd">
+	<type>Floating Point</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="src" type="__m256d"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m256d"/>
+	<description>Compute the approximate reciprocal square root of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 2^-14.</description>
+	<operation>
+FOR j := 0 to 3
+	i := j*64
+	IF k[j]
+		dst[i+63:i] := APPROXIMATE(1.0 / SQRT(a[i+63:i]))
+	ELSE
+		dst[i+63:i] := src[i+63:i]
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vrsqrt14pd"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256d" name="_mm256_maskz_rsqrt14_pd">
+	<type>Floating Point</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m256d"/>
+	<description>Compute the approximate reciprocal square root of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 2^-14.</description>
+	<operation>
+FOR j := 0 to 3
+	i := j*64
+	IF k[j]
+		dst[i+63:i] := APPROXIMATE(1.0 / SQRT(a[i+63:i]))
+	ELSE
+		dst[i+63:i] := 0
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vrsqrt14pd"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128d" name="_mm_mask_rsqrt14_pd">
+	<type>Floating Point</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="src" type="__m128d"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128d"/>
+	<description>Compute the approximate reciprocal square root of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 2^-14.</description>
+	<operation>
+FOR j := 0 to 1
+	i := j*64
+	IF k[j]
+		dst[i+63:i] := APPROXIMATE(1.0 / SQRT(a[i+63:i]))
+	ELSE
+		dst[i+63:i] := src[i+63:i]
+	FI
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vrsqrt14pd"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128d" name="_mm_maskz_rsqrt14_pd">
+	<type>Floating Point</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128d"/>
+	<description>Compute the approximate reciprocal square root of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 2^-14.</description>
+	<operation>
+FOR j := 0 to 1
+	i := j*64
+	IF k[j]
+		dst[i+63:i] := APPROXIMATE(1.0 / SQRT(a[i+63:i]))
+	ELSE
+		dst[i+63:i] := 0
+	FI
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vrsqrt14pd"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256" name="_mm256_mask_rsqrt14_ps">
+	<type>Floating Point</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="src" type="__m256"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m256"/>
+	<description>Compute the approximate reciprocal square root of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 2^-14.</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*32
+	IF k[j]
+		dst[i+31:i] := APPROXIMATE(1.0 / SQRT(a[i+31:i]))
+	ELSE
+		dst[i+31:i] := src[i+31:i]
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vrsqrt14ps"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256" name="_mm256_maskz_rsqrt14_ps">
+	<type>Floating Point</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m256"/>
+	<description>Compute the approximate reciprocal square root of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 2^-14.</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*32
+	IF k[j]
+		dst[i+31:i] := APPROXIMATE(1.0 / SQRT(a[i+31:i]))
+	ELSE
+		dst[i+31:i] := 0
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vrsqrt14ps"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128" name="_mm_mask_rsqrt14_ps">
+	<type>Floating Point</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="src" type="__m128"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128"/>
+	<description>Compute the approximate reciprocal square root of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 2^-14.</description>
+	<operation>
+FOR j := 0 to 3
+	i := j*32
+	IF k[j]
+		dst[i+31:i] := APPROXIMATE(1.0 / SQRT(a[i+31:i]))
+	ELSE
+		dst[i+31:i] := src[i+31:i]
+	FI
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vrsqrt14ps"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128" name="_mm_maskz_rsqrt14_ps">
+	<type>Floating Point</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128"/>
+	<description>Compute the approximate reciprocal square root of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 2^-14.</description>
+	<operation>
+FOR j := 0 to 3
+	i := j*32
+	IF k[j]
+		dst[i+31:i] := APPROXIMATE(1.0 / SQRT(a[i+31:i]))
+	ELSE
+		dst[i+31:i] := 0
+	FI
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vrsqrt14ps"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256d" name="_mm256_mask_scalef_pd">
+	<type>Floating Point</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="src" type="__m256d"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m256d"/>
+	<parameter varname="b" type="__m256d"/>
+	<description>Scale the packed double-precision (64-bit) floating-point elements in "a" using values from "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+	<operation>
+SCALE(src1, src2){
+	IF (src2 == NaN)
+		IF (src2 == SNaN)
+			RETURN QNAN(src2)
+		FI
+	ELSE IF (src1 == NaN)
+		IF (src1 == SNaN)
+			RETURN QNAN(src1)
+		FI
+		IF (src2 != INF)
+			RETURN QNAN(src1)
+		FI
+	ELSE
+		tmp_src2 := src2
+		tmp_src1 := src1
+		IF (src2 is denormal AND MXCSR.DAZ)
+			tmp_src2 := 0
+		FI
+		IF (src1 is denormal AND MXCSR.DAZ)
+			tmp_src1 := 0
+		FI
+	FI
+	dst[63:0] := tmp_src1[63:0] * POW(2, FLOOR(tmp_src2[63:0]))
+	RETURN dst[63:0]
+}
+
+FOR j := 0 to 3
+	i := j*64
+	IF k[j]
+		dst[i+63:i] := SCALE(a[i+63:i], b[i+63:i])
+	ELSE
+		dst[i+63:i] := src[i+63:i]
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vscalefpd"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256d" name="_mm256_maskz_scalef_pd">
+	<type>Floating Point</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m256d"/>
+	<parameter varname="b" type="__m256d"/>
+	<description>Scale the packed double-precision (64-bit) floating-point elements in "a" using values from "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+	<operation>
+SCALE(src1, src2){
+	IF (src2 == NaN)
+		IF (src2 == SNaN)
+			RETURN QNAN(src2)
+		FI
+	ELSE IF (src1 == NaN)
+		IF (src1 == SNaN)
+			RETURN QNAN(src1)
+		FI
+		IF (src2 != INF)
+			RETURN QNAN(src1)
+		FI
+	ELSE
+		tmp_src2 := src2
+		tmp_src1 := src1
+		IF (src2 is denormal AND MXCSR.DAZ)
+			tmp_src2 := 0
+		FI
+		IF (src1 is denormal AND MXCSR.DAZ)
+			tmp_src1 := 0
+		FI
+	FI
+	dst[63:0] := tmp_src1[63:0] * POW(2, FLOOR(tmp_src2[63:0]))
+	RETURN dst[63:0]
+}
+
+FOR j := 0 to 3
+	i := j*64
+	IF k[j]
+		dst[i+63:i] := SCALE(a[i+63:i], b[i+63:i])
+	ELSE
+		dst[i+63:i] := 0
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vscalefpd"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256d" name="_mm256_scalef_pd">
+	<type>Floating Point</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="a" type="__m256d"/>
+	<parameter varname="b" type="__m256d"/>
+	<description>Scale the packed double-precision (64-bit) floating-point elements in "a" using values from "b", and store the results in "dst".</description>
+	<operation>
+SCALE(src1, src2){
+	IF (src2 == NaN)
+		IF (src2 == SNaN)
+			RETURN QNAN(src2)
+		FI
+	ELSE IF (src1 == NaN)
+		IF (src1 == SNaN)
+			RETURN QNAN(src1)
+		FI
+		IF (src2 != INF)
+			RETURN QNAN(src1)
+		FI
+	ELSE
+		tmp_src2 := src2
+		tmp_src1 := src1
+		IF (src2 is denormal AND MXCSR.DAZ)
+			tmp_src2 := 0
+		FI
+		IF (src1 is denormal AND MXCSR.DAZ)
+			tmp_src1 := 0
+		FI
+	FI
+	dst[63:0] := tmp_src1[63:0] * POW(2, FLOOR(tmp_src2[63:0]))
+	RETURN dst[63:0]
+}
+
+FOR j := 0 to 3
+	i := j*64
+	dst[i+63:i] := SCALE(a[i+63:i], b[i+63:i])
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vscalefpd"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128d" name="_mm_mask_scalef_pd">
+	<type>Floating Point</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="src" type="__m128d"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128d"/>
+	<parameter varname="b" type="__m128d"/>
+	<description>Scale the packed double-precision (64-bit) floating-point elements in "a" using values from "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+	<operation>
+SCALE(src1, src2){
+	IF (src2 == NaN)
+		IF (src2 == SNaN)
+			RETURN QNAN(src2)
+		FI
+	ELSE IF (src1 == NaN)
+		IF (src1 == SNaN)
+			RETURN QNAN(src1)
+		FI
+		IF (src2 != INF)
+			RETURN QNAN(src1)
+		FI
+	ELSE
+		tmp_src2 := src2
+		tmp_src1 := src1
+		IF (src2 is denormal AND MXCSR.DAZ)
+			tmp_src2 := 0
+		FI
+		IF (src1 is denormal AND MXCSR.DAZ)
+			tmp_src1 := 0
+		FI
+	FI
+	dst[63:0] := tmp_src1[63:0] * POW(2, FLOOR(tmp_src2[63:0]))
+	RETURN dst[63:0]
+}
+
+FOR j := 0 to 1
+	i := j*64
+	IF k[j]
+		dst[i+63:i] := SCALE(a[i+63:i], b[i+63:i])
+	ELSE
+		dst[i+63:i] := src[i+63:i]
+	FI
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vscalefpd"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128d" name="_mm_maskz_scalef_pd">
+	<type>Floating Point</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128d"/>
+	<parameter varname="b" type="__m128d"/>
+	<description>Scale the packed double-precision (64-bit) floating-point elements in "a" using values from "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+	<operation>
+SCALE(src1, src2){
+	IF (src2 == NaN)
+		IF (src2 == SNaN)
+			RETURN QNAN(src2)
+		FI
+	ELSE IF (src1 == NaN)
+		IF (src1 == SNaN)
+			RETURN QNAN(src1)
+		FI
+		IF (src2 != INF)
+			RETURN QNAN(src1)
+		FI
+	ELSE
+		tmp_src2 := src2
+		tmp_src1 := src1
+		IF (src2 is denormal AND MXCSR.DAZ)
+			tmp_src2 := 0
+		FI
+		IF (src1 is denormal AND MXCSR.DAZ)
+			tmp_src1 := 0
+		FI
+	FI
+	dst[63:0] := tmp_src1[63:0] * POW(2, FLOOR(tmp_src2[63:0]))
+	RETURN dst[63:0]
+}
+
+FOR j := 0 to 1
+	i := j*64
+	IF k[j]
+		dst[i+63:i] := SCALE(a[i+63:i], b[i+63:i])
+	ELSE
+		dst[i+63:i] := 0
+	FI
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vscalefpd"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128d" name="_mm_scalef_pd">
+	<type>Floating Point</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="a" type="__m128d"/>
+	<parameter varname="b" type="__m128d"/>
+	<description>Scale the packed double-precision (64-bit) floating-point elements in "a" using values from "b", and store the results in "dst".</description>
+	<operation>
+SCALE(src1, src2){
+	IF (src2 == NaN)
+		IF (src2 == SNaN)
+			RETURN QNAN(src2)
+		FI
+	ELSE IF (src1 == NaN)
+		IF (src1 == SNaN)
+			RETURN QNAN(src1)
+		FI
+		IF (src2 != INF)
+			RETURN QNAN(src1)
+		FI
+	ELSE
+		tmp_src2 := src2
+		tmp_src1 := src1
+		IF (src2 is denormal AND MXCSR.DAZ)
+			tmp_src2 := 0
+		FI
+		IF (src1 is denormal AND MXCSR.DAZ)
+			tmp_src1 := 0
+		FI
+	FI
+	dst[63:0] := tmp_src1[63:0] * POW(2, FLOOR(tmp_src2[63:0]))
+	RETURN dst[63:0]
+}
+
+FOR j := 0 to 1
+	i := j*64
+	dst[i+63:i] := SCALE(a[i+63:i], b[i+63:i])
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vscalefpd"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256" name="_mm256_mask_scalef_ps">
+	<type>Floating Point</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="src" type="__m256"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m256"/>
+	<parameter varname="b" type="__m256"/>
+	<description>Scale the packed single-precision (32-bit) floating-point elements in "a" using values from "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+	<operation>
+SCALE(src1, src2){
+	IF (src2 == NaN)
+		IF (src2 == SNaN)
+			RETURN QNAN(src2)
+		FI
+	ELSE IF (src1 == NaN)
+		IF (src1 == SNaN)
+			RETURN QNAN(src1)
+		FI
+		IF (src2 != INF)
+			RETURN QNAN(src1)
+		FI
+	ELSE
+		tmp_src2 := src2
+		tmp_src1 := src1
+		IF (src2 is denormal AND MXCSR.DAZ)
+			tmp_src2 := 0
+		FI
+		IF (src1 is denormal AND MXCSR.DAZ)
+			tmp_src1 := 0
+		FI
+	FI
+	dst[31:0] := tmp_src1[31:0] * POW(2, FLOOR(tmp_src2[31:0]))
+	RETURN dst[31:0]
+}
+
+FOR j := 0 to 7
+	i := j*32
+	IF k[j]
+		dst[i+31:i] := SCALE(a[i+31:i], b[i+31:i])
+	ELSE
+		dst[i+31:i] := src[i+31:i]
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vscalefps"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256" name="_mm256_maskz_scalef_ps">
+	<type>Floating Point</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m256"/>
+	<parameter varname="b" type="__m256"/>
+	<description>Scale the packed single-precision (32-bit) floating-point elements in "a" using values from "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+	<operation>
+SCALE(src1, src2){
+	IF (src2 == NaN)
+		IF (src2 == SNaN)
+			RETURN QNAN(src2)
+		FI
+	ELSE IF (src1 == NaN)
+		IF (src1 == SNaN)
+			RETURN QNAN(src1)
+		FI
+		IF (src2 != INF)
+			RETURN QNAN(src1)
+		FI
+	ELSE
+		tmp_src2 := src2
+		tmp_src1 := src1
+		IF (src2 is denormal AND MXCSR.DAZ)
+			tmp_src2 := 0
+		FI
+		IF (src1 is denormal AND MXCSR.DAZ)
+			tmp_src1 := 0
+		FI
+	FI
+	dst[31:0] := tmp_src1[31:0] * POW(2, FLOOR(tmp_src2[31:0]))
+	RETURN dst[31:0]
+}
+
+FOR j := 0 to 7
+	i := j*32
+	IF k[j]
+		dst[i+31:i] := SCALE(a[i+31:i], b[i+31:i])
+	ELSE
+		dst[i+31:i] := 0
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vscalefps"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256" name="_mm256_scalef_ps">
+	<type>Floating Point</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="a" type="__m256"/>
+	<parameter varname="b" type="__m256"/>
+	<description>Scale the packed single-precision (32-bit) floating-point elements in "a" using values from "b", and store the results in "dst".</description>
+	<operation>
+SCALE(src1, src2){
+	IF (src2 == NaN)
+		IF (src2 == SNaN)
+			RETURN QNAN(src2)
+		FI
+	ELSE IF (src1 == NaN)
+		IF (src1 == SNaN)
+			RETURN QNAN(src1)
+		FI
+		IF (src2 != INF)
+			RETURN QNAN(src1)
+		FI
+	ELSE
+		tmp_src2 := src2
+		tmp_src1 := src1
+		IF (src2 is denormal AND MXCSR.DAZ)
+			tmp_src2 := 0
+		FI
+		IF (src1 is denormal AND MXCSR.DAZ)
+			tmp_src1 := 0
+		FI
+	FI
+	dst[31:0] := tmp_src1[31:0] * POW(2, FLOOR(tmp_src2[31:0]))
+	RETURN dst[31:0]
+}
+
+FOR j := 0 to 7
+	i := j*32
+	dst[i+31:i] := SCALE(a[i+31:i], b[i+31:i])
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vscalefps"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128" name="_mm_mask_scalef_ps">
+	<type>Floating Point</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="src" type="__m128"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128"/>
+	<parameter varname="b" type="__m128"/>
+	<description>Scale the packed single-precision (32-bit) floating-point elements in "a" using values from "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+	<operation>
+SCALE(src1, src2){
+	IF (src2 == NaN)
+		IF (src2 == SNaN)
+			RETURN QNAN(src2)
+		FI
+	ELSE IF (src1 == NaN)
+		IF (src1 == SNaN)
+			RETURN QNAN(src1)
+		FI
+		IF (src2 != INF)
+			RETURN QNAN(src1)
+		FI
+	ELSE
+		tmp_src2 := src2
+		tmp_src1 := src1
+		IF (src2 is denormal AND MXCSR.DAZ)
+			tmp_src2 := 0
+		FI
+		IF (src1 is denormal AND MXCSR.DAZ)
+			tmp_src1 := 0
+		FI
+	FI
+	dst[31:0] := tmp_src1[31:0] * POW(2, FLOOR(tmp_src2[31:0]))
+	RETURN dst[31:0]
+}
+
+FOR j := 0 to 3
+	i := j*32
+	IF k[j]
+		dst[i+31:i] := SCALE(a[i+31:i], b[i+31:i])
+	ELSE
+		dst[i+31:i] := src[i+31:i]
+	FI
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vscalefps"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128" name="_mm_maskz_scalef_ps">
+	<type>Floating Point</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128"/>
+	<parameter varname="b" type="__m128"/>
+	<description>Scale the packed single-precision (32-bit) floating-point elements in "a" using values from "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+	<operation>
+SCALE(src1, src2){
+	IF (src2 == NaN)
+		IF (src2 == SNaN)
+			RETURN QNAN(src2)
+		FI
+	ELSE IF (src1 == NaN)
+		IF (src1 == SNaN)
+			RETURN QNAN(src1)
+		FI
+		IF (src2 != INF)
+			RETURN QNAN(src1)
+		FI
+	ELSE
+		tmp_src2 := src2
+		tmp_src1 := src1
+		IF (src2 is denormal AND MXCSR.DAZ)
+			tmp_src2 := 0
+		FI
+		IF (src1 is denormal AND MXCSR.DAZ)
+			tmp_src1 := 0
+		FI
+	FI
+	dst[31:0] := tmp_src1[31:0] * POW(2, FLOOR(tmp_src2[31:0]))
+	RETURN dst[31:0]
+}
+
+FOR j := 0 to 3
+	i := j*32
+	IF k[j]
+		dst[i+31:i] := SCALE(a[i+31:i], b[i+31:i])
+	ELSE
+		dst[i+31:i] := 0
+	FI
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vscalefps"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128" name="_mm_scalef_ps">
+	<type>Floating Point</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="a" type="__m128"/>
+	<parameter varname="b" type="__m128"/>
+	<description>Scale the packed single-precision (32-bit) floating-point elements in "a" using values from "b", and store the results in "dst".</description>
+	<operation>
+SCALE(src1, src2){
+	IF (src2 == NaN)
+		IF (src2 == SNaN)
+			RETURN QNAN(src2)
+		FI
+	ELSE IF (src1 == NaN)
+		IF (src1 == SNaN)
+			RETURN QNAN(src1)
+		FI
+		IF (src2 != INF)
+			RETURN QNAN(src1)
+		FI
+	ELSE
+		tmp_src2 := src2
+		tmp_src1 := src1
+		IF (src2 is denormal AND MXCSR.DAZ)
+			tmp_src2 := 0
+		FI
+		IF (src1 is denormal AND MXCSR.DAZ)
+			tmp_src1 := 0
+		FI
+	FI
+	dst[31:0] := tmp_src1[31:0] * POW(2, FLOOR(tmp_src2[31:0]))
+	RETURN dst[31:0]
+}
+
+FOR j := 0 to 3
+	i := j*32
+	dst[i+31:i] := SCALE(a[i+31:i], b[i+31:i])
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vscalefps"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="void" name="_mm256_i32scatter_pd">
+	<type>Floating Point</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Store</category>
+	<parameter varname="base_addr" type="void*"/>
+	<parameter varname="vindex" type="__m128i"/>
+	<parameter varname="a" type="__m256d"/>
+	<parameter varname="scale" type="const int"/>
+	<description>Scatter double-precision (64-bit) floating-point elements from "a" into memory using 32-bit indices. 64-bit elements are stored at addresses starting at "base_addr" and offset by each 32-bit element in "vindex" (each index is scaled by the factor in "scale"). "scale" should be 1, 2, 4 or 8.
+	</description>
+	<operation>
+FOR j := 0 to 3
+	i := j*64
+	l := j*32
+	MEM[base_addr + SignExtend(vindex[l+31:l])*scale] := a[i+63:i]
+ENDFOR
+	</operation>
+	<instruction name="vscatterdpd"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="void" name="_mm256_mask_i32scatter_pd">
+	<type>Floating Point</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Store</category>
+	<parameter varname="base_addr" type="void*"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="vindex" type="__m128i"/>
+	<parameter varname="a" type="__m256d"/>
+	<parameter varname="scale" type="const int"/>
+	<description>Scatter double-precision (64-bit) floating-point elements from "a" into memory using 32-bit indices. 64-bit elements are stored at addresses starting at "base_addr" and offset by each 32-bit element in "vindex" (each index is scaled by the factor in "scale") subject to mask "k" (elements are not stored when the corresponding mask bit is not set). "scale" should be 1, 2, 4 or 8.
+	</description>
+	<operation>
+FOR j := 0 to 3
+	i := j*64
+	l := j*32
+	IF k[j]
+		MEM[base_addr + SignExtend(vindex[l+31:l])*scale] := a[i+63:i]
+		k[j] := 0
+	FI
+ENDFOR
+k[MAX:4] := 0
+	</operation>
+	<instruction name="vscatterdpd"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="void" name="_mm_i32scatter_pd">
+	<type>Floating Point</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Store</category>
+	<parameter varname="base_addr" type="void*"/>
+	<parameter varname="vindex" type="__m128i"/>
+	<parameter varname="a" type="__m128d"/>
+	<parameter varname="scale" type="const int"/>
+	<description>Scatter double-precision (64-bit) floating-point elements from "a" into memory using 32-bit indices. 64-bit elements are stored at addresses starting at "base_addr" and offset by each 32-bit element in "vindex" (each index is scaled by the factor in "scale"). "scale" should be 1, 2, 4 or 8.
+	</description>
+	<operation>
+FOR j := 0 to 1
+	i := j*64
+	l := j*32
+	MEM[base_addr + SignExtend(vindex[l+31:l])*scale] := a[i+63:i]
+ENDFOR
+	</operation>
+	<instruction name="vscatterdpd"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="void" name="_mm_mask_i32scatter_pd">
+	<type>Floating Point</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Store</category>
+	<parameter varname="base_addr" type="void*"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="vindex" type="__m128i"/>
+	<parameter varname="a" type="__m128d"/>
+	<parameter varname="scale" type="const int"/>
+	<description>Scatter double-precision (64-bit) floating-point elements from "a" into memory using 32-bit indices. 64-bit elements are stored at addresses starting at "base_addr" and offset by each 32-bit element in "vindex" (each index is scaled by the factor in "scale") subject to mask "k" (elements are not stored when the corresponding mask bit is not set). "scale" should be 1, 2, 4 or 8.
+	</description>
+	<operation>
+FOR j := 0 to 1
+	i := j*64
+	l := j*32
+	IF k[j]
+		MEM[base_addr + SignExtend(vindex[l+31:l])*scale] := a[i+63:i]
+		k[j] := 0
+	FI
+ENDFOR
+k[MAX:2] := 0
+	</operation>
+	<instruction name="vscatterdpd"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="void" name="_mm256_i32scatter_ps">
+	<type>Floating Point</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Store</category>
+	<parameter varname="base_addr" type="void*"/>
+	<parameter varname="vindex" type="__m256i"/>
+	<parameter varname="a" type="__m256"/>
+	<parameter varname="scale" type="const int"/>
+	<description>Scatter single-precision (32-bit) floating-point elements from "a" into memory using 32-bit indices. 32-bit elements are stored at addresses starting at "base_addr" and offset by each 32-bit element in "vindex" (each index is scaled by the factor in "scale"). "scale" should be 1, 2, 4 or 8.</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*32
+	MEM[base_addr + SignExtend(vindex[i+31:i])*scale] := a[i+31:i]
+ENDFOR
+	</operation>
+	<instruction name="vscatterdps"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="void" name="_mm256_mask_i32scatter_ps">
+	<type>Floating Point</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Store</category>
+	<parameter varname="base_addr" type="void*"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="vindex" type="__m256i"/>
+	<parameter varname="a" type="__m256"/>
+	<parameter varname="scale" type="const int"/>
+	<description>Scatter single-precision (32-bit) floating-point elements from "a" into memory using 32-bit indices. 32-bit elements are stored at addresses starting at "base_addr" and offset by each 32-bit element in "vindex" (each index is scaled by the factor in "scale") subject to mask "k" (elements are not stored when the corresponding mask bit is not set). "scale" should be 1, 2, 4 or 8.</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*32
+	IF k[j]
+		MEM[base_addr + SignExtend(vindex[i+31:i])*scale] := a[i+31:i]
+		k[j] := 0
+	FI
+ENDFOR
+k[MAX:8] := 0
+	</operation>
+	<instruction name="vscatterdps"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="void" name="_mm_i32scatter_ps">
+	<type>Floating Point</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Store</category>
+	<parameter varname="base_addr" type="void*"/>
+	<parameter varname="vindex" type="__m128i"/>
+	<parameter varname="a" type="__m128"/>
+	<parameter varname="scale" type="const int"/>
+	<description>Scatter single-precision (32-bit) floating-point elements from "a" into memory using 32-bit indices. 32-bit elements are stored at addresses starting at "base_addr" and offset by each 32-bit element in "vindex" (each index is scaled by the factor in "scale"). "scale" should be 1, 2, 4 or 8.</description>
+	<operation>
+FOR j := 0 to 3
+	i := j*32
+	MEM[base_addr + SignExtend(vindex[i+31:i])*scale] := a[i+31:i]
+ENDFOR
+	</operation>
+	<instruction name="vscatterdps"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="void" name="_mm_mask_i32scatter_ps">
+	<type>Floating Point</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Store</category>
+	<parameter varname="base_addr" type="void*"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="vindex" type="__m128i"/>
+	<parameter varname="a" type="__m128"/>
+	<parameter varname="scale" type="const int"/>
+	<description>Scatter single-precision (32-bit) floating-point elements from "a" into memory using 32-bit indices. 32-bit elements are stored at addresses starting at "base_addr" and offset by each 32-bit element in "vindex" (each index is scaled by the factor in "scale") subject to mask "k" (elements are not stored when the corresponding mask bit is not set). "scale" should be 1, 2, 4 or 8.</description>
+	<operation>
+FOR j := 0 to 3
+	i := j*32
+	IF k[j]
+		MEM[base_addr + SignExtend(vindex[i+31:i])*scale] := a[i+31:i]
+		k[j] := 0
+	FI
+ENDFOR
+k[MAX:4] := 0
+	</operation>
+	<instruction name="vscatterdps"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="void" name="_mm256_i64scatter_pd">
+	<type>Floating Point</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Store</category>
+	<parameter varname="base_addr" type="void*"/>
+	<parameter varname="vindex" type="__m256i"/>
+	<parameter varname="a" type="__m256d"/>
+	<parameter varname="scale" type="const int"/>
+	<description>Scatter double-precision (64-bit) floating-point elements from "a" into memory using 64-bit indices. 64-bit elements are stored at addresses starting at "base_addr" and offset by each 64-bit element in "vindex" (each index is scaled by the factor in "scale"). "scale" should be 1, 2, 4 or 8.
+	</description>
+	<operation>
+FOR j := 0 to 3
+	i := j*64
+	MEM[base_addr + SignExtend(vindex[i+63:i])*scale] := a[i+63:i]
+ENDFOR
+	</operation>
+	<instruction name="vscatterqpd"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="void" name="_mm256_mask_i64scatter_pd">
+	<type>Floating Point</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Store</category>
+	<parameter varname="base_addr" type="void*"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="vindex" type="__m256i"/>
+	<parameter varname="a" type="__m256d"/>
+	<parameter varname="scale" type="const int"/>
+	<description>Scatter double-precision (64-bit) floating-point elements from "a" into memory using 64-bit indices. 64-bit elements are stored at addresses starting at "base_addr" and offset by each 64-bit element in "vindex" (each index is scaled by the factor in "scale") subject to mask "k" (elements are not stored when the corresponding mask bit is not set). "scale" should be 1, 2, 4 or 8.
+	</description>
+	<operation>
+FOR j := 0 to 3
+	i := j*64
+	IF k[j]
+		MEM[base_addr + SignExtend(vindex[i+63:i])*scale] := a[i+63:i]
+		k[j] := 0
+	FI
+ENDFOR
+k[MAX:4] := 0
+	</operation>
+	<instruction name="vscatterqpd"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="void" name="_mm_i64scatter_pd">
+	<type>Floating Point</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Store</category>
+	<parameter varname="base_addr" type="void*"/>
+	<parameter varname="vindex" type="__m128i"/>
+	<parameter varname="a" type="__m128d"/>
+	<parameter varname="scale" type="const int"/>
+	<description>Scatter double-precision (64-bit) floating-point elements from "a" into memory using 64-bit indices. 64-bit elements are stored at addresses starting at "base_addr" and offset by each 64-bit element in "vindex" (each index is scaled by the factor in "scale"). "scale" should be 1, 2, 4 or 8.
+	</description>
+	<operation>
+FOR j := 0 to 1
+	i := j*64
+	MEM[base_addr + SignExtend(vindex[i+63:i])*scale] := a[i+63:i]
+ENDFOR
+	</operation>
+	<instruction name="vscatterqpd"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="void" name="_mm_mask_i64scatter_pd">
+	<type>Floating Point</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Store</category>
+	<parameter varname="base_addr" type="void*"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="vindex" type="__m128i"/>
+	<parameter varname="a" type="__m128d"/>
+	<parameter varname="scale" type="const int"/>
+	<description>Scatter double-precision (64-bit) floating-point elements from "a" into memory using 64-bit indices. 64-bit elements are stored at addresses starting at "base_addr" and offset by each 64-bit element in "vindex" (each index is scaled by the factor in "scale") subject to mask "k" (elements are not stored when the corresponding mask bit is not set). "scale" should be 1, 2, 4 or 8.
+	</description>
+	<operation>
+FOR j := 0 to 1
+	i := j*64
+	IF k[j]
+		MEM[base_addr + SignExtend(vindex[i+63:i])*scale] := a[i+63:i]
+		k[j] := 0
+	FI
+ENDFOR
+k[MAX:2] := 0
+	</operation>
+	<instruction name="vscatterqpd"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="void" name="_mm256_i64scatter_ps">
+	<type>Floating Point</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Store</category>
+	<parameter varname="base_addr" type="void*"/>
+	<parameter varname="vindex" type="__m256i"/>
+	<parameter varname="a" type="__m128"/>
+	<parameter varname="scale" type="const int"/>
+	<description>Scatter single-precision (32-bit) floating-point elements from "a" into memory using 64-bit indices. 32-bit elements are stored at addresses starting at "base_addr" and offset by each 64-bit element in "vindex" (each index is scaled by the factor in "scale"). "scale" should be 1, 2, 4 or 8.
+	</description>
+	<operation>
+FOR j := 0 to 3
+	i := j*32
+	l := j*64
+	MEM[base_addr + SignExtend(vindex[l+63:l])*scale] := a[i+31:i]
+ENDFOR
+	</operation>
+	<instruction name="vscatterqps"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="void" name="_mm256_mask_i64scatter_ps">
+	<type>Floating Point</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Store</category>
+	<parameter varname="base_addr" type="void*"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="vindex" type="__m256i"/>
+	<parameter varname="a" type="__m128"/>
+	<parameter varname="scale" type="const int"/>
+	<description>Scatter single-precision (32-bit) floating-point elements from "a" into memory using 64-bit indices. 32-bit elements are stored at addresses starting at "base_addr" and offset by each 64-bit element in "vindex" (each index is scaled by the factor in "scale") subject to mask "k" (elements are not stored when the corresponding mask bit is not set). "scale" should be 1, 2, 4 or 8.
+	</description>
+	<operation>
+FOR j := 0 to 3
+	i := j*32
+	l := j*64
+	IF k[j]
+		MEM[base_addr + SignExtend(vindex[l+63:l])*scale] := a[i+31:i]
+		k[j] := 0
+	FI
+ENDFOR
+k[MAX:4] := 0
+	</operation>
+	<instruction name="vscatterqps"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="void" name="_mm_i64scatter_ps">
+	<type>Floating Point</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Store</category>
+	<parameter varname="base_addr" type="void*"/>
+	<parameter varname="vindex" type="__m128i"/>
+	<parameter varname="a" type="__m128"/>
+	<parameter varname="scale" type="const int"/>
+	<description>Scatter single-precision (32-bit) floating-point elements from "a" into memory using 64-bit indices. 32-bit elements are stored at addresses starting at "base_addr" and offset by each 64-bit element in "vindex" (each index is scaled by the factor in "scale"). "scale" should be 1, 2, 4 or 8.
+	</description>
+	<operation>
+FOR j := 0 to 1
+	i := j*32
+	l := j*64
+	MEM[base_addr + SignExtend(vindex[l+63:l])*scale] := a[i+31:i]
+ENDFOR
+	</operation>
+	<instruction name="vscatterqps"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="void" name="_mm_mask_i64scatter_ps">
+	<type>Floating Point</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Store</category>
+	<parameter varname="base_addr" type="void*"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="vindex" type="__m128i"/>
+	<parameter varname="a" type="__m128"/>
+	<parameter varname="scale" type="const int"/>
+	<description>Scatter single-precision (32-bit) floating-point elements from "a" into memory using 64-bit indices. 32-bit elements are stored at addresses starting at "base_addr" and offset by each 64-bit element in "vindex" (each index is scaled by the factor in "scale") subject to mask "k" (elements are not stored when the corresponding mask bit is not set). "scale" should be 1, 2, 4 or 8.
+	</description>
+	<operation>
+FOR j := 0 to 1
+	i := j*32
+	l := j*64
+	IF k[j]
+		MEM[base_addr + SignExtend(vindex[l+63:l])*scale] := a[i+31:i]
+		k[j] := 0
+	FI
+ENDFOR
+k[MAX:2] := 0
+	</operation>
+	<instruction name="vscatterqps"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256" name="_mm256_mask_shuffle_f32x4">
+	<type>Floating Point</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="src" type="__m256"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m256"/>
+	<parameter varname="b" type="__m256"/>
+	<parameter varname="imm8" type="const int"/>
+	<description>Shuffle 128-bits (composed of 4 single-precision (32-bit) floating-point elements) selected by "imm8" from "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). </description>
+	<operation>
+SELECT2(src, control){
+	CASE(control[0])
+	0:	tmp[127:0] := src[127:0]
+	1:	tmp[127:0] := src[255:128]
+	ESAC
+	RETURN tmp[127:0]
+}
+
+tmp_dst[127:0] := SELECT2(a[255:0], imm8[0])
+tmp_dst[255:128] := SELECT2(b[255:0], imm8[1])
+
+FOR j := 0 to 7
+	i := j*32
+	IF k[j]
+		dst[i+31:i] := tmp_dst[i+31:i]
+	ELSE
+		dst[i+31:i] := src[i+31:i]
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vshuff32x4"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256" name="_mm256_maskz_shuffle_f32x4">
+	<type>Floating Point</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m256"/>
+	<parameter varname="b" type="__m256"/>
+	<parameter varname="imm8" type="const int"/>
+	<description>Shuffle 128-bits (composed of 4 single-precision (32-bit) floating-point elements) selected by "imm8" from "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+	<operation>
+SELECT2(src, control){
+	CASE(control[0])
+	0:	tmp[127:0] := src[127:0]
+	1:	tmp[127:0] := src[255:128]
+	ESAC
+	RETURN tmp[127:0]
+}
+
+tmp_dst[127:0] := SELECT2(a[255:0], imm8[0])
+tmp_dst[255:128] := SELECT2(b[255:0], imm8[1])
+
+FOR j := 0 to 7
+	i := j*32
+	IF k[j]
+		dst[i+31:i] := tmp_dst[i+31:i]
+	ELSE
+		dst[i+31:i] := 0
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vshuff32x4"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256" name="_mm256_shuffle_f32x4">
+	<type>Floating Point</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="a" type="__m256"/>
+	<parameter varname="b" type="__m256"/>
+	<parameter varname="imm8" type="const int"/>
+	<description>Shuffle 128-bits (composed of 4 single-precision (32-bit) floating-point elements) selected by "imm8" from "a" and "b", and store the results in "dst".
+	</description>
+	<operation>
+SELECT2(src, control){
+	CASE(control[0])
+	0:	tmp[127:0] := src[127:0]
+	1:	tmp[127:0] := src[255:128]
+	ESAC
+	RETURN tmp[127:0]
+}
+
+dst[127:0] := SELECT2(a[255:0], imm8[0])
+dst[255:128] := SELECT2(b[255:0], imm8[1])
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vshuff32x4"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256d" name="_mm256_mask_shuffle_f64x2">
+	<type>Floating Point</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="src" type="__m256d"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m256d"/>
+	<parameter varname="b" type="__m256d"/>
+	<parameter varname="imm8" type="const int"/>
+	<description>Shuffle 128-bits (composed of 2 double-precision (64-bit) floating-point elements) selected by "imm8" from "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). </description>
+	<operation>
+SELECT2(src, control){
+	CASE(control[0])
+	0:	tmp[127:0] := src[127:0]
+	1:	tmp[127:0] := src[255:128]
+	ESAC
+	RETURN tmp[127:0]
+}
+
+tmp_dst[127:0] := SELECT2(a[255:0], imm8[0])
+tmp_dst[255:128] := SELECT2(b[255:0], imm8[1])
+
+FOR j := 0 to 3
+	i := j*64
+	IF k[j]
+		dst[i+63:i] := tmp_dst[i+63:i]
+	ELSE
+		dst[i+63:i] := src[i+63:i]
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vshuff64x2"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256d" name="_mm256_maskz_shuffle_f64x2">
+	<type>Floating Point</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m256d"/>
+	<parameter varname="b" type="__m256d"/>
+	<parameter varname="imm8" type="const int"/>
+	<description>Shuffle 128-bits (composed of 2 double-precision (64-bit) floating-point elements) selected by "imm8" from "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+	<operation>
+SELECT2(src, control){
+	CASE(control[0])
+	0:	tmp[127:0] := src[127:0]
+	1:	tmp[127:0] := src[255:128]
+	ESAC
+	RETURN tmp[127:0]
+}
+
+tmp_dst[127:0] := SELECT2(a[255:0], imm8[0])
+tmp_dst[255:128] := SELECT2(b[255:0], imm8[1])
+
+FOR j := 0 to 3
+	i := j*64
+	IF k[j]
+		dst[i+63:i] := tmp_dst[i+63:i]
+	ELSE
+		dst[i+63:i] := 0
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vshuff64x2"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256d" name="_mm256_shuffle_f64x2">
+	<type>Floating Point</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="a" type="__m256d"/>
+	<parameter varname="b" type="__m256d"/>
+	<parameter varname="imm8" type="const int"/>
+	<description>Shuffle 128-bits (composed of 2 double-precision (64-bit) floating-point elements) selected by "imm8" from "a" and "b", and store the results in "dst".</description>
+	<operation>
+SELECT2(src, control){
+	CASE(control[0])
+	0:	tmp[127:0] := src[127:0]
+	1:	tmp[127:0] := src[255:128]
+	ESAC
+	RETURN tmp[127:0]
+}
+
+dst[127:0] := SELECT2(a[255:0], imm8[0])
+dst[255:128] := SELECT2(b[255:0], imm8[1])
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vshuff64x2"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256i" name="_mm256_mask_shuffle_i32x4">
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="src" type="__m256i"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m256i"/>
+	<parameter varname="b" type="__m256i"/>
+	<parameter varname="imm8" type="const int"/>
+	<description>Shuffle 128-bits (composed of 4 32-bit integers) selected by "imm8" from "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). </description>
+	<operation>
+SELECT2(src, control){
+	CASE(control[0])
+	0:	tmp[127:0] := src[127:0]
+	1:	tmp[127:0] := src[255:128]
+	ESAC
+	RETURN tmp[127:0]
+}
+
+tmp_dst[127:0] := SELECT2(a[255:0], imm8[0])
+tmp_dst[255:128] := SELECT2(b[255:0], imm8[1])
+
+FOR j := 0 to 7
+	i := j*32
+	IF k[j]
+		dst[i+31:i] := tmp_dst[i+31:i]
+	ELSE
+		dst[i+31:i] := src[i+31:i]
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vshufi32x4"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256i" name="_mm256_maskz_shuffle_i32x4">
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m256i"/>
+	<parameter varname="b" type="__m256i"/>
+	<parameter varname="imm8" type="const int"/>
+	<description>Shuffle 128-bits (composed of 4 32-bit integers) selected by "imm8" from "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+	<operation>
+SELECT2(src, control){
+	CASE(control[0])
+	0:	tmp[127:0] := src[127:0]
+	1:	tmp[127:0] := src[255:128]
+	ESAC
+	RETURN tmp[127:0]
+}
+
+tmp_dst[127:0] := SELECT2(a[255:0], imm8[0])
+tmp_dst[255:128] := SELECT2(b[255:0], imm8[1])
+
+FOR j := 0 to 7
+	i := j*32
+	IF k[j]
+		dst[i+31:i] := tmp_dst[i+31:i]
+	ELSE
+		dst[i+31:i] := 0
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vshufi32x4"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256i" name="_mm256_shuffle_i32x4">
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="a" type="__m256i"/>
+	<parameter varname="b" type="__m256i"/>
+	<parameter varname="imm8" type="const int"/>
+	<description>Shuffle 128-bits (composed of 4 32-bit integers) selected by "imm8" from "a" and "b", and store the results in "dst".</description>
+	<operation>
+SELECT2(src, control){
+	CASE(control[0])
+	0:	tmp[127:0] := src[127:0]
+	1:	tmp[127:0] := src[255:128]
+	ESAC
+	RETURN tmp[127:0]
+}
+
+dst[127:0] := SELECT2(a[255:0], imm8[0])
+dst[255:128] := SELECT2(b[255:0], imm8[1])
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vshufi32x4"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256i" name="_mm256_mask_shuffle_i64x2">
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="src" type="__m256i"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m256i"/>
+	<parameter varname="b" type="__m256i"/>
+	<parameter varname="imm8" type="const int"/>
+	<description>Shuffle 128-bits (composed of 2 64-bit integers) selected by "imm8" from "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). </description>
+	<operation>
+SELECT2(src, control){
+	CASE(control[0])
+	0:	tmp[127:0] := src[127:0]
+	1:	tmp[127:0] := src[255:128]
+	ESAC
+	RETURN tmp[127:0]
+}
+
+tmp_dst[127:0] := SELECT2(a[255:0], imm8[0])
+tmp_dst[255:128] := SELECT2(b[255:0], imm8[1])
+
+FOR j := 0 to 3
+	i := j*64
+	IF k[j]
+		dst[i+63:i] := tmp_dst[i+63:i]
+	ELSE
+		dst[i+63:i] := src[i+63:i]
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vshufi64x2"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256i" name="_mm256_maskz_shuffle_i64x2">
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m256i"/>
+	<parameter varname="b" type="__m256i"/>
+	<parameter varname="imm8" type="const int"/>
+	<description>Shuffle 128-bits (composed of 2 64-bit integers) selected by "imm8" from "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+	<operation>
+SELECT2(src, control){
+	CASE(control[0])
+	0:	tmp[127:0] := src[127:0]
+	1:	tmp[127:0] := src[255:128]
+	ESAC
+	RETURN tmp[127:0]
+}
+
+tmp_dst[127:0] := SELECT2(a[255:0], imm8[0])
+tmp_dst[255:128] := SELECT2(b[255:0], imm8[1])
+
+FOR j := 0 to 3
+	i := j*64
+	IF k[j]
+		dst[i+63:i] := tmp_dst[i+63:i]
+	ELSE
+		dst[i+63:i] := 0
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vshufi64x2"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256i" name="_mm256_shuffle_i64x2">
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="a" type="__m256i"/>
+	<parameter varname="b" type="__m256i"/>
+	<parameter varname="imm8" type="const int"/>
+	<description>Shuffle 128-bits (composed of 2 64-bit integers) selected by "imm8" from "a" and "b", and store the results in "dst".</description>
+	<operation>
+SELECT2(src, control){
+	CASE(control[0])
+	0:	tmp[127:0] := src[127:0]
+	1:	tmp[127:0] := src[255:128]
+	ESAC
+	RETURN tmp[127:0]
+}
+
+dst[127:0] := SELECT2(a[255:0], imm8[0])
+dst[255:128] := SELECT2(b[255:0], imm8[1])
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vshufi64x2"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256d" name="_mm256_mask_shuffle_pd">
+	<type>Floating Point</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="src" type="__m256d"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m256d"/>
+	<parameter varname="b" type="__m256d"/>
+	<parameter varname="imm8" type="const int"/>
+	<description>Shuffle double-precision (64-bit) floating-point elements within 128-bit lanes using the control in "imm8", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). </description>
+	<operation>
+tmp_dst[63:0] := (imm8[0] == 0) ? a[63:0] : a[127:64]
+tmp_dst[127:64] := (imm8[1] == 0) ? b[63:0] : b[127:64]
+tmp_dst[191:128] := (imm8[2] == 0) ? a[191:128] : a[255:192]
+tmp_dst[255:192] := (imm8[3] == 0) ? b[191:128] : b[255:192]
+
+FOR j := 0 to 3
+	i := j*64
+	IF k[j]
+		dst[i+63:i] := tmp_dst[i+63:i]
+	ELSE
+		dst[i+63:i] := src[i+63:i]
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vshufpd"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256d" name="_mm256_maskz_shuffle_pd">
+	<type>Floating Point</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m256d"/>
+	<parameter varname="b" type="__m256d"/>
+	<parameter varname="imm8" type="const int"/>
+	<description>Shuffle double-precision (64-bit) floating-point elements within 128-bit lanes using the control in "imm8", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). </description>
+	<operation>
+tmp_dst[63:0] := (imm8[0] == 0) ? a[63:0] : a[127:64]
+tmp_dst[127:64] := (imm8[1] == 0) ? b[63:0] : b[127:64]
+tmp_dst[191:128] := (imm8[2] == 0) ? a[191:128] : a[255:192]
+tmp_dst[255:192] := (imm8[3] == 0) ? b[191:128] : b[255:192]
+
+FOR j := 0 to 3
+	i := j*64
+	IF k[j]
+		dst[i+63:i] := tmp_dst[i+63:i]
+	ELSE
+		dst[i+63:i] := 0
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vshufpd"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128d" name="_mm_mask_shuffle_pd">
+	<type>Floating Point</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="src" type="__m128d"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128d"/>
+	<parameter varname="b" type="__m128d"/>
+	<parameter varname="imm8" type="const int"/>
+	<description>Shuffle double-precision (64-bit) floating-point elements using the control in "imm8", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). </description>
+	<operation>
+tmp_dst[63:0] := (imm8[0] == 0) ? a[63:0] : a[127:64]
+tmp_dst[127:64] := (imm8[1] == 0) ? b[63:0] : b[127:64]
+
+FOR j := 0 to 1
+	i := j*64
+	IF k[j]
+		dst[i+63:i] := tmp_dst[i+63:i]
+	ELSE
+		dst[i+63:i] := src[i+63:i]
+	FI
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vshufpd"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128d" name="_mm_maskz_shuffle_pd">
+	<type>Floating Point</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128d"/>
+	<parameter varname="b" type="__m128d"/>
+	<parameter varname="imm8" type="const int"/>
+	<description>Shuffle double-precision (64-bit) floating-point elements using the control in "imm8", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). </description>
+	<operation>
+tmp_dst[63:0] := (imm8[0] == 0) ? a[63:0] : a[127:64]
+tmp_dst[127:64] := (imm8[1] == 0) ? b[63:0] : b[127:64]
+
+FOR j := 0 to 1
+	i := j*64
+	IF k[j]
+		dst[i+63:i] := tmp_dst[i+63:i]
+	ELSE
+		dst[i+63:i] := 0
+	FI
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vshufpd"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256" name="_mm256_mask_shuffle_ps">
+	<type>Floating Point</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="src" type="__m256"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m256"/>
+	<parameter varname="b" type="__m256"/>
+	<parameter varname="imm8" type="const int"/>
+	<description>Shuffle single-precision (32-bit) floating-point elements in "a" and "b" within 128-bit lanes using the control in "imm8", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+	<operation>
+SELECT4(src, control){
+	CASE(control[1:0])
+	0:	tmp[31:0] := src[31:0]
+	1:	tmp[31:0] := src[63:32]
+	2:	tmp[31:0] := src[95:64]
+	3:	tmp[31:0] := src[127:96]
+	ESAC
+	RETURN tmp[31:0]
+}
+
+tmp_dst[31:0] := SELECT4(a[127:0], imm8[1:0])
+tmp_dst[63:32] := SELECT4(a[127:0], imm8[3:2])
+tmp_dst[95:64] := SELECT4(b[127:0], imm8[5:4])
+tmp_dst[127:96] := SELECT4(b[127:0], imm8[7:6])
+tmp_dst[159:128] := SELECT4(a[255:128], imm8[1:0])
+tmp_dst[191:160] := SELECT4(a[255:128], imm8[3:2])
+tmp_dst[223:192] := SELECT4(b[255:128], imm8[5:4])
+tmp_dst[255:224] := SELECT4(b[255:128], imm8[7:6])
+
+FOR j := 0 to 7
+	i := j*32
+	IF k[j]
+		dst[i+31:i] := tmp_dst[i+31:i]
+	ELSE
+		dst[i+31:i] := src[i+31:i]
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vshufps"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256" name="_mm256_maskz_shuffle_ps">
+	<type>Floating Point</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m256"/>
+	<parameter varname="b" type="__m256"/>
+	<parameter varname="imm8" type="const int"/>
+	<description>Shuffle single-precision (32-bit) floating-point elements in "a" and "b" within 128-bit lanes using the control in "imm8", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+	<operation>
+SELECT4(src, control){
+	CASE(control[1:0])
+	0:	tmp[31:0] := src[31:0]
+	1:	tmp[31:0] := src[63:32]
+	2:	tmp[31:0] := src[95:64]
+	3:	tmp[31:0] := src[127:96]
+	ESAC
+	RETURN tmp[31:0]
+}
+
+tmp_dst[31:0] := SELECT4(a[127:0], imm8[1:0])
+tmp_dst[63:32] := SELECT4(a[127:0], imm8[3:2])
+tmp_dst[95:64] := SELECT4(b[127:0], imm8[5:4])
+tmp_dst[127:96] := SELECT4(b[127:0], imm8[7:6])
+tmp_dst[159:128] := SELECT4(a[255:128], imm8[1:0])
+tmp_dst[191:160] := SELECT4(a[255:128], imm8[3:2])
+tmp_dst[223:192] := SELECT4(b[255:128], imm8[5:4])
+tmp_dst[255:224] := SELECT4(b[255:128], imm8[7:6])
+
+FOR j := 0 to 7
+	i := j*32
+	IF k[j]
+		dst[i+31:i] := tmp_dst[i+31:i]
+	ELSE
+		dst[i+31:i] := 0
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vshufps"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128" name="_mm_mask_shuffle_ps">
+	<type>Floating Point</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="src" type="__m128"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128"/>
+	<parameter varname="b" type="__m128"/>
+	<parameter varname="imm8" type="const int"/>
+	<description>Shuffle single-precision (32-bit) floating-point elements in "a" and "b" using the control in "imm8", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+	<operation>
+SELECT4(src, control){
+	CASE(control[1:0])
+	0:	tmp[31:0] := src[31:0]
+	1:	tmp[31:0] := src[63:32]
+	2:	tmp[31:0] := src[95:64]
+	3:	tmp[31:0] := src[127:96]
+	ESAC
+	RETURN tmp[31:0]
+}
+
+tmp_dst[31:0] := SELECT4(a[127:0], imm8[1:0])
+tmp_dst[63:32] := SELECT4(a[127:0], imm8[3:2])
+tmp_dst[95:64] := SELECT4(b[127:0], imm8[5:4])
+tmp_dst[127:96] := SELECT4(b[127:0], imm8[7:6])
+
+FOR j := 0 to 3
+	i := j*32
+	IF k[j]
+		dst[i+31:i] := tmp_dst[i+31:i]
+	ELSE
+		dst[i+31:i] := src[i+31:i]
+	FI
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vshufps"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128" name="_mm_maskz_shuffle_ps">
+	<type>Floating Point</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128"/>
+	<parameter varname="b" type="__m128"/>
+	<parameter varname="imm8" type="const int"/>
+	<description>Shuffle single-precision (32-bit) floating-point elements in "a" and "b" using the control in "imm8", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+	<operation>
+SELECT4(src, control){
+	CASE(control[1:0])
+	0:	tmp[31:0] := src[31:0]
+	1:	tmp[31:0] := src[63:32]
+	2:	tmp[31:0] := src[95:64]
+	3:	tmp[31:0] := src[127:96]
+	ESAC
+	RETURN tmp[31:0]
+}
+
+tmp_dst[31:0] := SELECT4(a[127:0], imm8[1:0])
+tmp_dst[63:32] := SELECT4(a[127:0], imm8[3:2])
+tmp_dst[95:64] := SELECT4(b[127:0], imm8[5:4])
+tmp_dst[127:96] := SELECT4(b[127:0], imm8[7:6])
+
+FOR j := 0 to 3
+	i := j*32
+	IF k[j]
+		dst[i+31:i] := tmp_dst[i+31:i]
+	ELSE
+		dst[i+31:i] := 0
+	FI
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vshufps"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256d" name="_mm256_mask_sqrt_pd">
+	<type>Floating Point</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="src" type="__m256d"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m256d"/>
+	<description>Compute the square root of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 3
+	i := j*64
+	IF k[j]
+		dst[i+63:i] := SQRT(a[i+63:i])
+	ELSE
+		dst[i+63:i] := src[i+63:i]
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vsqrtpd"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256d" name="_mm256_maskz_sqrt_pd">
+	<type>Floating Point</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m256d"/>
+	<description>Compute the square root of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 3
+	i := j*64
+	IF k[j]
+		dst[i+63:i] := SQRT(a[i+63:i])
+	ELSE
+		dst[i+63:i] := 0
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vsqrtpd"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128d" name="_mm_mask_sqrt_pd">
+	<type>Floating Point</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="src" type="__m128d"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128d"/>
+	<description>Compute the square root of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 1
+	i := j*64
+	IF k[j]
+		dst[i+63:i] := SQRT(a[i+63:i])
+	ELSE
+		dst[i+63:i] := src[i+63:i]
+	FI
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vsqrtpd"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128d" name="_mm_maskz_sqrt_pd">
+	<type>Floating Point</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128d"/>
+	<description>Compute the square root of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 1
+	i := j*64
+	IF k[j]
+		dst[i+63:i] := SQRT(a[i+63:i])
+	ELSE
+		dst[i+63:i] := 0
+	FI
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vsqrtpd"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256" name="_mm256_mask_sqrt_ps">
+	<type>Floating Point</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="src" type="__m256"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m256"/>
+	<description>Compute the square root of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*32
+	IF k[j]
+		dst[i+31:i] := SQRT(a[i+31:i])
+	ELSE
+		dst[i+31:i] := src[i+31:i]
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vsqrtps"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256" name="_mm256_maskz_sqrt_ps">
+	<type>Floating Point</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m256"/>
+	<description>Compute the square root of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*32
+	IF k[j]
+		dst[i+31:i] := SQRT(a[i+31:i])
+	ELSE
+		dst[i+31:i] := 0
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vsqrtps"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128" name="_mm_mask_sqrt_ps">
+	<type>Floating Point</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="src" type="__m128"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128"/>
+	<description>Compute the square root of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 3
+	i := j*32
+	IF k[j]
+		dst[i+31:i] := SQRT(a[i+31:i])
+	ELSE
+		dst[i+31:i] := src[i+31:i]
+	FI
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vsqrtps"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128" name="_mm_maskz_sqrt_ps">
+	<type>Floating Point</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128"/>
+	<description>Compute the square root of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 3
+	i := j*32
+	IF k[j]
+		dst[i+31:i] := SQRT(a[i+31:i])
+	ELSE
+		dst[i+31:i] := 0
+	FI
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vsqrtps"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256d" name="_mm256_mask_sub_pd">
+	<type>Floating Point</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="src" type="__m256d"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m256d"/>
+	<parameter varname="b" type="__m256d"/>
+	<description>Subtract packed double-precision (64-bit) floating-point elements in "b" from packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 3
+	i := j*64
+	IF k[j]
+		dst[i+63:i] := a[i+63:i] - b[i+63:i]
+	ELSE
+		dst[i+63:i] := src[i+63:i]
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vsubpd"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256d" name="_mm256_maskz_sub_pd">
+	<type>Floating Point</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m256d"/>
+	<parameter varname="b" type="__m256d"/>
+	<description>Subtract packed double-precision (64-bit) floating-point elements in "b" from packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 3
+	i := j*64
+	IF k[j]
+		dst[i+63:i] := a[i+63:i] - b[i+63:i]
+	ELSE
+		dst[i+63:i] := 0
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vsubpd"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128d" name="_mm_mask_sub_pd">
+	<type>Floating Point</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="src" type="__m128d"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128d"/>
+	<parameter varname="b" type="__m128d"/>
+	<description>Subtract packed double-precision (64-bit) floating-point elements in "b" from packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 1
+	i := j*64
+	IF k[j]
+		dst[i+63:i] := a[i+63:i] - b[i+63:i]
+	ELSE
+		dst[i+63:i] := src[i+63:i]
+	FI
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vsubpd"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128d" name="_mm_maskz_sub_pd">
+	<type>Floating Point</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128d"/>
+	<parameter varname="b" type="__m128d"/>
+	<description>Subtract packed double-precision (64-bit) floating-point elements in "b" from packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 1
+	i := j*64
+	IF k[j]
+		dst[i+63:i] := a[i+63:i] - b[i+63:i]
+	ELSE
+		dst[i+63:i] := 0
+	FI
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vsubpd"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256" name="_mm256_mask_sub_ps">
+	<type>Floating Point</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="src" type="__m256"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m256"/>
+	<parameter varname="b" type="__m256"/>
+	<description>Subtract packed single-precision (32-bit) floating-point elements in "b" from packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*32
+	IF k[j]
+		dst[i+31:i] := a[i+31:i] - b[i+31:i]
+	ELSE
+		dst[i+31:i] := src[i+31:i]
+	FI	
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vsubps"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256" name="_mm256_maskz_sub_ps">
+	<type>Floating Point</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m256"/>
+	<parameter varname="b" type="__m256"/>
+	<description>Subtract packed single-precision (32-bit) floating-point elements in "b" from packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*32
+	IF k[j]
+		dst[i+31:i] := a[i+31:i] - b[i+31:i]
+	ELSE
+		dst[i+31:i] := 0
+	FI	
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vsubps"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128" name="_mm_mask_sub_ps">
+	<type>Floating Point</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="src" type="__m128"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128"/>
+	<parameter varname="b" type="__m128"/>
+	<description>Subtract packed single-precision (32-bit) floating-point elements in "b" from packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 3
+	i := j*32
+	IF k[j]
+		dst[i+31:i] := a[i+31:i] - b[i+31:i]
+	ELSE
+		dst[i+31:i] := src[i+31:i]
+	FI	
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vsubps"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128" name="_mm_maskz_sub_ps">
+	<type>Floating Point</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128"/>
+	<parameter varname="b" type="__m128"/>
+	<description>Subtract packed single-precision (32-bit) floating-point elements in "b" from packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 3
+	i := j*32
+	IF k[j]
+		dst[i+31:i] := a[i+31:i] - b[i+31:i]
+	ELSE
+		dst[i+31:i] := 0
+	FI	
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vsubps"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256d" name="_mm256_mask_unpackhi_pd">
+	<type>Floating Point</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="src" type="__m256d"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m256d"/>
+	<parameter varname="b" type="__m256d"/>
+	<description>Unpack and interleave double-precision (64-bit) floating-point elements from the high half of each 128-bit lane in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). </description>
+	<operation>
+INTERLEAVE_HIGH_QWORDS(src1[127:0], src2[127:0]){
+	dst[63:0] := src1[127:64] 
+	dst[127:64] := src2[127:64] 
+	RETURN dst[127:0]
+}	
+
+tmp_dst[127:0] := INTERLEAVE_HIGH_QWORDS(a[127:0], b[127:0])
+tmp_dst[255:128] := INTERLEAVE_HIGH_QWORDS(a[255:128], b[255:128])
+
+FOR j := 0 to 3
+	i := j*64
+	IF k[j]
+		dst[i+63:i] := tmp_dst[i+63:i]
+	ELSE
+		dst[i+63:i] := src[i+63:i]
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vunpckhpd"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256d" name="_mm256_maskz_unpackhi_pd">
+	<type>Floating Point</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m256d"/>
+	<parameter varname="b" type="__m256d"/>
+	<description>Unpack and interleave double-precision (64-bit) floating-point elements from the high half of each 128-bit lane in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). </description>
+	<operation>
+INTERLEAVE_HIGH_QWORDS(src1[127:0], src2[127:0]){
+	dst[63:0] := src1[127:64] 
+	dst[127:64] := src2[127:64] 
+	RETURN dst[127:0]
+}	
+
+tmp_dst[127:0] := INTERLEAVE_HIGH_QWORDS(a[127:0], b[127:0])
+tmp_dst[255:128] := INTERLEAVE_HIGH_QWORDS(a[255:128], b[255:128])
+
+FOR j := 0 to 3
+	i := j*64
+	IF k[j]
+		dst[i+63:i] := tmp_dst[i+63:i]
+	ELSE
+		dst[i+63:i] := 0
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vunpckhpd"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128d" name="_mm_mask_unpackhi_pd">
+	<type>Floating Point</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="src" type="__m128d"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128d"/>
+	<parameter varname="b" type="__m128d"/>
+	<description>Unpack and interleave double-precision (64-bit) floating-point elements from the high half of "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). </description>
+	<operation>
+INTERLEAVE_HIGH_QWORDS(src1[127:0], src2[127:0]){
+	dst[63:0] := src1[127:64] 
+	dst[127:64] := src2[127:64] 
+	RETURN dst[127:0]
+}	
+
+tmp_dst[127:0] := INTERLEAVE_HIGH_QWORDS(a[127:0], b[127:0])
+
+FOR j := 0 to 1
+	i := j*64
+	IF k[j]
+		dst[i+63:i] := tmp_dst[i+63:i]
+	ELSE
+		dst[i+63:i] := src[i+63:i]
+	FI
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vunpckhpd"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128d" name="_mm_maskz_unpackhi_pd">
+	<type>Floating Point</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128d"/>
+	<parameter varname="b" type="__m128d"/>
+	<description>Unpack and interleave double-precision (64-bit) floating-point elements from the high half of "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). </description>
+	<operation>
+INTERLEAVE_HIGH_QWORDS(src1[127:0], src2[127:0]){
+	dst[63:0] := src1[127:64] 
+	dst[127:64] := src2[127:64] 
+	RETURN dst[127:0]
+}	
+
+tmp_dst[127:0] := INTERLEAVE_HIGH_QWORDS(a[127:0], b[127:0])
+
+FOR j := 0 to 1
+	i := j*64
+	IF k[j]
+		dst[i+63:i] := tmp_dst[i+63:i]
+	ELSE
+		dst[i+63:i] := 0
+	FI
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vunpckhpd"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256" name="_mm256_mask_unpackhi_ps">
+	<type>Floating Point</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="src" type="__m256"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m256"/>
+	<parameter varname="b" type="__m256"/>
+	<description>Unpack and interleave single-precision (32-bit) floating-point elements from the high half of each 128-bit lane in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). </description>
+	<operation>
+INTERLEAVE_HIGH_DWORDS(src1[127:0], src2[127:0]){
+	dst[31:0] := src1[95:64] 
+	dst[63:32] := src2[95:64] 
+	dst[95:64] := src1[127:96] 
+	dst[127:96] := src2[127:96] 
+	RETURN dst[127:0]
+}	
+
+tmp_dst[127:0] := INTERLEAVE_HIGH_DWORDS(a[127:0], b[127:0])
+tmp_dst[255:128] := INTERLEAVE_HIGH_DWORDS(a[255:128], b[255:128])
+
+FOR j := 0 to 7
+	i := j*32
+	IF k[j]
+		dst[i+31:i] := tmp_dst[i+31:i]
+	ELSE
+		dst[i+31:i] := src[i+31:i]
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vunpckhps"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256" name="_mm256_maskz_unpackhi_ps">
+	<type>Floating Point</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m256"/>
+	<parameter varname="b" type="__m256"/>
+	<description>Unpack and interleave single-precision (32-bit) floating-point elements from the high half of each 128-bit lane in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). </description>
+	<operation>
+INTERLEAVE_HIGH_DWORDS(src1[127:0], src2[127:0]){
+	dst[31:0] := src1[95:64] 
+	dst[63:32] := src2[95:64] 
+	dst[95:64] := src1[127:96] 
+	dst[127:96] := src2[127:96] 
+	RETURN dst[127:0]
+}	
+
+tmp_dst[127:0] := INTERLEAVE_HIGH_DWORDS(a[127:0], b[127:0])
+tmp_dst[255:128] := INTERLEAVE_HIGH_DWORDS(a[255:128], b[255:128])
+
+FOR j := 0 to 7
+	i := j*32
+	IF k[j]
+		dst[i+31:i] := tmp_dst[i+31:i]
+	ELSE
+		dst[i+31:i] := 0
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vunpckhps"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128" name="_mm_mask_unpackhi_ps">
+	<type>Floating Point</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="src" type="__m128"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128"/>
+	<parameter varname="b" type="__m128"/>
+	<description>Unpack and interleave single-precision (32-bit) floating-point elements from the high half of "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). </description>
+	<operation>
+INTERLEAVE_HIGH_DWORDS(src1[127:0], src2[127:0]){
+	dst[31:0] := src1[95:64] 
+	dst[63:32] := src2[95:64] 
+	dst[95:64] := src1[127:96] 
+	dst[127:96] := src2[127:96] 
+	RETURN dst[127:0]
+}	
+
+tmp_dst[127:0] := INTERLEAVE_HIGH_DWORDS(a[127:0], b[127:0])
+
+FOR j := 0 to 3
+	i := j*32
+	IF k[j]
+		dst[i+31:i] := tmp_dst[i+31:i]
+	ELSE
+		dst[i+31:i] := src[i+31:i]
+	FI
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vunpckhps"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128" name="_mm_maskz_unpackhi_ps">
+	<type>Floating Point</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128"/>
+	<parameter varname="b" type="__m128"/>
+	<description>Unpack and interleave single-precision (32-bit) floating-point elements from the high half of "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). </description>
+	<operation>
+INTERLEAVE_HIGH_DWORDS(src1[127:0], src2[127:0]){
+	dst[31:0] := src1[95:64] 
+	dst[63:32] := src2[95:64] 
+	dst[95:64] := src1[127:96] 
+	dst[127:96] := src2[127:96] 
+	RETURN dst[127:0]
+}	
+
+tmp_dst[127:0] := INTERLEAVE_HIGH_DWORDS(a[127:0], b[127:0])
+
+FOR j := 0 to 3
+	i := j*32
+	IF k[j]
+		dst[i+31:i] := tmp_dst[i+31:i]
+	ELSE
+		dst[i+31:i] := 0
+	FI
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vunpckhps"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256d" name="_mm256_mask_unpacklo_pd">
+	<type>Floating Point</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="src" type="__m256d"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m256d"/>
+	<parameter varname="b" type="__m256d"/>
+	<description>Unpack and interleave double-precision (64-bit) floating-point elements from the low half of each 128-bit lane in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). </description>
+	<operation>
+INTERLEAVE_QWORDS(src1[127:0], src2[127:0]){
+	dst[63:0] := src1[63:0] 
+	dst[127:64] := src2[63:0] 
+	RETURN dst[127:0]
+}
+
+tmp_dst[127:0] := INTERLEAVE_QWORDS(a[127:0], b[127:0])
+tmp_dst[255:128] := INTERLEAVE_QWORDS(a[255:128], b[255:128])
+
+FOR j := 0 to 3
+	i := j*64
+	IF k[j]
+		dst[i+63:i] := tmp_dst[i+63:i]
+	ELSE
+		dst[i+63:i] := src[i+63:i]
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vunpcklpd"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256d" name="_mm256_maskz_unpacklo_pd">
+	<type>Floating Point</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m256d"/>
+	<parameter varname="b" type="__m256d"/>
+	<description>Unpack and interleave double-precision (64-bit) floating-point elements from the low half of each 128-bit lane in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). </description>
+	<operation>
+INTERLEAVE_QWORDS(src1[127:0], src2[127:0]){
+	dst[63:0] := src1[63:0] 
+	dst[127:64] := src2[63:0] 
+	RETURN dst[127:0]
+}
+
+tmp_dst[127:0] := INTERLEAVE_QWORDS(a[127:0], b[127:0])
+tmp_dst[255:128] := INTERLEAVE_QWORDS(a[255:128], b[255:128])
+
+FOR j := 0 to 3
+	i := j*64
+	IF k[j]
+		dst[i+63:i] := tmp_dst[i+63:i]
+	ELSE
+		dst[i+63:i] := 0
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vunpcklpd"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128d" name="_mm_mask_unpacklo_pd">
+	<type>Floating Point</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="src" type="__m128d"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128d"/>
+	<parameter varname="b" type="__m128d"/>
+	<description>Unpack and interleave double-precision (64-bit) floating-point elements from the low half of "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). </description>
+	<operation>
+INTERLEAVE_QWORDS(src1[127:0], src2[127:0]){
+	dst[63:0] := src1[63:0] 
+	dst[127:64] := src2[63:0] 
+	RETURN dst[127:0]
+}
+
+tmp_dst[127:0] := INTERLEAVE_QWORDS(a[127:0], b[127:0])
+
+FOR j := 0 to 1
+	i := j*64
+	IF k[j]
+		dst[i+63:i] := tmp_dst[i+63:i]
+	ELSE
+		dst[i+63:i] := src[i+63:i]
+	FI
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vunpcklpd"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128d" name="_mm_maskz_unpacklo_pd">
+	<type>Floating Point</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128d"/>
+	<parameter varname="b" type="__m128d"/>
+	<description>Unpack and interleave double-precision (64-bit) floating-point elements from the low half of "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). </description>
+	<operation>
+INTERLEAVE_QWORDS(src1[127:0], src2[127:0]){
+	dst[63:0] := src1[63:0] 
+	dst[127:64] := src2[63:0] 
+	RETURN dst[127:0]
+}
+
+tmp_dst[127:0] := INTERLEAVE_QWORDS(a[127:0], b[127:0])
+
+FOR j := 0 to 1
+	i := j*64
+	IF k[j]
+		dst[i+63:i] := tmp_dst[i+63:i]
+	ELSE
+		dst[i+63:i] := 0
+	FI
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vunpcklpd"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256" name="_mm256_mask_unpacklo_ps">
+	<type>Floating Point</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="src" type="__m256"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m256"/>
+	<parameter varname="b" type="__m256"/>
+	<description>Unpack and interleave single-precision (32-bit) floating-point elements from the low half of each 128-bit lane in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). </description>
+	<operation>
+INTERLEAVE_DWORDS(src1[127:0], src2[127:0]){
+	dst[31:0] := src1[31:0] 
+	dst[63:32] := src2[31:0] 
+	dst[95:64] := src1[63:32] 
+	dst[127:96] := src2[63:32] 
+	RETURN dst[127:0]
+}	
+
+tmp_dst[127:0] := INTERLEAVE_DWORDS(a[127:0], b[127:0])
+tmp_dst[255:128] := INTERLEAVE_DWORDS(a[255:128], b[255:128])
+
+FOR j := 0 to 7
+	i := j*32
+	IF k[j]
+		dst[i+31:i] := tmp_dst[i+31:i]
+	ELSE
+		dst[i+31:i] := src[i+31:i]
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vunpcklps"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256" name="_mm256_maskz_unpacklo_ps">
+	<type>Floating Point</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m256"/>
+	<parameter varname="b" type="__m256"/>
+	<description>Unpack and interleave single-precision (32-bit) floating-point elements from the low half of each 128-bit lane in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). </description>
+	<operation>
+INTERLEAVE_DWORDS(src1[127:0], src2[127:0]){
+	dst[31:0] := src1[31:0] 
+	dst[63:32] := src2[31:0] 
+	dst[95:64] := src1[63:32] 
+	dst[127:96] := src2[63:32] 
+	RETURN dst[127:0]
+}	
+
+tmp_dst[127:0] := INTERLEAVE_DWORDS(a[127:0], b[127:0])
+tmp_dst[255:128] := INTERLEAVE_DWORDS(a[255:128], b[255:128])
+
+FOR j := 0 to 7
+	i := j*32
+	IF k[j]
+		dst[i+31:i] := tmp_dst[i+31:i]
+	ELSE
+		dst[i+31:i] := 0
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vunpcklps"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128" name="_mm_mask_unpacklo_ps">
+	<type>Floating Point</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="src" type="__m128"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128"/>
+	<parameter varname="b" type="__m128"/>
+	<description>Unpack and interleave single-precision (32-bit) floating-point elements from the low half of "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). </description>
+	<operation>
+INTERLEAVE_DWORDS(src1[127:0], src2[127:0]){
+	dst[31:0] := src1[31:0] 
+	dst[63:32] := src2[31:0] 
+	dst[95:64] := src1[63:32] 
+	dst[127:96] := src2[63:32] 
+	RETURN dst[127:0]
+}	
+
+tmp_dst[127:0] := INTERLEAVE_DWORDS(a[127:0], b[127:0])
+
+FOR j := 0 to 3
+	i := j*32
+	IF k[j]
+		dst[i+31:i] := tmp_dst[i+31:i]
+	ELSE
+		dst[i+31:i] := src[i+31:i]
+	FI
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vunpcklps"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128" name="_mm_maskz_unpacklo_ps">
+	<type>Floating Point</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512F</CPUID>
+	<category>Miscellaneous</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128"/>
+	<parameter varname="b" type="__m128"/>
+	<description>Unpack and interleave single-precision (32-bit) floating-point elements from the low half of "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). </description>
+	<operation>
+INTERLEAVE_DWORDS(src1[127:0], src2[127:0]){
+	dst[31:0] := src1[31:0] 
+	dst[63:32] := src2[31:0] 
+	dst[95:64] := src1[63:32] 
+	dst[127:96] := src2[63:32] 
+	RETURN dst[127:0]
+}	
+
+tmp_dst[127:0] := INTERLEAVE_DWORDS(a[127:0], b[127:0])
+
+FOR j := 0 to 3
+	i := j*32
+	IF k[j]
+		dst[i+31:i] := tmp_dst[i+31:i]
+	ELSE
+		dst[i+31:i] := 0
+	FI
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vunpcklps"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256d" name="_mm256_mask_xor_pd">
+	<type>Floating Point</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512DQ</CPUID>
+	<category>Logical</category>
+	<parameter varname="src" type="__m256d"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m256d"/>
+	<parameter varname="b" type="__m256d"/>
+	<description>Compute the bitwise XOR of packed double-precision (64-bit) floating-point elements in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 3
+	i := j*64
+	IF k[j]
+		dst[i+63:i] := a[i+63:i] XOR b[i+63:i]
+	ELSE
+		dst[i+63:i] := src[i+63:i]
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vxorpd"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256d" name="_mm256_maskz_xor_pd">
+	<type>Floating Point</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512DQ</CPUID>
+	<category>Logical</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m256d"/>
+	<parameter varname="b" type="__m256d"/>
+	<description>Compute the bitwise XOR of packed double-precision (64-bit) floating-point elements in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 3
+	i := j*64
+	IF k[j]
+		dst[i+63:i] := a[i+63:i] XOR b[i+63:i]
+	ELSE
+		dst[i+63:i] := 0
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vxorpd"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512d" name="_mm512_mask_xor_pd">
+	<type>Floating Point</type>
+	<CPUID>AVX512DQ</CPUID>
+	<category>Logical</category>
+	<parameter varname="src" type="__m512d"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m512d"/>
+	<parameter varname="b" type="__m512d"/>
+	<description>Compute the bitwise XOR of packed double-precision (64-bit) floating-point elements in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 7
+	i := j*64
+	IF k[j]
+		dst[i+63:i] := a[i+63:i] XOR b[i+63:i]
+	ELSE
+		dst[i+63:i] := src[i+63:i]
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name="vxorpd"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512d" name="_mm512_maskz_xor_pd">
+	<type>Floating Point</type>
+	<CPUID>AVX512DQ</CPUID>
+	<category>Logical</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m512d"/>
+	<parameter varname="b" type="__m512d"/>
+	<description>Compute the bitwise XOR of packed double-precision (64-bit) floating-point elements in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).
+	</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*64
+	IF k[j]
+		dst[i+63:i] := a[i+63:i] XOR b[i+63:i]
+	ELSE
+		dst[i+63:i] := 0
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name="vxorpd"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512d" name="_mm512_xor_pd">
+	<type>Floating Point</type>
+	<CPUID>AVX512DQ</CPUID>
+	<category>Logical</category>
+	<parameter varname="a" type="__m512d"/>
+	<parameter varname="b" type="__m512d"/>
+	<description>Compute the bitwise XOR of packed double-precision (64-bit) floating-point elements in "a" and "b", and store the results in "dst".
+	</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*64
+	dst[i+63:i] := a[i+63:i] XOR b[i+63:i]
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name="vxorpd"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128d" name="_mm_mask_xor_pd">
+	<type>Floating Point</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512DQ</CPUID>
+	<category>Logical</category>
+	<parameter varname="src" type="__m128d"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128d"/>
+	<parameter varname="b" type="__m128d"/>
+	<description>Compute the bitwise XOR of packed double-precision (64-bit) floating-point elements in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 1
+	i := j*64
+	IF k[j]
+		dst[i+63:i] := a[i+63:i] XOR b[i+63:i]
+	ELSE
+		dst[i+63:i] := src[i+63:i]
+	FI
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vxorpd"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128d" name="_mm_maskz_xor_pd">
+	<type>Floating Point</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512DQ</CPUID>
+	<category>Logical</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128d"/>
+	<parameter varname="b" type="__m128d"/>
+	<description>Compute the bitwise XOR of packed double-precision (64-bit) floating-point elements in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 1
+	i := j*64
+	IF k[j]
+		dst[i+63:i] := a[i+63:i] XOR b[i+63:i]
+	ELSE
+		dst[i+63:i] := 0
+	FI
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vxorpd"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256" name="_mm256_mask_xor_ps">
+	<type>Floating Point</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512DQ</CPUID>
+	<category>Logical</category>
+	<parameter varname="src" type="__m256"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m256"/>
+	<parameter varname="b" type="__m256"/>
+	<description>Compute the bitwise XOR of packed single-precision (32-bit) floating-point elements in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 7
+	i := j*32
+	IF k[j]
+		dst[i+31:i] := a[i+31:i] XOR b[i+31:i]
+	ELSE
+		dst[i+31:i] := src[i+31:i]
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vxorps"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256" name="_mm256_maskz_xor_ps">
+	<type>Floating Point</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512DQ</CPUID>
+	<category>Logical</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m256"/>
+	<parameter varname="b" type="__m256"/>
+	<description>Compute the bitwise XOR of packed single-precision (32-bit) floating-point elements in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*32
+	IF k[j]
+		dst[i+31:i] := a[i+31:i] XOR b[i+31:i]
+	ELSE
+		dst[i+31:i] := 0
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vxorps"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512" name="_mm512_mask_xor_ps">
+	<type>Floating Point</type>
+	<CPUID>AVX512DQ</CPUID>
+	<category>Logical</category>
+	<parameter varname="src" type="__m512"/>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="a" type="__m512"/>
+	<parameter varname="b" type="__m512"/>
+	<description>Compute the bitwise XOR of packed single-precision (32-bit) floating-point elements in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 15
+	i := j*32
+	IF k[j]
+		dst[i+31:i] := a[i+31:i] XOR b[i+31:i]
+	ELSE
+		dst[i+31:i] := src[i+31:i]
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name="vxorps"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512" name="_mm512_maskz_xor_ps">
+	<type>Floating Point</type>
+	<CPUID>AVX512DQ</CPUID>
+	<category>Logical</category>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="a" type="__m512"/>
+	<parameter varname="b" type="__m512"/>
+	<description>Compute the bitwise XOR of packed single-precision (32-bit) floating-point elements in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).
+	</description>
+	<operation>
+FOR j := 0 to 15
+	i := j*32
+	IF k[j]
+		dst[i+31:i] := a[i+31:i] XOR b[i+31:i]
+	ELSE
+		dst[i+31:i] := 0
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name="vxorps"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512" name="_mm512_xor_ps">
+	<type>Floating Point</type>
+	<CPUID>AVX512DQ</CPUID>
+	<category>Logical</category>
+	<parameter varname="a" type="__m512"/>
+	<parameter varname="b" type="__m512"/>
+	<description>Compute the bitwise XOR of packed single-precision (32-bit) floating-point elements in "a" and "b", and store the results in "dst".
+	</description>
+	<operation>
+FOR j := 0 to 15
+	i := j*32
+	dst[i+31:i] := a[i+31:i] XOR b[i+31:i]
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name="vxorps"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128" name="_mm_mask_xor_ps">
+	<type>Floating Point</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512DQ</CPUID>
+	<category>Logical</category>
+	<parameter varname="src" type="__m128"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128"/>
+	<parameter varname="b" type="__m128"/>
+	<description>Compute the bitwise XOR of packed single-precision (32-bit) floating-point elements in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). </description>
+	<operation>
+FOR j := 0 to 3
+	i := j*32
+	IF k[j]
+		dst[i+31:i] := a[i+31:i] XOR b[i+31:i]
+	ELSE
+		dst[i+31:i] := src[i+31:i]
+	FI
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vxorps"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128" name="_mm_maskz_xor_ps">
+	<type>Floating Point</type>
+	<CPUID>AVX512VL</CPUID>
+	<CPUID>AVX512DQ</CPUID>
+	<category>Logical</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128"/>
+	<parameter varname="b" type="__m128"/>
+	<description>Compute the bitwise XOR of packed single-precision (32-bit) floating-point elements in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 3
+	i := j*32
+	IF k[j]
+		dst[i+31:i] := a[i+31:i] XOR b[i+31:i]
+	ELSE
+		dst[i+31:i] := 0
+	FI
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vxorps"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512i" name="_mm512_madd52lo_epu64">
+	<CPUID>AVX512IFMA52</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="a" type="__m512i"/>
+	<parameter varname="b" type="__m512i"/>
+	<parameter varname="c" type="__m512i"/>
+	<description>Multiply packed unsigned 52-bit integers in each 64-bit element of "b" and "c" to form a 104-bit intermediate result. Add the low 52-bit unsigned integer from the intermediate result with the corresponding unsigned 64-bit integer in "a", and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*64
+	tmp[127:0] := ZeroExtend64(b[i+51:i]) * ZeroExtend64(c[i+51:i])
+	dst[i+63:i] := a[i+63:i] + ZeroExtend64(tmp[51:0])
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name="vpmadd52luq"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512i" name="_mm512_mask_madd52lo_epu64">
+	<CPUID>AVX512IFMA52</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="a" type="__m512i"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="b" type="__m512i"/>
+	<parameter varname="c" type="__m512i"/>
+	<description>Multiply packed unsigned 52-bit integers in each 64-bit element of "b" and "c" to form a 104-bit intermediate result. Add the low 52-bit unsigned integer from the intermediate result with the corresponding unsigned 64-bit integer in "a", and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*64
+	IF k[j]
+		tmp[127:0] := ZeroExtend64(b[i+51:i]) * ZeroExtend64(c[i+51:i])
+		dst[i+63:i] := a[i+63:i] + ZeroExtend64(tmp[51:0])
+	ELSE
+		dst[i+63:i] := a[i+63:i]
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name="vpmadd52luq"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512i" name="_mm512_maskz_madd52lo_epu64">
+	<CPUID>AVX512IFMA52</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m512i"/>
+	<parameter varname="b" type="__m512i"/>
+	<parameter varname="c" type="__m512i"/>
+	<description>Multiply packed unsigned 52-bit integers in each 64-bit element of "b" and "c" to form a 104-bit intermediate result. Add the low 52-bit unsigned integer from the intermediate result with the corresponding unsigned 64-bit integer in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*64
+	IF k[j]
+		tmp[127:0] := ZeroExtend64(b[i+51:i]) * ZeroExtend64(c[i+51:i])
+		dst[i+63:i] := a[i+63:i] + ZeroExtend64(tmp[51:0])
+	ELSE
+		dst[i+63:i] := 0
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name="vpmadd52luq"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256i" name="_mm256_madd52lo_epu64">
+	<CPUID>AVX512IFMA52</CPUID>
+	<CPUID>AVX512VL</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="a" type="__m256i"/>
+	<parameter varname="b" type="__m256i"/>
+	<parameter varname="c" type="__m256i"/>
+	<description>Multiply packed unsigned 52-bit integers in each 64-bit element of "b" and "c" to form a 104-bit intermediate result. Add the low 52-bit unsigned integer from the intermediate result with the corresponding unsigned 64-bit integer in "a", and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 3
+	i := j*64
+	tmp[127:0] := ZeroExtend64(b[i+51:i]) * ZeroExtend64(c[i+51:i])
+	dst[i+63:i] := a[i+63:i] + ZeroExtend64(tmp[51:0])
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vpmadd52luq"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256i" name="_mm256_mask_madd52lo_epu64">
+	<CPUID>AVX512IFMA52</CPUID>
+	<CPUID>AVX512VL</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="a" type="__m256i"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="b" type="__m256i"/>
+	<parameter varname="c" type="__m256i"/>
+	<description>Multiply packed unsigned 52-bit integers in each 64-bit element of "b" and "c" to form a 104-bit intermediate result. Add the low 52-bit unsigned integer from the intermediate result with the corresponding unsigned 64-bit integer in "a", and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 3
+	i := j*64
+	IF k[j]
+		tmp[127:0] := ZeroExtend64(b[i+51:i]) * ZeroExtend64(c[i+51:i])
+		dst[i+63:i] := a[i+63:i] + ZeroExtend64(tmp[51:0])
+	ELSE
+		dst[i+63:i] := a[i+63:i]
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vpmadd52luq"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256i" name="_mm256_maskz_madd52lo_epu64">
+	<CPUID>AVX512IFMA52</CPUID>
+	<CPUID>AVX512VL</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m256i"/>
+	<parameter varname="b" type="__m256i"/>
+	<parameter varname="c" type="__m256i"/>
+	<description>Multiply packed unsigned 52-bit integers in each 64-bit element of "b" and "c" to form a 104-bit intermediate result. Add the low 52-bit unsigned integer from the intermediate result with the corresponding unsigned 64-bit integer in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 3
+	i := j*64
+	IF k[j]
+		tmp[127:0] := ZeroExtend64(b[i+51:i]) * ZeroExtend64(c[i+51:i])
+		dst[i+63:i] := a[i+63:i] + ZeroExtend64(tmp[51:0])
+	ELSE
+		dst[i+63:i] := 0
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vpmadd52luq"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm_madd52lo_epu64">
+	<CPUID>AVX512IFMA52</CPUID>
+	<CPUID>AVX512VL</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="a" type="__m128i"/>
+	<parameter varname="b" type="__m128i"/>
+	<parameter varname="c" type="__m128i"/>
+	<description>Multiply packed unsigned 52-bit integers in each 64-bit element of "b" and "c" to form a 104-bit intermediate result. Add the low 52-bit unsigned integer from the intermediate result with the corresponding unsigned 64-bit integer in "a", and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 1
+	i := j*64
+	tmp[127:0] := ZeroExtend64(b[i+51:i]) * ZeroExtend64(c[i+51:i])
+	dst[i+63:i] := a[i+63:i] + ZeroExtend64(tmp[51:0])
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vpmadd52luq"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm_mask_madd52lo_epu64">
+	<CPUID>AVX512IFMA52</CPUID>
+	<CPUID>AVX512VL</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="a" type="__m128i"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="b" type="__m128i"/>
+	<parameter varname="c" type="__m128i"/>
+	<description>Multiply packed unsigned 52-bit integers in each 64-bit element of "b" and "c" to form a 104-bit intermediate result. Add the low 52-bit unsigned integer from the intermediate result with the corresponding unsigned 64-bit integer in "a", and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 1
+	i := j*64
+	IF k[j]
+		tmp[127:0] := ZeroExtend64(b[i+51:i]) * ZeroExtend64(c[i+51:i])
+		dst[i+63:i] := a[i+63:i] + ZeroExtend64(tmp[51:0])
+	ELSE
+		dst[i+63:i] := a[i+63:i]
+	FI
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vpmadd52luq"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm_maskz_madd52lo_epu64">
+	<CPUID>AVX512IFMA52</CPUID>
+	<CPUID>AVX512VL</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128i"/>
+	<parameter varname="b" type="__m128i"/>
+	<parameter varname="c" type="__m128i"/>
+	<description>Multiply packed unsigned 52-bit integers in each 64-bit element of "b" and "c" to form a 104-bit intermediate result. Add the low 52-bit unsigned integer from the intermediate result with the corresponding unsigned 64-bit integer in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 1
+	i := j*64
+	IF k[j]
+		tmp[127:0] := ZeroExtend64(b[i+51:i]) * ZeroExtend64(c[i+51:i])
+		dst[i+63:i] := a[i+63:i] + ZeroExtend64(tmp[51:0])
+	ELSE
+		dst[i+63:i] := 0
+	FI
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vpmadd52luq"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512i" name="_mm512_madd52hi_epu64">
+	<CPUID>AVX512IFMA52</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="a" type="__m512i"/>
+	<parameter varname="b" type="__m512i"/>
+	<parameter varname="c" type="__m512i"/>
+	<description>Multiply packed unsigned 52-bit integers in each 64-bit element of "b" and "c" to form a 104-bit intermediate result. Add the high 52-bit unsigned integer from the intermediate result with the corresponding unsigned 64-bit integer in "a", and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*64
+	tmp[127:0] := ZeroExtend64(b[i+51:i]) * ZeroExtend64(c[i+51:i])
+	dst[i+63:i] := a[i+63:i] + ZeroExtend64(tmp[103:52])
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name="vpmadd52huq"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512i" name="_mm512_mask_madd52hi_epu64">
+	<CPUID>AVX512IFMA52</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="a" type="__m512i"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="b" type="__m512i"/>
+	<parameter varname="c" type="__m512i"/>
+	<description>Multiply packed unsigned 52-bit integers in each 64-bit element of "b" and "c" to form a 104-bit intermediate result. Add the high 52-bit unsigned integer from the intermediate result with the corresponding unsigned 64-bit integer in "a", and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*64
+	IF k[j]
+		tmp[127:0] := ZeroExtend64(b[i+51:i]) * ZeroExtend64(c[i+51:i])
+		dst[i+63:i] := a[i+63:i] + ZeroExtend64(tmp[103:52])
+	ELSE
+		dst[i+63:i] := a[i+63:i]
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name="vpmadd52huq"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512i" name="_mm512_maskz_madd52hi_epu64">
+	<CPUID>AVX512IFMA52</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m512i"/>
+	<parameter varname="b" type="__m512i"/>
+	<parameter varname="c" type="__m512i"/>
+	<description>Multiply packed unsigned 52-bit integers in each 64-bit element of "b" and "c" to form a 104-bit intermediate result. Add the high 52-bit unsigned integer from the intermediate result with the corresponding unsigned 64-bit integer in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 7
+	i := j*64
+	IF k[j]
+		tmp[127:0] := ZeroExtend64(b[i+51:i]) * ZeroExtend64(c[i+51:i])
+		dst[i+63:i] := a[i+63:i] + ZeroExtend64(tmp[103:52])
+	ELSE
+		dst[i+63:i] := 0
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name="vpmadd52huq"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256i" name="_mm256_madd52hi_epu64">
+	<CPUID>AVX512IFMA52</CPUID>
+	<CPUID>AVX512VL</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="a" type="__m256i"/>
+	<parameter varname="b" type="__m256i"/>
+	<parameter varname="c" type="__m256i"/>
+	<description>Multiply packed unsigned 52-bit integers in each 64-bit element of "b" and "c" to form a 104-bit intermediate result. Add the high 52-bit unsigned integer from the intermediate result with the corresponding unsigned 64-bit integer in "a", and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 3
+	i := j*64
+	tmp[127:0] := ZeroExtend64(b[i+51:i]) * ZeroExtend64(c[i+51:i])
+	dst[i+63:i] := a[i+63:i] + ZeroExtend64(tmp[103:52])
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vpmadd52huq"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256i" name="_mm256_mask_madd52hi_epu64">
+	<CPUID>AVX512IFMA52</CPUID>
+	<CPUID>AVX512VL</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="a" type="__m256i"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="b" type="__m256i"/>
+	<parameter varname="c" type="__m256i"/>
+	<description>Multiply packed unsigned 52-bit integers in each 64-bit element of "b" and "c" to form a 104-bit intermediate result. Add the high 52-bit unsigned integer from the intermediate result with the corresponding unsigned 64-bit integer in "a", and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 3
+	i := j*64
+	IF k[j]
+		tmp[127:0] := ZeroExtend64(b[i+51:i]) * ZeroExtend64(c[i+51:i])
+		dst[i+63:i] := a[i+63:i] + ZeroExtend64(tmp[103:52])
+	ELSE
+		dst[i+63:i] := a[i+63:i]
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vpmadd52huq"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256i" name="_mm256_maskz_madd52hi_epu64">
+	<CPUID>AVX512IFMA52</CPUID>
+	<CPUID>AVX512VL</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m256i"/>
+	<parameter varname="b" type="__m256i"/>
+	<parameter varname="c" type="__m256i"/>
+	<description>Multiply packed unsigned 52-bit integers in each 64-bit element of "b" and "c" to form a 104-bit intermediate result. Add the high 52-bit unsigned integer from the intermediate result with the corresponding unsigned 64-bit integer in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 3
+	i := j*64
+	IF k[j]
+		tmp[127:0] := ZeroExtend64(b[i+51:i]) * ZeroExtend64(c[i+51:i])
+		dst[i+63:i] := a[i+63:i] + ZeroExtend64(tmp[103:52])
+	ELSE
+		dst[i+63:i] := 0
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vpmadd52huq"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm_madd52hi_epu64">
+	<CPUID>AVX512IFMA52</CPUID>
+	<CPUID>AVX512VL</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="a" type="__m128i"/>
+	<parameter varname="b" type="__m128i"/>
+	<parameter varname="c" type="__m128i"/>
+	<description>Multiply packed unsigned 52-bit integers in each 64-bit element of "b" and "c" to form a 104-bit intermediate result. Add the high 52-bit unsigned integer from the intermediate result with the corresponding unsigned 64-bit integer in "a", and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 1
+	i := j*64
+	tmp[127:0] := ZeroExtend64(b[i+51:i]) * ZeroExtend64(c[i+51:i])
+	dst[i+63:i] := a[i+63:i] + ZeroExtend64(tmp[103:52])
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vpmadd52huq"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm_mask_madd52hi_epu64">
+	<CPUID>AVX512IFMA52</CPUID>
+	<CPUID>AVX512VL</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="a" type="__m128i"/>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="b" type="__m128i"/>
+	<parameter varname="c" type="__m128i"/>
+	<description>Multiply packed unsigned 52-bit integers in each 64-bit element of "b" and "c" to form a 104-bit intermediate result. Add the high 52-bit unsigned integer from the intermediate result with the corresponding unsigned 64-bit integer in "a", and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 1
+	i := j*64
+	IF k[j]
+		tmp[127:0] := ZeroExtend64(b[i+51:i]) * ZeroExtend64(c[i+51:i])
+		dst[i+63:i] := a[i+63:i] + ZeroExtend64(tmp[103:52])
+	ELSE
+		dst[i+63:i] := a[i+63:i]
+	FI
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vpmadd52huq"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm_maskz_madd52hi_epu64">
+	<CPUID>AVX512IFMA52</CPUID>
+	<CPUID>AVX512VL</CPUID>
+	<category>Arithmetic</category>
+	<parameter varname="k" type="__mmask8"/>
+	<parameter varname="a" type="__m128i"/>
+	<parameter varname="b" type="__m128i"/>
+	<parameter varname="c" type="__m128i"/>
+	<description>Multiply packed unsigned 52-bit integers in each 64-bit element of "b" and "c" to form a 104-bit intermediate result. Add the high 52-bit unsigned integer from the intermediate result with the corresponding unsigned 64-bit integer in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 1
+	i := j*64
+	IF k[j]
+		tmp[127:0] := ZeroExtend64(b[i+51:i]) * ZeroExtend64(c[i+51:i])
+		dst[i+63:i] := a[i+63:i] + ZeroExtend64(tmp[103:52])
+	ELSE
+		dst[i+63:i] := 0
+	FI
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vpmadd52huq"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512i" name="_mm512_multishift_epi64_epi8">
+	<CPUID>AVX512VBMI</CPUID>
+	<category>Bit Manipulation</category>
+	<parameter varname="a" type="__m512i"/>
+	<parameter varname="b" type="__m512i"/>
+	<description>For each 64-bit element in "b", select 8 unaligned bytes using a byte-granular shift control within the corresponding 64-bit element of "a", and store the 8 assembled bytes to the corresponding 64-bit element of "dst".</description>
+	<operation>
+FOR i := 0 to 7
+	q := i * 64
+	FOR j := 0 to 7
+		tmp8 := 0
+		ctrl := a[q+j*8+7:q+j*8] &amp; 63
+		FOR l := 0 to 7
+			tmp8[l] := b[q+((ctrl+l) &amp; 63)]
+		ENDFOR
+		dst[q+j*8+7:q+j*8] := tmp8[7:0]
+	ENDFOR
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name="vpmultishiftqb"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512i" name="_mm512_mask_multishift_epi64_epi8">
+	<CPUID>AVX512VBMI</CPUID>
+	<category>Bit Manipulation</category>
+	<parameter varname="src" type="__m512i"/>
+	<parameter varname="k" type="__mmask64"/>
+	<parameter varname="a" type="__m512i"/>
+	<parameter varname="b" type="__m512i"/>
+	<description>For each 64-bit element in "b", select 8 unaligned bytes using a byte-granular shift control within the corresponding 64-bit element of "a", and store the 8 assembled bytes to the corresponding 64-bit element of "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+	<operation>
+FOR i := 0 to 7
+	q := i * 64
+	FOR j := 0 to 7
+		tmp8 := 0
+		ctrl := a[q+j*8+7:q+j*8] &amp; 63
+		FOR l := 0 to 7
+			tmp8[l] := b[q+((ctrl+l) &amp; 63)]
+		ENDFOR
+		IF k[i*8+j]
+			dst[q+j*8+7:q+j*8] := tmp8[7:0]
+		ELSE
+			dst[q+j*8+7:q+j*8] := src[q+j*8+7:q+j*8]
+		FI
+	ENDFOR
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name="vpmultishiftqb"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512i" name="_mm512_maskz_multishift_epi64_epi8">
+	<CPUID>AVX512VBMI</CPUID>
+	<category>Bit Manipulation</category>
+	<parameter varname="k" type="__mmask64"/>
+	<parameter varname="a" type="__m512i"/>
+	<parameter varname="b" type="__m512i"/>
+	<description>For each 64-bit element in "b", select 8 unaligned bytes using a byte-granular shift control within the corresponding 64-bit element of "a", and store the 8 assembled bytes to the corresponding 64-bit element of "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+	<operation>
+FOR i := 0 to 7
+	q := i * 64
+	FOR j := 0 to 7
+		tmp8 := 0
+		ctrl := a[q+j*8+7:q+j*8] &amp; 63
+		FOR l := 0 to 7
+			tmp8[l] := b[q+((ctrl+l) &amp; 63)]
+		ENDFOR
+		IF k[i*8+j]
+			dst[q+j*8+7:q+j*8] := tmp8[7:0]
+		ELSE
+			dst[q+j*8+7:q+j*8] := 0
+		FI
+	ENDFOR
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name="vpmultishiftqb"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256i" name="_mm256_multishift_epi64_epi8">
+	<CPUID>AVX512VBMI</CPUID>
+	<CPUID>AVX512VL</CPUID>
+	<category>Bit Manipulation</category>
+	<parameter varname="a" type="__m256i"/>
+	<parameter varname="b" type="__m256i"/>
+	<description>For each 64-bit element in "b", select 8 unaligned bytes using a byte-granular shift control within the corresponding 64-bit element of "a", and store the 8 assembled bytes to the corresponding 64-bit element of "dst".</description>
+	<operation>
+FOR i := 0 to 3
+	q := i * 64
+	FOR j := 0 to 7
+		tmp8 := 0
+		ctrl := a[q+j*8+7:q+j*8] &amp; 63
+		FOR l := 0 to 7
+			tmp8[l] := b[q+((ctrl+l) &amp; 63)]
+		ENDFOR
+		dst[q+j*8+7:q+j*8] := tmp8[7:0]
+	ENDFOR
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vpmultishiftqb"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256i" name="_mm256_mask_multishift_epi64_epi8">
+	<CPUID>AVX512VBMI</CPUID>
+	<CPUID>AVX512VL</CPUID>
+	<category>Bit Manipulation</category>
+	<parameter varname="src" type="__m256i"/>
+	<parameter varname="k" type="__mmask32"/>
+	<parameter varname="a" type="__m256i"/>
+	<parameter varname="b" type="__m256i"/>
+	<description>For each 64-bit element in "b", select 8 unaligned bytes using a byte-granular shift control within the corresponding 64-bit element of "a", and store the 8 assembled bytes to the corresponding 64-bit element of "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+	<operation>
+FOR i := 0 to 3
+	q := i * 64
+	FOR j := 0 to 7
+		tmp8 := 0
+		ctrl := a[q+j*8+7:q+j*8] &amp; 63
+		FOR l := 0 to 7
+			tmp8[l] := b[q+((ctrl+l) &amp; 63)]
+		ENDFOR
+		IF k[i*8+j]
+			dst[q+j*8+7:q+j*8] := tmp8[7:0]
+		ELSE
+			dst[q+j*8+7:q+j*8] := src[q+j*8+7:q+j*8]
+		FI
+	ENDFOR
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vpmultishiftqb"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256i" name="_mm256_maskz_multishift_epi64_epi8">
+	<CPUID>AVX512VBMI</CPUID>
+	<CPUID>AVX512VL</CPUID>
+	<category>Bit Manipulation</category>
+	<parameter varname="k" type="__mmask32"/>
+	<parameter varname="a" type="__m256i"/>
+	<parameter varname="b" type="__m256i"/>
+	<description>For each 64-bit element in "b", select 8 unaligned bytes using a byte-granular shift control within the corresponding 64-bit element of "a", and store the 8 assembled bytes to the corresponding 64-bit element of "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+	<operation>
+FOR i := 0 to 3
+	q := i * 64
+	FOR j := 0 to 7
+		tmp8 := 0
+		ctrl := a[q+j*8+7:q+j*8] &amp; 63
+		FOR l := 0 to 7
+			tmp8[l] := b[q+((ctrl+l) &amp; 63)]
+		ENDFOR
+		IF k[i*8+j]
+			dst[q+j*8+7:q+j*8] := tmp8[7:0]
+		ELSE
+			dst[q+j*8+7:q+j*8] := 0
+		FI
+	ENDFOR
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vpmultishiftqb"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm_multishift_epi64_epi8">
+	<CPUID>AVX512VBMI</CPUID>
+	<CPUID>AVX512VL</CPUID>
+	<category>Bit Manipulation</category>
+	<parameter varname="a" type="__m128i"/>
+	<parameter varname="b" type="__m128i"/>
+	<description>For each 64-bit element in "b", select 8 unaligned bytes using a byte-granular shift control within the corresponding 64-bit element of "a", and store the 8 assembled bytes to the corresponding 64-bit element of "dst".</description>
+	<operation>
+FOR i := 0 to 1
+	q := i * 64
+	FOR j := 0 to 7
+		tmp8 := 0
+		ctrl := a[q+j*8+7:q+j*8] &amp; 63
+		FOR l := 0 to 7
+			tmp8[l] := b[q+((ctrl+l) &amp; 63)]
+		ENDFOR
+		dst[q+j*8+7:q+j*8] := tmp8[7:0]
+	ENDFOR
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vpmultishiftqb"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm_mask_multishift_epi64_epi8">
+	<CPUID>AVX512VBMI</CPUID>
+	<CPUID>AVX512VL</CPUID>
+	<category>Bit Manipulation</category>
+	<parameter varname="src" type="__m128i"/>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="a" type="__m128i"/>
+	<parameter varname="b" type="__m128i"/>
+	<description>For each 64-bit element in "b", select 8 unaligned bytes using a byte-granular shift control within the corresponding 64-bit element of "a", and store the 8 assembled bytes to the corresponding 64-bit element of "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+	<operation>
+FOR i := 0 to 1
+	q := i * 64
+	FOR j := 0 to 7
+		tmp8 := 0
+		ctrl := a[q+j*8+7:q+j*8] &amp; 63
+		FOR l := 0 to 7
+			tmp8[l] := b[q+((ctrl+l) &amp; 63)]
+		ENDFOR
+		IF k[i*8+j]
+			dst[q+j*8+7:q+j*8] := tmp8[7:0]
+		ELSE
+			dst[q+j*8+7:q+j*8] := src[q+j*8+7:q+j*8]
+		FI
+	ENDFOR
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vpmultishiftqb"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm_maskz_multishift_epi64_epi8">
+	<CPUID>AVX512VBMI</CPUID>
+	<CPUID>AVX512VL</CPUID>
+	<category>Bit Manipulation</category>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="a" type="__m128i"/>
+	<parameter varname="b" type="__m128i"/>
+	<description>For each 64-bit element in "b", select 8 unaligned bytes using a byte-granular shift control within the corresponding 64-bit element of "a", and store the 8 assembled bytes to the corresponding 64-bit element of "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+	<operation>
+FOR i := 0 to 1
+	q := i * 64
+	FOR j := 0 to 7
+		tmp8 := 0
+		ctrl := a[q+j*8+7:q+j*8] &amp; 63
+		FOR l := 0 to 7
+			tmp8[l] := b[q+((ctrl+l) &amp; 63)]
+		ENDFOR
+		IF k[i*8+j]
+			dst[q+j*8+7:q+j*8] := tmp8[7:0]
+		ELSE
+			dst[q+j*8+7:q+j*8] := 0
+		FI
+	ENDFOR
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vpmultishiftqb"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512i" name="_mm512_permutexvar_epi8">
+	<CPUID>AVX512VBMI</CPUID>
+	<category>Swizzle</category>
+	<parameter varname="idx" type="__m512i"/>
+	<parameter varname="a" type="__m512i"/>
+	<description>Shuffle 8-bit integers in "a" across lanes using the corresponding index in "idx", and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 63
+	i := j*8
+	id := idx[i+5:i]*8
+	dst[i+7:i] := a[id+7:id]
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name="vpermb"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512i" name="_mm512_mask_permutexvar_epi8">
+	<CPUID>AVX512VBMI</CPUID>
+	<category>Swizzle</category>
+	<parameter varname="src" type="__m512i"/>
+	<parameter varname="k" type="__mmask64"/>
+	<parameter varname="idx" type="__m512i"/>
+	<parameter varname="a" type="__m512i"/>
+	<description>Shuffle 8-bit integers in "a" across lanes using the corresponding index in "idx", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 63
+	i := j*8
+	id := idx[i+5:i]*8
+	IF k[j]
+		dst[i+7:i] := a[id+7:id]
+	ELSE
+		dst[i+7:i] := src[i+7:i]
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name="vpermb"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512i" name="_mm512_maskz_permutexvar_epi8">
+	<CPUID>AVX512VBMI</CPUID>
+	<category>Swizzle</category>
+	<parameter varname="k" type="__mmask64"/>
+	<parameter varname="idx" type="__m512i"/>
+	<parameter varname="a" type="__m512i"/>
+	<description>Shuffle 8-bit integers in "a" across lanes using the corresponding index in "idx", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 63
+	i := j*8
+	id := idx[i+5:i]*8
+	IF k[j]
+		dst[i+7:i] := a[id+7:id]
+	ELSE
+		dst[i+7:i] := 0
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name="vpermb"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256i" name="_mm256_permutexvar_epi8">
+	<CPUID>AVX512VBMI</CPUID>
+	<CPUID>AVX512VL</CPUID>
+	<category>Swizzle</category>
+	<parameter varname="idx" type="__m256i"/>
+	<parameter varname="a" type="__m256i"/>
+	<description>Shuffle 8-bit integers in "a" across lanes using the corresponding index in "idx", and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 31
+	i := j*8
+	id := idx[i+4:i]*8
+	dst[i+7:i] := a[id+7:id]
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vpermb"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256i" name="_mm256_mask_permutexvar_epi8">
+	<CPUID>AVX512VBMI</CPUID>
+	<CPUID>AVX512VL</CPUID>
+	<category>Swizzle</category>
+	<parameter varname="src" type="__m256i"/>
+	<parameter varname="k" type="__mmask32"/>
+	<parameter varname="idx" type="__m256i"/>
+	<parameter varname="a" type="__m256i"/>
+	<description>Shuffle 8-bit integers in "a" across lanes using the corresponding index in "idx", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 31
+	i := j*8
+	id := idx[i+4:i]*8
+	IF k[j]
+		dst[i+7:i] := a[id+7:id]
+	ELSE
+		dst[i+7:i] := src[i+7:i]
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vpermb"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256i" name="_mm256_maskz_permutexvar_epi8">
+	<CPUID>AVX512VBMI</CPUID>
+	<CPUID>AVX512VL</CPUID>
+	<category>Swizzle</category>
+	<parameter varname="k" type="__mmask32"/>
+	<parameter varname="idx" type="__m256i"/>
+	<parameter varname="a" type="__m256i"/>
+	<description>Shuffle 8-bit integers in "a" across lanes using the corresponding index in "idx", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 31
+	i := j*8
+	id := idx[i+4:i]*8
+	IF k[j]
+		dst[i+7:i] := a[id+7:id]
+	ELSE
+		dst[i+7:i] := 0
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vpermb"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm_permutexvar_epi8">
+	<CPUID>AVX512VBMI</CPUID>
+	<CPUID>AVX512VL</CPUID>
+	<category>Swizzle</category>
+	<parameter varname="idx" type="__m128i"/>
+	<parameter varname="a" type="__m128i"/>
+	<description>Shuffle 8-bit integers in "a" using the corresponding index in "idx", and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 15
+	i := j*8
+	id := idx[i+3:i]*8
+	dst[i+7:i] := a[id+7:id]
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vpermb"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm_mask_permutexvar_epi8">
+	<CPUID>AVX512VBMI</CPUID>
+	<CPUID>AVX512VL</CPUID>
+	<category>Swizzle</category>
+	<parameter varname="src" type="__m128i"/>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="idx" type="__m128i"/>
+	<parameter varname="a" type="__m128i"/>
+	<description>Shuffle 8-bit integers in "a" using the corresponding index in "idx", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 15
+	i := j*8
+	id := idx[i+3:i]*8
+	IF k[j]
+		dst[i+7:i] := a[id+7:id]
+	ELSE
+		dst[i+7:i] := src[i+7:i]
+	FI
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vpermb"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm_maskz_permutexvar_epi8">
+	<CPUID>AVX512VBMI</CPUID>
+	<CPUID>AVX512VL</CPUID>
+	<category>Swizzle</category>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="idx" type="__m128i"/>
+	<parameter varname="a" type="__m128i"/>
+	<description>Shuffle 8-bit integers in "a" using the corresponding index in "idx", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 15
+	i := j*8
+	id := idx[i+3:i]*8
+	IF k[j]
+		dst[i+7:i] := a[id+7:id]
+	ELSE
+		dst[i+7:i] := 0
+	FI
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vpermb"/>
+	<header>immintrin.h</header>
+</intrinsic>
+
+
+
+
+
+<intrinsic tech="AVX-512" rettype="__m512i" name="_mm512_permutex2var_epi8">
+	<CPUID>AVX512VBMI</CPUID>
+	<category>Swizzle</category>
+	<parameter varname="a" type="__m512i"/>
+	<parameter varname="idx" type="__m512i"/>
+	<parameter varname="b" type="__m512i"/>
+	<description>Shuffle 8-bit integers in "a" and "b" across lanes using the corresponding selector and index in "idx", and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 63
+	i := j*8
+	off := 8*idx[i+5:i]
+	dst[i+7:i] := idx[i+6] ? b[off+7:off] : a[off+7:off]
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name="vpermi2b"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512i" name="_mm512_mask_permutex2var_epi8">
+	<CPUID>AVX512VBMI</CPUID>
+	<category>Swizzle</category>
+	<parameter varname="a" type="__m512i"/>
+	<parameter varname="k" type="__mmask64"/>
+	<parameter varname="idx" type="__m512i"/>
+	<parameter varname="b" type="__m512i"/>
+	<description>Shuffle 8-bit integers in "a" and "b" across lanes using the corresponding selector and index in "idx", and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 63
+	i := j*8
+	IF k[j]
+		off := 8*idx[i+5:i]
+		dst[i+7:i] := idx[i+6] ? b[off+7:off] : a[off+7:off]
+	ELSE
+		dst[i+7:i] := a[i+7:i]
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name="vpermt2b"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512i" name="_mm512_mask2_permutex2var_epi8">
+	<CPUID>AVX512VBMI</CPUID>
+	<category>Swizzle</category>
+	<parameter varname="a" type="__m512i"/>
+	<parameter varname="idx" type="__m512i"/>
+	<parameter varname="k" type="__mmask64"/>
+	<parameter varname="b" type="__m512i"/>
+	<description>Shuffle 8-bit integers in "a" and "b" across lanes using the corresponding selector and index in "idx", and store the results in "dst" using writemask "k" (elements are copied from "idx" when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 63
+	i := j*8
+	IF k[j]
+		off := 8*idx[i+5:i]
+		dst[i+7:i] := idx[i+6] ? b[off+7:off] : a[off+7:off]
+	ELSE
+		dst[i+7:i] := idx[i+7:i]
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name="vpermi2b"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m512i" name="_mm512_maskz_permutex2var_epi8">
+	<CPUID>AVX512VBMI</CPUID>
+	<category>Swizzle</category>
+	<parameter varname="k" type="__mmask64"/>
+	<parameter varname="a" type="__m512i"/>
+	<parameter varname="idx" type="__m512i"/>
+	<parameter varname="b" type="__m512i"/>
+	<description>Shuffle 8-bit integers in "a" and "b" across lanes using the corresponding selector and index in "idx", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 63
+	i := j*8
+	IF k[j]
+		off := 8*idx[i+5:i]
+		dst[i+7:i] := idx[i+6] ? b[off+7:off] : a[off+7:off]
+	ELSE
+		dst[i+7:i] := 0
+	FI
+ENDFOR
+dst[MAX:512] := 0
+	</operation>
+	<instruction name="vpermi2b"/>
+	<instruction name="vpermt2b"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256i" name="_mm256_permutex2var_epi8">
+	<CPUID>AVX512VBMI</CPUID>
+	<CPUID>AVX512VL</CPUID>
+	<category>Swizzle</category>
+	<parameter varname="a" type="__m256i"/>
+	<parameter varname="idx" type="__m256i"/>
+	<parameter varname="b" type="__m256i"/>
+	<description>Shuffle 8-bit integers in "a" and "b" across lanes using the corresponding selector and index in "idx", and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 31
+	i := j*8
+	off := 8*idx[i+4:i]
+	dst[i+7:i] := idx[i+5] ? b[off+7:off] : a[off+7:off]
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vpermi2b"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256i" name="_mm256_mask_permutex2var_epi8">
+	<CPUID>AVX512VBMI</CPUID>
+	<CPUID>AVX512VL</CPUID>
+	<category>Swizzle</category>
+	<parameter varname="a" type="__m256i"/>
+	<parameter varname="k" type="__mmask32"/>
+	<parameter varname="idx" type="__m256i"/>
+	<parameter varname="b" type="__m256i"/>
+	<description>Shuffle 8-bit integers in "a" and "b" across lanes using the corresponding selector and index in "idx", and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 31
+	i := j*8
+	IF k[j]
+		off := 8*idx[i+4:i]
+		dst[i+7:i] := idx[i+5] ? b[off+7:off] : a[off+7:off]
+	ELSE
+		dst[i+7:i] := a[i+7:i]
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vpermt2b"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256i" name="_mm256_mask2_permutex2var_epi8">
+	<CPUID>AVX512VBMI</CPUID>
+	<CPUID>AVX512VL</CPUID>
+	<category>Swizzle</category>
+	<parameter varname="a" type="__m256i"/>
+	<parameter varname="idx" type="__m256i"/>
+	<parameter varname="k" type="__mmask32"/>
+	<parameter varname="b" type="__m256i"/>
+	<description>Shuffle 8-bit integers in "a" and "b" across lanes using the corresponding selector and index in "idx", and store the results in "dst" using writemask "k" (elements are copied from "idx" when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 31
+	i := j*8
+	IF k[j]
+		off := 8*idx[i+4:i]
+		dst[i+7:i] := idx[i+5] ? b[off+7:off] : a[off+7:off]
+	ELSE
+		dst[i+7:i] := idx[i+7:i]
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vpermi2b"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m256i" name="_mm256_maskz_permutex2var_epi8">
+	<CPUID>AVX512VBMI</CPUID>
+	<CPUID>AVX512VL</CPUID>
+	<category>Swizzle</category>
+	<parameter varname="k" type="__mmask32"/>
+	<parameter varname="a" type="__m256i"/>
+	<parameter varname="idx" type="__m256i"/>
+	<parameter varname="b" type="__m256i"/>
+	<description>Shuffle 8-bit integers in "a" and "b" across lanes using the corresponding selector and index in "idx", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 31
+	i := j*8
+	IF k[j]
+		off := 8*idx[i+4:i]
+		dst[i+7:i] := idx[i+5] ? b[off+7:off] : a[off+7:off]
+	ELSE
+		dst[i+7:i] := 0
+	FI
+ENDFOR
+dst[MAX:256] := 0
+	</operation>
+	<instruction name="vpermi2b"/>
+	<instruction name="vpermt2b"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm_permutex2var_epi8">
+	<CPUID>AVX512VBMI</CPUID>
+	<CPUID>AVX512VL</CPUID>
+	<category>Swizzle</category>
+	<parameter varname="a" type="__m128i"/>
+	<parameter varname="idx" type="__m128i"/>
+	<parameter varname="b" type="__m128i"/>
+	<description>Shuffle 8-bit integers in "a" and "b" using the corresponding selector and index in "idx", and store the results in "dst".</description>
+	<operation>
+FOR j := 0 to 15
+	i := j*8
+	off := 8*idx[i+3:i]
+	dst[i+7:i] := idx[i+4] ? b[off+7:off] : a[off+7:off]
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vpermi2b"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm_mask_permutex2var_epi8">
+	<CPUID>AVX512VBMI</CPUID>
+	<CPUID>AVX512VL</CPUID>
+	<category>Swizzle</category>
+	<parameter varname="a" type="__m128i"/>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="idx" type="__m128i"/>
+	<parameter varname="b" type="__m128i"/>
+	<description>Shuffle 8-bit integers in "a" and "b" using the corresponding selector and index in "idx", and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 15
+	i := j*8
+	IF k[j]
+		off := 8*idx[i+3:i]
+		dst[i+7:i] := idx[i+4] ? b[off+7:off] : a[off+7:off]
+	ELSE
+		dst[i+7:i] := a[i+7:i]
+	FI
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vpermt2b"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm_mask2_permutex2var_epi8">
+	<CPUID>AVX512VBMI</CPUID>
+	<CPUID>AVX512VL</CPUID>
+	<category>Swizzle</category>
+	<parameter varname="a" type="__m128i"/>
+	<parameter varname="idx" type="__m128i"/>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="b" type="__m128i"/>
+	<description>Shuffle 8-bit integers in "a" and "b" using the corresponding selector and index in "idx", and store the results in "dst" using writemask "k" (elements are copied from "idx" when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 15
+	i := j*8
+	IF k[j]
+		off := 8*idx[i+3:i]
+		dst[i+7:i] := idx[i+4] ? b[off+7:off] : a[off+7:off]
+	ELSE
+		dst[i+7:i] := idx[i+7:i]
+	FI
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vpermi2b"/>
+	<header>immintrin.h</header>
+</intrinsic>
+<intrinsic tech="AVX-512" rettype="__m128i" name="_mm_maskz_permutex2var_epi8">
+	<CPUID>AVX512VBMI</CPUID>
+	<CPUID>AVX512VL</CPUID>
+	<category>Swizzle</category>
+	<parameter varname="k" type="__mmask16"/>
+	<parameter varname="a" type="__m128i"/>
+	<parameter varname="idx" type="__m128i"/>
+	<parameter varname="b" type="__m128i"/>
+	<description>Shuffle 8-bit integers in "a" and "b" using the corresponding selector and index in "idx", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set).</description>
+	<operation>
+FOR j := 0 to 15
+	i := j*8
+	IF k[j]
+		off := 8*idx[i+3:i]
+		dst[i+7:i] := idx[i+4] ? b[off+7:off] : a[off+7:off]
+	ELSE
+		dst[i+7:i] := 0
+	FI
+ENDFOR
+dst[MAX:128] := 0
+	</operation>
+	<instruction name="vpermi2b"/>
+	<instruction name="vpermt2b"/>
+	<header>immintrin.h</header>
+</intrinsic>
+</intrinsics_list>