Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

non-temporal stores: use inline assembly #1541

Merged
merged 3 commits into from
Jun 21, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
21 changes: 18 additions & 3 deletions crates/core_arch/src/x86/avx.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1718,7 +1718,12 @@ pub unsafe fn _mm256_lddqu_si256(mem_addr: *const __m256i) -> __m256i {
#[cfg_attr(test, assert_instr(vmovntps))] // FIXME vmovntdq
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_stream_si256(mem_addr: *mut __m256i, a: __m256i) {
intrinsics::nontemporal_store(mem_addr, a);
crate::arch::asm!(
"vmovntps [{mem_addr}], {a}",
mem_addr = in(reg) mem_addr,
a = in(ymm_reg) a,
options(nostack, preserves_flags),
);
}

/// Moves double-precision values from a 256-bit vector of `[4 x double]`
Expand All @@ -1741,7 +1746,12 @@ pub unsafe fn _mm256_stream_si256(mem_addr: *mut __m256i, a: __m256i) {
#[stable(feature = "simd_x86", since = "1.27.0")]
#[allow(clippy::cast_ptr_alignment)]
pub unsafe fn _mm256_stream_pd(mem_addr: *mut f64, a: __m256d) {
intrinsics::nontemporal_store(mem_addr as *mut __m256d, a);
crate::arch::asm!(
"vmovntps [{mem_addr}], {a}",
mem_addr = in(reg) mem_addr,
a = in(ymm_reg) a,
options(nostack, preserves_flags),
);
}

/// Moves single-precision floating point values from a 256-bit vector
Expand All @@ -1765,7 +1775,12 @@ pub unsafe fn _mm256_stream_pd(mem_addr: *mut f64, a: __m256d) {
#[stable(feature = "simd_x86", since = "1.27.0")]
#[allow(clippy::cast_ptr_alignment)]
pub unsafe fn _mm256_stream_ps(mem_addr: *mut f32, a: __m256) {
intrinsics::nontemporal_store(mem_addr as *mut __m256, a);
crate::arch::asm!(
"vmovntps [{mem_addr}], {a}",
mem_addr = in(reg) mem_addr,
a = in(ymm_reg) a,
options(nostack, preserves_flags),
);
}

/// Computes the approximate reciprocal of packed single-precision (32-bit)
Expand Down
25 changes: 20 additions & 5 deletions crates/core_arch/src/x86/avx512f.rs
Original file line number Diff line number Diff line change
Expand Up @@ -28014,7 +28014,12 @@ pub unsafe fn _mm_mask_testn_epi64_mask(k: __mmask8, a: __m128i, b: __m128i) ->
#[cfg_attr(test, assert_instr(vmovntps))]
#[allow(clippy::cast_ptr_alignment)]
pub unsafe fn _mm512_stream_ps(mem_addr: *mut f32, a: __m512) {
intrinsics::nontemporal_store(mem_addr as *mut __m512, a);
crate::arch::asm!(
"vmovntps [{mem_addr}], {a}",
mem_addr = in(reg) mem_addr,
a = in(zmm_reg) a,
options(nostack, preserves_flags),
);
}

/// Store 512-bits (composed of 8 packed double-precision (64-bit) floating-point elements) from a into memory using a non-temporal memory hint. mem_addr must be aligned on a 64-byte boundary or a general-protection exception may be generated.
Expand All @@ -28035,7 +28040,12 @@ pub unsafe fn _mm512_stream_ps(mem_addr: *mut f32, a: __m512) {
#[cfg_attr(test, assert_instr(vmovntps))] //should be vmovntpd
#[allow(clippy::cast_ptr_alignment)]
pub unsafe fn _mm512_stream_pd(mem_addr: *mut f64, a: __m512d) {
intrinsics::nontemporal_store(mem_addr as *mut __m512d, a);
crate::arch::asm!(
"vmovntps [{mem_addr}], {a}",
mem_addr = in(reg) mem_addr,
a = in(zmm_reg) a,
options(nostack, preserves_flags),
);
}

/// Store 512-bits of integer data from a into memory using a non-temporal memory hint. mem_addr must be aligned on a 64-byte boundary or a general-protection exception may be generated.
Expand All @@ -28056,7 +28066,12 @@ pub unsafe fn _mm512_stream_pd(mem_addr: *mut f64, a: __m512d) {
#[cfg_attr(test, assert_instr(vmovntps))] //should be vmovntdq
#[allow(clippy::cast_ptr_alignment)]
pub unsafe fn _mm512_stream_si512(mem_addr: *mut i64, a: __m512i) {
intrinsics::nontemporal_store(mem_addr as *mut __m512i, a);
crate::arch::asm!(
"vmovntps [{mem_addr}], {a}",
mem_addr = in(reg) mem_addr,
a = in(zmm_reg) a,
options(nostack, preserves_flags),
);
}

/// Sets packed 32-bit integers in `dst` with the supplied values.
Expand Down Expand Up @@ -54373,9 +54388,9 @@ mod tests {

#[simd_test(enable = "avx512f")]
unsafe fn test_mm512_stream_ps() {
#[repr(align(32))]
#[repr(align(64))]
struct Memory {
pub data: [f32; 16],
pub data: [f32; 16], // 64 bytes
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This test should have failed many times already. The only explanation I have for why that did not happen is that maybe LLVM optimizes away the entire test...

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Maybe the whole stack frame gets 64-byte aligned, since there are __m512 values involved.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If that happened it would also happen with this PR.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The stack frame is probably different because you are missing the nostack option on the inline assembly. In fact these assembly blocks should include both nostack and preserves_flags.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'm pretty sure the test was just buggy before this PR. It doesn't actually ensure that the data is properly aligned.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

In fact these assembly blocks should include both nostack and preserves_flags.

I have added the options in the last commit.

}
let a = _mm512_set1_ps(7.0);
let mut mem = Memory { data: [-1.0; 16] };
Expand Down
2 changes: 1 addition & 1 deletion crates/core_arch/src/x86/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@

#[allow(unused_imports)]
use crate::marker::Sized;
use crate::{intrinsics, mem::transmute};
use crate::mem::transmute;

#[macro_use]
mod macros;
Expand Down
7 changes: 6 additions & 1 deletion crates/core_arch/src/x86/sse.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2002,7 +2002,12 @@ extern "C" {
#[stable(feature = "simd_x86", since = "1.27.0")]
#[allow(clippy::cast_ptr_alignment)]
pub unsafe fn _mm_stream_ps(mem_addr: *mut f32, a: __m128) {
intrinsics::nontemporal_store(mem_addr as *mut __m128, a);
crate::arch::asm!(
"movntps [{mem_addr}], {a}",
mem_addr = in(reg) mem_addr,
a = in(xmm_reg) a,
options(nostack, preserves_flags),
);
}

#[cfg(test)]
Expand Down
25 changes: 20 additions & 5 deletions crates/core_arch/src/x86/sse2.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1327,11 +1327,16 @@ pub unsafe fn _mm_storel_epi64(mem_addr: *mut __m128i, a: __m128i) {
///
/// See [`_mm_sfence`] for details.
#[inline]
#[target_feature(enable = "sse2")]
#[target_feature(enable = "sse,sse2")]
#[cfg_attr(test, assert_instr(movntps))] // FIXME movntdq
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_stream_si128(mem_addr: *mut __m128i, a: __m128i) {
intrinsics::nontemporal_store(mem_addr, a);
crate::arch::asm!(
"movntps [{mem_addr}], {a}",
mem_addr = in(reg) mem_addr,
a = in(xmm_reg) a,
options(nostack, preserves_flags),
);
}

/// Stores a 32-bit integer value in the specified memory location.
Expand All @@ -1353,7 +1358,12 @@ pub unsafe fn _mm_stream_si128(mem_addr: *mut __m128i, a: __m128i) {
#[cfg_attr(test, assert_instr(movnti))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_stream_si32(mem_addr: *mut i32, a: i32) {
intrinsics::nontemporal_store(mem_addr, a);
crate::arch::asm!(
"movnti [{mem_addr}], {a:e}", // `:e` for 32bit value
mem_addr = in(reg) mem_addr,
a = in(reg) a,
options(nostack, preserves_flags),
);
}

/// Returns a vector where the low element is extracted from `a` and its upper
Expand Down Expand Up @@ -2543,12 +2553,17 @@ pub unsafe fn _mm_loadl_pd(a: __m128d, mem_addr: *const f64) -> __m128d {
///
/// See [`_mm_sfence`] for details.
#[inline]
#[target_feature(enable = "sse2")]
#[target_feature(enable = "sse,sse2")]
#[cfg_attr(test, assert_instr(movntps))] // FIXME movntpd
#[stable(feature = "simd_x86", since = "1.27.0")]
#[allow(clippy::cast_ptr_alignment)]
pub unsafe fn _mm_stream_pd(mem_addr: *mut f64, a: __m128d) {
intrinsics::nontemporal_store(mem_addr as *mut __m128d, a);
crate::arch::asm!(
"movntps [{mem_addr}], {a}",
mem_addr = in(reg) mem_addr,
a = in(xmm_reg) a,
options(nostack, preserves_flags),
);
}

/// Stores the lower 64 bits of a 128-bit vector of `[2 x double]` to a
Expand Down
12 changes: 7 additions & 5 deletions crates/core_arch/src/x86_64/sse2.rs
Original file line number Diff line number Diff line change
@@ -1,9 +1,6 @@
//! `x86_64`'s Streaming SIMD Extensions 2 (SSE2)

use crate::{
core_arch::x86::*,
intrinsics::{self, simd::*},
};
use crate::{core_arch::x86::*, intrinsics::simd::*};

#[cfg(test)]
use stdarch_test::assert_instr;
Expand Down Expand Up @@ -81,7 +78,12 @@ pub unsafe fn _mm_cvttsd_si64x(a: __m128d) -> i64 {
#[cfg_attr(test, assert_instr(movnti))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_stream_si64(mem_addr: *mut i64, a: i64) {
intrinsics::nontemporal_store(mem_addr, a);
crate::arch::asm!(
"movnti [{mem_addr}], {a}",
mem_addr = in(reg) mem_addr,
a = in(reg) a,
options(nostack, preserves_flags),
);
}

/// Returns a vector whose lowest element is `a` and all higher elements are
Expand Down
Loading