diff --git a/.github/.cspell/project-dictionary.txt b/.github/.cspell/project-dictionary.txt index 5d458ad8..8c0ed8fb 100644 --- a/.github/.cspell/project-dictionary.txt +++ b/.github/.cspell/project-dictionary.txt @@ -26,6 +26,7 @@ cmovge cmovl cmpd cmpld +cmpw cmpxchg cpsid cpsie @@ -108,6 +109,7 @@ movq mpidr mspdebug mstatus +mstatush mvfr negs neoverse @@ -123,6 +125,7 @@ pointee prctl prefetcher PRIMASK +pstq quadword RAII rcpc diff --git a/DEVELOPMENT.md b/DEVELOPMENT.md index 1ceb2668..c45d2406 100644 --- a/DEVELOPMENT.md +++ b/DEVELOPMENT.md @@ -18,6 +18,7 @@ portable-atomic/ │ ├── imp/ │ │ ├── atomic128/ -- 128-bit atomic implementations on 64-bit architectures (mainly by asm) │ │ ├── atomic64/ -- 64-bit atomic implementations on 32-bit architectures (mainly by asm) +│ │ ├── avr.rs -- atomic implementation for AVR (by asm) │ │ ├── core_atomic.rs -- wrapper for core::sync::atomic types │ │ ├── detect/ -- Run-time CPU feature detection implementations used for outline-atomics │ │ ├── fallback/ -- fallback implementation based on global locks diff --git a/build.rs b/build.rs index 4b5c2fea..0861dd97 100644 --- a/build.rs +++ b/build.rs @@ -377,15 +377,14 @@ fn main() { if !version.probe(83, 2024, 9, 27) || needs_target_feature_fallback(&version, None) { let target_endian = env::var("CARGO_CFG_TARGET_ENDIAN").expect("CARGO_CFG_TARGET_ENDIAN not set"); - // powerpc64le is pwr8+ by default https://github.com/llvm/llvm-project/blob/llvmorg-19.1.0/llvm/lib/Target/PowerPC/PPC.td#L702 + // powerpc64le is pwr8 by default https://github.com/llvm/llvm-project/blob/llvmorg-19.1.0/llvm/lib/Target/PowerPC/PPC.td#L702 // See also https://github.com/rust-lang/rust/issues/59932 - let mut has_pwr8_features = target_endian == "little"; - // https://github.com/llvm/llvm-project/commit/549e118e93c666914a1045fde38a2cac33e1e445 + let mut pwr8_features = target_endian == "little"; if let Some(cpu) = &target_cpu() { if let Some(mut cpu_version) = strip_prefix(cpu, "pwr") { 
cpu_version = strip_suffix(cpu_version, "x").unwrap_or(cpu_version); // for pwr5x and pwr6x if let Ok(cpu_version) = cpu_version.parse::() { - has_pwr8_features = cpu_version >= 8; + pwr8_features = cpu_version >= 8; } } else { // https://github.com/llvm/llvm-project/blob/llvmorg-19.1.0/llvm/lib/Target/PowerPC/PPC.td#L702 @@ -393,11 +392,12 @@ fn main() { // On the minimum external LLVM version of the oldest rustc version which we can use asm_experimental_arch // on this target (see CI config for more), "future" is based on pwr10 features. // https://github.com/llvm/llvm-project/blob/llvmorg-12.0.0/llvm/lib/Target/PowerPC/PPC.td#L370 - has_pwr8_features = cpu == "ppc64le" || cpu == "future"; + pwr8_features = cpu == "future" || cpu == "ppc64le"; } } + // power8 features: https://github.com/llvm/llvm-project/blob/llvmorg-19.1.0/llvm/lib/Target/PowerPC/PPC.td#L409 // lqarx and stqcx. - target_feature_fallback("quadword-atomics", has_pwr8_features); + target_feature_fallback("quadword-atomics", pwr8_features); } } "s390x" => { @@ -421,12 +421,14 @@ fn main() { } // As of rustc 1.80, target_feature "fast-serialization"/"load-store-on-cond"/"distinct-ops"/"miscellaneous-extensions-3" is not available on rustc side: // https://github.com/rust-lang/rust/blob/1.80.0/compiler/rustc_target/src/target_features.rs + // arch9 features: https://github.com/llvm/llvm-project/blob/llvmorg-19.1.0/llvm/lib/Target/SystemZ/SystemZFeatures.td#L103 // bcr 14,0 target_feature_fallback("fast-serialization", arch9_features); // {l,st}oc{,g}{,r} target_feature_fallback("load-store-on-cond", arch9_features); // {al,sl,n,o,x}{,g}rk target_feature_fallback("distinct-ops", arch9_features); + // arch13 features: https://github.com/llvm/llvm-project/blob/llvmorg-19.1.0/llvm/lib/Target/SystemZ/SystemZFeatures.td#L301 // nand (nnr{,g}k), select (sel{,g}r), etc. 
target_feature_fallback("miscellaneous-extensions-3", arch13_features); } diff --git a/src/imp/atomic128/powerpc64.rs b/src/imp/atomic128/powerpc64.rs index b87d1011..df877aee 100644 --- a/src/imp/atomic128/powerpc64.rs +++ b/src/imp/atomic128/powerpc64.rs @@ -3,13 +3,15 @@ /* 128-bit atomic implementation on PowerPC64. -powerpc64 on pwr8+ support 128-bit atomics (load/store/LL/SC): -https://github.com/llvm/llvm-project/commit/549e118e93c666914a1045fde38a2cac33e1e445 -https://github.com/llvm/llvm-project/blob/llvmorg-19.1.0/llvm/test/CodeGen/PowerPC/atomics-i128-ldst.ll -https://github.com/llvm/llvm-project/blob/llvmorg-19.1.0/llvm/test/CodeGen/PowerPC/atomics-i128.ll +This architecture provides the following 128-bit atomic instructions: -powerpc64le is pwr8+ by default https://github.com/llvm/llvm-project/blob/llvmorg-19.1.0/llvm/lib/Target/PowerPC/PPC.td#L702 -See also https://github.com/rust-lang/rust/issues/59932 +- lq/stq: load/store (ISA 2.07 or later, included in the Linux Compliancy subset and AIX Compliancy subset) +- lqarx/stqcx.: LL/SC (ISA 2.07 or later, included in the Linux Compliancy subset and AIX Compliancy subset) +- plq/pstq: load/store (ISA 3.1 or later, included in the Linux Compliancy subset and AIX Compliancy subset) + +See "Atomic operation overview by architecture" in atomic-maybe-uninit for a more comprehensive and +detailed description of the atomic and synchronize instructions in this architecture: +https://github.com/taiki-e/atomic-maybe-uninit/blob/HEAD/src/arch/README.md#powerpc Note that we do not separate LL and SC into separate functions, but handle them within a single asm block. This is because it is theoretically possible @@ -20,13 +22,16 @@ Note: On Miri and ThreadSanitizer which do not support inline assembly, we don't this module and use intrinsics.rs instead. 
Refs: -- Power ISA https://openpowerfoundation.org/specifications/isa -- AIX Assembler language reference https://www.ibm.com/docs/en/aix/7.3?topic=aix-assembler-language-reference -- atomic-maybe-uninit https://github.com/taiki-e/atomic-maybe-uninit +- Power ISA + https://openpowerfoundation.org/specifications/isa +- AIX Assembler language reference + https://www.ibm.com/docs/en/aix/7.3?topic=aix-assembler-language-reference +- atomic-maybe-uninit + https://github.com/taiki-e/atomic-maybe-uninit Generated asm: -- powerpc64 (pwr8) https://godbolt.org/z/KPfsKd81K -- powerpc64le https://godbolt.org/z/5dcbKqdne +- powerpc64 (pwr8) https://godbolt.org/z/TjKsPbWc6 +- powerpc64le https://godbolt.org/z/5WqPGhb3Y */ include!("macros.rs"); @@ -91,9 +96,10 @@ macro_rules! debug_assert_pwr8 { // This is similar to #[target_feature(enable = "quadword-atomics")], except that there are // no compiler guarantees regarding (un)inlining, and the scope is within an asm // block rather than a function. We use this directive because #[target_feature(enable = "quadword-atomics")] -// is not supported as of Rust 1.70-nightly. +// is unstable and unavailable on old nightly and incompatible with rustc_codegen_cranelift: +// https://github.com/rust-lang/rustc_codegen_cranelift/issues/1400#issuecomment-1774599775 // -// start_pwr8 and end_pwr8 must be used in pairs. +// Note: start_pwr8 and end_pwr8 must be used in pairs. // // Note: If power8 instructions are not available at compile-time, we must guarantee that // the function that uses it is not inlined into a function where it is not @@ -118,19 +124,42 @@ macro_rules! 
atomic_rmw { ($op:ident, $order:ident) => { match $order { Ordering::Relaxed => $op!("", ""), - Ordering::Acquire => $op!("lwsync", ""), + Ordering::Acquire => $op!("isync", ""), Ordering::Release => $op!("", "lwsync"), - Ordering::AcqRel => $op!("lwsync", "lwsync"), - Ordering::SeqCst => $op!("lwsync", "sync"), + Ordering::AcqRel => $op!("isync", "lwsync"), + Ordering::SeqCst => $op!("isync", "sync"), _ => unreachable!(), } }; } +macro_rules! atomic_cas { + ($op:ident, $success:ident, $failure:ident) => { + if $failure == Ordering::Relaxed { + match $success { + Ordering::Relaxed => $op!("", "", ""), + Ordering::Acquire => $op!("", "isync", ""), + Ordering::Release => $op!("", "", "lwsync"), + Ordering::AcqRel => $op!("", "isync", "lwsync"), + Ordering::SeqCst => $op!("", "isync", "sync"), + _ => unreachable!(), + } + } else { + let order = crate::utils::upgrade_success_ordering($success, $failure); + match order { + // Relaxed and Release are covered in $failure == Relaxed branch. + Ordering::Acquire => $op!("isync", "", ""), + Ordering::AcqRel => $op!("isync", "", "lwsync"), + Ordering::SeqCst => $op!("isync", "", "sync"), + _ => unreachable!(), + } + } + }; +} // Extracts and checks the EQ bit of cr0. -#[inline(always)] -fn extract_cr0(r: u64) -> bool { - r & 0x20000000 != 0 +#[inline] +fn test_cr0_eq(cr: u64) -> bool { + cr & 0x20000000 != 0 } // If quadword-atomics is available at compile-time, we can always use pwr8_fn. @@ -194,31 +223,29 @@ unsafe fn atomic_load(src: *mut u128, order: Ordering) -> u128 { unsafe fn atomic_load_pwr8(src: *mut u128, order: Ordering) -> u128 { debug_assert!(src as usize % 16 == 0); debug_assert_pwr8!(); + let (out_hi, out_lo); // SAFETY: the caller must uphold the safety contract. // // Refs: Section 3.3.4 "Fixed Point Load and Store Quadword Instructions" of Power ISA 3.1C Book I unsafe { - let (out_hi, out_lo); macro_rules! 
atomic_load_acquire { ($release:tt) => { asm!( start_pwr8!(), $release, - "lq %r4, 0({src})", - // Lightweight acquire sync - // Refs: https://github.com/boostorg/atomic/blob/boost-1.79.0/include/boost/atomic/detail/core_arch_ops_gcc_ppc.hpp#L47-L62 - "cmpd %cr7, %r4, %r4", - "bne- %cr7, 2f", - "2:", - "isync", + "lq %r4, 0({src})", // atomic { r4:r5 = *src } + "cmpw %r4, %r4", // if r4 == r4 { cr0.EQ = 1 } else { cr0.EQ = 0 } + "bne- %cr0, 2f", // if unlikely(cr0.EQ == 0) { jump 'never } + "2:", // 'never: + "isync", // fence (works in combination with a branch that depends on the loaded value) end_pwr8!(), src = in(reg_nonzero) ptr_reg!(src), // Quadword atomic instructions work with even/odd pair of specified register and subsequent register. // We cannot use r1 (sp) and r2 (system reserved), so start with r4 or grater. out("r4") out_hi, out("r5") out_lo, - out("cr7") _, + out("cr0") _, options(nostack, preserves_flags), ) }; @@ -227,7 +254,7 @@ unsafe fn atomic_load_pwr8(src: *mut u128, order: Ordering) -> u128 { Ordering::Relaxed => { asm!( start_pwr8!(), - "lq %r4, 0({src})", + "lq %r4, 0({src})", // atomic { r4:r5 = *src } end_pwr8!(), src = in(reg_nonzero) ptr_reg!(src), // Quadword atomic instructions work with even/odd pair of specified register and subsequent register. @@ -306,18 +333,18 @@ unsafe fn atomic_store(dst: *mut u128, val: u128, order: Ordering) { unsafe fn atomic_store_pwr8(dst: *mut u128, val: u128, order: Ordering) { debug_assert!(dst as usize % 16 == 0); debug_assert_pwr8!(); + let val = U128 { whole: val }; // SAFETY: the caller must uphold the safety contract. // // Refs: Section 3.3.4 "Fixed Point Load and Store Quadword Instructions" of Power ISA 3.1C Book I unsafe { - let val = U128 { whole: val }; macro_rules! 
atomic_store { ($release:tt) => { asm!( start_pwr8!(), - $release, - "stq %r4, 0({dst})", + $release, // fence + "stq %r4, 0({dst})", // atomic { *dst = r4:r5 } end_pwr8!(), dst = in(reg_nonzero) ptr_reg!(dst), // Quadword atomic instructions work with even/odd pair of specified register and subsequent register. @@ -345,21 +372,84 @@ unsafe fn atomic_compare_exchange( success: Ordering, failure: Ordering, ) -> Result { - let success = crate::utils::upgrade_success_ordering(success, failure); - #[cfg(any( target_feature = "quadword-atomics", portable_atomic_target_feature = "quadword-atomics", ))] // SAFETY: the caller must uphold the safety contract. // cfg guarantees that quadword atomics instructions are available at compile-time. - let (prev, ok) = unsafe { atomic_compare_exchange_pwr8(dst, old, new, success) }; + let (prev, ok) = unsafe { atomic_compare_exchange_pwr8(dst, old, new, success, failure) }; #[cfg(not(any( target_feature = "quadword-atomics", portable_atomic_target_feature = "quadword-atomics", )))] // SAFETY: the caller must uphold the safety contract. - let (prev, ok) = unsafe { atomic_compare_exchange_ifunc(dst, old, new, success) }; + let (prev, ok) = { + fn_alias! { + // inline(never) is just a hint and also not strictly necessary + // because we use ifunc helper macro, but used for clarity. + #[inline(never)] + unsafe fn(dst: *mut u128, old: u128, new: u128) -> (u128, bool); + pwr8_relaxed_fn = atomic_compare_exchange_pwr8(Ordering::Relaxed, Ordering::Relaxed); + pwr8_acquire_fn = atomic_compare_exchange_pwr8(Ordering::Acquire, Ordering::Acquire); + pwr8_release_fn = atomic_compare_exchange_pwr8(Ordering::Release, Ordering::Relaxed); + pwr8_acqrel_fn = atomic_compare_exchange_pwr8(Ordering::AcqRel, Ordering::Acquire); + pwr8_seqcst_fn = atomic_compare_exchange_pwr8(Ordering::SeqCst, Ordering::SeqCst); + } + // SAFETY: the caller must uphold the safety contract. + // we only call pwr8_fn if quadword-atomics is available. 
+ unsafe { + let success = crate::utils::upgrade_success_ordering(success, failure); + match success { + Ordering::Relaxed => { + ifunc!(unsafe fn(dst: *mut u128, old: u128, new: u128) -> (u128, bool) { + if detect::detect().has_quadword_atomics() { + pwr8_relaxed_fn + } else { + fallback::atomic_compare_exchange_non_seqcst + } + }) + } + Ordering::Acquire => { + ifunc!(unsafe fn(dst: *mut u128, old: u128, new: u128) -> (u128, bool) { + if detect::detect().has_quadword_atomics() { + pwr8_acquire_fn + } else { + fallback::atomic_compare_exchange_non_seqcst + } + }) + } + Ordering::Release => { + ifunc!(unsafe fn(dst: *mut u128, old: u128, new: u128) -> (u128, bool) { + if detect::detect().has_quadword_atomics() { + pwr8_release_fn + } else { + fallback::atomic_compare_exchange_non_seqcst + } + }) + } + Ordering::AcqRel => { + ifunc!(unsafe fn(dst: *mut u128, old: u128, new: u128) -> (u128, bool) { + if detect::detect().has_quadword_atomics() { + pwr8_acqrel_fn + } else { + fallback::atomic_compare_exchange_non_seqcst + } + }) + } + Ordering::SeqCst => { + ifunc!(unsafe fn(dst: *mut u128, old: u128, new: u128) -> (u128, bool) { + if detect::detect().has_quadword_atomics() { + pwr8_seqcst_fn + } else { + fallback::atomic_compare_exchange_seqcst + } + }) + } + _ => unreachable!(), + } + } + }; if ok { Ok(prev) } else { @@ -371,36 +461,37 @@ unsafe fn atomic_compare_exchange_pwr8( dst: *mut u128, old: u128, new: u128, - order: Ordering, + success: Ordering, + failure: Ordering, ) -> (u128, bool) { debug_assert!(dst as usize % 16 == 0); debug_assert_pwr8!(); + let old = U128 { whole: old }; + let new = U128 { whole: new }; + let (mut prev_hi, mut prev_lo); + let mut r; // SAFETY: the caller must uphold the safety contract. // // Refs: Section 4.6.2.2 "128-bit Load And Reserve and Store Conditional Instructions" of Power ISA 3.1C Book II unsafe { - let old = U128 { whole: old }; - let new = U128 { whole: new }; - let (mut prev_hi, mut prev_lo); - let mut r; macro_rules! 
cmpxchg { - ($acquire:tt, $release:tt) => { + ($acquire_always:tt, $acquire_success:tt, $release:tt) => { asm!( start_pwr8!(), - $release, - "2:", - "lqarx %r8, 0, {dst}", - "xor {tmp_lo}, %r9, {old_lo}", - "xor {tmp_hi}, %r8, {old_hi}", - "or. {tmp_lo}, {tmp_lo}, {tmp_hi}", - "bne %cr0, 3f", // jump if compare failed - "stqcx. %r6, 0, {dst}", - "bne %cr0, 2b", // continue loop if store failed - "3:", - // if compare failed EQ bit is cleared, if stqcx succeeds EQ bit is set. - "mfcr {tmp_lo}", - $acquire, + $release, // fence + "2:", // 'retry: + "lqarx %r8, 0, {dst}", // atomic { RESERVE = (dst, 16); r8:r9 = *dst } + "xor {tmp_lo}, %r9, {old_lo}", // tmp_lo = r9 ^ old_lo + "xor {tmp_hi}, %r8, {old_hi}", // tmp_hi = r8 ^ old_hi + "or. {tmp_lo}, {tmp_lo}, {tmp_hi}", // tmp_lo |= tmp_hi; if tmp_lo == 0 { cr0.EQ = 1 } else { cr0.EQ = 0 } + "bne %cr0, 3f", // if cr0.EQ == 0 { jump 'cmp-fail } + "stqcx. %r6, 0, {dst}", // atomic { if RESERVE == (dst, 16) { *dst = r6:r7; cr0.EQ = 1 } else { cr0.EQ = 0 }; RESERVE = None } + "bne %cr0, 2b", // if cr0.EQ == 0 { jump 'retry } + $acquire_success, // fence + "3:", // 'cmp-fail: + $acquire_always, // fence + "mfcr {tmp_lo}", // tmp_lo = zero_extend(cr) end_pwr8!(), dst = in(reg_nonzero) ptr_reg!(dst), old_hi = in(reg) old.pair.hi, @@ -418,8 +509,9 @@ unsafe fn atomic_compare_exchange_pwr8( ) }; } - atomic_rmw!(cmpxchg, order); - (U128 { pair: Pair { hi: prev_hi, lo: prev_lo } }.whole, extract_cr0(r)) + atomic_cas!(cmpxchg, success, failure); + // if compare failed EQ bit is cleared, if store succeeds EQ bit is set. + (U128 { pair: Pair { hi: prev_hi, lo: prev_lo } }.whole, test_cr0_eq(r)) } } @@ -441,11 +533,9 @@ unsafe fn atomic_compare_exchange_weak( success: Ordering, failure: Ordering, ) -> Result { - let success = crate::utils::upgrade_success_ordering(success, failure); - // SAFETY: the caller must uphold the safety contract. // cfg guarantees that quadword atomics instructions are available at compile-time. 
- let (prev, ok) = unsafe { atomic_compare_exchange_weak_pwr8(dst, old, new, success) }; + let (prev, ok) = unsafe { atomic_compare_exchange_weak_pwr8(dst, old, new, success, failure) }; if ok { Ok(prev) } else { @@ -461,34 +551,35 @@ unsafe fn atomic_compare_exchange_weak_pwr8( dst: *mut u128, old: u128, new: u128, - order: Ordering, + success: Ordering, + failure: Ordering, ) -> (u128, bool) { debug_assert!(dst as usize % 16 == 0); debug_assert_pwr8!(); + let old = U128 { whole: old }; + let new = U128 { whole: new }; + let (mut prev_hi, mut prev_lo); + let mut r; // SAFETY: the caller must uphold the safety contract. // // Refs: Section 4.6.2.2 "128-bit Load And Reserve and Store Conditional Instructions" of Power ISA 3.1C Book II unsafe { - let old = U128 { whole: old }; - let new = U128 { whole: new }; - let (mut prev_hi, mut prev_lo); - let mut r; macro_rules! cmpxchg_weak { - ($acquire:tt, $release:tt) => { + ($acquire_always:tt, $acquire_success:tt, $release:tt) => { asm!( start_pwr8!(), - $release, - "lqarx %r8, 0, {dst}", - "xor {tmp_lo}, %r9, {old_lo}", - "xor {tmp_hi}, %r8, {old_hi}", - "or. {tmp_lo}, {tmp_lo}, {tmp_hi}", - "bne %cr0, 3f", // jump if compare failed - "stqcx. %r6, 0, {dst}", - "3:", - // if compare or stqcx failed EQ bit is cleared, if stqcx succeeds EQ bit is set. - "mfcr {tmp_lo}", - $acquire, + $release, // fence + "lqarx %r8, 0, {dst}", // atomic { RESERVE = (dst, 16); r8:r9 = *dst } + "xor {tmp_lo}, %r9, {old_lo}", // tmp_lo = r9 ^ old_lo + "xor {tmp_hi}, %r8, {old_hi}", // tmp_hi = r8 ^ old_hi + "or. {tmp_lo}, {tmp_lo}, {tmp_hi}", // tmp_lo |= tmp_hi; if tmp_lo == 0 { cr0.EQ = 1 } else { cr0.EQ = 0 } + "bne %cr0, 3f", // if cr0.EQ == 0 { jump 'cmp-fail } + "stqcx. 
%r6, 0, {dst}", // atomic { if RESERVE == (dst, 16) { *dst = r6:r7; cr0.EQ = 1 } else { cr0.EQ = 0 }; RESERVE = None } + $acquire_success, // fence + "3:", // 'cmp-fail: + $acquire_always, // fence + "mfcr {tmp_lo}", // tmp_lo = zero_extend(cr) end_pwr8!(), dst = in(reg_nonzero) ptr_reg!(dst), old_hi = in(reg) old.pair.hi, @@ -506,8 +597,9 @@ unsafe fn atomic_compare_exchange_weak_pwr8( ) }; } - atomic_rmw!(cmpxchg_weak, order); - (U128 { pair: Pair { hi: prev_hi, lo: prev_lo } }.whole, extract_cr0(r)) + atomic_cas!(cmpxchg_weak, success, failure); + // if compare or store failed EQ bit is cleared, if store succeeds EQ bit is set. + (U128 { pair: Pair { hi: prev_hi, lo: prev_lo } }.whole, test_cr0_eq(r)) } } @@ -516,21 +608,21 @@ unsafe fn atomic_compare_exchange_weak_pwr8( unsafe fn atomic_swap_pwr8(dst: *mut u128, val: u128, order: Ordering) -> u128 { debug_assert!(dst as usize % 16 == 0); debug_assert_pwr8!(); + let val = U128 { whole: val }; + let (mut prev_hi, mut prev_lo); // SAFETY: the caller must uphold the safety contract. unsafe { - let val = U128 { whole: val }; - let (mut prev_hi, mut prev_lo); macro_rules! swap { ($acquire:tt, $release:tt) => { asm!( start_pwr8!(), - $release, - "2:", - "lqarx %r6, 0, {dst}", - "stqcx. %r8, 0, {dst}", - "bne %cr0, 2b", - $acquire, + $release, // fence + "2:", // 'retry: + "lqarx %r6, 0, {dst}", // atomic { RESERVE = (dst, 16); r6:r7 = *dst } + "stqcx. %r8, 0, {dst}", // atomic { if RESERVE == (dst, 16) { *dst = r8:r9; cr0.EQ = 1 } else { cr0.EQ = 0 }; RESERVE = None } + "bne %cr0, 2b", // if cr0.EQ == 0 { jump 'retry } + $acquire, // fence end_pwr8!(), dst = in(reg_nonzero) ptr_reg!(dst), // Quadword atomic instructions work with even/odd pair of specified register and subsequent register. @@ -562,21 +654,22 @@ macro_rules! 
atomic_rmw_ll_sc_3 { unsafe fn $name(dst: *mut u128, val: u128, order: Ordering) -> u128 { debug_assert!(dst as usize % 16 == 0); debug_assert_pwr8!(); + let val = U128 { whole: val }; + let (mut prev_hi, mut prev_lo); + // SAFETY: the caller must uphold the safety contract. unsafe { - let val = U128 { whole: val }; - let (mut prev_hi, mut prev_lo); macro_rules! op { ($acquire:tt, $release:tt) => { asm!( start_pwr8!(), - $release, - "2:", - "lqarx %r6, 0, {dst}", + $release, // fence + "2:", // 'retry: + "lqarx %r6, 0, {dst}", // atomic { RESERVE = (dst, 16); r6:r7 = *dst } $($op)* - "stqcx. %r8, 0, {dst}", - "bne %cr0, 2b", - $acquire, + "stqcx. %r8, 0, {dst}", // atomic { if RESERVE == (dst, 16) { *dst = r8:r9; cr0.EQ = 1 } else { cr0.EQ = 0 }; RESERVE = None } + "bne %cr0, 2b", // if cr0.EQ == 0 { jump 'retry } + $acquire, // fence end_pwr8!(), dst = in(reg_nonzero) ptr_reg!(dst), val_hi = in(reg) val.pair.hi, @@ -611,20 +704,21 @@ macro_rules! atomic_rmw_ll_sc_2 { unsafe fn $name(dst: *mut u128, order: Ordering) -> u128 { debug_assert!(dst as usize % 16 == 0); debug_assert_pwr8!(); + let (mut prev_hi, mut prev_lo); + // SAFETY: the caller must uphold the safety contract. unsafe { - let (mut prev_hi, mut prev_lo); macro_rules! op { ($acquire:tt, $release:tt) => { asm!( start_pwr8!(), - $release, - "2:", - "lqarx %r6, 0, {dst}", + $release, // fence + "2:", // 'retry: + "lqarx %r6, 0, {dst}", // atomic { RESERVE = (dst, 16); r6:r7 = *dst } $($op)* - "stqcx. %r8, 0, {dst}", - "bne %cr0, 2b", - $acquire, + "stqcx. %r8, 0, {dst}", // atomic { if RESERVE == (dst, 16) { *dst = r8:r9; cr0.EQ = 1 } else { cr0.EQ = 0 }; RESERVE = None } + "bne %cr0, 2b", // if cr0.EQ == 0 { jump 'retry } + $acquire, // fence end_pwr8!(), dst = in(reg_nonzero) ptr_reg!(dst), $($reg)* @@ -648,71 +742,71 @@ macro_rules! atomic_rmw_ll_sc_2 { atomic_rmw_ll_sc_3! 
{ atomic_add_pwr8, [out("xer") _,], - "addc %r9, {val_lo}, %r7", - "adde %r8, {val_hi}, %r6", + "addc %r9, {val_lo}, %r7", // r9 = val_lo + r7; xer.CA = carry + "adde %r8, {val_hi}, %r6", // r8 = val_hi + r6 + xer.CA } atomic_rmw_ll_sc_3! { atomic_sub_pwr8, [out("xer") _,], - "subc %r9, %r7, {val_lo}", - "subfe %r8, {val_hi}, %r6", + "subc %r9, %r7, {val_lo}", // r9 = r7 - val_lo; xer.CA = borrow + "subfe %r8, {val_hi}, %r6", // r8 = r6 - val_hi - xer.CA } atomic_rmw_ll_sc_3! { atomic_and_pwr8, [], - "and %r9, {val_lo}, %r7", - "and %r8, {val_hi}, %r6", + "and %r9, {val_lo}, %r7", // r9 = val_lo & r7 + "and %r8, {val_hi}, %r6", // r8 = val_hi & r6 } atomic_rmw_ll_sc_3! { atomic_nand_pwr8, [], - "nand %r9, {val_lo}, %r7", - "nand %r8, {val_hi}, %r6", + "nand %r9, {val_lo}, %r7", // r9 = !(val_lo & r7) + "nand %r8, {val_hi}, %r6", // r8 = !(val_hi & r6) } atomic_rmw_ll_sc_3! { atomic_or_pwr8, [], - "or %r9, {val_lo}, %r7", - "or %r8, {val_hi}, %r6", + "or %r9, {val_lo}, %r7", // r9 = val_lo | r7 + "or %r8, {val_hi}, %r6", // r8 = val_hi | r6 } atomic_rmw_ll_sc_3! { atomic_xor_pwr8, [], - "xor %r9, {val_lo}, %r7", - "xor %r8, {val_hi}, %r6", + "xor %r9, {val_lo}, %r7", // r9 = val_lo ^ r7 + "xor %r8, {val_hi}, %r6", // r8 = val_hi ^ r6 } atomic_rmw_ll_sc_3! 
{ atomic_max_pwr8, [out("cr1") _,], - "cmpld %r7, {val_lo}", // (unsigned) compare lo 64-bit, store result to cr0 - "iselgt %r9, %r7, {val_lo}", // select lo 64-bit based on GT bit in cr0 - "cmpd %cr1, %r6, {val_hi}", // (signed) compare hi 64-bit, store result to cr1 - "isel %r8, %r7, {val_lo}, 5", // select lo 64-bit based on GT bit in cr1 - "cmpld %r6, {val_hi}", // (unsigned) compare hi 64-bit, store result to cr0 - "iseleq %r9, %r9, %r8", // select lo 64-bit based on EQ bit in cr0 - "isel %r8, %r6, {val_hi}, 5", // select hi 64-bit based on GT bit in cr1 + "cmpld %r7, {val_lo}", // if r7(u) < val_lo(u) { cr0 = { LT: 1, ..0 } } else if r7(u) > val_lo(u) { cr0 = { GT: 1, ..0 } } else { cr0 = { EQ: 1, ..0 } } + "iselgt %r9, %r7, {val_lo}", // if cr0.GT == 1 { r9 = r7 } else { r9 = val_lo } + "cmpd %cr1, %r6, {val_hi}", // if r6(i) < val_hi(i) { cr1 = { LT: 1, ..0 } } else if r6(i) > val_hi(i) { cr1 = { GT: 1, ..0 } } else { cr1 = { EQ: 1, ..0 } } + "isel %r8, %r7, {val_lo}, 5", // if cr1.GT == 1 { r8 = r7 } else { r8 = val_lo } + "cmpld %r6, {val_hi}", // if r6(u) < val_hi(u) { cr0 = { LT: 1, ..0 } } else if r6(u) > val_hi(u) { cr0 = { GT: 1, ..0 } } else { cr0 = { EQ: 1, ..0 } } + "iseleq %r9, %r9, %r8", // if cr0.EQ == 1 { r9 = r9 } else { r9 = r8 } + "isel %r8, %r6, {val_hi}, 5", // if cr1.GT == 1 { r8 = r6 } else { r8 = val_hi } } atomic_rmw_ll_sc_3! 
{ atomic_umax_pwr8, [], - "cmpld %r7, {val_lo}", // compare lo 64-bit, store result to cr0 - "iselgt %r9, %r7, {val_lo}", // select lo 64-bit based on GT bit in cr0 - "cmpld %r6, {val_hi}", // compare hi 64-bit, store result to cr0 - "iselgt %r8, %r7, {val_lo}", // select lo 64-bit based on GT bit in cr0 - "iseleq %r9, %r9, %r8", // select lo 64-bit based on EQ bit in cr0 - "iselgt %r8, %r6, {val_hi}", // select hi 64-bit based on GT bit in cr0 + "cmpld %r7, {val_lo}", // if r7(u) < val_lo(u) { cr0 = { LT: 1, ..0 } } else if r7(u) > val_lo(u) { cr0 = { GT: 1, ..0 } } else { cr0 = { EQ: 1, ..0 } } + "iselgt %r9, %r7, {val_lo}", // if cr0.GT == 1 { r9 = r7 } else { r9 = val_lo } + "cmpld %r6, {val_hi}", // if r6(u) < val_hi(u) { cr0 = { LT: 1, ..0 } } else if r6(u) > val_hi(u) { cr0 = { GT: 1, ..0 } } else { cr0 = { EQ: 1, ..0 } } + "iselgt %r8, %r7, {val_lo}", // if cr0.GT == 1 { r8 = r7 } else { r8 = val_lo } + "iseleq %r9, %r9, %r8", // if cr0.EQ == 1 { r9 = r9 } else { r9 = r8 } + "iselgt %r8, %r6, {val_hi}", // if cr0.GT == 1 { r8 = r6 } else { r8 = val_hi } } atomic_rmw_ll_sc_3! 
{ atomic_min_pwr8, [out("cr1") _,], - "cmpld %r7, {val_lo}", // (unsigned) compare lo 64-bit, store result to cr0 - "isellt %r9, %r7, {val_lo}", // select lo 64-bit based on LT bit in cr0 - "cmpd %cr1, %r6, {val_hi}", // (signed) compare hi 64-bit, store result to cr1 - "isel %r8, %r7, {val_lo}, 4", // select lo 64-bit based on LT bit in cr1 - "cmpld %r6, {val_hi}", // (unsigned) compare hi 64-bit, store result to cr0 - "iseleq %r9, %r9, %r8", // select lo 64-bit based on EQ bit in cr0 - "isel %r8, %r6, {val_hi}, 4", // select hi 64-bit based on LT bit in cr1 + "cmpld %r7, {val_lo}", // if r7(u) < val_lo(u) { cr0 = { LT: 1, ..0 } } else if r7(u) > val_lo(u) { cr0 = { GT: 1, ..0 } } else { cr0 = { EQ: 1, ..0 } } + "isellt %r9, %r7, {val_lo}", // if cr0.LT == 1 { r9 = r7 } else { r9 = val_lo } + "cmpd %cr1, %r6, {val_hi}", // if r6(i) < val_hi(i) { cr1 = { LT: 1, ..0 } } else if r6(i) > val_hi(i) { cr1 = { GT: 1, ..0 } } else { cr1 = { EQ: 1, ..0 } } + "isel %r8, %r7, {val_lo}, 4", // if cr1.LT == 1 { r8 = r7 } else { r8 = val_lo } + "cmpld %r6, {val_hi}", // if r6(u) < val_hi(u) { cr0 = { LT: 1, ..0 } } else if r6(u) > val_hi(u) { cr0 = { GT: 1, ..0 } } else { cr0 = { EQ: 1, ..0 } } + "iseleq %r9, %r9, %r8", // if cr0.EQ == 1 { r9 = r9 } else { r9 = r8 } + "isel %r8, %r6, {val_hi}, 4", // if cr1.LT == 1 { r8 = r6 } else { r8 = val_hi } } atomic_rmw_ll_sc_3! 
{ atomic_umin_pwr8, [], - "cmpld %r7, {val_lo}", // compare lo 64-bit, store result to cr0 - "isellt %r9, %r7, {val_lo}", // select lo 64-bit based on LT bit in cr0 - "cmpld %r6, {val_hi}", // compare hi 64-bit, store result to cr0 - "isellt %r8, %r7, {val_lo}", // select lo 64-bit based on LT bit in cr0 - "iseleq %r9, %r9, %r8", // select lo 64-bit based on EQ bit in cr0 - "isellt %r8, %r6, {val_hi}", // select hi 64-bit based on LT bit in cr0 + "cmpld %r7, {val_lo}", // if r7(u) < val_lo(u) { cr0 = { LT: 1, ..0 } } else if r7(u) > val_lo(u) { cr0 = { GT: 1, ..0 } } else { cr0 = { EQ: 1, ..0 } } + "isellt %r9, %r7, {val_lo}", // if cr0.LT == 1 { r9 = r7 } else { r9 = val_lo } + "cmpld %r6, {val_hi}", // if r6(u) < val_hi(u) { cr0 = { LT: 1, ..0 } } else if r6(u) > val_hi(u) { cr0 = { GT: 1, ..0 } } else { cr0 = { EQ: 1, ..0 } } + "isellt %r8, %r7, {val_lo}", // if cr0.LT == 1 { r8 = r7 } else { r8 = val_lo } + "iseleq %r9, %r9, %r8", // if cr0.EQ == 1 { r9 = r9 } else { r9 = r8 } + "isellt %r8, %r6, {val_hi}", // if cr0.LT == 1 { r8 = r6 } else { r8 = val_hi } } #[inline] @@ -724,15 +818,15 @@ unsafe fn atomic_not_pwr8(dst: *mut u128, order: Ordering) -> u128 { #[cfg(not(portable_atomic_pre_llvm_16))] atomic_rmw_ll_sc_2! { atomic_neg_pwr8, [out("xer") _,], - "subfic %r9, %r7, 0", - "subfze %r8, %r6", + "subfic %r9, %r7, 0", // r9 = 0 - r7; xer.CA = borrow + "subfze %r8, %r6", // r8 = 0 - r6 - xer.CA } // LLVM 15 miscompiles subfic. #[cfg(portable_atomic_pre_llvm_16)] atomic_rmw_ll_sc_2! { atomic_neg_pwr8, [zero = in(reg) 0_u64, out("xer") _,], - "subc %r9, {zero}, %r7", - "subfze %r8, %r6", + "subc %r9, {zero}, %r7", // r9 = 0 - r7; xer.CA = borrow + "subfze %r8, %r6", // r8 = 0 - r6 - xer.CA } macro_rules! select_atomic_rmw { @@ -822,16 +916,6 @@ macro_rules! select_atomic_rmw { }; } -#[cfg(not(any( - target_feature = "quadword-atomics", - portable_atomic_target_feature = "quadword-atomics", -)))] -select_atomic_rmw! 
{ - unsafe fn atomic_compare_exchange_ifunc(dst: *mut u128, old: u128, new: u128) -> (u128, bool); - pwr8 = atomic_compare_exchange_pwr8; - non_seqcst_fallback = atomic_compare_exchange_non_seqcst; - seqcst_fallback = atomic_compare_exchange_seqcst; -} select_atomic_rmw! { unsafe fn atomic_swap(dst: *mut u128, val: u128) -> u128; pwr8 = atomic_swap_pwr8; diff --git a/src/imp/atomic128/riscv64.rs b/src/imp/atomic128/riscv64.rs index 74fda982..cc33818b 100644 --- a/src/imp/atomic128/riscv64.rs +++ b/src/imp/atomic128/riscv64.rs @@ -1,10 +1,18 @@ // SPDX-License-Identifier: Apache-2.0 OR MIT /* -128-bit atomic implementation on riscv64 using amocas.q (DWCAS). +128-bit atomic implementation on riscv64. + +This architecture provides the following 128-bit atomic instructions: + +- amocas.q: CAS (Zacas extension) + +See "Atomic operation overview by architecture" in atomic-maybe-uninit for a more comprehensive and +detailed description of the atomic and synchronize instructions in this architecture: +https://github.com/taiki-e/atomic-maybe-uninit/blob/HEAD/src/arch/README.md#risc-v Note: On Miri and ThreadSanitizer which do not support inline assembly, we don't use -this module and use intrinsics.rs instead. +this module and use fallback implementation instead. Refs: - RISC-V Instruction Set Manual @@ -167,18 +175,18 @@ unsafe fn atomic_load(src: *mut u128, order: Ordering) -> u128 { unsafe fn atomic_load_zacas(src: *mut u128, order: Ordering) -> u128 { debug_assert!(src as usize % 16 == 0); debug_assert_zacas!(); + let (out_lo, out_hi); // SAFETY: the caller must uphold the safety contract. unsafe { - let (out_lo, out_hi); // LLVM doesn't support `.option arch, +zabha` directive as of LLVM 19 because it is experimental. // So, we currently always using .4byte directive. // macro_rules! 
load { // ($fence:tt, $asm_order:tt) => { // asm!( // start_zacas!(), - // $fence, - // concat!("amocas.q", $asm_order, " a2, a2, 0({src})"), + // $fence, // fence + // concat!("amocas.q", $asm_order, " a2, a2, 0({src})"), // atomic { if *dst == a2:a3 { *dst = a2:a3 } else { a2:a3 = *dst } } // end_zacas!(), // src = in(reg) ptr_reg!(src), // inout("a2") 0_u64 => out_lo, @@ -191,8 +199,8 @@ unsafe fn atomic_load_zacas(src: *mut u128, order: Ordering) -> u128 { macro_rules! load { ($fence:tt, $insn_order:tt) => { asm!( - $fence, - // 4: 2{8,c,a,e}c5462f amocas.q{,.aq,.rl,.aqrl} a2, a2, (a0) + $fence, // fence + // amocas.q{,.aq,.rl,.aqrl} a2, a2, (a0) // atomic { if *a0 == a2:a3 { *a0 = a2:a3 } else { a2:a3 = *a0 } } concat!(".4byte 0x2", $insn_order, "c5462f"), in("a0") ptr_reg!(src), inout("a2") 0_u64 => out_lo, @@ -316,20 +324,20 @@ unsafe fn atomic_compare_exchange_zacas( debug_assert!(dst as usize % 16 == 0); debug_assert_zacas!(); let order = crate::utils::upgrade_success_ordering(success, failure); + let old = U128 { whole: old }; + let new = U128 { whole: new }; + let (prev_lo, prev_hi); // SAFETY: the caller must uphold the safety contract. - let prev = unsafe { - let old = U128 { whole: old }; - let new = U128 { whole: new }; - let (prev_lo, prev_hi); + unsafe { // LLVM doesn't support `.option arch, +zabha` directive as of LLVM 19 because it is experimental. // So, we currently always using .4byte directive. // macro_rules! cmpxchg { // ($fence:tt, $asm_order:tt) => { // asm!( // start_zacas!(), - // $fence, - // concat!("amocas.q", $asm_order, " a4, a2, 0({dst})"), + // $fence, // fence + // concat!("amocas.q", $asm_order, " a4, a2, 0({dst})"), // atomic { if *dst == a4:a5 { *dst = a2:a3 } else { a4:a5 = *dst } } // end_zacas!(), // dst = in(reg) ptr_reg!(dst), // // must be allocated to even/odd register pair @@ -346,8 +354,8 @@ unsafe fn atomic_compare_exchange_zacas( macro_rules! 
cmpxchg { ($fence:tt, $insn_order:tt) => { asm!( - $fence, - // c: 2{8,c,a,e}c5472f amocas.q{,.aq,.rl,.aqrl} a4, a2, (a0) + $fence, // fence + // amocas.q{,.aq,.rl,.aqrl} a4, a2, (a0) // atomic { if *a0 == a4:a5 { *a0 = a2:a3 } else { a4:a5 = *a0 } } concat!(".4byte 0x2", $insn_order, "c5472f"), in("a0") ptr_reg!(dst), // must be allocated to even/odd register pair @@ -361,23 +369,24 @@ unsafe fn atomic_compare_exchange_zacas( }; } atomic_rmw_amocas_order_insn!(cmpxchg, order, failure = failure); - U128 { pair: Pair { lo: prev_lo, hi: prev_hi } }.whole - }; - (prev, prev == old) + let prev = U128 { pair: Pair { lo: prev_lo, hi: prev_hi } }.whole; + (prev, prev == old.whole) + } } // amocas is always strong. use self::atomic_compare_exchange as atomic_compare_exchange_weak; -// 128-bit atomic load by two 64-bit atomic loads. (see arm_linux.rs for more) +// 128-bit atomic load by two 64-bit atomic loads. #[inline] unsafe fn byte_wise_atomic_load(src: *const u128) -> u128 { + let (out_lo, out_hi); + // SAFETY: the caller must uphold the safety contract. unsafe { - let (out_lo, out_hi); asm!( - "ld {out_lo}, ({src})", - "ld {out_hi}, 8({src})", + "ld {out_lo}, ({src})", // atomic { out_lo = *src } + "ld {out_hi}, 8({src})", // atomic { out_hi = *src.add(8) } src = in(reg) ptr_reg!(src), out_lo = out(reg) out_lo, out_hi = out(reg) out_hi, @@ -387,28 +396,10 @@ unsafe fn byte_wise_atomic_load(src: *const u128) -> u128 { } } -#[inline(always)] -unsafe fn atomic_update_zacas(dst: *mut u128, order: Ordering, mut f: F) -> u128 -where - F: FnMut(u128) -> u128, -{ - // SAFETY: the caller must uphold the safety contract. - unsafe { - let mut prev = byte_wise_atomic_load(dst); - loop { - let next = f(prev); - match atomic_compare_exchange_weak(dst, prev, next, order, Ordering::Relaxed) { - Ok(x) => return x, - Err(x) => prev = x, - } - } - } -} - macro_rules! select_atomic_rmw { ( unsafe fn $name:ident(dst: *mut u128 $(, $($arg:tt)*)?) $(-> $ret_ty:ty)? 
{ - $($zacas_fn_body:tt)* + |$zacas_fn_binding:ident| $($zacas_fn_body:tt)* } zacas = $zacas_fn:ident; non_seqcst_fallback = $non_seqcst_fallback_fn:ident; @@ -417,7 +408,26 @@ macro_rules! select_atomic_rmw { #[inline] unsafe fn $zacas_fn(dst: *mut u128 $(, $($arg)*)?, order: Ordering) $(-> $ret_ty)? { // SAFETY: the caller must uphold the safety contract. - unsafe { atomic_update_zacas(dst, order, $($zacas_fn_body)*) } + unsafe { + // This is not single-copy atomic reads, but this is ok because subsequent + // CAS will check for consistency. + // + // Note that the C++20 memory model does not allow mixed-sized atomic access, + // so we must use inline assembly to implement byte_wise_atomic_load. + // (i.e., byte-wise atomic based on the standard library's atomic types + // cannot be used here). + let mut prev = byte_wise_atomic_load(dst); + loop { + let next = { + let $zacas_fn_binding = prev; + $($zacas_fn_body)* + }; + match atomic_compare_exchange_weak(dst, prev, next, order, Ordering::Relaxed) { + Ok(x) => return x, + Err(x) => prev = x, + } + } + } } // If zacas is available at compile-time, we can always use zacas_fn. #[cfg(any( @@ -501,7 +511,7 @@ macro_rules! select_atomic_rmw { select_atomic_rmw! { unsafe fn atomic_swap(dst: *mut u128, val: u128) -> u128 { - |_| val + |_x| val } zacas = atomic_swap_zacas; non_seqcst_fallback = atomic_swap_non_seqcst; @@ -603,7 +613,7 @@ select_atomic_rmw! { } select_atomic_rmw! { unsafe fn atomic_neg(dst: *mut u128) -> u128 { - u128::wrapping_neg + |x| x.wrapping_neg() } zacas = atomic_neg_zacas; non_seqcst_fallback = atomic_neg_non_seqcst; diff --git a/src/imp/atomic128/s390x.rs b/src/imp/atomic128/s390x.rs index d77db40f..9ff9adda 100644 --- a/src/imp/atomic128/s390x.rs +++ b/src/imp/atomic128/s390x.rs @@ -3,13 +3,16 @@ /* 128-bit atomic implementation on s390x. -s390x has 128-bit atomic load/store/CAS instructions and other operations are emulated by CAS loop. 
-https://github.com/llvm/llvm-project/commit/a11f63a952664f700f076fd754476a2b9eb158cc -https://github.com/llvm/llvm-project/commit/c568927f3e2e7d9804ea74ecbf11c16c014ddcbc +This architecture provides the following 128-bit atomic instructions: -128-bit atomic instructions (lpq,stpq,cdsg) has been present since -[the First Edition of the Principles of Operation](https://publibfp.dhe.ibm.com/epubs/pdf/dz9zr000.pdf). -LLVM's minimal supported architecture level is z10 (the Eighth Edition of the PoP): +- LPQ/STPQ: load/store (arch1 or later, i.e., baseline) +- CDSG: CAS (arch1 or later, i.e., baseline) + +See "Atomic operation overview by architecture" in atomic-maybe-uninit for a more comprehensive and +detailed description of the atomic and synchronize instructions in this architecture: +https://github.com/taiki-e/atomic-maybe-uninit/blob/HEAD/src/arch/README.md#s390x + +LLVM's minimal supported architecture level is arch8 (z10): https://github.com/llvm/llvm-project/blob/llvmorg-19.1.0/llvm/lib/Target/SystemZ/SystemZProcessors.td#L16-L17 This does not appear to have changed since the current s390x backend was added in LLVM 3.3: https://github.com/llvm/llvm-project/commit/5f613dfd1f7edb0ae95d521b7107b582d9df5103#diff-cbaef692b3958312e80fd5507a7e2aff071f1acb086f10e8a96bc06a7bb289db @@ -18,13 +21,16 @@ Note: On Miri and ThreadSanitizer which do not support inline assembly, we don't this module and use intrinsics.rs instead. 
Refs: -- z/Architecture Principles of Operation https://publibfp.dhe.ibm.com/epubs/pdf/a227832d.pdf -- atomic-maybe-uninit https://github.com/taiki-e/atomic-maybe-uninit +- z/Architecture Principles of Operation, Fourteenth Edition (SA22-7832-13) + https://publibfp.dhe.ibm.com/epubs/pdf/a227832d.pdf +- atomic-maybe-uninit + https://github.com/taiki-e/atomic-maybe-uninit Generated asm: -- s390x https://godbolt.org/z/osTYK1Mfz -- s390x (z196) https://godbolt.org/z/K71PKbnPT -- s390x (z15) https://godbolt.org/z/dfP1YKc1d +- s390x https://godbolt.org/z/oPxYYEvPG +- s390x (z196) https://godbolt.org/z/M69KrKT7Y +- s390x (z15,-vector) https://godbolt.org/z/Wec8b3ada +- s390x (z15) https://godbolt.org/z/KxWcrbfYh */ include!("macros.rs"); @@ -33,6 +39,26 @@ use core::{arch::asm, sync::atomic::Ordering}; use crate::utils::{Pair, U128}; +// bcr 14,0 requires fast-BCR-serialization facility added in arch9 (z196). +#[cfg(any( + target_feature = "fast-serialization", + portable_atomic_target_feature = "fast-serialization", +))] +macro_rules! serialization { + () => { + "bcr 14, 0" + }; +} +#[cfg(not(any( + target_feature = "fast-serialization", + portable_atomic_target_feature = "fast-serialization", +)))] +macro_rules! serialization { + () => { + "bcr 15, 0" + }; +} + // Use distinct operands on z196 or later, otherwise split to lgr and $op. #[cfg(any(target_feature = "distinct-ops", portable_atomic_target_feature = "distinct-ops"))] macro_rules! distinct_op { @@ -76,7 +102,7 @@ macro_rules! select_op { } // Extracts and checks condition code. -#[inline(always)] +#[inline] fn extract_cc(r: i64) -> bool { r.wrapping_add(-268435456) & (1 << 31) != 0 } @@ -84,13 +110,13 @@ fn extract_cc(r: i64) -> bool { #[inline] unsafe fn atomic_load(src: *mut u128, _order: Ordering) -> u128 { debug_assert!(src as usize % 16 == 0); + let (out_hi, out_lo); // SAFETY: the caller must uphold the safety contract. unsafe { // atomic load is always SeqCst. 
- let (out_hi, out_lo); asm!( - "lpq %r0, 0({src})", + "lpq %r0, 0({src})", // atomic { r0:r1 = *src } src = in(reg) ptr_reg!(src), // Quadword atomic instructions work with even/odd pair of specified register and subsequent register. out("r0") out_hi, @@ -104,15 +130,15 @@ unsafe fn atomic_load(src: *mut u128, _order: Ordering) -> u128 { #[inline] unsafe fn atomic_store(dst: *mut u128, val: u128, order: Ordering) { debug_assert!(dst as usize % 16 == 0); + let val = U128 { whole: val }; // SAFETY: the caller must uphold the safety contract. unsafe { - let val = U128 { whole: val }; macro_rules! atomic_store { - ($fence:tt) => { + ($acquire:expr) => { asm!( - "stpq %r0, 0({dst})", - $fence, + "stpq %r0, 0({dst})", // atomic { *dst = r0:r1 } + $acquire, // fence dst = in(reg) ptr_reg!(dst), // Quadword atomic instructions work with even/odd pair of specified register and subsequent register. in("r0") val.pair.hi, @@ -124,17 +150,7 @@ unsafe fn atomic_store(dst: *mut u128, val: u128, order: Ordering) { match order { // Relaxed and Release stores are equivalent. Ordering::Relaxed | Ordering::Release => atomic_store!(""), - // bcr 14,0 (fast-BCR-serialization) requires z196 or later. - #[cfg(any( - target_feature = "fast-serialization", - portable_atomic_target_feature = "fast-serialization", - ))] - Ordering::SeqCst => atomic_store!("bcr 14, 0"), - #[cfg(not(any( - target_feature = "fast-serialization", - portable_atomic_target_feature = "fast-serialization", - )))] - Ordering::SeqCst => atomic_store!("bcr 15, 0"), + Ordering::SeqCst => atomic_store!(serialization!()), _ => unreachable!(), } } @@ -149,17 +165,17 @@ unsafe fn atomic_compare_exchange( _failure: Ordering, ) -> Result { debug_assert!(dst as usize % 16 == 0); - + let old = U128 { whole: old }; + let new = U128 { whole: new }; + let (prev_hi, prev_lo); let r; + // SAFETY: the caller must uphold the safety contract. let prev = unsafe { // atomic CAS is always SeqCst. 
- let old = U128 { whole: old }; - let new = U128 { whole: new }; - let (prev_hi, prev_lo); asm!( - "cdsg %r0, %r12, 0({dst})", - "ipm {r}", + "cdsg %r0, %r12, 0({dst})", // atomic { if *dst == r0:r1 { cc = 0; *dst = r12:r13 } else { cc = 1; r0:r1 = *dst } } + "ipm {r}", // r[:] = cc dst = in(reg) ptr_reg!(dst), r = lateout(reg) r, // Quadword atomic instructions work with even/odd pair of specified register and subsequent register. @@ -182,6 +198,28 @@ unsafe fn atomic_compare_exchange( // cdsg is always strong. use self::atomic_compare_exchange as atomic_compare_exchange_weak; +// 128-bit atomic load by two 64-bit atomic loads. +#[cfg(not(any( + target_feature = "load-store-on-cond", + portable_atomic_target_feature = "load-store-on-cond", +)))] +#[inline] +unsafe fn byte_wise_atomic_load(src: *const u128) -> u128 { + // SAFETY: the caller must uphold the safety contract. + unsafe { + let (out_hi, out_lo); + asm!( + "lg {out_hi}, 8({src})", // atomic { out_hi = *src.add(8) } + "lg {out_lo}, 0({src})", // atomic { out_lo = *src } + src = in(reg) src, + out_hi = out(reg) out_hi, + out_lo = out(reg) out_lo, + options(pure, nostack, preserves_flags, readonly), + ); + U128 { pair: Pair { hi: out_hi, lo: out_lo } }.whole + } +} + #[cfg(not(any( target_feature = "load-store-on-cond", portable_atomic_target_feature = "load-store-on-cond", @@ -193,9 +231,14 @@ where { // SAFETY: the caller must uphold the safety contract. unsafe { - // This is a private function and all instances of `f` only operate on the value - // loaded, so there is no need to synchronize the first load/failed CAS. - let mut prev = atomic_load(dst, Ordering::Relaxed); + // This is not single-copy atomic reads, but this is ok because subsequent + // CAS will check for consistency. + // + // Note that the C++20 memory model does not allow mixed-sized atomic access, + // so we must use inline assembly to implement byte_wise_atomic_load. 
+ // (i.e., byte-wise atomic based on the standard library's atomic types + // cannot be used here). + let mut prev = byte_wise_atomic_load(dst); loop { let next = f(prev); match atomic_compare_exchange_weak(dst, prev, next, order, Ordering::Relaxed) { @@ -209,6 +252,8 @@ where #[inline] unsafe fn atomic_swap(dst: *mut u128, val: u128, _order: Ordering) -> u128 { debug_assert!(dst as usize % 16 == 0); + let val = U128 { whole: val }; + let (mut prev_hi, mut prev_lo); // SAFETY: the caller must uphold the safety contract. // @@ -218,13 +263,12 @@ unsafe fn atomic_swap(dst: *mut u128, val: u128, _order: Ordering) -> u128 { // Do not use atomic_rmw_cas_3 because it needs extra LGR to implement swap. unsafe { // atomic swap is always SeqCst. - let val = U128 { whole: val }; - let (mut prev_hi, mut prev_lo); asm!( - "lpq %r0, 0({dst})", - "2:", - "cdsg %r0, %r12, 0({dst})", - "jl 2b", + "lg %r0, 8({dst})", // atomic { r0 = *dst.add(8) } + "lg %r1, 0({dst})", // atomic { r1 = *dst } + "2:", // 'retry: + "cdsg %r0, %r12, 0({dst})", // atomic { if *dst == r0:r1 { cc = 0; *dst = r12:r13 } else { cc = 1; r0:r1 = *dst } } + "jl 2b", // if cc == 1 { jump 'retry } dst = in(reg) ptr_reg!(dst), // Quadword atomic instructions work with even/odd pair of specified register and subsequent register. out("r0") prev_hi, @@ -252,17 +296,19 @@ macro_rules! atomic_rmw_cas_3 { #[inline] unsafe fn $name(dst: *mut u128, val: u128, _order: Ordering) -> u128 { debug_assert!(dst as usize % 16 == 0); + let val = U128 { whole: val }; + let (mut prev_hi, mut prev_lo); + // SAFETY: the caller must uphold the safety contract. unsafe { // atomic RMW is always SeqCst. 
- let val = U128 { whole: val }; - let (mut prev_hi, mut prev_lo); asm!( - "lpq %r0, 0({dst})", - "2:", + "lg %r0, 8({dst})", // atomic { r0 = *dst.add(8) } + "lg %r1, 0({dst})", // atomic { r1 = *dst } + "2:", // 'retry: $($op)* - "cdsg %r0, %r12, 0({dst})", - "jl 2b", + "cdsg %r0, %r12, 0({dst})", // atomic { if *dst == r0:r1 { cc = 0; *dst = r12:r13 } else { cc = 1; r0:r1 = *dst } } + "jl 2b", // if cc == 1 { jump 'retry } dst = in(reg) ptr_reg!(dst), val_hi = in(reg) val.pair.hi, val_lo = in(reg) val.pair.lo, @@ -293,16 +339,18 @@ macro_rules! atomic_rmw_cas_2 { #[inline] unsafe fn $name(dst: *mut u128, _order: Ordering) -> u128 { debug_assert!(dst as usize % 16 == 0); + let (mut prev_hi, mut prev_lo); + // SAFETY: the caller must uphold the safety contract. unsafe { // atomic RMW is always SeqCst. - let (mut prev_hi, mut prev_lo); asm!( - "lpq %r0, 0({dst})", - "2:", + "lg %r0, 8({dst})", // atomic { r0 = *dst.add(8) } + "lg %r1, 0({dst})", // atomic { r1 = *dst } + "2:", // 'retry: $($op)* - "cdsg %r0, %r12, 0({dst})", - "jl 2b", + "cdsg %r0, %r12, 0({dst})", // atomic { if *dst == r0:r1 { cc = 0; *dst = r12:r13 } else { cc = 1; r0:r1 = *dst } } + "jl 2b", // if cc == 1 { jump 'retry } dst = in(reg) ptr_reg!(dst), $($reg)* // Quadword atomic instructions work with even/odd pair of specified register and subsequent register. @@ -321,20 +369,20 @@ macro_rules! atomic_rmw_cas_2 { atomic_rmw_cas_3! { atomic_add, [], - distinct_op!("algr", "%r13", "%r1", "{val_lo}"), - "lgr %r12, %r0", - "alcgr %r12, {val_hi}", + distinct_op!("algr", "%r13", "%r1", "{val_lo}"), // r13 = r1 + val_lo; cc = zero | carry + "lgr %r12, %r0", // r12 = r0 + "alcgr %r12, {val_hi}", // r12 += val_hi + carry } atomic_rmw_cas_3! 
{ atomic_sub, [], - distinct_op!("slgr", "%r13", "%r1", "{val_lo}"), - "lgr %r12, %r0", - "slbgr %r12, {val_hi}", + distinct_op!("slgr", "%r13", "%r1", "{val_lo}"), // r13 = r1 - val_lo; cc = zero | borrow + "lgr %r12, %r0", // r12 = r0 + "slbgr %r12, {val_hi}", // r12 -= val_hi + borrow } atomic_rmw_cas_3! { atomic_and, [], - distinct_op!("ngr", "%r13", "%r1", "{val_lo}"), - distinct_op!("ngr", "%r12", "%r0", "{val_hi}"), + distinct_op!("ngr", "%r13", "%r1", "{val_lo}"), // r13 = r1 & val_lo + distinct_op!("ngr", "%r12", "%r0", "{val_hi}"), // r12 = r0 & val_hi } // Use nngrk on z15 or later. @@ -344,8 +392,8 @@ atomic_rmw_cas_3! { ))] atomic_rmw_cas_3! { atomic_nand, [], - "nngrk %r13, %r1, {val_lo}", - "nngrk %r12, %r0, {val_hi}", + "nngrk %r13, %r1, {val_lo}", // r13 = !(r1 & val_lo) + "nngrk %r12, %r0, {val_hi}", // r12 = !(r0 & val_hi) } #[cfg(not(any( target_feature = "miscellaneous-extensions-3", @@ -353,23 +401,23 @@ atomic_rmw_cas_3! { )))] atomic_rmw_cas_3! { atomic_nand, [], - distinct_op!("ngr", "%r13", "%r1", "{val_lo}"), - distinct_op!("ngr", "%r12", "%r0", "{val_hi}"), - "lcgr %r13, %r13", - "aghi %r13, -1", - "lcgr %r12, %r12", - "aghi %r12, -1", + distinct_op!("ngr", "%r13", "%r1", "{val_lo}"), // r13 = r1 & val_lo + distinct_op!("ngr", "%r12", "%r0", "{val_hi}"), // r12 = r0 & val_hi + "lcgr %r13, %r13", // r13 = !r13 + 1 + "aghi %r13, -1", // r13 -= 1 + "lcgr %r12, %r12", // r12 = !r12 + 1 + "aghi %r12, -1", // r12 -= 1 } atomic_rmw_cas_3! { atomic_or, [], - distinct_op!("ogr", "%r13", "%r1", "{val_lo}"), - distinct_op!("ogr", "%r12", "%r0", "{val_hi}"), + distinct_op!("ogr", "%r13", "%r1", "{val_lo}"), // r13 = r1 | val_lo + distinct_op!("ogr", "%r12", "%r0", "{val_hi}"), // r12 = r0 | val_hi } atomic_rmw_cas_3! 
{ atomic_xor, [], - distinct_op!("xgr", "%r13", "%r1", "{val_lo}"), - distinct_op!("xgr", "%r12", "%r0", "{val_hi}"), + distinct_op!("xgr", "%r13", "%r1", "{val_lo}"), // r13 = r1 ^ val_lo + distinct_op!("xgr", "%r12", "%r0", "{val_hi}"), // r12 = r0 ^ val_hi } #[cfg(any( @@ -378,12 +426,12 @@ atomic_rmw_cas_3! { ))] atomic_rmw_cas_3! { atomic_max, [], - "clgr %r1, {val_lo}", - select_op!("h", "%r12", "%r1", "{val_lo}"), - "cgr %r0, {val_hi}", - select_op!("h", "%r13", "%r1", "{val_lo}"), - "locgre %r13, %r12", - select_op!("h", "%r12", "%r0", "{val_hi}"), + "clgr %r1, {val_lo}", // if r1(u) < val_lo(u) { cc = 1 } else if r1(u) > val_lo(u) { cc = 2 } else { cc = 0 } + select_op!("h", "%r12", "%r1", "{val_lo}"), // if cc == 2 { r12 = r1 } else { r12 = val_lo } + "cgr %r0, {val_hi}", // if r0(i) < val_hi(i) { cc = 1 } else if r0(i) > val_hi(i) { cc = 2 } else { cc = 0 } + select_op!("h", "%r13", "%r1", "{val_lo}"), // if cc == 2 { r13 = r1 } else { r13 = val_lo } + "locgre %r13, %r12", // if cc == 0 { r13 = r12 } + select_op!("h", "%r12", "%r0", "{val_hi}"), // if cc == 2 { r12 = r0 } else { r12 = val_hi } } #[cfg(any( target_feature = "load-store-on-cond", @@ -391,13 +439,13 @@ atomic_rmw_cas_3! { ))] atomic_rmw_cas_3! 
{ atomic_umax, [tmp = out(reg) _,], - "clgr %r1, {val_lo}", - select_op!("h", "{tmp}", "%r1", "{val_lo}"), - "clgr %r0, {val_hi}", - select_op!("h", "%r12", "%r0", "{val_hi}"), - select_op!("h", "%r13", "%r1", "{val_lo}"), - "cgr %r0, {val_hi}", - "locgre %r13, {tmp}", + "clgr %r1, {val_lo}", // if r1(u) < val_lo(u) { cc = 1 } else if r1(u) > val_lo(u) { cc = 2 } else { cc = 0 } + select_op!("h", "{tmp}", "%r1", "{val_lo}"), // if cc == 2 { tmp = r1 } else { tmp = val_lo } + "clgr %r0, {val_hi}", // if r0(u) < val_hi(u) { cc = 1 } else if r0(u) > val_hi(u) { cc = 2 } else { cc = 0 } + select_op!("h", "%r12", "%r0", "{val_hi}"), // if cc == 2 { r12 = r0 } else { r12 = val_hi } + select_op!("h", "%r13", "%r1", "{val_lo}"), // if cc == 2 { r13 = r1 } else { r13 = val_lo } + "cgr %r0, {val_hi}", // if r0(i) < val_hi(i) { cc = 1 } else if r0(i) > val_hi(i) { cc = 2 } else { cc = 0 } + "locgre %r13, {tmp}", // if cc == 0 { r13 = tmp } } #[cfg(any( target_feature = "load-store-on-cond", @@ -405,12 +453,12 @@ atomic_rmw_cas_3! { ))] atomic_rmw_cas_3! { atomic_min, [], - "clgr %r1, {val_lo}", - select_op!("l", "%r12", "%r1", "{val_lo}"), - "cgr %r0, {val_hi}", - select_op!("l", "%r13", "%r1", "{val_lo}"), - "locgre %r13, %r12", - select_op!("l", "%r12", "%r0", "{val_hi}"), + "clgr %r1, {val_lo}", // if r1(u) < val_lo(u) { cc = 1 } else if r1(u) > val_lo(u) { cc = 2 } else { cc = 0 } + select_op!("l", "%r12", "%r1", "{val_lo}"), // if cc == 1 { r12 = r1 } else { r12 = val_lo } + "cgr %r0, {val_hi}", // if r0(i) < val_hi(i) { cc = 1 } else if r0(i) > val_hi(i) { cc = 2 } else { cc = 0 } + select_op!("l", "%r13", "%r1", "{val_lo}"), // if cc == 1 { r13 = r1 } else { r13 = val_lo } + "locgre %r13, %r12", // if cc == 0 { r13 = r12 } + select_op!("l", "%r12", "%r0", "{val_hi}"), // if cc == 1 { r12 = r0 } else { r12 = val_hi } } #[cfg(any( target_feature = "load-store-on-cond", @@ -418,13 +466,13 @@ atomic_rmw_cas_3! { ))] atomic_rmw_cas_3! 
{ atomic_umin, [tmp = out(reg) _,], - "clgr %r1, {val_lo}", - select_op!("l", "{tmp}", "%r1", "{val_lo}"), - "clgr %r0, {val_hi}", - select_op!("l", "%r12", "%r0", "{val_hi}"), - select_op!("l", "%r13", "%r1", "{val_lo}"), - "cgr %r0, {val_hi}", - "locgre %r13, {tmp}", + "clgr %r1, {val_lo}", // if r1(u) < val_lo(u) { cc = 1 } else if r1(u) > val_lo(u) { cc = 2 } else { cc = 0 } + select_op!("l", "{tmp}", "%r1", "{val_lo}"), // if cc == 1 { tmp = r1 } else { tmp = val_lo } + "clgr %r0, {val_hi}", // if r0(u) < val_hi(u) { cc = 1 } else if r0(u) > val_hi(u) { cc = 2 } else { cc = 0 } + select_op!("l", "%r12", "%r0", "{val_hi}"), // if cc == 1 { r12 = r0 } else { r12 = val_hi } + select_op!("l", "%r13", "%r1", "{val_lo}"), // if cc == 1 { r13 = r1 } else { r13 = val_lo } + "cgr %r0, {val_hi}", // if r0(i) < val_hi(i) { cc = 1 } else if r0(i) > val_hi(i) { cc = 2 } else { cc = 0 } + "locgre %r13, {tmp}", // if cc == 0 { r13 = tmp } } // We use atomic_update for atomic min/max on pre-z196 because // z10 doesn't seem to have a good way to implement 128-bit min/max. @@ -438,26 +486,26 @@ atomic_rmw_by_atomic_update!(cmp); atomic_rmw_cas_2! { atomic_not, [], - "lcgr %r13, %r1", - "aghi %r13, -1", - "lcgr %r12, %r0", - "aghi %r12, -1", + "lcgr %r13, %r1", // r13 = !r1 + 1 + "aghi %r13, -1", // r13 -= 1 + "lcgr %r12, %r0", // r12 = !r0 + 1 + "aghi %r12, -1", // r12 -= 1 } #[cfg(any(target_feature = "distinct-ops", portable_atomic_target_feature = "distinct-ops"))] atomic_rmw_cas_2! { atomic_neg, [zero = in(reg) 0_u64,], - "slgrk %r13, {zero}, %r1", - "lghi %r12, 0", - "slbgr %r12, %r0", + "slgrk %r13, {zero}, %r1", // r13 = 0 - r1; cc = zero | borrow + "lghi %r12, 0", // r12 = 0 + "slbgr %r12, %r0", // r12 -= r0 + borrow } #[cfg(not(any(target_feature = "distinct-ops", portable_atomic_target_feature = "distinct-ops")))] atomic_rmw_cas_2! 
{ atomic_neg, [], - "lghi %r13, 0", - "slgr %r13, %r1", - "lghi %r12, 0", - "slbgr %r12, %r0", + "lghi %r13, 0", // r13 = 0 + "slgr %r13, %r1", // r13 -= r1; cc = zero | borrow + "lghi %r12, 0", // r12 = 0 + "slbgr %r12, %r0", // r12 -= r0 + borrow } #[inline] diff --git a/src/imp/atomic64/riscv32.rs b/src/imp/atomic64/riscv32.rs index 8e3084f8..c4c7c315 100644 --- a/src/imp/atomic64/riscv32.rs +++ b/src/imp/atomic64/riscv32.rs @@ -1,7 +1,15 @@ // SPDX-License-Identifier: Apache-2.0 OR MIT /* -64-bit atomic implementation on riscv32 using amocas.d (DWCAS). +64-bit atomic implementation on riscv32. + +This architecture provides the following 64-bit atomic instructions: + +- amocas.d: CAS (Zacas extension) + +See "Atomic operation overview by architecture" in atomic-maybe-uninit for a more comprehensive and +detailed description of the atomic and synchronize instructions in this architecture: +https://github.com/taiki-e/atomic-maybe-uninit/blob/HEAD/src/arch/README.md#risc-v Note: On Miri and ThreadSanitizer which do not support inline assembly, we don't use this module and use fallback implementation instead. @@ -169,18 +177,18 @@ unsafe fn atomic_load(src: *mut u64, order: Ordering) -> u64 { unsafe fn atomic_load_zacas(src: *mut u64, order: Ordering) -> u64 { debug_assert!(src as usize % 8 == 0); debug_assert_zacas!(); + let (out_lo, out_hi); // SAFETY: the caller must uphold the safety contract. unsafe { - let (out_lo, out_hi); // LLVM doesn't support `.option arch, +zabha` directive as of LLVM 19 because it is experimental. // So, we currently always using .4byte directive. // macro_rules! 
load { // ($fence:tt, $asm_order:tt) => { // asm!( // start_zacas!(), - // $fence, - // concat!("amocas.d", $asm_order, " a2, a2, 0({src})"), + // $fence, // fence + // concat!("amocas.d", $asm_order, " a2, a2, 0({src})"), // atomic { if *src == a2:a3 { *src = a2:a3 } else { a2:a3 = *src } } // end_zacas!(), // src = in(reg) ptr_reg!(src), // inout("a2") 0_u32 => out_lo, @@ -193,8 +201,8 @@ unsafe fn atomic_load_zacas(src: *mut u64, order: Ordering) -> u64 { macro_rules! load { ($fence:tt, $insn_order:tt) => { asm!( - $fence, - // 4: 2{8,c,a,e}c5362f amocas.d{,.aq,.rl,.aqrl} a2, a2, (a0) + $fence, // fence + // amocas.d{,.aq,.rl,.aqrl} a2, a2, (a0) // atomic { if *a0 == a2:a3 { *a0 = a2:a3 } else { a2:a3 = *a0 } } concat!(".4byte 0x2", $insn_order, "c5362f"), in("a0") ptr_reg!(src), inout("a2") 0_u32 => out_lo, @@ -318,20 +326,20 @@ unsafe fn atomic_compare_exchange_zacas( debug_assert!(dst as usize % 8 == 0); debug_assert_zacas!(); let order = crate::utils::upgrade_success_ordering(success, failure); + let old = U64 { whole: old }; + let new = U64 { whole: new }; + let (prev_lo, prev_hi); // SAFETY: the caller must uphold the safety contract. - let prev = unsafe { - let old = U64 { whole: old }; - let new = U64 { whole: new }; - let (prev_lo, prev_hi); + unsafe { // LLVM doesn't support `.option arch, +zabha` directive as of LLVM 19 because it is experimental. // So, we currently always using .4byte directive. // macro_rules! cmpxchg { // ($fence:tt, $asm_order:tt) => { // asm!( // start_zacas!(), - // $fence, - // concat!("amocas.d", $asm_order, " a4, a2, 0({dst})"), + // $fence, // fence + // concat!("amocas.d", $asm_order, " a4, a2, 0({dst})"), // atomic { if *dst == a4:a5 { *dst = a2:a3 } else { a4:a5 = *dst } } // end_zacas!(), // dst = in(reg) ptr_reg!(dst), // // must be allocated to even/odd register pair @@ -348,8 +356,8 @@ unsafe fn atomic_compare_exchange_zacas( macro_rules! 
cmpxchg { ($fence:tt, $insn_order:tt) => { asm!( - $fence, - // 10: 2{8,c,a,e}c5372f amocas.d{,.aq,.rl,.aqrl} a4, a2, (a0) + $fence, // fence + // amocas.d{,.aq,.rl,.aqrl} a4, a2, (a0) // atomic { if *a0 == a4:a5 { *a0 = a2:a3 } else { a4:a5 = *a0 } } concat!(".4byte 0x2", $insn_order, "c5372f"), in("a0") ptr_reg!(dst), // must be allocated to even/odd register pair @@ -363,23 +371,24 @@ unsafe fn atomic_compare_exchange_zacas( }; } atomic_rmw_amocas_order_insn!(cmpxchg, order, failure = failure); - U64 { pair: Pair { lo: prev_lo, hi: prev_hi } }.whole - }; - (prev, prev == old) + let prev = U64 { pair: Pair { lo: prev_lo, hi: prev_hi } }.whole; + (prev, prev == old.whole) + } } // amocas is always strong. use self::atomic_compare_exchange as atomic_compare_exchange_weak; -// 64-bit atomic load by two 32-bit atomic loads. (see arm_linux.rs for more) +// 64-bit atomic load by two 32-bit atomic loads. #[inline] unsafe fn byte_wise_atomic_load(src: *const u64) -> u64 { + let (out_lo, out_hi); + // SAFETY: the caller must uphold the safety contract. unsafe { - let (out_lo, out_hi); asm!( - "lw {out_lo}, ({src})", - "lw {out_hi}, 4({src})", + "lw {out_lo}, ({src})", // atomic { out_lo = *src } + "lw {out_hi}, 4({src})", // atomic { out_hi = *src.add(4) } src = in(reg) ptr_reg!(src), out_lo = out(reg) out_lo, out_hi = out(reg) out_hi, @@ -389,28 +398,10 @@ unsafe fn byte_wise_atomic_load(src: *const u64) -> u64 { } } -#[inline(always)] -unsafe fn atomic_update_zacas(dst: *mut u64, order: Ordering, mut f: F) -> u64 -where - F: FnMut(u64) -> u64, -{ - // SAFETY: the caller must uphold the safety contract. - unsafe { - let mut prev = byte_wise_atomic_load(dst); - loop { - let next = f(prev); - match atomic_compare_exchange_weak(dst, prev, next, order, Ordering::Relaxed) { - Ok(x) => return x, - Err(x) => prev = x, - } - } - } -} - macro_rules! select_atomic_rmw { ( unsafe fn $name:ident(dst: *mut u64 $(, $($arg:tt)*)?) $(-> $ret_ty:ty)? 
{ - $($zacas_fn_body:tt)* + |$zacas_fn_binding:ident| $($zacas_fn_body:tt)* } zacas = $zacas_fn:ident; non_seqcst_fallback = $non_seqcst_fallback_fn:ident; @@ -419,7 +410,26 @@ macro_rules! select_atomic_rmw { #[inline] unsafe fn $zacas_fn(dst: *mut u64 $(, $($arg)*)?, order: Ordering) $(-> $ret_ty)? { // SAFETY: the caller must uphold the safety contract. - unsafe { atomic_update_zacas(dst, order, $($zacas_fn_body)*) } + unsafe { + // This is not single-copy atomic reads, but this is ok because subsequent + // CAS will check for consistency. + // + // Note that the C++20 memory model does not allow mixed-sized atomic access, + // so we must use inline assembly to implement byte_wise_atomic_load. + // (i.e., byte-wise atomic based on the standard library's atomic types + // cannot be used here). + let mut prev = byte_wise_atomic_load(dst); + loop { + let next = { + let $zacas_fn_binding = prev; + $($zacas_fn_body)* + }; + match atomic_compare_exchange_weak(dst, prev, next, order, Ordering::Relaxed) { + Ok(x) => return x, + Err(x) => prev = x, + } + } + } } // If zacas is available at compile-time, we can always use zacas_fn. #[cfg(any( @@ -503,7 +513,7 @@ macro_rules! select_atomic_rmw { select_atomic_rmw! { unsafe fn atomic_swap(dst: *mut u64, val: u64) -> u64 { - |_| val + |_x| val } zacas = atomic_swap_zacas; non_seqcst_fallback = atomic_swap_non_seqcst; @@ -605,7 +615,7 @@ select_atomic_rmw! { } select_atomic_rmw! { unsafe fn atomic_neg(dst: *mut u64) -> u64 { - u64::wrapping_neg + |x| x.wrapping_neg() } zacas = atomic_neg_zacas; non_seqcst_fallback = atomic_neg_non_seqcst; diff --git a/src/imp/avr.rs b/src/imp/avr.rs new file mode 100644 index 00000000..36f530e4 --- /dev/null +++ b/src/imp/avr.rs @@ -0,0 +1,81 @@ +// SPDX-License-Identifier: Apache-2.0 OR MIT + +/* +Atomic load/store implementation on AVR. + +Operations not supported here are provided by disabling interrupts. +See also src/imp/interrupt/avr.rs. 
+ +See "Atomic operation overview by architecture" in atomic-maybe-uninit for a more comprehensive and +detailed description of the atomic and synchronize instructions in this architecture: +https://github.com/taiki-e/atomic-maybe-uninit/blob/HEAD/src/arch/README.md#avr + +Note: Ordering is always SeqCst. + +Refs: +- AVR® Instruction Set Manual, Rev. DS40002198B + https://ww1.microchip.com/downloads/en/DeviceDoc/AVR-InstructionSet-Manual-DS40002198.pdf +- atomic-maybe-uninit + https://github.com/taiki-e/atomic-maybe-uninit + +Generated asm: +- avr https://godbolt.org/z/j49rYbj4d +*/ + +use core::{arch::asm, cell::UnsafeCell, sync::atomic::Ordering}; + +macro_rules! atomic8 { + ($atomic_type:ident, $value_type:ty) => { + #[repr(transparent)] + pub(crate) struct $atomic_type { + v: UnsafeCell<$value_type>, + } + + // Send is implicitly implemented for atomic integers, but not for atomic pointers. + // SAFETY: any data races are prevented by atomic operations. + unsafe impl Send for $atomic_type {} + // SAFETY: any data races are prevented by atomic operations. + unsafe impl Sync for $atomic_type {} + + impl $atomic_type { + #[inline] + #[cfg_attr(all(debug_assertions, not(portable_atomic_no_track_caller)), track_caller)] + pub(crate) fn load(&self, order: Ordering) -> $value_type { + crate::utils::assert_load_ordering(order); + let src = self.v.get(); + // SAFETY: any data races are prevented by atomic intrinsics and the raw + // pointer passed in is valid because we got it from a reference. 
+ unsafe { + let out; + asm!( + "ld {out}, Z", // atomic { out = *Z } + in("Z") src, + out = out(reg) out, + options(nostack, preserves_flags), + ); + out + } + } + + #[inline] + #[cfg_attr(all(debug_assertions, not(portable_atomic_no_track_caller)), track_caller)] + pub(crate) fn store(&self, val: $value_type, order: Ordering) { + crate::utils::assert_store_ordering(order); + let dst = self.v.get(); + // SAFETY: any data races are prevented by atomic intrinsics and the raw + // pointer passed in is valid because we got it from a reference. + unsafe { + asm!( + "st Z, {val}", // atomic { *Z = val } + in("Z") dst, + val = in(reg) val, + options(nostack, preserves_flags), + ); + } + } + } + }; +} + +atomic8!(AtomicI8, i8); +atomic8!(AtomicU8, u8); diff --git a/src/imp/interrupt/README.md b/src/imp/interrupt/README.md index bf471795..ccc74395 100644 --- a/src/imp/interrupt/README.md +++ b/src/imp/interrupt/README.md @@ -25,6 +25,7 @@ For some targets, the implementation can be changed by explicitly enabling featu Some operations don't require disabling interrupts: - On architectures except for AVR: loads and stores with pointer size or smaller +- On AVR: 8-bit loads and stores - On MSP430 additionally: {8,16}-bit `add,sub,and,or,xor,not` - On RISC-V with the `zaamo` target feature (or `portable_atomic_target_feature="zaamo"` cfg or `force-amo` feature or `portable_atomic_force_amo` cfg) additionally: 32-bit(RV32)/{32,64}-bit(RV64) `swap,fetch_{add,sub,and,or,xor,not,max,min},add,sub,and,or,xor,not`, {8,16}-bit `fetch_{and,or,xor,not},and,or,xor,not`[^1], and all operations of `AtomicBool` diff --git a/src/imp/interrupt/avr.rs b/src/imp/interrupt/avr.rs index 6d479a68..e39f860d 100644 --- a/src/imp/interrupt/avr.rs +++ b/src/imp/interrupt/avr.rs @@ -3,8 +3,11 @@ /* Adapted from https://github.com/Rahix/avr-device. +See also src/imp/avr.rs. 
+ Refs: -- AVR Instruction Set Manual https://ww1.microchip.com/downloads/en/DeviceDoc/AVR-InstructionSet-Manual-DS40002198.pdf +- AVR® Instruction Set Manual, Rev. DS40002198B + https://ww1.microchip.com/downloads/en/DeviceDoc/AVR-InstructionSet-Manual-DS40002198.pdf Generated asm: - avr https://godbolt.org/z/W5jxGsToc @@ -13,6 +16,9 @@ Generated asm: #[cfg(not(portable_atomic_no_asm))] use core::arch::asm; +#[cfg(not(portable_atomic_no_asm))] +pub(super) use super::super::avr as atomic; + pub(super) type State = u8; /// Disables interrupts and returns the previous interrupt state. @@ -27,9 +33,9 @@ pub(super) fn disable() -> State { // Refs: https://ww1.microchip.com/downloads/en/DeviceDoc/AVR-InstructionSet-Manual-DS40002198.pdf#page=58 #[cfg(not(portable_atomic_no_asm))] asm!( - "in {0}, 0x3F", - "cli", - out(reg) sreg, + "in {sreg}, 0x3F", // sreg = SREG + "cli", // SREG.I = 0 + sreg = out(reg) sreg, options(nostack), ); #[cfg(portable_atomic_no_asm)] @@ -47,7 +53,7 @@ pub(super) fn disable() -> State { /// /// The state must be the one retrieved by the previous `disable`. #[inline(always)] -pub(super) unsafe fn restore(sreg: State) { +pub(super) unsafe fn restore(prev_sreg: State) { // SAFETY: the caller must guarantee that the state was retrieved by the previous `disable`, unsafe { // This clobbers the entire status register. See msp430.rs to safety on this. @@ -55,8 +61,12 @@ pub(super) unsafe fn restore(sreg: State) { // Do not use `nomem` and `readonly` because prevent preceding memory accesses from being reordered after interrupts are enabled. // Do not use `preserves_flags` because OUT modifies the status register (SREG). 
#[cfg(not(portable_atomic_no_asm))] - asm!("out 0x3F, {0}", in(reg) sreg, options(nostack)); + asm!( + "out 0x3F, {prev_sreg}", // SREG = prev_sreg + prev_sreg = in(reg) prev_sreg, + options(nostack), + ); #[cfg(portable_atomic_no_asm)] - llvm_asm!("out 0x3F, $0" :: "r"(sreg) : "memory" : "volatile"); + llvm_asm!("out 0x3F, $0" :: "r"(prev_sreg) : "memory" : "volatile"); } } diff --git a/src/imp/interrupt/mod.rs b/src/imp/interrupt/mod.rs index 94fc4b76..d7713f7f 100644 --- a/src/imp/interrupt/mod.rs +++ b/src/imp/interrupt/mod.rs @@ -41,7 +41,10 @@ See also README.md of this directory. // CAS together with atomic load/store. The load/store will not be // called while interrupts are disabled, and since the load/store is // atomic, it is not affected by interrupts even if interrupts are enabled. -#[cfg(not(any(target_arch = "avr", feature = "critical-section")))] +#[cfg(not(any( + all(target_arch = "avr", portable_atomic_no_asm), + feature = "critical-section", +)))] use self::arch::atomic; #[cfg(not(feature = "critical-section"))] @@ -300,11 +303,17 @@ macro_rules! atomic_int { #[cfg_attr(all(debug_assertions, not(portable_atomic_no_track_caller)), track_caller)] pub(crate) fn load(&self, order: Ordering) -> $int_type { crate::utils::assert_load_ordering(order); - #[cfg(not(any(target_arch = "avr", feature = "critical-section")))] + #[cfg(not(any( + all(target_arch = "avr", portable_atomic_no_asm), + feature = "critical-section", + )))] { self.as_native().load(order) } - #[cfg(any(target_arch = "avr", feature = "critical-section"))] + #[cfg(any( + all(target_arch = "avr", portable_atomic_no_asm), + feature = "critical-section", + ))] // SAFETY: any data races are prevented by disabling interrupts (see // module-level comments) and the raw pointer is valid because we got it // from a reference. @@ -315,18 +324,27 @@ macro_rules! 
atomic_int { #[cfg_attr(all(debug_assertions, not(portable_atomic_no_track_caller)), track_caller)] pub(crate) fn store(&self, val: $int_type, order: Ordering) { crate::utils::assert_store_ordering(order); - #[cfg(not(any(target_arch = "avr", feature = "critical-section")))] + #[cfg(not(any( + all(target_arch = "avr", portable_atomic_no_asm), + feature = "critical-section", + )))] { self.as_native().store(val, order); } - #[cfg(any(target_arch = "avr", feature = "critical-section"))] + #[cfg(any( + all(target_arch = "avr", portable_atomic_no_asm), + feature = "critical-section", + ))] // SAFETY: any data races are prevented by disabling interrupts (see // module-level comments) and the raw pointer is valid because we got it // from a reference. with(|| unsafe { self.v.get().write(val) }); } - #[cfg(not(any(target_arch = "avr", feature = "critical-section")))] + #[cfg(not(any( + all(target_arch = "avr", portable_atomic_no_asm), + feature = "critical-section", + )))] #[inline(always)] fn as_native(&self) -> &atomic::$atomic_type { // SAFETY: $atomic_type and atomic::$atomic_type have the same layout and @@ -841,9 +859,15 @@ macro_rules! 
atomic_int { } #[cfg(target_pointer_width = "16")] +#[cfg(not(target_arch = "avr"))] atomic_int!(load_store_atomic, AtomicIsize, isize, 2); #[cfg(target_pointer_width = "16")] +#[cfg(not(target_arch = "avr"))] atomic_int!(load_store_atomic, AtomicUsize, usize, 2); +#[cfg(target_arch = "avr")] +atomic_int!(all_critical_session, AtomicIsize, isize, 2); +#[cfg(target_arch = "avr")] +atomic_int!(all_critical_session, AtomicUsize, usize, 2); #[cfg(target_pointer_width = "32")] atomic_int!(load_store_atomic, AtomicIsize, isize, 4); #[cfg(target_pointer_width = "32")] @@ -857,10 +881,22 @@ atomic_int!(load_store_atomic, AtomicIsize, isize, 16); #[cfg(target_pointer_width = "128")] atomic_int!(load_store_atomic, AtomicUsize, usize, 16); +#[cfg(not(all(target_arch = "avr", portable_atomic_no_asm)))] atomic_int!(load_store_atomic[sub_word], AtomicI8, i8, 1); +#[cfg(not(all(target_arch = "avr", portable_atomic_no_asm)))] atomic_int!(load_store_atomic[sub_word], AtomicU8, u8, 1); +#[cfg(all(target_arch = "avr", portable_atomic_no_asm))] +atomic_int!(all_critical_session, AtomicI8, i8, 1); +#[cfg(all(target_arch = "avr", portable_atomic_no_asm))] +atomic_int!(all_critical_session, AtomicU8, u8, 1); +#[cfg(not(target_arch = "avr"))] atomic_int!(load_store_atomic[sub_word], AtomicI16, i16, 2); +#[cfg(not(target_arch = "avr"))] atomic_int!(load_store_atomic[sub_word], AtomicU16, u16, 2); +#[cfg(target_arch = "avr")] +atomic_int!(all_critical_session, AtomicI16, i16, 2); +#[cfg(target_arch = "avr")] +atomic_int!(all_critical_session, AtomicU16, u16, 2); #[cfg(not(target_pointer_width = "16"))] atomic_int!(load_store_atomic, AtomicI32, i32, 4); diff --git a/src/imp/interrupt/msp430.rs b/src/imp/interrupt/msp430.rs index a60b8515..a7792f7a 100644 --- a/src/imp/interrupt/msp430.rs +++ b/src/imp/interrupt/msp430.rs @@ -5,7 +5,9 @@ Adapted from https://github.com/rust-embedded/msp430. See also src/imp/msp430.rs. 
-Refs: https://www.ti.com/lit/ug/slau208q/slau208q.pdf +Refs: +- MSP430x5xx and MSP430x6xx Family User's Guide, Rev. Q + https://www.ti.com/lit/ug/slau208q/slau208q.pdf Generated asm: - msp430 https://godbolt.org/z/fc6h89xac @@ -27,11 +29,12 @@ pub(super) fn disable() -> State { unsafe { // Do not use `nomem` and `readonly` because prevent subsequent memory accesses from being reordered before interrupts are disabled. // Do not use `preserves_flags` because DINT modifies the GIE (global interrupt enable) bit of the status register. + // See "NOTE: Enable and Disable Interrupt" of User's Guide for NOP: https://www.ti.com/lit/ug/slau208q/slau208q.pdf#page=60 #[cfg(not(portable_atomic_no_asm))] asm!( - "mov r2, {0}", - "dint {{ nop", - out(reg) sr, + "mov r2, {sr}", // sr = SR + "dint {{ nop", // SR.GIE = 0 + sr = out(reg) sr, options(nostack), ); #[cfg(portable_atomic_no_asm)] @@ -49,7 +52,7 @@ pub(super) fn disable() -> State { /// /// The state must be the one retrieved by the previous `disable`. #[inline(always)] -pub(super) unsafe fn restore(sr: State) { +pub(super) unsafe fn restore(prev_sr: State) { // SAFETY: the caller must guarantee that the state was retrieved by the previous `disable`, unsafe { // This clobbers the entire status register, but we never explicitly modify @@ -61,9 +64,14 @@ pub(super) unsafe fn restore(sr: State) { // // Do not use `nomem` and `readonly` because prevent preceding memory accesses from being reordered after interrupts are enabled. // Do not use `preserves_flags` because MOV modifies the status register. 
+ // See "NOTE: Enable and Disable Interrupt" of User's Guide for NOP: https://www.ti.com/lit/ug/slau208q/slau208q.pdf#page=60 #[cfg(not(portable_atomic_no_asm))] - asm!("nop {{ mov {0}, r2 {{ nop", in(reg) sr, options(nostack)); + asm!( + "nop {{ mov {prev_sr}, r2 {{ nop", // SR = prev_sr + prev_sr = in(reg) prev_sr, + options(nostack), + ); #[cfg(portable_atomic_no_asm)] - llvm_asm!("nop { mov $0, r2 { nop" :: "r"(sr) : "memory" : "volatile"); + llvm_asm!("nop { mov $0, r2 { nop" :: "r"(prev_sr) : "memory" : "volatile"); } } diff --git a/src/imp/interrupt/riscv.rs b/src/imp/interrupt/riscv.rs index 64c8e25a..affe417a 100644 --- a/src/imp/interrupt/riscv.rs +++ b/src/imp/interrupt/riscv.rs @@ -2,8 +2,11 @@ /* Refs: -- https://github.com/riscv/riscv-isa-manual/blob/riscv-isa-release-8b9dc50-2024-08-30/src/machine.adoc#machine-status-mstatus-and-mstatush-registers -- https://github.com/riscv/riscv-isa-manual/blob/riscv-isa-release-8b9dc50-2024-08-30/src/supervisor.adoc#supervisor-status-sstatus-register +- RISC-V Instruction Set Manual + Machine Status (mstatus and mstatush) Registers + https://github.com/riscv/riscv-isa-manual/blob/riscv-isa-release-8b9dc50-2024-08-30/src/machine.adoc#machine-status-mstatus-and-mstatush-registers + Supervisor Status (sstatus) Register + https://github.com/riscv/riscv-isa-manual/blob/riscv-isa-release-8b9dc50-2024-08-30/src/supervisor.adoc#supervisor-status-sstatus-register See also src/imp/riscv.rs. @@ -62,7 +65,11 @@ pub(super) fn disable() -> State { // (see module-level comments of interrupt/mod.rs on the safety of using privileged instructions) unsafe { // Do not use `nomem` and `readonly` because prevent subsequent memory accesses from being reordered before interrupts are disabled. 
- asm!(concat!("csrrci {0}, ", status!(), ", ", mask!()), out(reg) status, options(nostack, preserves_flags)); + asm!( + concat!("csrrci {status}, ", status!(), ", ", mask!()), // atomic { status = status!(); status!() &= !mask!() } + status = out(reg) status, + options(nostack, preserves_flags), + ); } status } @@ -79,7 +86,10 @@ pub(super) unsafe fn restore(status: State) { // and we've checked that interrupts were enabled before disabling interrupts. unsafe { // Do not use `nomem` and `readonly` because prevent preceding memory accesses from being reordered after interrupts are enabled. - asm!(concat!("csrsi ", status!(), ", ", mask!()), options(nostack, preserves_flags)); + asm!( + concat!("csrsi ", status!(), ", ", mask!()), // atomic { status!() |= mask!() } + options(nostack, preserves_flags), + ); } } } diff --git a/src/imp/mod.rs b/src/imp/mod.rs index 1f3d229d..dcc642ab 100644 --- a/src/imp/mod.rs +++ b/src/imp/mod.rs @@ -37,6 +37,12 @@ )] mod core_atomic; +// AVR +#[cfg(target_arch = "avr")] +#[cfg(not(portable_atomic_no_asm))] +#[cfg(not(feature = "critical-section"))] +mod avr; + // MSP430 #[cfg(target_arch = "msp430")] pub(crate) mod msp430; diff --git a/src/imp/msp430.rs b/src/imp/msp430.rs index 92f3f28c..2bc538b8 100644 --- a/src/imp/msp430.rs +++ b/src/imp/msp430.rs @@ -1,24 +1,32 @@ // SPDX-License-Identifier: Apache-2.0 OR MIT /* -Atomic load/store implementation on MSP430. +Atomic implementation on MSP430. Adapted from https://github.com/pftbest/msp430-atomic. Operations not supported here are provided by disabling interrupts. See also src/imp/interrupt/msp430.rs. +See "Atomic operation overview by architecture" in atomic-maybe-uninit for a more comprehensive and +detailed description of the atomic and synchronize instructions in this architecture: +https://github.com/taiki-e/atomic-maybe-uninit/blob/HEAD/src/arch/README.md#msp430 + Note: Ordering is always SeqCst. 
-Refs: https://www.ti.com/lit/ug/slau208q/slau208q.pdf +Refs: +- MSP430x5xx and MSP430x6xx Family User's Guide, Rev. Q + https://www.ti.com/lit/ug/slau208q/slau208q.pdf +- atomic-maybe-uninit + https://github.com/taiki-e/atomic-maybe-uninit Generated asm: -- msp430 https://godbolt.org/z/jaodMM4KM +- msp430 https://godbolt.org/z/MGrd4jPoq */ #[cfg(not(portable_atomic_no_asm))] use core::arch::asm; -#[cfg(any(test, not(feature = "critical-section")))] +#[cfg(not(feature = "critical-section"))] use core::cell::UnsafeCell; use core::sync::atomic::Ordering; @@ -60,37 +68,23 @@ pub fn compiler_fence(order: Ordering) { } macro_rules! atomic { - (load_store, $([$($generics:tt)*])? $atomic_type:ident, $value_type:ty, $asm_suffix:tt) => { - #[cfg(any(test, not(feature = "critical-section")))] + (load_store, $([$($generics:tt)*])? $atomic_type:ident, $value_type:ty, $size:tt) => { + #[cfg(not(feature = "critical-section"))] #[repr(transparent)] pub(crate) struct $atomic_type $(<$($generics)*>)? { v: UnsafeCell<$value_type>, } - #[cfg(any(test, not(feature = "critical-section")))] + #[cfg(not(feature = "critical-section"))] // Send is implicitly implemented for atomic integers, but not for atomic pointers. // SAFETY: any data races are prevented by atomic operations. unsafe impl $(<$($generics)*>)? Send for $atomic_type $(<$($generics)*>)? {} - #[cfg(any(test, not(feature = "critical-section")))] + #[cfg(not(feature = "critical-section"))] // SAFETY: any data races are prevented by atomic operations. unsafe impl $(<$($generics)*>)? Sync for $atomic_type $(<$($generics)*>)? {} - #[cfg(any(test, not(feature = "critical-section")))] + #[cfg(not(feature = "critical-section"))] impl $(<$($generics)*>)? $atomic_type $(<$($generics)*>)? 
{ - #[cfg(test)] - #[inline] - pub(crate) const fn new(v: $value_type) -> Self { - Self { v: UnsafeCell::new(v) } - } - - #[cfg(test)] - #[inline] - pub(crate) fn is_lock_free() -> bool { - Self::IS_ALWAYS_LOCK_FREE - } - #[cfg(test)] - pub(crate) const IS_ALWAYS_LOCK_FREE: bool = true; - #[inline] #[cfg_attr(all(debug_assertions, not(portable_atomic_no_track_caller)), track_caller)] pub(crate) fn load(&self, order: Ordering) -> $value_type { @@ -102,14 +96,14 @@ macro_rules! atomic { let out; #[cfg(not(portable_atomic_no_asm))] asm!( - concat!("mov", $asm_suffix, " @{src}, {out}"), + concat!("mov.", $size, " @{src}, {out}"), // atomic { out = *src } src = in(reg) src, out = lateout(reg) out, options(nostack, preserves_flags), ); #[cfg(portable_atomic_no_asm)] llvm_asm!( - concat!("mov", $asm_suffix, " $1, $0") + concat!("mov.", $size, " $1, $0") : "=r"(out) : "*m"(src) : "memory" : "volatile" ); out @@ -126,23 +120,23 @@ macro_rules! atomic { unsafe { #[cfg(not(portable_atomic_no_asm))] asm!( - concat!("mov", $asm_suffix, " {val}, 0({dst})"), + concat!("mov.", $size, " {val}, 0({dst})"), // atomic { *dst = val } dst = in(reg) dst, val = in(reg) val, options(nostack, preserves_flags), ); #[cfg(portable_atomic_no_asm)] llvm_asm!( - concat!("mov", $asm_suffix, " $1, $0") + concat!("mov.", $size, " $1, $0") :: "*m"(dst), "ir"(val) : "memory" : "volatile" ); } } } }; - ($([$($generics:tt)*])? $atomic_type:ident, $value_type:ty, $asm_suffix:tt) => { - atomic!(load_store, $([$($generics)*])? $atomic_type, $value_type, $asm_suffix); - #[cfg(any(test, not(feature = "critical-section")))] + ($([$($generics:tt)*])? $atomic_type:ident, $value_type:ty, $size:tt) => { + atomic!(load_store, $([$($generics)*])? $atomic_type, $value_type, $size); + #[cfg(not(feature = "critical-section"))] impl $(<$($generics)*>)? $atomic_type $(<$($generics)*>)? { #[inline] pub(crate) fn add(&self, val: $value_type, _order: Ordering) { @@ -152,7 +146,7 @@ macro_rules! 
atomic { unsafe { #[cfg(not(portable_atomic_no_asm))] asm!( - concat!("add", $asm_suffix, " {val}, 0({dst})"), + concat!("add.", $size, " {val}, 0({dst})"), // atomic { *dst += val } dst = in(reg) dst, val = in(reg) val, // Do not use `preserves_flags` because ADD modifies the V, N, Z, and C bits of the status register. @@ -160,7 +154,7 @@ macro_rules! atomic { ); #[cfg(portable_atomic_no_asm)] llvm_asm!( - concat!("add", $asm_suffix, " $1, $0") + concat!("add.", $size, " $1, $0") :: "*m"(dst), "ir"(val) : "memory" : "volatile" ); } @@ -174,7 +168,7 @@ macro_rules! atomic { unsafe { #[cfg(not(portable_atomic_no_asm))] asm!( - concat!("sub", $asm_suffix, " {val}, 0({dst})"), + concat!("sub.", $size, " {val}, 0({dst})"), // atomic { *dst -= val } dst = in(reg) dst, val = in(reg) val, // Do not use `preserves_flags` because SUB modifies the V, N, Z, and C bits of the status register. @@ -182,7 +176,7 @@ macro_rules! atomic { ); #[cfg(portable_atomic_no_asm)] llvm_asm!( - concat!("sub", $asm_suffix, " $1, $0") + concat!("sub.", $size, " $1, $0") :: "*m"(dst), "ir"(val) : "memory" : "volatile" ); } @@ -196,7 +190,7 @@ macro_rules! atomic { unsafe { #[cfg(not(portable_atomic_no_asm))] asm!( - concat!("and", $asm_suffix, " {val}, 0({dst})"), + concat!("and.", $size, " {val}, 0({dst})"), // atomic { *dst &= val } dst = in(reg) dst, val = in(reg) val, // Do not use `preserves_flags` because AND modifies the V, N, Z, and C bits of the status register. @@ -204,7 +198,7 @@ macro_rules! atomic { ); #[cfg(portable_atomic_no_asm)] llvm_asm!( - concat!("and", $asm_suffix, " $1, $0") + concat!("and.", $size, " $1, $0") :: "*m"(dst), "ir"(val) : "memory" : "volatile" ); } @@ -218,14 +212,14 @@ macro_rules! 
atomic { unsafe { #[cfg(not(portable_atomic_no_asm))] asm!( - concat!("bis", $asm_suffix, " {val}, 0({dst})"), + concat!("bis.", $size, " {val}, 0({dst})"), // atomic { *dst |= val } dst = in(reg) dst, val = in(reg) val, options(nostack, preserves_flags), ); #[cfg(portable_atomic_no_asm)] llvm_asm!( - concat!("bis", $asm_suffix, " $1, $0") + concat!("bis.", $size, " $1, $0") :: "*m"(dst), "ir"(val) : "memory" : "volatile" ); } @@ -239,7 +233,7 @@ macro_rules! atomic { unsafe { #[cfg(not(portable_atomic_no_asm))] asm!( - concat!("xor", $asm_suffix, " {val}, 0({dst})"), + concat!("xor.", $size, " {val}, 0({dst})"), // atomic { *dst ^= val } dst = in(reg) dst, val = in(reg) val, // Do not use `preserves_flags` because XOR modifies the V, N, Z, and C bits of the status register. @@ -247,7 +241,7 @@ macro_rules! atomic { ); #[cfg(portable_atomic_no_asm)] llvm_asm!( - concat!("xor", $asm_suffix, " $1, $0") + concat!("xor.", $size, " $1, $0") :: "*m"(dst), "ir"(val) : "memory" : "volatile" ); } @@ -261,26 +255,26 @@ macro_rules! atomic { unsafe { #[cfg(not(portable_atomic_no_asm))] asm!( - concat!("inv", $asm_suffix, " 0({dst})"), + concat!("inv.", $size, " 0({dst})"), // atomic { *dst = !*dst } dst = in(reg) dst, // Do not use `preserves_flags` because INV modifies the V, N, Z, and C bits of the status register. 
options(nostack), ); #[cfg(portable_atomic_no_asm)] llvm_asm!( - concat!("inv", $asm_suffix, " $0") + concat!("inv.", $size, " $0") :: "*m"(dst) : "memory" : "volatile" ); } } } - } + }; } -atomic!(AtomicI8, i8, ".b"); -atomic!(AtomicU8, u8, ".b"); -atomic!(AtomicI16, i16, ".w"); -atomic!(AtomicU16, u16, ".w"); -atomic!(AtomicIsize, isize, ".w"); -atomic!(AtomicUsize, usize, ".w"); -atomic!(load_store, [T] AtomicPtr, *mut T, ".w"); +atomic!(AtomicI8, i8, "b"); +atomic!(AtomicU8, u8, "b"); +atomic!(AtomicI16, i16, "w"); +atomic!(AtomicU16, u16, "w"); +atomic!(AtomicIsize, isize, "w"); +atomic!(AtomicUsize, usize, "w"); +atomic!(load_store, [T] AtomicPtr, *mut T, "w"); diff --git a/src/imp/riscv.rs b/src/imp/riscv.rs index 178494bf..aba0ac9a 100644 --- a/src/imp/riscv.rs +++ b/src/imp/riscv.rs @@ -6,7 +6,11 @@ Atomic load/store implementation on RISC-V. This is for RISC-V targets without atomic CAS. (rustc doesn't provide atomics at all on such targets. https://github.com/rust-lang/rust/pull/114499) -Also, optionally provides RMW implementation when force-amo or Zaamo target feature is enabled. +Also, optionally provides RMW implementation when Zaamo extension or force-amo feature is enabled. 
+ +See "Atomic operation overview by architecture" in atomic-maybe-uninit for a more comprehensive and +detailed description of the atomic and synchronize instructions in this architecture: +https://github.com/taiki-e/atomic-maybe-uninit/blob/HEAD/src/arch/README.md#risc-v Refs: - RISC-V Instruction Set Manual @@ -16,7 +20,8 @@ Refs: https://github.com/riscv/riscv-isa-manual/blob/riscv-isa-release-8b9dc50-2024-08-30/src/zabha.adoc - RISC-V Atomics ABI Specification https://github.com/riscv-non-isa/riscv-elf-psabi-doc/blob/draft-20240829-13bfa9f54634cb60d86b9b333e109f077805b4b3/riscv-atomic.adoc -- atomic-maybe-uninit https://github.com/taiki-e/atomic-maybe-uninit +- atomic-maybe-uninit + https://github.com/taiki-e/atomic-maybe-uninit Generated asm: - riscv64gc https://godbolt.org/z/Ws933n9jE @@ -86,7 +91,7 @@ macro_rules! atomic_rmw_amo_ext { portable_atomic_target_feature = "zaamo", ))] macro_rules! atomic_rmw_amo { - ($op:ident, $dst:ident, $val:ident, $order:ident, $asm_suffix:tt) => {{ + ($op:ident, $dst:ident, $val:ident, $order:ident, $size:tt) => {{ let out; macro_rules! op { ($asm_order:tt) => { @@ -97,12 +102,12 @@ macro_rules! atomic_rmw_amo { asm!( ".option push", // https://github.com/riscv-non-isa/riscv-asm-manual/blob/ad0de8c004e29c9a7ac33cfd054f4d4f9392f2fb/src/asm-manual.adoc#arch - // LLVM supports `.option arch` directive on LLVM 17+, so use .insn directive on old LLVM. + // LLVM supports `.option arch` directive on LLVM 17+. // https://github.com/llvm/llvm-project/commit/9e8ed3403c191ab9c4903e8eeb8f732ff8a43cb4 // Note that `.insn ` directive requires LLVM 19. 
// https://github.com/llvm/llvm-project/commit/2a086dce691e3cc34a2fc27f4fb255bb2cbbfac9 - concat!(".option arch, ", atomic_rmw_amo_ext!($asm_suffix)), - concat!("amo", stringify!($op), ".", $asm_suffix, $asm_order, " {out}, {val}, 0({dst})"), + concat!(".option arch, ", atomic_rmw_amo_ext!($size)), + concat!("amo", stringify!($op), ".", $size, $asm_order, " {out}, {val}, 0({dst})"), // atomic { _x = *dst; *dst = op(_x, val); out = _x } ".option pop", dst = in(reg) ptr_reg!($dst), val = in(reg) $val, @@ -136,7 +141,7 @@ fn sllw(val: u32, shift: u32) -> u32 { unsafe { let out; asm!( - concat!("sll", w!(), " {out}, {val}, {shift}"), + concat!("sll", w!(), " {out}, {val}, {shift}"), // out = val << shift & 31 out = lateout(reg) out, val = in(reg) val, shift = in(reg) shift, @@ -161,7 +166,7 @@ macro_rules! srlw { let shift: u32 = $shift; let out; asm!( - concat!("srl", w!(), " {out}, {val}, {shift}"), + concat!("srl", w!(), " {out}, {val}, {shift}"), // out = val >> shift & 31 out = lateout(reg) out, val = in(reg) val, shift = in(reg) shift, @@ -173,7 +178,7 @@ macro_rules! srlw { } macro_rules! atomic_load_store { - ($([$($generics:tt)*])? $atomic_type:ident, $value_type:ty, $asm_suffix:tt) => { + ($([$($generics:tt)*])? $atomic_type:ident, $value_type:ty, $size:tt) => { #[repr(transparent)] pub(crate) struct $atomic_type $(<$($generics)*>)? { v: UnsafeCell<$value_type>, @@ -216,9 +221,9 @@ macro_rules! atomic_load_store { macro_rules! atomic_load { ($acquire:tt, $release:tt) => { asm!( - $release, - concat!("l", $asm_suffix, " {out}, 0({src})"), - $acquire, + $release, // fence + concat!("l", $size, " {out}, 0({src})"), // atomic { out = *src } + $acquire, // fence src = in(reg) ptr_reg!(src), out = lateout(reg) out, options(nostack, preserves_flags), @@ -246,9 +251,9 @@ macro_rules! atomic_load_store { macro_rules! 
atomic_store { ($acquire:tt, $release:tt) => { asm!( - $release, - concat!("s", $asm_suffix, " {val}, 0({dst})"), - $acquire, + $release, // fence + concat!("s", $size, " {val}, 0({dst})"), // atomic { *dst = val } + $acquire, // fence dst = in(reg) ptr_reg!(dst), val = in(reg) val, options(nostack, preserves_flags), @@ -269,8 +274,8 @@ macro_rules! atomic_load_store { } macro_rules! atomic_ptr { - ($([$($generics:tt)*])? $atomic_type:ident, $value_type:ty, $asm_suffix:tt) => { - atomic_load_store!($([$($generics)*])? $atomic_type, $value_type, $asm_suffix); + ($([$($generics:tt)*])? $atomic_type:ident, $value_type:ty, $size:tt) => { + atomic_load_store!($([$($generics)*])? $atomic_type, $value_type, $size); #[cfg(any( test, portable_atomic_force_amo, @@ -283,15 +288,15 @@ macro_rules! atomic_ptr { let dst = self.v.get(); // SAFETY: any data races are prevented by atomic intrinsics and the raw // pointer passed in is valid because we got it from a reference. - unsafe { atomic_rmw_amo!(swap, dst, val, order, $asm_suffix) } + unsafe { atomic_rmw_amo!(swap, dst, val, order, $size) } } } }; } macro_rules! atomic { - ($atomic_type:ident, $value_type:ty, $asm_suffix:tt, $max:tt, $min:tt) => { - atomic_load_store!($atomic_type, $value_type, $asm_suffix); + ($atomic_type:ident, $value_type:ty, $size:tt, $max:tt, $min:tt) => { + atomic_load_store!($atomic_type, $value_type, $size); #[cfg(any( test, portable_atomic_force_amo, @@ -321,7 +326,7 @@ macro_rules! atomic { let dst = self.v.get(); // SAFETY: any data races are prevented by atomic intrinsics and the raw // pointer passed in is valid because we got it from a reference. - unsafe { atomic_rmw_amo!(swap, dst, val, order, $asm_suffix) } + unsafe { atomic_rmw_amo!(swap, dst, val, order, $size) } } #[inline] @@ -329,7 +334,7 @@ macro_rules! atomic { let dst = self.v.get(); // SAFETY: any data races are prevented by atomic intrinsics and the raw // pointer passed in is valid because we got it from a reference. 
- unsafe { atomic_rmw_amo!(add, dst, val, order, $asm_suffix) } + unsafe { atomic_rmw_amo!(add, dst, val, order, $size) } } #[inline] @@ -342,7 +347,7 @@ macro_rules! atomic { let dst = self.v.get(); // SAFETY: any data races are prevented by atomic intrinsics and the raw // pointer passed in is valid because we got it from a reference. - unsafe { atomic_rmw_amo!(and, dst, val, order, $asm_suffix) } + unsafe { atomic_rmw_amo!(and, dst, val, order, $size) } } #[inline] @@ -350,7 +355,7 @@ macro_rules! atomic { let dst = self.v.get(); // SAFETY: any data races are prevented by atomic intrinsics and the raw // pointer passed in is valid because we got it from a reference. - unsafe { atomic_rmw_amo!(or, dst, val, order, $asm_suffix) } + unsafe { atomic_rmw_amo!(or, dst, val, order, $size) } } #[inline] @@ -358,7 +363,7 @@ macro_rules! atomic { let dst = self.v.get(); // SAFETY: any data races are prevented by atomic intrinsics and the raw // pointer passed in is valid because we got it from a reference. - unsafe { atomic_rmw_amo!(xor, dst, val, order, $asm_suffix) } + unsafe { atomic_rmw_amo!(xor, dst, val, order, $size) } } #[inline] @@ -370,7 +375,7 @@ macro_rules! atomic { let val: u64 = !0; // SAFETY: any data races are prevented by atomic intrinsics and the raw // pointer passed in is valid because we got it from a reference. - unsafe { atomic_rmw_amo!(xor, dst, val, order, $asm_suffix) } + unsafe { atomic_rmw_amo!(xor, dst, val, order, $size) } } #[cfg(not(any( portable_atomic_unsafe_assume_single_core, @@ -386,7 +391,7 @@ macro_rules! atomic { let dst = self.v.get(); // SAFETY: any data races are prevented by atomic intrinsics and the raw // pointer passed in is valid because we got it from a reference. - unsafe { atomic_rmw_amo!($max, dst, val, order, $asm_suffix) } + unsafe { atomic_rmw_amo!($max, dst, val, order, $size) } } #[inline] @@ -394,7 +399,7 @@ macro_rules! 
atomic { let dst = self.v.get(); // SAFETY: any data races are prevented by atomic intrinsics and the raw // pointer passed in is valid because we got it from a reference. - unsafe { atomic_rmw_amo!($min, dst, val, order, $asm_suffix) } + unsafe { atomic_rmw_amo!($min, dst, val, order, $size) } } } }; @@ -446,11 +451,11 @@ zero_extend!(i8, u8); zero_extend!(i16, u16); macro_rules! atomic_sub_word { - ($atomic_type:ident, $value_type:ty, $asm_suffix:tt, $max:tt, $min:tt) => { + ($atomic_type:ident, $value_type:ty, $size:tt, $max:tt, $min:tt) => { #[cfg(any(target_feature = "zabha", portable_atomic_target_feature = "zabha"))] - atomic!($atomic_type, $value_type, $asm_suffix, $max, $min); + atomic!($atomic_type, $value_type, $size, $max, $min); #[cfg(not(any(target_feature = "zabha", portable_atomic_target_feature = "zabha")))] - atomic_load_store!($atomic_type, $value_type, $asm_suffix); + atomic_load_store!($atomic_type, $value_type, $size); #[cfg(any( test, portable_atomic_force_amo,