diff --git a/compiler/rustc_codegen_cranelift/Readme.md b/compiler/rustc_codegen_cranelift/Readme.md index 5664cbe7d4fac..1a2b2bbc58812 100644 --- a/compiler/rustc_codegen_cranelift/Readme.md +++ b/compiler/rustc_codegen_cranelift/Readme.md @@ -76,8 +76,6 @@ configuration options. ## Not yet supported -* Inline assembly ([no cranelift support](https://github.com/bytecodealliance/wasmtime/issues/1041)) - * On UNIX there is support for invoking an external assembler for `global_asm!` and `asm!`. * SIMD ([tracked here](https://github.com/rust-lang/rustc_codegen_cranelift/issues/171), `std::simd` fully works, `std::arch` is partially supported) * Unwinding on panics ([no cranelift support](https://github.com/bytecodealliance/wasmtime/issues/1677), `-Cpanic=abort` is enabled by default) diff --git a/compiler/rustc_codegen_cranelift/build_system/tests.rs b/compiler/rustc_codegen_cranelift/build_system/tests.rs index 1e24d1b113fe6..10736ff9a55c8 100644 --- a/compiler/rustc_codegen_cranelift/build_system/tests.rs +++ b/compiler/rustc_codegen_cranelift/build_system/tests.rs @@ -99,6 +99,7 @@ const BASE_SYSROOT_SUITE: &[TestCase] = &[ TestCase::build_bin_and_run("aot.mod_bench", "example/mod_bench.rs", &[]), TestCase::build_bin_and_run("aot.issue-72793", "example/issue-72793.rs", &[]), TestCase::build_bin("aot.issue-59326", "example/issue-59326.rs"), + TestCase::build_bin_and_run("aot.neon", "example/neon.rs", &[]), ]; pub(crate) static RAND_REPO: GitRepo = GitRepo::github( diff --git a/compiler/rustc_codegen_cranelift/config.txt b/compiler/rustc_codegen_cranelift/config.txt index 7ff805e58d967..2ccdc7d78748a 100644 --- a/compiler/rustc_codegen_cranelift/config.txt +++ b/compiler/rustc_codegen_cranelift/config.txt @@ -42,6 +42,7 @@ aot.float-minmax-pass aot.mod_bench aot.issue-72793 aot.issue-59326 +aot.neon testsuite.extended_sysroot test.rust-random/rand diff --git a/compiler/rustc_codegen_cranelift/example/neon.rs b/compiler/rustc_codegen_cranelift/example/neon.rs new file mode 100644 index 0000000000000..bad26947967dc --- /dev/null +++ b/compiler/rustc_codegen_cranelift/example/neon.rs @@ -0,0 +1,234 @@ +// Most of these tests are copied from https://github.com/japaric/stdsimd/blob/0f4413d01c4f0c3ffbc5a69e9a37fbc7235b31a9/coresimd/arm/neon.rs + +#![feature(portable_simd)] + +#[cfg(target_arch = "aarch64")] +use std::arch::aarch64::*; +use std::mem::transmute; +use std::simd::*; + +#[cfg(target_arch = "aarch64")] +unsafe fn test_vpmin_s8() { + let a = i8x8::from([1, -2, 3, -4, 5, 6, 7, 8]); + let b = i8x8::from([0, 3, 2, 5, 4, 7, 6, 9]); + let e = i8x8::from([-2, -4, 5, 7, 0, 2, 4, 6]); + let r: i8x8 = transmute(vpmin_s8(transmute(a), transmute(b))); + assert_eq!(r, e); +} + +#[cfg(target_arch = "aarch64")] +unsafe fn test_vpmin_s16() { + let a = i16x4::from([1, 2, 3, -4]); + let b = i16x4::from([0, 3, 2, 5]); + let e = i16x4::from([1, -4, 0, 2]); + let r: i16x4 = transmute(vpmin_s16(transmute(a), transmute(b))); + assert_eq!(r, e); +} + +#[cfg(target_arch = "aarch64")] +unsafe fn test_vpmin_s32() { + let a = i32x2::from([1, -2]); + let b = i32x2::from([0, 3]); + let e = i32x2::from([-2, 0]); + let r: i32x2 = transmute(vpmin_s32(transmute(a), transmute(b))); + assert_eq!(r, e); +} + +#[cfg(target_arch = "aarch64")] +unsafe fn test_vpmin_u8() { + let a = u8x8::from([1, 2, 3, 4, 5, 6, 7, 8]); + let b = u8x8::from([0, 3, 2, 5, 4, 7, 6, 9]); + let e = u8x8::from([1, 3, 5, 7, 0, 2, 4, 6]); + let r: u8x8 = transmute(vpmin_u8(transmute(a), transmute(b))); + assert_eq!(r, e); +} + +#[cfg(target_arch = "aarch64")] +unsafe fn test_vpmin_u16() { + let a = u16x4::from([1, 2, 3, 4]); + let b = u16x4::from([0, 3, 2, 5]); + let e = u16x4::from([1, 3, 0, 2]); + let r: u16x4 = transmute(vpmin_u16(transmute(a), transmute(b))); + assert_eq!(r, e); +} + +#[cfg(target_arch = "aarch64")] +unsafe fn test_vpmin_u32() { + let a = u32x2::from([1, 2]); + let b = u32x2::from([0, 3]); + let e = u32x2::from([1, 0]); + let r: u32x2 = transmute(vpmin_u32(transmute(a), transmute(b))); + assert_eq!(r, e); +} + +#[cfg(target_arch = "aarch64")] +unsafe fn test_vpmin_f32() { + let a = f32x2::from([1., -2.]); + let b = f32x2::from([0., 3.]); + let e = f32x2::from([-2., 0.]); + let r: f32x2 = transmute(vpmin_f32(transmute(a), transmute(b))); + assert_eq!(r, e); +} + +#[cfg(target_arch = "aarch64")] +unsafe fn test_vpmax_s8() { + let a = i8x8::from([1, -2, 3, -4, 5, 6, 7, 8]); + let b = i8x8::from([0, 3, 2, 5, 4, 7, 6, 9]); + let e = i8x8::from([1, 3, 6, 8, 3, 5, 7, 9]); + let r: i8x8 = transmute(vpmax_s8(transmute(a), transmute(b))); + assert_eq!(r, e); +} + +#[cfg(target_arch = "aarch64")] +unsafe fn test_vpmax_s16() { + let a = i16x4::from([1, 2, 3, -4]); + let b = i16x4::from([0, 3, 2, 5]); + let e = i16x4::from([2, 3, 3, 5]); + let r: i16x4 = transmute(vpmax_s16(transmute(a), transmute(b))); + assert_eq!(r, e); +} + +#[cfg(target_arch = "aarch64")] +unsafe fn test_vpmax_s32() { + let a = i32x2::from([1, -2]); + let b = i32x2::from([0, 3]); + let e = i32x2::from([1, 3]); + let r: i32x2 = transmute(vpmax_s32(transmute(a), transmute(b))); + assert_eq!(r, e); +} + +#[cfg(target_arch = "aarch64")] +unsafe fn test_vpmax_u8() { + let a = u8x8::from([1, 2, 3, 4, 5, 6, 7, 8]); + let b = u8x8::from([0, 3, 2, 5, 4, 7, 6, 9]); + let e = u8x8::from([2, 4, 6, 8, 3, 5, 7, 9]); + let r: u8x8 = transmute(vpmax_u8(transmute(a), transmute(b))); + assert_eq!(r, e); +} + +#[cfg(target_arch = "aarch64")] +unsafe fn test_vpmax_u16() { + let a = u16x4::from([1, 2, 3, 4]); + let b = u16x4::from([0, 3, 2, 5]); + let e = u16x4::from([2, 4, 3, 5]); + let r: u16x4 = transmute(vpmax_u16(transmute(a), transmute(b))); + assert_eq!(r, e); +} + +#[cfg(target_arch = "aarch64")] +unsafe fn test_vpmax_u32() { + let a = u32x2::from([1, 2]); + let b = u32x2::from([0, 3]); + let e = u32x2::from([2, 3]); + let r: u32x2 = transmute(vpmax_u32(transmute(a), transmute(b))); + assert_eq!(r, e); +} + +#[cfg(target_arch = "aarch64")] +unsafe fn test_vpmax_f32() { + let a = f32x2::from([1., -2.]); + let b = f32x2::from([0., 3.]); + let e = f32x2::from([1., 3.]); + let r: f32x2 = transmute(vpmax_f32(transmute(a), transmute(b))); + assert_eq!(r, e); +} + +#[cfg(target_arch = "aarch64")] +unsafe fn test_vpadd_s16() { + let a = i16x4::from([1, 2, 3, 4]); + let b = i16x4::from([0, -1, -2, -3]); + let r: i16x4 = transmute(vpadd_s16(transmute(a), transmute(b))); + let e = i16x4::from([3, 7, -1, -5]); + assert_eq!(r, e); +} +#[cfg(target_arch = "aarch64")] +unsafe fn test_vpadd_s32() { + let a = i32x2::from([1, 2]); + let b = i32x2::from([0, -1]); + let r: i32x2 = transmute(vpadd_s32(transmute(a), transmute(b))); + let e = i32x2::from([3, -1]); + assert_eq!(r, e); +} +#[cfg(target_arch = "aarch64")] +unsafe fn test_vpadd_s8() { + let a = i8x8::from([1, 2, 3, 4, 5, 6, 7, 8]); + let b = i8x8::from([0, -1, -2, -3, -4, -5, -6, -7]); + let r: i8x8 = transmute(vpadd_s8(transmute(a), transmute(b))); + let e = i8x8::from([3, 7, 11, 15, -1, -5, -9, -13]); + assert_eq!(r, e); +} +#[cfg(target_arch = "aarch64")] +unsafe fn test_vpadd_u16() { + let a = u16x4::from([1, 2, 3, 4]); + let b = u16x4::from([30, 31, 32, 33]); + let r: u16x4 = transmute(vpadd_u16(transmute(a), transmute(b))); + let e = u16x4::from([3, 7, 61, 65]); + assert_eq!(r, e); +} +#[cfg(target_arch = "aarch64")] +unsafe fn test_vpadd_u32() { + let a = u32x2::from([1, 2]); + let b = u32x2::from([30, 31]); + let r: u32x2 = transmute(vpadd_u32(transmute(a), transmute(b))); + let e = u32x2::from([3, 61]); + assert_eq!(r, e); +} +#[cfg(target_arch = "aarch64")] +unsafe fn test_vpadd_u8() { + let a = u8x8::from([1, 2, 3, 4, 5, 6, 7, 8]); + let b = u8x8::from([30, 31, 32, 33, 34, 35, 36, 37]); + let r: u8x8 = transmute(vpadd_u8(transmute(a), transmute(b))); + let e = u8x8::from([3, 7, 11, 15, 61, 65, 69, 73]); + assert_eq!(r, e); +} + +#[cfg(target_arch = "aarch64")] +unsafe fn test_vqsub_u8() { + let a = u8x8::from([1, 2, 3, 4, 5, 6, 7, 0xff]); + let b = u8x8::from([30, 1, 1, 1, 34, 0xff, 36, 37]); + let r: u8x8 = transmute(vqsub_u8(transmute(a), transmute(b))); + let e = u8x8::from([0, 1, 2, 3, 0, 0, 0, 218]); + assert_eq!(r, e); +} + +#[cfg(target_arch = "aarch64")] +unsafe fn test_vqadd_u8() { + let a = u8x8::from([1, 2, 3, 4, 5, 6, 7, 0xff]); + let b = u8x8::from([30, 1, 1, 1, 34, 0xff, 36, 37]); + let r: u8x8 = transmute(vqadd_u8(transmute(a), transmute(b))); + let e = u8x8::from([31, 3, 4, 5, 39, 0xff, 43, 0xff]); + assert_eq!(r, e); +} + +#[cfg(target_arch = "aarch64")] +fn main() { + unsafe { + test_vpmin_s8(); + test_vpmin_s16(); + test_vpmin_s32(); + test_vpmin_u8(); + test_vpmin_u16(); + test_vpmin_u32(); + test_vpmin_f32(); + test_vpmax_s8(); + test_vpmax_s16(); + test_vpmax_s32(); + test_vpmax_u8(); + test_vpmax_u16(); + test_vpmax_u32(); + test_vpmax_f32(); + + test_vpadd_s16(); + test_vpadd_s32(); + test_vpadd_s8(); + test_vpadd_u16(); + test_vpadd_u32(); + test_vpadd_u8(); + + test_vqsub_u8(); + test_vqadd_u8(); + } +} + +#[cfg(not(target_arch = "aarch64"))] +fn main() {} diff --git a/compiler/rustc_codegen_cranelift/patches/stdlib-lock.toml b/compiler/rustc_codegen_cranelift/patches/stdlib-lock.toml index 9902bca8eab27..8a690bada0df5 100644 --- a/compiler/rustc_codegen_cranelift/patches/stdlib-lock.toml +++ b/compiler/rustc_codegen_cranelift/patches/stdlib-lock.toml @@ -58,9 +58,9 @@ dependencies = [ [[package]] name = "compiler_builtins" -version = "0.1.100" +version = "0.1.103" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d6c0f24437059853f0fa64afc51f338f93647a3de4cf3358ba1bb4171a199775" +checksum = "a3b73c3443a5fd2438d7ba4853c64e4c8efc2404a9e28a9234cc2d5eebc6c242" dependencies = [ "cc", "rustc-std-workspace-core", @@ -158,9 +158,9 @@ dependencies = [ [[package]] name = "libc" -version = "0.2.149" +version = "0.2.150" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a08173bc88b7955d1b3145aa561539096c421ac8debde8cbc3612ec635fee29b" +checksum = "89d92a4743f9a61002fae18374ed11e7973f530cb3a3255fb354818118b2203c" dependencies = [ "rustc-std-workspace-core", ] @@ -415,7 +415,6 @@ dependencies = [ name = "unwind" version = "0.0.0" dependencies = [ - "cc", "cfg-if", "compiler_builtins", "core", diff --git a/compiler/rustc_codegen_cranelift/rust-toolchain b/compiler/rustc_codegen_cranelift/rust-toolchain index 7e3eaacf8ef08..b832b06e0ffba 100644 --- a/compiler/rustc_codegen_cranelift/rust-toolchain +++ b/compiler/rustc_codegen_cranelift/rust-toolchain @@ -1,3 +1,3 @@ [toolchain] -channel = "nightly-2023-10-29" +channel = "nightly-2023-11-10" components = ["rust-src", "rustc-dev", "llvm-tools"] diff --git a/compiler/rustc_codegen_cranelift/scripts/test_rustc_tests.sh b/compiler/rustc_codegen_cranelift/scripts/test_rustc_tests.sh index a299b6de6b1cd..cdc78adcf85e3 100755 --- a/compiler/rustc_codegen_cranelift/scripts/test_rustc_tests.sh +++ b/compiler/rustc_codegen_cranelift/scripts/test_rustc_tests.sh @@ -146,6 +146,11 @@ rm tests/ui/process/nofile-limit.rs # TODO some AArch64 linking issue rm tests/ui/stdio-is-blocking.rs # really slow with unoptimized libstd +# rustc bugs +# ========== +# https://github.com/rust-lang/rust/pull/116447#issuecomment-1790451463 +rm tests/ui/coroutine/gen_block_*.rs + cp ../dist/bin/rustdoc-clif ../dist/bin/rustdoc # some tests expect bin/rustdoc to exist # prevent $(RUSTDOC) from picking up the sysroot built by x.py. It conflicts with the one used by diff --git a/compiler/rustc_codegen_cranelift/src/inline_asm.rs b/compiler/rustc_codegen_cranelift/src/inline_asm.rs index 331649b2ec24f..ce0eecca8a8ba 100644 --- a/compiler/rustc_codegen_cranelift/src/inline_asm.rs +++ b/compiler/rustc_codegen_cranelift/src/inline_asm.rs @@ -13,7 +13,7 @@ use crate::prelude::*; enum CInlineAsmOperand<'tcx> { In { reg: InlineAsmRegOrRegClass, - value: CValue<'tcx>, + value: Value, }, Out { reg: InlineAsmRegOrRegClass, @@ -23,7 +23,7 @@ enum CInlineAsmOperand<'tcx> { InOut { reg: InlineAsmRegOrRegClass, _late: bool, - in_value: CValue<'tcx>, + in_value: Value, out_place: Option>, }, Const { @@ -47,7 +47,9 @@ pub(crate) fn codegen_inline_asm<'tcx>( // Used by panic_abort on Windows, but uses a syntax which only happens to work with // asm!() by accident and breaks with the GNU assembler as well as global_asm!() for // the LLVM backend. - if template[0] == InlineAsmTemplatePiece::String("int $$0x29".to_string()) { + if template.len() == 1 + && template[0] == InlineAsmTemplatePiece::String("int $$0x29".to_string()) + { fx.bcx.ins().trap(TrapCode::User(1)); return; } @@ -55,9 +57,10 @@ pub(crate) fn codegen_inline_asm<'tcx>( let operands = operands .into_iter() .map(|operand| match *operand { - InlineAsmOperand::In { reg, ref value } => { - CInlineAsmOperand::In { reg, value: crate::base::codegen_operand(fx, value) } - } + InlineAsmOperand::In { reg, ref value } => CInlineAsmOperand::In { + reg, + value: crate::base::codegen_operand(fx, value).load_scalar(fx), + }, InlineAsmOperand::Out { reg, late, ref place } => CInlineAsmOperand::Out { reg, late, @@ -67,7 +70,7 @@ pub(crate) fn codegen_inline_asm<'tcx>( CInlineAsmOperand::InOut { reg, _late: late, - in_value: crate::base::codegen_operand(fx, in_value), + in_value: crate::base::codegen_operand(fx, in_value).load_scalar(fx), out_place: out_place.map(|place| crate::base::codegen_place(fx, place)), } } @@ -165,7 +168,7 @@ pub(crate) fn codegen_inline_asm<'tcx>( for (i, operand) in operands.iter().enumerate() { match operand { CInlineAsmOperand::In { reg: _, value } => { - inputs.push((asm_gen.stack_slots_input[i].unwrap(), value.load_scalar(fx))); + inputs.push((asm_gen.stack_slots_input[i].unwrap(), *value)); } CInlineAsmOperand::Out { reg: _, late: _, place } => { if let Some(place) = place { @@ -173,7 +176,7 @@ pub(crate) fn codegen_inline_asm<'tcx>( } } CInlineAsmOperand::InOut { reg: _, _late: _, in_value, out_place } => { - inputs.push((asm_gen.stack_slots_input[i].unwrap(), in_value.load_scalar(fx))); + inputs.push((asm_gen.stack_slots_input[i].unwrap(), *in_value)); if let Some(out_place) = out_place { outputs.push((asm_gen.stack_slots_output[i].unwrap(), *out_place)); } @@ -726,3 +729,83 @@ fn call_inline_asm<'tcx>( place.write_cvalue(fx, CValue::by_val(value, place.layout())); } } + +pub(crate) fn codegen_xgetbv<'tcx>( + fx: &mut FunctionCx<'_, '_, 'tcx>, + xcr_no: Value, + ret: CPlace<'tcx>, +) { + // FIXME add .eh_frame unwind info directives + + let operands = vec![ + CInlineAsmOperand::In { + reg: InlineAsmRegOrRegClass::Reg(InlineAsmReg::X86(X86InlineAsmReg::cx)), + value: xcr_no, + }, + CInlineAsmOperand::Out { + reg: InlineAsmRegOrRegClass::Reg(InlineAsmReg::X86(X86InlineAsmReg::ax)), + late: true, + place: Some(ret), + }, + CInlineAsmOperand::Out { + reg: InlineAsmRegOrRegClass::Reg(InlineAsmReg::X86(X86InlineAsmReg::dx)), + late: true, + place: None, + }, + ]; + let options = InlineAsmOptions::NOSTACK | InlineAsmOptions::PURE | InlineAsmOptions::NOMEM; + + let mut inputs = Vec::new(); + let mut outputs = Vec::new(); + + let mut asm_gen = InlineAssemblyGenerator { + tcx: fx.tcx, + arch: fx.tcx.sess.asm_arch.unwrap(), + enclosing_def_id: fx.instance.def_id(), + template: &[InlineAsmTemplatePiece::String( + " + xgetbv + // out = rdx << 32 | rax + shl rdx, 32 + or rax, rdx + " + .to_string(), + )], + operands: &operands, + options, + registers: Vec::new(), + stack_slots_clobber: Vec::new(), + stack_slots_input: Vec::new(), + stack_slots_output: Vec::new(), + stack_slot_size: Size::from_bytes(0), + }; + asm_gen.allocate_registers(); + asm_gen.allocate_stack_slots(); + + let inline_asm_index = fx.cx.inline_asm_index.get(); + fx.cx.inline_asm_index.set(inline_asm_index + 1); + let asm_name = format!( + "__inline_asm_{}_n{}", + fx.cx.cgu_name.as_str().replace('.', "__").replace('-', "_"), + inline_asm_index + ); + + let generated_asm = asm_gen.generate_asm_wrapper(&asm_name); + fx.cx.global_asm.push_str(&generated_asm); + + for (i, operand) in operands.iter().enumerate() { + match operand { + CInlineAsmOperand::In { reg: _, value } => { + inputs.push((asm_gen.stack_slots_input[i].unwrap(), *value)); + } + CInlineAsmOperand::Out { reg: _, late: _, place } => { + if let Some(place) = place { + outputs.push((asm_gen.stack_slots_output[i].unwrap(), *place)); + } + } + _ => unreachable!(), + } + } + + call_inline_asm(fx, &asm_name, asm_gen.stack_slot_size, inputs, outputs); +} diff --git a/compiler/rustc_codegen_cranelift/src/intrinsics/llvm.rs b/compiler/rustc_codegen_cranelift/src/intrinsics/llvm.rs index c169476099806..e9b7daf14924d 100644 --- a/compiler/rustc_codegen_cranelift/src/intrinsics/llvm.rs +++ b/compiler/rustc_codegen_cranelift/src/intrinsics/llvm.rs @@ -51,6 +51,21 @@ pub(crate) fn codegen_llvm_intrinsic_call<'tcx>( }); } + _ if intrinsic.starts_with("llvm.fma.v") => { + intrinsic_args!(fx, args => (x,y,z); intrinsic); + + simd_trio_for_each_lane( + fx, + x, + y, + z, + ret, + &|fx, _lane_ty, _res_lane_ty, lane_x, lane_y, lane_z| { + fx.bcx.ins().fma(lane_x, lane_y, lane_z) + }, + ); + } + _ => { fx.tcx .sess diff --git a/compiler/rustc_codegen_cranelift/src/intrinsics/llvm_aarch64.rs b/compiler/rustc_codegen_cranelift/src/intrinsics/llvm_aarch64.rs index 0c211a06dc4a7..ee098be1fce6b 100644 --- a/compiler/rustc_codegen_cranelift/src/intrinsics/llvm_aarch64.rs +++ b/compiler/rustc_codegen_cranelift/src/intrinsics/llvm_aarch64.rs @@ -44,7 +44,9 @@ pub(crate) fn codegen_aarch64_llvm_intrinsic_call<'tcx>( }); } - _ if intrinsic.starts_with("llvm.aarch64.neon.sqadd.v") => { + _ if intrinsic.starts_with("llvm.aarch64.neon.sqadd.v") + || intrinsic.starts_with("llvm.aarch64.neon.uqadd.v") => + { intrinsic_args!(fx, args => (x, y); intrinsic); simd_pair_for_each_lane_typed(fx, x, y, ret, &|fx, x_lane, y_lane| { @@ -52,7 +54,9 @@ pub(crate) fn codegen_aarch64_llvm_intrinsic_call<'tcx>( }); } - _ if intrinsic.starts_with("llvm.aarch64.neon.sqsub.v") => { + _ if intrinsic.starts_with("llvm.aarch64.neon.sqsub.v") + || intrinsic.starts_with("llvm.aarch64.neon.uqsub.v") => + { intrinsic_args!(fx, args => (x, y); intrinsic); simd_pair_for_each_lane_typed(fx, x, y, ret, &|fx, x_lane, y_lane| { @@ -156,6 +160,90 @@ pub(crate) fn codegen_aarch64_llvm_intrinsic_call<'tcx>( }); } + _ if intrinsic.starts_with("llvm.aarch64.neon.umaxp.v") => { + intrinsic_args!(fx, args => (x, y); intrinsic); + + simd_horizontal_pair_for_each_lane( + fx, + x, + y, + ret, + &|fx, _lane_ty, _res_lane_ty, x_lane, y_lane| fx.bcx.ins().umax(x_lane, y_lane), + ); + } + + _ if intrinsic.starts_with("llvm.aarch64.neon.smaxp.v") => { + intrinsic_args!(fx, args => (x, y); intrinsic); + + simd_horizontal_pair_for_each_lane( + fx, + x, + y, + ret, + &|fx, _lane_ty, _res_lane_ty, x_lane, y_lane| fx.bcx.ins().smax(x_lane, y_lane), + ); + } + + _ if intrinsic.starts_with("llvm.aarch64.neon.uminp.v") => { + intrinsic_args!(fx, args => (x, y); intrinsic); + + simd_horizontal_pair_for_each_lane( + fx, + x, + y, + ret, + &|fx, _lane_ty, _res_lane_ty, x_lane, y_lane| fx.bcx.ins().umin(x_lane, y_lane), + ); + } + + _ if intrinsic.starts_with("llvm.aarch64.neon.sminp.v") => { + intrinsic_args!(fx, args => (x, y); intrinsic); + + simd_horizontal_pair_for_each_lane( + fx, + x, + y, + ret, + &|fx, _lane_ty, _res_lane_ty, x_lane, y_lane| fx.bcx.ins().smin(x_lane, y_lane), + ); + } + + _ if intrinsic.starts_with("llvm.aarch64.neon.fminp.v") => { + intrinsic_args!(fx, args => (x, y); intrinsic); + + simd_horizontal_pair_for_each_lane( + fx, + x, + y, + ret, + &|fx, _lane_ty, _res_lane_ty, x_lane, y_lane| fx.bcx.ins().fmin(x_lane, y_lane), + ); + } + + _ if intrinsic.starts_with("llvm.aarch64.neon.fmaxp.v") => { + intrinsic_args!(fx, args => (x, y); intrinsic); + + simd_horizontal_pair_for_each_lane( + fx, + x, + y, + ret, + &|fx, _lane_ty, _res_lane_ty, x_lane, y_lane| fx.bcx.ins().fmax(x_lane, y_lane), + ); + } + + _ if intrinsic.starts_with("llvm.aarch64.neon.addp.v") => { + intrinsic_args!(fx, args => (x, y); intrinsic); + + simd_horizontal_pair_for_each_lane( + fx, + x, + y, + ret, + &|fx, _lane_ty, _res_lane_ty, x_lane, y_lane| fx.bcx.ins().iadd(x_lane, y_lane), + ); + } + // FIXME generalize vector types "llvm.aarch64.neon.tbl1.v16i8" => { intrinsic_args!(fx, args => (t, idx); intrinsic); @@ -172,25 +260,6 @@ pub(crate) fn codegen_aarch64_llvm_intrinsic_call<'tcx>( } } - // FIXME generalize vector types - "llvm.aarch64.neon.umaxp.v16i8" => { - intrinsic_args!(fx, args => (a, b); intrinsic); - - // FIXME add helper for horizontal pairwise operations - for i in 0..8 { - let lane1 = a.value_lane(fx, i * 2).load_scalar(fx); - let lane2 = a.value_lane(fx, i * 2 + 1).load_scalar(fx); - let res = fx.bcx.ins().umax(lane1, lane2); - ret.place_lane(fx, i).to_ptr().store(fx, res, MemFlags::trusted()); - } - for i in 0..8 { - let lane1 = b.value_lane(fx, i * 2).load_scalar(fx); - let lane2 = b.value_lane(fx, i * 2 + 1).load_scalar(fx); - let res = fx.bcx.ins().umax(lane1, lane2); - ret.place_lane(fx, 8 + i).to_ptr().store(fx, res, MemFlags::trusted()); - } - } - /* _ if intrinsic.starts_with("llvm.aarch64.neon.sshl.v") || intrinsic.starts_with("llvm.aarch64.neon.sqshl.v") diff --git a/compiler/rustc_codegen_cranelift/src/intrinsics/llvm_x86.rs b/compiler/rustc_codegen_cranelift/src/intrinsics/llvm_x86.rs index ea5997a14bb74..4c536048626ee 100644 --- a/compiler/rustc_codegen_cranelift/src/intrinsics/llvm_x86.rs +++ b/compiler/rustc_codegen_cranelift/src/intrinsics/llvm_x86.rs @@ -20,16 +20,21 @@ pub(crate) fn codegen_x86_llvm_intrinsic_call<'tcx>( // Used by is_x86_feature_detected!(); "llvm.x86.xgetbv" => { - // FIXME use the actual xgetbv instruction - intrinsic_args!(fx, args => (v); intrinsic); + intrinsic_args!(fx, args => (xcr_no); intrinsic); - let v = v.load_scalar(fx); + let xcr_no = xcr_no.load_scalar(fx); - // As of writing on XCR0 exists - fx.bcx.ins().trapnz(v, TrapCode::UnreachableCodeReached); + crate::inline_asm::codegen_xgetbv(fx, xcr_no, ret); + } + + "llvm.x86.sse3.ldu.dq" | "llvm.x86.avx.ldu.dq.256" => { + // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_lddqu_si128&ig_expand=4009 + // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_lddqu_si256&ig_expand=4010 + intrinsic_args!(fx, args => (ptr); intrinsic); - let res = fx.bcx.ins().iconst(types::I64, 1 /* bit 0 must be set */); - ret.write_cvalue(fx, CValue::by_val(res, fx.layout_of(fx.tcx.types.i64))); + // FIXME correctly handle unalignedness + let val = CValue::by_ref(Pointer::new(ptr.load_scalar(fx)), ret.layout()); + ret.write_cvalue(fx, val); } "llvm.x86.sse.cmp.ps" | "llvm.x86.sse2.cmp.pd" => { @@ -177,8 +182,12 @@ pub(crate) fn codegen_x86_llvm_intrinsic_call<'tcx>( } } } - "llvm.x86.avx2.vperm2i128" => { + "llvm.x86.avx2.vperm2i128" + | "llvm.x86.avx.vperm2f128.ps.256" + | "llvm.x86.avx.vperm2f128.pd.256" => { // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_permute2x128_si256 + // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_permute2f128_ps + // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_permute2f128_pd let (a, b, imm8) = match args { [a, b, imm8] => (a, b, imm8), _ => bug!("wrong number of args for intrinsic {intrinsic}"), @@ -187,19 +196,11 @@ pub(crate) fn codegen_x86_llvm_intrinsic_call<'tcx>( let b = codegen_operand(fx, b); let imm8 = codegen_operand(fx, imm8).load_scalar(fx); - let a_0 = a.value_lane(fx, 0).load_scalar(fx); - let a_1 = a.value_lane(fx, 1).load_scalar(fx); - let a_low = fx.bcx.ins().iconcat(a_0, a_1); - let a_2 = a.value_lane(fx, 2).load_scalar(fx); - let a_3 = a.value_lane(fx, 3).load_scalar(fx); - let a_high = fx.bcx.ins().iconcat(a_2, a_3); + let a_low = a.value_typed_lane(fx, fx.tcx.types.u128, 0).load_scalar(fx); + let a_high = a.value_typed_lane(fx, fx.tcx.types.u128, 1).load_scalar(fx); - let b_0 = b.value_lane(fx, 0).load_scalar(fx); - let b_1 = b.value_lane(fx, 1).load_scalar(fx); - let b_low = fx.bcx.ins().iconcat(b_0, b_1); - let b_2 = b.value_lane(fx, 2).load_scalar(fx); - let b_3 = b.value_lane(fx, 3).load_scalar(fx); - let b_high = fx.bcx.ins().iconcat(b_2, b_3); + let b_low = b.value_typed_lane(fx, fx.tcx.types.u128, 0).load_scalar(fx); + let b_high = b.value_typed_lane(fx, fx.tcx.types.u128, 1).load_scalar(fx); fn select4( fx: &mut FunctionCx<'_, '_, '_>, @@ -224,16 +225,20 @@ pub(crate) fn codegen_x86_llvm_intrinsic_call<'tcx>( let control0 = imm8; let res_low = select4(fx, a_high, a_low, b_high, b_low, control0); - let (res_0, res_1) = fx.bcx.ins().isplit(res_low); let control1 = fx.bcx.ins().ushr_imm(imm8, 4); let res_high = select4(fx, a_high, a_low, b_high, b_low, control1); - let (res_2, res_3) = fx.bcx.ins().isplit(res_high); - ret.place_lane(fx, 0).to_ptr().store(fx, res_0, MemFlags::trusted()); - ret.place_lane(fx, 1).to_ptr().store(fx, res_1, MemFlags::trusted()); - ret.place_lane(fx, 2).to_ptr().store(fx, res_2, MemFlags::trusted()); - ret.place_lane(fx, 3).to_ptr().store(fx, res_3, MemFlags::trusted()); + ret.place_typed_lane(fx, fx.tcx.types.u128, 0).to_ptr().store( + fx, + res_low, + MemFlags::trusted(), + ); + ret.place_typed_lane(fx, fx.tcx.types.u128, 1).to_ptr().store( + fx, + res_high, + MemFlags::trusted(), + ); } "llvm.x86.ssse3.pabs.b.128" | "llvm.x86.ssse3.pabs.w.128" | "llvm.x86.ssse3.pabs.d.128" => { let a = match args { @@ -309,7 +314,9 @@ pub(crate) fn codegen_x86_llvm_intrinsic_call<'tcx>( fx.bcx.ins().sshr(a_lane, saturated_count) }); } - "llvm.x86.sse2.psad.bw" => { + "llvm.x86.sse2.psad.bw" | "llvm.x86.avx2.psad.bw" => { + // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sad_epu8&ig_expand=5770 + // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_sad_epu8&ig_expand=5771 intrinsic_args!(fx, args => (a, b); intrinsic); assert_eq!(a.layout(), b.layout()); @@ -340,7 +347,9 @@ pub(crate) fn codegen_x86_llvm_intrinsic_call<'tcx>( ret.place_lane(fx, out_lane_idx).write_cvalue(fx, res_lane); } } - "llvm.x86.ssse3.pmadd.ub.sw.128" => { + "llvm.x86.ssse3.pmadd.ub.sw.128" | "llvm.x86.avx2.pmadd.ub.sw" => { + // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maddubs_epi16&ig_expand=4267 + // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maddubs_epi16&ig_expand=4270 intrinsic_args!(fx, args => (a, b); intrinsic); let (lane_count, lane_ty) = a.layout().ty.simd_size_and_type(fx.tcx); @@ -379,7 +388,9 @@ pub(crate) fn codegen_x86_llvm_intrinsic_call<'tcx>( ret.place_lane(fx, out_lane_idx).write_cvalue(fx, res_lane); } } - "llvm.x86.sse2.pmadd.wd" => { + "llvm.x86.sse2.pmadd.wd" | "llvm.x86.avx2.pmadd.wd" => { + // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_madd_epi16&ig_expand=4231 + // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_madd_epi16&ig_expand=4234 intrinsic_args!(fx, args => (a, b); intrinsic); assert_eq!(a.layout(), b.layout()); @@ -412,6 +423,369 @@ pub(crate) fn codegen_x86_llvm_intrinsic_call<'tcx>( ret.place_lane(fx, out_lane_idx).write_cvalue(fx, res_lane); } } + + "llvm.x86.ssse3.pmul.hr.sw.128" => { + // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mulhrs_epi16&ig_expand=4782 + intrinsic_args!(fx, args => (a, b); intrinsic); + + assert_eq!(a.layout(), b.layout()); + let layout = a.layout(); + + let (lane_count, lane_ty) = layout.ty.simd_size_and_type(fx.tcx); + let (ret_lane_count, ret_lane_ty) = ret.layout().ty.simd_size_and_type(fx.tcx); + assert_eq!(lane_ty, fx.tcx.types.i16); + assert_eq!(ret_lane_ty, fx.tcx.types.i16); + assert_eq!(lane_count, ret_lane_count); + + let ret_lane_layout = fx.layout_of(fx.tcx.types.i16); + for out_lane_idx in 0..lane_count { + let a_lane = a.value_lane(fx, out_lane_idx).load_scalar(fx); + let a_lane = fx.bcx.ins().sextend(types::I32, a_lane); + let b_lane = b.value_lane(fx, out_lane_idx).load_scalar(fx); + let b_lane = fx.bcx.ins().sextend(types::I32, b_lane); + + let mul: Value = fx.bcx.ins().imul(a_lane, b_lane); + let shifted = fx.bcx.ins().ushr_imm(mul, 14); + let incremented = fx.bcx.ins().iadd_imm(shifted, 1); + let shifted_again = fx.bcx.ins().ushr_imm(incremented, 1); + + let res_lane = fx.bcx.ins().ireduce(types::I16, shifted_again); + let res_lane = CValue::by_val(res_lane, ret_lane_layout); + + ret.place_lane(fx, out_lane_idx).write_cvalue(fx, res_lane); + } + } + + "llvm.x86.sse2.packuswb.128" => { + // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_packus_epi16&ig_expand=4903 + intrinsic_args!(fx, args => (a, b); intrinsic); + + assert_eq!(a.layout(), b.layout()); + let layout = a.layout(); + + let (lane_count, lane_ty) = layout.ty.simd_size_and_type(fx.tcx); + let (ret_lane_count, ret_lane_ty) = ret.layout().ty.simd_size_and_type(fx.tcx); + assert_eq!(lane_ty, fx.tcx.types.i16); + assert_eq!(ret_lane_ty, fx.tcx.types.u8); + assert_eq!(lane_count * 2, ret_lane_count); + + let zero = fx.bcx.ins().iconst(types::I16, 0); + let max_u8 = fx.bcx.ins().iconst(types::I16, 255); + let ret_lane_layout = fx.layout_of(fx.tcx.types.u8); + + for idx in 0..lane_count { + let lane = a.value_lane(fx, idx).load_scalar(fx); + let sat = fx.bcx.ins().smax(lane, zero); + let sat = fx.bcx.ins().umin(sat, max_u8); + let res = fx.bcx.ins().ireduce(types::I8, sat); + + let res_lane = CValue::by_val(res, ret_lane_layout); + ret.place_lane(fx, idx).write_cvalue(fx, res_lane); + } + + for idx in 0..lane_count { + let lane = b.value_lane(fx, idx).load_scalar(fx); + let sat = fx.bcx.ins().smax(lane, zero); + let sat = fx.bcx.ins().umin(sat, max_u8); + let res = fx.bcx.ins().ireduce(types::I8, sat); + + let res_lane = CValue::by_val(res, ret_lane_layout); + ret.place_lane(fx, lane_count + idx).write_cvalue(fx, res_lane); + } + } + + "llvm.x86.avx2.packuswb" => { + // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_packus_epi16&ig_expand=4906 + intrinsic_args!(fx, args => (a, b); intrinsic); + + assert_eq!(a.layout(), b.layout()); + let layout = a.layout(); + + let (lane_count, lane_ty) = layout.ty.simd_size_and_type(fx.tcx); + let (ret_lane_count, ret_lane_ty) = ret.layout().ty.simd_size_and_type(fx.tcx); + assert_eq!(lane_ty, fx.tcx.types.i16); + assert_eq!(ret_lane_ty, fx.tcx.types.u8); + assert_eq!(lane_count * 2, ret_lane_count); + + let zero = fx.bcx.ins().iconst(types::I16, 0); + let max_u8 = fx.bcx.ins().iconst(types::I16, 255); + let ret_lane_layout = fx.layout_of(fx.tcx.types.u8); + + for idx in 0..lane_count / 2 { + let lane = a.value_lane(fx, idx).load_scalar(fx); + let sat = fx.bcx.ins().smax(lane, zero); + let sat = fx.bcx.ins().umin(sat, max_u8); + let res = fx.bcx.ins().ireduce(types::I8, sat); + + let res_lane = CValue::by_val(res, ret_lane_layout); + ret.place_lane(fx, idx).write_cvalue(fx, res_lane); + } + + for idx in 0..lane_count / 2 { + let lane = b.value_lane(fx, idx).load_scalar(fx); + let sat = fx.bcx.ins().smax(lane, zero); + let sat = fx.bcx.ins().umin(sat, max_u8); + let res = fx.bcx.ins().ireduce(types::I8, sat); + + let res_lane = CValue::by_val(res, ret_lane_layout); + ret.place_lane(fx, lane_count / 2 + idx).write_cvalue(fx, res_lane); + } + + for idx in 0..lane_count / 2 { + let lane = a.value_lane(fx, idx).load_scalar(fx); + let sat = fx.bcx.ins().smax(lane, zero); + let sat = fx.bcx.ins().umin(sat, max_u8); + let res = fx.bcx.ins().ireduce(types::I8, sat); + + let res_lane = CValue::by_val(res, ret_lane_layout); + ret.place_lane(fx, lane_count / 2 * 2 + idx).write_cvalue(fx, res_lane); + } + + for idx in 0..lane_count / 2 { + let lane = b.value_lane(fx, idx).load_scalar(fx); + let sat = fx.bcx.ins().smax(lane, zero); + let sat = fx.bcx.ins().umin(sat, max_u8); + let res = fx.bcx.ins().ireduce(types::I8, sat); + + let res_lane = CValue::by_val(res, ret_lane_layout); + ret.place_lane(fx, lane_count / 2 * 3 + idx).write_cvalue(fx, res_lane); + } + } + + "llvm.x86.sse2.packssdw.128" => { + // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_packs_epi32&ig_expand=4889 + intrinsic_args!(fx, args => (a, b); intrinsic); + + assert_eq!(a.layout(), b.layout()); + let layout = a.layout(); + + let (lane_count, lane_ty) = layout.ty.simd_size_and_type(fx.tcx); + let (ret_lane_count, ret_lane_ty) = ret.layout().ty.simd_size_and_type(fx.tcx); + assert_eq!(lane_ty, fx.tcx.types.i32); + assert_eq!(ret_lane_ty, fx.tcx.types.i16); + assert_eq!(lane_count * 2, ret_lane_count); + + let min_i16 = fx.bcx.ins().iconst(types::I32, i64::from(i16::MIN as u16)); + let max_i16 = fx.bcx.ins().iconst(types::I32, i64::from(i16::MAX as u16)); + let ret_lane_layout = fx.layout_of(fx.tcx.types.i16); + + for idx in 0..lane_count { + let lane = a.value_lane(fx, idx).load_scalar(fx); + let sat = fx.bcx.ins().smax(lane, min_i16); + let sat = fx.bcx.ins().umin(sat, max_i16); + let res = fx.bcx.ins().ireduce(types::I16, sat); + + let res_lane = CValue::by_val(res, ret_lane_layout); + ret.place_lane(fx, idx).write_cvalue(fx, res_lane); + } + + for idx in 0..lane_count { + let lane = b.value_lane(fx, idx).load_scalar(fx); + let sat = fx.bcx.ins().smax(lane, min_i16); + let sat = fx.bcx.ins().umin(sat, max_i16); + let res = fx.bcx.ins().ireduce(types::I16, sat); + + let res_lane = CValue::by_val(res, ret_lane_layout); + ret.place_lane(fx, lane_count + idx).write_cvalue(fx, res_lane); + } + } + + "llvm.x86.sse41.packusdw" => { + // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_packus_epi32&ig_expand=4912 + intrinsic_args!(fx, args => (a, b); intrinsic); + + assert_eq!(a.layout(), b.layout()); + let layout = a.layout(); + + let (lane_count, lane_ty) = layout.ty.simd_size_and_type(fx.tcx); + let (ret_lane_count, ret_lane_ty) = ret.layout().ty.simd_size_and_type(fx.tcx); + assert_eq!(lane_ty, fx.tcx.types.i32); + assert_eq!(ret_lane_ty, fx.tcx.types.u16); + assert_eq!(lane_count * 2, ret_lane_count); + + let min_u16 = fx.bcx.ins().iconst(types::I32, i64::from(u16::MIN)); + let max_u16 = fx.bcx.ins().iconst(types::I32, i64::from(u16::MAX)); + let ret_lane_layout = fx.layout_of(fx.tcx.types.u16); + + for idx in 0..lane_count { + let lane = a.value_lane(fx, idx).load_scalar(fx); + let sat = fx.bcx.ins().umax(lane, min_u16); + let sat = fx.bcx.ins().umin(sat, max_u16); + let res = fx.bcx.ins().ireduce(types::I16, sat); + + let res_lane = CValue::by_val(res, ret_lane_layout); + ret.place_lane(fx, idx).write_cvalue(fx, res_lane); + } + + for idx in 0..lane_count { + let lane = b.value_lane(fx, idx).load_scalar(fx); + let sat = fx.bcx.ins().umax(lane, min_u16); + let sat = fx.bcx.ins().umin(sat, max_u16); + let res = fx.bcx.ins().ireduce(types::I16, sat); + + let res_lane = CValue::by_val(res, ret_lane_layout); + ret.place_lane(fx, lane_count + idx).write_cvalue(fx, res_lane); + } + } + + "llvm.x86.avx2.packssdw" => { + // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_packs_epi32&ig_expand=4892 + intrinsic_args!(fx, args => (a, b); intrinsic); + + assert_eq!(a.layout(), b.layout()); + let layout = a.layout(); + + let (lane_count, lane_ty) = layout.ty.simd_size_and_type(fx.tcx); + let (ret_lane_count, ret_lane_ty) = ret.layout().ty.simd_size_and_type(fx.tcx); + assert_eq!(lane_ty, fx.tcx.types.i32); + assert_eq!(ret_lane_ty, fx.tcx.types.i16); + assert_eq!(lane_count * 2, ret_lane_count); + + let min_i16 = fx.bcx.ins().iconst(types::I32, i64::from(i16::MIN as u16)); + let max_i16 = fx.bcx.ins().iconst(types::I32, i64::from(i16::MAX as u16)); + let ret_lane_layout = fx.layout_of(fx.tcx.types.i16); + + for idx in 0..lane_count / 2 { + let lane = a.value_lane(fx, idx).load_scalar(fx); + let sat = fx.bcx.ins().smax(lane, min_i16); + let sat = fx.bcx.ins().umin(sat, max_i16); + let res = fx.bcx.ins().ireduce(types::I16, sat); + + let res_lane = CValue::by_val(res, ret_lane_layout); + ret.place_lane(fx, idx).write_cvalue(fx, res_lane); + } + + for idx in 0..lane_count / 2 { + let lane = b.value_lane(fx, idx).load_scalar(fx); + let sat = fx.bcx.ins().smax(lane, min_i16); + let sat = fx.bcx.ins().umin(sat, max_i16); + let res = fx.bcx.ins().ireduce(types::I16, sat); + + let res_lane = CValue::by_val(res, ret_lane_layout); + ret.place_lane(fx, lane_count / 2 + idx).write_cvalue(fx, res_lane); + } + + for idx in 0..lane_count / 2 { + let lane = a.value_lane(fx, idx).load_scalar(fx); + let sat = fx.bcx.ins().smax(lane, min_i16); + let sat = fx.bcx.ins().umin(sat, max_i16); + let res = fx.bcx.ins().ireduce(types::I16, sat); + + let res_lane = CValue::by_val(res, ret_lane_layout); + ret.place_lane(fx, lane_count / 2 * 2 + idx).write_cvalue(fx, res_lane); + } + + for idx in 0..lane_count / 2 { + let lane = b.value_lane(fx, idx).load_scalar(fx); + let sat = fx.bcx.ins().smax(lane, min_i16); + let sat = fx.bcx.ins().umin(sat, max_i16); + let res = fx.bcx.ins().ireduce(types::I16, sat); + + let res_lane = CValue::by_val(res, ret_lane_layout); + ret.place_lane(fx, lane_count / 2 * 3 + idx).write_cvalue(fx, res_lane); + } + } + + "llvm.x86.pclmulqdq" => { + // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_clmulepi64_si128&ig_expand=772 + intrinsic_args!(fx, args => (a, b, imm8); intrinsic); + + assert_eq!(a.layout(), b.layout()); + let layout = a.layout(); + + let (lane_count, lane_ty) = layout.ty.simd_size_and_type(fx.tcx); + let (ret_lane_count, ret_lane_ty) = ret.layout().ty.simd_size_and_type(fx.tcx); + assert_eq!(lane_ty, fx.tcx.types.i64); + assert_eq!(ret_lane_ty, fx.tcx.types.i64); + assert_eq!(lane_count, 2); + assert_eq!(ret_lane_count, 2); + + let imm8 = imm8.load_scalar(fx); + + let control0 = fx.bcx.ins().band_imm(imm8, 0b0000_0001); + let a_lane0 = a.value_lane(fx, 0).load_scalar(fx); + let a_lane1 = a.value_lane(fx, 1).load_scalar(fx); + let temp1 = fx.bcx.ins().select(control0, a_lane1, a_lane0); + + let control4 = fx.bcx.ins().band_imm(imm8, 0b0001_0000); + let b_lane0 = b.value_lane(fx, 0).load_scalar(fx); + let b_lane1 = b.value_lane(fx, 1).load_scalar(fx); + let temp2 = fx.bcx.ins().select(control4, b_lane1, b_lane0); + + fn extract_bit(fx: &mut FunctionCx<'_, '_, '_>, val: Value, bit: i64) -> Value { + let tmp = fx.bcx.ins().ushr_imm(val, bit); + fx.bcx.ins().band_imm(tmp, 1) + } + + let mut res1 = fx.bcx.ins().iconst(types::I64, 0); + for i in 0..=63 { + let x = extract_bit(fx, temp1, 0); + let y = extract_bit(fx, temp2, i); + let mut temp = fx.bcx.ins().band(x, y); + for j in 1..=i { + let x = extract_bit(fx, temp1, j); + let y = extract_bit(fx, temp2, i - j); + let z = fx.bcx.ins().band(x, y); + temp = fx.bcx.ins().bxor(temp, z); + } + let temp = fx.bcx.ins().ishl_imm(temp, i); + res1 = fx.bcx.ins().bor(res1, temp); + } + ret.place_lane(fx, 0).to_ptr().store(fx, res1, MemFlags::trusted()); + + let mut res2 = fx.bcx.ins().iconst(types::I64, 0); + for i in 64..=127 { + let mut temp = fx.bcx.ins().iconst(types::I64, 0); + for j in i - 63..=63 { + let x = extract_bit(fx, temp1, j); + let y = extract_bit(fx, temp2, i - j); + let z = fx.bcx.ins().band(x, y); + temp = fx.bcx.ins().bxor(temp, z); + } + let temp = fx.bcx.ins().ishl_imm(temp, i); + res2 = fx.bcx.ins().bor(res2, temp); + } + ret.place_lane(fx, 1).to_ptr().store(fx, res2, MemFlags::trusted()); + } + + "llvm.x86.avx.ptestz.256" => { + // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_testz_si256&ig_expand=6945 + intrinsic_args!(fx, args => (a, b); intrinsic); + + assert_eq!(a.layout(), b.layout()); + let layout = a.layout(); + + let (lane_count, lane_ty) = layout.ty.simd_size_and_type(fx.tcx); + assert_eq!(lane_ty, fx.tcx.types.i64); + assert_eq!(ret.layout().ty, fx.tcx.types.i32); + assert_eq!(lane_count, 4); + + let a_lane0 = a.value_lane(fx, 0).load_scalar(fx); + let a_lane1 = a.value_lane(fx, 1).load_scalar(fx); + let a_lane2 = a.value_lane(fx, 2).load_scalar(fx); + let a_lane3 = a.value_lane(fx, 3).load_scalar(fx); + let b_lane0 = b.value_lane(fx, 0).load_scalar(fx); + let b_lane1 = b.value_lane(fx, 1).load_scalar(fx); + let b_lane2 = b.value_lane(fx, 2).load_scalar(fx); + let b_lane3 = b.value_lane(fx, 3).load_scalar(fx); + + let zero0 = fx.bcx.ins().band(a_lane0, b_lane0); + let zero1 = fx.bcx.ins().band(a_lane1, b_lane1); + let zero2 = fx.bcx.ins().band(a_lane2, b_lane2); + let zero3 = fx.bcx.ins().band(a_lane3, b_lane3); + + let all_zero0 = fx.bcx.ins().bor(zero0, zero1); + let all_zero1 = fx.bcx.ins().bor(zero2, zero3); + let all_zero = fx.bcx.ins().bor(all_zero0, all_zero1); + + let res = fx.bcx.ins().icmp_imm(IntCC::Equal, all_zero, 0); + let res = CValue::by_val( + fx.bcx.ins().uextend(types::I32, res), + fx.layout_of(fx.tcx.types.i32), + ); + ret.write_cvalue(fx, res); + } + _ => { fx.tcx .sess diff --git a/compiler/rustc_codegen_cranelift/src/intrinsics/mod.rs b/compiler/rustc_codegen_cranelift/src/intrinsics/mod.rs index 83d5d53624eb1..bfeeb117ff5b3 100644 --- a/compiler/rustc_codegen_cranelift/src/intrinsics/mod.rs +++ b/compiler/rustc_codegen_cranelift/src/intrinsics/mod.rs @@ -132,6 +132,65 @@ fn simd_pair_for_each_lane<'tcx>( } } +fn simd_horizontal_pair_for_each_lane<'tcx>( + fx: &mut FunctionCx<'_, '_, 'tcx>, + x: CValue<'tcx>, + y: CValue<'tcx>, + ret: CPlace<'tcx>, + f: &dyn Fn(&mut FunctionCx<'_, '_, 'tcx>, Ty<'tcx>, Ty<'tcx>, Value, Value) -> Value, +) { + assert_eq!(x.layout(), y.layout()); + let layout = x.layout(); + + let (lane_count, lane_ty) = layout.ty.simd_size_and_type(fx.tcx); + let lane_layout = fx.layout_of(lane_ty); + let (ret_lane_count, ret_lane_ty) = ret.layout().ty.simd_size_and_type(fx.tcx); + let ret_lane_layout = fx.layout_of(ret_lane_ty); + assert_eq!(lane_count, ret_lane_count); + + for lane_idx in 0..lane_count { + let src = if lane_idx < (lane_count / 2) { x } else { y }; + let src_idx = lane_idx % (lane_count / 2); + + let lhs_lane = src.value_lane(fx, src_idx * 2).load_scalar(fx); + let rhs_lane = src.value_lane(fx, src_idx * 2 + 1).load_scalar(fx); + + let res_lane = f(fx, lane_layout.ty, ret_lane_layout.ty, lhs_lane, rhs_lane); + let res_lane = CValue::by_val(res_lane, ret_lane_layout); + + ret.place_lane(fx, lane_idx).write_cvalue(fx, res_lane); + } +} + +fn simd_trio_for_each_lane<'tcx>( + fx: &mut FunctionCx<'_, '_, 'tcx>, + x: CValue<'tcx>, + y: CValue<'tcx>, + z: CValue<'tcx>, + ret: CPlace<'tcx>, + f: &dyn Fn(&mut FunctionCx<'_, '_, 'tcx>, Ty<'tcx>, Ty<'tcx>, Value, Value, Value) -> Value, +) { + assert_eq!(x.layout(), y.layout()); + let layout = x.layout(); + + let (lane_count, lane_ty) = layout.ty.simd_size_and_type(fx.tcx); + let lane_layout = fx.layout_of(lane_ty); + let (ret_lane_count, ret_lane_ty) = ret.layout().ty.simd_size_and_type(fx.tcx); + let ret_lane_layout = fx.layout_of(ret_lane_ty); + assert_eq!(lane_count, ret_lane_count); + + for lane_idx in 0..lane_count { + let x_lane = x.value_lane(fx, lane_idx).load_scalar(fx); + let y_lane = y.value_lane(fx, lane_idx).load_scalar(fx); + let z_lane = z.value_lane(fx, lane_idx).load_scalar(fx); + + let res_lane = f(fx, lane_layout.ty, ret_lane_layout.ty, x_lane, y_lane, z_lane); + let res_lane = CValue::by_val(res_lane, ret_lane_layout); + + ret.place_lane(fx, lane_idx).write_cvalue(fx, res_lane); + } +} + fn simd_reduce<'tcx>( fx: &mut FunctionCx<'_, '_, 'tcx>, val: CValue<'tcx>, diff --git a/compiler/rustc_codegen_cranelift/src/value_and_place.rs b/compiler/rustc_codegen_cranelift/src/value_and_place.rs index 5f0aa6c5581dd..21ad2a835fc96 100644 --- a/compiler/rustc_codegen_cranelift/src/value_and_place.rs +++ b/compiler/rustc_codegen_cranelift/src/value_and_place.rs @@ -243,6 +243,34 @@ impl<'tcx> CValue<'tcx> { let (lane_count, lane_ty) = layout.ty.simd_size_and_type(fx.tcx); let lane_layout = fx.layout_of(lane_ty); assert!(lane_idx < lane_count); + + match self.0 { + CValueInner::ByVal(_) | CValueInner::ByValPair(_, _) => unreachable!(), + CValueInner::ByRef(ptr, None) => { + let field_offset = lane_layout.size * lane_idx; + let field_ptr = ptr.offset_i64(fx, i64::try_from(field_offset.bytes()).unwrap()); + CValue::by_ref(field_ptr, lane_layout) + } + CValueInner::ByRef(_, Some(_)) => unreachable!(), + } + } + + /// Like [`CValue::value_field`] except using the passed type as lane type instead of the one + /// specified by the vector type. + pub(crate) fn value_typed_lane( + self, + fx: &mut FunctionCx<'_, '_, 'tcx>, + lane_ty: Ty<'tcx>, + lane_idx: u64, + ) -> CValue<'tcx> { + let layout = self.1; + assert!(layout.ty.is_simd()); + let (orig_lane_count, orig_lane_ty) = layout.ty.simd_size_and_type(fx.tcx); + let lane_layout = fx.layout_of(lane_ty); + assert!( + (lane_idx + 1) * lane_layout.size <= orig_lane_count * fx.layout_of(orig_lane_ty).size + ); + match self.0 { CValueInner::ByVal(_) | CValueInner::ByValPair(_, _) => unreachable!(), CValueInner::ByRef(ptr, None) => { @@ -734,6 +762,34 @@ impl<'tcx> CPlace<'tcx> { } } + /// Like [`CPlace::place_field`] except using the passed type as lane type instead of the one + /// specified by the vector type. + pub(crate) fn place_typed_lane( + self, + fx: &mut FunctionCx<'_, '_, 'tcx>, + lane_ty: Ty<'tcx>, + lane_idx: u64, + ) -> CPlace<'tcx> { + let layout = self.layout(); + assert!(layout.ty.is_simd()); + let (orig_lane_count, orig_lane_ty) = layout.ty.simd_size_and_type(fx.tcx); + let lane_layout = fx.layout_of(lane_ty); + assert!( + (lane_idx + 1) * lane_layout.size <= orig_lane_count * fx.layout_of(orig_lane_ty).size + ); + + match self.inner { + CPlaceInner::Var(_, _) => unreachable!(), + CPlaceInner::VarPair(_, _, _) => unreachable!(), + CPlaceInner::Addr(ptr, None) => { + let field_offset = lane_layout.size * lane_idx; + let field_ptr = ptr.offset_i64(fx, i64::try_from(field_offset.bytes()).unwrap()); + CPlace::for_ptr(field_ptr, lane_layout) + } + CPlaceInner::Addr(_, Some(_)) => unreachable!(), + } + } + pub(crate) fn place_index( self, fx: &mut FunctionCx<'_, '_, 'tcx>,