diff --git a/cranelift/codegen/meta/src/isa/x86.rs b/cranelift/codegen/meta/src/isa/x86.rs index b6b0fb632967..5b59f87a5e64 100644 --- a/cranelift/codegen/meta/src/isa/x86.rs +++ b/cranelift/codegen/meta/src/isa/x86.rs @@ -121,6 +121,7 @@ pub(crate) fn define() -> TargetIsa { settings.add_predicate("use_popcnt", predicate!(has_popcnt && has_sse42)); settings.add_predicate("use_bmi1", predicate!(has_bmi1)); + settings.add_predicate("use_bmi2", predicate!(has_bmi2)); settings.add_predicate("use_lzcnt", predicate!(has_lzcnt)); let sse3 = settings.add_preset("sse3", "SSE3 and earlier.", preset!(has_sse3)); diff --git a/cranelift/codegen/src/isa/x64/inst.isle b/cranelift/codegen/src/isa/x64/inst.isle index 40f478073a12..e74a265c3cec 100644 --- a/cranelift/codegen/src/isa/x64/inst.isle +++ b/cranelift/codegen/src/isa/x64/inst.isle @@ -32,7 +32,7 @@ (AluRmRVex (size OperandSize) (op AluRmROpcode) (src1 Gpr) - (src2 Gpr) + (src2 GprMem) (dst WritableGpr)) ;; Production of a zero value into a register of the specified size. @@ -53,6 +53,13 @@ (src GprMem) (dst WritableGpr)) + ;; Same as `UnaryRmRVex` but with an immediate + (UnaryRmRImmVex (size OperandSize) + (op UnaryRmRImmVexOpcode) + (src GprMem) + (dst WritableGpr) + (imm u8)) + ;; Bitwise not. (Not (size OperandSize) ;; 1, 2, 4, or 8 (src Gpr) @@ -746,8 +753,12 @@ Xor Mul)) -(type AluRmROpcode extern - (enum Andn)) +(type AluRmROpcode + (enum Andn + Sarx + Shrx + Shlx + Bzhi)) (type UnaryRmROpcode extern (enum Bsr @@ -761,6 +772,9 @@ Blsmsk Blsr)) +(type UnaryRmRImmVexOpcode + (enum Rorx)) + (type SseOpcode extern (enum Addps Addpd @@ -1433,6 +1447,14 @@ (decl imm8_reg_to_imm8_gpr (Imm8Reg) Imm8Gpr) (extern constructor imm8_reg_to_imm8_gpr imm8_reg_to_imm8_gpr) +;; Convert an `Imm8Gpr` into a `Gpr`. +(decl gpr_from_imm8_gpr (Gpr) Imm8Gpr) +(extern extractor gpr_from_imm8_gpr gpr_from_imm8_gpr) + +;; Convert an `Imm8Gpr` into an `Imm8`. 
+(decl imm8_from_imm8_gpr (u8) Imm8Gpr) +(extern extractor imm8_from_imm8_gpr imm8_from_imm8_gpr) + ;; Convert a `WritableGpr` to a `WritableReg`. (decl writable_gpr_to_reg (WritableGpr) WritableReg) (extern constructor writable_gpr_to_reg writable_gpr_to_reg) @@ -1703,6 +1725,9 @@ (decl pure use_bmi1 () bool) (extern constructor use_bmi1 use_bmi1) +(decl pure use_bmi2 () bool) +(extern constructor use_bmi2 use_bmi2) + (decl pure use_popcnt () bool) (extern constructor use_popcnt use_popcnt) @@ -1836,7 +1861,7 @@ dst)) ;; Helper for emitting `MInst.AluRmRVex` instructions. -(decl alu_rm_r_vex (Type AluRmROpcode Gpr Gpr) Gpr) +(decl alu_rm_r_vex (Type AluRmROpcode Gpr GprMem) Gpr) (rule (alu_rm_r_vex ty opcode src1 src2) (let ((dst WritableGpr (temp_writable_gpr)) (size OperandSize (operand_size_of_type_32_64 ty)) @@ -2060,6 +2085,13 @@ (_ Unit (emit (MInst.UnaryRmRVex size op src dst)))) dst)) +;; Helper for creating `MInst.UnaryRmRImmVex` instructions. +(decl unary_rm_r_imm_vex (UnaryRmRImmVexOpcode GprMem OperandSize u8) Gpr) +(rule (unary_rm_r_imm_vex op src size imm) + (let ((dst WritableGpr (temp_writable_gpr)) + (_ Unit (emit (MInst.UnaryRmRImmVex size op src dst imm)))) + dst)) + (decl cvt_u64_to_float_seq (Type Gpr) Xmm) (rule (cvt_u64_to_float_seq ty src) (let ((size OperandSize (raw_operand_size_of_type ty)) @@ -2584,7 +2616,7 @@ src1 src2)) -(decl x64_andn (Type Gpr Gpr) Gpr) +(decl x64_andn (Type Gpr GprMem) Gpr) (rule (x64_andn ty src1 src2) (alu_rm_r_vex ty (AluRmROpcode.Andn) src1 src2)) @@ -2669,26 +2701,55 @@ (decl x64_rotl (Type Gpr Imm8Gpr) Gpr) (rule (x64_rotl ty src1 src2) (shift_r ty (ShiftKind.RotateLeft) src1 src2)) +(rule 1 (x64_rotl (ty_32_or_64 ty) src (imm8_from_imm8_gpr imm)) + (if-let $true (use_bmi2)) + (x64_rorx ty src (u8_sub (ty_bits ty) imm))) ;; Helper for creating `rotr` instructions. 
(decl x64_rotr (Type Gpr Imm8Gpr) Gpr) (rule (x64_rotr ty src1 src2) (shift_r ty (ShiftKind.RotateRight) src1 src2)) +(rule 1 (x64_rotr (ty_32_or_64 ty) src (imm8_from_imm8_gpr imm)) + (if-let $true (use_bmi2)) + (x64_rorx ty src imm)) ;; Helper for creating `shl` instructions. (decl x64_shl (Type Gpr Imm8Gpr) Gpr) (rule (x64_shl ty src1 src2) (shift_r ty (ShiftKind.ShiftLeft) src1 src2)) +;; With BMI2 the `shlx` instruction is also available, and it's unconditionally +;; used for registers shifted by registers since it provides more freedom +;; in regalloc since nothing is constrained. Note that the `shlx` instruction +;; doesn't encode an immediate so any immediate-based shift still uses `shl`. +(rule 1 (x64_shl (ty_32_or_64 ty) src1 (gpr_from_imm8_gpr src2)) + (if-let $true (use_bmi2)) + (x64_shlx ty src1 src2)) ;; Helper for creating logical shift-right instructions. (decl x64_shr (Type Gpr Imm8Gpr) Gpr) (rule (x64_shr ty src1 src2) (shift_r ty (ShiftKind.ShiftRightLogical) src1 src2)) +;; see `x64_shl` for more info about this rule +(rule 1 (x64_shr (ty_32_or_64 ty) src1 (gpr_from_imm8_gpr src2)) + (if-let $true (use_bmi2)) + (x64_shrx ty src1 src2)) ;; Helper for creating arithmetic shift-right instructions. (decl x64_sar (Type Gpr Imm8Gpr) Gpr) (rule (x64_sar ty src1 src2) (shift_r ty (ShiftKind.ShiftRightArithmetic) src1 src2)) +;; see `x64_shl` for more info about this rule +(rule 1 (x64_sar (ty_32_or_64 ty) src1 (gpr_from_imm8_gpr src2)) + (if-let $true (use_bmi2)) + (x64_sarx ty src1 src2)) + +;; Helper for creating `bzhi` (zero-high-bits) instructions. +;; +;; Note that the `src` operands are swapped here. The bit index (not a shift +;; amount) is stored in `vvvv` which is `src1` in the `AluRmRVex` instruction shape. +(decl x64_bzhi (Type GprMem Gpr) Gpr) +(rule (x64_bzhi ty src1 src2) + (alu_rm_r_vex ty (AluRmROpcode.Bzhi) src2 src1)) ;; Helper for creating byteswap instructions. 
;; In x64, 32- and 64-bit registers use BSWAP instruction, and @@ -4096,6 +4157,29 @@ (rule (x64_blsr ty src) (unary_rm_r_vex (UnaryRmRVexOpcode.Blsr) src (operand_size_of_type_32_64 ty))) +;; Helper for creating `sarx` instructions. +(decl x64_sarx (Type GprMem Gpr) Gpr) +(rule (x64_sarx ty val amt) + (alu_rm_r_vex ty (AluRmROpcode.Sarx) amt val)) + +;; Helper for creating `shrx` instructions. +(decl x64_shrx (Type GprMem Gpr) Gpr) +(rule (x64_shrx ty val amt) + (alu_rm_r_vex ty (AluRmROpcode.Shrx) amt val)) + +;; Helper for creating `shlx` instructions. +(decl x64_shlx (Type GprMem Gpr) Gpr) +(rule (x64_shlx ty val amt) + (alu_rm_r_vex ty (AluRmROpcode.Shlx) amt val)) + +;; Helper for creating `rorx` instructions. +(decl x64_rorx (Type GprMem u8) Gpr) +(rule (x64_rorx ty src imm) + (unary_rm_r_imm_vex (UnaryRmRImmVexOpcode.Rorx) + src + (operand_size_of_type_32_64 ty) + imm)) + ;; Helper for creating `popcnt` instructions. (decl x64_popcnt (Type Gpr) Gpr) (rule (x64_popcnt ty src) diff --git a/cranelift/codegen/src/isa/x64/inst/args.rs b/cranelift/codegen/src/isa/x64/inst/args.rs index 7ffc2c072897..34ec6be1f186 100644 --- a/cranelift/codegen/src/isa/x64/inst/args.rs +++ b/cranelift/codegen/src/isa/x64/inst/args.rs @@ -826,33 +826,22 @@ impl fmt::Display for AluRmiROpcode { } } -/// ALU operations that don't accept intermediates. -#[derive(Copy, Clone, PartialEq)] -pub enum AluRmROpcode { - /// And with negated second operand. 
- Andn, -} +pub use crate::isa::x64::lower::isle::generated_code::AluRmROpcode; impl AluRmROpcode { pub(crate) fn available_from(&self) -> SmallVec<[InstructionSet; 2]> { match self { AluRmROpcode::Andn => smallvec![InstructionSet::BMI1], + AluRmROpcode::Sarx | AluRmROpcode::Shrx | AluRmROpcode::Shlx | AluRmROpcode::Bzhi => { + smallvec![InstructionSet::BMI2] + } } } } -impl fmt::Debug for AluRmROpcode { - fn fmt(&self, fmt: &mut fmt::Formatter) -> fmt::Result { - let name = match self { - AluRmROpcode::Andn => "andn", - }; - write!(fmt, "{}", name) - } -} - impl fmt::Display for AluRmROpcode { fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { - fmt::Debug::fmt(self, f) + f.write_str(&format!("{self:?}").to_lowercase()) } } @@ -918,6 +907,24 @@ impl fmt::Display for UnaryRmRVexOpcode { } } +pub use crate::isa::x64::lower::isle::generated_code::UnaryRmRImmVexOpcode; + +impl UnaryRmRImmVexOpcode { + pub(crate) fn available_from(&self) -> SmallVec<[InstructionSet; 2]> { + match self { + UnaryRmRImmVexOpcode::Rorx => { + smallvec![InstructionSet::BMI2] + } + } + } +} + +impl fmt::Display for UnaryRmRImmVexOpcode { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + f.write_str(&format!("{self:?}").to_lowercase()) + } +} + #[derive(Clone, Copy, PartialEq)] /// Comparison operations. 
pub enum CmpOpcode { diff --git a/cranelift/codegen/src/isa/x64/inst/emit.rs b/cranelift/codegen/src/isa/x64/inst/emit.rs index 652d60b2afad..722a1e1cae7c 100644 --- a/cranelift/codegen/src/isa/x64/inst/emit.rs +++ b/cranelift/codegen/src/isa/x64/inst/emit.rs @@ -358,9 +358,16 @@ pub(crate) fn emit( src2, } => { use AluRmROpcode::*; + use LegacyPrefixes as LP; + let dst = allocs.next(dst.to_reg().to_reg()); let src1 = allocs.next(src1.to_reg()); - let src2 = allocs.next(src2.to_reg()); + let src2 = match src2.clone().to_reg_mem().with_allocs(allocs) { + RegMem::Reg { reg } => { + RegisterOrAmode::Register(reg.to_real_reg().unwrap().hw_enc().into()) + } + RegMem::Mem { addr } => RegisterOrAmode::Amode(addr.finalize(state, sink)), + }; let w = match size { OperandSize::Size32 => false, @@ -370,16 +377,21 @@ pub(crate) fn emit( _ => unreachable!(), }; - let opcode = match op { - Andn => 0xf2, + let (prefix, opcode) = match op { + Andn => (LP::None, 0xf2), + Sarx => (LP::_F3, 0xf7), + Shrx => (LP::_F2, 0xf7), + Shlx => (LP::_66, 0xf7), + Bzhi => (LP::None, 0xf5), }; VexInstruction::new() + .prefix(prefix) .map(OpcodeMap::_0F38) .w(w) .reg(dst.to_real_reg().unwrap().hw_enc()) .vvvv(src1.to_real_reg().unwrap().hw_enc()) - .rm(src2.to_real_reg().unwrap().hw_enc()) + .rm(src2) .opcode(opcode) .encode(sink); } @@ -445,6 +457,36 @@ pub(crate) fn emit( .encode(sink); } + Inst::UnaryRmRImmVex { + size, + op, + src, + dst, + imm, + } => { + let dst = allocs.next(dst.to_reg().to_reg()); + let src = match src.clone().to_reg_mem().with_allocs(allocs) { + RegMem::Reg { reg } => { + RegisterOrAmode::Register(reg.to_real_reg().unwrap().hw_enc().into()) + } + RegMem::Mem { addr } => RegisterOrAmode::Amode(addr.finalize(state, sink)), + }; + + let opcode = match op { + UnaryRmRImmVexOpcode::Rorx => 0xF0, + }; + + VexInstruction::new() + .prefix(LegacyPrefixes::_F2) + .map(OpcodeMap::_0F3A) + .w(*size == OperandSize::Size64) + .opcode(opcode) + .reg(dst.to_real_reg().unwrap().hw_enc()) 
+ .rm(src) + .imm(*imm) + .encode(sink); + } + Inst::Not { size, src, dst } => { let src = allocs.next(src.to_reg()); let dst = allocs.next(dst.to_reg().to_reg()); diff --git a/cranelift/codegen/src/isa/x64/inst/mod.rs b/cranelift/codegen/src/isa/x64/inst/mod.rs index 07590a5bc612..d943be5ca6e1 100644 --- a/cranelift/codegen/src/isa/x64/inst/mod.rs +++ b/cranelift/codegen/src/isa/x64/inst/mod.rs @@ -163,6 +163,7 @@ impl Inst { Inst::AluRmRVex { op, .. } => op.available_from(), Inst::UnaryRmR { op, .. } => op.available_from(), Inst::UnaryRmRVex { op, .. } => op.available_from(), + Inst::UnaryRmRImmVex { op, .. } => op.available_from(), // These use dynamic SSE opcodes. Inst::GprToXmm { op, .. } @@ -767,7 +768,7 @@ impl PrettyPrint for Inst { let size_bytes = size.to_bytes(); let dst = pretty_print_reg(dst.to_reg().to_reg(), size.to_bytes(), allocs); let src1 = pretty_print_reg(src1.to_reg(), size_bytes, allocs); - let src2 = pretty_print_reg(src2.to_reg(), size_bytes, allocs); + let src2 = src2.pretty_print(size_bytes, allocs); let op = ljustify2(op.to_string(), String::new()); format!("{op} {src2}, {src1}, {dst}") } @@ -785,6 +786,21 @@ impl PrettyPrint for Inst { format!("{op} {src}, {dst}") } + Inst::UnaryRmRImmVex { + src, + dst, + op, + size, + imm, + } => { + let dst = pretty_print_reg(dst.to_reg().to_reg(), size.to_bytes(), allocs); + let src = src.pretty_print(size.to_bytes(), allocs); + format!( + "{} ${imm}, {src}, {dst}", + ljustify2(op.to_string(), suffix_bwlq(*size)) + ) + } + Inst::Not { size, src, dst } => { let src = pretty_print_reg(src.to_reg(), size.to_bytes(), allocs); let dst = pretty_print_reg(dst.to_reg().to_reg(), size.to_bytes(), allocs); @@ -1885,7 +1901,7 @@ fn x64_get_operands VReg>(inst: &Inst, collector: &mut OperandCol } => { collector.reg_def(dst.to_writable_reg()); collector.reg_use(src1.to_reg()); - collector.reg_use(src2.to_reg()); + src2.get_operands(collector); } Inst::Not { src, dst, .. 
} => { collector.reg_use(src.to_reg()); @@ -1970,7 +1986,9 @@ fn x64_get_operands VReg>(inst: &Inst, collector: &mut OperandCol } } } - Inst::UnaryRmR { src, dst, .. } | Inst::UnaryRmRVex { src, dst, .. } => { + Inst::UnaryRmR { src, dst, .. } + | Inst::UnaryRmRVex { src, dst, .. } + | Inst::UnaryRmRImmVex { src, dst, .. } => { collector.reg_def(dst.to_writable_reg()); src.get_operands(collector); } diff --git a/cranelift/codegen/src/isa/x64/lower.isle b/cranelift/codegen/src/isa/x64/lower.isle index 7820cdb2b518..76a97eb8e2d5 100644 --- a/cranelift/codegen/src/isa/x64/lower.isle +++ b/cranelift/codegen/src/isa/x64/lower.isle @@ -347,22 +347,43 @@ ;; Specialization of `blsr` for BMI1 -(rule 12 (lower (has_type (ty_32_or_64 ty) (band (isub x (iconst (u64_from_imm64 1))) x))) +(decl pure partial val_minus_one (Value) Value) +(rule 0 (val_minus_one (isub x (u64_from_iconst 1))) x) +(rule 0 (val_minus_one (iadd x (i64_from_iconst -1))) x) +(rule 1 (val_minus_one (iadd (i64_from_iconst -1) x)) x) + +(rule 12 (lower (has_type (ty_32_or_64 ty) (band x y))) (if-let $true (use_bmi1)) + (if-let x (val_minus_one y)) (x64_blsr ty x)) -(rule 13 (lower (has_type (ty_32_or_64 ty) (band x (isub x (iconst (u64_from_imm64 1)))))) +(rule 13 (lower (has_type (ty_32_or_64 ty) (band y x))) (if-let $true (use_bmi1)) + (if-let x (val_minus_one y)) (x64_blsr ty x)) ;; Specialization of `blsi` for BMI1 -(rule 12 (lower (has_type (ty_32_or_64 ty) (band (ineg x) x))) +(rule 14 (lower (has_type (ty_32_or_64 ty) (band (ineg x) x))) (if-let $true (use_bmi1)) (x64_blsi ty x)) -(rule 13 (lower (has_type (ty_32_or_64 ty) (band x (ineg x)))) +(rule 15 (lower (has_type (ty_32_or_64 ty) (band x (ineg x)))) (if-let $true (use_bmi1)) (x64_blsi ty x)) +;; Specialization of `bzhi` for BMI2 +;; +;; The `bzhi` instruction clears all bits indexed by the second operand of the +;; first operand. This is pattern-matched here with a `band` against a mask +;; which is generated to be N bits large. 
Note that if the index is larger than +;; the bit-width of the type then `bzhi` doesn't have the same semantics as +;; `ishl`, so an `and` instruction is required to mask the index to match the +;; semantics of Cranelift's `ishl`. + +(rule 16 (lower (has_type (ty_32_or_64 ty) (band x y))) + (if-let $true (use_bmi2)) + (if-let (ishl (u64_from_iconst 1) index) (val_minus_one y)) + (x64_bzhi ty x (x64_and ty index (RegMemImm.Imm (u32_sub (ty_bits ty) 1))))) + ;;;; Rules for `bor` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; `{i,b}64` and smaller. @@ -459,11 +480,13 @@ ;; Specialization of `blsmsk` for BMI1 -(rule 8 (lower (has_type (ty_32_or_64 ty) (bxor (isub x (iconst (u64_from_imm64 1))) x))) +(rule 8 (lower (has_type (ty_32_or_64 ty) (bxor x y))) (if-let $true (use_bmi1)) + (if-let x (val_minus_one y)) (x64_blsmsk ty x)) -(rule 9 (lower (has_type (ty_32_or_64 ty) (bxor x (isub x (iconst (u64_from_imm64 1)))))) +(rule 9 (lower (has_type (ty_32_or_64 ty) (bxor y x))) (if-let $true (use_bmi1)) + (if-let x (val_minus_one y)) (x64_blsmsk ty x)) ;;;; Rules for `ishl` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; diff --git a/cranelift/codegen/src/isa/x64/lower/isle.rs b/cranelift/codegen/src/isa/x64/lower/isle.rs index b242d3907e19..000edbe3f490 100644 --- a/cranelift/codegen/src/isa/x64/lower/isle.rs +++ b/cranelift/codegen/src/isa/x64/lower/isle.rs @@ -270,6 +270,11 @@ impl Context for IsleContext<'_, '_, MInst, X64Backend> { self.backend.x64_flags.use_bmi1() } + #[inline] + fn use_bmi2(&mut self) -> bool { + self.backend.x64_flags.use_bmi2() + } + #[inline] fn use_popcnt(&mut self) -> bool { self.backend.x64_flags.use_popcnt() @@ -585,6 +590,20 @@ impl Context for IsleContext<'_, '_, MInst, X64Backend> { Imm8Gpr::new(Imm8Reg::Imm8 { imm }).unwrap() } + fn gpr_from_imm8_gpr(&mut self, val: &Imm8Gpr) -> Option<Gpr> { + match val.clone().to_imm8_reg() { + Imm8Reg::Reg { reg } => Some(Gpr::new(reg).unwrap()), + Imm8Reg::Imm8 { .. 
} => None, + } + } + + fn imm8_from_imm8_gpr(&mut self, val: &Imm8Gpr) -> Option<u8> { + match val.clone().to_imm8_reg() { + Imm8Reg::Imm8 { imm } => Some(imm), + Imm8Reg::Reg { .. } => None, + } + } + #[inline] fn type_register_class(&mut self, ty: Type) -> Option { if is_int_or_ref_ty(ty) || ty == I128 { diff --git a/cranelift/codegen/src/isle_prelude.rs b/cranelift/codegen/src/isle_prelude.rs index 1454838e5610..043595160d4a 100644 --- a/cranelift/codegen/src/isle_prelude.rs +++ b/cranelift/codegen/src/isle_prelude.rs @@ -740,6 +740,11 @@ macro_rules! isle_common_prelude_methods { a >> b } + #[inline] + fn u8_sub(&mut self, a: u8, b: u8) -> u8 { + a.wrapping_sub(b) + } + #[inline] fn lane_type(&mut self, ty: Type) -> Type { ty.lane_type() diff --git a/cranelift/codegen/src/prelude.isle b/cranelift/codegen/src/prelude.isle index 2f4b720b1063..8dbd8044e3a3 100644 --- a/cranelift/codegen/src/prelude.isle +++ b/cranelift/codegen/src/prelude.isle @@ -123,6 +123,9 @@ (decl pure u8_shr (u8 u8) u8) (extern constructor u8_shr u8_shr) +(decl pure u8_sub (u8 u8) u8) +(extern constructor u8_sub u8_sub) + (decl pure u32_add (u32 u32) u32) (extern constructor u32_add u32_add) diff --git a/cranelift/filetests/filetests/isa/x64/bmi1.clif b/cranelift/filetests/filetests/isa/x64/bmi1.clif index bc90e5c72b32..d0dd2f81980a 100644 --- a/cranelift/filetests/filetests/isa/x64/bmi1.clif +++ b/cranelift/filetests/filetests/isa/x64/bmi1.clif @@ -17,7 +17,61 @@ block0(v0: i32): ; movq %rbp, %rsp ; popq %rbp ; ret -; +; ; Disassembled: ; block0: ; offset 0x0 ; pushq %rbp +; movq %rsp, %rbp +; block1: ; offset 0x4 +; blsrl %edi, %eax +; movq %rbp, %rsp +; popq %rbp +; retq + +function %blsr_i32_alt(i32) -> i32 { +block0(v0: i32): + v1 = iconst.i32 -1 + v2 = iadd v0, v1 + v3 = band v0, v2 + return v3 +} + +; VCode: +; pushq %rbp +; movq %rsp, %rbp +; block0: +; blsrl %edi, %eax +; movq %rbp, %rsp +; popq %rbp +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; pushq %rbp +; movq %rsp, %rbp 
+; block1: ; offset 0x4 +; blsrl %edi, %eax +; movq %rbp, %rsp +; popq %rbp +; retq + +function %blsr_i32_alt2(i32) -> i32 { +block0(v0: i32): + v1 = iconst.i32 -1 + v2 = iadd v1, v0 + v3 = band v0, v2 + return v3 +} + +; VCode: +; pushq %rbp +; movq %rsp, %rbp +; block0: +; blsrl %edi, %eax +; movq %rbp, %rsp +; popq %rbp +; ret +; ; Disassembled: ; block0: ; offset 0x0 ; pushq %rbp @@ -44,7 +98,7 @@ block0(v0: i64): ; movq %rbp, %rsp ; popq %rbp ; ret -; +; ; Disassembled: ; block0: ; offset 0x0 ; pushq %rbp @@ -71,7 +125,7 @@ block0(v0: i32): ; movq %rbp, %rsp ; popq %rbp ; ret -; +; ; Disassembled: ; block0: ; offset 0x0 ; pushq %rbp @@ -98,7 +152,7 @@ block0(v0: i64): ; movq %rbp, %rsp ; popq %rbp ; ret -; +; ; Disassembled: ; block0: ; offset 0x0 ; pushq %rbp @@ -124,7 +178,7 @@ block0(v0: i32): ; movq %rbp, %rsp ; popq %rbp ; ret -; +; ; Disassembled: ; block0: ; offset 0x0 ; pushq %rbp @@ -150,7 +204,7 @@ block0(v0: i64): ; movq %rbp, %rsp ; popq %rbp ; ret -; +; ; Disassembled: ; block0: ; offset 0x0 ; pushq %rbp @@ -176,7 +230,7 @@ block0(v0: i32): ; movq %rbp, %rsp ; popq %rbp ; ret -; +; ; Disassembled: ; block0: ; offset 0x0 ; pushq %rbp @@ -202,7 +256,7 @@ block0(v0: i64): ; movq %rbp, %rsp ; popq %rbp ; ret -; +; ; Disassembled: ; block0: ; offset 0x0 ; pushq %rbp @@ -229,7 +283,7 @@ block0(v0: i32): ; movq %rbp, %rsp ; popq %rbp ; ret -; +; ; Disassembled: ; block0: ; offset 0x0 ; pushq %rbp @@ -256,7 +310,7 @@ block0(v0: i64): ; movq %rbp, %rsp ; popq %rbp ; ret -; +; ; Disassembled: ; block0: ; offset 0x0 ; pushq %rbp @@ -283,7 +337,7 @@ block0(v0: i32): ; movq %rbp, %rsp ; popq %rbp ; ret -; +; ; Disassembled: ; block0: ; offset 0x0 ; pushq %rbp @@ -310,7 +364,7 @@ block0(v0: i64): ; movq %rbp, %rsp ; popq %rbp ; ret -; +; ; Disassembled: ; block0: ; offset 0x0 ; pushq %rbp diff --git a/cranelift/filetests/filetests/isa/x64/bmi2.clif b/cranelift/filetests/filetests/isa/x64/bmi2.clif new file mode 100644 index 000000000000..b65825742732 --- 
/dev/null +++ b/cranelift/filetests/filetests/isa/x64/bmi2.clif @@ -0,0 +1,354 @@ +test compile precise-output +target x86_64 has_bmi2 + +function %sarx_i32(i32, i32) -> i32 { +block0(v0: i32, v1: i32): + v2 = sshr v0, v1 + return v2 +} + +; VCode: +; pushq %rbp +; movq %rsp, %rbp +; block0: +; sarx %edi, %esi, %eax +; movq %rbp, %rsp +; popq %rbp +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; pushq %rbp +; movq %rsp, %rbp +; block1: ; offset 0x4 +; sarxl %esi, %edi, %eax +; movq %rbp, %rsp +; popq %rbp +; retq + +function %sarx_i64(i64, i64) -> i64 { +block0(v0: i64, v1: i64): + v2 = sshr v0, v1 + return v2 +} + +; VCode: +; pushq %rbp +; movq %rsp, %rbp +; block0: +; sarx %rdi, %rsi, %rax +; movq %rbp, %rsp +; popq %rbp +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; pushq %rbp +; movq %rsp, %rbp +; block1: ; offset 0x4 +; sarxq %rsi, %rdi, %rax +; movq %rbp, %rsp +; popq %rbp +; retq + +function %shrx_i32(i32, i32) -> i32 { +block0(v0: i32, v1: i32): + v2 = ushr v0, v1 + return v2 +} + +; VCode: +; pushq %rbp +; movq %rsp, %rbp +; block0: +; shrx %edi, %esi, %eax +; movq %rbp, %rsp +; popq %rbp +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; pushq %rbp +; movq %rsp, %rbp +; block1: ; offset 0x4 +; shrxl %esi, %edi, %eax +; movq %rbp, %rsp +; popq %rbp +; retq + +function %shrx_i64(i64, i64) -> i64 { +block0(v0: i64, v1: i64): + v2 = ushr v0, v1 + return v2 +} + +; VCode: +; pushq %rbp +; movq %rsp, %rbp +; block0: +; shrx %rdi, %rsi, %rax +; movq %rbp, %rsp +; popq %rbp +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; pushq %rbp +; movq %rsp, %rbp +; block1: ; offset 0x4 +; shrxq %rsi, %rdi, %rax +; movq %rbp, %rsp +; popq %rbp +; retq + +function %shlx_i32(i32, i32) -> i32 { +block0(v0: i32, v1: i32): + v2 = ishl v0, v1 + return v2 +} + +; VCode: +; pushq %rbp +; movq %rsp, %rbp +; block0: +; shlx %edi, %esi, %eax +; movq %rbp, %rsp +; popq %rbp +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; pushq %rbp +; movq %rsp, %rbp +; 
block1: ; offset 0x4 +; shlxl %esi, %edi, %eax +; movq %rbp, %rsp +; popq %rbp +; retq + +function %shlx_i64(i64, i64) -> i64 { +block0(v0: i64, v1: i64): + v2 = ishl v0, v1 + return v2 +} + +; VCode: +; pushq %rbp +; movq %rsp, %rbp +; block0: +; shlx %rdi, %rsi, %rax +; movq %rbp, %rsp +; popq %rbp +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; pushq %rbp +; movq %rsp, %rbp +; block1: ; offset 0x4 +; shlxq %rsi, %rdi, %rax +; movq %rbp, %rsp +; popq %rbp +; retq + +function %rorx_i32(i32) -> i32 { +block0(v0: i32): + v1 = iconst.i32 3 + v2 = rotr v0, v1 + return v2 +} + +; VCode: +; pushq %rbp +; movq %rsp, %rbp +; block0: +; rorxl $3, %edi, %eax +; movq %rbp, %rsp +; popq %rbp +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; pushq %rbp +; movq %rsp, %rbp +; block1: ; offset 0x4 +; rorxl $3, %edi, %eax +; movq %rbp, %rsp +; popq %rbp +; retq + +function %rorx_i64(i64) -> i64 { +block0(v0: i64): + v1 = iconst.i64 3 + v2 = rotr v0, v1 + return v2 +} + +; VCode: +; pushq %rbp +; movq %rsp, %rbp +; block0: +; rorxq $3, %rdi, %rax +; movq %rbp, %rsp +; popq %rbp +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; pushq %rbp +; movq %rsp, %rbp +; block1: ; offset 0x4 +; rorxq $3, %rdi, %rax +; movq %rbp, %rsp +; popq %rbp +; retq + +function %rorx_alt_i32(i32) -> i32 { +block0(v0: i32): + v1 = iconst.i32 3 + v2 = rotl v0, v1 + return v2 +} + +; VCode: +; pushq %rbp +; movq %rsp, %rbp +; block0: +; rorxl $29, %edi, %eax +; movq %rbp, %rsp +; popq %rbp +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; pushq %rbp +; movq %rsp, %rbp +; block1: ; offset 0x4 +; rorxl $0x1d, %edi, %eax +; movq %rbp, %rsp +; popq %rbp +; retq + +function %rorx_alt_i64(i64) -> i64 { +block0(v0: i64): + v1 = iconst.i64 3 + v2 = rotl v0, v1 + return v2 +} + +; VCode: +; pushq %rbp +; movq %rsp, %rbp +; block0: +; rorxq $61, %rdi, %rax +; movq %rbp, %rsp +; popq %rbp +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; pushq %rbp +; movq %rsp, %rbp +; block1: ; offset 0x4 +; 
rorxq $0x3d, %rdi, %rax +; movq %rbp, %rsp +; popq %rbp +; retq + +function %bzhi32(i32, i32) -> i32 { +block0(v0: i32, v1: i32): + v2 = iconst.i32 1 + v3 = ishl v2, v1 + v4 = isub v3, v2 + v5 = band v0, v4 + return v5 +} + +; VCode: +; pushq %rbp +; movq %rsp, %rbp +; block0: +; movq %rsi, %rcx +; andl %ecx, $31, %ecx +; bzhi %edi, %ecx, %eax +; movq %rbp, %rsp +; popq %rbp +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; pushq %rbp +; movq %rsp, %rbp +; block1: ; offset 0x4 +; movq %rsi, %rcx +; andl $0x1f, %ecx +; bzhil %ecx, %edi, %eax +; movq %rbp, %rsp +; popq %rbp +; retq + +function %bzhi64(i64, i64) -> i64 { +block0(v0: i64, v1: i64): + v2 = iconst.i64 1 + v3 = ishl v2, v1 + v4 = isub v3, v2 + v5 = band v0, v4 + return v5 +} + +; VCode: +; pushq %rbp +; movq %rsp, %rbp +; block0: +; movq %rsi, %rcx +; andq %rcx, $63, %rcx +; bzhi %rdi, %rcx, %rax +; movq %rbp, %rsp +; popq %rbp +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; pushq %rbp +; movq %rsp, %rbp +; block1: ; offset 0x4 +; movq %rsi, %rcx +; andq $0x3f, %rcx +; bzhiq %rcx, %rdi, %rax +; movq %rbp, %rsp +; popq %rbp +; retq + +function %bzhi32_mem(i64, i32) -> i32 { +block0(v0: i64, v1: i32): + v2 = iconst.i32 1 + v3 = ishl v2, v1 + v4 = isub v3, v2 + v5 = load.i32 v0+20 + v6 = band v5, v4 + return v6 +} + +; VCode: +; pushq %rbp +; movq %rsp, %rbp +; block0: +; movq %rsi, %rcx +; andl %ecx, $31, %ecx +; bzhi 20(%rdi), %ecx, %eax +; movq %rbp, %rsp +; popq %rbp +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; pushq %rbp +; movq %rsp, %rbp +; block1: ; offset 0x4 +; movq %rsi, %rcx +; andl $0x1f, %ecx +; bzhil %ecx, 0x14(%rdi), %eax ; trap: heap_oob +; movq %rbp, %rsp +; popq %rbp +; retq + diff --git a/cranelift/filetests/filetests/runtests/rotl.clif b/cranelift/filetests/filetests/runtests/rotl.clif index 1755bee159b5..1aad67cecefa 100644 --- a/cranelift/filetests/filetests/runtests/rotl.clif +++ b/cranelift/filetests/filetests/runtests/rotl.clif @@ -2,6 +2,7 @@ test interpret test 
run target aarch64 target x86_64 +target x86_64 has_bmi2 target s390x target riscv64 target riscv64 has_c has_zcb @@ -242,3 +243,25 @@ block0(v0: i8): ; run: %rotl_i8_const_37(0x00) == 0x00 ; run: %rotl_i8_const_37(0x01) == 0x20 ; run: %rotl_i8_const_37(0x12) == 0x42 + +function %rotl_i32_const4(i32) -> i32 { +block0(v0: i32): + v1 = iconst.i32 4 + v2 = rotl v0, v1 + return v2 +} +; run: %rotl_i32_const4(0xe0) == 0xe00 +; run: %rotl_i32_const4(0xe0000000) == 0xe +; run: %rotl_i32_const4(0) == 0 +; run: %rotl_i32_const4(0xa000000a) == 0xaa + +function %rotl_i64_const4(i64) -> i64 { +block0(v0: i64): + v1 = iconst.i64 4 + v2 = rotl v0, v1 + return v2 +} +; run: %rotl_i64_const4(0xe0) == 0xe00 +; run: %rotl_i64_const4(0xe000000000000000) == 0xe +; run: %rotl_i64_const4(0) == 0 +; run: %rotl_i64_const4(0xa00000000000000a) == 0xaa diff --git a/cranelift/filetests/filetests/runtests/rotr.clif b/cranelift/filetests/filetests/runtests/rotr.clif index 44e588f619ee..2065f0220ef5 100644 --- a/cranelift/filetests/filetests/runtests/rotr.clif +++ b/cranelift/filetests/filetests/runtests/rotr.clif @@ -2,6 +2,7 @@ test interpret test run target aarch64 target x86_64 +target x86_64 has_bmi2 target s390x target riscv64 target riscv64 has_c has_zcb @@ -243,3 +244,25 @@ block0(v0: i8): ; run: %rotr_i8_const_37(0x00) == 0x00 ; run: %rotr_i8_const_37(0x01) == 0x08 ; run: %rotr_i8_const_37(0x12) == 0x90 + +function %rotr_i32_const4(i32) -> i32 { +block0(v0: i32): + v1 = iconst.i32 4 + v2 = rotr v0, v1 + return v2 +} +; run: %rotr_i32_const4(0xe0) == 0xe +; run: %rotr_i32_const4(0xe) == 0xe0000000 +; run: %rotr_i32_const4(0) == 0 +; run: %rotr_i32_const4(0xa000000a) == 0xaa000000 + +function %rotr_i64_const4(i64) -> i64 { +block0(v0: i64): + v1 = iconst.i64 4 + v2 = rotr v0, v1 + return v2 +} +; run: %rotr_i64_const4(0xe0) == 0xe +; run: %rotr_i64_const4(0xe) == 0xe000000000000000 +; run: %rotr_i64_const4(0) == 0 +; run: %rotr_i64_const4(0xa00000000000000a) == 0xaa00000000000000 diff 
--git a/cranelift/filetests/filetests/runtests/shift-right-left.clif b/cranelift/filetests/filetests/runtests/shift-right-left.clif index bacbadec6036..35cfadcbfdb6 100644 --- a/cranelift/filetests/filetests/runtests/shift-right-left.clif +++ b/cranelift/filetests/filetests/runtests/shift-right-left.clif @@ -4,6 +4,7 @@ test interpret test run target aarch64 target x86_64 +target x86_64 has_bmi2 target riscv64 target riscv64 has_c has_zcb target s390x diff --git a/cranelift/filetests/filetests/runtests/shifts.clif b/cranelift/filetests/filetests/runtests/shifts.clif index 983316a71e07..9860e2851ad4 100644 --- a/cranelift/filetests/filetests/runtests/shifts.clif +++ b/cranelift/filetests/filetests/runtests/shifts.clif @@ -2,6 +2,7 @@ test interpret test run target aarch64 target x86_64 +target x86_64 has_bmi2 target s390x target riscv64 target riscv64 has_c has_zcb diff --git a/cranelift/filetests/filetests/runtests/x64-bmi2.clif b/cranelift/filetests/filetests/runtests/x64-bmi2.clif new file mode 100644 index 000000000000..baeec19ee1c0 --- /dev/null +++ b/cranelift/filetests/filetests/runtests/x64-bmi2.clif @@ -0,0 +1,56 @@ +test interpret +test run +target aarch64 +target s390x +target x86_64 +target x86_64 has_bmi2 +target riscv64 + +function %bzhi32(i32, i32) -> i32 { +block0(v0: i32, v1: i32): + v2 = iconst.i32 1 + v3 = ishl v2, v1 + v4 = isub v3, v2 + v5 = band v0, v4 + return v5 +} + +; run: %bzhi32(0, 0) == 0 +; run: %bzhi32(0, 1) == 0 +; run: %bzhi32(1, 0) == 0 +; run: %bzhi32(1, 32) == 0 +; run: %bzhi32(1, 8) == 1 +; run: %bzhi32(0xff, 4) == 0xf +; run: %bzhi32(0xff, 1) == 1 +; run: %bzhi32(0xff, 0) == 0 +; run: %bzhi32(0xff, 0xff0001) == 1 +; run: %bzhi32(0x01234567, 16) == 0x4567 +; run: %bzhi32(0x01234567, 32) == 0 +; run: %bzhi32(0x01234567, 28) == 0x1234567 +; run: %bzhi32(0x01234567, 24) == 0x234567 +; run: %bzhi32(-1, 24) == 0xffffff + +function %bzhi64(i64, i64) -> i64 { +block0(v0: i64, v1: i64): + v2 = iconst.i64 1 + v3 = ishl v2, v1 + v4 = isub 
v3, v2 + v5 = band v0, v4 + return v5 +} + +; run: %bzhi64(0, 0) == 0 +; run: %bzhi64(0, 1) == 0 +; run: %bzhi64(1, 0) == 0 +; run: %bzhi64(1, 64) == 0 +; run: %bzhi64(1, 8) == 1 +; run: %bzhi64(0xff, 4) == 0xf +; run: %bzhi64(0xff, 1) == 1 +; run: %bzhi64(0xff, 0) == 0 +; run: %bzhi64(0xff, 0xff0001) == 1 +; run: %bzhi64(0x01234567, 16) == 0x4567 +; run: %bzhi64(0x01234567, 64) == 0 +; run: %bzhi64(0x01234567, 28) == 0x1234567 +; run: %bzhi64(0x01234567, 24) == 0x234567 +; run: %bzhi64(-1, 24) == 0xffffff +; run: %bzhi64(-1, 56) == 0xffffffffffffff diff --git a/cranelift/filetests/filetests/wasm/x64-bmi1.wat b/cranelift/filetests/filetests/wasm/x64-bmi1.wat new file mode 100644 index 000000000000..b23120c780b9 --- /dev/null +++ b/cranelift/filetests/filetests/wasm/x64-bmi1.wat @@ -0,0 +1,176 @@ +;;! target = "x86_64" +;;! compile = true +;;! settings = ["has_bmi1", "opt_level=speed", "has_avx"] + +(module + (func (export "blsi32") (param i32) (result i32) + (i32.and + (local.get 0) + (i32.sub (i32.const 0) (local.get 0)))) + + (func (export "blsi64") (param i64) (result i64) + (i64.and + (local.get 0) + (i64.sub (i64.const 0) (local.get 0)))) + + (func (export "blsr32") (param i32) (result i32) + (i32.and + (local.get 0) + (i32.add (local.get 0) (i32.const -1)))) + + (func (export "blsr64") (param i64) (result i64) + (i64.and + (local.get 0) + (i64.sub (local.get 0) (i64.const 1)))) + + (func (export "blsmsk32") (param i32) (result i32) + (i32.xor + (local.get 0) + (i32.sub (local.get 0) (i32.const 1)))) + + (func (export "blsmsk64") (param i64) (result i64) + (i64.xor + (local.get 0) + (i64.add (local.get 0) (i64.const -1)))) + + (func (export "tzcnt32") (param i32) (result i32) + (i32.ctz (local.get 0))) + + (func (export "tzcnt64") (param i64) (result i64) + (i64.ctz (local.get 0))) + + (func (export "andn32") (param i32 i32) (result i32) + (i32.and (local.get 0) (i32.xor (local.get 1) (i32.const -1)))) + + (func (export "andn64") (param i64 i64) (result i64) + 
(i64.and (local.get 0) (i64.xor (local.get 1) (i64.const -1)))) +) +;; function u0:0: +;; pushq %rbp +;; unwind PushFrameRegs { offset_upward_to_caller_sp: 16 } +;; movq %rsp, %rbp +;; unwind DefineNewFrame { offset_upward_to_caller_sp: 16, offset_downward_to_clobbers: 0 } +;; block0: +;; jmp label1 +;; block1: +;; blsil %edi, %eax +;; movq %rbp, %rsp +;; popq %rbp +;; ret +;; +;; function u0:1: +;; pushq %rbp +;; unwind PushFrameRegs { offset_upward_to_caller_sp: 16 } +;; movq %rsp, %rbp +;; unwind DefineNewFrame { offset_upward_to_caller_sp: 16, offset_downward_to_clobbers: 0 } +;; block0: +;; jmp label1 +;; block1: +;; blsiq %rdi, %rax +;; movq %rbp, %rsp +;; popq %rbp +;; ret +;; +;; function u0:2: +;; pushq %rbp +;; unwind PushFrameRegs { offset_upward_to_caller_sp: 16 } +;; movq %rsp, %rbp +;; unwind DefineNewFrame { offset_upward_to_caller_sp: 16, offset_downward_to_clobbers: 0 } +;; block0: +;; jmp label1 +;; block1: +;; blsrl %edi, %eax +;; movq %rbp, %rsp +;; popq %rbp +;; ret +;; +;; function u0:3: +;; pushq %rbp +;; unwind PushFrameRegs { offset_upward_to_caller_sp: 16 } +;; movq %rsp, %rbp +;; unwind DefineNewFrame { offset_upward_to_caller_sp: 16, offset_downward_to_clobbers: 0 } +;; block0: +;; jmp label1 +;; block1: +;; blsrq %rdi, %rax +;; movq %rbp, %rsp +;; popq %rbp +;; ret +;; +;; function u0:4: +;; pushq %rbp +;; unwind PushFrameRegs { offset_upward_to_caller_sp: 16 } +;; movq %rsp, %rbp +;; unwind DefineNewFrame { offset_upward_to_caller_sp: 16, offset_downward_to_clobbers: 0 } +;; block0: +;; jmp label1 +;; block1: +;; blsmskl %edi, %eax +;; movq %rbp, %rsp +;; popq %rbp +;; ret +;; +;; function u0:5: +;; pushq %rbp +;; unwind PushFrameRegs { offset_upward_to_caller_sp: 16 } +;; movq %rsp, %rbp +;; unwind DefineNewFrame { offset_upward_to_caller_sp: 16, offset_downward_to_clobbers: 0 } +;; block0: +;; jmp label1 +;; block1: +;; blsmskq %rdi, %rax +;; movq %rbp, %rsp +;; popq %rbp +;; ret +;; +;; function u0:6: +;; pushq %rbp +;; unwind 
PushFrameRegs { offset_upward_to_caller_sp: 16 } +;; movq %rsp, %rbp +;; unwind DefineNewFrame { offset_upward_to_caller_sp: 16, offset_downward_to_clobbers: 0 } +;; block0: +;; jmp label1 +;; block1: +;; tzcntl %edi, %eax +;; movq %rbp, %rsp +;; popq %rbp +;; ret +;; +;; function u0:7: +;; pushq %rbp +;; unwind PushFrameRegs { offset_upward_to_caller_sp: 16 } +;; movq %rsp, %rbp +;; unwind DefineNewFrame { offset_upward_to_caller_sp: 16, offset_downward_to_clobbers: 0 } +;; block0: +;; jmp label1 +;; block1: +;; tzcntq %rdi, %rax +;; movq %rbp, %rsp +;; popq %rbp +;; ret +;; +;; function u0:8: +;; pushq %rbp +;; unwind PushFrameRegs { offset_upward_to_caller_sp: 16 } +;; movq %rsp, %rbp +;; unwind DefineNewFrame { offset_upward_to_caller_sp: 16, offset_downward_to_clobbers: 0 } +;; block0: +;; jmp label1 +;; block1: +;; andn %edi, %esi, %eax +;; movq %rbp, %rsp +;; popq %rbp +;; ret +;; +;; function u0:9: +;; pushq %rbp +;; unwind PushFrameRegs { offset_upward_to_caller_sp: 16 } +;; movq %rsp, %rbp +;; unwind DefineNewFrame { offset_upward_to_caller_sp: 16, offset_downward_to_clobbers: 0 } +;; block0: +;; jmp label1 +;; block1: +;; andn %rdi, %rsi, %rax +;; movq %rbp, %rsp +;; popq %rbp +;; ret diff --git a/cranelift/filetests/filetests/wasm/x64-bmi2.wat b/cranelift/filetests/filetests/wasm/x64-bmi2.wat new file mode 100644 index 000000000000..21418a017e82 --- /dev/null +++ b/cranelift/filetests/filetests/wasm/x64-bmi2.wat @@ -0,0 +1,177 @@ +;;! target = "x86_64" +;;! compile = true +;;! 
settings = ["has_bmi2", "opt_level=speed", "has_avx"] + +(module + (func (export "bzhi32") (param i32 i32) (result i32) + (i32.and + (local.get 0) + (i32.sub + (i32.shl + (i32.const 1) + (local.get 1)) + (i32.const 1)))) + + (func (export "bzhi64") (param i64 i64) (result i64) + (i64.and + (local.get 0) + (i64.add + (i64.shl + (i64.const 1) + (local.get 1)) + (i64.const -1)))) + + (func (export "rorx32") (param i32) (result i32) + (i32.rotr (local.get 0) (i32.const 8))) + + (func (export "rorx64") (param i64) (result i64) + (i64.rotl (local.get 0) (i64.const 9))) + + (func (export "shlx32") (param i32 i32) (result i32) + (i32.shl (local.get 0) (local.get 1))) + (func (export "shlx64") (param i64 i64) (result i64) + (i64.shl (local.get 0) (local.get 1))) + + (func (export "shrx32") (param i32 i32) (result i32) + (i32.shr_u (local.get 0) (local.get 1))) + (func (export "shrx64") (param i64 i64) (result i64) + (i64.shr_u (local.get 0) (local.get 1))) + + (func (export "sarx32") (param i32 i32) (result i32) + (i32.shr_s (local.get 0) (local.get 1))) + (func (export "sarx64") (param i64 i64) (result i64) + (i64.shr_s (local.get 0) (local.get 1))) +) +;; function u0:0: +;; pushq %rbp +;; unwind PushFrameRegs { offset_upward_to_caller_sp: 16 } +;; movq %rsp, %rbp +;; unwind DefineNewFrame { offset_upward_to_caller_sp: 16, offset_downward_to_clobbers: 0 } +;; block0: +;; jmp label1 +;; block1: +;; movq %rsi, %r8 +;; andl %r8d, $31, %r8d +;; bzhi %edi, %r8d, %eax +;; movq %rbp, %rsp +;; popq %rbp +;; ret +;; +;; function u0:1: +;; pushq %rbp +;; unwind PushFrameRegs { offset_upward_to_caller_sp: 16 } +;; movq %rsp, %rbp +;; unwind DefineNewFrame { offset_upward_to_caller_sp: 16, offset_downward_to_clobbers: 0 } +;; block0: +;; jmp label1 +;; block1: +;; movq %rsi, %r8 +;; andq %r8, $63, %r8 +;; bzhi %rdi, %r8, %rax +;; movq %rbp, %rsp +;; popq %rbp +;; ret +;; +;; function u0:2: +;; pushq %rbp +;; unwind PushFrameRegs { offset_upward_to_caller_sp: 16 } +;; movq %rsp, %rbp 
+;; unwind DefineNewFrame { offset_upward_to_caller_sp: 16, offset_downward_to_clobbers: 0 } +;; block0: +;; jmp label1 +;; block1: +;; rorxl $8, %edi, %eax +;; movq %rbp, %rsp +;; popq %rbp +;; ret +;; +;; function u0:3: +;; pushq %rbp +;; unwind PushFrameRegs { offset_upward_to_caller_sp: 16 } +;; movq %rsp, %rbp +;; unwind DefineNewFrame { offset_upward_to_caller_sp: 16, offset_downward_to_clobbers: 0 } +;; block0: +;; jmp label1 +;; block1: +;; rorxq $55, %rdi, %rax +;; movq %rbp, %rsp +;; popq %rbp +;; ret +;; +;; function u0:4: +;; pushq %rbp +;; unwind PushFrameRegs { offset_upward_to_caller_sp: 16 } +;; movq %rsp, %rbp +;; unwind DefineNewFrame { offset_upward_to_caller_sp: 16, offset_downward_to_clobbers: 0 } +;; block0: +;; jmp label1 +;; block1: +;; shlx %edi, %esi, %eax +;; movq %rbp, %rsp +;; popq %rbp +;; ret +;; +;; function u0:5: +;; pushq %rbp +;; unwind PushFrameRegs { offset_upward_to_caller_sp: 16 } +;; movq %rsp, %rbp +;; unwind DefineNewFrame { offset_upward_to_caller_sp: 16, offset_downward_to_clobbers: 0 } +;; block0: +;; jmp label1 +;; block1: +;; shlx %rdi, %rsi, %rax +;; movq %rbp, %rsp +;; popq %rbp +;; ret +;; +;; function u0:6: +;; pushq %rbp +;; unwind PushFrameRegs { offset_upward_to_caller_sp: 16 } +;; movq %rsp, %rbp +;; unwind DefineNewFrame { offset_upward_to_caller_sp: 16, offset_downward_to_clobbers: 0 } +;; block0: +;; jmp label1 +;; block1: +;; shrx %edi, %esi, %eax +;; movq %rbp, %rsp +;; popq %rbp +;; ret +;; +;; function u0:7: +;; pushq %rbp +;; unwind PushFrameRegs { offset_upward_to_caller_sp: 16 } +;; movq %rsp, %rbp +;; unwind DefineNewFrame { offset_upward_to_caller_sp: 16, offset_downward_to_clobbers: 0 } +;; block0: +;; jmp label1 +;; block1: +;; shrx %rdi, %rsi, %rax +;; movq %rbp, %rsp +;; popq %rbp +;; ret +;; +;; function u0:8: +;; pushq %rbp +;; unwind PushFrameRegs { offset_upward_to_caller_sp: 16 } +;; movq %rsp, %rbp +;; unwind DefineNewFrame { offset_upward_to_caller_sp: 16, offset_downward_to_clobbers: 0 } 
+;; block0: +;; jmp label1 +;; block1: +;; sarx %edi, %esi, %eax +;; movq %rbp, %rsp +;; popq %rbp +;; ret +;; +;; function u0:9: +;; pushq %rbp +;; unwind PushFrameRegs { offset_upward_to_caller_sp: 16 } +;; movq %rsp, %rbp +;; unwind DefineNewFrame { offset_upward_to_caller_sp: 16, offset_downward_to_clobbers: 0 } +;; block0: +;; jmp label1 +;; block1: +;; sarx %rdi, %rsi, %rax +;; movq %rbp, %rsp +;; popq %rbp +;; ret