Skip to content

Commit

Permalink
x64: Add support for some BMI2 instructions (bytecodealliance#6976)
Browse files Browse the repository at this point in the history
* x64: Add support for sarx, shlx, and shrx

These instructions are part of the BMI2 instruction set and are used
unconditionally by LLVM for shifts whose amount is not an immediate. They're
equivalent to the `sar`, `shl`, and `shr` instructions except that they use a
3-operand form, which lessens register allocation pressure.

Currently the integration here doesn't add new lowering rules; instead it
takes an AVX-like approach by updating `x64_sar` and the related helpers
to use `sarx` (and friends) when applicable. This means that the ability of
`sarx` and friends to shift a value stored in memory isn't exposed yet;
that's left for a future PR.

* x64: Add support for BMI2 `rorx` instruction

This is similar to `rol` and `ror` but requires an immediate argument
and additionally has no constraints on registers.

* x64: Add support for BMI2 `bzhi` instruction

This commit adds support for the `bzhi` instruction, which is part of
BMI2. This instruction zeroes the upper bits of a register, starting at
the bit index given by a register operand. The instruction is emitted
when the CLIF being lowered matches a pattern with these semantics.
Equivalent code fed to LLVM also generates the `bzhi` instruction.

Relative to the alternative lowerings that x64 provides, this gives a
little more register freedom and also saves a few instructions. Note
that the raw functionality of `bzhi` can't be exposed directly, though,
because its semantics when the index is out-of-bounds don't map easily
onto a CLIF instruction, so every use of `bzhi` is preceded by an `and`
instruction. This matches LLVM as well, although LLVM probably has
fancier logic that elides the `and` when it can prove the range of
values of the index.

* Pattern match more `x - 1` patterns

It looks like LLVM generates this as `x + (-1)`, which is equivalent to
`x - 1`, so create a custom partial constructor that pattern-matches the
possible forms of a decremented value.

* Add tests for BMI{1,2} coming from wasm

These are intended to serve as integration tests to ensure that these
instructions are all emitted even when the input comes from wasm.
  • Loading branch information
alexcrichton authored and eduardomourar committed Sep 13, 2023
1 parent f751a89 commit 2794dd3
Show file tree
Hide file tree
Showing 18 changed files with 1,113 additions and 46 deletions.
1 change: 1 addition & 0 deletions cranelift/codegen/meta/src/isa/x86.rs
Original file line number Diff line number Diff line change
Expand Up @@ -121,6 +121,7 @@ pub(crate) fn define() -> TargetIsa {

settings.add_predicate("use_popcnt", predicate!(has_popcnt && has_sse42));
settings.add_predicate("use_bmi1", predicate!(has_bmi1));
settings.add_predicate("use_bmi2", predicate!(has_bmi2));
settings.add_predicate("use_lzcnt", predicate!(has_lzcnt));

let sse3 = settings.add_preset("sse3", "SSE3 and earlier.", preset!(has_sse3));
Expand Down
94 changes: 89 additions & 5 deletions cranelift/codegen/src/isa/x64/inst.isle
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,7 @@
(AluRmRVex (size OperandSize)
(op AluRmROpcode)
(src1 Gpr)
(src2 Gpr)
(src2 GprMem)
(dst WritableGpr))

;; Production of a zero value into a register of the specified size.
Expand All @@ -53,6 +53,13 @@
(src GprMem)
(dst WritableGpr))

;; Same as `UnaryRmRVex` but with an immediate
(UnaryRmRImmVex (size OperandSize)
(op UnaryRmRImmVexOpcode)
(src GprMem)
(dst WritableGpr)
(imm u8))

;; Bitwise not.
(Not (size OperandSize) ;; 1, 2, 4, or 8
(src Gpr)
Expand Down Expand Up @@ -746,8 +753,12 @@
Xor
Mul))

(type AluRmROpcode extern
(enum Andn))
(type AluRmROpcode
(enum Andn
Sarx
Shrx
Shlx
Bzhi))

(type UnaryRmROpcode extern
(enum Bsr
Expand All @@ -761,6 +772,9 @@
Blsmsk
Blsr))

;; Opcodes usable with the `MInst.UnaryRmRImmVex` instruction shape, i.e. unary
;; operations which additionally carry a `u8` immediate.
(type UnaryRmRImmVexOpcode
(enum Rorx))

(type SseOpcode extern
(enum Addps
Addpd
Expand Down Expand Up @@ -1433,6 +1447,14 @@
(decl imm8_reg_to_imm8_gpr (Imm8Reg) Imm8Gpr)
(extern constructor imm8_reg_to_imm8_gpr imm8_reg_to_imm8_gpr)

;; Extractor which matches an `Imm8Gpr` and binds a `Gpr`.
;;
;; The implementation is external. Presumably it only matches when the
;; `Imm8Gpr` holds a register rather than an immediate -- confirm against the
;; Rust-side `gpr_from_imm8_gpr`.
(decl gpr_from_imm8_gpr (Gpr) Imm8Gpr)
(extern extractor gpr_from_imm8_gpr gpr_from_imm8_gpr)

;; Extractor which matches an `Imm8Gpr` and binds its immediate as a `u8`.
;;
;; The implementation is external. Presumably it only matches when the
;; `Imm8Gpr` holds an immediate rather than a register -- confirm against the
;; Rust-side `imm8_from_imm8_gpr`.
(decl imm8_from_imm8_gpr (u8) Imm8Gpr)
(extern extractor imm8_from_imm8_gpr imm8_from_imm8_gpr)

;; Convert a `WritableGpr` to a `WritableReg`.
(decl writable_gpr_to_reg (WritableGpr) WritableReg)
(extern constructor writable_gpr_to_reg writable_gpr_to_reg)
Expand Down Expand Up @@ -1703,6 +1725,9 @@
(decl pure use_bmi1 () bool)
(extern constructor use_bmi1 use_bmi1)

;; Pure, externally-implemented predicate returning a `bool`. Rules guard BMI2
;; lowerings with `(if-let $true (use_bmi2))`.
(decl pure use_bmi2 () bool)
(extern constructor use_bmi2 use_bmi2)

(decl pure use_popcnt () bool)
(extern constructor use_popcnt use_popcnt)

Expand Down Expand Up @@ -1836,7 +1861,7 @@
dst))

;; Helper for emitting `MInst.AluRmRVex` instructions.
(decl alu_rm_r_vex (Type AluRmROpcode Gpr Gpr) Gpr)
(decl alu_rm_r_vex (Type AluRmROpcode Gpr GprMem) Gpr)
(rule (alu_rm_r_vex ty opcode src1 src2)
(let ((dst WritableGpr (temp_writable_gpr))
(size OperandSize (operand_size_of_type_32_64 ty))
Expand Down Expand Up @@ -2060,6 +2085,13 @@
(_ Unit (emit (MInst.UnaryRmRVex size op src dst))))
dst))

;; Helper for creating `MInst.UnaryRmRImmVex` instructions.
;;
;; Allocates a fresh temporary GPR for the destination, emits the instruction
;; with the given opcode, source, operand size, and `u8` immediate, and returns
;; the destination register.
(decl unary_rm_r_imm_vex (UnaryRmRImmVexOpcode GprMem OperandSize u8) Gpr)
(rule (unary_rm_r_imm_vex op src size imm)
(let ((dst WritableGpr (temp_writable_gpr))
(_ Unit (emit (MInst.UnaryRmRImmVex size op src dst imm))))
dst))

(decl cvt_u64_to_float_seq (Type Gpr) Xmm)
(rule (cvt_u64_to_float_seq ty src)
(let ((size OperandSize (raw_operand_size_of_type ty))
Expand Down Expand Up @@ -2584,7 +2616,7 @@
src1
src2))

(decl x64_andn (Type Gpr Gpr) Gpr)
(decl x64_andn (Type Gpr GprMem) Gpr)
(rule (x64_andn ty src1 src2)
(alu_rm_r_vex ty (AluRmROpcode.Andn) src1 src2))

Expand Down Expand Up @@ -2669,26 +2701,55 @@
;; Helper for creating `rotl` instructions.
;;
;; The default rule defers to `shift_r` with `ShiftKind.RotateLeft`.
(decl x64_rotl (Type Gpr Imm8Gpr) Gpr)
(rule (x64_rotl ty src1 src2)
(shift_r ty (ShiftKind.RotateLeft) src1 src2))
;; With BMI2 a rotate-left by an immediate is emitted as `rorx` instead:
;; rotating left by `imm` is the same as rotating right by `bits - imm`, which
;; is what the `u8_sub` below computes.
;; NOTE(review): this assumes `imm` never exceeds the bit-width of `ty`, so
;; that the subtraction cannot underflow -- confirm against `u8_sub`/callers.
(rule 1 (x64_rotl (ty_32_or_64 ty) src (imm8_from_imm8_gpr imm))
(if-let $true (use_bmi2))
(x64_rorx ty src (u8_sub (ty_bits ty) imm)))

;; Helper for creating `rotr` instructions.
;;
;; The default rule defers to `shift_r` with `ShiftKind.RotateRight`.
(decl x64_rotr (Type Gpr Imm8Gpr) Gpr)
(rule (x64_rotr ty src1 src2)
(shift_r ty (ShiftKind.RotateRight) src1 src2))
;; With BMI2 a rotate-right by an immediate maps directly onto `rorx` with the
;; same immediate.
(rule 1 (x64_rotr (ty_32_or_64 ty) src (imm8_from_imm8_gpr imm))
(if-let $true (use_bmi2))
(x64_rorx ty src imm))

;; Helper for creating `shl` instructions.
;;
;; The default rule defers to `shift_r` with `ShiftKind.ShiftLeft`.
(decl x64_shl (Type Gpr Imm8Gpr) Gpr)
(rule (x64_shl ty src1 src2)
(shift_r ty (ShiftKind.ShiftLeft) src1 src2))
;; With BMI2 the `shlx` instruction is also available. It is always used when a
;; register is shifted by a register because none of its operands are
;; constrained, which gives regalloc more freedom. Note that the `shlx`
;; instruction doesn't encode an immediate so any immediate-based shift still
;; uses `shl`.
(rule 1 (x64_shl (ty_32_or_64 ty) src1 (gpr_from_imm8_gpr src2))
(if-let $true (use_bmi2))
(x64_shlx ty src1 src2))

;; Helper for creating logical shift-right instructions.
;;
;; The default rule defers to `shift_r` with `ShiftKind.ShiftRightLogical`.
(decl x64_shr (Type Gpr Imm8Gpr) Gpr)
(rule (x64_shr ty src1 src2)
(shift_r ty (ShiftKind.ShiftRightLogical) src1 src2))
;; With BMI2, a shift by a register amount uses `shrx` instead; see `x64_shl`
;; for more info about this rule
(rule 1 (x64_shr (ty_32_or_64 ty) src1 (gpr_from_imm8_gpr src2))
(if-let $true (use_bmi2))
(x64_shrx ty src1 src2))

;; Helper for creating arithmetic shift-right instructions.
;;
;; The default rule defers to `shift_r` with `ShiftKind.ShiftRightArithmetic`.
(decl x64_sar (Type Gpr Imm8Gpr) Gpr)
(rule (x64_sar ty src1 src2)
(shift_r ty (ShiftKind.ShiftRightArithmetic) src1 src2))
;; With BMI2, a shift by a register amount uses `sarx` instead; see `x64_shl`
;; for more info about this rule
(rule 1 (x64_sar (ty_32_or_64 ty) src1 (gpr_from_imm8_gpr src2))
(if-let $true (use_bmi2))
(x64_sarx ty src1 src2))

;; Helper for creating zeroing-of-high-bits instructions bzhi
;;
;; Note that the `src` operands are swapped here. The bit index is stored in
;; `vvvv` which is `src1` in the `AluRmRVex` instruction shape, while the value
;; being masked is the `GprMem` operand, which is `src2` in that shape.
(decl x64_bzhi (Type GprMem Gpr) Gpr)
(rule (x64_bzhi ty src1 src2)
(alu_rm_r_vex ty (AluRmROpcode.Bzhi) src2 src1))

;; Helper for creating byteswap instructions.
;; In x64, 32- and 64-bit registers use BSWAP instruction, and
Expand Down Expand Up @@ -4096,6 +4157,29 @@
(rule (x64_blsr ty src)
(unary_rm_r_vex (UnaryRmRVexOpcode.Blsr) src (operand_size_of_type_32_64 ty)))

;; Helper for creating `sarx` instructions.
;;
;; `val` is the value being shifted (register or memory) and `amt` is the shift
;; amount. The operands are handed to `alu_rm_r_vex` in swapped order because
;; that helper takes the register-only operand first and the `GprMem` operand
;; second.
(decl x64_sarx (Type GprMem Gpr) Gpr)
(rule (x64_sarx ty val amt)
(alu_rm_r_vex ty (AluRmROpcode.Sarx) amt val))

;; Helper for creating `shrx` instructions.
;;
;; `val` is the value being shifted (register or memory) and `amt` is the shift
;; amount. The operands are handed to `alu_rm_r_vex` in swapped order because
;; that helper takes the register-only operand first and the `GprMem` operand
;; second.
(decl x64_shrx (Type GprMem Gpr) Gpr)
(rule (x64_shrx ty val amt)
(alu_rm_r_vex ty (AluRmROpcode.Shrx) amt val))

;; Helper for creating `shlx` instructions.
;;
;; `val` is the value being shifted (register or memory) and `amt` is the shift
;; amount. The operands are handed to `alu_rm_r_vex` in swapped order because
;; that helper takes the register-only operand first and the `GprMem` operand
;; second.
(decl x64_shlx (Type GprMem Gpr) Gpr)
(rule (x64_shlx ty val amt)
(alu_rm_r_vex ty (AluRmROpcode.Shlx) amt val))

;; Helper for creating `rorx` instructions.
;;
;; `src` is the value to rotate (register or memory) and `imm` is the rotation
;; amount as an immediate. The operand size is derived from `ty` via
;; `operand_size_of_type_32_64`.
(decl x64_rorx (Type GprMem u8) Gpr)
(rule (x64_rorx ty src imm)
(unary_rm_r_imm_vex (UnaryRmRImmVexOpcode.Rorx)
src
(operand_size_of_type_32_64 ty)
imm))

;; Helper for creating `popcnt` instructions.
(decl x64_popcnt (Type Gpr) Gpr)
(rule (x64_popcnt ty src)
Expand Down
39 changes: 23 additions & 16 deletions cranelift/codegen/src/isa/x64/inst/args.rs
Original file line number Diff line number Diff line change
Expand Up @@ -826,33 +826,22 @@ impl fmt::Display for AluRmiROpcode {
}
}

/// ALU operations that don't accept immediates.
#[derive(Copy, Clone, PartialEq)]
pub enum AluRmROpcode {
/// And with negated second operand.
Andn,
}
pub use crate::isa::x64::lower::isle::generated_code::AluRmROpcode;

impl AluRmROpcode {
pub(crate) fn available_from(&self) -> SmallVec<[InstructionSet; 2]> {
match self {
AluRmROpcode::Andn => smallvec![InstructionSet::BMI1],
AluRmROpcode::Sarx | AluRmROpcode::Shrx | AluRmROpcode::Shlx | AluRmROpcode::Bzhi => {
smallvec![InstructionSet::BMI2]
}
}
}
}

/// Formats the opcode as its lowercase mnemonic.
impl fmt::Debug for AluRmROpcode {
fn fmt(&self, fmt: &mut fmt::Formatter) -> fmt::Result {
// Map each variant onto the mnemonic text that gets written out.
let name = match self {
AluRmROpcode::Andn => "andn",
};
write!(fmt, "{}", name)
}
}

impl fmt::Display for AluRmROpcode {
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
fmt::Debug::fmt(self, f)
f.write_str(&format!("{self:?}").to_lowercase())
}
}

Expand Down Expand Up @@ -918,6 +907,24 @@ impl fmt::Display for UnaryRmRVexOpcode {
}
}

pub use crate::isa::x64::lower::isle::generated_code::UnaryRmRImmVexOpcode;

impl UnaryRmRImmVexOpcode {
pub(crate) fn available_from(&self) -> SmallVec<[InstructionSet; 2]> {
match self {
UnaryRmRImmVexOpcode::Rorx => {
smallvec![InstructionSet::BMI2]
}
}
}
}

impl fmt::Display for UnaryRmRImmVexOpcode {
    /// Writes the opcode as the lowercased form of its `Debug` rendering.
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        let mnemonic = format!("{self:?}").to_lowercase();
        f.write_str(&mnemonic)
    }
}

#[derive(Clone, Copy, PartialEq)]
/// Comparison operations.
pub enum CmpOpcode {
Expand Down
50 changes: 46 additions & 4 deletions cranelift/codegen/src/isa/x64/inst/emit.rs
Original file line number Diff line number Diff line change
Expand Up @@ -358,9 +358,16 @@ pub(crate) fn emit(
src2,
} => {
use AluRmROpcode::*;
use LegacyPrefixes as LP;

let dst = allocs.next(dst.to_reg().to_reg());
let src1 = allocs.next(src1.to_reg());
let src2 = allocs.next(src2.to_reg());
let src2 = match src2.clone().to_reg_mem().with_allocs(allocs) {
RegMem::Reg { reg } => {
RegisterOrAmode::Register(reg.to_real_reg().unwrap().hw_enc().into())
}
RegMem::Mem { addr } => RegisterOrAmode::Amode(addr.finalize(state, sink)),
};

let w = match size {
OperandSize::Size32 => false,
Expand All @@ -370,16 +377,21 @@ pub(crate) fn emit(
_ => unreachable!(),
};

let opcode = match op {
Andn => 0xf2,
let (prefix, opcode) = match op {
Andn => (LP::None, 0xf2),
Sarx => (LP::_F3, 0xf7),
Shrx => (LP::_F2, 0xf7),
Shlx => (LP::_66, 0xf7),
Bzhi => (LP::None, 0xf5),
};

VexInstruction::new()
.prefix(prefix)
.map(OpcodeMap::_0F38)
.w(w)
.reg(dst.to_real_reg().unwrap().hw_enc())
.vvvv(src1.to_real_reg().unwrap().hw_enc())
.rm(src2.to_real_reg().unwrap().hw_enc())
.rm(src2)
.opcode(opcode)
.encode(sink);
}
Expand Down Expand Up @@ -445,6 +457,36 @@ pub(crate) fn emit(
.encode(sink);
}

Inst::UnaryRmRImmVex {
size,
op,
src,
dst,
imm,
} => {
let dst = allocs.next(dst.to_reg().to_reg());
let src = match src.clone().to_reg_mem().with_allocs(allocs) {
RegMem::Reg { reg } => {
RegisterOrAmode::Register(reg.to_real_reg().unwrap().hw_enc().into())
}
RegMem::Mem { addr } => RegisterOrAmode::Amode(addr.finalize(state, sink)),
};

let opcode = match op {
UnaryRmRImmVexOpcode::Rorx => 0xF0,
};

VexInstruction::new()
.prefix(LegacyPrefixes::_F2)
.map(OpcodeMap::_0F3A)
.w(*size == OperandSize::Size64)
.opcode(opcode)
.reg(dst.to_real_reg().unwrap().hw_enc())
.rm(src)
.imm(*imm)
.encode(sink);
}

Inst::Not { size, src, dst } => {
let src = allocs.next(src.to_reg());
let dst = allocs.next(dst.to_reg().to_reg());
Expand Down
Loading

0 comments on commit 2794dd3

Please sign in to comment.