diff --git a/cranelift/codegen/meta/src/isa/x86.rs b/cranelift/codegen/meta/src/isa/x86.rs
index b850c191158d..70e7645e9175 100644
--- a/cranelift/codegen/meta/src/isa/x86.rs
+++ b/cranelift/codegen/meta/src/isa/x86.rs
@@ -52,6 +52,12 @@ fn define_settings(shared: &SettingGroup) -> SettingGroup {
         "AVX2: CPUID.07H:EBX.AVX2[bit 5]",
         false,
     );
+    let has_fma = settings.add_bool(
+        "has_fma",
+        "Has support for FMA.",
+        "FMA: CPUID.01H:ECX.FMA[bit 12]",
+        false,
+    );
     let has_avx512bitalg = settings.add_bool(
         "has_avx512bitalg",
         "Has support for AVX512BITALG.",
@@ -116,6 +122,7 @@ fn define_settings(shared: &SettingGroup) -> SettingGroup {
     settings.add_predicate("use_ssse3", predicate!(has_ssse3));
     settings.add_predicate("use_sse41", predicate!(has_sse41));
     settings.add_predicate("use_sse42", predicate!(has_sse41 && has_sse42));
+    settings.add_predicate("use_fma", predicate!(has_avx && has_fma));
 
     settings.add_predicate(
         "use_ssse3_simd",
@@ -195,7 +202,7 @@ fn define_settings(shared: &SettingGroup) -> SettingGroup {
     let broadwell = settings.add_preset(
         "broadwell",
         "Broadwell microarchitecture.",
-        preset!(haswell),
+        preset!(haswell && has_fma),
     );
     let skylake = settings.add_preset("skylake", "Skylake microarchitecture.", preset!(broadwell));
     let cannonlake = settings.add_preset(
diff --git a/cranelift/codegen/src/isa/x64/encoding/rex.rs b/cranelift/codegen/src/isa/x64/encoding/rex.rs
index 9ff590baf1c4..bfa3c089bee4 100644
--- a/cranelift/codegen/src/isa/x64/encoding/rex.rs
+++ b/cranelift/codegen/src/isa/x64/encoding/rex.rs
@@ -159,6 +159,7 @@ impl From<(OperandSize, Reg)> for RexFlags {
 /// Allows using the same opcode byte in different "opcode maps" to allow for more instruction
 /// encodings. See appendix A in the Intel Software Developer's Manual, volume 2A, for more details.
 #[allow(missing_docs)]
+#[derive(PartialEq)]
 pub enum OpcodeMap {
     None,
     _0F,
@@ -168,7 +169,7 @@ pub enum OpcodeMap {
 
 impl OpcodeMap {
     /// Normally the opcode map is specified as bytes in the instruction, but some x64 encoding
-    /// formats pack this information as bits in a prefix (e.g. EVEX).
+    /// formats pack this information as bits in a prefix (e.g. VEX / EVEX).
     pub(crate) fn bits(&self) -> u8 {
         match self {
             OpcodeMap::None => 0b00,
@@ -187,6 +188,7 @@ impl Default for OpcodeMap {
 
 /// We may need to include one or more legacy prefix bytes before the REX prefix. This enum
 /// covers only the small set of possibilities that we actually need.
+#[derive(PartialEq)]
 pub enum LegacyPrefixes {
     /// No prefix bytes.
     None,
diff --git a/cranelift/codegen/src/isa/x64/encoding/vex.rs b/cranelift/codegen/src/isa/x64/encoding/vex.rs
index f2f3feebbae6..3aa9bb8d506d 100644
--- a/cranelift/codegen/src/isa/x64/encoding/vex.rs
+++ b/cranelift/codegen/src/isa/x64/encoding/vex.rs
@@ -1,2 +1,357 @@
 //! Encodes VEX instructions. These instructions are those added by the Advanced Vector Extensions
 //! (AVX).
+
+use super::evex::Register;
+use super::rex::{LegacyPrefixes, OpcodeMap};
+use super::ByteSink;
+use crate::isa::x64::encoding::rex::encode_modrm;
+
+/// Constructs a VEX-encoded instruction using a builder pattern. This approach makes it
+/// visually easier to transform the manual's syntax, e.g. `VEX.128.66.0F 73 /7 ib`, into code:
+/// `VexInstruction::new().length(...).prefix(...).map(...).w(true).opcode(0x1F).reg(...).rm(...)`.
+pub struct VexInstruction {
+    length: VexVectorLength,
+    prefix: LegacyPrefixes,
+    map: OpcodeMap,
+    opcode: u8,
+    w: bool,
+    reg: u8,
+    rm: Register,
+    vvvv: Option<Register>,
+    imm: Option<u8>,
+}
+
+impl Default for VexInstruction {
+    fn default() -> Self {
+        Self {
+            length: VexVectorLength::default(),
+            prefix: LegacyPrefixes::None,
+            map: OpcodeMap::None,
+            opcode: 0x00,
+            w: false,
+            reg: 0x00,
+            rm: Register::default(),
+            vvvv: None,
+            imm: None,
+        }
+    }
+}
+
+impl VexInstruction {
+    /// Construct a default VEX instruction.
+    pub fn new() -> Self {
+        Self::default()
+    }
+
+    /// Set the length of the instruction.
+    #[inline(always)]
+    pub fn length(mut self, length: VexVectorLength) -> Self {
+        self.length = length;
+        self
+    }
+
+    /// Set the legacy prefix byte of the instruction: None | 66 | F2 | F3. VEX instructions
+    /// pack these into the prefix, not as separate bytes.
+    #[inline(always)]
+    pub fn prefix(mut self, prefix: LegacyPrefixes) -> Self {
+        debug_assert!(
+            prefix == LegacyPrefixes::None
+                || prefix == LegacyPrefixes::_66
+                || prefix == LegacyPrefixes::_F2
+                || prefix == LegacyPrefixes::_F3
+        );
+
+        self.prefix = prefix;
+        self
+    }
+
+    /// Set the opcode map byte of the instruction: None | 0F | 0F38 | 0F3A. VEX instructions pack
+    /// these into the prefix, not as separate bytes.
+    #[inline(always)]
+    pub fn map(mut self, map: OpcodeMap) -> Self {
+        self.map = map;
+        self
+    }
+
+    /// Set the W bit, denoted by `.W1` or `.W0` in the instruction string.
+    /// Typically used to indicate an instruction using 64 bits of an operand (e.g.
+    /// 64-bit lanes). VEX packs this bit in the VEX prefix; previous encodings used the
+    /// REX prefix.
+    #[inline(always)]
+    pub fn w(mut self, w: bool) -> Self {
+        self.w = w;
+        self
+    }
+
+    /// Set the instruction opcode byte.
+    #[inline(always)]
+    pub fn opcode(mut self, opcode: u8) -> Self {
+        self.opcode = opcode;
+        self
+    }
+
+    /// Set the register to use for the `reg` bits; many instructions use this as the write operand.
+    #[inline(always)]
+    pub fn reg(mut self, reg: impl Into<Register>) -> Self {
+        self.reg = reg.into().into();
+        self
+    }
+
+    /// Some instructions use the ModRM.reg field as an opcode extension. This is usually denoted
+    /// by a `/n` field in the manual.
+    #[inline(always)]
+    pub fn opcode_ext(mut self, n: u8) -> Self {
+        self.reg = n;
+        self
+    }
+
+    /// Set the register to use for the `rm` bits; many instructions use this as the "read from
+    /// register/memory" operand. Currently this does not support memory addressing (TODO). Setting
+    /// this affects both the ModRM byte (`rm` section) and the VEX prefix (the extension bits for
+    /// register encodings >= 8).
+    #[inline(always)]
+    pub fn rm(mut self, reg: impl Into<Register>) -> Self {
+        self.rm = reg.into();
+        self
+    }
+
+    /// Set the `vvvv` register; some instructions allow using this as a second, non-destructive
+    /// source register in 3-operand instructions (e.g. 2 read, 1 write).
+    #[allow(dead_code)]
+    #[inline(always)]
+    pub fn vvvv(mut self, reg: impl Into<Register>) -> Self {
+        self.vvvv = Some(reg.into());
+        self
+    }
+
+    /// Set the imm byte when used for a register. The reg bits are stored in `imm8[7:4]` with
+    /// the lower bits unused. Overrides a previously set [Self::imm] field.
+    #[inline(always)]
+    pub fn imm_reg(mut self, reg: impl Into<Register>) -> Self {
+        let reg: u8 = reg.into().into();
+        self.imm = Some((reg & 0xf) << 4);
+        self
+    }
+
+    /// Set the imm byte.
+    /// Overrides a previously set [Self::imm_reg] field.
+    #[inline(always)]
+    pub fn imm(mut self, imm: u8) -> Self {
+        self.imm = Some(imm);
+        self
+    }
+
+    /// The R bit in encoded format (inverted).
+    #[inline(always)]
+    fn r_bit(&self) -> u8 {
+        (!(self.reg >> 3)) & 1
+    }
+
+    /// The X bit in encoded format (inverted).
+    #[inline(always)]
+    fn x_bit(&self) -> u8 {
+        // TODO
+        (!0) & 1
+    }
+
+    /// The B bit in encoded format (inverted).
+    #[inline(always)]
+    fn b_bit(&self) -> u8 {
+        let rm: u8 = self.rm.into();
+        (!(rm >> 3)) & 1
+    }
+
+    /// Is the 2-byte prefix available for this instruction?
+    /// We essentially just check if we need any of the bits that are only available
+    /// in the 3-byte prefix.
+    #[inline(always)]
+    fn use_2byte_prefix(&self) -> bool {
+        // The B and X extension bits can only be encoded in the 3-byte prefix, so
+        // needing either of them (inverted bit == 0) forces the 3-byte form.
+        self.b_bit() == 1 && self.x_bit() == 1 &&
+        // The presence of W1 in the opcode column implies the opcode must be encoded using the
+        // 3-byte form of the VEX prefix.
+        self.w == false &&
+        // The presence of 0F3A and 0F38 in the opcode column implies that the opcode can only be
+        // encoded by the three-byte form of VEX.
+        !(self.map == OpcodeMap::_0F3A || self.map == OpcodeMap::_0F38)
+    }
+
+    /// The last byte of the 2-byte and 3-byte prefixes is mostly the same; share the common
+    /// encoding logic here.
+    #[inline(always)]
+    fn prefix_last_byte(&self) -> u8 {
+        let vvvv = self.vvvv.map(|r| r.into()).unwrap_or(0x00);
+
+        let mut byte = 0x00;
+        byte |= self.prefix.bits();
+        byte |= self.length.bits() << 2;
+        byte |= ((!vvvv) & 0xF) << 3;
+        byte
+    }
+
+    /// Encode the 2-byte prefix.
+    #[inline(always)]
+    fn encode_2byte_prefix<CS: ByteSink + ?Sized>(&self, sink: &mut CS) {
+        //  2 bytes:
+        //    +-----+ +-------------------+
+        //    | C5h | | R | vvvv | L | pp |
+        //    +-----+ +-------------------+
+
+        let last_byte = self.prefix_last_byte() | (self.r_bit() << 7);
+
+        sink.put1(0xC5);
+        sink.put1(last_byte);
+    }
+
+    /// Encode the 3-byte prefix.
+    #[inline(always)]
+    fn encode_3byte_prefix<CS: ByteSink + ?Sized>(&self, sink: &mut CS) {
+        //  3 bytes:
+        //    +-----+ +--------------+ +-------------------+
+        //    | C4h | | RXB | m-mmmm | | W | vvvv | L | pp |
+        //    +-----+ +--------------+ +-------------------+
+
+        let mut second_byte = 0x00;
+        second_byte |= self.map.bits(); // m-mmmm field
+        second_byte |= self.b_bit() << 5;
+        second_byte |= self.x_bit() << 6;
+        second_byte |= self.r_bit() << 7;
+
+        let w_bit = self.w as u8;
+        let last_byte = self.prefix_last_byte() | (w_bit << 7);
+
+        sink.put1(0xC4);
+        sink.put1(second_byte);
+        sink.put1(last_byte);
+    }
+
+    /// Emit the VEX-encoded instruction to the code sink.
+    pub fn encode<CS: ByteSink + ?Sized>(&self, sink: &mut CS) {
+        // 2/3 byte prefix
+        if self.use_2byte_prefix() {
+            self.encode_2byte_prefix(sink);
+        } else {
+            self.encode_3byte_prefix(sink);
+        }
+
+        // 1 Byte Opcode
+        sink.put1(self.opcode);
+
+        // 1 ModRM Byte
+        // Not all instructions use Reg as a reg; some use it as an extension of the opcode.
+        let rm: u8 = self.rm.into();
+        sink.put1(encode_modrm(3, self.reg & 7, rm & 7));
+
+        // TODO: 0/1 byte SIB
+        // TODO: 0/1/2/4 bytes DISP
+
+        // Optional 1 Byte imm
+        if let Some(imm) = self.imm {
+            sink.put1(imm);
+        }
+    }
+}
+
+/// The VEX format allows choosing a vector length in the `L` bit.
+#[allow(dead_code, missing_docs)] // Wider-length vectors are not yet used.
+pub enum VexVectorLength {
+    V128,
+    V256,
+}
+
+impl VexVectorLength {
+    /// Encode the `L` bit.
+    fn bits(&self) -> u8 {
+        match self {
+            Self::V128 => 0b0,
+            Self::V256 => 0b1,
+        }
+    }
+}
+
+impl Default for VexVectorLength {
+    fn default() -> Self {
+        Self::V128
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use crate::isa::x64::inst::regs;
+    use std::vec::Vec;
+
+    #[test]
+    fn vpslldq() {
+        // VEX.128.66.0F 73 /7 ib
+        // VPSLLDQ xmm1, xmm2, imm8
+
+        let dst = regs::xmm1().to_real_reg().unwrap().hw_enc();
+        let src = regs::xmm2().to_real_reg().unwrap().hw_enc();
+        let mut sink0 = Vec::new();
+
+        VexInstruction::new()
+            .length(VexVectorLength::V128)
+            .prefix(LegacyPrefixes::_66)
+            .map(OpcodeMap::_0F)
+            .opcode(0x73)
+            .opcode_ext(7)
+            .vvvv(dst)
+            .rm(src)
+            .imm(0x17)
+            .encode(&mut sink0);
+
+        assert_eq!(sink0, vec![0xc5, 0xf1, 0x73, 0xfa, 0x17]);
+    }
+
+    #[test]
+    fn vblendvpd() {
+        // A four-operand instruction
+        // VEX.128.66.0F3A.W0 4B /r /is4
+        // VBLENDVPD xmm1, xmm2, xmm3, xmm4
+
+        let dst = regs::xmm1().to_real_reg().unwrap().hw_enc();
+        let a = regs::xmm2().to_real_reg().unwrap().hw_enc();
+        let b = regs::xmm3().to_real_reg().unwrap().hw_enc();
+        let c = regs::xmm4().to_real_reg().unwrap().hw_enc();
+        let mut sink0 = Vec::new();
+
+        VexInstruction::new()
+            .length(VexVectorLength::V128)
+            .prefix(LegacyPrefixes::_66)
+            .map(OpcodeMap::_0F3A)
+            .w(false)
+            .opcode(0x4B)
+            .reg(dst)
+            .vvvv(a)
+            .rm(b)
+            .imm_reg(c)
+            .encode(&mut sink0);
+
+        assert_eq!(sink0, vec![0xc4, 0xe3, 0x69, 0x4b, 0xcb, 0x40]);
+    }
+
+    #[test]
+    fn vcmpps() {
+        // VEX.256.0F.WIG C2 /r ib
+        // VCMPPS ymm10, ymm11, ymm12, 4 // neq
+
+        let dst = regs::xmm10().to_real_reg().unwrap().hw_enc();
+        let a = regs::xmm11().to_real_reg().unwrap().hw_enc();
+        let b = regs::xmm12().to_real_reg().unwrap().hw_enc();
+        let mut sink0 = Vec::new();
+
+        VexInstruction::new()
+            .length(VexVectorLength::V256)
+            .prefix(LegacyPrefixes::None)
+            .map(OpcodeMap::_0F)
+            .opcode(0xC2)
+            .reg(dst)
+            .vvvv(a)
+            .rm(b)
+            .imm(4)
+            .encode(&mut sink0);
+
+        assert_eq!(sink0, vec![0xc4, 0x41, 0x24, 0xc2, 0xd4, 0x04]);
+    }
+}
diff --git a/cranelift/codegen/src/isa/x64/inst.isle b/cranelift/codegen/src/isa/x64/inst.isle
index 58396b569a90..5e2322a13a7c 100644
--- a/cranelift/codegen/src/isa/x64/inst.isle
+++ b/cranelift/codegen/src/isa/x64/inst.isle
@@ -193,6 +193,13 @@
                 (src2 XmmMem)
                 (dst WritableXmm))
 
+    ;; XMM (scalar or vector) ternary op that relies on the VEX prefix.
+    (XmmRmRVex (op AvxOpcode)
+               (src1 Xmm)
+               (src2 Xmm)
+               (src3 XmmMem)
+               (dst WritableXmm))
+
     ;; XMM (scalar or vector) binary op that relies on the EVEX prefix.
     (XmmRmREvex (op Avx512Opcode)
                 (src1 XmmMem)
@@ -1042,6 +1049,10 @@
 (decl intcc_to_cc (IntCC) CC)
 (extern constructor intcc_to_cc intcc_to_cc)
 
+(type AvxOpcode extern
+      (enum Vfmadd213ps
+            Vfmadd213pd))
+
 (type Avx512Opcode extern
       (enum Vcvtudq2ps
             Vpabsq
@@ -2839,6 +2850,28 @@
           dst))
 
+;; Helper for creating `MInst.XmmRmRVex` instructions.
+(decl xmm_rmr_vex (AvxOpcode Xmm Xmm XmmMem) Xmm)
+(rule (xmm_rmr_vex op src1 src2 src3)
+      (let ((dst WritableXmm (temp_writable_xmm))
+            (_ Unit (emit (MInst.XmmRmRVex op
+                                           src1
+                                           src2
+                                           src3
+                                           dst))))
+        dst))
+
+;; Helper for creating `vfmadd213ps` instructions.
+(decl x64_vfmadd213ps (Xmm Xmm XmmMem) Xmm)
+(rule (x64_vfmadd213ps x y z)
+      (xmm_rmr_vex (AvxOpcode.Vfmadd213ps) x y z))
+
+;; Helper for creating `vfmadd213pd` instructions.
+(decl x64_vfmadd213pd (Xmm Xmm XmmMem) Xmm)
+(rule (x64_vfmadd213pd x y z)
+      (xmm_rmr_vex (AvxOpcode.Vfmadd213pd) x y z))
+
+
 ;; Helper for creating `sqrtss` instructions.
 (decl x64_sqrtss (Xmm) Xmm)
 (rule (x64_sqrtss x)
diff --git a/cranelift/codegen/src/isa/x64/inst/args.rs b/cranelift/codegen/src/isa/x64/inst/args.rs
index 6d5e29b999cb..04da226f89cb 100644
--- a/cranelift/codegen/src/isa/x64/inst/args.rs
+++ b/cranelift/codegen/src/isa/x64/inst/args.rs
@@ -794,6 +794,7 @@ pub(crate) enum InstructionSet {
     BMI1,
     #[allow(dead_code)] // never constructed (yet).
     BMI2,
+    FMA,
     AVX512BITALG,
     AVX512DQ,
     AVX512F,
@@ -1386,6 +1387,38 @@ impl fmt::Display for SseOpcode {
     }
 }
 
+#[derive(Clone, PartialEq)]
+pub enum AvxOpcode {
+    Vfmadd213ps,
+    Vfmadd213pd,
+}
+
+impl AvxOpcode {
+    /// Which `InstructionSet`s support the opcode?
+    pub(crate) fn available_from(&self) -> SmallVec<[InstructionSet; 2]> {
+        match self {
+            AvxOpcode::Vfmadd213ps => smallvec![InstructionSet::FMA],
+            AvxOpcode::Vfmadd213pd => smallvec![InstructionSet::FMA],
+        }
+    }
+}
+
+impl fmt::Debug for AvxOpcode {
+    fn fmt(&self, fmt: &mut fmt::Formatter) -> fmt::Result {
+        let name = match self {
+            AvxOpcode::Vfmadd213ps => "vfmadd213ps",
+            AvxOpcode::Vfmadd213pd => "vfmadd213pd",
+        };
+        write!(fmt, "{}", name)
+    }
+}
+
+impl fmt::Display for AvxOpcode {
+    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
+        fmt::Debug::fmt(self, f)
+    }
+}
+
 #[derive(Clone, PartialEq)]
 pub enum Avx512Opcode {
     Vcvtudq2ps,
diff --git a/cranelift/codegen/src/isa/x64/inst/emit.rs b/cranelift/codegen/src/isa/x64/inst/emit.rs
index 9e9a66f881bb..4d4a6298c0f9 100644
--- a/cranelift/codegen/src/isa/x64/inst/emit.rs
+++ b/cranelift/codegen/src/isa/x64/inst/emit.rs
@@ -8,6 +8,7 @@ use crate::isa::x64::encoding::rex::{
     low8_will_sign_extend_to_32, low8_will_sign_extend_to_64, reg_enc, LegacyPrefixes, OpcodeMap,
     RexFlags,
 };
+use crate::isa::x64::encoding::vex::{VexInstruction, VexVectorLength};
 use crate::isa::x64::inst::args::*;
 use crate::isa::x64::inst::*;
 use crate::machinst::{inst_common, MachBuffer, MachInstEmit, MachLabel, Reg, Writable};
@@ -119,6 +120,7 @@ pub(crate) fn emit(
         InstructionSet::Lzcnt => info.isa_flags.use_lzcnt(),
         InstructionSet::BMI1 => info.isa_flags.use_bmi1(),
         InstructionSet::BMI2 => info.isa_flags.has_bmi2(),
+        InstructionSet::FMA => info.isa_flags.has_fma(),
         InstructionSet::AVX512BITALG => info.isa_flags.has_avx512bitalg(),
         InstructionSet::AVX512DQ => info.isa_flags.has_avx512dq(),
         InstructionSet::AVX512F => info.isa_flags.has_avx512f(),
@@ -1689,6 +1691,39 @@ pub(crate) fn emit(
             }
         }
 
+        Inst::XmmRmRVex {
+            op,
+            src1,
+            src2,
+            src3,
+            dst,
+        } => {
+            let src1 = allocs.next(src1.to_reg());
+            let dst = allocs.next(dst.to_reg().to_reg());
+            debug_assert_eq!(src1, dst);
+            let src2 = allocs.next(src2.to_reg());
+            let src3 = src3.clone().to_reg_mem().with_allocs(allocs);
+
+            let (w, opcode) = match op {
+                AvxOpcode::Vfmadd213ps => (false, 0xA8),
+                AvxOpcode::Vfmadd213pd => (true, 0xA8),
+            };
+
+            match src3 {
+                RegMem::Reg { reg: src } => VexInstruction::new()
+                    .length(VexVectorLength::V128)
+                    .prefix(LegacyPrefixes::_66)
+                    .map(OpcodeMap::_0F38)
+                    .w(w)
+                    .opcode(opcode)
+                    .reg(dst.to_real_reg().unwrap().hw_enc())
+                    .rm(src.to_real_reg().unwrap().hw_enc())
+                    .vvvv(src2.to_real_reg().unwrap().hw_enc())
+                    .encode(sink),
+                _ => todo!(),
+            };
+        }
+
         Inst::XmmRmREvex {
             op,
             src1,
diff --git a/cranelift/codegen/src/isa/x64/inst/emit_tests.rs b/cranelift/codegen/src/isa/x64/inst/emit_tests.rs
index 4bcf936f142d..6f2cf2c085b4 100644
--- a/cranelift/codegen/src/isa/x64/inst/emit_tests.rs
+++ b/cranelift/codegen/src/isa/x64/inst/emit_tests.rs
@@ -3701,6 +3701,21 @@ fn test_x64_emit() {
         "jmp     *321(%r10,%rdx,4)",
     ));
 
+    // ========================================================
+    // XMM FMA
+
+    insns.push((
+        Inst::xmm_rm_r_vex(AvxOpcode::Vfmadd213ps, RegMem::reg(xmm2), xmm1, w_xmm0),
+        "C4E271A8C2",
+        "vfmadd213ps %xmm0, %xmm1, %xmm2, %xmm0",
+    ));
+
+    insns.push((
+        Inst::xmm_rm_r_vex(AvxOpcode::Vfmadd213pd, RegMem::reg(xmm5), xmm4, w_xmm3),
+        "C4E2D9A8DD",
+        "vfmadd213pd %xmm3, %xmm4, %xmm5, %xmm3",
+    ));
+
     // ========================================================
     // XMM_CMP_RM_R
 
@@ -4866,6 +4881,7 @@ fn test_x64_emit() {
     let mut isa_flag_builder = x64::settings::builder();
     isa_flag_builder.enable("has_ssse3").unwrap();
     isa_flag_builder.enable("has_sse41").unwrap();
+    isa_flag_builder.enable("has_fma").unwrap();
     isa_flag_builder.enable("has_avx512bitalg").unwrap();
     isa_flag_builder.enable("has_avx512dq").unwrap();
     isa_flag_builder.enable("has_avx512f").unwrap();
diff --git a/cranelift/codegen/src/isa/x64/inst/mod.rs b/cranelift/codegen/src/isa/x64/inst/mod.rs
index f888bb9af33a..a7de41a7d550 100644
--- a/cranelift/codegen/src/isa/x64/inst/mod.rs
+++ b/cranelift/codegen/src/isa/x64/inst/mod.rs
@@ -129,6 +129,8 @@ impl Inst {
             | Inst::XmmUnaryRmR { op, .. } => smallvec![op.available_from()],
 
             Inst::XmmUnaryRmREvex { op, .. } | Inst::XmmRmREvex { op, .. } => op.available_from(),
+
+            Inst::XmmRmRVex { op, .. } => op.available_from(),
         }
     }
 }
@@ -324,6 +326,20 @@ impl Inst {
         }
     }
 
+    #[cfg(test)]
+    pub(crate) fn xmm_rm_r_vex(op: AvxOpcode, src3: RegMem, src2: Reg, dst: Writable<Reg>) -> Self {
+        src3.assert_regclass_is(RegClass::Float);
+        debug_assert!(src2.class() == RegClass::Float);
+        debug_assert!(dst.to_reg().class() == RegClass::Float);
+        Inst::XmmRmRVex {
+            op,
+            src3: XmmMem::new(src3).unwrap(),
+            src2: Xmm::new(src2).unwrap(),
+            src1: Xmm::new(dst.to_reg()).unwrap(),
+            dst: WritableXmm::from_writable_reg(dst).unwrap(),
+        }
+    }
+
     pub(crate) fn xmm_rm_r_evex(
         op: Avx512Opcode,
         src1: RegMem,
@@ -1136,6 +1152,29 @@ impl PrettyPrint for Inst {
                 format!("{} {}, {}, {}", ljustify(op.to_string()), src1, src2, dst)
             }
 
+            Inst::XmmRmRVex {
+                op,
+                src1,
+                src2,
+                src3,
+                dst,
+                ..
+            } => {
+                let src1 = pretty_print_reg(src1.to_reg(), 8, allocs);
+                let dst = pretty_print_reg(dst.to_reg().to_reg(), 8, allocs);
+                let src2 = pretty_print_reg(src2.to_reg(), 8, allocs);
+                let src3 = src3.pretty_print(8, allocs);
+
+                format!(
+                    "{} {}, {}, {}, {}",
+                    ljustify(op.to_string()),
+                    src1,
+                    src2,
+                    src3,
+                    dst
+                )
+            }
+
             Inst::XmmRmREvex {
                 op,
                 src1,
@@ -1840,6 +1879,24 @@ fn x64_get_operands<F: Fn(VReg) -> VReg>(inst: &Inst, collector: &mut OperandCol
             }
         }
+        Inst::XmmRmRVex {
+            op,
+            src1,
+            src2,
+            src3,
+            dst,
+            ..
+        } => {
+            // Vfmadd both uses and defs the dst reg; that is not the case for all
+            // AVX ops. If you're adding a new op, make sure to define its register
+            // uses correctly.
+            assert!(*op == AvxOpcode::Vfmadd213ps || *op == AvxOpcode::Vfmadd213pd);
+
+            collector.reg_use(src1.to_reg());
+            collector.reg_reuse_def(dst.to_writable_reg(), 0);
+            collector.reg_use(src2.to_reg());
+            src3.get_operands(collector);
+        }
         Inst::XmmRmREvex {
             op,
             src1,
diff --git a/cranelift/codegen/src/isa/x64/lower.isle b/cranelift/codegen/src/isa/x64/lower.isle
index 4c388c533c63..876b83650423 100644
--- a/cranelift/codegen/src/isa/x64/lower.isle
+++ b/cranelift/codegen/src/isa/x64/lower.isle
@@ -2566,6 +2566,13 @@
 (rule (lower (has_type $F64X2 (fmax_pseudo x y)))
       (x64_maxpd y x))
 
+;; Rules for `fma` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+(rule (lower (has_type $F32X4 (fma x y z)))
+      (x64_vfmadd213ps x y z))
+(rule (lower (has_type $F64X2 (fma x y z)))
+      (x64_vfmadd213pd x y z))
+
 ;; Rules for `load*` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 
 ;; In order to load a value from memory to a GPR register, we may need to extend
diff --git a/cranelift/codegen/src/isa/x64/lower.rs b/cranelift/codegen/src/isa/x64/lower.rs
index 8caba83e9ecf..53867dd54b09 100644
--- a/cranelift/codegen/src/isa/x64/lower.rs
+++ b/cranelift/codegen/src/isa/x64/lower.rs
@@ -2832,7 +2832,7 @@ fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
 
         Opcode::Cls => unimplemented!("Cls not supported"),
 
-        Opcode::Fma => unimplemented!("Fma not supported"),
+        Opcode::Fma => implemented_in_isle(ctx),
 
         Opcode::BorNot | Opcode::BxorNot => {
             unimplemented!("or-not / xor-not opcodes not implemented");
diff --git a/cranelift/filetests/filetests/runtests/simd-fma.clif b/cranelift/filetests/filetests/runtests/simd-fma.clif
new file mode 100644
index 000000000000..b5eb7de5b577
--- /dev/null
+++ b/cranelift/filetests/filetests/runtests/simd-fma.clif
@@ -0,0 +1,85 @@
+test run
+target x86_64 has_avx has_fma
+
+function %fma_f32x4(f32x4, f32x4, f32x4) -> f32x4 {
+block0(v0: f32x4, v1: f32x4, v2: f32x4):
+    v3 = fma v0, v1, v2
+    return v3
+}
+; run: %fma_f32x4([0x9.0 0x83.0 0x1.99999ap-2 -0x1.4cccccp0], [0x9.0 0x2.68091p6 0x1.333334p-1 -0x1.666666p1], [0x9.0 0x9.88721p1 0x1.400000p1 -0x1.b33334p0]) == [0x1.680000p6 0x1.3b88e6p14 0x1.5eb852p1 0x1.f0a3d2p0]
+
+; Zeroes
+; run: %fma_f32x4([0x0.0 0x0.0 0x0.0 -0x0.0], [0x0.0 0x0.0 -0x0.0 0x0.0], [0x0.0 -0x0.0 0x0.0 0x0.0]) == [0x0.0 0x0.0 0x0.0 0x0.0]
+
+; Infinites
+; run: %fma_f32x4([-Inf Inf -Inf Inf], [-Inf -Inf Inf -Inf], [0x0.0 0x0.0 0x0.0 -Inf]) == [Inf -Inf -Inf -Inf]
+; run: %fma_f32x4([-Inf 0x0.0 0x0.0 0x0.0], [Inf 0x0.0 0x0.0 0x0.0], [-Inf 0x0.0 0x0.0 0x0.0]) == [-Inf 0x0.0 0x0.0 0x0.0]
+
+; F32 Epsilon / Max / Min Positive
+; run: %fma_f32x4([0x1.000000p-23 0x0.0 0x1.fffffep127 0x0.0], [0x1.000000p-23 0x0.0 0x1.fffffep127 0x0.0], [0x1.000000p-23 0x1.000000p-23 0x1.fffffep127 0x1.fffffep127]) == [0x1.000002p-23 0x1.000000p-23 +Inf 0x1.fffffep127]
+; run: %fma_f32x4([0x1.000000p-126 0x0.0 0x0.0 0x0.0], [0x1.000000p-126 0x0.0 0x0.0 0x0.0], [0x1.000000p-126 0x1.000000p-126 0x0.0 0x0.0]) == [0x1.000000p-126 0x1.000000p-126 0x0.0 0x0.0]
+
+; F32 Subnormals
+; run: %fma_f32x4([0x0.800000p-126 0x0.800000p-126 0x0.0 0x0.000002p-126], [0x0.800000p-126 0x0.800000p-126 0x0.0 0x0.000002p-126], [0x0.800000p-126 0x0.0 0x0.000002p-126 0x0.000002p-126]) == [0x0.800000p-126 0x0.0 0x0.000002p-126 0x0.000002p-126]
+; run: %fma_f32x4([0x0.000002p-126 0x0.0 0x0.0 0x0.0], [0x0.000002p-126 0x0.0 0x0.0 0x0.0], [0x0.0 0x0.000002p-126 0x0.0 0x0.0]) == [0x0.0 0x0.000002p-126 0x0.0 0x0.0]
+
+
+
+;; The IEEE754 Standard does not make a lot of guarantees about what
+;; comes out of NaN-producing
+;; operations; we just check that the result is a NaN.
+function %fma_is_nan_f32x4(f32x4, f32x4, f32x4) -> b1 {
+block0(v0: f32x4, v1: f32x4, v2: f32x4):
+    v3 = fma v0, v1, v2
+    v4 = fcmp ne v3, v3
+    v5 = vall_true v4
+    return v5
+}
+; run: %fma_is_nan_f32x4([Inf -Inf -Inf +NaN], [-Inf Inf -Inf 0x0.0], [Inf Inf -Inf 0x0.0]) == true
+; run: %fma_is_nan_f32x4([0x0.0 0x0.0 -NaN 0x0.0], [+NaN 0x0.0 0x0.0 -NaN], [0x0.0 +NaN 0x0.0 0x0.0]) == true
+; run: %fma_is_nan_f32x4([0x0.0 NaN NaN NaN], [0x0.0 NaN NaN NaN], [-NaN NaN NaN NaN]) == true
+
+
+
+
+
+function %fma_f64x2(f64x2, f64x2, f64x2) -> f64x2 {
+block0(v0: f64x2, v1: f64x2, v2: f64x2):
+    v3 = fma v0, v1, v2
+    return v3
+}
+; run: %fma_f64x2([0x9.0 0x1.3b88ea148dd4ap14], [0x9.0 0x2.680916809121p6], [0x9.0 0x9.887218721837p1]) == [0x1.680000p6 0x1.7ba6ebee17417p21]
+
+; Zeroes
+; run: %fma_f64x2([0x0.0 0x0.0], [0x0.0 0x0.0], [0x0.0 -0x0.0]) == [0x0.0 0x0.0]
+; run: %fma_f64x2([0x0.0 -0x0.0], [-0x0.0 0x0.0], [0x0.0 0x0.0]) == [0x0.0 0x0.0]
+
+; Infinites
+; run: %fma_f64x2([-Inf Inf], [-Inf -Inf], [0x0.0 0x0.0]) == [+Inf -Inf]
+; run: %fma_f64x2([-Inf Inf], [Inf -Inf], [0x0.0 -Inf]) == [-Inf -Inf]
+; run: %fma_f64x2([-Inf Inf], [Inf Inf], [-Inf Inf]) == [-Inf Inf]
+
+; F64 Epsilon / Max / Min Positive
+; run: %fma_f64x2([0x1.0p-52 0x0.0], [0x1.0p-52 0x0.0], [0x1.0p-52 0x1.0p-52]) == [0x1.0000000000001p-52 0x1.0p-52]
+; run: %fma_f64x2([0x1.fffffffffffffp1023 0x0.0], [0x1.fffffffffffffp1023 0x0.0], [0x1.fffffffffffffp1023 0x1.fffffffffffffp1023]) == [+Inf 0x1.fffffffffffffp1023]
+; run: %fma_f64x2([0x1.0p-1022 0x0.0], [0x1.0p-1022 0x0.0], [0x1.0p-1022 0x1.0p-1022]) == [0x1.0p-1022 0x1.0p-1022]
+
+; F64 Subnormals
+; run: %fma_f64x2([0x0.8p-1022 0x0.8p-1022], [0x0.8p-1022 0x0.8p-1022], [0x0.8p-1022 0x0.0]) == [0x0.8p-1022 0x0.0]
+; run: %fma_f64x2([0x0.0 0x0.0000000000001p-1022], [0x0.0 0x0.0000000000001p-1022], [0x0.8p-1022 0x0.0000000000001p-1022]) == [0x0.8p-1022 0x0.0000000000001p-1022]
+; run: %fma_f64x2([0x0.0000000000001p-1022 0x0.0], [0x0.0000000000001p-1022 0x0.0], [0x0.0 0x0.0000000000001p-1022]) == [0x0.0 0x0.0000000000001p-1022]
+
+
+;; The IEEE754 Standard does not make a lot of guarantees about what
+;; comes out of NaN-producing operations; we just check that the result is a NaN.
+function %fma_is_nan_f64x2(f64x2, f64x2, f64x2) -> b1 {
+block0(v0: f64x2, v1: f64x2, v2: f64x2):
+    v3 = fma v0, v1, v2
+    v4 = fcmp ne v3, v3
+    v5 = vall_true v4
+    return v5
+}
+; run: %fma_is_nan_f64x2([Inf -Inf], [-Inf Inf], [Inf Inf]) == true
+; run: %fma_is_nan_f64x2([-Inf +NaN], [-Inf 0x0.0], [-Inf 0x0.0]) == true
+; run: %fma_is_nan_f64x2([0x0.0 0x0.0], [+NaN 0x0.0], [0x0.0 +NaN]) == true
+; run: %fma_is_nan_f64x2([-NaN 0x0.0], [0x0.0 -NaN], [0x0.0 0x0.0]) == true
+; run: %fma_is_nan_f64x2([0x0.0 NaN], [0x0.0 NaN], [-NaN NaN]) == true
diff --git a/cranelift/native/src/lib.rs b/cranelift/native/src/lib.rs
index 2bae5206c284..5f05c583417d 100644
--- a/cranelift/native/src/lib.rs
+++ b/cranelift/native/src/lib.rs
@@ -92,6 +92,9 @@ pub fn builder_with_options(infer_native_flags: bool) -> Result<isa::Builder> {
         if std::is_x86_feature_detected!("avx2") {
             isa_builder.enable("has_avx2").unwrap();
         }
+        if std::is_x86_feature_detected!("fma") {
+            isa_builder.enable("has_fma").unwrap();
+        }
         if std::is_x86_feature_detected!("bmi1") {
             isa_builder.enable("has_bmi1").unwrap();
         }
diff --git a/crates/fuzzing/src/generators/codegen_settings.rs b/crates/fuzzing/src/generators/codegen_settings.rs
--- a/crates/fuzzing/src/generators/codegen_settings.rs
+++ b/crates/fuzzing/src/generators/codegen_settings.rs
@@ impl<'a> Arbitrary<'a> for CodegenSettings {
             std:"popcnt" => clif:"has_popcnt",
             std:"avx" => clif:"has_avx",
             std:"avx2" => clif:"has_avx2",
+            std:"fma" => clif:"has_fma",
             std:"bmi1" => clif:"has_bmi1",
             std:"bmi2" => clif:"has_bmi2",
             std:"lzcnt" => clif:"has_lzcnt",
diff --git a/crates/wasmtime/src/engine.rs b/crates/wasmtime/src/engine.rs
index 30a0c40d30fc..2b726c8ad0d8 100644
--- a/crates/wasmtime/src/engine.rs
+++ b/crates/wasmtime/src/engine.rs
@@ -486,6 +486,7 @@ impl Engine {
             "has_popcnt" => Some(std::is_x86_feature_detected!("popcnt")),
             "has_avx" => Some(std::is_x86_feature_detected!("avx")),
             "has_avx2" => Some(std::is_x86_feature_detected!("avx2")),
+            "has_fma" => Some(std::is_x86_feature_detected!("fma")),
             "has_bmi1" => Some(std::is_x86_feature_detected!("bmi1")),
             "has_bmi2" => Some(std::is_x86_feature_detected!("bmi2")),
             "has_avx512bitalg" => Some(std::is_x86_feature_detected!("avx512bitalg")),
diff --git a/src/commands/compile.rs b/src/commands/compile.rs
index 3cc96e803ea6..9b54d3ae74d3 100644
--- a/src/commands/compile.rs
+++ b/src/commands/compile.rs
@@ -155,6 +155,8 @@ mod test {
             "--cranelift-enable",
             "has_avx2",
             "--cranelift-enable",
+            "has_fma",
+            "--cranelift-enable",
             "has_avx512dq",
             "--cranelift-enable",
             "has_avx512vl",
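
---

As a sanity check on the encoding scheme above, the following standalone sketch (not part of the patch; the helper names `vex2`, `vex3`, and `modrm` are ad hoc) recomputes two of the patch's test expectations directly from the bit layouts documented in `encode_2byte_prefix` and `encode_3byte_prefix`:

```rust
// Recompute the VEX prefixes for the `vpslldq` unit test and the
// `vfmadd213ps` emit test, byte by byte.

/// Two-byte prefix: C5 | R.vvvv.L.pp
fn vex2(reg: u8, vvvv: u8, l: u8, pp: u8) -> [u8; 2] {
    let r = (!(reg >> 3)) & 1; // R bit is stored inverted
    [0xC5, (r << 7) | (((!vvvv) & 0xF) << 3) | (l << 2) | pp]
}

/// Three-byte prefix: C4 | RXB.m-mmmm | W.vvvv.L.pp
fn vex3(reg: u8, rm: u8, vvvv: u8, mmmmm: u8, w: u8, l: u8, pp: u8) -> [u8; 3] {
    let r = (!(reg >> 3)) & 1;
    let x = 1; // no SIB index yet, matching the builder's TODO
    let b = (!(rm >> 3)) & 1;
    [
        0xC4,
        (r << 7) | (x << 6) | (b << 5) | mmmmm,
        (w << 7) | (((!vvvv) & 0xF) << 3) | (l << 2) | pp,
    ]
}

/// ModRM with mod = 0b11 (register-direct addressing).
fn modrm(reg: u8, rm: u8) -> u8 {
    (0b11 << 6) | ((reg & 7) << 3) | (rm & 7)
}

fn main() {
    // vpslldq xmm1, xmm2, 0x17: VEX.128.66.0F 73 /7 ib.
    // ModRM.reg carries the /7 opcode extension; xmm1 sits in vvvv.
    let mut bytes = vex2(7, 1, 0, 0b01).to_vec();
    bytes.extend([0x73, modrm(7, 2), 0x17]);
    assert_eq!(bytes, vec![0xC5, 0xF1, 0x73, 0xFA, 0x17]);

    // vfmadd213ps xmm0, xmm1, xmm2: VEX.128.66.0F38.W0 A8 /r.
    // Map 0F38 forces the 3-byte form (m-mmmm = 0b00010).
    let mut bytes = vex3(0, 2, 1, 0b00010, 0, 0, 0b01).to_vec();
    bytes.extend([0xA8, modrm(0, 2)]);
    assert_eq!(bytes, vec![0xC4, 0xE2, 0x71, 0xA8, 0xC2]);

    println!("both encodings match the test expectations in the patch");
}
```

The first assertion reproduces `c5 f1 73 fa 17` from the `vpslldq` test, and the second reproduces `C4E271A8C2` from the FMA emit test. The complemented storage of R/X/B and vvvv is why the prefix bytes look "inverted" relative to the raw register numbers.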