Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

AMDGPU: Builtins & Codegen support for v_cvt_scalef32_pk_{fp8|bf8}_f32 for gfx950 #117740

Merged

Conversation

arsenm
Copy link
Contributor

@arsenm arsenm commented Nov 26, 2024

OPSEL[3] determines low/high 16 bits of word to write.

Co-authored-by: Pravin Jagtap [email protected]

This was referenced Nov 26, 2024
@llvmbot llvmbot added clang:frontend Language frontend issues, e.g. anything involving "Sema" llvm:ir labels Nov 26, 2024
@llvmbot
Copy link
Member

llvmbot commented Nov 26, 2024

@llvm/pr-subscribers-clang
@llvm/pr-subscribers-llvm-ir

@llvm/pr-subscribers-backend-amdgpu

Author: Matt Arsenault (arsenm)

Changes

OPSEL[3] determines low/high 16 bits of word to write.

Co-authored-by: Pravin Jagtap <[email protected]>


Patch is 28.12 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/117740.diff

11 Files Affected:

  • (modified) clang/include/clang/Basic/BuiltinsAMDGPU.def (+2-1)
  • (modified) clang/test/CodeGenOpenCL/builtins-amdgcn-gfx950-err.cl (+4-1)
  • (modified) clang/test/CodeGenOpenCL/builtins-amdgcn-gfx950.cl (+70)
  • (modified) clang/test/SemaOpenCL/builtins-amdgcn-error-gfx950-param.cl (+5-1)
  • (modified) llvm/include/llvm/IR/IntrinsicsAMDGPU.td (+14)
  • (modified) llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp (+7)
  • (modified) llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h (+3)
  • (modified) llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp (+2)
  • (modified) llvm/lib/Target/AMDGPU/SIInstrInfo.td (+1)
  • (modified) llvm/lib/Target/AMDGPU/VOP3Instructions.td (+32-4)
  • (modified) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.scalef32.pk.gfx950.ll (+126)
diff --git a/clang/include/clang/Basic/BuiltinsAMDGPU.def b/clang/include/clang/Basic/BuiltinsAMDGPU.def
index 829a936537575b..a292640b7c4f21 100644
--- a/clang/include/clang/Basic/BuiltinsAMDGPU.def
+++ b/clang/include/clang/Basic/BuiltinsAMDGPU.def
@@ -574,6 +574,7 @@ TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_f16_fp8, "V2hV2hifIiIb", "nc", "fp8
 TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_f16_bf8, "V2hV2hifIiIb", "nc", "bf8-cvt-scale-insts")
 TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_f32_fp8, "fifIi", "nc", "fp8-cvt-scale-insts")
 TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_f32_bf8, "fifIi", "nc", "bf8-cvt-scale-insts")
-
+TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_pk_fp8_f32, "V2sV2sfffIb", "nc", "fp8-cvt-scale-insts")
+TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_pk_bf8_f32, "V2sV2sfffIb", "nc", "bf8-cvt-scale-insts")
 #undef BUILTIN
 #undef TARGET_BUILTIN
diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx950-err.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx950-err.cl
index cc21c119ec14de..5ec769dc6a84bc 100644
--- a/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx950-err.cl
+++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx950-err.cl
@@ -13,8 +13,9 @@
 typedef unsigned int uint;
 typedef unsigned int uint2 __attribute__((ext_vector_type(2)));
 typedef half __attribute__((ext_vector_type(2))) half2;
+typedef short __attribute__((ext_vector_type(2))) short2;
 
-void test(global uint* out, global uint2* out_v2u32, uint a, uint b, global half2* out_v2f16, global float* out_f32, float scale) {
+void test(global uint* out, global uint2* out_v2u32, uint a, uint b, global half2* out_v2f16, global float* out_f32, float scale, global short2* out_v2i16, float src0, float src1) {
   *out = __builtin_amdgcn_prng_b32(a); // expected-error{{'__builtin_amdgcn_prng_b32' needs target feature prng-inst}}
   *out_v2u32 = __builtin_amdgcn_permlane16_swap(a, b, false, false); // expected-error{{'__builtin_amdgcn_permlane16_swap' needs target feature permlane16-swap}}
   *out_v2u32 = __builtin_amdgcn_permlane32_swap(a, b, false, false); // expected-error{{'__builtin_amdgcn_permlane32_swap' needs target feature permlane32-swap}}
@@ -22,4 +23,6 @@ void test(global uint* out, global uint2* out_v2u32, uint a, uint b, global half
   *out_f32 = __builtin_amdgcn_cvt_scalef32_f32_fp8(a, scale, 0); // expected-error{{'__builtin_amdgcn_cvt_scalef32_f32_fp8' needs target feature fp8-cvt-scale-insts}}
   *out_v2f16 = __builtin_amdgcn_cvt_scalef32_f16_bf8(*out_v2f16, a, scale, 0, false); // expected-error{{'__builtin_amdgcn_cvt_scalef32_f16_bf8' needs target feature bf8-cvt-scale-insts}}
   *out_f32 = __builtin_amdgcn_cvt_scalef32_f32_bf8(a, scale, 0); // expected-error{{'__builtin_amdgcn_cvt_scalef32_f32_bf8' needs target feature bf8-cvt-scale-insts}}
+  *out_v2i16 = __builtin_amdgcn_cvt_scalef32_pk_fp8_f32(*out_v2i16, src0, src1, scale, true); // expected-error{{'__builtin_amdgcn_cvt_scalef32_pk_fp8_f32' needs target feature fp8-cvt-scale-insts}}
+  *out_v2i16 = __builtin_amdgcn_cvt_scalef32_pk_bf8_f32(*out_v2i16, src0, src1, scale, true); // expected-error{{'__builtin_amdgcn_cvt_scalef32_pk_bf8_f32' needs target feature bf8-cvt-scale-insts}}
 }
diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx950.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx950.cl
index 57e2568a813920..9f23474226791c 100644
--- a/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx950.cl
+++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx950.cl
@@ -497,3 +497,73 @@ void test_cvt_scalef32_f32_bf8(global float* out, uint src, float scale)
   *out = __builtin_amdgcn_cvt_scalef32_f32_bf8(src, scale, 2);
   *out = __builtin_amdgcn_cvt_scalef32_f32_bf8(src, scale, 3);
 }
+
+
+// CHECK-LABEL: @test_cvt_scalef32_pk_fp8_f32(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
+// CHECK-NEXT:    [[SRC0_ADDR:%.*]] = alloca float, align 4, addrspace(5)
+// CHECK-NEXT:    [[SRC1_ADDR:%.*]] = alloca float, align 4, addrspace(5)
+// CHECK-NEXT:    [[SCALE_ADDR:%.*]] = alloca float, align 4, addrspace(5)
+// CHECK-NEXT:    store ptr addrspace(1) [[OUT:%.*]], ptr addrspace(5) [[OUT_ADDR]], align 8
+// CHECK-NEXT:    store float [[SRC0:%.*]], ptr addrspace(5) [[SRC0_ADDR]], align 4
+// CHECK-NEXT:    store float [[SRC1:%.*]], ptr addrspace(5) [[SRC1_ADDR]], align 4
+// CHECK-NEXT:    store float [[SCALE:%.*]], ptr addrspace(5) [[SCALE_ADDR]], align 4
+// CHECK-NEXT:    [[TMP0:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
+// CHECK-NEXT:    [[TMP1:%.*]] = load <2 x i16>, ptr addrspace(1) [[TMP0]], align 4
+// CHECK-NEXT:    [[TMP2:%.*]] = load float, ptr addrspace(5) [[SRC0_ADDR]], align 4
+// CHECK-NEXT:    [[TMP3:%.*]] = load float, ptr addrspace(5) [[SRC1_ADDR]], align 4
+// CHECK-NEXT:    [[TMP4:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4
+// CHECK-NEXT:    [[TMP5:%.*]] = call <2 x i16> @llvm.amdgcn.cvt.scalef32.pk.fp8.f32(<2 x i16> [[TMP1]], float [[TMP2]], float [[TMP3]], float [[TMP4]], i1 true)
+// CHECK-NEXT:    [[TMP6:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
+// CHECK-NEXT:    store <2 x i16> [[TMP5]], ptr addrspace(1) [[TMP6]], align 4
+// CHECK-NEXT:    [[TMP7:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
+// CHECK-NEXT:    [[TMP8:%.*]] = load <2 x i16>, ptr addrspace(1) [[TMP7]], align 4
+// CHECK-NEXT:    [[TMP9:%.*]] = load float, ptr addrspace(5) [[SRC0_ADDR]], align 4
+// CHECK-NEXT:    [[TMP10:%.*]] = load float, ptr addrspace(5) [[SRC1_ADDR]], align 4
+// CHECK-NEXT:    [[TMP11:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4
+// CHECK-NEXT:    [[TMP12:%.*]] = call <2 x i16> @llvm.amdgcn.cvt.scalef32.pk.fp8.f32(<2 x i16> [[TMP8]], float [[TMP9]], float [[TMP10]], float [[TMP11]], i1 false)
+// CHECK-NEXT:    [[TMP13:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
+// CHECK-NEXT:    store <2 x i16> [[TMP12]], ptr addrspace(1) [[TMP13]], align 4
+// CHECK-NEXT:    ret void
+//
+void test_cvt_scalef32_pk_fp8_f32(global short2* out, float src0, float src1, float scale)
+{
+  *out = __builtin_amdgcn_cvt_scalef32_pk_fp8_f32(*out, src0, src1, scale, true);
+  *out = __builtin_amdgcn_cvt_scalef32_pk_fp8_f32(*out, src0, src1, scale, false);
+}
+
+
+// CHECK-LABEL: @test_cvt_scalef32_pk_bf8_f32(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
+// CHECK-NEXT:    [[SRC0_ADDR:%.*]] = alloca float, align 4, addrspace(5)
+// CHECK-NEXT:    [[SRC1_ADDR:%.*]] = alloca float, align 4, addrspace(5)
+// CHECK-NEXT:    [[SCALE_ADDR:%.*]] = alloca float, align 4, addrspace(5)
+// CHECK-NEXT:    store ptr addrspace(1) [[OUT:%.*]], ptr addrspace(5) [[OUT_ADDR]], align 8
+// CHECK-NEXT:    store float [[SRC0:%.*]], ptr addrspace(5) [[SRC0_ADDR]], align 4
+// CHECK-NEXT:    store float [[SRC1:%.*]], ptr addrspace(5) [[SRC1_ADDR]], align 4
+// CHECK-NEXT:    store float [[SCALE:%.*]], ptr addrspace(5) [[SCALE_ADDR]], align 4
+// CHECK-NEXT:    [[TMP0:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
+// CHECK-NEXT:    [[TMP1:%.*]] = load <2 x i16>, ptr addrspace(1) [[TMP0]], align 4
+// CHECK-NEXT:    [[TMP2:%.*]] = load float, ptr addrspace(5) [[SRC0_ADDR]], align 4
+// CHECK-NEXT:    [[TMP3:%.*]] = load float, ptr addrspace(5) [[SRC1_ADDR]], align 4
+// CHECK-NEXT:    [[TMP4:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4
+// CHECK-NEXT:    [[TMP5:%.*]] = call <2 x i16> @llvm.amdgcn.cvt.scalef32.pk.bf8.f32(<2 x i16> [[TMP1]], float [[TMP2]], float [[TMP3]], float [[TMP4]], i1 true)
+// CHECK-NEXT:    [[TMP6:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
+// CHECK-NEXT:    store <2 x i16> [[TMP5]], ptr addrspace(1) [[TMP6]], align 4
+// CHECK-NEXT:    [[TMP7:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
+// CHECK-NEXT:    [[TMP8:%.*]] = load <2 x i16>, ptr addrspace(1) [[TMP7]], align 4
+// CHECK-NEXT:    [[TMP9:%.*]] = load float, ptr addrspace(5) [[SRC0_ADDR]], align 4
+// CHECK-NEXT:    [[TMP10:%.*]] = load float, ptr addrspace(5) [[SRC1_ADDR]], align 4
+// CHECK-NEXT:    [[TMP11:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4
+// CHECK-NEXT:    [[TMP12:%.*]] = call <2 x i16> @llvm.amdgcn.cvt.scalef32.pk.bf8.f32(<2 x i16> [[TMP8]], float [[TMP9]], float [[TMP10]], float [[TMP11]], i1 false)
+// CHECK-NEXT:    [[TMP13:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
+// CHECK-NEXT:    store <2 x i16> [[TMP12]], ptr addrspace(1) [[TMP13]], align 4
+// CHECK-NEXT:    ret void
+//
+void test_cvt_scalef32_pk_bf8_f32(global short2* out, float src0, float src1, float scale)
+{
+  *out = __builtin_amdgcn_cvt_scalef32_pk_bf8_f32(*out, src0, src1, scale, true);
+  *out = __builtin_amdgcn_cvt_scalef32_pk_bf8_f32(*out, src0, src1, scale, false);
+}
diff --git a/clang/test/SemaOpenCL/builtins-amdgcn-error-gfx950-param.cl b/clang/test/SemaOpenCL/builtins-amdgcn-error-gfx950-param.cl
index a78293eddc0074..fa896f9be782ed 100644
--- a/clang/test/SemaOpenCL/builtins-amdgcn-error-gfx950-param.cl
+++ b/clang/test/SemaOpenCL/builtins-amdgcn-error-gfx950-param.cl
@@ -12,6 +12,7 @@ typedef int int8 __attribute__((ext_vector_type(8)));
 typedef int int16 __attribute__((ext_vector_type(16)));
 typedef unsigned int uint;
 typedef half half2 __attribute__((ext_vector_type(2)));
+typedef short short2 __attribute__((ext_vector_type(2)));
 
 void test_mfma_f32_16x16x32_f16(__global float4* out, half8 a, half8 b, float4 c, int X) {
 
@@ -160,9 +161,12 @@ void test_permlane32_swap(__global int* out, int old, int src, bool X) {
   *out = __builtin_amdgcn_permlane32_swap(old, src, false, X); // expected-error{{argument to '__builtin_amdgcn_permlane32_swap' must be a constant integer}}
 }
 
-void test_cvt_scalef32(global half2* out_v2f16, global float* out_f32, uint src, float scale, int index, bool X) {
+void test_cvt_scalef32(global half2* out_v2f16, global float* out_f32, uint src, float scale, int index, bool X,
+                       global short2* out_v2i16, float src0, float src1) {
   *out_v2f16 = __builtin_amdgcn_cvt_scalef32_f16_fp8(*out_v2f16, src, scale, index, X); // expected-error{{argument to '__builtin_amdgcn_cvt_scalef32_f16_fp8' must be a constant integer}}
   *out_f32 = __builtin_amdgcn_cvt_scalef32_f32_fp8(src, scale, index); // // expected-error{{argument to '__builtin_amdgcn_cvt_scalef32_f32_fp8' must be a constant integer}}
   *out_v2f16 = __builtin_amdgcn_cvt_scalef32_f16_bf8(*out_v2f16, src, scale, index, X); // expected-error{{argument to '__builtin_amdgcn_cvt_scalef32_f16_bf8' must be a constant integer}}
   *out_f32 = __builtin_amdgcn_cvt_scalef32_f32_bf8(src, scale, index); // expected-error{{argument to '__builtin_amdgcn_cvt_scalef32_f32_bf8' must be a constant integer}}
+  *out_v2i16 = __builtin_amdgcn_cvt_scalef32_pk_fp8_f32(*out_v2i16, src0, src1, scale, X); // expected-error{{argument to '__builtin_amdgcn_cvt_scalef32_pk_fp8_f32' must be a constant integer}}
+  *out_v2i16 = __builtin_amdgcn_cvt_scalef32_pk_bf8_f32(*out_v2i16, src0, src1, scale, X); // expected-error{{argument to '__builtin_amdgcn_cvt_scalef32_pk_bf8_f32' must be a constant integer}}
 }
diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
index 7dc7fd3f4aef6e..b8e2f0d07387fa 100644
--- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
+++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
@@ -617,6 +617,16 @@ class AMDGPUCvtScaleFP8BF8ToF32Intrinsic<LLVMType DstTy, string name> : DefaultA
   [IntrNoMem, IntrWillReturn, ImmArg<ArgIndex<2>>]
 >, ClangBuiltin<"__builtin_amdgcn_"#name>;
 
+class AMDGPUCvtScaleF32ToFP8BF8TiedInputIntrinsic<string name> : DefaultAttrsIntrinsic<
+  [llvm_v2i16_ty],
+  [llvm_v2i16_ty, // old_vdst
+   llvm_float_ty, // src0
+   llvm_float_ty, // src1
+   llvm_float_ty, // scale
+   llvm_i1_ty],   // dst_lo_hi_sel[true false]
+  [IntrNoMem, IntrWillReturn, ImmArg<ArgIndex<4>>]
+>, ClangBuiltin<"__builtin_amdgcn_"#name>;
+
 class AMDGPUCvtScaleFP8BF8ToF16TiedInputIntrinsic<LLVMType DstTy, string name> : DefaultAttrsIntrinsic<
   [DstTy],
   [llvm_v2f16_ty, // old_vdst
@@ -635,6 +645,10 @@ def int_amdgcn_cvt_scalef32_f16_bf8  : AMDGPUCvtScaleFP8BF8ToF16TiedInputIntrins
 def int_amdgcn_cvt_scalef32_f32_fp8  : AMDGPUCvtScaleFP8BF8ToF32Intrinsic<llvm_float_ty, "cvt_scalef32_f32_fp8">;
 def int_amdgcn_cvt_scalef32_f32_bf8  : AMDGPUCvtScaleFP8BF8ToF32Intrinsic<llvm_float_ty, "cvt_scalef32_f32_bf8">;
 
+// llvm.amdgcn.cvt.scalef32.pk.fp8.f32 v2i16 old_vdst, float srcA, float srcB, float scale, bool dst_lo_hi_sel
+def int_amdgcn_cvt_scalef32_pk_fp8_f32 : AMDGPUCvtScaleF32ToFP8BF8TiedInputIntrinsic<"cvt_scalef32_pk_fp8_f32">;
+def int_amdgcn_cvt_scalef32_pk_bf8_f32 : AMDGPUCvtScaleF32ToFP8BF8TiedInputIntrinsic<"cvt_scalef32_pk_bf8_f32">;
+
 def int_amdgcn_prng_b32 : DefaultAttrsIntrinsic<
   [llvm_i32_ty], [llvm_i32_ty], [IntrNoMem]
 >, ClangBuiltin<"__builtin_amdgcn_prng_b32">;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
index bddade242e5ccc..af0bd11183579f 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
@@ -5846,6 +5846,13 @@ void AMDGPUInstructionSelector::renderSrcAndDstSelToOpSelXForm_1_1(
                  : 0);
 }
 
+void AMDGPUInstructionSelector::renderDstSelToOpSelXForm(
+    MachineInstrBuilder &MIB, const MachineInstr &MI, int OpIdx) const {
+  assert(OpIdx >= 0 && "expected to match an immediate operand");
+  MIB.addImm(MI.getOperand(OpIdx).getImm() ? (int64_t)(SISrcMods::DST_OP_SEL)
+                                           : 0);
+}
+
 void AMDGPUInstructionSelector::renderExtractCPol(MachineInstrBuilder &MIB,
                                                   const MachineInstr &MI,
                                                   int OpIdx) const {
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h
index 4ba86bfa854257..2fe96915c57cef 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h
@@ -351,6 +351,9 @@ class AMDGPUInstructionSelector final : public InstructionSelector {
                                           const MachineInstr &MI,
                                           int OpIdx) const;
 
+  void renderDstSelToOpSelXForm(MachineInstrBuilder &MIB,
+                                const MachineInstr &MI, int OpIdx) const;
+
   void renderNegateImm(MachineInstrBuilder &MIB, const MachineInstr &MI,
                        int OpIdx) const;
 
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
index 637c2e679ab1c8..51c539fac26132 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
@@ -4551,6 +4551,8 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
     case Intrinsic::amdgcn_cvt_scalef32_f16_bf8:
     case Intrinsic::amdgcn_cvt_scalef32_f32_fp8:
     case Intrinsic::amdgcn_cvt_scalef32_f32_bf8:
+    case Intrinsic::amdgcn_cvt_scalef32_pk_fp8_f32:
+    case Intrinsic::amdgcn_cvt_scalef32_pk_bf8_f32:
     case Intrinsic::amdgcn_ashr_pk_i8_i32:
     case Intrinsic::amdgcn_ashr_pk_u8_i32:
     case Intrinsic::amdgcn_cvt_scalef32_2xpk16_fp6_f32:
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.td b/llvm/lib/Target/AMDGPU/SIInstrInfo.td
index 3e25237a760256..a77e96d3986bac 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.td
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.td
@@ -2842,6 +2842,7 @@ def VOP_V6I32_V32F16_F32 : VOPProfile<[v6i32, v32f16, f32, untyped]>;
 def VOP_V6I32_V32BF16_F32 : VOPProfile<[v6i32, v32bf16, f32, untyped]>;
 def VOP_V6I32_V16F32_V16F32_F32 : VOPProfile<[v6i32, v16f32, v16f32, f32]>;
 def VOP_V2F16_I32_F32 : VOPProfile<[v2f16, i32, f32, untyped]>;
+def VOP_V2I16_F32_F32_F32 : VOPProfile<[v2i16, f32, f32, f32]>;
 
 def VOP_I64_I64_I32 : VOPProfile <[i64, i64, i32, untyped]>;
 def VOP_I64_I32_I64 : VOPProfile <[i64, i32, i64, untyped]>;
diff --git a/llvm/lib/Target/AMDGPU/VOP3Instructions.td b/llvm/lib/Target/AMDGPU/VOP3Instructions.td
index 20482fb9c7a943..8511211fd64eef 100644
--- a/llvm/lib/Target/AMDGPU/VOP3Instructions.td
+++ b/llvm/lib/Target/AMDGPU/VOP3Instructions.td
@@ -869,6 +869,14 @@ def gi_SrcAndDstSelToOpSelXForm_1_0 : GICustomOperandRenderer<"renderSrcAndDstSe
 def gi_SrcAndDstSelToOpSelXForm_1_1 : GICustomOperandRenderer<"renderSrcAndDstSelToOpSelXForm_1_1">,
   GISDNodeXFormEquiv<SrcAndDstSelToOpSelXForm_1_1>;
 
+def DstSelToOpSelXForm : SDNodeXForm<timm, [{
+  return CurDAG->getTargetConstant(
+      N->getZExtValue() ? SISrcMods::DST_OP_SEL : SISrcMods::NONE,
+      SDLoc(N), MVT::i32);
+}]>;
+def gi_DstSelToOpSelXForm : GICustomOperandRenderer<"renderDstSelToOpSelXForm">,
+  GISDNodeXFormEquiv<DstSelToOpSelXForm>;
+
 class PermlanePat<SDPatternOperator permlane,
   Instruction inst, ValueType vt> : GCNPat<
   (vt (permlane vt:$vdst_in, vt:$src0, i32:$src1, i32:$src2,
@@ -940,6 +948,17 @@ def VOP3_CVT_SCALE_FP4FP8BF8_F32_Profile : VOP3_Profile<VOPProfile<[i32, f32, f3
   let HasOMod = 0;
 }
 
+class VOP3_CVT_SCALE_FP4FP8BF8_F32_TiedInput_Profile<VOPProfile P> : VOP3_Profile<P, VOP3_OPSEL> {
+  let InsVOP3OpSel = (ins FP32InputMods:$src0_modifiers, Src0RC64:$src0,
+                          FP32InputMods:$src1_modifiers, Src1RC64:$src1,
+                          FP32InputMods:$src2_modifiers, Src2RC64:$src2,
+                          VGPR_32:$vdst_in, op_sel0:$op_sel);
+  let HasClamp = 0;
+  let HasExtVOP3DPP = 0;
+  let HasOpSel = 1;
+  let HasOMod = 0;
+}
+
 def VOP3_CVT_SCALE_FP4_F16BF16_Profile : VOP3_Profile<VOPProfile<[i32, v2f16, f32, f32]>,
                                               VOP3_OPSEL> {
   let InsVOP3OpSel = (ins FP32InputMods:$src0_modifiers, Src0RC64:$src0,
@@ -997,10 +1016,11 @@ class VOP3_CVT_SCALEF32_PK_F864_Profile<VOPProfile P> : VOP3_Profile<P> {
 }
 
 let SubtargetPredicate = HasFP8ConversionScaleInsts, mayRaiseFPException = 0 in {
-  let Constraints = "$vdst = $vdst_in", DisableEncoding="$vdst_in" in
+  let Constraints = "$vdst = $vdst_in", DisableEncoding="$vdst_in" in {
     defm V_CVT_SCALEF32_F16_FP8 : VOP3Inst<"v_cvt_scalef32_f16_fp8", VOP3_CVT_SCALE_F1632_FP8BF8_TiedInput_Profile<VOP_V2F16_I32_F32>>;
+    defm V_CVT_SCALEF32_PK_FP8_F32 : VOP3Inst<"v_cvt_scalef32_pk_fp8_f32", VOP3_CVT_SCALE_FP4FP8BF8_F32_TiedInput_Profile<VOP_V2I16_F32_F32_F32>>;
+  }
   defm V_CVT_SCALEF32_F32_FP8 : VOP3Inst<"v_cvt_scalef32_f32_fp8", VOP3_CVT_SCALE_F1632_FP8BF8_Profile<f32>>;
-  defm V_CVT_SCALEF32_PK_FP8_F32 : VOP3Inst<"v_cvt_scalef32_pk_fp8_f32", VOP3_CVT_SCALE_FP4FP8BF8_F32_Profile>;
   defm V_CVT_SCALEF32_PK_F32_FP8 : VOP3Inst<"v_cvt_scalef32_pk_f32_fp8", VOP3_CVT_SCALE_PK_F16BF16F32_FP4FP8BF8_Profile<v2f32>>;
   defm V_CVT_SCALEF32_PK_FP8_F16 : VOP3Inst<"v_cvt_scalef32_pk_fp8_f16", VOP3_CVT_SCALE_PK_FP8BF8_F16BF16_Profile>;
   defm V_CVT_SCALEF32_PK_FP8_BF16 : VOP3Inst<"v_cvt_scalef32_pk_fp8_bf16", VOP3_CVT_SCALE_PK_FP8BF8_F16BF16_Profile>;
@@ -1009,10 +1029,11 @@ let SubtargetPredicate = HasFP8ConversionScaleInsts, mayRaiseFPException = 0 in
 }
 
 let SubtargetPredicate = HasBF8ConversionScaleInsts, mayRaiseFPException = 0 in {
-  let Constraints = "$vdst = $vdst_in", DisableEncoding="$vdst_in" in
+  let Constraints = "$vdst = $vdst_in", DisableEncoding="$vdst_in" in {
     defm V_CVT_SCALEF32_F16_BF8 : VOP3Inst<"v_cvt_scalef32_f16_bf8", VOP3_CVT_SCALE_F1632_FP8BF8_TiedInput_Profile<VOP_V2F16_I32_F32>>;
+    defm V_CVT_SCALEF32_PK_BF8_F32 : VOP3Inst<"v_cvt_scalef32_pk_bf8_f32", VOP3_CVT_SCALE_FP4FP8BF8_F32_TiedInput_Profile<VOP_V2I16_F32_F32_F32>>;
+  }
   defm V_CVT_SCALEF32_F32_BF8 : VOP3Inst<"v_cvt_scalef32_f32_bf8", VOP3_CVT_SCALE_F1632_FP8BF8_Profile<f32>>;
-  defm V_CVT_SCALEF32_PK_BF8_F32 : VOP3Inst<"v_cvt_scalef32_pk_bf8_f32", VOP3_CVT_SCALE_FP4FP8BF8_F32_Profile>;
   defm V_CVT_SCALEF32_PK_F32_BF8 : VOP3Inst<"v_cvt_scalef32_pk_f32_bf8", VOP3_CVT_SCALE_PK_F16BF16F32_FP4FP8BF8_Profile<v2f32>>;
   defm V_CVT_SCALEF32_PK_BF8_F16 : VOP3Inst<"v_cvt_scalef32_pk_bf8_f16", VOP3_CVT_SCALE_PK_FP8BF8_F16BF16_Profile>;
   defm V_CVT_SCALEF32_PK_BF8_BF16 : VOP3Inst<"v_cvt_scalef32_pk_bf8_bf16", VOP3_CVT_SCALE_PK_FP8BF8_F16BF16_Profile>;
@@ -1072,6 +1093,13 @@ foreach Dst...
[truncated]

Copy link
Contributor Author

arsenm commented Nov 26, 2024

Merge activity

  • Nov 26, 2:41 PM EST: A user started a stack merge that includes this pull request via Graphite.
  • Nov 26, 2:55 PM EST: Graphite rebased this pull request as part of a merge.
  • Nov 26, 2:57 PM EST: A user merged this pull request with Graphite.

@arsenm arsenm force-pushed the users/arsenm/gfx950/codegen-v_cvt_scalef32_f16_f32_fp8_bf8 branch from c6bd09b to d0df1b6 Compare November 26, 2024 19:51
Base automatically changed from users/arsenm/gfx950/codegen-v_cvt_scalef32_f16_f32_fp8_bf8 to main November 26, 2024 19:54
…2 for gfx950

OPSEL[3] determines low/high 16 bits of word to write.

Co-authored-by: Pravin Jagtap <[email protected]>
@arsenm arsenm force-pushed the users/arsenm/gfx950/codegen-v_cvt_scalef32_pk_fp8_bf8_f32 branch from 60c4d95 to 574c5b5 Compare November 26, 2024 19:54
@arsenm arsenm merged commit 803bd81 into main Nov 26, 2024
5 of 7 checks passed
@arsenm arsenm deleted the users/arsenm/gfx950/codegen-v_cvt_scalef32_pk_fp8_bf8_f32 branch November 26, 2024 19:57
This was referenced Nov 26, 2024
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Labels
backend:AMDGPU clang:frontend Language frontend issues, e.g. anything involving "Sema" clang Clang issues not falling into any other category llvm:ir
Projects
None yet
Development

Successfully merging this pull request may close these issues.

4 participants