diff --git a/external/llvm-project/mlir/include/mlir/Conversion/ConvertToLLVM/ToLLVMPass.h b/external/llvm-project/mlir/include/mlir/Conversion/ConvertToLLVM/ToLLVMPass.h index 73deef49c417..eb6729d43e21 100644 --- a/external/llvm-project/mlir/include/mlir/Conversion/ConvertToLLVM/ToLLVMPass.h +++ b/external/llvm-project/mlir/include/mlir/Conversion/ConvertToLLVM/ToLLVMPass.h @@ -21,6 +21,8 @@ namespace mlir { /// Create a pass that performs dialect conversion to LLVM for all dialects /// implementing `ConvertToLLVMPatternInterface`. std::unique_ptr createConvertToLLVMPass(); +std::unique_ptr createConvertToLLVMPass(unsigned indexBitwidth, + bool useBarePtrCallConv); /// Register the extension that will load dependent dialects for LLVM /// conversion. This is useful to implement a pass similar to "convert-to-llvm". diff --git a/external/llvm-project/mlir/include/mlir/Conversion/GPUToROCDL/GPUToROCDLPass.h b/external/llvm-project/mlir/include/mlir/Conversion/GPUToROCDL/GPUToROCDLPass.h index 564778771299..f1233ad894da 100644 --- a/external/llvm-project/mlir/include/mlir/Conversion/GPUToROCDL/GPUToROCDLPass.h +++ b/external/llvm-project/mlir/include/mlir/Conversion/GPUToROCDL/GPUToROCDLPass.h @@ -42,7 +42,7 @@ void configureGpuToROCDLConversionLegality(ConversionTarget &target); /// is configurable. std::unique_ptr> createLowerGpuOpsToROCDLOpsPass( - const std::string &chipset = "gfx900", + const std::string &chipset = "infer", unsigned indexBitwidth = kDeriveIndexBitwidthFromDataLayout, bool useBarePtrCallConv = false, gpu::amd::Runtime runtime = gpu::amd::Runtime::Unknown); diff --git a/external/llvm-project/mlir/include/mlir/Conversion/Passes.td b/external/llvm-project/mlir/include/mlir/Conversion/Passes.td index e52c7ff6bd56..82a23fa3da5d 100644 --- a/external/llvm-project/mlir/include/mlir/Conversion/Passes.td +++ b/external/llvm-project/mlir/include/mlir/Conversion/Passes.td @@ -28,6 +28,11 @@ def ConvertToLLVMPass : Pass<"convert-to-llvm"> { let options = [ ListOption<"filterDialects", "filter-dialects", "std::string", "Test conversion patterns of only the specified dialects">, + Option<"useBarePtrCallConv", "use-bare-ptr-call-conv", "bool", "false", "Whether memrefs can be converted to bare ptr">, + Option<"indexBitwidth", "index-bitwidth", "unsigned", + /*default=kDeriveIndexBitwidthFromDataLayout*/"0", + "Bitwidth of the index type, 0 to use size of machine word">, + ]; } @@ -589,11 +594,12 @@ def ConvertGpuOpsToROCDLOps : Pass<"convert-gpu-to-rocdl", "gpu::GPUModuleOp"> { "ROCDL::ROCDLDialect", "cf::ControlFlowDialect", "memref::MemRefDialect", + "ptr::PtrDialect", ]; let options = [ Option<"chipset", "chipset", "std::string", - /*default=*/"\"gfx000\"", - "Chipset that these operations will run on">, + /*default=*/"\"infer\"", + "Chipset that these operations will run on. By default it will infer target from attached Target Attribute on GPU Module">, Option<"indexBitwidth", "index-bitwidth", "unsigned", /*default=kDeriveIndexBitwidthFromDataLayout*/"0", "Bitwidth of the index type, 0 to use size of machine word">, diff --git a/external/llvm-project/mlir/include/mlir/Dialect/Ptr/IR/PtrAttrDefs.td b/external/llvm-project/mlir/include/mlir/Dialect/Ptr/IR/PtrAttrDefs.td index e75038f300f1..9ebf37e83041 100644 --- a/external/llvm-project/mlir/include/mlir/Dialect/Ptr/IR/PtrAttrDefs.td +++ b/external/llvm-project/mlir/include/mlir/Dialect/Ptr/IR/PtrAttrDefs.td @@ -35,6 +35,8 @@ def Ptr_SpecAttr : Ptr_Attr<"Spec", "spec"> { - [Optional] index: bitwidth that should be used when performing index computations for the type. Setting the field to `kOptionalSpecValue`, means the field is optional. + - [Optional] llvmAddressSpace : Mapping from AddressSpace of ptr.PtrType's adddress Space to LLVM's address space. + Setting the field to 'kOptionalLLVMAddressSpaceValue`, means the field is optional. Furthermore, the attribute will verify that all present values are divisible by 8 (number of bits in a byte), and that `preferred` > `abi`. @@ -43,26 +45,28 @@ def Ptr_SpecAttr : Ptr_Attr<"Spec", "spec"> { ```mlir // Spec for a 64 bit ptr, with a required alignment of 64 bits, but with // a preferred alignment of 128 bits and an index bitwidth of 64 bits. - #ptr.spec + #ptr.spec ``` }]; let parameters = (ins "uint32_t":$size, "uint32_t":$abi, "uint32_t":$preferred, - DefaultValuedParameter<"uint32_t", "kOptionalSpecValue">:$index + DefaultValuedParameter<"uint32_t", "kOptionalSpecValue">:$index, + DefaultValuedParameter<"uint32_t", "kOptionalLLVMAddressSpaceValue">:$llvmAddressSpace ); let skipDefaultBuilders = 1; let builders = [ AttrBuilder<(ins "uint32_t":$size, "uint32_t":$abi, "uint32_t":$preferred, - CArg<"uint32_t", "kOptionalSpecValue">:$index), [{ - return $_get($_ctxt, size, abi, preferred, index); + CArg<"uint32_t", "kOptionalSpecValue">:$index, CArg<"uint32_t", "kOptionalLLVMAddressSpaceValue">:$llvmAddressSpace), [{ + return $_get($_ctxt, size, abi, preferred, index, llvmAddressSpace); }]> ]; let assemblyFormat = "`<` struct(params) `>`"; let extraClassDeclaration = [{ /// Constant for specifying a spec entry is optional. static constexpr uint32_t kOptionalSpecValue = std::numeric_limits::max(); + static constexpr uint32_t kOptionalLLVMAddressSpaceValue = 0; }]; let genVerifyDecl = 1; } diff --git a/external/llvm-project/mlir/lib/Conversion/ConvertToLLVM/CMakeLists.txt b/external/llvm-project/mlir/lib/Conversion/ConvertToLLVM/CMakeLists.txt index df7e3f995303..b06d9e03a80c 100644 --- a/external/llvm-project/mlir/lib/Conversion/ConvertToLLVM/CMakeLists.txt +++ b/external/llvm-project/mlir/lib/Conversion/ConvertToLLVM/CMakeLists.txt @@ -24,6 +24,7 @@ add_mlir_conversion_library(MLIRConvertToLLVMPass MLIRIR MLIRLLVMCommonConversion MLIRLLVMDialect + MLIRPtrDialect MLIRPass MLIRRewrite MLIRSupport diff --git a/external/llvm-project/mlir/lib/Conversion/ConvertToLLVM/ConvertToLLVMPass.cpp b/external/llvm-project/mlir/lib/Conversion/ConvertToLLVM/ConvertToLLVMPass.cpp index b2407a258c27..1e334a057cd1 100644 --- a/external/llvm-project/mlir/lib/Conversion/ConvertToLLVM/ConvertToLLVMPass.cpp +++ b/external/llvm-project/mlir/lib/Conversion/ConvertToLLVM/ConvertToLLVMPass.cpp @@ -6,12 +6,18 @@ // //===----------------------------------------------------------------------===// +#include "mlir/Analysis/DataLayoutAnalysis.h" #include "mlir/Conversion/ConvertToLLVM/ToLLVMInterface.h" #include "mlir/Conversion/ConvertToLLVM/ToLLVMPass.h" #include "mlir/Conversion/LLVMCommon/ConversionTarget.h" +#include "mlir/Conversion/LLVMCommon/LoweringOptions.h" #include "mlir/Conversion/LLVMCommon/TypeConverter.h" #include "mlir/Dialect/LLVMIR/LLVMDialect.h" +#include "mlir/Dialect/Ptr/IR/PtrAttrs.h" +#include "mlir/Dialect/Ptr/IR/PtrDialect.h" +#include "mlir/Dialect/Ptr/IR/PtrTypes.h" #include "mlir/IR/PatternMatch.h" +#include "mlir/Interfaces/DataLayoutInterfaces.h" #include "mlir/Pass/Pass.h" #include "mlir/Rewrite/FrozenRewritePatternSet.h" #include "mlir/Transforms/DialectConversion.h" @@ -69,16 +75,82 @@ class ConvertToLLVMPass public: using impl::ConvertToLLVMPassBase::ConvertToLLVMPassBase; + ConvertToLLVMPass() = default; + ConvertToLLVMPass(unsigned indexBitwidth, bool useBarePtrCallConv) { + if (this->indexBitwidth.getNumOccurrences() == 0) + this->indexBitwidth = indexBitwidth; + if (this->useBarePtrCallConv.getNumOccurrences() == 0) + this->useBarePtrCallConv = useBarePtrCallConv; + } + void getDependentDialects(DialectRegistry ®istry) const final { registry.insert(); + registry.insert(); registry.addExtensions(); } - LogicalResult initialize(MLIRContext *context) final { + LogicalResult initialize(MLIRContext *context) final { return success(); } + + void runOnOperation() final { + auto *op = getOperation(); + auto *context = op->getContext(); + StringRef dataLayout; + auto dataLayoutAttr = dyn_cast_or_null( + op->getAttr(LLVM::LLVMDialect::getDataLayoutAttrName())); + if (dataLayoutAttr) + dataLayout = dataLayoutAttr.getValue(); + + if (failed(LLVM::LLVMDialect::verifyDataLayoutString( + dataLayout, [this](const Twine &message) { + getOperation()->emitError() << message.str(); + }))) { + signalPassFailure(); + return; + } + + const DataLayoutAnalysis &dataLayoutAnalysis = + getAnalysis(); + LowerToLLVMOptions options(context, + dataLayoutAnalysis.getAtOrAbove(op)); + options.useBarePtrCallConv = useBarePtrCallConv; + if (indexBitwidth != kDeriveIndexBitwidthFromDataLayout) + options.overrideIndexBitwidth(indexBitwidth); + options.dataLayout = llvm::DataLayout(dataLayout); + if (useBarePtrCallConv) { + options.useBarePtrCallConv = true; + } + RewritePatternSet tempPatterns(context); auto target = std::make_shared(*context); target->addLegalDialect(); - auto typeConverter = std::make_shared(context); + auto typeConverter = std::make_shared(context, options); + + DenseMap addressSpaceMap; + if (DataLayoutOpInterface iface = dyn_cast(op)) { + if (DataLayoutSpecInterface dlSpec = iface.getDataLayoutSpec()) { + for (DataLayoutEntryInterface entry : dlSpec.getEntries()) { + ptr::PtrType ptrKey = llvm::dyn_cast_or_null( + entry.getKey().get()); + if (!ptrKey) { + continue; + } + Attribute addressSpace = ptrKey.getMemorySpace(); + auto value = + cast(entry.getValue()).getLlvmAddressSpace(); + addressSpaceMap.insert({addressSpace, value}); + } + } + typeConverter->addTypeAttributeConversion( + [addressSpaceMap](BaseMemRefType type, Attribute memorySpaceAttr) { + unsigned llvmAddressSpace = 0; + if (addressSpaceMap.contains(memorySpaceAttr)) { + llvmAddressSpace = addressSpaceMap.at(memorySpaceAttr); + } + return IntegerAttr::get( + IntegerType::get(memorySpaceAttr.getContext(), 64), + llvmAddressSpace); + }); + } if (!filterDialects.empty()) { // Test mode: Populate only patterns from the specified dialects. Produce @@ -86,14 +158,19 @@ class ConvertToLLVMPass // interface. for (std::string &dialectName : filterDialects) { Dialect *dialect = context->getLoadedDialect(dialectName); - if (!dialect) - return emitError(UnknownLoc::get(context)) - << "dialect not loaded: " << dialectName << "\n"; + if (!dialect) { + emitError(UnknownLoc::get(context)) + << "dialect not loaded: " << dialectName << "\n"; + signalPassFailure(); + } auto *iface = dyn_cast(dialect); - if (!iface) - return emitError(UnknownLoc::get(context)) - << "dialect does not implement ConvertToLLVMPatternInterface: " - << dialectName << "\n"; + if (!iface) { + emitError(UnknownLoc::get(context)) + << "dialect does not implement ConvertToLLVMPatternInterface: " + << dialectName << "\n"; + signalPassFailure(); + } + iface->populateConvertToLLVMConversionPatterns(*target, *typeConverter, tempPatterns); } @@ -110,15 +187,10 @@ class ConvertToLLVMPass tempPatterns); } } - this->patterns = std::make_unique(std::move(tempPatterns)); this->target = target; this->typeConverter = typeConverter; - return success(); - } - - void runOnOperation() final { if (failed(applyPartialConversion(getOperation(), *target, *patterns))) signalPassFailure(); } @@ -134,3 +206,8 @@ void mlir::registerConvertToLLVMDependentDialectLoading( std::unique_ptr mlir::createConvertToLLVMPass() { return std::make_unique(); } + +std::unique_ptr mlir::createConvertToLLVMPass(unsigned indexBitwidth, + bool useBarePtrCallConv) { + return std::make_unique(indexBitwidth, useBarePtrCallConv); +} diff --git a/external/llvm-project/mlir/lib/Conversion/GPUToROCDL/CMakeLists.txt b/external/llvm-project/mlir/lib/Conversion/GPUToROCDL/CMakeLists.txt index 945e3ccdfa87..e8c4e9589142 100644 --- a/external/llvm-project/mlir/lib/Conversion/GPUToROCDL/CMakeLists.txt +++ b/external/llvm-project/mlir/lib/Conversion/GPUToROCDL/CMakeLists.txt @@ -20,6 +20,7 @@ add_mlir_conversion_library(MLIRGPUToROCDLTransforms MLIRGPUToGPURuntimeTransforms MLIRLLVMCommonConversion MLIRLLVMDialect + MLIRPtrDialect MLIRMemRefToLLVM MLIRROCDLDialect MLIRPass diff --git a/external/llvm-project/mlir/lib/Conversion/GPUToROCDL/LowerGpuOpsToROCDLOps.cpp b/external/llvm-project/mlir/lib/Conversion/GPUToROCDL/LowerGpuOpsToROCDLOps.cpp index 36fbf80c8156..e2280d60078a 100644 --- a/external/llvm-project/mlir/lib/Conversion/GPUToROCDL/LowerGpuOpsToROCDLOps.cpp +++ b/external/llvm-project/mlir/lib/Conversion/GPUToROCDL/LowerGpuOpsToROCDLOps.cpp @@ -14,21 +14,21 @@ #include "mlir/Conversion/ControlFlowToLLVM/ControlFlowToLLVM.h" #include "mlir/Conversion/GPUToROCDL/GPUToROCDLPass.h" #include "mlir/Dialect/Arith/Transforms/Passes.h" +#include "mlir/Dialect/DLTI/DLTI.h" +#include "mlir/Dialect/Ptr/IR/PtrAttrs.h" +#include "mlir/Dialect/Ptr/IR/PtrDialect.h" +#include "mlir/Dialect/Ptr/IR/PtrTypes.h" #include "mlir/Pass/Pass.h" #include "mlir/Pass/PassManager.h" #include "mlir/Transforms/Passes.h" #include "mlir/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.h" -#include "mlir/Conversion/ArithToLLVM/ArithToLLVM.h" -#include "mlir/Conversion/FuncToLLVM/ConvertFuncToLLVM.h" #include "mlir/Conversion/GPUCommon/GPUCommonPass.h" #include "mlir/Conversion/LLVMCommon/ConversionTarget.h" #include "mlir/Conversion/LLVMCommon/LoweringOptions.h" #include "mlir/Conversion/LLVMCommon/Pattern.h" #include "mlir/Conversion/LLVMCommon/TypeConverter.h" -#include "mlir/Conversion/MathToLLVM/MathToLLVM.h" #include "mlir/Conversion/MathToROCDL/MathToROCDL.h" -#include "mlir/Conversion/MemRefToLLVM/MemRefToLLVM.h" #include "mlir/Conversion/VectorToLLVM/ConvertVectorToLLVM.h" #include "mlir/Dialect/ControlFlow/IR/ControlFlow.h" #include "mlir/Dialect/Func/IR/FuncOps.h" @@ -36,18 +36,14 @@ #include "mlir/Dialect/GPU/Transforms/Passes.h" #include "mlir/Dialect/LLVMIR/LLVMDialect.h" #include "mlir/Dialect/LLVMIR/ROCDLDialect.h" -#include "mlir/Dialect/Math/IR/Math.h" #include "mlir/Dialect/MemRef/IR/MemRef.h" -#include "mlir/Dialect/Vector/IR/VectorOps.h" #include "mlir/IR/BuiltinAttributes.h" #include "mlir/Pass/Pass.h" #include "mlir/Transforms/DialectConversion.h" #include "mlir/Transforms/GreedyPatternRewriteDriver.h" -#include "llvm/Support/FormatVariadic.h" #include "../GPUCommon/GPUOpsLowering.h" #include "../GPUCommon/IndexIntrinsicsOpLowering.h" -#include "../GPUCommon/OpToFuncCallLowering.h" namespace mlir { #define GEN_PASS_DEF_CONVERTGPUOPSTOROCDLOPS @@ -55,7 +51,6 @@ namespace mlir { } // namespace mlir #include "mlir/Dialect/LLVMIR/Transforms/Passes.h" - using namespace mlir; /// Returns true if the given `gpu.func` can be safely called using the bare @@ -219,6 +214,28 @@ struct LowerGpuOpsToROCDLOpsPass void runOnOperation() override { gpu::GPUModuleOp m = getOperation(); MLIRContext *ctx = m.getContext(); + OpBuilder b(ctx); + ArrayAttr targets = m.getTargetsAttr(); + if (chipset == "infer") { + if (!targets) { + emitError(UnknownLoc::get(ctx), + "ROCDLTargetAttr is empty on GPU module"); + return signalPassFailure(); + } + if (targets.size() != 1) { + emitError(UnknownLoc::get(ctx), "ROCDLTargetAttrs has more specified " + "more than one gpu-arch on GPU module"); + return signalPassFailure(); + } + const ROCDL::ROCDLTargetAttr targetAttr = + mlir::dyn_cast(targets.getValue().front()); + chipset = targetAttr.getChip().str(); + } + FailureOr maybeChipset = amdgpu::Chipset::parse(chipset); + if (failed(maybeChipset)) { + emitError(UnknownLoc::get(ctx), "Invalid chipset name: " + chipset); + return signalPassFailure(); + } auto llvmDataLayout = m->getAttrOfType( LLVM::LLVMDialect::getDataLayoutAttrName()); @@ -232,12 +249,6 @@ struct LowerGpuOpsToROCDLOpsPass UnitAttr::get(ctx)); } - FailureOr maybeChipset = amdgpu::Chipset::parse(chipset); - if (failed(maybeChipset)) { - emitError(UnknownLoc::get(ctx), "Invalid chipset name: " + chipset); - return signalPassFailure(); - } - /// Customize the bitwidth used for the device side index computations. LowerToLLVMOptions options( ctx, DataLayout(cast(m.getOperation()))); @@ -288,15 +299,35 @@ struct LowerGpuOpsToROCDLOpsPass RewritePatternSet llvmPatterns(ctx); - mlir::arith::populateArithToLLVMConversionPatterns(converter, llvmPatterns); populateAMDGPUToROCDLConversionPatterns(converter, llvmPatterns, *maybeChipset); populateVectorToLLVMConversionPatterns(converter, llvmPatterns); - populateMathToLLVMConversionPatterns(converter, llvmPatterns); cf::populateControlFlowToLLVMConversionPatterns(converter, llvmPatterns); - populateFuncToLLVMConversionPatterns(converter, llvmPatterns); - populateFinalizeMemRefToLLVMConversionPatterns(converter, llvmPatterns); populateGpuToROCDLConversionPatterns(converter, llvmPatterns, runtime); + // ABI, PreferredAlignment, Size etc are set arbitarily here for now for the + // SpecAttr, SpecAttr is used for mapping to LLVM addressSpace + DataLayoutEntryInterface ptrProgramMemoryAttr = DataLayoutEntryAttr::get( + b.getType( + b.getAttr(gpu::AddressSpace::Workgroup)), + b.getAttr(32, 32, 32, 32, 3)); + DataLayoutEntryInterface ptrGlobalMemoryAttr = DataLayoutEntryAttr::get( + b.getType( + b.getAttr(gpu::AddressSpace::Global)), + b.getAttr(32, 32, 32, 32, 1)); + DataLayoutEntryInterface ptrAllocaMemoryAttr = DataLayoutEntryAttr::get( + b.getType( + b.getAttr(gpu::AddressSpace::Private)), + b.getAttr(32, 32, 32, 32, 5)); + + llvm::ArrayRef dltiAddressSpaceAttrs = { + ptrAllocaMemoryAttr, ptrGlobalMemoryAttr, ptrProgramMemoryAttr}; + DataLayoutSpecAttr dltiSpec = + b.getAttr(dltiAddressSpaceAttrs); + if (auto previousDltiSpec = m.getDataLayoutSpec()) { + dltiSpec = dltiSpec.combineWith(previousDltiSpec); + } + m->setAttr(DLTIDialect::kDataLayoutAttrName, dltiSpec); + LLVMConversionTarget target(getContext()); configureGpuToROCDLConversionLegality(target); if (failed(applyPartialConversion(m, target, std::move(llvmPatterns)))) @@ -337,22 +368,12 @@ void mlir::configureGpuToROCDLConversionLegality(ConversionTarget &target) { LLVM::Log2Op, LLVM::PowOp, LLVM::SinOp>(); // These ops are legal for f32 type. target.addDynamicallyLegalOp([](Operation *op) { - return any_of(op->getOperandTypes(), - llvm::IsaPred); + return any_of(op->getOperandTypes(), llvm::IsaPred); }); // TODO: Remove once we support replacing non-root ops. target.addLegalOp(); } -// namespace mlir -template -static void populateOpPatterns(LLVMTypeConverter &converter, - RewritePatternSet &patterns, StringRef f32Func, - StringRef f64Func, StringRef f16Func) { - patterns.add>(converter); - patterns.add>(converter, f32Func, f64Func, f16Func); -} - void mlir::populateGpuToROCDLConversionPatterns( LLVMTypeConverter &converter, RewritePatternSet &patterns, mlir::gpu::amd::Runtime runtime) { diff --git a/external/llvm-project/mlir/lib/Dialect/Ptr/IR/PtrAttrs.cpp b/external/llvm-project/mlir/lib/Dialect/Ptr/IR/PtrAttrs.cpp index f8ce820d0bcb..9474e2f65e2a 100644 --- a/external/llvm-project/mlir/lib/Dialect/Ptr/IR/PtrAttrs.cpp +++ b/external/llvm-project/mlir/lib/Dialect/Ptr/IR/PtrAttrs.cpp @@ -24,7 +24,7 @@ constexpr const static unsigned kBitsInByte = 8; LogicalResult SpecAttr::verify(function_ref emitError, uint32_t size, uint32_t abi, uint32_t preferred, - uint32_t index) { + uint32_t index, uint32_t llvmAddressSpace) { if (size % kBitsInByte != 0) return emitError() << "size entry must be divisible by 8"; if (abi % kBitsInByte != 0) diff --git a/mlir/include/mlir/InitRocMLIRDialects.h b/mlir/include/mlir/InitRocMLIRDialects.h index b4d822b7bb33..2a4727b0f2e5 100644 --- a/mlir/include/mlir/InitRocMLIRDialects.h +++ b/mlir/include/mlir/InitRocMLIRDialects.h @@ -15,7 +15,9 @@ #define MLIR_INITROCMLIRDIALECTS_H_ // rocMLIR includes +#include "mlir/Dialect/Index/IR/IndexDialect.h" #include "mlir/Dialect/MIGraphX/IR/MIGraphX.h" +#include "mlir/Dialect/Ptr/IR/PtrDialect.h" #include "mlir/Dialect/Rock/IR/Rock.h" #include "mlir/Dialect/Rock/Transforms/BufferizableOpInterfaceImpl.h" #include "mlir/InitRocMLIRTarget.h" @@ -72,12 +74,14 @@ inline void registerUpstreamDialects(DialectRegistry ®istry) { cf::ControlFlowDialect, DLTIDialect, gpu::GPUDialect, + index::IndexDialect, func::FuncDialect, LLVM::LLVMDialect, linalg::LinalgDialect, math::MathDialect, memref::MemRefDialect, scf::SCFDialect, + ptr::PtrDialect, vector::VectorDialect, ROCDL::ROCDLDialect, tensor::TensorDialect, @@ -97,6 +101,7 @@ inline void registerUpstreamDialects(DialectRegistry ®istry) { registerConvertFuncToLLVMInterface(registry); registerConvertMathToLLVMInterface(registry); registerConvertMemRefToLLVMInterface(registry); + index::registerConvertIndexToLLVMInterface(registry); ub::registerConvertUBToLLVMInterface(registry); // Register all external models. diff --git a/mlir/lib/Dialect/Rock/Pipelines/Pipelines.cpp b/mlir/lib/Dialect/Rock/Pipelines/Pipelines.cpp index adfaff9141ed..781aec5e03bb 100644 --- a/mlir/lib/Dialect/Rock/Pipelines/Pipelines.cpp +++ b/mlir/lib/Dialect/Rock/Pipelines/Pipelines.cpp @@ -22,9 +22,11 @@ #include "mlir/Dialect/Rock/Pipelines/Pipelines.h" #include "mlir/Conversion/ArithToAMDGPU/ArithToAMDGPU.h" +#include "mlir/Conversion/ConvertToLLVM/ToLLVMPass.h" #include "mlir/Conversion/EmulateFp8ExtTrunc/EmulateFp8ExtTrunc.h" #include "mlir/Conversion/LLVMCommon/LoweringOptions.h" #include "mlir/Conversion/Passes.h" +#include "mlir/Conversion/ReconcileUnrealizedCasts/ReconcileUnrealizedCasts.h" #include "mlir/Conversion/RockToGPU/RockToGPU.h" #include "mlir/Dialect/AMDGPU/Transforms/Passes.h" #include "mlir/Dialect/Affine/Passes.h" @@ -236,23 +238,30 @@ void rock::buildBackendPipeline(OpPassManager &pm, // We need to lower affine again, because the expand strided metadata pass // adds back affine.apply for memref.subview gpuPm.addPass(createLowerAffinePass()); - gpuPm.addPass(createLowerGpuOpsToROCDLOpsPass( - options.chip, /*indexBitwidth=*/kDeriveIndexBitwidthFromDataLayout, + GpuROCDLAttachTargetOptions opts; + opts.triple = options.triple; + opts.chip = options.chip; + opts.features = options.features; + opts.optLevel = options.optLevel; + pm.addPass(createGpuROCDLAttachTarget(opts)); + auto &gpuPm2 = pm.nest(); + gpuPm2.addPass(createLowerGpuOpsToROCDLOpsPass( + /*chipset=*/"infer", /*indexBitwidth=*/kDeriveIndexBitwidthFromDataLayout, /*useBarePtrCallConv=*/true, gpu::amd::Runtime::HIP)); // Ensure we only run passes on LLVM functions inside GPU modules. - auto &llvmFuncPm = gpuPm.nest(); + auto &llvmFuncPm = gpuPm2.nest(); // -canonicalize -cse so that we don't have to crawl through memref // descriptors. (Mainly we want the `extractvalue` fold). llvmFuncPm.addPass(createCanonicalizerPass()); llvmFuncPm.addPass(createCSEPass()); llvmFuncPm.addPass(rock::createRockPrepareLLVMPass()); + gpuPm2.addPass( + createConvertToLLVMPass(kDeriveIndexBitwidthFromDataLayout, true)); + auto &llvmFuncPm2 = gpuPm2.nest(); + llvmFuncPm2.addPass(createCanonicalizerPass()); + llvmFuncPm2.addPass(createCSEPass()); + pm.addPass(createReconcileUnrealizedCastsPass()); if (options.compile) { - GpuROCDLAttachTargetOptions opts; - opts.triple = options.triple; - opts.chip = options.chip; - opts.features = options.features; - opts.optLevel = options.optLevel; - pm.addPass(createGpuROCDLAttachTarget(opts)); pm.addPass(createGpuModuleToBinaryPass()); pm.addPass(createRockCheckResidencyPass()); } diff --git a/mlir/lib/Dialect/Rock/utility/AmdArchDb.cpp b/mlir/lib/Dialect/Rock/utility/AmdArchDb.cpp index 6e876e296507..b87bac772f14 100644 --- a/mlir/lib/Dialect/Rock/utility/AmdArchDb.cpp +++ b/mlir/lib/Dialect/Rock/utility/AmdArchDb.cpp @@ -58,7 +58,7 @@ static constexpr AmdArchInfo GemmFeatures::atomic_fmax_f32 | GemmFeatures::wmma, /*waveSize=*/32, /*maxWavesPerEU*/ 20, /*totalSGPRPerEU*/ 512, /*totalVGPRPerEU*/ 1536, /*totalSharedMemPerCU*/ 131072, - /*maxSharedMemPerWG*/ 65536, /*numEUPerCU=*/4, /*minNumCU=*/48, + /*maxSharedMemPerWG*/ 65536, /*numEUPerCU=*/4, /*minNumCU=*/12, /*hasFp8ConversionInstrs=*/false, /*maxNumXCC=*/1); AmdArchInfo mlir::rock::lookupArchInfo(StringRef arch) { diff --git a/mlir/lib/Translation/GpuModuleToRocdlir/CMakeLists.txt b/mlir/lib/Translation/GpuModuleToRocdlir/CMakeLists.txt index f95c66b88f26..fcc71c5d6be2 100644 --- a/mlir/lib/Translation/GpuModuleToRocdlir/CMakeLists.txt +++ b/mlir/lib/Translation/GpuModuleToRocdlir/CMakeLists.txt @@ -16,6 +16,7 @@ add_mlir_translation_library(GpuModuleToRocdlirTranslation MLIRIR MLIRLLVMDialect MLIRROCDLDialect + MLIRPtrDialect MLIRSupport MLIRTargetLLVMIRExport ) diff --git a/mlir/lib/Translation/GpuModuleToRocdlir/GpuModuleToRocdlir.cpp b/mlir/lib/Translation/GpuModuleToRocdlir/GpuModuleToRocdlir.cpp index 8933c5df35bb..c7499c6cd02d 100644 --- a/mlir/lib/Translation/GpuModuleToRocdlir/GpuModuleToRocdlir.cpp +++ b/mlir/lib/Translation/GpuModuleToRocdlir/GpuModuleToRocdlir.cpp @@ -14,6 +14,7 @@ #include "mlir/Dialect/DLTI/DLTI.h" #include "mlir/Dialect/LLVMIR/LLVMDialect.h" +#include "mlir/Dialect/Ptr/IR/PtrDialect.h" #include "mlir/Target/LLVMIR/Dialect/LLVMIR/LLVMToLLVMIRTranslation.h" #include "mlir/Translation/GpuModuleToRocdir.h" @@ -56,7 +57,7 @@ void mlir::rock::registerGpuModuleToROCDLIRTranslation() { return success(); }, [](DialectRegistry ®istry) { - registry.insert(); + registry.insert(); mlir::registerGPUDialectTranslation(registry); mlir::registerROCDLDialectTranslation(registry); mlir::registerLLVMDialectTranslation(registry); diff --git a/mlir/test/rocmlir-driver/pipelines.mlir b/mlir/test/rocmlir-driver/pipelines.mlir index 192932a1624f..f0697f229379 100644 --- a/mlir/test/rocmlir-driver/pipelines.mlir +++ b/mlir/test/rocmlir-driver/pipelines.mlir @@ -50,12 +50,16 @@ // BINARY-NEXT:convert-arith-to-amdgpu{allow-packed-f16-round-to-zero=true chipset=gfx90a saturate-fp8-truncf=true}, // BINARY-NEXT:emulate-fp8-ext-trunc, // BINARY-NEXT:expand-strided-metadata, -// BINARY-NEXT:lower-affine, -// BINARY-NEXT:convert-gpu-to-rocdl{chipset=gfx90a index-bitwidth=0 runtime=HIP use-bare-ptr-memref-call-conv=true}, +// BINARY-NEXT:lower-affine), +// BINARY-NEXT:rocdl-attach-target{O=3 abi=500 chip=gfx90a correct-sqrt=true daz=false fast=false features= finite-only=false module= triple=amdgcn-amd-amdhsa unsafe-math=false wave64=true}, +// BINARY-NEXT:gpu.module(convert-gpu-to-rocdl{chipset=infer index-bitwidth=0 runtime=HIP use-bare-ptr-memref-call-conv=true}, // BINARY-NEXT:llvm.func(canonicalize{ max-iterations=10 max-num-rewrites=-1 region-simplify=normal test-convergence=false top-down=true}, // BINARY-NEXT:cse, -// BINARY-NEXT:rock-prepare-llvm)), -// BINARY-NEXT:rocdl-attach-target{O=3 abi=500 chip=gfx90a correct-sqrt=true daz=false fast=false features= finite-only=false module= triple=amdgcn-amd-amdhsa unsafe-math=false wave64=true}, +// BINARY-NEXT:rock-prepare-llvm), +// BINARY-NEXT:convert-to-llvm{ index-bitwidth=0 use-bare-ptr-call-conv=true}, +// BINARY-NEXT:llvm.func(canonicalize{ max-iterations=10 max-num-rewrites=-1 region-simplify=normal test-convergence=false top-down=true}, +// BINARY-NEXT:cse)), +// BINARY-NEXT:reconcile-unrealized-casts, // BINARY-NEXT:gpu-module-to-binary{format=fatbin opts= toolkit=}, // BINARY-NEXT:rock-check-residency, // BINARY-NEXT:emulate-fp8-ext-trunc) @@ -69,12 +73,16 @@ // BINARY_MI300-NEXT:f8E5M2} target-type=f32}, // BINARY_MI300-NEXT:convert-arith-to-amdgpu{allow-packed-f16-round-to-zero=true chipset=gfx940 saturate-fp8-truncf=true}, // BINARY_MI300-NEXT:expand-strided-metadata, -// BINARY_MI300-NEXT:lower-affine, -// BINARY_MI300-NEXT:convert-gpu-to-rocdl{chipset=gfx940 index-bitwidth=0 runtime=HIP use-bare-ptr-memref-call-conv=true}, +// BINARY_MI300-NEXT:lower-affine), +// BINARY_MI300-NEXT:rocdl-attach-target{O=3 abi=500 chip=gfx940 correct-sqrt=true daz=false fast=false features= finite-only=false module= triple=amdgcn-amd-amdhsa unsafe-math=false wave64=true}, +// BINARY_MI300-NEXT:gpu.module(convert-gpu-to-rocdl{chipset=infer index-bitwidth=0 runtime=HIP use-bare-ptr-memref-call-conv=true}, // BINARY_MI300-NEXT:llvm.func(canonicalize{ max-iterations=10 max-num-rewrites=-1 region-simplify=normal test-convergence=false top-down=true}, // BINARY_MI300-NEXT:cse, -// BINARY_MI300-NEXT:rock-prepare-llvm)), -// BINARY_MI300-NEXT:rocdl-attach-target{O=3 abi=500 chip=gfx940 correct-sqrt=true daz=false fast=false features= finite-only=false module= triple=amdgcn-amd-amdhsa unsafe-math=false wave64=true}, +// BINARY_MI300-NEXT:rock-prepare-llvm), +// BINARY_MI300-NEXT:convert-to-llvm{ index-bitwidth=0 use-bare-ptr-call-conv=true}, +// BINARY_MI300-NEXT:llvm.func(canonicalize{ max-iterations=10 max-num-rewrites=-1 region-simplify=normal test-convergence=false top-down=true}, +// BINARY_MI300-NEXT:cse)), +// BINARY_MI300-NEXT:reconcile-unrealized-casts, // BINARY_MI300-NEXT:gpu-module-to-binary{format=fatbin opts= toolkit=}, // BINARY_MI300-NEXT:rock-check-residency, // BINARY_MI300-NEXT:emulate-fp8-ext-trunc)