diff --git a/external/llvm-project/mlir/include/mlir/Conversion/ConvertToLLVM/ToLLVMPass.h b/external/llvm-project/mlir/include/mlir/Conversion/ConvertToLLVM/ToLLVMPass.h
index 73deef49c417..eb6729d43e21 100644
--- a/external/llvm-project/mlir/include/mlir/Conversion/ConvertToLLVM/ToLLVMPass.h
+++ b/external/llvm-project/mlir/include/mlir/Conversion/ConvertToLLVM/ToLLVMPass.h
@@ -21,6 +21,8 @@ namespace mlir {
 /// Create a pass that performs dialect conversion to LLVM  for all dialects
 /// implementing `ConvertToLLVMPatternInterface`.
 std::unique_ptr<Pass> createConvertToLLVMPass();
+std::unique_ptr<Pass> createConvertToLLVMPass(unsigned indexBitwidth,
+                                              bool useBarePtrCallConv);
 
 /// Register the extension that will load dependent dialects for LLVM
 /// conversion. This is useful to implement a pass similar to "convert-to-llvm".
diff --git a/external/llvm-project/mlir/include/mlir/Conversion/GPUToROCDL/GPUToROCDLPass.h b/external/llvm-project/mlir/include/mlir/Conversion/GPUToROCDL/GPUToROCDLPass.h
index 564778771299..f1233ad894da 100644
--- a/external/llvm-project/mlir/include/mlir/Conversion/GPUToROCDL/GPUToROCDLPass.h
+++ b/external/llvm-project/mlir/include/mlir/Conversion/GPUToROCDL/GPUToROCDLPass.h
@@ -42,7 +42,7 @@ void configureGpuToROCDLConversionLegality(ConversionTarget &target);
 /// is configurable.
 std::unique_ptr<OperationPass<gpu::GPUModuleOp>>
 createLowerGpuOpsToROCDLOpsPass(
-    const std::string &chipset = "gfx900",
+    const std::string &chipset = "infer",
     unsigned indexBitwidth = kDeriveIndexBitwidthFromDataLayout,
     bool useBarePtrCallConv = false,
     gpu::amd::Runtime runtime = gpu::amd::Runtime::Unknown);
diff --git a/external/llvm-project/mlir/include/mlir/Conversion/Passes.td b/external/llvm-project/mlir/include/mlir/Conversion/Passes.td
index e52c7ff6bd56..82a23fa3da5d 100644
--- a/external/llvm-project/mlir/include/mlir/Conversion/Passes.td
+++ b/external/llvm-project/mlir/include/mlir/Conversion/Passes.td
@@ -28,6 +28,11 @@ def ConvertToLLVMPass : Pass<"convert-to-llvm"> {
   let options = [
     ListOption<"filterDialects", "filter-dialects", "std::string",
                "Test conversion patterns of only the specified dialects">,
+    Option<"useBarePtrCallConv", "use-bare-ptr-call-conv", "bool", "false", "Whether memrefs can be converted to bare ptr">,
+    Option<"indexBitwidth", "index-bitwidth", "unsigned",
+           /*default=kDeriveIndexBitwidthFromDataLayout*/"0",
+           "Bitwidth of the index type, 0 to use size of machine word">,
+
   ];
 }
 
@@ -589,11 +594,12 @@ def ConvertGpuOpsToROCDLOps : Pass<"convert-gpu-to-rocdl", "gpu::GPUModuleOp"> {
     "ROCDL::ROCDLDialect",
     "cf::ControlFlowDialect",
     "memref::MemRefDialect",
+    "ptr::PtrDialect",
   ];
   let options = [
     Option<"chipset", "chipset", "std::string",
-           /*default=*/"\"gfx000\"",
-           "Chipset that these operations will run on">,
+           /*default=*/"\"infer\"",
+           "Chipset that these operations will run on. By default it will infer target from attached Target Attribute on GPU Module">,
     Option<"indexBitwidth", "index-bitwidth", "unsigned",
            /*default=kDeriveIndexBitwidthFromDataLayout*/"0",
            "Bitwidth of the index type, 0 to use size of machine word">,
diff --git a/external/llvm-project/mlir/include/mlir/Dialect/Ptr/IR/PtrAttrDefs.td b/external/llvm-project/mlir/include/mlir/Dialect/Ptr/IR/PtrAttrDefs.td
index e75038f300f1..9ebf37e83041 100644
--- a/external/llvm-project/mlir/include/mlir/Dialect/Ptr/IR/PtrAttrDefs.td
+++ b/external/llvm-project/mlir/include/mlir/Dialect/Ptr/IR/PtrAttrDefs.td
@@ -35,6 +35,8 @@ def Ptr_SpecAttr : Ptr_Attr<"Spec", "spec"> {
      - [Optional] index: bitwidth that should be used when performing index
      computations for the type. Setting the field to `kOptionalSpecValue`, means
      the field is optional.
+     - [Optional] llvmAddressSpace: mapping from the address space of the ptr.PtrType to LLVM's address space.
+     Setting the field to `kOptionalLLVMAddressSpaceValue`, means the field is optional.
     
     Furthermore, the attribute will verify that all present values are divisible
     by 8 (number of bits in a byte), and that `preferred` > `abi`.
@@ -43,26 +45,28 @@ def Ptr_SpecAttr : Ptr_Attr<"Spec", "spec"> {
     ```mlir
     // Spec for a 64 bit ptr, with a required alignment of 64 bits, but with
     // a preferred alignment of 128 bits and an index bitwidth of 64 bits.
-    #ptr.spec<size = 64, abi = 64, preferred = 128, index = 64>
+    #ptr.spec<size = 64, abi = 64, preferred = 128, index = 64, llvmAddressSpace = 0>
     ```
   }];
   let parameters = (ins
     "uint32_t":$size,
     "uint32_t":$abi,
     "uint32_t":$preferred,
-    DefaultValuedParameter<"uint32_t", "kOptionalSpecValue">:$index
+    DefaultValuedParameter<"uint32_t", "kOptionalSpecValue">:$index,
+    DefaultValuedParameter<"uint32_t", "kOptionalLLVMAddressSpaceValue">:$llvmAddressSpace
   );
   let skipDefaultBuilders = 1;
   let builders = [
     AttrBuilder<(ins "uint32_t":$size, "uint32_t":$abi, "uint32_t":$preferred,
-                     CArg<"uint32_t", "kOptionalSpecValue">:$index), [{
-      return $_get($_ctxt, size, abi, preferred, index);
+                     CArg<"uint32_t", "kOptionalSpecValue">:$index, CArg<"uint32_t", "kOptionalLLVMAddressSpaceValue">:$llvmAddressSpace), [{
+      return $_get($_ctxt, size, abi, preferred, index, llvmAddressSpace);
     }]>
   ];
   let assemblyFormat = "`<` struct(params) `>`";
   let extraClassDeclaration = [{
     /// Constant for specifying a spec entry is optional.
     static constexpr uint32_t kOptionalSpecValue = std::numeric_limits<uint32_t>::max();
+    static constexpr uint32_t kOptionalLLVMAddressSpaceValue = 0;
   }];
   let genVerifyDecl = 1;
 }
diff --git a/external/llvm-project/mlir/lib/Conversion/ConvertToLLVM/CMakeLists.txt b/external/llvm-project/mlir/lib/Conversion/ConvertToLLVM/CMakeLists.txt
index df7e3f995303..b06d9e03a80c 100644
--- a/external/llvm-project/mlir/lib/Conversion/ConvertToLLVM/CMakeLists.txt
+++ b/external/llvm-project/mlir/lib/Conversion/ConvertToLLVM/CMakeLists.txt
@@ -24,6 +24,7 @@ add_mlir_conversion_library(MLIRConvertToLLVMPass
   MLIRIR
   MLIRLLVMCommonConversion
   MLIRLLVMDialect
+  MLIRPtrDialect
   MLIRPass
   MLIRRewrite
   MLIRSupport
diff --git a/external/llvm-project/mlir/lib/Conversion/ConvertToLLVM/ConvertToLLVMPass.cpp b/external/llvm-project/mlir/lib/Conversion/ConvertToLLVM/ConvertToLLVMPass.cpp
index b2407a258c27..1e334a057cd1 100644
--- a/external/llvm-project/mlir/lib/Conversion/ConvertToLLVM/ConvertToLLVMPass.cpp
+++ b/external/llvm-project/mlir/lib/Conversion/ConvertToLLVM/ConvertToLLVMPass.cpp
@@ -6,12 +6,18 @@
 //
 //===----------------------------------------------------------------------===//
 
+#include "mlir/Analysis/DataLayoutAnalysis.h"
 #include "mlir/Conversion/ConvertToLLVM/ToLLVMInterface.h"
 #include "mlir/Conversion/ConvertToLLVM/ToLLVMPass.h"
 #include "mlir/Conversion/LLVMCommon/ConversionTarget.h"
+#include "mlir/Conversion/LLVMCommon/LoweringOptions.h"
 #include "mlir/Conversion/LLVMCommon/TypeConverter.h"
 #include "mlir/Dialect/LLVMIR/LLVMDialect.h"
+#include "mlir/Dialect/Ptr/IR/PtrAttrs.h"
+#include "mlir/Dialect/Ptr/IR/PtrDialect.h"
+#include "mlir/Dialect/Ptr/IR/PtrTypes.h"
 #include "mlir/IR/PatternMatch.h"
+#include "mlir/Interfaces/DataLayoutInterfaces.h"
 #include "mlir/Pass/Pass.h"
 #include "mlir/Rewrite/FrozenRewritePatternSet.h"
 #include "mlir/Transforms/DialectConversion.h"
@@ -69,16 +75,82 @@ class ConvertToLLVMPass
 
 public:
   using impl::ConvertToLLVMPassBase<ConvertToLLVMPass>::ConvertToLLVMPassBase;
+  ConvertToLLVMPass() = default;
+  ConvertToLLVMPass(unsigned indexBitwidth, bool useBarePtrCallConv) {
+    if (this->indexBitwidth.getNumOccurrences() == 0)
+      this->indexBitwidth = indexBitwidth;
+    if (this->useBarePtrCallConv.getNumOccurrences() == 0)
+      this->useBarePtrCallConv = useBarePtrCallConv;
+  }
+
   void getDependentDialects(DialectRegistry &registry) const final {
     registry.insert<LLVM::LLVMDialect>();
+    registry.insert<ptr::PtrDialect>();
     registry.addExtensions<LoadDependentDialectExtension>();
   }
 
-  LogicalResult initialize(MLIRContext *context) final {
+  LogicalResult initialize(MLIRContext *context) final { return success(); }
+
+  void runOnOperation() final {
+    auto *op = getOperation();
+    auto *context = op->getContext();
+    StringRef dataLayout;
+    auto dataLayoutAttr = dyn_cast_or_null<StringAttr>(
+        op->getAttr(LLVM::LLVMDialect::getDataLayoutAttrName()));
+    if (dataLayoutAttr)
+      dataLayout = dataLayoutAttr.getValue();
+
+    if (failed(LLVM::LLVMDialect::verifyDataLayoutString(
+            dataLayout, [this](const Twine &message) {
+              getOperation()->emitError() << message.str();
+            }))) {
+      signalPassFailure();
+      return;
+    }
+
+    const DataLayoutAnalysis &dataLayoutAnalysis =
+        getAnalysis<DataLayoutAnalysis>();
+    // Seed the lowering options from the data layout found at or above `op`,
+    // then apply the pass options on top of it.
+    LowerToLLVMOptions options(context,
+                               dataLayoutAnalysis.getAtOrAbove(op));
+    options.useBarePtrCallConv = useBarePtrCallConv;
+    // An explicit index bitwidth takes precedence over the data layout's.
+    if (indexBitwidth != kDeriveIndexBitwidthFromDataLayout)
+      options.overrideIndexBitwidth(indexBitwidth);
+    options.dataLayout = llvm::DataLayout(dataLayout);
+
     RewritePatternSet tempPatterns(context);
     auto target = std::make_shared<ConversionTarget>(*context);
     target->addLegalDialect<LLVM::LLVMDialect>();
-    auto typeConverter = std::make_shared<LLVMTypeConverter>(context);
+    auto typeConverter = std::make_shared<LLVMTypeConverter>(context, options);
+
+    // Record, for every ptr-typed data layout entry, the LLVM address space
+    // that its memory space lowers to.
+    DenseMap<Attribute, uint64_t> addressSpaceMap;
+    if (DataLayoutOpInterface iface = dyn_cast<DataLayoutOpInterface>(op)) {
+      if (DataLayoutSpecInterface dlSpec = iface.getDataLayoutSpec()) {
+        for (DataLayoutEntryInterface entry : dlSpec.getEntries()) {
+          // Keys may be identifiers rather than types, and values need not be
+          // ptr.spec attributes: skip such entries instead of asserting.
+          auto ptrKey = llvm::dyn_cast_or_null<mlir::ptr::PtrType>(
+              llvm::dyn_cast_if_present<mlir::Type>(entry.getKey()));
+          auto spec = llvm::dyn_cast<mlir::ptr::SpecAttr>(entry.getValue());
+          if (!ptrKey || !spec)
+            continue;
+          addressSpaceMap.insert(
+              {ptrKey.getMemorySpace(), spec.getLlvmAddressSpace()});
+        }
+      }
+      typeConverter->addTypeAttributeConversion(
+          [addressSpaceMap](BaseMemRefType type, Attribute memorySpaceAttr) {
+            // Unmapped memory spaces lower to LLVM address space 0.
+            unsigned llvmAddressSpace = addressSpaceMap.lookup(memorySpaceAttr);
+            return IntegerAttr::get(
+                IntegerType::get(memorySpaceAttr.getContext(), 64),
+                llvmAddressSpace);
+          });
+    }
 
     if (!filterDialects.empty()) {
       // Test mode: Populate only patterns from the specified dialects. Produce
@@ -86,14 +158,19 @@ class ConvertToLLVMPass
       // interface.
       for (std::string &dialectName : filterDialects) {
         Dialect *dialect = context->getLoadedDialect(dialectName);
-        if (!dialect)
-          return emitError(UnknownLoc::get(context))
-                 << "dialect not loaded: " << dialectName << "\n";
+        // Bail out on failure: the code below dereferences these pointers.
+        if (!dialect) {
+          emitError(UnknownLoc::get(context))
+              << "dialect not loaded: " << dialectName << "\n";
+          return signalPassFailure();
+        }
         auto *iface = dyn_cast<ConvertToLLVMPatternInterface>(dialect);
-        if (!iface)
-          return emitError(UnknownLoc::get(context))
-                 << "dialect does not implement ConvertToLLVMPatternInterface: "
-                 << dialectName << "\n";
+        if (!iface) {
+          emitError(UnknownLoc::get(context))
+              << "dialect does not implement ConvertToLLVMPatternInterface: "
+              << dialectName << "\n";
+          return signalPassFailure();
+        }
         iface->populateConvertToLLVMConversionPatterns(*target, *typeConverter,
                                                        tempPatterns);
       }
@@ -110,15 +187,10 @@ class ConvertToLLVMPass
                                                        tempPatterns);
       }
     }
-
     this->patterns =
         std::make_unique<FrozenRewritePatternSet>(std::move(tempPatterns));
     this->target = target;
     this->typeConverter = typeConverter;
-    return success();
-  }
-
-  void runOnOperation() final {
     if (failed(applyPartialConversion(getOperation(), *target, *patterns)))
       signalPassFailure();
   }
@@ -134,3 +206,8 @@ void mlir::registerConvertToLLVMDependentDialectLoading(
 std::unique_ptr<Pass> mlir::createConvertToLLVMPass() {
   return std::make_unique<ConvertToLLVMPass>();
 }
+
+std::unique_ptr<Pass> mlir::createConvertToLLVMPass(unsigned indexBitwidth,
+                                                    bool useBarePtrCallConv) {
+  return std::make_unique<ConvertToLLVMPass>(indexBitwidth, useBarePtrCallConv);
+}
diff --git a/external/llvm-project/mlir/lib/Conversion/GPUToROCDL/CMakeLists.txt b/external/llvm-project/mlir/lib/Conversion/GPUToROCDL/CMakeLists.txt
index 945e3ccdfa87..e8c4e9589142 100644
--- a/external/llvm-project/mlir/lib/Conversion/GPUToROCDL/CMakeLists.txt
+++ b/external/llvm-project/mlir/lib/Conversion/GPUToROCDL/CMakeLists.txt
@@ -20,6 +20,7 @@ add_mlir_conversion_library(MLIRGPUToROCDLTransforms
   MLIRGPUToGPURuntimeTransforms
   MLIRLLVMCommonConversion
   MLIRLLVMDialect
+  MLIRPtrDialect
   MLIRMemRefToLLVM
   MLIRROCDLDialect
   MLIRPass
diff --git a/external/llvm-project/mlir/lib/Conversion/GPUToROCDL/LowerGpuOpsToROCDLOps.cpp b/external/llvm-project/mlir/lib/Conversion/GPUToROCDL/LowerGpuOpsToROCDLOps.cpp
index 36fbf80c8156..e2280d60078a 100644
--- a/external/llvm-project/mlir/lib/Conversion/GPUToROCDL/LowerGpuOpsToROCDLOps.cpp
+++ b/external/llvm-project/mlir/lib/Conversion/GPUToROCDL/LowerGpuOpsToROCDLOps.cpp
@@ -14,21 +14,21 @@
 #include "mlir/Conversion/ControlFlowToLLVM/ControlFlowToLLVM.h"
 #include "mlir/Conversion/GPUToROCDL/GPUToROCDLPass.h"
 #include "mlir/Dialect/Arith/Transforms/Passes.h"
+#include "mlir/Dialect/DLTI/DLTI.h"
+#include "mlir/Dialect/Ptr/IR/PtrAttrs.h"
+#include "mlir/Dialect/Ptr/IR/PtrDialect.h"
+#include "mlir/Dialect/Ptr/IR/PtrTypes.h"
 #include "mlir/Pass/Pass.h"
 #include "mlir/Pass/PassManager.h"
 #include "mlir/Transforms/Passes.h"
 
 #include "mlir/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.h"
-#include "mlir/Conversion/ArithToLLVM/ArithToLLVM.h"
-#include "mlir/Conversion/FuncToLLVM/ConvertFuncToLLVM.h"
 #include "mlir/Conversion/GPUCommon/GPUCommonPass.h"
 #include "mlir/Conversion/LLVMCommon/ConversionTarget.h"
 #include "mlir/Conversion/LLVMCommon/LoweringOptions.h"
 #include "mlir/Conversion/LLVMCommon/Pattern.h"
 #include "mlir/Conversion/LLVMCommon/TypeConverter.h"
-#include "mlir/Conversion/MathToLLVM/MathToLLVM.h"
 #include "mlir/Conversion/MathToROCDL/MathToROCDL.h"
-#include "mlir/Conversion/MemRefToLLVM/MemRefToLLVM.h"
 #include "mlir/Conversion/VectorToLLVM/ConvertVectorToLLVM.h"
 #include "mlir/Dialect/ControlFlow/IR/ControlFlow.h"
 #include "mlir/Dialect/Func/IR/FuncOps.h"
@@ -36,18 +36,14 @@
 #include "mlir/Dialect/GPU/Transforms/Passes.h"
 #include "mlir/Dialect/LLVMIR/LLVMDialect.h"
 #include "mlir/Dialect/LLVMIR/ROCDLDialect.h"
-#include "mlir/Dialect/Math/IR/Math.h"
 #include "mlir/Dialect/MemRef/IR/MemRef.h"
-#include "mlir/Dialect/Vector/IR/VectorOps.h"
 #include "mlir/IR/BuiltinAttributes.h"
 #include "mlir/Pass/Pass.h"
 #include "mlir/Transforms/DialectConversion.h"
 #include "mlir/Transforms/GreedyPatternRewriteDriver.h"
-#include "llvm/Support/FormatVariadic.h"
 
 #include "../GPUCommon/GPUOpsLowering.h"
 #include "../GPUCommon/IndexIntrinsicsOpLowering.h"
-#include "../GPUCommon/OpToFuncCallLowering.h"
 
 namespace mlir {
 #define GEN_PASS_DEF_CONVERTGPUOPSTOROCDLOPS
@@ -55,7 +51,6 @@ namespace mlir {
 } // namespace mlir
 
 #include "mlir/Dialect/LLVMIR/Transforms/Passes.h"
-
 using namespace mlir;
 
 /// Returns true if the given `gpu.func` can be safely called using the bare
@@ -219,6 +214,28 @@ struct LowerGpuOpsToROCDLOpsPass
   void runOnOperation() override {
     gpu::GPUModuleOp m = getOperation();
     MLIRContext *ctx = m.getContext();
+    OpBuilder b(ctx);
+    ArrayAttr targets = m.getTargetsAttr();
+    if (chipset == "infer") {
+      // Inference requires exactly one target, and it must be a ROCDL target;
+      // a failed dyn_cast would otherwise be dereferenced by getChip().
+      if (!targets || targets.size() != 1) {
+        emitError(UnknownLoc::get(ctx), "expected exactly one target "
+                                        "attribute on GPU module");
+        return signalPassFailure();
+      }
+      auto targetAttr = dyn_cast<ROCDL::ROCDLTargetAttr>(targets[0]);
+      if (!targetAttr) {
+        emitError(UnknownLoc::get(ctx), "target is not a ROCDLTargetAttr");
+        return signalPassFailure();
+      }
+      chipset = targetAttr.getChip().str();
+    }
+    FailureOr<amdgpu::Chipset> maybeChipset = amdgpu::Chipset::parse(chipset);
+    if (failed(maybeChipset)) {
+      emitError(UnknownLoc::get(ctx), "Invalid chipset name: " + chipset);
+      return signalPassFailure();
+    }
 
     auto llvmDataLayout = m->getAttrOfType<StringAttr>(
         LLVM::LLVMDialect::getDataLayoutAttrName());
@@ -232,12 +249,6 @@ struct LowerGpuOpsToROCDLOpsPass
                     UnitAttr::get(ctx));
     }
 
-    FailureOr<amdgpu::Chipset> maybeChipset = amdgpu::Chipset::parse(chipset);
-    if (failed(maybeChipset)) {
-      emitError(UnknownLoc::get(ctx), "Invalid chipset name: " + chipset);
-      return signalPassFailure();
-    }
-
     /// Customize the bitwidth used for the device side index computations.
     LowerToLLVMOptions options(
         ctx, DataLayout(cast<DataLayoutOpInterface>(m.getOperation())));
@@ -288,15 +299,35 @@ struct LowerGpuOpsToROCDLOpsPass
 
     RewritePatternSet llvmPatterns(ctx);
 
-    mlir::arith::populateArithToLLVMConversionPatterns(converter, llvmPatterns);
     populateAMDGPUToROCDLConversionPatterns(converter, llvmPatterns,
                                             *maybeChipset);
     populateVectorToLLVMConversionPatterns(converter, llvmPatterns);
-    populateMathToLLVMConversionPatterns(converter, llvmPatterns);
     cf::populateControlFlowToLLVMConversionPatterns(converter, llvmPatterns);
-    populateFuncToLLVMConversionPatterns(converter, llvmPatterns);
-    populateFinalizeMemRefToLLVMConversionPatterns(converter, llvmPatterns);
     populateGpuToROCDLConversionPatterns(converter, llvmPatterns, runtime);
+    // Size, ABI, preferred alignment and index width are set arbitrarily for
+    // now; each SpecAttr only carries the mapping to an LLVM address space.
+    DataLayoutEntryInterface ptrProgramMemoryAttr = DataLayoutEntryAttr::get(
+        b.getType<mlir::ptr::PtrType>(
+            b.getAttr<gpu::AddressSpaceAttr>(gpu::AddressSpace::Workgroup)),
+        b.getAttr<mlir::ptr::SpecAttr>(32, 32, 32, 32, 3));
+    DataLayoutEntryInterface ptrGlobalMemoryAttr = DataLayoutEntryAttr::get(
+        b.getType<mlir::ptr::PtrType>(
+            b.getAttr<gpu::AddressSpaceAttr>(gpu::AddressSpace::Global)),
+        b.getAttr<mlir::ptr::SpecAttr>(32, 32, 32, 32, 1));
+    DataLayoutEntryInterface ptrAllocaMemoryAttr = DataLayoutEntryAttr::get(
+        b.getType<mlir::ptr::PtrType>(
+            b.getAttr<gpu::AddressSpaceAttr>(gpu::AddressSpace::Private)),
+        b.getAttr<mlir::ptr::SpecAttr>(32, 32, 32, 32, 5));
+
+    // Own the entries: an ArrayRef bound to a braced list would dangle.
+    SmallVector<DataLayoutEntryInterface, 3> dltiAddressSpaceAttrs = {
+        ptrAllocaMemoryAttr, ptrGlobalMemoryAttr, ptrProgramMemoryAttr};
+    DataLayoutSpecAttr dltiSpec =
+        b.getAttr<DataLayoutSpecAttr>(dltiAddressSpaceAttrs);
+    if (auto previousDltiSpec = m.getDataLayoutSpec())
+      dltiSpec = dltiSpec.combineWith(previousDltiSpec);
+    m->setAttr(DLTIDialect::kDataLayoutAttrName, dltiSpec);
+
     LLVMConversionTarget target(getContext());
     configureGpuToROCDLConversionLegality(target);
     if (failed(applyPartialConversion(m, target, std::move(llvmPatterns))))
@@ -337,22 +368,12 @@ void mlir::configureGpuToROCDLConversionLegality(ConversionTarget &target) {
                       LLVM::Log2Op, LLVM::PowOp, LLVM::SinOp>();
   // These ops are legal for f32 type.
   target.addDynamicallyLegalOp<LLVM::ExpOp, LLVM::LogOp>([](Operation *op) {
-    return any_of(op->getOperandTypes(),
-                  llvm::IsaPred<Float32Type>);
+    return any_of(op->getOperandTypes(), llvm::IsaPred<Float32Type>);
   });
   // TODO: Remove once we support replacing non-root ops.
   target.addLegalOp<gpu::YieldOp, gpu::GPUModuleOp>();
 }
 
-// namespace mlir
-template <typename OpTy>
-static void populateOpPatterns(LLVMTypeConverter &converter,
-                               RewritePatternSet &patterns, StringRef f32Func,
-                               StringRef f64Func, StringRef f16Func) {
-  patterns.add<ScalarizeVectorOpLowering<OpTy>>(converter);
-  patterns.add<OpToFuncCallLowering<OpTy>>(converter, f32Func, f64Func, f16Func);
-}
-
 void mlir::populateGpuToROCDLConversionPatterns(
     LLVMTypeConverter &converter, RewritePatternSet &patterns,
     mlir::gpu::amd::Runtime runtime) {
diff --git a/external/llvm-project/mlir/lib/Dialect/Ptr/IR/PtrAttrs.cpp b/external/llvm-project/mlir/lib/Dialect/Ptr/IR/PtrAttrs.cpp
index f8ce820d0bcb..9474e2f65e2a 100644
--- a/external/llvm-project/mlir/lib/Dialect/Ptr/IR/PtrAttrs.cpp
+++ b/external/llvm-project/mlir/lib/Dialect/Ptr/IR/PtrAttrs.cpp
@@ -24,7 +24,7 @@ constexpr const static unsigned kBitsInByte = 8;
 
 LogicalResult SpecAttr::verify(function_ref<InFlightDiagnostic()> emitError,
                                uint32_t size, uint32_t abi, uint32_t preferred,
-                               uint32_t index) {
+                               uint32_t index, uint32_t llvmAddressSpace) {
   if (size % kBitsInByte != 0)
     return emitError() << "size entry must be divisible by 8";
   if (abi % kBitsInByte != 0)
diff --git a/mlir/include/mlir/InitRocMLIRDialects.h b/mlir/include/mlir/InitRocMLIRDialects.h
index b4d822b7bb33..2a4727b0f2e5 100644
--- a/mlir/include/mlir/InitRocMLIRDialects.h
+++ b/mlir/include/mlir/InitRocMLIRDialects.h
@@ -15,7 +15,9 @@
 #define MLIR_INITROCMLIRDIALECTS_H_
 
 // rocMLIR includes
+#include "mlir/Dialect/Index/IR/IndexDialect.h"
 #include "mlir/Dialect/MIGraphX/IR/MIGraphX.h"
+#include "mlir/Dialect/Ptr/IR/PtrDialect.h"
 #include "mlir/Dialect/Rock/IR/Rock.h"
 #include "mlir/Dialect/Rock/Transforms/BufferizableOpInterfaceImpl.h"
 #include "mlir/InitRocMLIRTarget.h"
@@ -72,12 +74,14 @@ inline void registerUpstreamDialects(DialectRegistry &registry) {
                   cf::ControlFlowDialect,
                   DLTIDialect,
                   gpu::GPUDialect,
+                  index::IndexDialect,
                   func::FuncDialect,
                   LLVM::LLVMDialect,
                   linalg::LinalgDialect,
                   math::MathDialect,
                   memref::MemRefDialect,
                   scf::SCFDialect,
+                  ptr::PtrDialect,
                   vector::VectorDialect,
                   ROCDL::ROCDLDialect,
                   tensor::TensorDialect,
@@ -97,6 +101,7 @@ inline void registerUpstreamDialects(DialectRegistry &registry) {
   registerConvertFuncToLLVMInterface(registry);
   registerConvertMathToLLVMInterface(registry);
   registerConvertMemRefToLLVMInterface(registry);
+  index::registerConvertIndexToLLVMInterface(registry);
   ub::registerConvertUBToLLVMInterface(registry);
 
   // Register all external models.
diff --git a/mlir/lib/Dialect/Rock/Pipelines/Pipelines.cpp b/mlir/lib/Dialect/Rock/Pipelines/Pipelines.cpp
index adfaff9141ed..781aec5e03bb 100644
--- a/mlir/lib/Dialect/Rock/Pipelines/Pipelines.cpp
+++ b/mlir/lib/Dialect/Rock/Pipelines/Pipelines.cpp
@@ -22,9 +22,11 @@
 
 #include "mlir/Dialect/Rock/Pipelines/Pipelines.h"
 #include "mlir/Conversion/ArithToAMDGPU/ArithToAMDGPU.h"
+#include "mlir/Conversion/ConvertToLLVM/ToLLVMPass.h"
 #include "mlir/Conversion/EmulateFp8ExtTrunc/EmulateFp8ExtTrunc.h"
 #include "mlir/Conversion/LLVMCommon/LoweringOptions.h"
 #include "mlir/Conversion/Passes.h"
+#include "mlir/Conversion/ReconcileUnrealizedCasts/ReconcileUnrealizedCasts.h"
 #include "mlir/Conversion/RockToGPU/RockToGPU.h"
 #include "mlir/Dialect/AMDGPU/Transforms/Passes.h"
 #include "mlir/Dialect/Affine/Passes.h"
@@ -236,23 +238,30 @@ void rock::buildBackendPipeline(OpPassManager &pm,
   // We need to lower affine again, because the expand strided metadata pass
   // adds back affine.apply for memref.subview
   gpuPm.addPass(createLowerAffinePass());
-  gpuPm.addPass(createLowerGpuOpsToROCDLOpsPass(
-      options.chip, /*indexBitwidth=*/kDeriveIndexBitwidthFromDataLayout,
+  GpuROCDLAttachTargetOptions opts;
+  opts.triple = options.triple;
+  opts.chip = options.chip;
+  opts.features = options.features;
+  opts.optLevel = options.optLevel;
+  pm.addPass(createGpuROCDLAttachTarget(opts));
+  auto &gpuPm2 = pm.nest<gpu::GPUModuleOp>();
+  gpuPm2.addPass(createLowerGpuOpsToROCDLOpsPass(
+      /*chipset=*/"infer", /*indexBitwidth=*/kDeriveIndexBitwidthFromDataLayout,
       /*useBarePtrCallConv=*/true, gpu::amd::Runtime::HIP));
   // Ensure we only run passes on LLVM functions inside GPU modules.
-  auto &llvmFuncPm = gpuPm.nest<LLVM::LLVMFuncOp>();
+  auto &llvmFuncPm = gpuPm2.nest<LLVM::LLVMFuncOp>();
   // -canonicalize -cse so that we don't have to crawl through memref
   // descriptors. (Mainly we want the `extractvalue` fold).
   llvmFuncPm.addPass(createCanonicalizerPass());
   llvmFuncPm.addPass(createCSEPass());
   llvmFuncPm.addPass(rock::createRockPrepareLLVMPass());
+  gpuPm2.addPass(
+      createConvertToLLVMPass(kDeriveIndexBitwidthFromDataLayout, true));
+  auto &llvmFuncPm2 = gpuPm2.nest<LLVM::LLVMFuncOp>();
+  llvmFuncPm2.addPass(createCanonicalizerPass());
+  llvmFuncPm2.addPass(createCSEPass());
+  pm.addPass(createReconcileUnrealizedCastsPass());
   if (options.compile) {
-    GpuROCDLAttachTargetOptions opts;
-    opts.triple = options.triple;
-    opts.chip = options.chip;
-    opts.features = options.features;
-    opts.optLevel = options.optLevel;
-    pm.addPass(createGpuROCDLAttachTarget(opts));
     pm.addPass(createGpuModuleToBinaryPass());
     pm.addPass(createRockCheckResidencyPass());
   }
diff --git a/mlir/lib/Dialect/Rock/utility/AmdArchDb.cpp b/mlir/lib/Dialect/Rock/utility/AmdArchDb.cpp
index 6e876e296507..b87bac772f14 100644
--- a/mlir/lib/Dialect/Rock/utility/AmdArchDb.cpp
+++ b/mlir/lib/Dialect/Rock/utility/AmdArchDb.cpp
@@ -58,7 +58,7 @@ static constexpr AmdArchInfo
                   GemmFeatures::atomic_fmax_f32 | GemmFeatures::wmma,
               /*waveSize=*/32, /*maxWavesPerEU*/ 20, /*totalSGPRPerEU*/ 512,
               /*totalVGPRPerEU*/ 1536, /*totalSharedMemPerCU*/ 131072,
-              /*maxSharedMemPerWG*/ 65536, /*numEUPerCU=*/4, /*minNumCU=*/48,
+              /*maxSharedMemPerWG*/ 65536, /*numEUPerCU=*/4, /*minNumCU=*/12,
               /*hasFp8ConversionInstrs=*/false, /*maxNumXCC=*/1);
 
 AmdArchInfo mlir::rock::lookupArchInfo(StringRef arch) {
diff --git a/mlir/lib/Translation/GpuModuleToRocdlir/CMakeLists.txt b/mlir/lib/Translation/GpuModuleToRocdlir/CMakeLists.txt
index f95c66b88f26..fcc71c5d6be2 100644
--- a/mlir/lib/Translation/GpuModuleToRocdlir/CMakeLists.txt
+++ b/mlir/lib/Translation/GpuModuleToRocdlir/CMakeLists.txt
@@ -16,6 +16,7 @@ add_mlir_translation_library(GpuModuleToRocdlirTranslation
   MLIRIR
   MLIRLLVMDialect
   MLIRROCDLDialect
+  MLIRPtrDialect
   MLIRSupport
   MLIRTargetLLVMIRExport
 )
diff --git a/mlir/lib/Translation/GpuModuleToRocdlir/GpuModuleToRocdlir.cpp b/mlir/lib/Translation/GpuModuleToRocdlir/GpuModuleToRocdlir.cpp
index 8933c5df35bb..c7499c6cd02d 100644
--- a/mlir/lib/Translation/GpuModuleToRocdlir/GpuModuleToRocdlir.cpp
+++ b/mlir/lib/Translation/GpuModuleToRocdlir/GpuModuleToRocdlir.cpp
@@ -14,6 +14,7 @@
 
 #include "mlir/Dialect/DLTI/DLTI.h"
 #include "mlir/Dialect/LLVMIR/LLVMDialect.h"
+#include "mlir/Dialect/Ptr/IR/PtrDialect.h"
 #include "mlir/Target/LLVMIR/Dialect/LLVMIR/LLVMToLLVMIRTranslation.h"
 #include "mlir/Translation/GpuModuleToRocdir.h"
 
@@ -56,7 +57,7 @@ void mlir::rock::registerGpuModuleToROCDLIRTranslation() {
         return success();
       },
       [](DialectRegistry &registry) {
-        registry.insert<mlir::gpu::GPUDialect, mlir::DLTIDialect>();
+        registry.insert<mlir::gpu::GPUDialect, mlir::DLTIDialect, ptr::PtrDialect>();
         mlir::registerGPUDialectTranslation(registry);
         mlir::registerROCDLDialectTranslation(registry);
         mlir::registerLLVMDialectTranslation(registry);
diff --git a/mlir/test/rocmlir-driver/pipelines.mlir b/mlir/test/rocmlir-driver/pipelines.mlir
index 192932a1624f..f0697f229379 100644
--- a/mlir/test/rocmlir-driver/pipelines.mlir
+++ b/mlir/test/rocmlir-driver/pipelines.mlir
@@ -50,12 +50,16 @@
 // BINARY-NEXT:convert-arith-to-amdgpu{allow-packed-f16-round-to-zero=true chipset=gfx90a saturate-fp8-truncf=true},
 // BINARY-NEXT:emulate-fp8-ext-trunc,
 // BINARY-NEXT:expand-strided-metadata,
-// BINARY-NEXT:lower-affine,
-// BINARY-NEXT:convert-gpu-to-rocdl{chipset=gfx90a index-bitwidth=0 runtime=HIP use-bare-ptr-memref-call-conv=true},
+// BINARY-NEXT:lower-affine),
+// BINARY-NEXT:rocdl-attach-target{O=3 abi=500 chip=gfx90a correct-sqrt=true daz=false fast=false features= finite-only=false  module= triple=amdgcn-amd-amdhsa unsafe-math=false wave64=true},
+// BINARY-NEXT:gpu.module(convert-gpu-to-rocdl{chipset=infer index-bitwidth=0 runtime=HIP use-bare-ptr-memref-call-conv=true},
 // BINARY-NEXT:llvm.func(canonicalize{  max-iterations=10 max-num-rewrites=-1 region-simplify=normal test-convergence=false top-down=true},
 // BINARY-NEXT:cse,
-// BINARY-NEXT:rock-prepare-llvm)),
-// BINARY-NEXT:rocdl-attach-target{O=3 abi=500 chip=gfx90a correct-sqrt=true daz=false fast=false features= finite-only=false  module= triple=amdgcn-amd-amdhsa unsafe-math=false wave64=true},
+// BINARY-NEXT:rock-prepare-llvm),
+// BINARY-NEXT:convert-to-llvm{ index-bitwidth=0 use-bare-ptr-call-conv=true},
+// BINARY-NEXT:llvm.func(canonicalize{  max-iterations=10 max-num-rewrites=-1 region-simplify=normal test-convergence=false top-down=true},
+// BINARY-NEXT:cse)),
+// BINARY-NEXT:reconcile-unrealized-casts,
 // BINARY-NEXT:gpu-module-to-binary{format=fatbin  opts= toolkit=},
 // BINARY-NEXT:rock-check-residency,
 // BINARY-NEXT:emulate-fp8-ext-trunc)
@@ -69,12 +73,16 @@
 // BINARY_MI300-NEXT:f8E5M2} target-type=f32},
 // BINARY_MI300-NEXT:convert-arith-to-amdgpu{allow-packed-f16-round-to-zero=true chipset=gfx940 saturate-fp8-truncf=true},
 // BINARY_MI300-NEXT:expand-strided-metadata,
-// BINARY_MI300-NEXT:lower-affine,
-// BINARY_MI300-NEXT:convert-gpu-to-rocdl{chipset=gfx940 index-bitwidth=0 runtime=HIP use-bare-ptr-memref-call-conv=true},
+// BINARY_MI300-NEXT:lower-affine),
+// BINARY_MI300-NEXT:rocdl-attach-target{O=3 abi=500 chip=gfx940 correct-sqrt=true daz=false fast=false features= finite-only=false  module= triple=amdgcn-amd-amdhsa unsafe-math=false wave64=true},
+// BINARY_MI300-NEXT:gpu.module(convert-gpu-to-rocdl{chipset=infer index-bitwidth=0 runtime=HIP use-bare-ptr-memref-call-conv=true},
 // BINARY_MI300-NEXT:llvm.func(canonicalize{  max-iterations=10 max-num-rewrites=-1 region-simplify=normal test-convergence=false top-down=true},
 // BINARY_MI300-NEXT:cse,
-// BINARY_MI300-NEXT:rock-prepare-llvm)),
-// BINARY_MI300-NEXT:rocdl-attach-target{O=3 abi=500 chip=gfx940 correct-sqrt=true daz=false fast=false features= finite-only=false  module= triple=amdgcn-amd-amdhsa unsafe-math=false wave64=true},
+// BINARY_MI300-NEXT:rock-prepare-llvm),
+// BINARY_MI300-NEXT:convert-to-llvm{ index-bitwidth=0 use-bare-ptr-call-conv=true},
+// BINARY_MI300-NEXT:llvm.func(canonicalize{  max-iterations=10 max-num-rewrites=-1 region-simplify=normal test-convergence=false top-down=true},
+// BINARY_MI300-NEXT:cse)),
+// BINARY_MI300-NEXT:reconcile-unrealized-casts,
 // BINARY_MI300-NEXT:gpu-module-to-binary{format=fatbin  opts= toolkit=},
 // BINARY_MI300-NEXT:rock-check-residency,
 // BINARY_MI300-NEXT:emulate-fp8-ext-trunc)