diff --git a/backends/gpu/include/tfrt/gpu/wrapper/hip_forwards.h b/backends/gpu/include/tfrt/gpu/wrapper/hip_forwards.h
index dc3e71ff986..b400636a090 100644
--- a/backends/gpu/include/tfrt/gpu/wrapper/hip_forwards.h
+++ b/backends/gpu/include/tfrt/gpu/wrapper/hip_forwards.h
@@ -59,6 +59,8 @@ using ncclComm_t = struct ncclComm *;
 
 // Forward declaration of hipFFT types.
 using hipfftHandle = struct hipfftHandle_t *;
+// Forward declaration of  hiprtcProgram
+using hiprtcProgram = struct _hiprtcProgram *;
 // Enums for corresponding #defines in the hipFFT headers.
 enum hipfftDirection_t : int {
   HIPFFT_FORWARD = -1,
diff --git a/backends/gpu/include/tfrt/gpu/wrapper/hip_stub.h b/backends/gpu/include/tfrt/gpu/wrapper/hip_stub.h
index 3908eed5205..646aa532827 100644
--- a/backends/gpu/include/tfrt/gpu/wrapper/hip_stub.h
+++ b/backends/gpu/include/tfrt/gpu/wrapper/hip_stub.h
@@ -52,6 +52,29 @@ extern "C" {
 
 const char* hipGetErrorName(hipError_t hip_error);
 const char* hipGetErrorString(hipError_t hip_error);
+const char *hiprtcGetErrorString(hiprtcResult result);
+hiprtcResult hiprtcVersion(int* major, int* minor);
+hiprtcResult hiprtcAddNameExpression(hiprtcProgram prog, const char* name_expression);
+hiprtcResult hiprtcCompileProgram(
+                                   hiprtcProgram prog,
+                                   int numOptions,
+                                   const char** options);
+hiprtcResult hiprtcCreateProgram(
+                                  hiprtcProgram* prog,
+                                  const char* src,
+                                  const char* name,
+                                  int numberHeaders,
+                                  char** headers,
+                                  const char** includeNames);
+hiprtcResult hiprtcDestroyProgram(hiprtcProgram* prog);
+hiprtcResult hiprtcGetLoweredName(
+                                  hiprtcProgram prog,
+                                  const char* name_expression,
+                                  const char** lowered_name);
+hiprtcResult hiprtcGetProgramLog(hiprtcProgram prog, char* log);
+hiprtcResult hiprtcGetProgramLogSize(hiprtcProgram prog, size_t* logSizeRet);
+hiprtcResult hiprtcGetCode(hiprtcProgram prog, char* code);
+hiprtcResult hiprtcGetCodeSize(hiprtcProgram prog, size_t* codeSizeRet);
 
 // Enums for corresponding #defines in the HIP headers.
 enum hipDeviceFlags_t {
diff --git a/backends/gpu/include/tfrt/gpu/wrapper/hip_wrapper.h b/backends/gpu/include/tfrt/gpu/wrapper/hip_wrapper.h
index 98a3f0bc0fd..92feac7d949 100644
--- a/backends/gpu/include/tfrt/gpu/wrapper/hip_wrapper.h
+++ b/backends/gpu/include/tfrt/gpu/wrapper/hip_wrapper.h
@@ -29,6 +29,7 @@ namespace gpu {
 namespace wrapper {
 
 raw_ostream& Print(raw_ostream& os, hipError_t error);
+raw_ostream& Print(raw_ostream& os, hiprtcResult result);
 
 namespace internal {
 template <>
@@ -162,6 +163,8 @@ llvm::Error HipMemsetD32Async(CurrentContext current, Pointer<void> dst,
 
 llvm::Expected<OwningModule> HipModuleLoadData(CurrentContext current,
                                                const void* image);
+llvm::Expected<OwningModule> HipRTCModuleLoadData(CurrentContext current,
+                                                  const void* image);
 llvm::Expected<OwningModule> HipModuleLoadDataEx(
     CurrentContext current, const void* image,
     llvm::ArrayRef<hipJitOption> options, llvm::ArrayRef<void*> option_values);
diff --git a/backends/gpu/lib/kernels/driver_kernels.cc b/backends/gpu/lib/kernels/driver_kernels.cc
index 6feb245ceff..8a5d40c204e 100644
--- a/backends/gpu/lib/kernels/driver_kernels.cc
+++ b/backends/gpu/lib/kernels/driver_kernels.cc
@@ -320,7 +320,6 @@ static Expected<GpuModule> GpuModuleLoad(Argument<GpuContext> context,
                             MakeStringError("GPU JIT error log: ", error_log));
   }
 #endif
-
   return GpuModule(context.ValueRef(), std::move(*module));
 }
 
diff --git a/backends/gpu/lib/wrapper/driver_wrapper.cc b/backends/gpu/lib/wrapper/driver_wrapper.cc
index ef4574980da..7e11316d95a 100644
--- a/backends/gpu/lib/wrapper/driver_wrapper.cc
+++ b/backends/gpu/lib/wrapper/driver_wrapper.cc
@@ -852,7 +852,7 @@ llvm::Expected<OwningModule> ModuleLoadData(CurrentContext current,
     case Platform::CUDA:
       return CuModuleLoadData(current, image);
     case Platform::ROCm:
-      return HipModuleLoadData(current, image);
+      return HipRTCModuleLoadData(current, image);
     default:
       return InvalidPlatform(platform);
   }
diff --git a/backends/gpu/lib/wrapper/hip_stub.cc b/backends/gpu/lib/wrapper/hip_stub.cc
index 68a650d3bdf..539738d9339 100644
--- a/backends/gpu/lib/wrapper/hip_stub.cc
+++ b/backends/gpu/lib/wrapper/hip_stub.cc
@@ -61,3 +61,92 @@ const char *hipGetErrorString(hipError_t hip_error) {
   if (!func_ptr) return "FAILED_TO_LOAD_FUNCTION_SYMBOL";
   return func_ptr(hip_error);
 }
+
+const char *hiprtcGetErrorString(hiprtcResult result) {
+  static auto func_ptr =
+      GetFunctionPointer("hiprtcGetErrorString", hiprtcGetErrorString);
+  if (!func_ptr) return "FAILED_TO_LOAD_FUNCTION_SYMBOL";
+  return func_ptr(result);
+}
+
+hiprtcResult hiprtcVersion(int* major, int* minor){
+  static auto func_ptr =
+      GetFunctionPointer("hiprtcVersion", hiprtcVersion);
+  if (!func_ptr) return HIPRTC_ERROR_INTERNAL_ERROR;
+  return func_ptr(major, minor);
+}
+
+hiprtcResult hiprtcAddNameExpression(hiprtcProgram prog, const char* name_expression){
+  static auto func_ptr =
+      GetFunctionPointer("hiprtcAddNameExpression", hiprtcAddNameExpression);
+  if (!func_ptr) return HIPRTC_ERROR_NAME_EXPRESSION_NOT_VALID;
+  return func_ptr(prog, name_expression);
+}
+
+hiprtcResult hiprtcCompileProgram(
+                                   hiprtcProgram prog,
+                                   int numOptions,
+                                   const char** options){
+  static auto func_ptr =
+      GetFunctionPointer("hiprtcCompileProgram", hiprtcCompileProgram);
+  if (!func_ptr) return HIPRTC_ERROR_INTERNAL_ERROR;
+  return func_ptr(prog, numOptions, options);
+}
+
+hiprtcResult hiprtcCreateProgram(
+                                  hiprtcProgram* prog,
+                                  const char* src,
+                                  const char* name,
+                                  int numberHeaders,
+                                  char** headers,
+                                  const char** includeNames){
+  static auto func_ptr =
+      GetFunctionPointer("hiprtcCreateProgram", hiprtcCreateProgram);
+  if (!func_ptr) return HIPRTC_ERROR_PROGRAM_CREATION_FAILURE;
+  return func_ptr(prog, src, name, numberHeaders, headers, includeNames);
+}
+
+hiprtcResult hiprtcDestroyProgram(hiprtcProgram* prog){
+  static auto func_ptr =
+      GetFunctionPointer("hiprtcDestroyProgram", hiprtcDestroyProgram);
+  if (!func_ptr) return HIPRTC_ERROR_INTERNAL_ERROR;
+  return func_ptr(prog);
+}
+
+hiprtcResult hiprtcGetLoweredName(
+                                  hiprtcProgram prog,
+                                  const char* name_expression,
+                                  const char** lowered_name){
+  static auto func_ptr =
+      GetFunctionPointer("hiprtcGetLoweredName", hiprtcGetLoweredName);
+  if (!func_ptr) return HIPRTC_ERROR_INTERNAL_ERROR;
+  return func_ptr(prog, name_expression, lowered_name);
+}
+
+hiprtcResult hiprtcGetProgramLog(hiprtcProgram prog, char* log){
+  static auto func_ptr =
+      GetFunctionPointer("hiprtcGetProgramLog", hiprtcGetProgramLog);
+  if (!func_ptr) return HIPRTC_ERROR_INTERNAL_ERROR;
+  return func_ptr(prog, log);
+}
+
+hiprtcResult hiprtcGetProgramLogSize(hiprtcProgram prog, size_t* logSizeRet){
+  static auto func_ptr =
+      GetFunctionPointer("hiprtcGetProgramLogSize", hiprtcGetProgramLogSize);
+  if (!func_ptr) return HIPRTC_ERROR_INTERNAL_ERROR;
+  return func_ptr(prog, logSizeRet);
+}
+
+hiprtcResult hiprtcGetCode(hiprtcProgram prog, char* code){
+  static auto func_ptr =
+      GetFunctionPointer("hiprtcGetCode", hiprtcGetCode);
+  if (!func_ptr) return HIPRTC_ERROR_INTERNAL_ERROR;
+  return func_ptr(prog, code);
+}
+
+hiprtcResult hiprtcGetCodeSize(hiprtcProgram prog, size_t* codeSizeRet){
+  static auto func_ptr =
+      GetFunctionPointer("hiprtcGetCodeSize", hiprtcGetCodeSize);
+  if (!func_ptr) return HIPRTC_ERROR_INTERNAL_ERROR;
+  return func_ptr(prog, codeSizeRet);
+}
diff --git a/backends/gpu/lib/wrapper/hip_wrapper.cc b/backends/gpu/lib/wrapper/hip_wrapper.cc
index 052c06f1f75..cf37c4ef03e 100644
--- a/backends/gpu/lib/wrapper/hip_wrapper.cc
+++ b/backends/gpu/lib/wrapper/hip_wrapper.cc
@@ -37,6 +37,12 @@ llvm::raw_ostream& Print(llvm::raw_ostream& os, hipError_t error) {
   return os;
 }
 
+llvm::raw_ostream& Print(llvm::raw_ostream& os, hiprtcResult result) {
+  const char* msg = hiprtcGetErrorString(result);
+  if (msg != nullptr) os << "hiprtc Error: (" << msg << ")";
+  return os;
+}
+
 // Convert wrapper types to HIP types.
 static hipDevice_t ToRocm(Device device) { return device.id(Platform::ROCm); }
 
@@ -540,6 +546,45 @@ llvm::Expected<OwningModule> HipModuleLoadData(CurrentContext current,
   return OwningModule(module);
 }
 
+llvm::Expected<OwningModule> HipRTCModuleLoadData(CurrentContext current,
+                                                  const void* image) {
+  CheckHipContext(current);
+  hiprtcProgram prog;
+  //auto img = reinterpret_cast<const char*>(const_cast<void*>(image));
+  auto kernel = static_cast<const char*>(image);
+  std::string kname(kernel);
+  kname += ".cu";
+  RETURN_IF_ERROR(hiprtcCreateProgram(&prog,
+                                      kernel,
+                                      kname.c_str(),
+                                      0,
+                                      nullptr,
+                                      nullptr
+                                     ));
+  hiprtcResult compileResult = hiprtcCompileProgram(prog, 0, nullptr);
+  if (compileResult != HIPRTC_SUCCESS) {
+    size_t logSize;
+    hiprtcGetProgramLogSize(prog, &logSize);
+    if (logSize) {
+      std::string log(logSize, '\0');
+      hiprtcGetProgramLog(prog, &log[0]);
+      MakeStringError(log.c_str());
+    }
+  }
+
+  size_t code_size;
+  RETURN_IF_ERROR(hiprtcGetCodeSize(prog, &code_size));
+  std::vector<char> code(code_size);
+  RETURN_IF_ERROR(hiprtcGetCode(prog, code.data()));
+  RETURN_IF_ERROR(hiprtcDestroyProgram(&prog));
+
+  hipModule_t module;
+  RETURN_IF_ERROR(hipModuleLoadData(&module, code.data()));
+
+  NotifyResourceCreated(ResourceType::kModule, module);
+  return OwningModule(module);
+}
+
 llvm::Expected<OwningModule> HipModuleLoadDataEx(
     CurrentContext current, const void* image,
     llvm::ArrayRef<hipJitOption> options, llvm::ArrayRef<void*> option_values) {
diff --git a/backends/gpu/mlir_tests/rocm/blas.mlir b/backends/gpu/mlir_tests/rocm/blas.mlir
new file mode 100644
index 00000000000..fb2969165a6
--- /dev/null
+++ b/backends/gpu/mlir_tests/rocm/blas.mlir
@@ -0,0 +1,193 @@
+// Copyright 2020 The TensorFlow Runtime Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// RUN: bef_executor_lite %s.bef | FileCheck %s
+
+// CHECK-LABEL: --- Running 'blas_axpy'
+func @blas_axpy() {
+  %ch1 = tfrt.new.chain
+  %ordinal = tfrt.constant.i32 0
+  %device = tfrt_gpu.device.get ROCm, %ordinal
+  %context = tfrt_gpu.context.create %device
+  %allocator = tfrt_gpu.allocator.create %context
+  %stream = tfrt_gpu.stream.create %context
+  %blas = tfrt_gpu.blas.create %context
+
+  %buffer_length = tfrt.constant.i32 4 // [2, 2] = 4 floats
+  %buffer_size_bytes = tfrt.constant.i64 16 // [2, 2] * 4 bytes floats = 16 bytes
+
+  %host_tensor = tfrt_dht.create_uninitialized_tensor.f32.2 [2 : i64, 2 : i64]
+  %host_buffer, %ch2 = tfrt_dht.get_buffer %host_tensor, %ch1
+
+  %ch3 = tfrt_dht.set_tensor_with_constant_values.f32 %host_tensor, %ch2 [1.0 : f32, 2.0 : f32, 3.0 : f32, 4.0 : f32]
+  %gpu_buffer_0 = tfrt_gpu.mem.allocate %allocator, %stream, %buffer_size_bytes, %ch3
+  %ch4 = tfrt_gpu.mem.copy %gpu_buffer_0, %host_buffer, %stream, %ch3 : !tfrt_gpu.buffer, !ht.host_buffer
+
+  %ch5 = tfrt_dht.set_tensor_with_constant_values.f32 %host_tensor, %ch4 [2.0 : f32, 3.0 : f32, 4.0 : f32, 5.0 : f32]
+  %gpu_buffer_1 = tfrt_gpu.mem.allocate %allocator, %stream, %buffer_size_bytes, %ch5
+  %ch6 = tfrt_gpu.mem.copy %gpu_buffer_1, %host_buffer, %stream, %ch5 : !tfrt_gpu.buffer, !ht.host_buffer
+
+  %stride = tfrt.constant.i32 1
+  %alpha = tfrt.constant.f32 1.0
+  %ch7 = tfrt_gpu.blas.axpy %blas, %stream, %buffer_length, %alpha, rocblas_datatype_f32_r, 
+         %gpu_buffer_0, rocblas_datatype_f32_r, %stride, %gpu_buffer_1, rocblas_datatype_f32_r, %stride,
+         rocblas_datatype_f32_r, %ch6
+
+  %ch8 = tfrt_gpu.mem.copy %host_buffer, %gpu_buffer_1, %stream, %ch7 : !ht.host_buffer, !tfrt_gpu.buffer
+  %ch9 = tfrt_gpu.stream.synchronize %stream, %ch8
+  // CHECK: DenseHostTensor dtype = f32, shape = [2, 2]
+  // CHECK-SAME: values = [3.000000e+00, 5.000000e+00, 7.000000e+00, 9.000000e+00]
+  %ch10 = tfrt_dht.print_tensor %host_tensor, %ch9
+
+  tfrt.return
+}
+
+// CHECK-LABEL: --- Running 'blas_gemm'
+func @blas_gemm() {
+  %ch1 = tfrt.new.chain
+  %ordinal = tfrt.constant.i32 0
+  %device = tfrt_gpu.device.get ROCm, %ordinal
+  %context = tfrt_gpu.context.create %device
+  %allocator = tfrt_gpu.allocator.create %context
+  %stream = tfrt_gpu.stream.create %context
+  %blas = tfrt_gpu.blas.create %context
+
+  %buffer_length = tfrt.constant.i32 4 // [2, 2] = 4 floats
+  %buffer_size_bytes = tfrt.constant.i64 16 // [2, 2] * 4 bytes floats = 16 bytes
+
+  %host_tensor = tfrt_dht.create_uninitialized_tensor.f32.2 [2 : i64, 2 : i64]
+  %host_buffer, %ch2 = tfrt_dht.get_buffer %host_tensor, %ch1
+
+  %ch3 = tfrt_dht.set_tensor_with_constant_values.f32 %host_tensor, %ch2 [1.0 : f32, 2.0 : f32, 3.0 : f32, 4.0 : f32]
+  %gpu_buffer_0 = tfrt_gpu.mem.allocate %allocator, %stream, %buffer_size_bytes, %ch3
+  %ch4 = tfrt_gpu.mem.copy %gpu_buffer_0, %host_buffer, %stream, %ch3 : !tfrt_gpu.buffer, !ht.host_buffer
+
+  %ch5 = tfrt_dht.set_tensor_with_constant_values.f32 %host_tensor, %ch4 [2.0 : f32, 3.0 : f32, 4.0 : f32, 5.0 : f32]
+  %gpu_buffer_1 = tfrt_gpu.mem.allocate %allocator, %stream, %buffer_size_bytes, %ch5
+  %ch6 = tfrt_gpu.mem.copy %gpu_buffer_1, %host_buffer, %stream, %ch5 : !tfrt_gpu.buffer, !ht.host_buffer
+
+  %ch7 = tfrt_dht.set_tensor_with_constant_values.f32 %host_tensor, %ch6 [0.0 : f32, 0.0 : f32, 0.0 : f32, 0.0 : f32]
+  %gpu_buffer_2 = tfrt_gpu.mem.allocate %allocator, %stream, %buffer_size_bytes, %ch7
+  %ch8 = tfrt_gpu.mem.copy %gpu_buffer_2, %host_buffer, %stream, %ch7 : !tfrt_gpu.buffer, !ht.host_buffer
+
+  %dim = tfrt.constant.i32 2
+  %alpha = tfrt.constant.f32 1.0
+  %beta = tfrt.constant.f32 1.0
+  %algo = tfrt_gpu.blas.gemm.algo rocblas_gemm_algo_standard
+  %ch9 = tfrt_gpu.blas.gemm %blas, %stream,
+    rocblas_operation_none, rocblas_operation_none, %dim, %dim, %dim,
+    %alpha, %gpu_buffer_0, rocblas_datatype_f32_r, %dim,
+    %gpu_buffer_1, rocblas_datatype_f32_r, %dim, %beta,
+    %gpu_buffer_2, rocblas_datatype_f32_r, %dim,
+    rocblas_datatype_f32_r, %algo, %ch8
+
+  %ch10 = tfrt_gpu.mem.copy %host_buffer, %gpu_buffer_2, %stream, %ch9 : !ht.host_buffer, !tfrt_gpu.buffer
+  %ch11 = tfrt_gpu.stream.synchronize %stream, %ch10
+  // CHECK: DenseHostTensor dtype = f32, shape = [2, 2]
+  // CHECK-SAME: values = [1.100000e+01, 1.600000e+01, 1.900000e+01, 2.800000e+01]
+  %ch12 = tfrt_dht.print_tensor %host_tensor, %ch11
+
+  tfrt.return
+}
+
+// CHECK-LABEL: --- Running 'blas_gemm_batched'
+func @blas_gemm_batched() {
+  %ch1 = tfrt.new.chain
+  %ordinal = tfrt.constant.i32 0
+  %device = tfrt_gpu.device.get ROCm, %ordinal
+  %context = tfrt_gpu.context.create %device
+  %allocator = tfrt_gpu.allocator.create %context
+  %stream = tfrt_gpu.stream.create %context
+  %blas = tfrt_gpu.blas.create %context
+
+  %buffer_length = tfrt.constant.i32 4 // [2, 2] = 4 floats
+  %buffer_size_bytes = tfrt.constant.i64 16 // [2, 2] * 4 bytes floats = 16 bytes
+
+  %host_tensor = tfrt_dht.create_uninitialized_tensor.f32.2 [2 : i64, 2 : i64]
+  %host_buffer, %ch2 = tfrt_dht.get_buffer %host_tensor, %ch1
+
+  %ch3 = tfrt_dht.set_tensor_with_constant_values.f32 %host_tensor, %ch2 [1.0 : f32, 2.0 : f32, 3.0 : f32, 4.0 : f32]
+  %gpu_buffer_0 = tfrt_gpu.mem.allocate %allocator, %stream, %buffer_size_bytes, %ch3
+  %ch4 = tfrt_gpu.mem.copy %gpu_buffer_0, %host_buffer, %stream, %ch3 : !tfrt_gpu.buffer, !ht.host_buffer
+
+  %ch5 = tfrt_dht.set_tensor_with_constant_values.f32 %host_tensor, %ch4 [2.0 : f32, 3.0 : f32, 4.0 : f32, 5.0 : f32]
+  %gpu_buffer_1 = tfrt_gpu.mem.allocate %allocator, %stream, %buffer_size_bytes, %ch5
+  %ch6 = tfrt_gpu.mem.copy %gpu_buffer_1, %host_buffer, %stream, %ch5 : !tfrt_gpu.buffer, !ht.host_buffer
+
+  %ch7 = tfrt_dht.set_tensor_with_constant_values.f32 %host_tensor, %ch6 [0.0 : f32, 0.0 : f32, 0.0 : f32, 0.0 : f32]
+  %gpu_buffer_2 = tfrt_gpu.mem.allocate %allocator, %stream, %buffer_size_bytes, %ch7
+  %ch8 = tfrt_gpu.mem.copy %gpu_buffer_2, %host_buffer, %stream, %ch7 : !tfrt_gpu.buffer, !ht.host_buffer
+
+  %dim = tfrt.constant.i32 2
+  %type = tfrt.constant.i32 0
+  %algo = tfrt_gpu.blas.gemm.algo rocblas_gemm_algo_standard
+  %alpha = tfrt.constant.f32 1.0
+  %beta = tfrt.constant.f32 1.0
+  %batch_count = tfrt.constant.i32 1
+  %stride = tfrt.constant.i64 1
+  %ch9 = tfrt_gpu.blas.gemm.batch %blas, %stream,
+    rocblas_operation_none, rocblas_operation_none, %dim, %dim, %dim,
+    %alpha, %gpu_buffer_0, rocblas_datatype_f32_r, %dim, %stride,
+    %gpu_buffer_1, rocblas_datatype_f32_r, %dim, %stride, %beta,
+    %gpu_buffer_2, rocblas_datatype_f32_r, %dim, %stride, %batch_count,
+    rocblas_datatype_f32_r, %algo, %ch8
+
+  %ch10 = tfrt_gpu.mem.copy %host_buffer, %gpu_buffer_2, %stream, %ch9 : !ht.host_buffer, !tfrt_gpu.buffer
+  %ch11 = tfrt_gpu.stream.synchronize %stream, %ch10
+  // CHECK: DenseHostTensor dtype = f32, shape = [2, 2]
+  // CHECK-SAME: values = [1.100000e+01, 1.600000e+01, 1.900000e+01, 2.800000e+01]
+  %ch12 = tfrt_dht.print_tensor %host_tensor, %ch11
+
+  tfrt.return
+}
+
+// CHECK-LABEL: --- Running 'blas_trsm_batched'
+func @blas_trsm_batched() {
+  %ch0 = tfrt.new.chain
+  %ordinal = tfrt.constant.i32 0
+  %device = tfrt_gpu.device.get ROCm, %ordinal
+  %context = tfrt_gpu.context.create %device
+  %allocator = tfrt_gpu.allocator.create %context
+  %stream = tfrt_gpu.stream.create %context
+  %blas = tfrt_gpu.blas.create %context
+
+  %buffer_size_bytes = tfrt.constant.i64 16 // [2, 2] * 4 bytes floats = 16 bytes
+
+  %host_tensor = tfrt_dht.create_uninitialized_tensor.f32.2 [2 : i64, 2 : i64]
+  %host_buffer, %ch1 = tfrt_dht.get_buffer %host_tensor, %ch0
+
+  %ch2 = tfrt_dht.set_tensor_with_constant_values.f32 %host_tensor, %ch1 [1.0 : f32, 2.0 : f32, 0.0 : f32, 1.0 : f32]
+  %gpu_buffer_0 = tfrt_gpu.mem.allocate %allocator, %stream, %buffer_size_bytes, %ch2
+  %ch3 = tfrt_gpu.mem.copy %gpu_buffer_0, %host_buffer, %stream, %ch2 : !tfrt_gpu.buffer, !ht.host_buffer
+
+  %ch4 = tfrt_dht.set_tensor_with_constant_values.f32 %host_tensor, %ch3 [1.0 : f32, 4.0 : f32, 0.0 : f32, 0.0 : f32]
+  %gpu_buffer_1 = tfrt_gpu.mem.allocate %allocator, %stream, %buffer_size_bytes, %ch4
+  %ch5 = tfrt_gpu.mem.copy %gpu_buffer_1, %host_buffer, %stream, %ch4 : !tfrt_gpu.buffer, !ht.host_buffer
+
+  %dim = tfrt.constant.i32 2
+  %alpha = tfrt.constant.f32 1.0
+  %batch_count = tfrt.constant.i32 1
+  %ch6 = tfrt_gpu.blas.trsm.batch %blas, %stream, rocblas_side_left,
+    rocblas_fill_lower, rocblas_operation_none, rocblas_diagonal_unit, %dim, %dim,
+    rocblas_datatype_f32_r, %alpha, %gpu_buffer_0, %dim, %gpu_buffer_1, %dim, %batch_count,
+    %ch5
+
+  %ch7 = tfrt_gpu.mem.copy %host_buffer, %gpu_buffer_1, %stream, %ch6 : !ht.host_buffer, !tfrt_gpu.buffer
+  %ch8 = tfrt_gpu.stream.synchronize %stream, %ch7
+  // CHECK: DenseHostTensor dtype = f32, shape = [2, 2]
+  // CHECK-SAME: values = [1.000000e+00, 2.000000e+00, 0.000000e+00, 0.000000e+00]
+  %ch9 = tfrt_dht.print_tensor %host_tensor, %ch8
+
+  tfrt.return
+}
diff --git a/backends/gpu/mlir_tests/rocm/module.mlir b/backends/gpu/mlir_tests/rocm/module.mlir
new file mode 100644
index 00000000000..df2d3be1332
--- /dev/null
+++ b/backends/gpu/mlir_tests/rocm/module.mlir
@@ -0,0 +1,94 @@
+// Copyright 2020 The TensorFlow Runtime Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// RUN: bef_executor_lite %s.bef | FileCheck %s
+
+// CHECK-LABEL: --- Running 'function_test'
+func @function_test() {
+  %ordinal = tfrt.constant.i32 0
+  %device = tfrt_gpu.device.get ROCm, %ordinal
+  %context = tfrt_gpu.context.create %device
+
+  // PTX for empty kernel.
+  // Typically module loading should be done at initialization time.
+  %module = tfrt_gpu.module.load %context {
+    data = "extern \"C\" __global__ void Kernel() { return; }\00"
+  }
+
+  %func = tfrt_gpu.module.get_function %module { name = "Kernel" }
+
+  tfrt.return
+}
+
+func @global_test() {
+  %ordinal = tfrt.constant.i32 0
+  %device = tfrt_gpu.device.get ROCm, %ordinal
+  %context = tfrt_gpu.context.create %device
+
+  // PTX for a module with a global symbol.
+  %module = tfrt_gpu.module.load %context {
+    data = "__device__ unsigned int Global[128];\00"
+  }
+
+  %global = tfrt_gpu.module.get_global %module { name = "Global" }
+
+  tfrt.return
+}
+
+// CHECK-LABEL: --- Running 'module_bad_data_test'
+func @module_bad_data_test() {
+  %ch2 = tfrt.new.chain
+  %ordinal = tfrt.constant.i32 0
+  %device = tfrt_gpu.device.get ROCm, %ordinal
+  %context = tfrt_gpu.context.create %device
+
+  // expected-error @+1 {{hipErrorInvalidValue}}
+  %func = tfrt_gpu.module.load %context {
+    data = "invalid image\00"
+  }
+
+  tfrt.return
+}
+
+// CHECK-LABEL: --- Running 'function_bad_name_test'
+func @function_bad_name_test() {
+  %ch2 = tfrt.new.chain
+  %ordinal = tfrt.constant.i32 0
+  %device = tfrt_gpu.device.get ROCm, %ordinal
+  %context = tfrt_gpu.context.create %device
+
+  %module = tfrt_gpu.module.load %context {
+    data = "extern \"C\" __global__ void Kernel() { return; }\00"
+  }
+
+  // expected-error @+1 {{hipErrorNotFound}}
+  %func = tfrt_gpu.module.get_function %module { name = "Foo\00" }
+
+  tfrt.return
+}
+
+// CHECK-LABEL: --- Running 'module_not_null_terminated_test'
+func @module_not_null_terminated_test() {
+  %ch2 = tfrt.new.chain
+  %ordinal = tfrt.constant.i32 0
+  %device = tfrt_gpu.device.get ROCm, %ordinal
+  %context = tfrt_gpu.context.create %device
+
+  // expected-error @+1 {{data attribute must be null-terminated}}
+  %module = tfrt_gpu.module.load %context {
+    data = "not null-terminated"
+  }
+
+  tfrt.return
+}
diff --git a/backends/gpu/tools/stub_codegen/generate.sh b/backends/gpu/tools/stub_codegen/generate.sh
index f9430698bbd..e710ef9e6e6 100755
--- a/backends/gpu/tools/stub_codegen/generate.sh
+++ b/backends/gpu/tools/stub_codegen/generate.sh
@@ -21,7 +21,7 @@
 set -eux
 
 # Build the tools and generate the HIP header.
-bazel build --nocheck_visibility \
+bazel build --nocheck_visibility --config=gcc\
   //backends/gpu/tools/stub_codegen:header_codegen \
   //backends/gpu/tools/stub_codegen:impl_codegen
 
@@ -34,3 +34,10 @@ for API in "hip" "rocblas" "rocsolver" "miopen" "hipfft"; do
    ./bazel-bin/backends/gpu/tools/stub_codegen/impl_codegen \
        $(dirname $0)/$API.json | clang-format > $(printf $SRC_PATH $API)
 done
+
+# Hiprtc is currently rolled up in hip shared library. 
+# It is subject to change in future releases.
+./bazel-bin/backends/gpu/tools/stub_codegen/header_codegen \
+    $(dirname $0)/hiprtc.json | clang-format >> third_party/hip/hip_stub.h.inc
+./bazel-bin/backends/gpu/tools/stub_codegen/impl_codegen \
+    $(dirname $0)/hiprtc.json | clang-format >> third_party/hip/hip_stub.cc.inc
diff --git a/backends/gpu/tools/stub_codegen/hip.json b/backends/gpu/tools/stub_codegen/hip.json
index 677af1ccf36..b1c11b86202 100644
--- a/backends/gpu/tools/stub_codegen/hip.json
+++ b/backends/gpu/tools/stub_codegen/hip.json
@@ -13,6 +13,7 @@
       "hipDeviceAttribute_t",
       "hipJitOption",
       "hipLimit_t",
+      "hipDataType",
       "hipMemoryType",
       "hipMemcpyKind",
       "hipFunction_attribute",
diff --git a/backends/gpu/tools/stub_codegen/hipfft.json b/backends/gpu/tools/stub_codegen/hipfft.json
index 36ab5cd5875..41b6c054af9 100644
--- a/backends/gpu/tools/stub_codegen/hipfft.json
+++ b/backends/gpu/tools/stub_codegen/hipfft.json
@@ -24,5 +24,5 @@
       "hipfftSetStream",
       "hipfftSetWorkArea",
       "hipfftSetAutoAllocation"
-   ],
+   ]
 }
diff --git a/backends/gpu/tools/stub_codegen/hiprtc.json b/backends/gpu/tools/stub_codegen/hiprtc.json
new file mode 100644
index 00000000000..3470f990588
--- /dev/null
+++ b/backends/gpu/tools/stub_codegen/hiprtc.json
@@ -0,0 +1,16 @@
+{
+   "header":"/opt/rocm-5.0.0/include/hip/hiprtc.h",
+   "extra_args":[
+      "-D__HIP_PLATFORM_AMD__",
+      "-I.",
+      "-I/opt/rocm-5.0.0/include/",
+      "-Ithird_party/llvm/llvm-project/clang/lib/Headers",
+      "-Ibazel-genfiles",
+      "-ferror-limit=0"
+   ],
+   "enums":[
+      "hiprtcResult"
+   ],
+   "functions":[
+   ]
+}
diff --git a/backends/gpu/tools/stub_codegen/miopen.json b/backends/gpu/tools/stub_codegen/miopen.json
index 956189c9272..63324e14f40 100644
--- a/backends/gpu/tools/stub_codegen/miopen.json
+++ b/backends/gpu/tools/stub_codegen/miopen.json
@@ -191,5 +191,5 @@
       "miopenSetDropoutDescriptor",
       "miopenDropoutForward",
       "miopenDropoutBackward"
-   ],
+   ]
 }
diff --git a/backends/gpu/tools/stub_codegen/rocblas.json b/backends/gpu/tools/stub_codegen/rocblas.json
index 0ebe21fd122..828f938d9b9 100644
--- a/backends/gpu/tools/stub_codegen/rocblas.json
+++ b/backends/gpu/tools/stub_codegen/rocblas.json
@@ -29,5 +29,5 @@
       "rocblas_dtrsm_batched",
       "rocblas_ctrsm_batched",
       "rocblas_ztrsm_batched"
-   ],
+   ]
 }
diff --git a/backends/gpu/tools/stub_codegen/rocsolver.json b/backends/gpu/tools/stub_codegen/rocsolver.json
index fe4fc6564a2..392a10acd9b 100644
--- a/backends/gpu/tools/stub_codegen/rocsolver.json
+++ b/backends/gpu/tools/stub_codegen/rocsolver.json
@@ -9,5 +9,5 @@
       "rocsolver_dpotrf",
       "rocsolver_cpotrf",
       "rocsolver_zpotrf"
-   ],
+   ]
 }
diff --git a/third_party/hip/hip_stub.h.inc b/third_party/hip/hip_stub.h.inc
index 5e63877c708..119e3dd319b 100644
--- a/third_party/hip/hip_stub.h.inc
+++ b/third_party/hip/hip_stub.h.inc
@@ -453,3 +453,26 @@ hipError_t hipOccupancyMaxPotentialBlockSize(int* gridSize, int* blockSize,
                                              const void* f,
                                              size_t dynSharedMemPerBlk,
                                              int blockSizeLimit);
+
+enum hipDataType {
+  HIP_R_16F = 2,
+  HIP_R_32F = 0,
+  HIP_R_64F = 1,
+  HIP_C_16F = 6,
+  HIP_C_32F = 4,
+  HIP_C_64F = 5,
+};
+enum hiprtcResult {
+  HIPRTC_SUCCESS = 0,
+  HIPRTC_ERROR_OUT_OF_MEMORY = 1,
+  HIPRTC_ERROR_PROGRAM_CREATION_FAILURE = 2,
+  HIPRTC_ERROR_INVALID_INPUT = 3,
+  HIPRTC_ERROR_INVALID_PROGRAM = 4,
+  HIPRTC_ERROR_INVALID_OPTION = 5,
+  HIPRTC_ERROR_COMPILATION = 6,
+  HIPRTC_ERROR_BUILTIN_OPERATION_FAILURE = 7,
+  HIPRTC_ERROR_NO_NAME_EXPRESSIONS_AFTER_COMPILATION = 8,
+  HIPRTC_ERROR_NO_LOWERED_NAMES_BEFORE_COMPILATION = 9,
+  HIPRTC_ERROR_NAME_EXPRESSION_NOT_VALID = 10,
+  HIPRTC_ERROR_INTERNAL_ERROR = 11,
+};
diff --git a/third_party/hip/hipfft_stub.cc.inc b/third_party/hip/hipfft_stub.cc.inc
index ffe4cebd3eb..f0296ba7cd8 100644
--- a/third_party/hip/hipfft_stub.cc.inc
+++ b/third_party/hip/hipfft_stub.cc.inc
@@ -1,3 +1,8 @@
+HIPFFT_EXPORT hipfftResult hipfftCreate(hipfftHandle* plan) {
+  return DynamicCall<decltype(hipfftCreate), &hipfftCreate>("hipfftCreate",
+                                                            plan);
+}
+
 HIPFFT_EXPORT hipfftResult hipfftMakePlanMany64(
     hipfftHandle plan, int rank, long long int* n, long long int* inembed,
     long long int istride, long long int idist, long long int* onembed,
@@ -13,17 +18,19 @@ HIPFFT_EXPORT hipfftResult hipfftGetSize(hipfftHandle plan, size_t* workSize) {
                                                               plan, workSize);
 }
 
+HIPFFT_EXPORT hipfftResult hipfftSetAutoAllocation(hipfftHandle plan,
+                                                   int autoAllocate) {
+  return DynamicCall<decltype(hipfftSetAutoAllocation),
+                     &hipfftSetAutoAllocation>("hipfftSetAutoAllocation", plan,
+                                               autoAllocate);
+}
+
 HIPFFT_EXPORT hipfftResult hipfftSetWorkArea(hipfftHandle plan,
                                              void* workArea) {
   return DynamicCall<decltype(hipfftSetWorkArea), &hipfftSetWorkArea>(
       "hipfftSetWorkArea", plan, workArea);
 }
 
-HIPFFT_EXPORT hipfftResult hipfftSetAutoAllocation(hipfftHandle plan, int enable) {
-  return DynamicCall<decltype(hipfftSetAutoAllocation), &hipfftSetAutoAllocation>(
-      "hipfftSetAutoAllocation", plan, enable);
-}
-
 HIPFFT_EXPORT hipfftResult hipfftExecC2C(hipfftHandle plan,
                                          hipfftComplex* idata,
                                          hipfftComplex* odata, int direction) {
@@ -72,11 +79,6 @@ HIPFFT_EXPORT hipfftResult hipfftSetStream(hipfftHandle plan,
       "hipfftSetStream", plan, stream);
 }
 
-HIPFFT_EXPORT hipfftResult hipfftCreate(hipfftHandle* plan) {
-  return DynamicCall<decltype(hipfftCreate), &hipfftCreate>("hipfftCreate",
-                                                            plan);
-}
-
 HIPFFT_EXPORT hipfftResult hipfftDestroy(hipfftHandle plan) {
   return DynamicCall<decltype(hipfftDestroy), &hipfftDestroy>("hipfftDestroy",
                                                               plan);
diff --git a/third_party/hip/hipfft_stub.h.inc b/third_party/hip/hipfft_stub.h.inc
index fee4121a479..cc70b112986 100644
--- a/third_party/hip/hipfft_stub.h.inc
+++ b/third_party/hip/hipfft_stub.h.inc
@@ -32,17 +32,20 @@ typedef enum hipfftLibraryPropertyType_t {
   HIPFFT_PATCH_LEVEL,
 } hipfftLibraryPropertyType;
 
+HIPFFT_EXPORT hipfftResult hipfftCreate(hipfftHandle* plan);
+
 HIPFFT_EXPORT hipfftResult hipfftMakePlanMany64(
     hipfftHandle plan, int rank, long long int* n, long long int* inembed,
     long long int istride, long long int idist, long long int* onembed,
     long long int ostride, long long int odist, hipfftType type,
-    long long int batch, size_t* work_size);
+    long long int batch, size_t* workSize);
 
 HIPFFT_EXPORT hipfftResult hipfftGetSize(hipfftHandle plan, size_t* workSize);
 
-HIPFFT_EXPORT hipfftResult hipfftSetWorkArea(hipfftHandle plan, void* workArea);
+HIPFFT_EXPORT hipfftResult hipfftSetAutoAllocation(hipfftHandle plan,
+                                                   int autoAllocate);
 
-HIPFFT_EXPORT hipfftResult hipfftSetAutoAllocation(hipfftHandle plan, int enable);
+HIPFFT_EXPORT hipfftResult hipfftSetWorkArea(hipfftHandle plan, void* workArea);
 
 HIPFFT_EXPORT hipfftResult hipfftExecC2C(hipfftHandle plan,
                                          hipfftComplex* idata,
@@ -71,8 +74,6 @@ HIPFFT_EXPORT hipfftResult hipfftExecZ2D(hipfftHandle plan,
 HIPFFT_EXPORT hipfftResult hipfftSetStream(hipfftHandle plan,
                                            hipStream_t stream);
 
-HIPFFT_EXPORT hipfftResult hipfftCreate(hipfftHandle* plan);
-
 HIPFFT_EXPORT hipfftResult hipfftDestroy(hipfftHandle plan);
 
 HIPFFT_EXPORT hipfftResult hipfftGetProperty(hipfftLibraryPropertyType type,