From e4d9e7b37743a3fef72e97b54e79f8f4615369ab Mon Sep 17 00:00:00 2001
From: Daine McNiven <51674140+daineAMD@users.noreply.github.com>
Date: Fri, 18 Sep 2020 16:27:48 -0600
Subject: [PATCH] Missing types master (#258)

---
 clients/common/cblas_interface.cpp            | 120 ++++++
 clients/gtest/gemm_ex_gtest.cpp               | 124 ++++++-
 clients/include/cblas_interface.h             |  12 +-
 clients/include/testing_gemm_batched.hpp      |   6 +-
 clients/include/testing_gemm_batched_ex.hpp   | 342 +++++++++++-------
 clients/include/testing_gemm_ex.hpp           | 312 ++++++++++------
 .../testing_gemm_strided_batched_ex.hpp       | 312 ++++++++++------
 clients/include/testing_getrs.hpp             |   2 +-
 clients/include/testing_getrs_batched.hpp     |   2 +-
 .../include/testing_getrs_strided_batched.hpp |   2 +-
 clients/include/testing_tpsv.hpp              |  15 +-
 clients/include/testing_tpsv_batched.hpp      |  14 +-
 .../include/testing_tpsv_strided_batched.hpp  |   2 +-
 clients/include/testing_trsv.hpp              |   4 +-
 clients/include/testing_trsv_batched.hpp      |  15 +-
 .../include/testing_trsv_strided_batched.hpp  |   3 +-
 clients/include/utility.h                     |  17 +
 library/src/hcc_detail/hipblas.cpp            |  12 +
 library/src/nvcc_detail/hipblas.cpp           |   6 +
 19 files changed, 945 insertions(+), 377 deletions(-)

diff --git a/clients/common/cblas_interface.cpp b/clients/common/cblas_interface.cpp
index 978433869..269063276 100644
--- a/clients/common/cblas_interface.cpp
+++ b/clients/common/cblas_interface.cpp
@@ -2085,6 +2085,68 @@ void cblas_gemm(hipblasOperation_t transA,
     }
 }
 
+template <>
+void cblas_gemm(hipblasOperation_t transA,
+                hipblasOperation_t transB,
+                int                m,
+                int                n,
+                int                k,
+                float              alpha_float,
+                hipblasHalf*       A,
+                int                lda,
+                hipblasHalf*       B,
+                int                ldb,
+                float              beta_float,
+                hipblasHalf*       C,
+                int                ldc)
+{
+    // cblas does not support hipblasHalf, so convert to higher-precision float.
+    // This gives a more precise result, which is acceptable for testing.
+
+    int sizeA = transA == HIPBLAS_OP_N ? k * lda : m * lda;
+    int sizeB = transB == HIPBLAS_OP_N ? n * ldb : k * ldb;
+    int sizeC = n * ldc;
+
+    std::unique_ptr<float[]> A_float(new float[sizeA]());
+    std::unique_ptr<float[]> B_float(new float[sizeB]());
+    std::unique_ptr<float[]> C_float(new float[sizeC]());
+
+    for(int i = 0; i < sizeA; i++)
+    {
+        A_float[i] = half_to_float(A[i]);
+    }
+    for(int i = 0; i < sizeB; i++)
+    {
+        B_float[i] = half_to_float(B[i]);
+    }
+    for(int i = 0; i < sizeC; i++)
+    {
+        C_float[i] = half_to_float(C[i]);
+    }
+
+    // just directly cast, since transA, transB are integers in the enum
+    // printf("transA: rocblas =%d, cblas=%d\n", transA, (CBLAS_TRANSPOSE)transA );
+    cblas_sgemm(CblasColMajor,
+                (CBLAS_TRANSPOSE)transA,
+                (CBLAS_TRANSPOSE)transB,
+                m,
+                n,
+                k,
+                alpha_float,
+                const_cast<const float*>(A_float.get()),
+                lda,
+                const_cast<const float*>(B_float.get()),
+                ldb,
+                beta_float,
+                static_cast<float*>(C_float.get()),
+                ldc);
+
+    for(int i = 0; i < sizeC; i++)
+    {
+        C[i] = float_to_half(C_float[i]);
+    }
+}
+
 template <>
 void cblas_gemm(hipblasOperation_t transA,
                 hipblasOperation_t transB,
@@ -2212,6 +2274,64 @@ void cblas_gemm(hipblasOperation_t transA,
                 ldc);
 }
 
+template <>
+void cblas_gemm(hipblasOperation_t transA,
+                hipblasOperation_t transB,
+                int                m,
+                int                n,
+                int                k,
+                int32_t            alpha,
+                int8_t*            A,
+                int                lda,
+                int8_t*            B,
+                int                ldb,
+                int32_t            beta,
+                int32_t*           C,
+                int                ldc)
+{
+    double alpha_double = static_cast<double>(alpha);
+    double beta_double  = static_cast<double>(beta);
+
+    size_t const sizeA = ((transA == HIPBLAS_OP_N) ? k : m) * size_t(lda);
+    size_t const sizeB = ((transB == HIPBLAS_OP_N) ? n : k) * size_t(ldb);
+    size_t const sizeC = n * size_t(ldc);
+
+    std::unique_ptr<double[]> A_double(new double[sizeA]());
+    std::unique_ptr<double[]> B_double(new double[sizeB]());
+    std::unique_ptr<double[]> C_double(new double[sizeC]());
+
+    for(size_t i = 0; i < sizeA; i++)
+    {
+        A_double[i] = static_cast<double>(A[i]);
+    }
+    for(size_t i = 0; i < sizeB; i++)
+    {
+        B_double[i] = static_cast<double>(B[i]);
+    }
+    for(size_t i = 0; i < sizeC; i++)
+    {
+        C_double[i] = static_cast<double>(C[i]);
+    }
+
+    cblas_dgemm(CblasColMajor,
+                static_cast<CBLAS_TRANSPOSE>(transA),
+                static_cast<CBLAS_TRANSPOSE>(transB),
+                m,
+                n,
+                k,
+                alpha_double,
+                const_cast<const double*>(A_double.get()),
+                lda,
+                const_cast<const double*>(B_double.get()),
+                ldb,
+                beta_double,
+                static_cast<double*>(C_double.get()),
+                ldc);
+
+    for(size_t i = 0; i < sizeC; i++)
+        C[i] = static_cast<int32_t>(C_double[i]);
+}
+
 // hemm
 template <>
 void cblas_hemm(hipblasSideMode_t side,
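Both specializations above use the same promote-compute-demote pattern: widen the inputs to a type cblas can consume, run the standard sgemm/dgemm, and narrow the result back. The sketch below restates the idea outside the test harness (the function name is illustrative, not part of the patch); it relies on the fact that double's 53-bit mantissa represents every int8*int8 product, and any realistically sized sum of them, exactly.

    #include <cstdint>
    #include <vector>

    // Reference int8 dot product computed in double, then narrowed back to
    // int32 -- the same trick the int8 cblas_gemm specialization above uses.
    int32_t ref_dot_int8(const std::vector<int8_t>& x, const std::vector<int8_t>& y)
    {
        double acc = 0.0;
        for(size_t i = 0; i < x.size() && i < y.size(); i++)
            acc += static_cast<double>(x[i]) * static_cast<double>(y[i]);
        return static_cast<int32_t>(acc);
    }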
diff --git a/clients/gtest/gemm_ex_gtest.cpp b/clients/gtest/gemm_ex_gtest.cpp
index 42bf9f04d..1a89a9abe 100644
--- a/clients/gtest/gemm_ex_gtest.cpp
+++ b/clients/gtest/gemm_ex_gtest.cpp
@@ -33,6 +33,17 @@ typedef std::tuple<vector<int>, vector<double>, vector<char>, vector<hipblasDatatype_t>>
 
+const vector<vector<int>> int8_matrix_size_range = {
+
+    {  4,  4,  4,  4,  4,  4},
+    {  8,  8,  8,  8,  8,  8},
+    { 12, 12, 12, 12, 12, 12},
+    { 16, 16, 16, 16, 16, 16},
+    { 20, 20, 20, 20, 20, 20},
+    {  8,  4,  4,  8,  8,  8},
+    {  8, 12, 12, 12, 12, 12},
+};
+
 const vector<vector<int>> small_matrix_size_range = {
     {  1,  1,  1,  1,  1,  1},
     {  1,  2,  3,  4,  5,  6},
@@ -169,6 +180,12 @@ const vector<vector<double>> small_alpha_beta_range = {
 const vector<vector<double>> full_alpha_beta_range = {
     {1.0, 0.0}, {-1.0, -1.0}, {2.0, 1.0}, {0.0, 1.0}};
 
+// For CUDA versions < 10.0, only alpha and beta values of 1 or 0 are
+// supported.
+const vector<vector<double>> alpha_beta_range_int8 = {
+    {1.0, 1.0},
+    {1.0, 0.0},
+};
+
 // vector of vector, each pair is a {transA, transB};
 // add/delete this list in pairs, like {'N', 'T'}
 // for single/double precision, 'C' (conjTranspose) will be downgraded to 'T' (transpose) internally in
 // sgemm/dgemm,
@@ -201,6 +218,25 @@ const vector<vector<hipblasDatatype_t>> precision_double = {{HIPBLAS_R_64F,
                                                              HIPBLAS_R_64F,
                                                              HIPBLAS_R_64F}};
 
+const vector<vector<hipblasDatatype_t>> precision_single_complex = {{HIPBLAS_C_32F,
+                                                                     HIPBLAS_C_32F,
+                                                                     HIPBLAS_C_32F,
+                                                                     HIPBLAS_C_32F,
+                                                                     HIPBLAS_C_32F}};
+
+const vector<vector<hipblasDatatype_t>> precision_double_complex = {{HIPBLAS_C_64F,
+                                                                     HIPBLAS_C_64F,
+                                                                     HIPBLAS_C_64F,
+                                                                     HIPBLAS_C_64F,
+                                                                     HIPBLAS_C_64F}};
+
+const vector<vector<hipblasDatatype_t>> precision_int8 = {{HIPBLAS_R_8I,
+                                                           HIPBLAS_R_8I,
+                                                           HIPBLAS_R_32I,
+                                                           HIPBLAS_R_32I,
+                                                           HIPBLAS_R_32I}};
+
+
 const vector<vector<hipblasDatatype_t>> precision_type_range = {{HIPBLAS_R_16F,
                                                                  HIPBLAS_R_16F,
                                                                  HIPBLAS_R_16F,
@@ -220,7 +256,22 @@ const vector<vector<hipblasDatatype_t>> precision_type_range = {{HIPBLAS_R_16F,
                                                                  HIPBLAS_R_64F,
                                                                  HIPBLAS_R_64F,
                                                                  HIPBLAS_R_64F,
-                                                                 HIPBLAS_R_64F}};
+                                                                 HIPBLAS_R_64F},
+                                                                {HIPBLAS_C_32F,
+                                                                 HIPBLAS_C_32F,
+                                                                 HIPBLAS_C_32F,
+                                                                 HIPBLAS_C_32F,
+                                                                 HIPBLAS_C_32F},
+                                                                {HIPBLAS_C_64F,
+                                                                 HIPBLAS_C_64F,
+                                                                 HIPBLAS_C_64F,
+                                                                 HIPBLAS_C_64F,
+                                                                 HIPBLAS_C_64F},
+                                                                {HIPBLAS_R_8I,
+                                                                 HIPBLAS_R_8I,
+                                                                 HIPBLAS_R_32I,
+                                                                 HIPBLAS_R_32I,
+                                                                 HIPBLAS_R_32I}};
 
 const int batch_count_range[]       = {-1, 1, 5};
 const int batch_count_range_small[] = {1};
@@ -364,13 +415,19 @@ TEST_P(parameterized_gemm_batched_ex, standard_batched)
     {
         EXPECT_EQ(HIPBLAS_STATUS_INVALID_VALUE, status);
     }
-    else
+    else if(status == HIPBLAS_STATUS_ARCH_MISMATCH)
     {
         // Only available in cuda cc >= 5.0.
         // If we want we can change this to call query_device_property() and
         // call this only if cc < 5.0 on a CUDA device, else fail.
         EXPECT_EQ(HIPBLAS_STATUS_ARCH_MISMATCH, status);
     }
+    else
+    {
+        // cublas/rocblas do not have identical support
+        // (i.e. cublas doesn't support i8/i32 here)
+        EXPECT_EQ(HIPBLAS_STATUS_NOT_SUPPORTED, status);
+    }
 }
 
@@ -400,13 +457,19 @@ TEST_P(parameterized_gemm_batched_ex, standard_strided_batched)
     {
         EXPECT_EQ(HIPBLAS_STATUS_INVALID_VALUE, status);
     }
-    else
+    else if(status == HIPBLAS_STATUS_ARCH_MISMATCH)
     {
         // Only available in cuda cc >= 5.0.
         // If we want we can change this to call query_device_property() and
         // call this only if cc < 5.0 on a CUDA device, else fail.
         EXPECT_EQ(HIPBLAS_STATUS_ARCH_MISMATCH, status);
     }
+    else
+    {
+        // cublas/rocblas do not have identical support
+        // (i.e. cublas doesn't support i8/i32 here)
+        EXPECT_EQ(HIPBLAS_STATUS_NOT_SUPPORTED, status);
+    }
 }
 
@@ -502,6 +565,34 @@ INSTANTIATE_TEST_CASE_P(quick_blas_ex_small_double,
                                 ValuesIn(precision_double),
                                 ValuesIn(batch_count_range_small),
                                 ValuesIn(is_fortran)));
+
+INSTANTIATE_TEST_CASE_P(quick_blas_ex_small_single_complex,
+                        parameterized_gemm_ex,
+                        Combine(ValuesIn(small_matrix_size_range),
+                                ValuesIn(alpha_beta_range),
+                                ValuesIn(transA_transB_range),
+                                ValuesIn(precision_single_complex),
+                                ValuesIn(batch_count_range_small),
+                                ValuesIn(is_fortran)));
+
+INSTANTIATE_TEST_CASE_P(quick_blas_ex_small_double_complex,
+                        parameterized_gemm_ex,
+                        Combine(ValuesIn(small_matrix_size_range),
+                                ValuesIn(alpha_beta_range),
+                                ValuesIn(transA_transB_range),
+                                ValuesIn(precision_double_complex),
+                                ValuesIn(batch_count_range_small),
+                                ValuesIn(is_fortran)));
+
+INSTANTIATE_TEST_CASE_P(quick_blas_ex_small_int8,
+                        parameterized_gemm_ex,
+                        Combine(ValuesIn(int8_matrix_size_range),
+                                ValuesIn(alpha_beta_range_int8),
+                                ValuesIn(transA_transB_range),
+                                ValuesIn(precision_int8),
+                                ValuesIn(batch_count_range_small),
+                                ValuesIn(is_fortran)));
+
 //----medium
 INSTANTIATE_TEST_CASE_P(pre_checkin_blas_ex_medium_hpa_half,
                         parameterized_gemm_ex,
@@ -575,3 +666,30 @@ INSTANTIATE_TEST_CASE_P(quick_blas_batched_ex_small_double,
                                 ValuesIn(precision_double),
                                 ValuesIn(batch_count_range),
                                 ValuesIn(is_fortran)));
+
+INSTANTIATE_TEST_CASE_P(quick_blas_batched_ex_small_single_complex,
+                        parameterized_gemm_batched_ex,
+                        Combine(ValuesIn(small_matrix_size_range),
+                                ValuesIn(alpha_beta_range),
+                                ValuesIn(transA_transB_range),
+                                ValuesIn(precision_single_complex),
+                                ValuesIn(batch_count_range),
+                                ValuesIn(is_fortran)));
+
+INSTANTIATE_TEST_CASE_P(quick_blas_batched_ex_small_double_complex,
+                        parameterized_gemm_batched_ex,
+                        Combine(ValuesIn(small_matrix_size_range),
+                                ValuesIn(alpha_beta_range),
+                                ValuesIn(transA_transB_range),
+                                ValuesIn(precision_double_complex),
+                                ValuesIn(batch_count_range),
+                                ValuesIn(is_fortran)));
+
+INSTANTIATE_TEST_CASE_P(quick_blas_batched_ex_small_int8,
+                        parameterized_gemm_batched_ex,
+                        Combine(ValuesIn(int8_matrix_size_range),
+                                ValuesIn(alpha_beta_range_int8),
+                                ValuesIn(transA_transB_range),
+                                ValuesIn(precision_int8),
+                                ValuesIn(batch_count_range),
+                                ValuesIn(is_fortran)));
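For reference, the int8 combination these new instantiations exercise maps onto a hipblasGemmEx call shaped roughly as below -- a minimal sketch assuming the hipblasDatatype_t-based signature this version of hipBLAS exposes, with device buffers dA, dB (int8_t) and dC (int32_t) already allocated:

    int32_t alpha = 1, beta = 0; // only 0/1 are safe on CUDA < 10.0 (see comment above)
    hipblasStatus_t status = hipblasGemmEx(handle,
                                           HIPBLAS_OP_N, HIPBLAS_OP_N,
                                           m, n, k,
                                           &alpha,
                                           dA, HIPBLAS_R_8I, lda,
                                           dB, HIPBLAS_R_8I, ldb,
                                           &beta,
                                           dC, HIPBLAS_R_32I, ldc,
                                           HIPBLAS_R_32I,
                                           HIPBLAS_GEMM_DEFAULT);

Note that int8_matrix_size_range appears to keep K and the leading dimensions at multiples of 4, which matches the packing handled by hipblas_packInt8 further down in this patch.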
diff --git a/clients/include/cblas_interface.h b/clients/include/cblas_interface.h
index 9382491e6..bb7cd7021 100644
--- a/clients/include/cblas_interface.h
+++ b/clients/include/cblas_interface.h
@@ -312,19 +312,19 @@ void cblas_geam(hipblasOperation_t transa,
                 int                ldc);
 
 // gemm
-template <typename T>
+template <typename Ti, typename To, typename Tc>
 void cblas_gemm(hipblasOperation_t transA,
                 hipblasOperation_t transB,
                 int                m,
                 int                n,
                 int                k,
-                T                  alpha,
-                T*                 A,
+                Tc                 alpha,
+                Ti*                A,
                 int                lda,
-                T*                 B,
+                Ti*                B,
                 int                ldb,
-                T                  beta,
-                T*                 C,
+                Tc                 beta,
+                To*                C,
                 int                ldc);
 
 // hemm
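With the widened template, the input, output, and scalar types are independent: Ti is the matrix-input type, To the output type, and Tc the alpha/beta type. A tiny self-contained illustration of the combination the int8 tests rely on (the buffer values are arbitrary; the declaration comes from the header above):

    #include <cstdint>
    #include "cblas_interface.h"

    void example_int8_reference_gemm()
    {
        int8_t  a = 3, b = -2;
        int32_t c = 0;
        // Ti = int8_t, To = int32_t, Tc = int32_t -- the precision_int8 combination.
        cblas_gemm<int8_t, int32_t, int32_t>(
            HIPBLAS_OP_N, HIPBLAS_OP_N, 1, 1, 1, int32_t(1), &a, 1, &b, 1, int32_t(0), &c, 1);
        // c is now -6
    }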
diff --git a/clients/include/testing_gemm_batched.hpp b/clients/include/testing_gemm_batched.hpp
index 51b042307..3ee6b8272 100644
--- a/clients/include/testing_gemm_batched.hpp
+++ b/clients/include/testing_gemm_batched.hpp
@@ -220,12 +220,12 @@ hipblasStatus_t testing_GemmBatched(Arguments argus)
                    N,
                    K,
                    h_alpha,
-                   hA_array[i],
+                   (T*)hA_array[i],
                    lda,
-                   hB_array[i],
+                   (T*)hB_array[i],
                    ldb,
                    h_beta,
-                   hC_copy_array[i],
+                   (T*)hC_copy_array[i],
                    ldc);
     }
diff --git a/clients/include/testing_gemm_batched_ex.hpp b/clients/include/testing_gemm_batched_ex.hpp
index b1c696e98..5482a1122 100644
--- a/clients/include/testing_gemm_batched_ex.hpp
+++ b/clients/include/testing_gemm_batched_ex.hpp
@@ -24,7 +24,7 @@ using namespace std;
 
 /* ============================================================================================ */
 
-template <typename Td, typename Tc>
+template <typename Ta, typename Tb, typename Tc, typename Tex>
 hipblasStatus_t testing_gemm_batched_ex_template(hipblasOperation_t transA,
                                                  hipblasOperation_t transB,
@@ -50,46 +50,38 @@ hipblasStatus_t testing_gemm_batched_ex_template(hipblasOperation_t transA,
     uint32_t solution_index = 0;
     uint32_t flags          = 0;
 
-    Td h_alpha_Td;
-    Td h_beta_Td;
+    Tex h_alpha_Tc;
+    Tex h_beta_Tc;
 
-    if(is_same<Td, hipblasHalf>::value)
+    if(is_same<Tex, hipblasHalf>::value)
     {
-        h_alpha_Td = float_to_half(alpha_float);
-        h_beta_Td  = float_to_half(beta_float);
+        h_alpha_Tc = float_to_half(alpha_float);
+        h_beta_Tc  = float_to_half(beta_float);
     }
-    else if(is_same<Td, float>::value)
+    else if(is_same<Tex, float>::value)
     {
-        h_alpha_Td = static_cast<Td>(alpha_float);
-        h_beta_Td  = static_cast<Td>(beta_float);
+        h_alpha_Tc = static_cast<Tex>(alpha_float);
+        h_beta_Tc  = static_cast<Tex>(beta_float);
     }
-    else if(is_same<Td, double>::value)
+    else if(is_same<Tex, double>::value)
     {
-        h_alpha_Td = static_cast<Td>(alpha_float);
-        h_beta_Td  = static_cast<Td>(beta_float);
+        h_alpha_Tc = static_cast<Tex>(alpha_float);
+        h_beta_Tc  = static_cast<Tex>(beta_float);
     }
-    else
+    else if(is_same<Tex, hipblasComplex>::value)
     {
-        return HIPBLAS_STATUS_NOT_SUPPORTED;
+        h_alpha_Tc = static_cast<Tex>(alpha_float);
+        h_beta_Tc  = static_cast<Tex>(beta_float);
     }
-
-    Tc h_alpha_Tc;
-    Tc h_beta_Tc;
-
-    if(is_same<Tc, hipblasHalf>::value)
+    else if(is_same<Tex, hipblasDoubleComplex>::value)
     {
-        h_alpha_Tc = float_to_half(alpha_float);
-        h_beta_Tc  = float_to_half(beta_float);
+        h_alpha_Tc = static_cast<Tex>(alpha_float);
+        h_beta_Tc  = static_cast<Tex>(beta_float);
     }
-    else if(is_same<Tc, float>::value)
+    else if(is_same<Tex, int32_t>::value)
     {
-        h_alpha_Tc = static_cast<Tc>(alpha_float);
-        h_beta_Tc  = static_cast<Tc>(beta_float);
-    }
-    else if(is_same<Tc, double>::value)
-    {
-        h_alpha_Tc = static_cast<Tc>(alpha_float);
-        h_beta_Tc  = static_cast<Tc>(beta_float);
+        h_alpha_Tc = static_cast<Tex>(alpha_float);
+        h_beta_Tc  = static_cast<Tex>(beta_float);
     }
     else
     {
@@ -111,15 +103,15 @@ hipblasStatus_t testing_gemm_batched_ex_template(hipblasOperation_t transA,
     const size_t size_B = static_cast<size_t>(ldb) * static_cast<size_t>(B_col);
     const size_t size_C = static_cast<size_t>(ldc) * static_cast<size_t>(N);
 
-    device_vector<Td*> dA(batch_count);
-    device_vector<Td*> dB(batch_count);
-    device_vector<Td*> dC(batch_count);
-    device_vector<Tc>  d_alpha_Tc(1);
-    device_vector<Tc>  d_beta_Tc(1);
+    device_vector<Ta*> dA(batch_count);
+    device_vector<Tb*> dB(batch_count);
+    device_vector<Tc*> dC(batch_count);
+    device_vector<Tex> d_alpha_Tc(1);
+    device_vector<Tex> d_beta_Tc(1);
 
-    device_batch_vector<Td> bA(batch_count, size_A);
-    device_batch_vector<Td> bB(batch_count, size_B);
-    device_batch_vector<Td> bC(batch_count, size_C);
+    device_batch_vector<Ta> bA(batch_count, size_A);
+    device_batch_vector<Tb> bB(batch_count, size_B);
+    device_batch_vector<Tc> bC(batch_count, size_C);
 
     int last = batch_count - 1;
     if(!dA || !dB || !dC || !bA[last] || !bB[last] || !bC[last])
     {
@@ -133,34 +125,61 @@ hipblasStatus_t testing_gemm_batched_ex_template(hipblasOperation_t transA,
     hipblasCreate(&handle);
 
     // Naming: dX is in GPU (device) memory. hK is in CPU (host) memory
-    host_vector<Td> hA[batch_count];
-    host_vector<Td> hB[batch_count];
-    host_vector<Td> hC[batch_count];
-    host_vector<Td> hC_gold[batch_count];
+    host_vector<Ta> hA[batch_count];
+    host_vector<Tb> hB[batch_count];
+    host_vector<Tc> hC[batch_count];
+    host_vector<Tc> hC_gold[batch_count];
 
     // Initial Data on CPU
     srand(1);
     for(int b = 0; b < batch_count; b++)
     {
-        hA[b]      = host_vector<Td>(size_A);
-        hB[b]      = host_vector<Td>(size_B);
-        hC[b]      = host_vector<Td>(size_C);
-        hC_gold[b] = host_vector<Td>(size_C);
+        hA[b]      = host_vector<Ta>(size_A);
+        hB[b]      = host_vector<Tb>(size_B);
+        hC[b]      = host_vector<Tc>(size_C);
+        hC_gold[b] = host_vector<Tc>(size_C);
 
-        hipblas_init<Td>(hA[b], A_row, A_col, lda);
-        hipblas_init_alternating_sign<Td>(hB[b], B_row, B_col, ldb);
-        hipblas_init<Td>(hC[b], M, N, ldc);
+        hipblas_init<Ta>(hA[b], A_row, A_col, lda);
+        hipblas_init_alternating_sign<Tb>(hB[b], B_row, B_col, ldb);
+        hipblas_init<Tc>(hC[b], M, N, ldc);
         hC_gold[b] = hC[b];
-
-        CHECK_HIP_ERROR(hipMemcpy(bA[b], hA[b].data(), sizeof(Td) * size_A, hipMemcpyHostToDevice));
-        CHECK_HIP_ERROR(hipMemcpy(bB[b], hB[b].data(), sizeof(Td) * size_B, hipMemcpyHostToDevice));
-        CHECK_HIP_ERROR(hipMemcpy(bC[b], hC[b].data(), sizeof(Td) * size_C, hipMemcpyHostToDevice));
+#ifdef __HIP_PLATFORM_NVCC__
+        CHECK_HIP_ERROR(hipMemcpy(bA[b], hA[b].data(), sizeof(Ta) * size_A, hipMemcpyHostToDevice));
+        CHECK_HIP_ERROR(hipMemcpy(bB[b], hB[b].data(), sizeof(Tb) * size_B, hipMemcpyHostToDevice));
+#else
+        if(std::is_same<Ta, int8_t>{} && transA == HIPBLAS_OP_N)
+        {
+            vector<Ta> hA_packed(hA[b]);
+            hipblas_packInt8(hA_packed, M, K, lda);
+            CHECK_HIP_ERROR(
+                hipMemcpy(bA[b], hA_packed.data(), sizeof(Ta) * size_A, hipMemcpyHostToDevice));
+        }
+        else
+        {
+            CHECK_HIP_ERROR(
+                hipMemcpy(bA[b], hA[b].data(), sizeof(Ta) * size_A, hipMemcpyHostToDevice));
+        }
+
+        if(std::is_same<Tb, int8_t>{} && transB != HIPBLAS_OP_N)
+        {
+            vector<Tb> hB_packed(hB[b]);
+            hipblas_packInt8(hB_packed, N, K, ldb);
+            CHECK_HIP_ERROR(
+                hipMemcpy(bB[b], hB_packed.data(), sizeof(Tb) * size_B, hipMemcpyHostToDevice));
+        }
+        else
+        {
+            CHECK_HIP_ERROR(
+                hipMemcpy(bB[b], hB[b].data(), sizeof(Tb) * size_B, hipMemcpyHostToDevice));
+        }
+#endif
+        CHECK_HIP_ERROR(hipMemcpy(bC[b], hC[b].data(), sizeof(Tc) * size_C, hipMemcpyHostToDevice));
     }
 
-    CHECK_HIP_ERROR(hipMemcpy(dA, bA, sizeof(Td*) * batch_count, hipMemcpyHostToDevice));
-    CHECK_HIP_ERROR(hipMemcpy(dB, bB, sizeof(Td*) * batch_count, hipMemcpyHostToDevice));
-    CHECK_HIP_ERROR(hipMemcpy(dC, bC, sizeof(Td*) * batch_count, hipMemcpyHostToDevice));
+    CHECK_HIP_ERROR(hipMemcpy(dA, bA, sizeof(Ta*) * batch_count, hipMemcpyHostToDevice));
+    CHECK_HIP_ERROR(hipMemcpy(dB, bB, sizeof(Tb*) * batch_count, hipMemcpyHostToDevice));
+    CHECK_HIP_ERROR(hipMemcpy(dC, bC, sizeof(Tc*) * batch_count, hipMemcpyHostToDevice));
 
     status = hipblasGemmBatchedExFn(handle,
                                     transA,
@@ -169,14 +188,14 @@ hipblasStatus_t testing_gemm_batched_ex_template(hipblasOperation_t transA,
                                     N,
                                     K,
                                     &h_alpha_Tc,
-                                    (const void**)(Td**)dA,
+                                    (const void**)(Ta**)dA,
                                     a_type,
                                     lda,
-                                    (const void**)(Td**)dB,
+                                    (const void**)(Tb**)dB,
                                     b_type,
                                     ldb,
                                     &h_beta_Tc,
-                                    (void**)(Td**)dC,
+                                    (void**)(Tc**)dC,
                                     c_type,
                                     ldc,
                                     batch_count,
@@ -192,25 +211,25 @@ hipblasStatus_t testing_gemm_batched_ex_template(hipblasOperation_t transA,
 
     for(int b = 0; b < batch_count; b++)
     {
-        CHECK_HIP_ERROR(hipMemcpy(hC[b].data(), bC[b], sizeof(Td) * size_C, hipMemcpyDeviceToHost));
+        CHECK_HIP_ERROR(hipMemcpy(hC[b].data(), bC[b], sizeof(Tc) * size_C, hipMemcpyDeviceToHost));
     }
 
     // CPU BLAS
     for(int b = 0; b < batch_count; b++)
     {
-        cblas_gemm(transA,
-                   transB,
-                   M,
-                   N,
-                   K,
-                   h_alpha_Td,
-                   hA[b].data(),
-                   lda,
-                   hB[b].data(),
-                   ldb,
-                   h_beta_Td,
-                   hC_gold[b].data(),
-                   ldc);
+        cblas_gemm(transA,
+                   transB,
+                   M,
+                   N,
+                   K,
+                   h_alpha_Tc,
+                   hA[b].data(),
+                   lda,
+                   hB[b].data(),
+                   ldb,
+                   h_beta_Tc,
+                   hC_gold[b].data(),
+                   ldc);
     }
 
     // enable unit check, notice unit check is not invasive, but norm check is,
@@ -218,7 +237,7 @@ hipblasStatus_t testing_gemm_batched_ex_template(hipblasOperation_t transA,
     if(unit_check)
     {
         for(int b = 0; b < batch_count; b++)
-            unit_check_general<Td>(M, N, ldc, hC_gold[b].data(), hC[b].data());
+            unit_check_general<Tc>(M, N, ldc, hC_gold[b].data(), hC[b].data());
     }
 
     hipblasDestroy(handle);
@@ -257,73 +276,96 @@ hipblasStatus_t testing_gemm_batched_ex(Arguments argus)
     if(a_type == HIPBLAS_R_16F && b_type == HIPBLAS_R_16F && c_type == HIPBLAS_R_16F
        && c_type == HIPBLAS_R_16F && compute_type == HIPBLAS_R_16F)
     {
-        status = testing_gemm_batched_ex_template<hipblasHalf, hipblasHalf>(transA,
-                                                                            transB,
-                                                                            M,
-                                                                            N,
-                                                                            K,
-                                                                            alpha,
-                                                                            lda,
-                                                                            ldb,
-                                                                            beta,
-                                                                            ldc,
-                                                                            norm_check,
-                                                                            unit_check,
-                                                                            a_type,
-                                                                            b_type,
-                                                                            c_type,
-                                                                            batch_count,
-                                                                            compute_type,
-                                                                            argus.fortran);
+        status = testing_gemm_batched_ex_template<hipblasHalf, hipblasHalf, hipblasHalf, hipblasHalf>(
+            transA,
+            transB,
+            M,
+            N,
+            K,
+            alpha,
+            lda,
+            ldb,
+            beta,
+            ldc,
+            norm_check,
+            unit_check,
+            a_type,
+            b_type,
+            c_type,
+            batch_count,
+            compute_type,
+            argus.fortran);
     }
     else if(a_type == HIPBLAS_R_16F && b_type == HIPBLAS_R_16F && c_type == HIPBLAS_R_16F
             && c_type == HIPBLAS_R_16F && compute_type == HIPBLAS_R_32F)
     {
-        status = testing_gemm_batched_ex_template<hipblasHalf, float>(transA,
-                                                                      transB,
-                                                                      M,
-                                                                      N,
-                                                                      K,
-                                                                      alpha,
-                                                                      lda,
-                                                                      ldb,
-                                                                      beta,
-                                                                      ldc,
-                                                                      norm_check,
-                                                                      unit_check,
-                                                                      a_type,
-                                                                      b_type,
-                                                                      c_type,
-                                                                      batch_count,
-                                                                      compute_type,
-                                                                      argus.fortran);
+        status = testing_gemm_batched_ex_template<hipblasHalf, hipblasHalf, hipblasHalf, float>(
+            transA,
+            transB,
+            M,
+            N,
+            K,
+            alpha,
+            lda,
+            ldb,
+            beta,
+            ldc,
+            norm_check,
+            unit_check,
+            a_type,
+            b_type,
+            c_type,
+            batch_count,
+            compute_type,
+            argus.fortran);
     }
     else if(a_type == HIPBLAS_R_32F && b_type == HIPBLAS_R_32F && c_type == HIPBLAS_R_32F
             && c_type == HIPBLAS_R_32F && compute_type == HIPBLAS_R_32F)
     {
-        status = testing_gemm_batched_ex_template<float, float>(transA,
-                                                                transB,
-                                                                M,
-                                                                N,
-                                                                K,
-                                                                alpha,
-                                                                lda,
-                                                                ldb,
-                                                                beta,
-                                                                ldc,
-                                                                norm_check,
-                                                                unit_check,
-                                                                a_type,
-                                                                b_type,
-                                                                c_type,
-                                                                batch_count,
-                                                                compute_type,
-                                                                argus.fortran);
+        status = testing_gemm_batched_ex_template<float, float, float, float>(transA,
+                                                                              transB,
+                                                                              M,
+                                                                              N,
+                                                                              K,
+                                                                              alpha,
+                                                                              lda,
+                                                                              ldb,
+                                                                              beta,
+                                                                              ldc,
+                                                                              norm_check,
+                                                                              unit_check,
+                                                                              a_type,
+                                                                              b_type,
+                                                                              c_type,
+                                                                              batch_count,
+                                                                              compute_type,
+                                                                              argus.fortran);
     }
     else if(a_type == HIPBLAS_R_64F && b_type == HIPBLAS_R_64F && c_type == HIPBLAS_R_64F
             && c_type == HIPBLAS_R_64F && compute_type == HIPBLAS_R_64F)
     {
-        status = testing_gemm_batched_ex_template<double, double>(transA,
+        status = testing_gemm_batched_ex_template<double, double, double, double>(transA,
                                                                   transB,
                                                                   M,
                                                                   N,
                                                                   K,
                                                                   alpha,
                                                                   lda,
                                                                   ldb,
                                                                   beta,
                                                                   ldc,
                                                                   norm_check,
                                                                   unit_check,
                                                                   a_type,
                                                                   b_type,
                                                                   c_type,
                                                                   batch_count,
                                                                   compute_type,
                                                                   argus.fortran);
     }
+    else if(a_type == HIPBLAS_C_32F && b_type == HIPBLAS_C_32F && c_type == HIPBLAS_C_32F
+            && c_type == HIPBLAS_C_32F && compute_type == HIPBLAS_C_32F)
+    {
+        status = testing_gemm_batched_ex_template<hipblasComplex, hipblasComplex, hipblasComplex, hipblasComplex>(
+            transA,
+            transB,
+            M,
+            N,
+            K,
+            alpha,
+            lda,
+            ldb,
+            beta,
+            ldc,
+            norm_check,
+            unit_check,
+            a_type,
+            b_type,
+            c_type,
+            batch_count,
+            compute_type,
+            argus.fortran);
+    }
@@ -342,6 +384,50 @@ hipblasStatus_t testing_gemm_batched_ex(Arguments argus)
+    else if(a_type == HIPBLAS_C_64F && b_type == HIPBLAS_C_64F && c_type == HIPBLAS_C_64F
+            && c_type == HIPBLAS_C_64F && compute_type == HIPBLAS_C_64F)
+    {
+        status = testing_gemm_batched_ex_template<hipblasDoubleComplex, hipblasDoubleComplex, hipblasDoubleComplex, hipblasDoubleComplex>(
+            transA,
+            transB,
+            M,
+            N,
+            K,
+            alpha,
+            lda,
+            ldb,
+            beta,
+            ldc,
+            norm_check,
+            unit_check,
+            a_type,
+            b_type,
+            c_type,
+            batch_count,
+            compute_type,
+            argus.fortran);
+    }
+    else if(a_type == HIPBLAS_R_8I && b_type == HIPBLAS_R_8I && c_type == HIPBLAS_R_32I
+            && c_type == HIPBLAS_R_32I && compute_type == HIPBLAS_R_32I)
+    {
+        status = testing_gemm_batched_ex_template<int8_t, int8_t, int32_t, int32_t>(transA,
+                                                                                    transB,
+                                                                                    M,
+                                                                                    N,
+                                                                                    K,
+                                                                                    alpha,
+                                                                                    lda,
+                                                                                    ldb,
+                                                                                    beta,
+                                                                                    ldc,
+                                                                                    norm_check,
+                                                                                    unit_check,
+                                                                                    a_type,
+                                                                                    b_type,
+                                                                                    c_type,
+                                                                                    batch_count,
+                                                                                    compute_type,
+                                                                                    argus.fortran);
+    }
     else
     {
         status = HIPBLAS_STATUS_NOT_SUPPORTED;
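hipblasGemmBatchedEx consumes arrays of device pointers, which is why the template above first fills one device buffer per batch (bA[b]) and then uploads the pointer array itself into dA. Condensed to its essentials (names and the float type are illustrative, not taken from the patch):

    // host-side array of device addresses, one entry per batch
    std::vector<float*> ptrs(batch_count);
    for(int b = 0; b < batch_count; b++)
        ptrs[b] = bA[b]; // bA[b] is already a device allocation

    float** dA; // device array of device pointers
    CHECK_HIP_ERROR(hipMalloc(&dA, sizeof(float*) * batch_count));
    CHECK_HIP_ERROR(
        hipMemcpy(dA, ptrs.data(), sizeof(float*) * batch_count, hipMemcpyHostToDevice));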
diff --git a/clients/include/testing_gemm_ex.hpp b/clients/include/testing_gemm_ex.hpp
index f9353c7ad..c98848935 100644
--- a/clients/include/testing_gemm_ex.hpp
+++ b/clients/include/testing_gemm_ex.hpp
@@ -24,7 +24,7 @@ using namespace std;
 
 /* ============================================================================================ */
 
-template <typename Td, typename Tc>
+template <typename Ta, typename Tb, typename Tc, typename Tex>
 hipblasStatus_t testing_gemm_ex_template(hipblasOperation_t transA,
                                          hipblasOperation_t transB,
@@ -51,46 +51,38 @@ hipblasStatus_t testing_gemm_ex_template(hipblasOperation_t transA,
     size_t* workspace_size = 0;
     void*   workspace      = 0;
 
-    Td h_alpha_Td;
-    Td h_beta_Td;
+    Tex h_alpha_Tc;
+    Tex h_beta_Tc;
 
-    if(is_same<Td, hipblasHalf>::value)
+    if(is_same<Tex, hipblasHalf>::value)
     {
-        h_alpha_Td = float_to_half(alpha_float);
-        h_beta_Td  = float_to_half(beta_float);
+        h_alpha_Tc = float_to_half(alpha_float);
+        h_beta_Tc  = float_to_half(beta_float);
     }
-    else if(is_same<Td, float>::value)
+    else if(is_same<Tex, float>::value)
    {
-        h_alpha_Td = static_cast<Td>(alpha_float);
-        h_beta_Td  = static_cast<Td>(beta_float);
+        h_alpha_Tc = static_cast<Tex>(alpha_float);
+        h_beta_Tc  = static_cast<Tex>(beta_float);
     }
-    else if(is_same<Td, double>::value)
+    else if(is_same<Tex, double>::value)
     {
-        h_alpha_Td = static_cast<Td>(alpha_float);
-        h_beta_Td  = static_cast<Td>(beta_float);
+        h_alpha_Tc = static_cast<Tex>(alpha_float);
+        h_beta_Tc  = static_cast<Tex>(beta_float);
     }
-    else
+    else if(is_same<Tex, hipblasComplex>::value)
     {
-        return HIPBLAS_STATUS_NOT_SUPPORTED;
+        h_alpha_Tc = static_cast<Tex>(alpha_float);
+        h_beta_Tc  = static_cast<Tex>(beta_float);
     }
-
-    Tc h_alpha_Tc;
-    Tc h_beta_Tc;
-
-    if(is_same<Tc, hipblasHalf>::value)
+    else if(is_same<Tex, hipblasDoubleComplex>::value)
     {
-        h_alpha_Tc = float_to_half(alpha_float);
-        h_beta_Tc  = float_to_half(beta_float);
+        h_alpha_Tc = static_cast<Tex>(alpha_float);
+        h_beta_Tc  = static_cast<Tex>(beta_float);
     }
-    else if(is_same<Tc, float>::value)
+    else if(is_same<Tex, int32_t>::value)
     {
-        h_alpha_Tc = static_cast<Tc>(alpha_float);
-        h_beta_Tc  = static_cast<Tc>(beta_float);
-    }
-    else if(is_same<Tc, double>::value)
-    {
-        h_alpha_Tc = static_cast<Tc>(alpha_float);
-        h_beta_Tc  = static_cast<Tc>(beta_float);
+        h_alpha_Tc = static_cast<Tex>(alpha_float);
+        h_beta_Tc  = static_cast<Tex>(beta_float);
     }
     else
     {
@@ -132,15 +124,17 @@ hipblasStatus_t testing_gemm_ex_template(hipblasOperation_t transA,
     // Tc* d_alpha_Tc = (Tc*)d_alpha_Tc_managed.get();
     // Tc* d_beta_Tc  = (Tc*)d_beta_Tc_managed.get();
 
-    Td *dA, *dB, *dC;
-    Tc *d_alpha_Tc, *d_beta_Tc;
+    Ta* dA;
+    Tb* dB;
+    Tc* dC;
+    Tex *d_alpha_Tc, *d_beta_Tc;
 
-    CHECK_HIP_ERROR(hipMalloc(&dA, size_A * sizeof(Td)));
-    CHECK_HIP_ERROR(hipMalloc(&dB, size_B * sizeof(Td)));
-    CHECK_HIP_ERROR(hipMalloc(&dC, size_C * sizeof(Td)));
+    CHECK_HIP_ERROR(hipMalloc(&dA, size_A * sizeof(Ta)));
+    CHECK_HIP_ERROR(hipMalloc(&dB, size_B * sizeof(Tb)));
+    CHECK_HIP_ERROR(hipMalloc(&dC, size_C * sizeof(Tc)));
 
-    CHECK_HIP_ERROR(hipMalloc(&d_alpha_Tc, sizeof(Td)));
-    CHECK_HIP_ERROR(hipMalloc(&d_beta_Tc, sizeof(Td)));
+    CHECK_HIP_ERROR(hipMalloc(&d_alpha_Tc, sizeof(Tex)));
+    CHECK_HIP_ERROR(hipMalloc(&d_beta_Tc, sizeof(Tex)));
 
     if(!dA || !dB || !dC || !d_alpha_Tc || !d_beta_Tc)
     {
@@ -153,16 +147,16 @@ hipblasStatus_t testing_gemm_ex_template(hipblasOperation_t transA,
     hipblasCreate(&handle);
 
     // Naming: dX is in GPU (device) memory. hK is in CPU (host) memory
-    vector<Td> hA(size_A);
-    vector<Td> hB(size_B);
-    vector<Td> hC(size_C);
-    vector<Td> hC_gold(size_C);
+    vector<Ta> hA(size_A);
+    vector<Tb> hB(size_B);
+    vector<Tc> hC(size_C);
+    vector<Tc> hC_gold(size_C);
 
     // Initial Data on CPU
     srand(1);
-    hipblas_init<Td>(hA, A_row, A_col, lda);
-    hipblas_init_alternating_sign<Td>(hB, B_row, B_col, ldb);
-    hipblas_init<Td>(hC, M, N, ldc);
+    hipblas_init<Ta>(hA, A_row, A_col, lda);
+    hipblas_init_alternating_sign<Tb>(hB, B_row, B_col, ldb);
+    hipblas_init<Tc>(hC, M, N, ldc);
 
     // if(is_same<Td, hipblasHalf>::value)
     // {
@@ -192,9 +186,37 @@ hipblasStatus_t testing_gemm_ex_template(hipblasOperation_t transA,
     hC_gold = hC;
 
     // copy data from CPU to device
-    CHECK_HIP_ERROR(hipMemcpy(dA, hA.data(), sizeof(Td) * size_A, hipMemcpyHostToDevice));
-    CHECK_HIP_ERROR(hipMemcpy(dB, hB.data(), sizeof(Td) * size_B, hipMemcpyHostToDevice));
-    CHECK_HIP_ERROR(hipMemcpy(dC, hC.data(), sizeof(Td) * size_C, hipMemcpyHostToDevice));
+
+    // CUDA doesn't do packing
+#ifdef __HIP_PLATFORM_NVCC__
+    CHECK_HIP_ERROR(hipMemcpy(dA, hA.data(), sizeof(Ta) * size_A, hipMemcpyHostToDevice));
+    CHECK_HIP_ERROR(hipMemcpy(dB, hB.data(), sizeof(Tb) * size_B, hipMemcpyHostToDevice));
+#else
+    if(std::is_same<Ta, int8_t>{} && transA == HIPBLAS_OP_N)
+    {
+        vector<Ta> hA_packed(hA);
+        hipblas_packInt8(hA_packed, M, K, lda);
+        CHECK_HIP_ERROR(
+            hipMemcpy(dA, hA_packed.data(), sizeof(Ta) * size_A, hipMemcpyHostToDevice));
+    }
+    else
+    {
+        CHECK_HIP_ERROR(hipMemcpy(dA, hA.data(), sizeof(Ta) * size_A, hipMemcpyHostToDevice));
+    }
+
+    if(std::is_same<Tb, int8_t>{} && transB != HIPBLAS_OP_N)
+    {
+        vector<Tb> hB_packed(hB);
+        hipblas_packInt8(hB_packed, N, K, ldb);
+        CHECK_HIP_ERROR(
+            hipMemcpy(dB, hB_packed.data(), sizeof(Tb) * size_B, hipMemcpyHostToDevice));
+    }
+    else
+    {
+        CHECK_HIP_ERROR(hipMemcpy(dB, hB.data(), sizeof(Tb) * size_B, hipMemcpyHostToDevice));
+    }
+#endif
+    CHECK_HIP_ERROR(hipMemcpy(dC, hC.data(), sizeof(Tc) * size_C, hipMemcpyHostToDevice));
 
     status = hipblasGemmExFn(handle,
                              transA,
@@ -229,7 +251,7 @@ hipblasStatus_t testing_gemm_ex_template(hipblasOperation_t transA,
         return status;
     }
 
-    CHECK_HIP_ERROR(hipMemcpy(hC.data(), dC, sizeof(Td) * size_C, hipMemcpyDeviceToHost));
+    CHECK_HIP_ERROR(hipMemcpy(hC.data(), dC, sizeof(Tc) * size_C, hipMemcpyDeviceToHost));
 
     // std::cout << std::endl << "-----hD_1---------------------------------------" <<
     // std::endl;
@@ -247,19 +269,19 @@ hipblasStatus_t testing_gemm_ex_template(hipblasOperation_t transA,
 
     // CPU BLAS
 
-    cblas_gemm(transA,
-               transB,
-               M,
-               N,
-               K,
-               h_alpha_Td,
-               hA.data(),
-               lda,
-               hB.data(),
-               ldb,
-               h_beta_Td,
-               hC_gold.data(),
-               ldc);
+    cblas_gemm(transA,
+               transB,
+               M,
+               N,
+               K,
+               h_alpha_Tc,
+               hA.data(),
+               lda,
+               hB.data(),
+               ldb,
+               h_beta_Tc,
+               hC_gold.data(),
+               ldc);
 
     // std::cout << std::endl << "---gold---gold---gold---------------------" << std::endl;
     // if(is_same<Td, hipblasHalf>::value)
@@ -280,7 +302,7 @@ hipblasStatus_t testing_gemm_ex_template(hipblasOperation_t transA,
     // unit check and norm check can not be interchanged their order
     if(unit_check)
     {
-        unit_check_general<Td>(M, N, ldc, hC_gold.data(), hC.data());
+        unit_check_general<Tc>(M, N, ldc, hC_gold.data(), hC.data());
     }
 
     hipblasDestroy(handle);
@@ -323,70 +345,92 @@ hipblasStatus_t testing_gemm_ex(Arguments argus)
     if(a_type == HIPBLAS_R_16F && b_type == HIPBLAS_R_16F && c_type == HIPBLAS_R_16F
        && c_type == HIPBLAS_R_16F && compute_type == HIPBLAS_R_16F)
     {
-        status = testing_gemm_ex_template<hipblasHalf, hipblasHalf>(transA,
-                                                                    transB,
-                                                                    M,
-                                                                    N,
-                                                                    K,
-                                                                    alpha,
-                                                                    lda,
-                                                                    ldb,
-                                                                    beta,
-                                                                    ldc,
-                                                                    norm_check,
-                                                                    unit_check,
-                                                                    a_type,
-                                                                    b_type,
-                                                                    c_type,
-                                                                    compute_type,
-                                                                    argus.fortran);
+        status = testing_gemm_ex_template<hipblasHalf, hipblasHalf, hipblasHalf, hipblasHalf>(
+            transA,
+            transB,
+            M,
+            N,
+            K,
+            alpha,
+            lda,
+            ldb,
+            beta,
+            ldc,
+            norm_check,
+            unit_check,
+            a_type,
+            b_type,
+            c_type,
+            compute_type,
+            argus.fortran);
     }
     else if(a_type == HIPBLAS_R_16F && b_type == HIPBLAS_R_16F && c_type == HIPBLAS_R_16F
             && c_type == HIPBLAS_R_16F && compute_type == HIPBLAS_R_32F)
     {
-        status = testing_gemm_ex_template<hipblasHalf, float>(transA,
-                                                              transB,
-                                                              M,
-                                                              N,
-                                                              K,
-                                                              alpha,
-                                                              lda,
-                                                              ldb,
-                                                              beta,
-                                                              ldc,
-                                                              norm_check,
-                                                              unit_check,
-                                                              a_type,
-                                                              b_type,
-                                                              c_type,
-                                                              compute_type,
-                                                              argus.fortran);
+        status
+            = testing_gemm_ex_template<hipblasHalf, hipblasHalf, hipblasHalf, float>(transA,
+                                                                                     transB,
+                                                                                     M,
+                                                                                     N,
+                                                                                     K,
+                                                                                     alpha,
+                                                                                     lda,
+                                                                                     ldb,
+                                                                                     beta,
+                                                                                     ldc,
+                                                                                     norm_check,
+                                                                                     unit_check,
+                                                                                     a_type,
+                                                                                     b_type,
+                                                                                     c_type,
+                                                                                     compute_type,
+                                                                                     argus.fortran);
     }
     else if(a_type == HIPBLAS_R_32F && b_type == HIPBLAS_R_32F && c_type == HIPBLAS_R_32F
             && c_type == HIPBLAS_R_32F && compute_type == HIPBLAS_R_32F)
     {
-        status = testing_gemm_ex_template<float, float>(transA,
-                                                        transB,
-                                                        M,
-                                                        N,
-                                                        K,
-                                                        alpha,
-                                                        lda,
-                                                        ldb,
-                                                        beta,
-                                                        ldc,
-                                                        norm_check,
-                                                        unit_check,
-                                                        a_type,
-                                                        b_type,
-                                                        c_type,
-                                                        compute_type,
-                                                        argus.fortran);
+        status = testing_gemm_ex_template<float, float, float, float>(transA,
+                                                                      transB,
+                                                                      M,
+                                                                      N,
+                                                                      K,
+                                                                      alpha,
+                                                                      lda,
+                                                                      ldb,
+                                                                      beta,
+                                                                      ldc,
+                                                                      norm_check,
+                                                                      unit_check,
+                                                                      a_type,
+                                                                      b_type,
+                                                                      c_type,
+                                                                      compute_type,
+                                                                      argus.fortran);
     }
     else if(a_type == HIPBLAS_R_64F && b_type == HIPBLAS_R_64F && c_type == HIPBLAS_R_64F
             && c_type == HIPBLAS_R_64F && compute_type == HIPBLAS_R_64F)
     {
-        status = testing_gemm_ex_template<double, double>(transA,
+        status = testing_gemm_ex_template<double, double, double, double>(transA,
                                                           transB,
                                                           M,
                                                           N,
                                                           K,
                                                           alpha,
                                                           lda,
                                                           ldb,
                                                           beta,
                                                           ldc,
                                                           norm_check,
                                                           unit_check,
                                                           a_type,
                                                           b_type,
                                                           c_type,
                                                           compute_type,
                                                           argus.fortran);
     }
+    else if(a_type == HIPBLAS_C_32F && b_type == HIPBLAS_C_32F && c_type == HIPBLAS_C_32F
+            && c_type == HIPBLAS_C_32F && compute_type == HIPBLAS_C_32F)
+    {
+        status = testing_gemm_ex_template<hipblasComplex, hipblasComplex, hipblasComplex, hipblasComplex>(
+            transA,
+            transB,
+            M,
+            N,
+            K,
+            alpha,
+            lda,
+            ldb,
+            beta,
+            ldc,
+            norm_check,
+            unit_check,
+            a_type,
+            b_type,
+            c_type,
+            compute_type,
+            argus.fortran);
+    }
@@ -404,6 +448,48 @@ hipblasStatus_t testing_gemm_ex(Arguments argus)
+    else if(a_type == HIPBLAS_C_64F && b_type == HIPBLAS_C_64F && c_type == HIPBLAS_C_64F
+            && c_type == HIPBLAS_C_64F && compute_type == HIPBLAS_C_64F)
+    {
+        status = testing_gemm_ex_template<hipblasDoubleComplex, hipblasDoubleComplex, hipblasDoubleComplex, hipblasDoubleComplex>(
+            transA,
+            transB,
+            M,
+            N,
+            K,
+            alpha,
+            lda,
+            ldb,
+            beta,
+            ldc,
+            norm_check,
+            unit_check,
+            a_type,
+            b_type,
+            c_type,
+            compute_type,
+            argus.fortran);
+    }
+    else if(a_type == HIPBLAS_R_8I && b_type == HIPBLAS_R_8I && c_type == HIPBLAS_R_32I
+            && c_type == HIPBLAS_R_32I && compute_type == HIPBLAS_R_32I)
+    {
+        status = testing_gemm_ex_template<int8_t, int8_t, int32_t, int32_t>(transA,
+                                                                            transB,
+                                                                            M,
+                                                                            N,
+                                                                            K,
+                                                                            alpha,
+                                                                            lda,
+                                                                            ldb,
+                                                                            beta,
+                                                                            ldc,
+                                                                            norm_check,
+                                                                            unit_check,
+                                                                            a_type,
+                                                                            b_type,
+                                                                            c_type,
+                                                                            compute_type,
+                                                                            argus.fortran);
+    }
     else
     {
         status = HIPBLAS_STATUS_NOT_SUPPORTED;
diff --git a/clients/include/testing_gemm_strided_batched_ex.hpp b/clients/include/testing_gemm_strided_batched_ex.hpp
index 3998c1c7c..467526190 100644
--- a/clients/include/testing_gemm_strided_batched_ex.hpp
+++ b/clients/include/testing_gemm_strided_batched_ex.hpp
@@ -24,7 +24,7 @@ using namespace std;
 
 /* ============================================================================================ */
 
-template <typename Td, typename Tc>
+template <typename Ta, typename Tb, typename Tc, typename Tex>
 hipblasStatus_t testing_gemm_strided_batched_ex_template(hipblasOperation_t transA,
                                                          hipblasOperation_t transB,
@@ -51,46 +51,38 @@ hipblasStatus_t testing_gemm_strided_batched_ex_template(hipblasOperation_t tran
     uint32_t solution_index = 0;
     uint32_t flags          = 0;
 
-    Td h_alpha_Td;
-    Td h_beta_Td;
+    Tex h_alpha_Tc;
+    Tex h_beta_Tc;
 
-    if(is_same<Td, hipblasHalf>::value)
+    if(is_same<Tex, hipblasHalf>::value)
     {
-        h_alpha_Td = float_to_half(alpha_float);
-        h_beta_Td  = float_to_half(beta_float);
+        h_alpha_Tc = float_to_half(alpha_float);
+        h_beta_Tc  = float_to_half(beta_float);
     }
-    else if(is_same<Td, float>::value)
+    else if(is_same<Tex, float>::value)
     {
-        h_alpha_Td = static_cast<Td>(alpha_float);
-        h_beta_Td  = static_cast<Td>(beta_float);
+        h_alpha_Tc = static_cast<Tex>(alpha_float);
+        h_beta_Tc  = static_cast<Tex>(beta_float);
     }
-    else if(is_same<Td, double>::value)
+    else if(is_same<Tex, double>::value)
     {
-        h_alpha_Td = static_cast<Td>(alpha_float);
-        h_beta_Td  = static_cast<Td>(beta_float);
+        h_alpha_Tc = static_cast<Tex>(alpha_float);
+        h_beta_Tc  = static_cast<Tex>(beta_float);
     }
-    else
-    {
-        return HIPBLAS_STATUS_NOT_SUPPORTED;
-    }
-
-    Tc h_alpha_Tc;
-    Tc h_beta_Tc;
-
-    if(is_same<Tc, hipblasHalf>::value)
+    else if(is_same<Tex, hipblasComplex>::value)
     {
-        h_alpha_Tc = float_to_half(alpha_float);
-        h_beta_Tc  = float_to_half(beta_float);
+        h_alpha_Tc = static_cast<Tex>(alpha_float);
+        h_beta_Tc  = static_cast<Tex>(beta_float);
     }
-    else if(is_same<Tc, float>::value)
+    else if(is_same<Tex, hipblasDoubleComplex>::value)
     {
-        h_alpha_Tc = static_cast<Tc>(alpha_float);
-        h_beta_Tc  = static_cast<Tc>(beta_float);
+        h_alpha_Tc = static_cast<Tex>(alpha_float);
+        h_beta_Tc  = static_cast<Tex>(beta_float);
     }
-    else if(is_same<Tc, double>::value)
+    else if(is_same<Tex, int32_t>::value)
     {
-        h_alpha_Tc = static_cast<Tc>(alpha_float);
-        h_beta_Tc  = static_cast<Tc>(beta_float);
+        h_alpha_Tc = static_cast<Tex>(alpha_float);
+        h_beta_Tc  = static_cast<Tex>(beta_float);
     }
     else
     {
@@ -116,9 +108,9 @@ hipblasStatus_t testing_gemm_strided_batched_ex_template(hipblasOperation_t tran
     const size_t size_B = stride_B * batch_count;
     const size_t size_C = stride_C * batch_count;
 
-    device_vector<Td> dA(size_A);
-    device_vector<Td> dB(size_B);
-    device_vector<Td> dC(size_C);
+    device_vector<Ta> dA(size_A);
+    device_vector<Tb> dB(size_B);
+    device_vector<Tc> dC(size_C);
     device_vector<Tc> d_alpha_Tc(1);
     device_vector<Tc> d_beta_Tc(1);
 
@@ -134,22 +126,48 @@ hipblasStatus_t testing_gemm_strided_batched_ex_template(hipblasOperation_t tran
     hipblasCreate(&handle);
 
     // Naming: dX is in GPU (device) memory. hK is in CPU (host) memory
-    host_vector<Td> hA(size_A);
-    host_vector<Td> hB(size_B);
-    host_vector<Td> hC(size_C);
-    host_vector<Td> hC_gold(size_C);
+    host_vector<Ta> hA(size_A);
+    host_vector<Tb> hB(size_B);
+    host_vector<Tc> hC(size_C);
+    host_vector<Tc> hC_gold(size_C);
 
     // Initial Data on CPU
     srand(1);
-    hipblas_init<Td>(hA, A_row, A_col, lda, stride_A, batch_count);
-    hipblas_init_alternating_sign<Td>(hB, B_row, B_col, ldb, stride_B, batch_count);
-    hipblas_init<Td>(hC, M, N, ldc, stride_C, batch_count);
+    hipblas_init<Ta>(hA, A_row, A_col, lda, stride_A, batch_count);
+    hipblas_init_alternating_sign<Tb>(hB, B_row, B_col, ldb, stride_B, batch_count);
+    hipblas_init<Tc>(hC, M, N, ldc, stride_C, batch_count);
     hC_gold = hC;
 
     // copy data from CPU to device
-    CHECK_HIP_ERROR(hipMemcpy(dA, hA.data(), sizeof(Td) * size_A, hipMemcpyHostToDevice));
-    CHECK_HIP_ERROR(hipMemcpy(dB, hB.data(), sizeof(Td) * size_B, hipMemcpyHostToDevice));
-    CHECK_HIP_ERROR(hipMemcpy(dC, hC.data(), sizeof(Td) * size_C, hipMemcpyHostToDevice));
+#ifdef __HIP_PLATFORM_NVCC__
+    CHECK_HIP_ERROR(hipMemcpy(dA, hA.data(), sizeof(Ta) * size_A, hipMemcpyHostToDevice));
+    CHECK_HIP_ERROR(hipMemcpy(dB, hB.data(), sizeof(Tb) * size_B, hipMemcpyHostToDevice));
+#else
+    if(std::is_same<Ta, int8_t>{} && transA == HIPBLAS_OP_N)
+    {
+        vector<Ta> hA_packed(hA);
+        hipblas_packInt8(hA_packed, M, K, lda, batch_count, stride_A);
+        CHECK_HIP_ERROR(
+            hipMemcpy(dA, hA_packed.data(), sizeof(Ta) * size_A, hipMemcpyHostToDevice));
+    }
+    else
+    {
+        CHECK_HIP_ERROR(hipMemcpy(dA, hA.data(), sizeof(Ta) * size_A, hipMemcpyHostToDevice));
+    }
+
+    if(std::is_same<Tb, int8_t>{} && transB != HIPBLAS_OP_N)
+    {
+        vector<Tb> hB_packed(hB);
+        hipblas_packInt8(hB_packed, N, K, ldb, batch_count, stride_B);
+        CHECK_HIP_ERROR(
+            hipMemcpy(dB, hB_packed.data(), sizeof(Tb) * size_B, hipMemcpyHostToDevice));
+    }
+    else
+    {
+        CHECK_HIP_ERROR(hipMemcpy(dB, hB.data(), sizeof(Tb) * size_B, hipMemcpyHostToDevice));
+    }
+#endif
+    CHECK_HIP_ERROR(hipMemcpy(dC, hC.data(), sizeof(Tc) * size_C, hipMemcpyHostToDevice));
 
     status = hipblasGemmStridedBatchedExFn(handle,
                                            transA,
@@ -181,24 +199,24 @@ hipblasStatus_t testing_gemm_strided_batched_ex_template(hipblasOperation_t tran
         return status;
     }
 
-    CHECK_HIP_ERROR(hipMemcpy(hC.data(), dC, sizeof(Td) * size_C, hipMemcpyDeviceToHost));
+    CHECK_HIP_ERROR(hipMemcpy(hC.data(), dC, sizeof(Tc) * size_C, hipMemcpyDeviceToHost));
 
     // CPU BLAS
     for(int b = 0; b < batch_count; b++)
     {
-        cblas_gemm(transA,
-                   transB,
-                   M,
-                   N,
-                   K,
-                   h_alpha_Td,
-                   hA.data() + b * stride_A,
-                   lda,
-                   hB.data() + b * stride_B,
-                   ldb,
-                   h_beta_Td,
-                   hC_gold.data() + b * stride_C,
-                   ldc);
+        cblas_gemm(transA,
+                   transB,
+                   M,
+                   N,
+                   K,
+                   h_alpha_Tc,
+                   hA.data() + b * stride_A,
+                   lda,
+                   hB.data() + b * stride_B,
+                   ldb,
+                   h_beta_Tc,
+                   hC_gold.data() + b * stride_C,
+                   ldc);
     }
 
     // enable unit check, notice unit check is not invasive, but norm check is,
@@ -207,7 +225,7 @@ hipblasStatus_t testing_gemm_strided_batched_ex_template(hipblasOperation_t tran
     {
         for(int b = 0; b < batch_count; b++)
         {
-            unit_check_general<Td>(
+            unit_check_general<Tc>(
                 M, N, ldc, hC_gold.data() + b * stride_C, hC.data() + b * stride_C);
         }
     }
@@ -247,73 +265,98 @@ hipblasStatus_t testing_gemm_strided_batched_ex(Arguments argus)
     if(a_type == HIPBLAS_R_16F && b_type == HIPBLAS_R_16F && c_type == HIPBLAS_R_16F
        && c_type == HIPBLAS_R_16F && compute_type == HIPBLAS_R_16F)
     {
-        status = testing_gemm_strided_batched_ex_template<hipblasHalf, hipblasHalf>(transA,
-                                                                                    transB,
-                                                                                    M,
-                                                                                    N,
-                                                                                    K,
-                                                                                    alpha,
-                                                                                    lda,
-                                                                                    ldb,
-                                                                                    beta,
-                                                                                    ldc,
-                                                                                    norm_check,
-                                                                                    unit_check,
-                                                                                    a_type,
-                                                                                    b_type,
-                                                                                    c_type,
-                                                                                    batch_count,
-                                                                                    compute_type,
-                                                                                    argus.fortran);
+        status = testing_gemm_strided_batched_ex_template<hipblasHalf, hipblasHalf, hipblasHalf, hipblasHalf>(
+            transA,
+            transB,
+            M,
+            N,
+            K,
+            alpha,
+            lda,
+            ldb,
+            beta,
+            ldc,
+            norm_check,
+            unit_check,
+            a_type,
+            b_type,
+            c_type,
+            batch_count,
+            compute_type,
+            argus.fortran);
     }
     else if(a_type == HIPBLAS_R_16F && b_type == HIPBLAS_R_16F && c_type == HIPBLAS_R_16F
             && c_type == HIPBLAS_R_16F && compute_type == HIPBLAS_R_32F)
     {
-        status = testing_gemm_strided_batched_ex_template<hipblasHalf, float>(transA,
-                                                                              transB,
-                                                                              M,
-                                                                              N,
-                                                                              K,
-                                                                              alpha,
-                                                                              lda,
-                                                                              ldb,
-                                                                              beta,
-                                                                              ldc,
-                                                                              norm_check,
-                                                                              unit_check,
-                                                                              a_type,
-                                                                              b_type,
-                                                                              c_type,
-                                                                              batch_count,
-                                                                              compute_type,
-                                                                              argus.fortran);
+        status = testing_gemm_strided_batched_ex_template<hipblasHalf, hipblasHalf, hipblasHalf, float>(
+            transA,
+            transB,
+            M,
+            N,
+            K,
+            alpha,
+            lda,
+            ldb,
+            beta,
+            ldc,
+            norm_check,
+            unit_check,
+            a_type,
+            b_type,
+            c_type,
+            batch_count,
+            compute_type,
+            argus.fortran);
     }
     else if(a_type == HIPBLAS_R_32F && b_type == HIPBLAS_R_32F && c_type == HIPBLAS_R_32F
             && c_type == HIPBLAS_R_32F && compute_type == HIPBLAS_R_32F)
     {
-        status = testing_gemm_strided_batched_ex_template<float, float>(transA,
-                                                                        transB,
-                                                                        M,
-                                                                        N,
-                                                                        K,
-                                                                        alpha,
-                                                                        lda,
-                                                                        ldb,
-                                                                        beta,
-                                                                        ldc,
-                                                                        norm_check,
-                                                                        unit_check,
-                                                                        a_type,
-                                                                        b_type,
-                                                                        c_type,
-                                                                        batch_count,
-                                                                        compute_type,
-                                                                        argus.fortran);
+        status = testing_gemm_strided_batched_ex_template<float, float, float, float>(
+            transA,
+            transB,
+            M,
+            N,
+            K,
+            alpha,
+            lda,
+            ldb,
+            beta,
+            ldc,
+            norm_check,
+            unit_check,
+            a_type,
+            b_type,
+            c_type,
+            batch_count,
+            compute_type,
+            argus.fortran);
     }
     else if(a_type == HIPBLAS_R_64F && b_type == HIPBLAS_R_64F && c_type == HIPBLAS_R_64F
             && c_type == HIPBLAS_R_64F && compute_type == HIPBLAS_R_64F)
     {
-        status = testing_gemm_strided_batched_ex_template<double, double>(transA,
+        status = testing_gemm_strided_batched_ex_template<double, double, double, double>(transA,
                                                                           transB,
                                                                           M,
                                                                           N,
                                                                           K,
                                                                           alpha,
                                                                           lda,
                                                                           ldb,
                                                                           beta,
                                                                           ldc,
                                                                           norm_check,
                                                                           unit_check,
                                                                           a_type,
                                                                           b_type,
                                                                           c_type,
                                                                           batch_count,
                                                                           compute_type,
                                                                           argus.fortran);
     }
+    else if(a_type == HIPBLAS_C_32F && b_type == HIPBLAS_C_32F && c_type == HIPBLAS_C_32F
+            && c_type == HIPBLAS_C_32F && compute_type == HIPBLAS_C_32F)
+    {
+        status = testing_gemm_strided_batched_ex_template<hipblasComplex, hipblasComplex, hipblasComplex, hipblasComplex>(
+            transA,
+            transB,
+            M,
+            N,
+            K,
+            alpha,
+            lda,
+            ldb,
+            beta,
+            ldc,
+            norm_check,
+            unit_check,
+            a_type,
+            b_type,
+            c_type,
+            batch_count,
+            compute_type,
+            argus.fortran);
+    }
@@ -332,6 +375,51 @@ hipblasStatus_t testing_gemm_strided_batched_ex(Arguments argus)
+    else if(a_type == HIPBLAS_C_64F && b_type == HIPBLAS_C_64F && c_type == HIPBLAS_C_64F
+            && c_type == HIPBLAS_C_64F && compute_type == HIPBLAS_C_64F)
+    {
+        status = testing_gemm_strided_batched_ex_template<hipblasDoubleComplex, hipblasDoubleComplex, hipblasDoubleComplex, hipblasDoubleComplex>(
+            transA,
+            transB,
+            M,
+            N,
+            K,
+            alpha,
+            lda,
+            ldb,
+            beta,
+            ldc,
+            norm_check,
+            unit_check,
+            a_type,
+            b_type,
+            c_type,
+            batch_count,
+            compute_type,
+            argus.fortran);
+    }
+    else if(a_type == HIPBLAS_R_8I && b_type == HIPBLAS_R_8I && c_type == HIPBLAS_R_32I
+            && c_type == HIPBLAS_R_32I && compute_type == HIPBLAS_R_32I)
+    {
+        status = testing_gemm_strided_batched_ex_template<int8_t, int8_t, int32_t, int32_t>(
+            transA,
+            transB,
+            M,
+            N,
+            K,
+            alpha,
+            lda,
+            ldb,
+            beta,
+            ldc,
+            norm_check,
+            unit_check,
+            a_type,
+            b_type,
+            c_type,
+            batch_count,
+            compute_type,
+            argus.fortran);
+    }
     else
     {
         status = HIPBLAS_STATUS_NOT_SUPPORTED;
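Unlike the batched path, the strided variant needs no pointer arrays: batch b of A simply lives at a fixed offset from the base pointer, as the CPU reference loop above shows. For column-major storage the strides must satisfy stride_A >= lda * A_col (and likewise for B and C) so consecutive batches do not overlap; with equality they are densely packed. A minimal illustration of the addressing (names follow the template above):

    // batch b of a densely packed strided-batch matrix
    size_t stride_A = size_t(lda) * A_col;
    const Ta* A_b   = hA.data() + b * stride_A;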
diff --git a/clients/include/testing_getrs.hpp b/clients/include/testing_getrs.hpp
index 82949f6a6..2ca73b8b0 100644
--- a/clients/include/testing_getrs.hpp
+++ b/clients/include/testing_getrs.hpp
@@ -78,7 +78,7 @@ hipblasStatus_t testing_getrs(Arguments argus)
 
     // Calculate hB = hA*hX;
     hipblasOperation_t op = HIPBLAS_OP_N;
-    cblas_gemm<T>(op, op, N, 1, N, 1, hA.data(), lda, hX.data(), ldb, 0, hB.data(), ldb);
+    cblas_gemm(op, op, N, 1, N, (T)1, hA.data(), lda, hX.data(), ldb, (T)0, hB.data(), ldb);
 
     // LU factorize hA on the CPU
     info = cblas_getrf(N, N, hA.data(), lda, hIpiv.data());
diff --git a/clients/include/testing_getrs_batched.hpp b/clients/include/testing_getrs_batched.hpp
index 8e09b3b90..7085e0c67 100644
--- a/clients/include/testing_getrs_batched.hpp
+++ b/clients/include/testing_getrs_batched.hpp
@@ -96,7 +96,7 @@ hipblasStatus_t testing_getrs_batched(Arguments argus)
 
         // Calculate hB = hA*hX;
         cblas_gemm(
-            op, op, N, 1, N, 1, hA[b].data(), lda, hX[b].data(), ldb, 0, hB[b].data(), ldb);
+            op, op, N, 1, N, (T)1, hA[b].data(), lda, hX[b].data(), ldb, (T)0, hB[b].data(), ldb);
 
         // LU factorize hA on the CPU
         info = cblas_getrf(N, N, hA[b].data(), lda, hIpiv.data() + b * strideP);
diff --git a/clients/include/testing_getrs_strided_batched.hpp b/clients/include/testing_getrs_strided_batched.hpp
index 539aab084..c1625571e 100644
--- a/clients/include/testing_getrs_strided_batched.hpp
+++ b/clients/include/testing_getrs_strided_batched.hpp
@@ -95,7 +95,7 @@ hipblasStatus_t testing_getrs_strided_batched(Arguments argus)
         }
 
         // Calculate hB = hA*hX;
-        cblas_gemm<T>(op, op, N, 1, N, 1, hAb, lda, hXb, ldb, 0, hBb, ldb);
+        cblas_gemm(op, op, N, 1, N, (T)1, hAb, lda, hXb, ldb, (T)0, hBb, ldb);
 
         // LU factorize hA on the CPU
         info = cblas_getrf(N, N, hAb, lda, hIpivb);
diff --git a/clients/include/testing_tpsv.hpp b/clients/include/testing_tpsv.hpp
index 8254617fe..81f4959f9 100644
--- a/clients/include/testing_tpsv.hpp
+++ b/clients/include/testing_tpsv.hpp
@@ -72,8 +72,19 @@ hipblasStatus_t testing_tpsv(Arguments argus)
     hipblas_init<T>(hA, N, N, 1);
 
     // calculate AAT = hA * hA ^ T
-    cblas_gemm<T>(
-        HIPBLAS_OP_N, HIPBLAS_OP_T, N, N, N, 1.0, hA.data(), N, hA.data(), N, 0.0, AAT.data(), N);
+    cblas_gemm(HIPBLAS_OP_N,
+               HIPBLAS_OP_T,
+               N,
+               N,
+               N,
+               (T)1.0,
+               hA.data(),
+               N,
+               hA.data(),
+               N,
+               (T)0.0,
+               AAT.data(),
+               N);
 
     // copy AAT into hA, make hA strictly diagonal dominant, and therefore SPD
     for(int i = 0; i < N; i++)
diff --git a/clients/include/testing_tpsv_batched.hpp b/clients/include/testing_tpsv_batched.hpp
index 735c4d9d8..682a6b73c 100644
--- a/clients/include/testing_tpsv_batched.hpp
+++ b/clients/include/testing_tpsv_batched.hpp
@@ -99,7 +99,19 @@ hipblasStatus_t testing_tpsv_batched(Arguments argus)
         hipblas_init<T>(hA[b], N, N, N);
 
         // calculate AAT = hA * hA ^ T
-        cblas_gemm<T>(HIPBLAS_OP_N, HIPBLAS_OP_T, N, N, N, 1.0, hA[b], N, hA[b], N, 0.0, AAT[b], N);
+        cblas_gemm(HIPBLAS_OP_N,
+                   HIPBLAS_OP_T,
+                   N,
+                   N,
+                   N,
+                   (T)1.0,
+                   (T*)hA[b],
+                   N,
+                   (T*)hA[b],
+                   N,
+                   (T)0.0,
+                   (T*)AAT[b],
+                   N);
 
         // copy AAT into hA, make hA strictly diagonal dominant, and therefore SPD
         for(int i = 0; i < N; i++)
diff --git a/clients/include/testing_tpsv_strided_batched.hpp b/clients/include/testing_tpsv_strided_batched.hpp
index 157de0d8d..9a6add631 100644
--- a/clients/include/testing_tpsv_strided_batched.hpp
+++ b/clients/include/testing_tpsv_strided_batched.hpp
@@ -87,7 +87,7 @@ hipblasStatus_t testing_tpsv_strided_batched(Arguments argus)
         T* AATb = AAT.data() + b * strideA;
         T* hbb  = hb.data() + b * stridex;
         // calculate AAT = hA * hA ^ T
-        cblas_gemm<T>(HIPBLAS_OP_N, HIPBLAS_OP_T, N, N, N, 1.0, hAb, N, hAb, N, 0.0, AATb, N);
+        cblas_gemm(HIPBLAS_OP_N, HIPBLAS_OP_T, N, N, N, (T)1.0, hAb, N, hAb, N, (T)0.0, AATb, N);
 
         // copy AAT into hA, make hA strictly diagonal dominant, and therefore SPD
         for(int i = 0; i < N; i++)
diff --git a/clients/include/testing_trsv.hpp b/clients/include/testing_trsv.hpp
index 09c4daf5b..986982682 100644
--- a/clients/include/testing_trsv.hpp
+++ b/clients/include/testing_trsv.hpp
@@ -87,12 +87,12 @@ hipblasStatus_t testing_trsv(Arguments argus)
                M,
                M,
                M,
-               1.0,
+               (T)1.0,
                hA.data(),
                lda,
                hA.data(),
                lda,
-               0.0,
+               (T)0.0,
                AAT.data(),
                lda);
diff --git a/clients/include/testing_trsv_batched.hpp b/clients/include/testing_trsv_batched.hpp
index 023809417..dabca1d31 100644
--- a/clients/include/testing_trsv_batched.hpp
+++ b/clients/include/testing_trsv_batched.hpp
@@ -97,8 +97,19 @@ hipblasStatus_t testing_trsv_batched(Arguments argus)
         hipblas_init<T>(hA[b], M, M, lda);
 
         // calculate AAT = hA * hA ^ T
-        cblas_gemm<T>(
-            HIPBLAS_OP_N, HIPBLAS_OP_T, M, M, M, 1.0, hA[b], lda, hA[b], lda, 0.0, AAT[b], lda);
+        cblas_gemm(HIPBLAS_OP_N,
+                   HIPBLAS_OP_T,
+                   M,
+                   M,
+                   M,
+                   (T)1.0,
+                   (T*)hA[b],
+                   lda,
+                   (T*)hA[b],
+                   lda,
+                   (T)0.0,
+                   (T*)AAT[b],
+                   lda);
 
         // copy AAT into hA, make hA strictly diagonal dominant, and therefore SPD
         for(int i = 0; i < M; i++)
diff --git a/clients/include/testing_trsv_strided_batched.hpp b/clients/include/testing_trsv_strided_batched.hpp
index b11d5b2a2..dbe15ea57 100644
--- a/clients/include/testing_trsv_strided_batched.hpp
+++ b/clients/include/testing_trsv_strided_batched.hpp
@@ -87,7 +87,8 @@ hipblasStatus_t testing_trsv_strided_batched(Arguments argus)
         T* AATb = AAT.data() + b * strideA;
         T* hbb  = hb.data() + b * stridex;
         // calculate AAT = hA * hA ^ T
-        cblas_gemm<T>(HIPBLAS_OP_N, HIPBLAS_OP_T, M, M, M, 1.0, hAb, lda, hAb, lda, 0.0, AATb, lda);
+        cblas_gemm(
+            HIPBLAS_OP_N, HIPBLAS_OP_T, M, M, M, (T)1.0, hAb, lda, hAb, lda, (T)0.0, AATb, lda);
 
         // copy AAT into hA, make hA strictly diagonal dominant, and therefore SPD
         for(int i = 0; i < M; i++)
diff --git a/clients/include/utility.h b/clients/include/utility.h
index a605ecf35..f0ecc437f 100644
--- a/clients/include/utility.h
+++ b/clients/include/utility.h
@@ -328,6 +328,23 @@ inline hipblasDoubleComplex random_generator_negative()
 
 /* ============================================================================================ */
 
+/* ============================================================================================ */
+/*! \brief Packs strided_batched matrices into groups of 4 in N */
+template <typename T>
+void hipblas_packInt8(
+    std::vector<T>& A, size_t M, size_t N, size_t lda, size_t batch_count = 1, size_t stride_a = 0)
+{
+    std::vector<T> temp(A);
+    for(size_t b = 0; b < batch_count; b++)
+        for(size_t colBase = 0; colBase < N; colBase += 4)
+            for(size_t row = 0; row < lda; row++)
+                for(size_t colOffset = 0; colOffset < 4; colOffset++)
+                    A[(colBase * lda + 4 * row) + colOffset + (stride_a * b)]
+                        = temp[(colBase + colOffset) * lda + row + (stride_a * b)];
+}
+
+/* ============================================================================================ */
+
 /* ============================================================================================ */
 /*! \brief  matrix/vector initialization: */
 // for vector x (M=1, N=lengthX, lda=incx);
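hipblas_packInt8 interleaves each group of four columns so that four consecutive bytes in memory come from four adjacent columns of the same row -- the layout the rocBLAS int8 GEMM path expects for non-transposed A (and transposed B), which is why the testing_*_ex templates above pack exactly those cases. A stand-alone illustration of the loop on one 4x4 matrix (demo only, not part of the patch):

    #include <cstdint>
    #include <cstdio>
    #include <vector>

    // Copy of the packing loop above for a single batch. With M = N = lda = 4,
    // element (row r, col c) sits at index c*4 + r before packing and at
    // 4*r + (c % 4) afterwards: each row's four column entries become contiguous.
    int main()
    {
        size_t N = 4, lda = 4;
        std::vector<int8_t> A(N * lda);
        for(size_t i = 0; i < A.size(); i++)
            A[i] = int8_t(i); // column-major: A[c*lda + r] holds the value c*4 + r

        std::vector<int8_t> temp(A);
        for(size_t colBase = 0; colBase < N; colBase += 4)
            for(size_t row = 0; row < lda; row++)
                for(size_t colOffset = 0; colOffset < 4; colOffset++)
                    A[(colBase * lda + 4 * row) + colOffset]
                        = temp[(colBase + colOffset) * lda + row];

        // prints: 0 4 8 12 1 5 9 13 2 6 10 14 3 7 11 15
        for(size_t i = 0; i < A.size(); i++)
            std::printf("%d ", A[i]);
    }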
diff --git a/library/src/hcc_detail/hipblas.cpp b/library/src/hcc_detail/hipblas.cpp
index 6fe3b883b..945035cd6 100644
--- a/library/src/hcc_detail/hipblas.cpp
+++ b/library/src/hcc_detail/hipblas.cpp
@@ -199,6 +199,12 @@ rocblas_datatype HIPDatatypeToRocblasDatatype(hipblasDatatype_t type)
     case HIPBLAS_R_64F:
         return rocblas_datatype_f64_r;
 
+    case HIPBLAS_R_8I:
+        return rocblas_datatype_i8_r;
+
+    case HIPBLAS_R_32I:
+        return rocblas_datatype_i32_r;
+
     case HIPBLAS_C_16F:
         return rocblas_datatype_f16_c;
 
@@ -224,6 +230,12 @@ hipblasDatatype_t RocblasDatatypeToHIPDatatype(rocblas_datatype type)
     case rocblas_datatype_f64_r:
         return HIPBLAS_R_64F;
 
+    case rocblas_datatype_i8_r:
+        return HIPBLAS_R_8I;
+
+    case rocblas_datatype_i32_r:
+        return HIPBLAS_R_32I;
+
     case rocblas_datatype_f16_c:
         return HIPBLAS_C_16F;
 
diff --git a/library/src/nvcc_detail/hipblas.cpp b/library/src/nvcc_detail/hipblas.cpp
index e3dfd58f0..4832358ab 100644
--- a/library/src/nvcc_detail/hipblas.cpp
+++ b/library/src/nvcc_detail/hipblas.cpp
@@ -169,6 +169,12 @@ cudaDataType_t HIPDatatypeToCudaDatatype(hipblasDatatype_t type)
     case HIPBLAS_R_64F:
         return CUDA_R_64F;
 
+    case HIPBLAS_R_8I:
+        return CUDA_R_8I;
+
+    case HIPBLAS_R_32I:
+        return CUDA_R_32I;
+
     case HIPBLAS_C_16F:
         return CUDA_C_16F;