From e0e0e5e24db22646a8c95def5245c453c68c8061 Mon Sep 17 00:00:00 2001 From: Torre Zuk <42548444+TorreZuk@users.noreply.github.com> Date: Thu, 11 Mar 2021 16:23:27 -0700 Subject: [PATCH 1/8] gemv use localhandle and fix error macros (#316) --- clients/gtest/CMakeLists.txt | 3 +- clients/gtest/auxiliary_gtest.cpp | 23 +++++ clients/gtest/set_get_atomics_mode_gtest.cpp | 4 +- clients/gtest/set_get_vector_gtest.cpp | 4 +- clients/include/testing_gemv_batched.hpp | 83 ++++++----------- .../include/testing_gemv_strided_batched.hpp | 92 ++++++++----------- clients/include/utility.h | 48 ++++++---- 7 files changed, 124 insertions(+), 133 deletions(-) create mode 100644 clients/gtest/auxiliary_gtest.cpp diff --git a/clients/gtest/CMakeLists.txt b/clients/gtest/CMakeLists.txt index d53743705..6a54e25a3 100644 --- a/clients/gtest/CMakeLists.txt +++ b/clients/gtest/CMakeLists.txt @@ -1,5 +1,5 @@ # ######################################################################## -# Copyright 2016-2020 Advanced Micro Devices, Inc. +# Copyright 2016-2021 Advanced Micro Devices, Inc. # ######################################################################## # set( Boost_DEBUG ON ) @@ -39,6 +39,7 @@ include_directories(${GTEST_INCLUDE_DIRS}) set(hipblas_test_source hipblas_gtest_main.cpp + auxiliary_gtest.cpp set_get_pointer_mode_gtest.cpp set_get_vector_gtest.cpp set_get_matrix_gtest.cpp diff --git a/clients/gtest/auxiliary_gtest.cpp b/clients/gtest/auxiliary_gtest.cpp new file mode 100644 index 000000000..399cb1714 --- /dev/null +++ b/clients/gtest/auxiliary_gtest.cpp @@ -0,0 +1,23 @@ +/* ************************************************************************ + * Copyright 2016-2021 Advanced Micro Devices, Inc. + * + * ************************************************************************ */ +#pragma once + +#include "utility.h" +#include +#include +#include +#include + +namespace +{ + + TEST(hipblas_auxiliary, statusToString) + { + EXPECT_EQ(0, + strcmp("HIPBLAS_STATUS_ALLOC_FAILED", + hipblasStatusToString(HIPBLAS_STATUS_ALLOC_FAILED))); + } + +} // namespace diff --git a/clients/gtest/set_get_atomics_mode_gtest.cpp b/clients/gtest/set_get_atomics_mode_gtest.cpp index 442ced0e8..2dea1934b 100644 --- a/clients/gtest/set_get_atomics_mode_gtest.cpp +++ b/clients/gtest/set_get_atomics_mode_gtest.cpp @@ -1,5 +1,5 @@ /* ************************************************************************ - * Copyright 2016-2020 Advanced Micro Devices, Inc. + * Copyright 2016-2021 Advanced Micro Devices, Inc. * * ************************************************************************ */ @@ -54,6 +54,6 @@ TEST_P(set_get_atomics_mode_gtest, default) EXPECT_EQ(HIPBLAS_STATUS_SUCCESS, status); } -INSTANTIATE_TEST_SUITE_P(rocblas_auxiliary_small, +INSTANTIATE_TEST_SUITE_P(hipblas_auxiliary_small, set_get_atomics_mode_gtest, Combine(ValuesIn(is_fortran))); diff --git a/clients/gtest/set_get_vector_gtest.cpp b/clients/gtest/set_get_vector_gtest.cpp index 1b287c573..f4c40beb0 100644 --- a/clients/gtest/set_get_vector_gtest.cpp +++ b/clients/gtest/set_get_vector_gtest.cpp @@ -1,5 +1,5 @@ /* ************************************************************************ - * Copyright 2016-2020 Advanced Micro Devices, Inc. + * Copyright 2016-2021 Advanced Micro Devices, Inc. 
* * ************************************************************************ */ @@ -158,7 +158,7 @@ TEST_P(set_vector_get_vector_gtest, async_float) // ValuesIn take each element (a vector) and combine them and feed them to test_p // The combinations are { {M, N, lda}, {incx,incy} {alpha} } -INSTANTIATE_TEST_SUITE_P(rocblas_auxiliary_small, +INSTANTIATE_TEST_SUITE_P(hipblas_auxiliary_small, set_vector_get_vector_gtest, Combine(ValuesIn(M_range), ValuesIn(incx_incy_incd_range), diff --git a/clients/include/testing_gemv_batched.hpp b/clients/include/testing_gemv_batched.hpp index b379ac1ce..a436e659b 100644 --- a/clients/include/testing_gemv_batched.hpp +++ b/clients/include/testing_gemv_batched.hpp @@ -1,5 +1,5 @@ /* ************************************************************************ - * Copyright 2016-2020 Advanced Micro Devices, Inc. + * Copyright 2016-2021 Advanced Micro Devices, Inc. * * ************************************************************************ */ @@ -63,8 +63,7 @@ hipblasStatus_t testing_gemvBatched(const Arguments& argus) return HIPBLAS_STATUS_SUCCESS; } - hipblasHandle_t handle; - hipblasCreate(&handle); + hipblasLocalHandle handle(argus); double gpu_time_used, cpu_time_used; double hipblasGflops, cblas_gflops, hipblasBandwidth; @@ -93,7 +92,6 @@ hipblasStatus_t testing_gemvBatched(const Arguments& argus) if(!dA_array || !dx_array || !dy_array || (!bA_array[last] && A_size) || (!bx_array[last] && X_size) || (!by_array[last] && Y_size)) { - hipblasDestroy(handle); return HIPBLAS_STATUS_ALLOC_FAILED; } @@ -120,7 +118,6 @@ hipblasStatus_t testing_gemvBatched(const Arguments& argus) if(err_A != hipSuccess || err_x != hipSuccess || err_y != hipSuccess) { - hipblasDestroy(handle); return HIPBLAS_STATUS_MAPPING_ERROR; } } @@ -130,7 +127,6 @@ hipblasStatus_t testing_gemvBatched(const Arguments& argus) err_y = hipMemcpy(dy_array, by_array, batch_count * sizeof(T*), hipMemcpyHostToDevice); if(err_A != hipSuccess || err_x != hipSuccess || err_y != hipSuccess) { - hipblasDestroy(handle); return HIPBLAS_STATUS_MAPPING_ERROR; } @@ -139,26 +135,19 @@ hipblasStatus_t testing_gemvBatched(const Arguments& argus) =================================================================== */ if(argus.unit_check || argus.norm_check) { - status = hipblasGemvBatchedFn(handle, - transA, - M, - N, - (T*)&alpha, - dA_array, - lda, - dx_array, - incx, - (T*)&beta, - dy_array, - incy, - batch_count); - - if(status != HIPBLAS_STATUS_SUCCESS) - { - // here in cuda - hipblasDestroy(handle); - return status; - } + CHECK_HIPBLAS_ERROR(hipblasGemvBatchedFn(handle, + transA, + M, + N, + (T*)&alpha, + dA_array, + lda, + dx_array, + incx, + (T*)&beta, + dy_array, + incy, + batch_count)); /* ===================================================================== CPU BLAS @@ -192,12 +181,8 @@ hipblasStatus_t testing_gemvBatched(const Arguments& argus) if(argus.timing) { hipStream_t stream; - status = hipblasGetStream(handle, &stream); - if(status != HIPBLAS_STATUS_SUCCESS) - { - hipblasDestroy(handle); - return status; - } + CHECK_HIPBLAS_ERROR(hipblasGetStream(handle, &stream)); + int runs = argus.cold_iters + argus.iters; for(int iter = 0; iter < runs; iter++) { @@ -205,26 +190,19 @@ hipblasStatus_t testing_gemvBatched(const Arguments& argus) { gpu_time_used = get_time_us_sync(stream); } - status = hipblasGemvBatchedFn(handle, - transA, - M, - N, - (T*)&alpha, - dA_array, - lda, - dx_array, - incx, - (T*)&beta, - dy_array, - incy, - batch_count); - - if(status != HIPBLAS_STATUS_SUCCESS) - { - // here in cuda - 
hipblasDestroy(handle); - return status; - } + CHECK_HIPBLAS_ERROR(hipblasGemvBatchedFn(handle, + transA, + M, + N, + (T*)&alpha, + dA_array, + lda, + dx_array, + incx, + (T*)&beta, + dy_array, + incy, + batch_count)); } gpu_time_used = get_time_us_sync(stream) - gpu_time_used; @@ -245,6 +223,5 @@ hipblasStatus_t testing_gemvBatched(const Arguments& argus) rocblas_error); } - hipblasDestroy(handle); return HIPBLAS_STATUS_SUCCESS; } diff --git a/clients/include/testing_gemv_strided_batched.hpp b/clients/include/testing_gemv_strided_batched.hpp index 37e15b640..f02774530 100644 --- a/clients/include/testing_gemv_strided_batched.hpp +++ b/clients/include/testing_gemv_strided_batched.hpp @@ -1,5 +1,5 @@ /* ************************************************************************ - * Copyright 2016-2020 Advanced Micro Devices, Inc. + * Copyright 2016-2021 Advanced Micro Devices, Inc. * * ************************************************************************ */ @@ -84,8 +84,7 @@ hipblasStatus_t testing_gemvStridedBatched(const Arguments& argus) T alpha = (T)argus.alpha; T beta = (T)argus.beta; - hipblasHandle_t handle; - hipblasCreate(&handle); + hipblasLocalHandle handle(argus); // Initial Data on CPU srand(1); @@ -107,29 +106,22 @@ hipblasStatus_t testing_gemvStridedBatched(const Arguments& argus) HIPBLAS =================================================================== */ - status = hipblasGemvStridedBatchedFn(handle, - transA, - M, - N, - (T*)&alpha, - dA, - lda, - stride_A, - dx, - incx, - stride_x, - (T*)&beta, - dy, - incy, - stride_y, - batch_count); - - if(status != HIPBLAS_STATUS_SUCCESS) - { - // here in cuda - hipblasDestroy(handle); - return status; - } + CHECK_HIPBLAS_ERROR(hipblasGemvStridedBatchedFn(handle, + transA, + M, + N, + (T*)&alpha, + dA, + lda, + stride_A, + dx, + incx, + stride_x, + (T*)&beta, + dy, + incy, + stride_y, + batch_count)); /* ===================================================================== CPU BLAS @@ -168,12 +160,8 @@ hipblasStatus_t testing_gemvStridedBatched(const Arguments& argus) if(argus.timing) { hipStream_t stream; - status = hipblasGetStream(handle, &stream); - if(status != HIPBLAS_STATUS_SUCCESS) - { - hipblasDestroy(handle); - return status; - } + CHECK_HIPBLAS_ERROR(hipblasGetStream(handle, &stream)); + int runs = argus.cold_iters + argus.iters; for(int iter = 0; iter < runs; iter++) { @@ -181,29 +169,22 @@ hipblasStatus_t testing_gemvStridedBatched(const Arguments& argus) { gpu_time_used = get_time_us_sync(stream); } - status = hipblasGemvStridedBatchedFn(handle, - transA, - M, - N, - (T*)&alpha, - dA, - lda, - stride_A, - dx, - incx, - stride_x, - (T*)&beta, - dy, - incy, - stride_y, - batch_count); - - if(status != HIPBLAS_STATUS_SUCCESS) - { - // here in cuda - hipblasDestroy(handle); - return status; - } + CHECK_HIPBLAS_ERROR(hipblasGemvStridedBatchedFn(handle, + transA, + M, + N, + (T*)&alpha, + dA, + lda, + stride_A, + dx, + incx, + stride_x, + (T*)&beta, + dy, + incy, + stride_y, + batch_count)); } gpu_time_used = get_time_us_sync(stream) - gpu_time_used; @@ -228,6 +209,5 @@ hipblasStatus_t testing_gemvStridedBatched(const Arguments& argus) rocblas_error); } - hipblasDestroy(handle); return HIPBLAS_STATUS_SUCCESS; } diff --git a/clients/include/utility.h b/clients/include/utility.h index b8c5b15c4..f2f1f0eaa 100644 --- a/clients/include/utility.h +++ b/clients/include/utility.h @@ -28,30 +28,40 @@ * \brief provide data initialization, timing, hipblas type <-> lapack char conversion utilities. 
*/ -#define CHECK_HIP_ERROR(error) \ - do \ - { \ - if(error != hipSuccess) \ - { \ - fprintf(stderr, \ - "error: '%s'(%d) at %s:%d\n", \ - hipGetErrorString(error), \ - error, \ - __FILE__, \ - __LINE__); \ - exit(EXIT_FAILURE); \ - } \ +#define CHECK_HIP_ERROR(error) \ + do \ + { \ + hipError_t error__ = (error); \ + if(error__ != hipSuccess) \ + { \ + fprintf(stderr, \ + "hip error: '%s'(%d) at %s:%d\n", \ + hipGetErrorString(error__), \ + error__, \ + __FILE__, \ + __LINE__); \ + exit(EXIT_FAILURE); \ + } \ } while(0) #ifdef __cplusplus #ifndef CHECK_HIPBLAS_ERROR -#define CHECK_HIPBLAS_ERROR(error) \ - if(error != HIPBLAS_STATUS_SUCCESS) \ - { \ - fprintf(stderr, "hipBLAS error: %s\n", hipblasStatusToString(error)); \ - return (error); \ - } +#define EXPECT_HIPBLAS_STATUS(status, expected) \ + do \ + { \ + hipblasStatus_t status__ = (status); \ + if(status__ != expected) \ + { \ + fprintf(stderr, \ + "hipBLAS error: %s at %s:%d\n", \ + hipblasStatusToString(status__), \ + __FILE__, \ + __LINE__); \ + return (status__); \ + } \ + } while(0) +#define CHECK_HIPBLAS_ERROR(STATUS) EXPECT_HIPBLAS_STATUS(STATUS, HIPBLAS_STATUS_SUCCESS) #endif #define BLAS_1_RESULT_PRINT \ From f46b70d55bae04b7fe86005094fc87396a1e5304 Mon Sep 17 00:00:00 2001 From: Daine McNiven <51674140+daineAMD@users.noreply.github.com> Date: Fri, 12 Mar 2021 11:02:27 -0700 Subject: [PATCH 2/8] Adding iamax, iamin, and nrm2 to hipblas-bench (#317) --- clients/benchmarks/client.cpp | 36 ++-- clients/gtest/blas1_gtest.cpp | 12 +- clients/include/bytes.hpp | 7 + clients/include/flops.hpp | 7 + clients/include/testing_iamax.hpp | 124 ------------ clients/include/testing_iamax_iamin.hpp | 149 +++++++------- .../include/testing_iamax_iamin_batched.hpp | 186 ++++++++---------- .../testing_iamax_iamin_strided_batched.hpp | 159 ++++++++------- clients/include/testing_nrm2.hpp | 107 +++++----- clients/include/testing_nrm2_batched.hpp | 127 ++++++------ .../include/testing_nrm2_strided_batched.hpp | 115 ++++++----- 11 files changed, 460 insertions(+), 569 deletions(-) delete mode 100644 clients/include/testing_iamax.hpp diff --git a/clients/benchmarks/client.cpp b/clients/benchmarks/client.cpp index 3016aecdc..945254c67 100644 --- a/clients/benchmarks/client.cpp +++ b/clients/benchmarks/client.cpp @@ -330,6 +330,15 @@ struct perf_blas{} || std::is_same {"dot", testing_dot}, {"dot_batched", testing_dot_batched}, {"dot_strided_batched", testing_dot_strided_batched}, + {"iamax", testing_amax}, + {"iamax_batched", testing_amax_batched}, + {"iamax_strided_batched", testing_amax_strided_batched}, + {"iamin", testing_amin}, + {"iamin_batched", testing_amin_batched}, + {"iamin_strided_batched", testing_amin_strided_batched}, + {"nrm2", testing_nrm2}, + {"nrm2_batched", testing_nrm2_batched}, + {"nrm2_strided_batched", testing_nrm2_strided_batched}, {"swap", testing_swap}, {"swap_batched", testing_swap_batched}, {"swap_strided_batched", testing_swap_strided_batched}, @@ -340,15 +349,6 @@ struct perf_blas{} || std::is_same {"set_get_matrix", testing_set_get_matrix}, {"set_get_matrix_async", testing_set_get_matrix_async}, // L1 - {"iamax", testing_iamax}, - {"iamax_batched", testing_iamax_batched}, - {"iamax_strided_batched", testing_iamax_strided_batched}, - {"iamin", testing_iamin}, - {"iamin_batched", testing_iamin_batched}, - {"iamin_strided_batched", testing_iamin_strided_batched}, - {"nrm2", testing_nrm2}, - {"nrm2_batched", testing_nrm2_batched}, - {"nrm2_strided_batched", testing_nrm2_strided_batched}, {"rotm", testing_rotm}, 
{"rotm_batched", testing_rotm_batched}, {"rotm_strided_batched", testing_rotm_strided_batched}, @@ -508,6 +508,15 @@ struct perf_blas< {"dotc", testing_dotc}, {"dotc_batched", testing_dotc_batched}, {"dotc_strided_batched", testing_dotc_strided_batched}, + {"iamax", testing_amax}, + {"iamax_batched", testing_amax_batched}, + {"iamax_strided_batched", testing_amax_strided_batched}, + {"iamin", testing_amin}, + {"iamin_batched", testing_amin_batched}, + {"iamin_strided_batched", testing_amin_strided_batched}, + {"nrm2", testing_nrm2}, + {"nrm2_batched", testing_nrm2_batched}, + {"nrm2_strided_batched", testing_nrm2_strided_batched}, {"swap", testing_swap}, {"swap_batched", testing_swap_batched}, {"swap_strided_batched", testing_swap_strided_batched}, @@ -515,15 +524,6 @@ struct perf_blas< {"scal_batched", testing_scal_batched}, {"scal_strided_batched", testing_scal_strided_batched}, /* - {"iamax", testing_iamax}, - {"iamax_batched", testing_iamax_batched}, - {"iamax_strided_batched", testing_iamax_strided_batched}, - {"iamin", testing_iamin}, - {"iamin_batched", testing_iamin_batched}, - {"iamin_strided_batched", testing_iamin_strided_batched}, - {"nrm2", testing_nrm2}, - {"nrm2_batched", testing_nrm2_batched}, - {"nrm2_strided_batched", testing_nrm2_strided_batched}, // L2 {"gbmv", testing_gbmv}, {"gbmv_batched", testing_gbmv_batched}, diff --git a/clients/gtest/blas1_gtest.cpp b/clients/gtest/blas1_gtest.cpp index c2c56d49d..1225da1e3 100644 --- a/clients/gtest/blas1_gtest.cpp +++ b/clients/gtest/blas1_gtest.cpp @@ -1382,7 +1382,7 @@ TEST_P(blas1_gtest, nrm2_float) // The Arguments data struture have physical meaning associated. // while the tuple is non-intuitive. Arguments arg = setup_blas1_arguments(GetParam()); - hipblasStatus_t status = testing_nrm2(arg); + hipblasStatus_t status = testing_nrm2(arg); // if not success, then the input argument is problematic, so detect the error message if(status != HIPBLAS_STATUS_SUCCESS) { @@ -1408,7 +1408,7 @@ TEST_P(blas1_gtest, nrm2_float_complex) // The Arguments data struture have physical meaning associated. // while the tuple is non-intuitive. Arguments arg = setup_blas1_arguments(GetParam()); - hipblasStatus_t status = testing_nrm2(arg); + hipblasStatus_t status = testing_nrm2(arg); // if not success, then the input argument is problematic, so detect the error message if(status != HIPBLAS_STATUS_SUCCESS) { @@ -1435,7 +1435,7 @@ TEST_P(blas1_gtest, nrm2_batched_float) // The Arguments data struture have physical meaning associated. // while the tuple is non-intuitive. Arguments arg = setup_blas1_arguments(GetParam()); - hipblasStatus_t status = testing_nrm2_batched(arg); + hipblasStatus_t status = testing_nrm2_batched(arg); // if not success, then the input argument is problematic, so detect the error message if(status != HIPBLAS_STATUS_SUCCESS) { @@ -1465,7 +1465,7 @@ TEST_P(blas1_gtest, nrm2_batched_float_complex) // The Arguments data struture have physical meaning associated. // while the tuple is non-intuitive. Arguments arg = setup_blas1_arguments(GetParam()); - hipblasStatus_t status = testing_nrm2_batched(arg); + hipblasStatus_t status = testing_nrm2_batched(arg); // if not success, then the input argument is problematic, so detect the error message if(status != HIPBLAS_STATUS_SUCCESS) { @@ -1496,7 +1496,7 @@ TEST_P(blas1_gtest, nrm2_strided_batched_float) // The Arguments data struture have physical meaning associated. // while the tuple is non-intuitive. 
Arguments arg = setup_blas1_arguments(GetParam()); - hipblasStatus_t status = testing_nrm2_strided_batched(arg); + hipblasStatus_t status = testing_nrm2_strided_batched(arg); // if not success, then the input argument is problematic, so detect the error message if(status != HIPBLAS_STATUS_SUCCESS) { @@ -1526,7 +1526,7 @@ TEST_P(blas1_gtest, nrm2_strided_batched_float_complex) // The Arguments data struture have physical meaning associated. // while the tuple is non-intuitive. Arguments arg = setup_blas1_arguments(GetParam()); - hipblasStatus_t status = testing_nrm2_strided_batched(arg); + hipblasStatus_t status = testing_nrm2_strided_batched(arg); // if not success, then the input argument is problematic, so detect the error message if(status != HIPBLAS_STATUS_SUCCESS) { diff --git a/clients/include/bytes.hpp b/clients/include/bytes.hpp index b9993675e..1333ba4da 100644 --- a/clients/include/bytes.hpp +++ b/clients/include/bytes.hpp @@ -61,6 +61,13 @@ constexpr double dot_gbyte_count(int n) return (sizeof(T) * 2.0 * n) / 1e9; } +/* \brief byte counts of iamax/iamin */ +template +constexpr double iamax_gbyte_count(int n) +{ + return (sizeof(T) * 2.0 * n) / 1e9; +} + /* \brief byte counts of NRM2 */ template constexpr double nrm2_gbyte_count(int n) diff --git a/clients/include/flops.hpp b/clients/include/flops.hpp index fb5ee1694..addd27b75 100644 --- a/clients/include/flops.hpp +++ b/clients/include/flops.hpp @@ -93,6 +93,13 @@ constexpr double dot_gflop_count(int n) return (9.0 * n) / 1e9; } +// iamax/iamin +template +constexpr double iamax_gflop_count(int n) +{ + return (1.0 * n) / 1e9; +} + // nrm2 template constexpr double nrm2_gflop_count(int n) diff --git a/clients/include/testing_iamax.hpp b/clients/include/testing_iamax.hpp deleted file mode 100644 index 4ebc59be7..000000000 --- a/clients/include/testing_iamax.hpp +++ /dev/null @@ -1,124 +0,0 @@ -/* ************************************************************************ - * Copyright 2016-2020 Advanced Micro Devices, Inc. - * - * ************************************************************************ */ - -#include -#include -#include - -#include "testing_common.hpp" - -using namespace std; - -/* ============================================================================================ */ - -template -hipblasStatus_t testing_amax(const Arguments& argus) -{ - bool FORTRAN = argus.fortran; - auto hipblasIamaxFn = FORTRAN ? hipblasIamax : hipblasIamax; - - int N = argus.N; - int incx = argus.incx; - - hipblasStatus_t status_1 = HIPBLAS_STATUS_SUCCESS; - hipblasStatus_t status_2 = HIPBLAS_STATUS_SUCCESS; - hipblasStatus_t status_3 = HIPBLAS_STATUS_SUCCESS; - - hipblasHandle_t handle; - hipblasCreate(&handle); - - T* dx; - int* d_rocblas_result; - - int cpu_result, rocblas_result1, rocblas_result2; - int zero = 0; - - // check to prevent undefined memory allocation error - if(N < 1 || incx <= 0) - { - CHECK_HIP_ERROR(hipMalloc(&dx, 100 * sizeof(T))); - CHECK_HIP_ERROR(hipMalloc(&d_rocblas_result, sizeof(int))); - - status_1 = hipblasIamaxFn(handle, N, dx, incx, &rocblas_result1); - - unit_check_general(1, 1, 1, &zero, &rocblas_result1); - } - else - { - int sizeX = N * incx; - - // Naming: dX is in GPU (device) memory. 
hK is in CPU (host) memory, plz follow this - // practice - vector hx(sizeX); - - // allocate memory on device - CHECK_HIP_ERROR(hipMalloc(&dx, sizeX * sizeof(T))); - CHECK_HIP_ERROR(hipMalloc(&d_rocblas_result, sizeof(int))); - - // Initial Data on CPU - srand(1); - hipblas_init(hx, 1, N, incx); - - // copy data from CPU to device, does not work for incx != 1 - CHECK_HIP_ERROR(hipMemcpy(dx, hx.data(), sizeof(T) * N * incx, hipMemcpyHostToDevice)); - - /* ===================================================================== - HIP BLAS - =================================================================== */ - // device_pointer for d_rocblas_result - { - - status_3 = hipblasSetPointerMode(handle, HIPBLAS_POINTER_MODE_DEVICE); - - status_1 = hipblasIamaxFn(handle, N, dx, incx, d_rocblas_result); - - CHECK_HIP_ERROR( - hipMemcpy(&rocblas_result1, d_rocblas_result, sizeof(int), hipMemcpyDeviceToHost)); - } - // host_pointer for rocblas_result2 - if((status_1 == HIPBLAS_STATUS_SUCCESS) && (status_3 == HIPBLAS_STATUS_SUCCESS)) - { - status_3 = hipblasSetPointerMode(handle, HIPBLAS_POINTER_MODE_HOST); - - status_2 = hipblasIamaxFn(handle, N, dx, incx, &rocblas_result2); - } - - if((status_1 == HIPBLAS_STATUS_SUCCESS) && (status_2 == HIPBLAS_STATUS_SUCCESS) - && (status_3 == HIPBLAS_STATUS_SUCCESS)) - { - /* ===================================================================== - CPU BLAS - =================================================================== */ - cblas_iamax(N, hx.data(), incx, &cpu_result); - // change to Fortran 1 based indexing as in BLAS standard, not cblas zero based indexing - cpu_result += 1; - - unit_check_general(1, 1, 1, &cpu_result, &rocblas_result1); - unit_check_general(1, 1, 1, &cpu_result, &rocblas_result2); - - } // end of if unit/norm check - } - - CHECK_HIP_ERROR(hipFree(dx)); - CHECK_HIP_ERROR(hipFree(d_rocblas_result)); - hipblasDestroy(handle); - - if(status_1 != HIPBLAS_STATUS_SUCCESS) - { - return status_1; - } - else if(status_2 != HIPBLAS_STATUS_SUCCESS) - { - return status_2; - } - else if(status_3 != HIPBLAS_STATUS_SUCCESS) - { - return status_3; - } - else - { - return HIPBLAS_STATUS_SUCCESS; - } -} diff --git a/clients/include/testing_iamax_iamin.hpp b/clients/include/testing_iamax_iamin.hpp index dff52b469..149899307 100644 --- a/clients/include/testing_iamax_iamin.hpp +++ b/clients/include/testing_iamax_iamin.hpp @@ -1,5 +1,5 @@ /* ************************************************************************ - * Copyright 2016-2020 Advanced Micro Devices, Inc. + * Copyright 2016-2021 Advanced Micro Devices, Inc. 
* * ************************************************************************ */ @@ -21,105 +21,102 @@ hipblasStatus_t testing_iamax_iamin(const Arguments& argus, hipblas_iamax_iamin_ int N = argus.N; int incx = argus.incx; - hipblasStatus_t status_1 = HIPBLAS_STATUS_SUCCESS; - hipblasStatus_t status_2 = HIPBLAS_STATUS_SUCCESS; - hipblasStatus_t status_3 = HIPBLAS_STATUS_SUCCESS; + hipblasLocalHandle handle(argus); - hipblasHandle_t handle; - hipblasCreate(&handle); - - T* dx; - int* d_rocblas_result; - - int cpu_result, rocblas_result1, rocblas_result2; int zero = 0; // check to prevent undefined memory allocation error if(N < 1 || incx <= 0) { - CHECK_HIP_ERROR(hipMalloc(&dx, 100 * sizeof(T))); - CHECK_HIP_ERROR(hipMalloc(&d_rocblas_result, sizeof(int))); + device_vector dx(100); + int hipblas_result; + CHECK_HIPBLAS_ERROR(func(handle, N, dx, incx, &hipblas_result)); - status_1 = func(handle, N, dx, incx, &rocblas_result1); - - unit_check_general(1, 1, 1, &zero, &rocblas_result1); + unit_check_general(1, 1, 1, &zero, &hipblas_result); + return HIPBLAS_STATUS_SUCCESS; } - else - { - int sizeX = N * incx; - // Naming: dX is in GPU (device) memory. hK is in CPU (host) memory, plz follow this - // practice - vector hx(sizeX); + int sizeX = N * incx; - // allocate memory on device - CHECK_HIP_ERROR(hipMalloc(&dx, sizeX * sizeof(T))); - CHECK_HIP_ERROR(hipMalloc(&d_rocblas_result, sizeof(int))); + // Naming: dX is in GPU (device) memory. hK is in CPU (host) memory, plz follow this + // practice + host_vector hx(sizeX); + int cpu_result, hipblas_result_host, hipblas_result_device; - // Initial Data on CPU - srand(1); - hipblas_init(hx, 1, N, incx); + device_vector dx(sizeX); + device_vector d_hipblas_result(1); - // copy data from CPU to device, does not work for incx != 1 - CHECK_HIP_ERROR(hipMemcpy(dx, hx.data(), sizeof(T) * N * incx, hipMemcpyHostToDevice)); + // Initial Data on CPU + srand(1); + hipblas_init(hx, 1, N, incx); - /* ===================================================================== - HIP BLAS - =================================================================== */ - // device_pointer for d_rocblas_result - { + // copy data from CPU to device, does not work for incx != 1 + CHECK_HIP_ERROR(hipMemcpy(dx, hx.data(), sizeof(T) * N * incx, hipMemcpyHostToDevice)); - status_3 = hipblasSetPointerMode(handle, HIPBLAS_POINTER_MODE_DEVICE); + double gpu_time_used; + int hipblas_error_host, hipblas_error_device; - status_1 = func(handle, N, dx, incx, d_rocblas_result); + /* ===================================================================== + HIPBLAS + =================================================================== */ + // device_pointer + CHECK_HIPBLAS_ERROR(hipblasSetPointerMode(handle, HIPBLAS_POINTER_MODE_DEVICE)); + CHECK_HIPBLAS_ERROR(func(handle, N, dx, incx, d_hipblas_result)); - CHECK_HIP_ERROR( - hipMemcpy(&rocblas_result1, d_rocblas_result, sizeof(int), hipMemcpyDeviceToHost)); + CHECK_HIP_ERROR( + hipMemcpy(&hipblas_result_device, d_hipblas_result, sizeof(int), hipMemcpyDeviceToHost)); + + // host_pointer + CHECK_HIPBLAS_ERROR(hipblasSetPointerMode(handle, HIPBLAS_POINTER_MODE_HOST)); + CHECK_HIPBLAS_ERROR(func(handle, N, dx, incx, &hipblas_result_host)); + + if(argus.unit_check || argus.norm_check) + { + /* ===================================================================== + CPU BLAS + =================================================================== */ + REFBLAS_FUNC(N, hx.data(), incx, &cpu_result); + // change to Fortran 1 based indexing as in BLAS standard, 
not cblas zero based indexing + cpu_result += 1; + + if(argus.unit_check) + { + unit_check_general(1, 1, 1, &cpu_result, &hipblas_result_host); + unit_check_general(1, 1, 1, &cpu_result, &hipblas_result_device); } - // host_pointer for rocblas_result2 - if((status_1 == HIPBLAS_STATUS_SUCCESS) && (status_3 == HIPBLAS_STATUS_SUCCESS)) + if(argus.norm_check) { - status_3 = hipblasSetPointerMode(handle, HIPBLAS_POINTER_MODE_HOST); - - status_2 = func(handle, N, dx, incx, &rocblas_result2); + hipblas_error_host = std::abs(hipblas_result_host - cpu_result); + hipblas_error_device = std::abs(hipblas_result_device - cpu_result); } + } - if((status_1 == HIPBLAS_STATUS_SUCCESS) && (status_2 == HIPBLAS_STATUS_SUCCESS) - && (status_3 == HIPBLAS_STATUS_SUCCESS)) - { - /* ===================================================================== - CPU BLAS - =================================================================== */ - REFBLAS_FUNC(N, hx.data(), incx, &cpu_result); - // change to Fortran 1 based indexing as in BLAS standard, not cblas zero based indexing - cpu_result += 1; + if(argus.timing) + { + hipStream_t stream; + CHECK_HIPBLAS_ERROR(hipblasGetStream(handle, &stream)); + CHECK_HIPBLAS_ERROR(hipblasSetPointerMode(handle, HIPBLAS_POINTER_MODE_DEVICE)); - unit_check_general(1, 1, 1, &cpu_result, &rocblas_result1); - unit_check_general(1, 1, 1, &cpu_result, &rocblas_result2); + int runs = argus.cold_iters + argus.iters; + for(int iter = 0; iter < runs; iter++) + { + if(iter == argus.cold_iters) + gpu_time_used = get_time_us_sync(stream); - } // end of if unit/norm check + CHECK_HIPBLAS_ERROR(func(handle, N, dx, incx, d_hipblas_result)); + } + gpu_time_used = get_time_us_sync(stream) - gpu_time_used; + + ArgumentModel{}.log_args(std::cout, + argus, + gpu_time_used, + iamax_gflop_count(N), + iamax_gbyte_count(N), + hipblas_error_host, + hipblas_error_device); } - CHECK_HIP_ERROR(hipFree(dx)); - CHECK_HIP_ERROR(hipFree(d_rocblas_result)); - hipblasDestroy(handle); - - if(status_1 != HIPBLAS_STATUS_SUCCESS) - { - return status_1; - } - else if(status_2 != HIPBLAS_STATUS_SUCCESS) - { - return status_2; - } - else if(status_3 != HIPBLAS_STATUS_SUCCESS) - { - return status_3; - } - else - { - return HIPBLAS_STATUS_SUCCESS; - } + return HIPBLAS_STATUS_SUCCESS; } template diff --git a/clients/include/testing_iamax_iamin_batched.hpp b/clients/include/testing_iamax_iamin_batched.hpp index a35260dec..edfb28ddc 100644 --- a/clients/include/testing_iamax_iamin_batched.hpp +++ b/clients/include/testing_iamax_iamin_batched.hpp @@ -1,5 +1,5 @@ /* ************************************************************************ - * Copyright 2016-2020 Advanced Micro Devices, Inc. + * Copyright 2016-2021 Advanced Micro Devices, Inc. 
* * ************************************************************************ */ @@ -23,137 +23,125 @@ hipblasStatus_t testing_iamax_iamin_batched(const Arguments& arg int incx = argus.incx; int batch_count = argus.batch_count; - hipblasStatus_t status_1 = HIPBLAS_STATUS_SUCCESS; - hipblasStatus_t status_2 = HIPBLAS_STATUS_SUCCESS; - hipblasStatus_t status_3 = HIPBLAS_STATUS_SUCCESS; - - hipblasHandle_t handle; - hipblasCreate(&handle); + hipblasLocalHandle handle(argus); + int zero = 0; // check to prevent undefined memory allocation error if(batch_count == 0) { // quick return success or invalid value - device_vector dx_array(5); - device_vector d_rocblas_result(1); + device_batch_vector dx(1, 1, 5); + device_vector d_hipblas_result(1); - status_1 = func(handle, N, dx_array, incx, batch_count, d_rocblas_result); + return func(handle, N, dx, incx, batch_count, d_hipblas_result); } else if(batch_count < 0) { - status_1 = HIPBLAS_STATUS_INVALID_VALUE; + return HIPBLAS_STATUS_INVALID_VALUE; } else if(N < 1 || incx <= 0) { // quick return success - device_vector dx_array(5); - host_vector h_rocblas_result(batch_count); - host_vector h_zeros(batch_count); + device_batch_vector dx(1, 1, 5); + host_vector h_hipblas_result(batch_count); + host_vector h_zeros(batch_count); for(int b = 0; b < batch_count; b++) h_zeros[b] = 0; - status_1 = func(handle, N, dx_array, incx, batch_count, h_rocblas_result); - unit_check_general(1, 1, batch_count, h_zeros, h_rocblas_result); + CHECK_HIPBLAS_ERROR(func(handle, N, dx, incx, batch_count, h_hipblas_result)); + unit_check_general(1, 1, batch_count, h_zeros, h_hipblas_result); + return HIPBLAS_STATUS_SUCCESS; } - else + host_batch_vector hx(N, incx, batch_count); + host_vector cpu_result(batch_count); + host_vector hipblas_result_host(batch_count); + host_vector hipblas_result_device(batch_count); + + device_batch_vector dx(N, incx, batch_count); + device_vector d_hipblas_result_device(batch_count); + CHECK_HIP_ERROR(dx.memcheck()); + + // Initial Data on CPU + hipblas_init(hx, true); + CHECK_HIP_ERROR(dx.transfer_from(hx)); + + double gpu_time_used; + int hipblas_error_host = 0, hipblas_error_device = 0; + + /* ===================================================================== + HIPBLAS + =================================================================== */ + // device_pointer + CHECK_HIPBLAS_ERROR(hipblasSetPointerMode(handle, HIPBLAS_POINTER_MODE_DEVICE)); + CHECK_HIPBLAS_ERROR( + func(handle, N, dx.ptr_on_device(), incx, batch_count, d_hipblas_result_device)); + CHECK_HIP_ERROR(hipMemcpy(hipblas_result_device, + d_hipblas_result_device, + sizeof(int) * batch_count, + hipMemcpyDeviceToHost)); + + // host_pointer + CHECK_HIPBLAS_ERROR(hipblasSetPointerMode(handle, HIPBLAS_POINTER_MODE_HOST)); + CHECK_HIPBLAS_ERROR( + func(handle, N, dx.ptr_on_device(), incx, batch_count, hipblas_result_host)); + + if(argus.unit_check || argus.norm_check) { - host_vector cpu_result(batch_count); - host_vector rocblas_result_host(batch_count); - host_vector h_rocblas_result_device(batch_count); - device_vector rocblas_result_device(batch_count); - - int sizeX = N * incx; - - // Naming: dX is in GPU (device) memory. 
hK is in CPU (host) memory, plz follow this - // practice - host_vector hx_array[batch_count]; - - device_batch_vector bx_array(batch_count, sizeX); - - device_vector dx_array(batch_count); - - if(!dx_array || (!bx_array[batch_count - 1] && sizeX)) - { - hipblasDestroy(handle); - return HIPBLAS_STATUS_ALLOC_FAILED; - } - - // Initial Data on CPU - srand(1); - for(int b = 0; b < batch_count; b++) - { - hx_array[b] = host_vector(sizeX); - - srand(1); - hipblas_init(hx_array[b], 1, N, incx); - - CHECK_HIP_ERROR( - hipMemcpy(bx_array[b], hx_array[b], sizeof(T) * sizeX, hipMemcpyHostToDevice)); - } - CHECK_HIP_ERROR( - hipMemcpy(dx_array, bx_array, batch_count * sizeof(T*), hipMemcpyHostToDevice)); - /* ===================================================================== - HIP BLAS + CPU BLAS =================================================================== */ - // device_pointer for d_rocblas_result + for(int b = 0; b < batch_count; b++) { - - status_3 = hipblasSetPointerMode(handle, HIPBLAS_POINTER_MODE_DEVICE); - - status_1 = func(handle, N, dx_array, incx, batch_count, rocblas_result_device); - - CHECK_HIP_ERROR(hipMemcpy(h_rocblas_result_device, - rocblas_result_device, - sizeof(int) * batch_count, - hipMemcpyDeviceToHost)); + REFBLAS_FUNC(N, hx[b], incx, cpu_result + b); + // change to Fortran 1 based indexing as in BLAS standard, not cblas zero based indexing + cpu_result[b] += 1; } - // host_pointer for rocblas_result2 - if((status_1 == HIPBLAS_STATUS_SUCCESS) && (status_3 == HIPBLAS_STATUS_SUCCESS)) - { - status_3 = hipblasSetPointerMode(handle, HIPBLAS_POINTER_MODE_HOST); - status_2 = func(handle, N, dx_array, incx, batch_count, rocblas_result_host); + if(argus.unit_check) + { + unit_check_general(1, 1, batch_count, cpu_result, hipblas_result_host); + unit_check_general(1, 1, batch_count, cpu_result, hipblas_result_device); } - - if((status_1 == HIPBLAS_STATUS_SUCCESS) && (status_2 == HIPBLAS_STATUS_SUCCESS) - && (status_3 == HIPBLAS_STATUS_SUCCESS)) + if(argus.norm_check) { - /* ===================================================================== - CPU BLAS - =================================================================== */ for(int b = 0; b < batch_count; b++) { - REFBLAS_FUNC(N, hx_array[b], incx, cpu_result + b); - // change to Fortran 1 based indexing as in BLAS standard, not cblas zero based indexing - cpu_result[b] += 1; + hipblas_error_host = std::max(hipblas_error_host, + std::abs(hipblas_result_host[b] - cpu_result[b])); + hipblas_error_device = std::max(hipblas_error_device, + std::abs(hipblas_result_device[b] - cpu_result[b])); } + } + } // end of if unit/norm check - unit_check_general(1, 1, batch_count, cpu_result, h_rocblas_result_device); - unit_check_general(1, 1, batch_count, cpu_result, rocblas_result_host); - - } // end of if unit/norm check - } + if(argus.timing) + { + hipStream_t stream; + CHECK_HIPBLAS_ERROR(hipblasGetStream(handle, &stream)); + CHECK_HIPBLAS_ERROR(hipblasSetPointerMode(handle, HIPBLAS_POINTER_MODE_DEVICE)); - hipblasDestroy(handle); + int runs = argus.cold_iters + argus.iters; + for(int iter = 0; iter < runs; iter++) + { + if(iter == argus.cold_iters) + gpu_time_used = get_time_us_sync(stream); - if(status_1 != HIPBLAS_STATUS_SUCCESS) - { - return status_1; - } - else if(status_2 != HIPBLAS_STATUS_SUCCESS) - { - return status_2; - } - else if(status_3 != HIPBLAS_STATUS_SUCCESS) - { - return status_3; - } - else - { - return HIPBLAS_STATUS_SUCCESS; + CHECK_HIPBLAS_ERROR( + func(handle, N, dx.ptr_on_device(), incx, batch_count, 
d_hipblas_result_device)); + } + gpu_time_used = get_time_us_sync(stream) - gpu_time_used; + + ArgumentModel{}.log_args(std::cout, + argus, + gpu_time_used, + iamax_gflop_count(N), + iamax_gbyte_count(N), + hipblas_error_host, + hipblas_error_device); } + + return HIPBLAS_STATUS_SUCCESS; } template diff --git a/clients/include/testing_iamax_iamin_strided_batched.hpp b/clients/include/testing_iamax_iamin_strided_batched.hpp index 6cc5ad542..b12b136d6 100644 --- a/clients/include/testing_iamax_iamin_strided_batched.hpp +++ b/clients/include/testing_iamax_iamin_strided_batched.hpp @@ -1,5 +1,5 @@ /* ************************************************************************ - * Copyright 2016-2020 Advanced Micro Devices, Inc. + * Copyright 2016-2021 Advanced Micro Devices, Inc. * * ************************************************************************ */ @@ -32,115 +32,124 @@ hipblasStatus_t testing_iamax_iamin_strided_batched(const Arguments& hipblasStride stridex = N * incx * stride_scale; int sizeX = stridex * batch_count; - hipblasStatus_t status_1 = HIPBLAS_STATUS_SUCCESS; - hipblasStatus_t status_2 = HIPBLAS_STATUS_SUCCESS; - hipblasStatus_t status_3 = HIPBLAS_STATUS_SUCCESS; - - hipblasHandle_t handle; - hipblasCreate(&handle); + hipblasLocalHandle handle(argus); // check to prevent undefined memory allocation error if(batch_count == 0) { // quick return success or invalid value device_vector dx(100); - device_vector d_rocblas_result(1); + device_vector d_hipblas_result(1); - status_1 = func(handle, N, dx, incx, stridex, batch_count, d_rocblas_result); + return func(handle, N, dx, incx, stridex, batch_count, d_hipblas_result); } else if(batch_count < 0) { - status_1 = HIPBLAS_STATUS_INVALID_VALUE; + return HIPBLAS_STATUS_INVALID_VALUE; } else if(N < 1 || incx <= 0) { // quick return success device_vector dx(100); - host_vector h_rocblas_result(batch_count); + host_vector h_hipblas_result(batch_count); host_vector h_zeros(batch_count); for(int b = 0; b < batch_count; b++) h_zeros[b] = 0; - status_1 = func(handle, N, dx, incx, stridex, batch_count, h_rocblas_result); - unit_check_general(1, 1, batch_count, h_zeros, h_rocblas_result); + CHECK_HIPBLAS_ERROR(func(handle, N, dx, incx, stridex, batch_count, h_hipblas_result)); + unit_check_general(1, 1, batch_count, h_zeros, h_hipblas_result); + return HIPBLAS_STATUS_SUCCESS; } - else - { - // Naming: dX is in GPU (device) memory. hK is in CPU (host) memory, plz follow this - // practice - host_vector hx(sizeX); - device_vector dx(sizeX); - host_vector cpu_result(batch_count); - host_vector rocblas_result1(batch_count); - host_vector rocblas_result2(batch_count); - device_vector d_rocblas_result(batch_count); - - // Initial Data on CPU - srand(1); - hipblas_init(hx, 1, N, incx, stridex, batch_count); - - // copy data from CPU to device, does not work for incx != 1 - CHECK_HIP_ERROR(hipMemcpy(dx, hx.data(), sizeof(T) * sizeX, hipMemcpyHostToDevice)); + // Naming: dX is in GPU (device) memory. 
hK is in CPU (host) memory, plz follow this + // practice + host_vector hx(sizeX); + device_vector dx(sizeX); + host_vector cpu_result(batch_count); + host_vector hipblas_result_host(batch_count); + host_vector hipblas_result_device(batch_count); + device_vector d_hipblas_result(batch_count); + + // Initial Data on CPU + srand(1); + hipblas_init(hx, 1, N, incx, stridex, batch_count); + + // copy data from CPU to device, does not work for incx != 1 + CHECK_HIP_ERROR(hipMemcpy(dx, hx.data(), sizeof(T) * sizeX, hipMemcpyHostToDevice)); + + double gpu_time_used; + int hipblas_error_host = 0, hipblas_error_device = 0; + /* ===================================================================== + HIP BLAS + =================================================================== */ + // device_pointer + CHECK_HIPBLAS_ERROR(hipblasSetPointerMode(handle, HIPBLAS_POINTER_MODE_DEVICE)); + CHECK_HIPBLAS_ERROR(func(handle, N, dx, incx, stridex, batch_count, d_hipblas_result)); + + CHECK_HIP_ERROR(hipMemcpy( + hipblas_result_device, d_hipblas_result, sizeof(int) * batch_count, hipMemcpyDeviceToHost)); + + // host_pointer + CHECK_HIPBLAS_ERROR(hipblasSetPointerMode(handle, HIPBLAS_POINTER_MODE_HOST)); + CHECK_HIPBLAS_ERROR(func(handle, N, dx, incx, stridex, batch_count, hipblas_result_host)); + + if(argus.unit_check || argus.norm_check) + { /* ===================================================================== - HIP BLAS + CPU BLAS =================================================================== */ - // device_pointer for d_rocblas_result + for(int b = 0; b < batch_count; b++) { - status_3 = hipblasSetPointerMode(handle, HIPBLAS_POINTER_MODE_DEVICE); - - status_1 = func(handle, N, dx, incx, stridex, batch_count, d_rocblas_result); - - CHECK_HIP_ERROR(hipMemcpy(rocblas_result1, - d_rocblas_result, - sizeof(int) * batch_count, - hipMemcpyDeviceToHost)); + REFBLAS_FUNC(N, hx.data() + b * stridex, incx, &(cpu_result[b])); + // change to Fortran 1 based indexing as in BLAS standard, not cblas zero based indexing + cpu_result[b] += 1; } - // host_pointer for rocblas_result2 - if((status_1 == HIPBLAS_STATUS_SUCCESS) && (status_3 == HIPBLAS_STATUS_SUCCESS)) - { - status_3 = hipblasSetPointerMode(handle, HIPBLAS_POINTER_MODE_HOST); - status_2 = func(handle, N, dx, incx, stridex, batch_count, rocblas_result2.data()); + if(argus.unit_check) + { + unit_check_general( + 1, 1, batch_count, cpu_result.data(), hipblas_result_host.data()); + unit_check_general( + 1, 1, batch_count, cpu_result.data(), hipblas_result_device.data()); } - - if((status_1 == HIPBLAS_STATUS_SUCCESS) && (status_2 == HIPBLAS_STATUS_SUCCESS) - && (status_3 == HIPBLAS_STATUS_SUCCESS)) + if(argus.norm_check) { - /* ===================================================================== - CPU BLAS - =================================================================== */ for(int b = 0; b < batch_count; b++) { - REFBLAS_FUNC(N, hx.data() + b * stridex, incx, &(cpu_result[b])); - // change to Fortran 1 based indexing as in BLAS standard, not cblas zero based indexing - cpu_result[b] += 1; + hipblas_error_host = std::max(hipblas_error_host, + std::abs(hipblas_result_host[b] - cpu_result[b])); + hipblas_error_device = std::max(hipblas_error_device, + std::abs(hipblas_result_device[b] - cpu_result[b])); } + } + } // end of if unit/norm check - unit_check_general(1, 1, batch_count, cpu_result.data(), rocblas_result1.data()); - unit_check_general(1, 1, batch_count, cpu_result.data(), rocblas_result2.data()); - - } // end of if unit/norm check - } + 
if(argus.timing) + { + hipStream_t stream; + CHECK_HIPBLAS_ERROR(hipblasGetStream(handle, &stream)); + CHECK_HIPBLAS_ERROR(hipblasSetPointerMode(handle, HIPBLAS_POINTER_MODE_DEVICE)); - hipblasDestroy(handle); + int runs = argus.cold_iters + argus.iters; + for(int iter = 0; iter < runs; iter++) + { + if(iter == argus.cold_iters) + gpu_time_used = get_time_us_sync(stream); - if(status_1 != HIPBLAS_STATUS_SUCCESS) - { - return status_1; - } - else if(status_2 != HIPBLAS_STATUS_SUCCESS) - { - return status_2; - } - else if(status_3 != HIPBLAS_STATUS_SUCCESS) - { - return status_3; - } - else - { - return HIPBLAS_STATUS_SUCCESS; + CHECK_HIPBLAS_ERROR(func(handle, N, dx, incx, stridex, batch_count, d_hipblas_result)); + } + gpu_time_used = get_time_us_sync(stream) - gpu_time_used; + + ArgumentModel{}.log_args(std::cout, + argus, + gpu_time_used, + iamax_gflop_count(N), + iamax_gbyte_count(N), + hipblas_error_host, + hipblas_error_device); } + + return HIPBLAS_STATUS_SUCCESS; } template diff --git a/clients/include/testing_nrm2.hpp b/clients/include/testing_nrm2.hpp index 81b1f53df..4682e594a 100644 --- a/clients/include/testing_nrm2.hpp +++ b/clients/include/testing_nrm2.hpp @@ -1,5 +1,5 @@ /* ************************************************************************ - * Copyright 2016-2020 Advanced Micro Devices, Inc. + * Copyright 2016-2021 Advanced Micro Devices, Inc. * * ************************************************************************ */ @@ -13,105 +13,96 @@ using namespace std; /* ============================================================================================ */ -template +template hipblasStatus_t testing_nrm2(const Arguments& argus) { + using Tr = real_t; bool FORTRAN = argus.fortran; - auto hipblasNrm2Fn = FORTRAN ? hipblasNrm2 : hipblasNrm2; + auto hipblasNrm2Fn = FORTRAN ? hipblasNrm2 : hipblasNrm2; int N = argus.N; int incx = argus.incx; - hipblasStatus_t status_1 = HIPBLAS_STATUS_SUCCESS; - hipblasStatus_t status_2 = HIPBLAS_STATUS_SUCCESS; - hipblasStatus_t status_3 = HIPBLAS_STATUS_SUCCESS; - hipblasStatus_t status_4 = HIPBLAS_STATUS_SUCCESS; - // check to prevent undefined memory allocation error if(N < 0 || incx < 0) { - status_1 = HIPBLAS_STATUS_INVALID_VALUE; - return status_1; + return HIPBLAS_STATUS_INVALID_VALUE; } int sizeX = N * incx; // Naming: dX is in GPU (device) memory. 
hK is in CPU (host) memory, plz follow this practice - vector hx(sizeX); - - T1* dx; - T2* d_rocblas_result; - T2 cpu_result, rocblas_result_1, rocblas_result_2; - - int device_pointer = 1; + host_vector hx(sizeX); - double gpu_time_used, cpu_time_used; - double rocblas_error; + device_vector dx(sizeX); + device_vector d_hipblas_result(1); + Tr cpu_result, hipblas_result_host, hipblas_result_device; - hipblasHandle_t handle; - hipblasCreate(&handle); + double gpu_time_used, hipblas_error_host, hipblas_error_device; - // allocate memory on device - CHECK_HIP_ERROR(hipMalloc(&dx, sizeX * sizeof(T1))); - CHECK_HIP_ERROR(hipMalloc(&d_rocblas_result, sizeof(T2))); + hipblasLocalHandle handle(argus); // Initial Data on CPU srand(1); - hipblas_init(hx, 1, N, incx); + hipblas_init(hx, 1, N, incx); // copy data from CPU to device, does not work for incx != 1 - CHECK_HIP_ERROR(hipMemcpy(dx, hx.data(), sizeof(T1) * N * incx, hipMemcpyHostToDevice)); + CHECK_HIP_ERROR(hipMemcpy(dx, hx.data(), sizeof(T) * N * incx, hipMemcpyHostToDevice)); // hipblasNrm2 accept both dev/host pointer for the scalar + CHECK_HIPBLAS_ERROR(hipblasSetPointerMode(handle, HIPBLAS_POINTER_MODE_DEVICE)); + CHECK_HIPBLAS_ERROR(hipblasNrm2Fn(handle, N, dx, incx, d_hipblas_result)); - status_1 = hipblasSetPointerMode(handle, HIPBLAS_POINTER_MODE_DEVICE); - - status_2 = hipblasNrm2Fn(handle, N, dx, incx, d_rocblas_result); - - status_3 = hipblasSetPointerMode(handle, HIPBLAS_POINTER_MODE_HOST); - - status_4 = hipblasNrm2Fn(handle, N, dx, incx, &rocblas_result_1); - - if((status_1 != HIPBLAS_STATUS_SUCCESS) || (status_2 != HIPBLAS_STATUS_SUCCESS) - || (status_3 != HIPBLAS_STATUS_SUCCESS) || (status_4 != HIPBLAS_STATUS_SUCCESS)) - { - CHECK_HIP_ERROR(hipFree(dx)); - CHECK_HIP_ERROR(hipFree(d_rocblas_result)); - hipblasDestroy(handle); - if(status_1 != HIPBLAS_STATUS_SUCCESS) - return status_1; - if(status_2 != HIPBLAS_STATUS_SUCCESS) - return status_2; - if(status_3 != HIPBLAS_STATUS_SUCCESS) - return status_3; - if(status_4 != HIPBLAS_STATUS_SUCCESS) - return status_4; - } + CHECK_HIPBLAS_ERROR(hipblasSetPointerMode(handle, HIPBLAS_POINTER_MODE_HOST)); + CHECK_HIPBLAS_ERROR(hipblasNrm2Fn(handle, N, dx, incx, &hipblas_result_host)); CHECK_HIP_ERROR( - hipMemcpy(&rocblas_result_2, d_rocblas_result, sizeof(T2), hipMemcpyDeviceToHost)); + hipMemcpy(&hipblas_result_device, d_hipblas_result, sizeof(Tr), hipMemcpyDeviceToHost)); if(argus.unit_check || argus.norm_check) { - /* ===================================================================== CPU BLAS =================================================================== */ - cblas_nrm2(N, hx.data(), incx, &cpu_result); + cblas_nrm2(N, hx.data(), incx, &cpu_result); if(argus.unit_check) { - unit_check_nrm2(cpu_result, rocblas_result_1, N); - unit_check_nrm2(cpu_result, rocblas_result_2, N); + unit_check_nrm2(cpu_result, hipblas_result_host, N); + unit_check_nrm2(cpu_result, hipblas_result_device, N); } + if(argus.norm_check) + { + hipblas_error_host = std::abs((cpu_result - hipblas_result_host) / cpu_result); + hipblas_error_device = std::abs((cpu_result - hipblas_result_device) / cpu_result); + } } // end of if unit/norm check - // BLAS_1_RESULT_PRINT + if(argus.timing) + { + hipStream_t stream; + CHECK_HIPBLAS_ERROR(hipblasGetStream(handle, &stream)); + CHECK_HIPBLAS_ERROR(hipblasSetPointerMode(handle, HIPBLAS_POINTER_MODE_DEVICE)); + + int runs = argus.cold_iters + argus.iters; + for(int iter = 0; iter < runs; iter++) + { + if(iter == argus.cold_iters) + gpu_time_used = 
get_time_us_sync(stream); - CHECK_HIP_ERROR(hipFree(dx)); - CHECK_HIP_ERROR(hipFree(d_rocblas_result)); - hipblasDestroy(handle); + CHECK_HIPBLAS_ERROR(hipblasNrm2Fn(handle, N, dx, incx, d_hipblas_result)); + } + gpu_time_used = get_time_us_sync(stream) - gpu_time_used; + + ArgumentModel{}.log_args(std::cout, + argus, + gpu_time_used, + nrm2_gflop_count(N), + nrm2_gbyte_count(N), + hipblas_error_host, + hipblas_error_device); + } return HIPBLAS_STATUS_SUCCESS; } diff --git a/clients/include/testing_nrm2_batched.hpp b/clients/include/testing_nrm2_batched.hpp index e8fa90573..834fb201c 100644 --- a/clients/include/testing_nrm2_batched.hpp +++ b/clients/include/testing_nrm2_batched.hpp @@ -1,5 +1,5 @@ /* ************************************************************************ - * Copyright 2016-2020 Advanced Micro Devices, Inc. + * Copyright 2016-2021 Advanced Micro Devices, Inc. * * ************************************************************************ */ @@ -13,22 +13,18 @@ using namespace std; /* ============================================================================================ */ -template +template hipblasStatus_t testing_nrm2_batched(const Arguments& argus) { + using Tr = real_t; bool FORTRAN = argus.fortran; auto hipblasNrm2BatchedFn - = FORTRAN ? hipblasNrm2Batched : hipblasNrm2Batched; + = FORTRAN ? hipblasNrm2Batched : hipblasNrm2Batched; int N = argus.N; int incx = argus.incx; int batch_count = argus.batch_count; - hipblasStatus_t status_1 = HIPBLAS_STATUS_SUCCESS; - hipblasStatus_t status_2 = HIPBLAS_STATUS_SUCCESS; - hipblasStatus_t status_3 = HIPBLAS_STATUS_SUCCESS; - hipblasStatus_t status_4 = HIPBLAS_STATUS_SUCCESS; - // check to prevent undefined memory allocation error if(N < 0 || incx < 0 || batch_count < 0) { @@ -41,68 +37,39 @@ hipblasStatus_t testing_nrm2_batched(const Arguments& argus) int sizeX = N * incx; - double gpu_time_used, cpu_time_used; - double rocblas_error; + double gpu_time_used; + double hipblas_error_host = 0, hipblas_error_device = 0; hipblasHandle_t handle; hipblasCreate(&handle); // Naming: dX is in GPU (device) memory. 
hK is in CPU (host) memory, plz follow this practice - host_vector hx_array[batch_count]; - host_vector h_cpu_result(batch_count); - host_vector h_rocblas_result_1(batch_count); - host_vector h_rocblas_result_2(batch_count); - - device_batch_vector bx_array(batch_count, sizeX); - - device_vector dx_array(batch_count); - device_vector d_rocblas_result(batch_count); + host_batch_vector hx(N, incx, batch_count); + host_vector h_cpu_result(batch_count); + host_vector h_hipblas_result_host(batch_count); + host_vector h_hipblas_result_device(batch_count); - int last = batch_count - 1; - if(!dx_array || !d_rocblas_result || (!bx_array[last] && sizeX)) - { - hipblasDestroy(handle); - return HIPBLAS_STATUS_ALLOC_FAILED; - } + device_batch_vector dx(N, incx, batch_count); + device_vector d_hipblas_result(batch_count); + CHECK_HIP_ERROR(dx.memcheck()); // Initial Data on CPU - srand(1); - for(int b = 0; b < batch_count; b++) - { - hx_array[b] = host_vector(sizeX); - - srand(1); - hipblas_init(hx_array[b], 1, N, incx); - - CHECK_HIP_ERROR( - hipMemcpy(bx_array[b], hx_array[b], sizeof(T1) * sizeX, hipMemcpyHostToDevice)); - } - CHECK_HIP_ERROR( - hipMemcpy(dx_array, bx_array, sizeof(T1*) * batch_count, hipMemcpyHostToDevice)); + hipblas_init(hx, true); + CHECK_HIP_ERROR(dx.transfer_from(hx)); // hipblasNrm2 accept both dev/host pointer for the scalar - status_1 = hipblasSetPointerMode(handle, HIPBLAS_POINTER_MODE_DEVICE); - status_2 = hipblasNrm2BatchedFn(handle, N, dx_array, incx, batch_count, d_rocblas_result); + CHECK_HIPBLAS_ERROR(hipblasSetPointerMode(handle, HIPBLAS_POINTER_MODE_DEVICE)); + CHECK_HIPBLAS_ERROR( + hipblasNrm2BatchedFn(handle, N, dx.ptr_on_device(), incx, batch_count, d_hipblas_result)); - status_3 = hipblasSetPointerMode(handle, HIPBLAS_POINTER_MODE_HOST); - status_4 = hipblasNrm2BatchedFn(handle, N, dx_array, incx, batch_count, h_rocblas_result_1); + CHECK_HIPBLAS_ERROR(hipblasSetPointerMode(handle, HIPBLAS_POINTER_MODE_HOST)); + CHECK_HIPBLAS_ERROR(hipblasNrm2BatchedFn( + handle, N, dx.ptr_on_device(), incx, batch_count, h_hipblas_result_host)); - if((status_1 != HIPBLAS_STATUS_SUCCESS) || (status_2 != HIPBLAS_STATUS_SUCCESS) - || (status_3 != HIPBLAS_STATUS_SUCCESS) || (status_4 != HIPBLAS_STATUS_SUCCESS)) - { - hipblasDestroy(handle); - if(status_1 != HIPBLAS_STATUS_SUCCESS) - return status_1; - if(status_2 != HIPBLAS_STATUS_SUCCESS) - return status_2; - if(status_3 != HIPBLAS_STATUS_SUCCESS) - return status_3; - if(status_4 != HIPBLAS_STATUS_SUCCESS) - return status_4; - } - - CHECK_HIP_ERROR(hipMemcpy( - h_rocblas_result_2, d_rocblas_result, sizeof(T2) * batch_count, hipMemcpyDeviceToHost)); + CHECK_HIP_ERROR(hipMemcpy(h_hipblas_result_device, + d_hipblas_result, + sizeof(Tr) * batch_count, + hipMemcpyDeviceToHost)); if(argus.unit_check || argus.norm_check) { @@ -111,19 +78,53 @@ hipblasStatus_t testing_nrm2_batched(const Arguments& argus) =================================================================== */ for(int b = 0; b < batch_count; b++) { - cblas_nrm2(N, hx_array[b], incx, &(h_cpu_result[b])); + cblas_nrm2(N, hx[b], incx, &(h_cpu_result[b])); } if(argus.unit_check) { - unit_check_nrm2(batch_count, h_cpu_result, h_rocblas_result_1, N); - unit_check_nrm2(batch_count, h_cpu_result, h_rocblas_result_2, N); + unit_check_nrm2(batch_count, h_cpu_result, h_hipblas_result_host, N); + unit_check_nrm2(batch_count, h_cpu_result, h_hipblas_result_device, N); + } + if(argus.norm_check) + { + for(int b = 0; b < batch_count; b++) + { + hipblas_error_host = std::max( + 
Tr(hipblas_error_host), + std::abs((h_cpu_result[b] - h_hipblas_result_host[b]) / h_cpu_result[b])); + hipblas_error_device = std::max( + Tr(hipblas_error_device), + std::abs((h_cpu_result[b] - h_hipblas_result_device[b]) / h_cpu_result[b])); + } } } // end of if unit/norm check - // BLAS_1_RESULT_PRINT + if(argus.timing) + { + hipStream_t stream; + CHECK_HIPBLAS_ERROR(hipblasGetStream(handle, &stream)); + CHECK_HIPBLAS_ERROR(hipblasSetPointerMode(handle, HIPBLAS_POINTER_MODE_DEVICE)); - hipblasDestroy(handle); + int runs = argus.cold_iters + argus.iters; + for(int iter = 0; iter < runs; iter++) + { + if(iter == argus.cold_iters) + gpu_time_used = get_time_us_sync(stream); + + CHECK_HIPBLAS_ERROR(hipblasNrm2BatchedFn( + handle, N, dx.ptr_on_device(), incx, batch_count, d_hipblas_result)); + } + gpu_time_used = get_time_us_sync(stream) - gpu_time_used; + + ArgumentModel{}.log_args(std::cout, + argus, + gpu_time_used, + nrm2_gflop_count(N), + nrm2_gbyte_count(N), + hipblas_error_host, + hipblas_error_device); + } return HIPBLAS_STATUS_SUCCESS; } diff --git a/clients/include/testing_nrm2_strided_batched.hpp b/clients/include/testing_nrm2_strided_batched.hpp index e8b0969d1..270d793be 100644 --- a/clients/include/testing_nrm2_strided_batched.hpp +++ b/clients/include/testing_nrm2_strided_batched.hpp @@ -1,5 +1,5 @@ /* ************************************************************************ - * Copyright 2016-2020 Advanced Micro Devices, Inc. + * Copyright 2016-2021 Advanced Micro Devices, Inc. * * ************************************************************************ */ @@ -13,12 +13,13 @@ using namespace std; /* ============================================================================================ */ -template +template hipblasStatus_t testing_nrm2_strided_batched(const Arguments& argus) { + using Tr = real_t; bool FORTRAN = argus.fortran; - auto hipblasNrm2StridedBatchedFn = FORTRAN ? hipblasNrm2StridedBatched - : hipblasNrm2StridedBatched; + auto hipblasNrm2StridedBatchedFn = FORTRAN ? hipblasNrm2StridedBatched + : hipblasNrm2StridedBatched; int N = argus.N; int incx = argus.incx; @@ -28,11 +29,6 @@ hipblasStatus_t testing_nrm2_strided_batched(const Arguments& argus) hipblasStride stridex = N * incx * stride_scale; int sizeX = stridex * batch_count; - hipblasStatus_t status_1 = HIPBLAS_STATUS_SUCCESS; - hipblasStatus_t status_2 = HIPBLAS_STATUS_SUCCESS; - hipblasStatus_t status_3 = HIPBLAS_STATUS_SUCCESS; - hipblasStatus_t status_4 = HIPBLAS_STATUS_SUCCESS; - // check to prevent undefined memory allocation error if(N < 0 || incx < 0 || batch_count < 0) { @@ -44,53 +40,39 @@ hipblasStatus_t testing_nrm2_strided_batched(const Arguments& argus) } // Naming: dX is in GPU (device) memory. 
hK is in CPU (host) memory, plz follow this practice - host_vector hx(sizeX); - host_vector h_rocblas_result_1(batch_count); - host_vector h_rocblas_result_2(batch_count); - host_vector h_cpu_result(batch_count); + host_vector hx(sizeX); + host_vector h_hipblas_result_host(batch_count); + host_vector h_hipblas_result_device(batch_count); + host_vector h_cpu_result(batch_count); - device_vector dx(sizeX); - device_vector d_rocblas_result(batch_count); + device_vector dx(sizeX); + device_vector d_hipblas_result(batch_count); - double gpu_time_used, cpu_time_used; - double rocblas_error; + double gpu_time_used; + double hipblas_error_host = 0, hipblas_error_device = 0; - hipblasHandle_t handle; - hipblasCreate(&handle); + hipblasLocalHandle handle(argus); // Initial Data on CPU srand(1); - hipblas_init(hx, 1, N, incx, stridex, batch_count); + hipblas_init(hx, 1, N, incx, stridex, batch_count); // copy data from CPU to device, does not work for incx != 1 - CHECK_HIP_ERROR(hipMemcpy(dx, hx.data(), sizeof(T1) * sizeX, hipMemcpyHostToDevice)); + CHECK_HIP_ERROR(hipMemcpy(dx, hx.data(), sizeof(T) * sizeX, hipMemcpyHostToDevice)); // hipblasNrm2 accept both dev/host pointer for the scalar + CHECK_HIPBLAS_ERROR(hipblasSetPointerMode(handle, HIPBLAS_POINTER_MODE_DEVICE)); + CHECK_HIPBLAS_ERROR( + hipblasNrm2StridedBatchedFn(handle, N, dx, incx, stridex, batch_count, d_hipblas_result)); - status_1 = hipblasSetPointerMode(handle, HIPBLAS_POINTER_MODE_DEVICE); - status_2 - = hipblasNrm2StridedBatchedFn(handle, N, dx, incx, stridex, batch_count, d_rocblas_result); - - status_3 = hipblasSetPointerMode(handle, HIPBLAS_POINTER_MODE_HOST); - status_4 = hipblasNrm2StridedBatchedFn( - handle, N, dx, incx, stridex, batch_count, h_rocblas_result_1); + CHECK_HIPBLAS_ERROR(hipblasSetPointerMode(handle, HIPBLAS_POINTER_MODE_HOST)); + CHECK_HIPBLAS_ERROR(hipblasNrm2StridedBatchedFn( + handle, N, dx, incx, stridex, batch_count, h_hipblas_result_host)); - if((status_1 != HIPBLAS_STATUS_SUCCESS) || (status_2 != HIPBLAS_STATUS_SUCCESS) - || (status_3 != HIPBLAS_STATUS_SUCCESS) || (status_4 != HIPBLAS_STATUS_SUCCESS)) - { - hipblasDestroy(handle); - if(status_1 != HIPBLAS_STATUS_SUCCESS) - return status_1; - if(status_2 != HIPBLAS_STATUS_SUCCESS) - return status_2; - if(status_3 != HIPBLAS_STATUS_SUCCESS) - return status_3; - if(status_4 != HIPBLAS_STATUS_SUCCESS) - return status_4; - } - - CHECK_HIP_ERROR(hipMemcpy( - h_rocblas_result_2, d_rocblas_result, sizeof(T2) * batch_count, hipMemcpyDeviceToHost)); + CHECK_HIP_ERROR(hipMemcpy(h_hipblas_result_device, + d_hipblas_result, + sizeof(Tr) * batch_count, + hipMemcpyDeviceToHost)); if(argus.unit_check || argus.norm_check) { @@ -99,19 +81,52 @@ hipblasStatus_t testing_nrm2_strided_batched(const Arguments& argus) =================================================================== */ for(int b = 0; b < batch_count; b++) { - cblas_nrm2(N, hx.data() + b * stridex, incx, &(h_cpu_result[b])); + cblas_nrm2(N, hx.data() + b * stridex, incx, &(h_cpu_result[b])); } if(argus.unit_check) { - unit_check_nrm2(batch_count, h_cpu_result, h_rocblas_result_1, N); - unit_check_nrm2(batch_count, h_cpu_result, h_rocblas_result_2, N); + unit_check_nrm2(batch_count, h_cpu_result, h_hipblas_result_host, N); + unit_check_nrm2(batch_count, h_cpu_result, h_hipblas_result_device, N); + } + if(argus.norm_check) + { + for(int b = 0; b < batch_count; b++) + { + hipblas_error_host = std::max( + Tr(hipblas_error_host), + std::abs((h_cpu_result[b] - h_hipblas_result_host[b]) / h_cpu_result[b])); + 
hipblas_error_device = std::max( + Tr(hipblas_error_device), + std::abs((h_cpu_result[b] - h_hipblas_result_device[b]) / h_cpu_result[b])); + } } - } // end of if unit/norm check - // BLAS_1_RESULT_PRINT + if(argus.timing) + { + hipStream_t stream; + CHECK_HIPBLAS_ERROR(hipblasGetStream(handle, &stream)); + CHECK_HIPBLAS_ERROR(hipblasSetPointerMode(handle, HIPBLAS_POINTER_MODE_DEVICE)); + + int runs = argus.cold_iters + argus.iters; + for(int iter = 0; iter < runs; iter++) + { + if(iter == argus.cold_iters) + gpu_time_used = get_time_us_sync(stream); - hipblasDestroy(handle); + CHECK_HIPBLAS_ERROR(hipblasNrm2StridedBatchedFn( + handle, N, dx, incx, stridex, batch_count, d_hipblas_result)); + } + gpu_time_used = get_time_us_sync(stream) - gpu_time_used; + + ArgumentModel{}.log_args(std::cout, + argus, + gpu_time_used, + nrm2_gflop_count(N), + nrm2_gbyte_count(N), + hipblas_error_host, + hipblas_error_device); + } return HIPBLAS_STATUS_SUCCESS; } From e0adcf622ba9a7ce5555db4629c1e938c72242d1 Mon Sep 17 00:00:00 2001 From: Daine McNiven <51674140+daineAMD@users.noreply.github.com> Date: Mon, 15 Mar 2021 11:04:38 -0600 Subject: [PATCH 3/8] Cleanup for asum, axpy, dot, scal, swap tests. (#318) --- clients/gtest/auxiliary_gtest.cpp | 1 - clients/include/testing_asum.hpp | 55 ++------ clients/include/testing_asum_batched.hpp | 58 ++------- .../include/testing_asum_strided_batched.hpp | 57 ++------- clients/include/testing_axpy.hpp | 52 ++------ clients/include/testing_axpy_batched.hpp | 84 ++++-------- .../include/testing_axpy_strided_batched.hpp | 58 ++------- clients/include/testing_copy.hpp | 50 ++------ clients/include/testing_copy_batched.hpp | 101 ++++----------- .../include/testing_copy_strided_batched.hpp | 36 ++---- clients/include/testing_dot.hpp | 73 ++--------- clients/include/testing_dot_batched.hpp | 95 +++++--------- .../include/testing_dot_strided_batched.hpp | 57 ++------- clients/include/testing_scal.hpp | 54 +++----- clients/include/testing_scal_batched.hpp | 40 ++---- .../include/testing_scal_strided_batched.hpp | 34 ++--- clients/include/testing_swap.hpp | 69 ++++------ clients/include/testing_swap_batched.hpp | 121 ++++++------------ .../include/testing_swap_strided_batched.hpp | 37 ++---- 19 files changed, 287 insertions(+), 845 deletions(-) diff --git a/clients/gtest/auxiliary_gtest.cpp b/clients/gtest/auxiliary_gtest.cpp index 399cb1714..e989596b0 100644 --- a/clients/gtest/auxiliary_gtest.cpp +++ b/clients/gtest/auxiliary_gtest.cpp @@ -2,7 +2,6 @@ * Copyright 2016-2021 Advanced Micro Devices, Inc. 
* * ************************************************************************ */ -#pragma once #include "utility.h" #include diff --git a/clients/include/testing_asum.hpp b/clients/include/testing_asum.hpp index 41c74312a..63c3f5c42 100644 --- a/clients/include/testing_asum.hpp +++ b/clients/include/testing_asum.hpp @@ -23,16 +23,10 @@ hipblasStatus_t testing_asum(const Arguments& argus) int N = argus.N; int incx = argus.incx; - hipblasStatus_t status_1 = HIPBLAS_STATUS_SUCCESS; - hipblasStatus_t status_2 = HIPBLAS_STATUS_SUCCESS; - hipblasStatus_t status_3 = HIPBLAS_STATUS_SUCCESS; - hipblasStatus_t status_4 = HIPBLAS_STATUS_SUCCESS; - // check to prevent undefined memory allocation error if(N < 0 || incx < 0) { - status_1 = HIPBLAS_STATUS_INVALID_VALUE; - return status_1; + return HIPBLAS_STATUS_INVALID_VALUE; } int sizeX = N * incx; @@ -46,8 +40,7 @@ hipblasStatus_t testing_asum(const Arguments& argus) double gpu_time_used, hipblas_error_host = 0, hipblas_error_device = 0; - hipblasHandle_t handle; - hipblasCreate(&handle); + hipblasLocalHandle handle(argus); // Initial Data on CPU srand(1); @@ -60,25 +53,11 @@ hipblasStatus_t testing_asum(const Arguments& argus) HIPBLAS =================================================================== */ // hipblasAsum accept both dev/host pointer for the scalar - status_1 = hipblasSetPointerMode(handle, HIPBLAS_POINTER_MODE_DEVICE); - status_2 = hipblasAsumFn(handle, N, dx, incx, d_hipblas_result); - - status_3 = hipblasSetPointerMode(handle, HIPBLAS_POINTER_MODE_HOST); - status_4 = hipblasAsumFn(handle, N, dx, incx, &hipblas_result_host); + CHECK_HIPBLAS_ERROR(hipblasSetPointerMode(handle, HIPBLAS_POINTER_MODE_DEVICE)); + CHECK_HIPBLAS_ERROR(hipblasAsumFn(handle, N, dx, incx, d_hipblas_result)); - if((status_1 != HIPBLAS_STATUS_SUCCESS) || (status_2 != HIPBLAS_STATUS_SUCCESS) - || (status_3 != HIPBLAS_STATUS_SUCCESS) || (status_4 != HIPBLAS_STATUS_SUCCESS)) - { - hipblasDestroy(handle); - if(status_1 != HIPBLAS_STATUS_SUCCESS) - return status_1; - if(status_2 != HIPBLAS_STATUS_SUCCESS) - return status_2; - if(status_3 != HIPBLAS_STATUS_SUCCESS) - return status_3; - if(status_4 != HIPBLAS_STATUS_SUCCESS) - return status_4; - } + CHECK_HIPBLAS_ERROR(hipblasSetPointerMode(handle, HIPBLAS_POINTER_MODE_HOST)); + CHECK_HIPBLAS_ERROR(hipblasAsumFn(handle, N, dx, incx, &hipblas_result_host)); CHECK_HIP_ERROR( hipMemcpy(&hipblas_result_device, d_hipblas_result, sizeof(Tr), hipMemcpyDeviceToHost)); @@ -109,17 +88,8 @@ hipblasStatus_t testing_asum(const Arguments& argus) if(argus.timing) { hipStream_t stream; - status_1 = hipblasGetStream(handle, &stream); - status_2 = hipblasSetPointerMode(handle, HIPBLAS_POINTER_MODE_DEVICE); - - if((status_1 != HIPBLAS_STATUS_SUCCESS) || (status_2 != HIPBLAS_STATUS_SUCCESS)) - { - hipblasDestroy(handle); - if(status_1 != HIPBLAS_STATUS_SUCCESS) - return status_1; - if(status_2 != HIPBLAS_STATUS_SUCCESS) - return status_2; - } + CHECK_HIPBLAS_ERROR(hipblasGetStream(handle, &stream)); + CHECK_HIPBLAS_ERROR(hipblasSetPointerMode(handle, HIPBLAS_POINTER_MODE_DEVICE)); int runs = argus.cold_iters + argus.iters; for(int iter = 0; iter < runs; iter++) @@ -127,13 +97,7 @@ hipblasStatus_t testing_asum(const Arguments& argus) if(iter == argus.cold_iters) gpu_time_used = get_time_us_sync(stream); - status_1 = hipblasAsumFn(handle, N, dx, incx, d_hipblas_result); - - if(status_1 != HIPBLAS_STATUS_SUCCESS) - { - hipblasDestroy(handle); - return status_1; - } + CHECK_HIPBLAS_ERROR(hipblasAsumFn(handle, N, dx, incx, d_hipblas_result)); } 
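
// Editor's note: the conversions in this patch series lean on two client-side
// helpers, hipblasLocalHandle and CHECK_HIPBLAS_ERROR. The snippet below is only
// a minimal sketch of how such helpers are commonly written, not the definitions
// from the hipBLAS client utility headers (the real wrapper is constructed from
// the test's Arguments and the real macro may throw or use gtest assertions).
// Assumed to exist with their usual signatures: hipblasCreate, hipblasDestroy,
// hipblasStatusToString. All other names here are illustrative only.

#include <hipblas.h>
#include <cstdio>
#include <cstdlib>

// RAII wrapper: owns a hipblasHandle_t, so early returns (e.g. from an error
// macro) no longer need an explicit hipblasDestroy on every exit path.
class local_handle_sketch
{
    hipblasHandle_t m_handle{};

public:
    local_handle_sketch() { hipblasCreate(&m_handle); }
    ~local_handle_sketch() { hipblasDestroy(m_handle); }
    local_handle_sketch(const local_handle_sketch&) = delete;
    local_handle_sketch& operator=(const local_handle_sketch&) = delete;

    // Implicit conversion lets the wrapper be passed wherever a raw
    // hipblasHandle_t is expected, as in the refactored tests above.
    operator hipblasHandle_t() const { return m_handle; }
};

// Error macro sketch: evaluate the call once; on failure, report the status by
// name and stop the test instead of hand-rolling status_1..status_4 checks.
#define CHECK_HIPBLAS_ERROR_SKETCH(call)                          \
    do                                                            \
    {                                                             \
        hipblasStatus_t status_ = (call);                         \
        if(status_ != HIPBLAS_STATUS_SUCCESS)                     \
        {                                                         \
            fprintf(stderr,                                       \
                    "hipBLAS error %s at %s:%d\n",                \
                    hipblasStatusToString(status_),               \
                    __FILE__,                                     \
                    __LINE__);                                    \
            exit(EXIT_FAILURE);                                   \
        }                                                         \
    } while(0)
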
gpu_time_used = get_time_us_sync(stream) - gpu_time_used; @@ -146,6 +110,5 @@ hipblasStatus_t testing_asum(const Arguments& argus) hipblas_error_device); } - hipblasDestroy(handle); return HIPBLAS_STATUS_SUCCESS; } diff --git a/clients/include/testing_asum_batched.hpp b/clients/include/testing_asum_batched.hpp index db2f34fcf..ee3e7acd0 100644 --- a/clients/include/testing_asum_batched.hpp +++ b/clients/include/testing_asum_batched.hpp @@ -25,11 +25,6 @@ hipblasStatus_t testing_asum_batched(const Arguments& argus) int incx = argus.incx; int batch_count = argus.batch_count; - hipblasStatus_t status_1 = HIPBLAS_STATUS_SUCCESS; - hipblasStatus_t status_2 = HIPBLAS_STATUS_SUCCESS; - hipblasStatus_t status_3 = HIPBLAS_STATUS_SUCCESS; - hipblasStatus_t status_4 = HIPBLAS_STATUS_SUCCESS; - // check to prevent undefined memory allocation error if(N < 0 || incx < 0 || batch_count < 0) { @@ -44,8 +39,7 @@ hipblasStatus_t testing_asum_batched(const Arguments& argus) double gpu_time_used, hipblas_error_host, hipblas_error_device; - hipblasHandle_t handle; - hipblasCreate(&handle); + hipblasLocalHandle handle(argus); // Naming: dX is in GPU (device) memory. hK is in CPU (host) memory, plz follow this practice host_batch_vector hx(N, incx, batch_count); @@ -65,28 +59,13 @@ hipblasStatus_t testing_asum_batched(const Arguments& argus) HIPBLAS =================================================================== */ // hipblasAsum accept both dev/host pointer for the scalar - status_1 = hipblasSetPointerMode(handle, HIPBLAS_POINTER_MODE_DEVICE); - status_2 - = hipblasAsumBatchedFn(handle, N, dx.ptr_on_device(), incx, batch_count, d_hipblas_result); - - status_3 = hipblasSetPointerMode(handle, HIPBLAS_POINTER_MODE_HOST); - status_4 = hipblasAsumBatchedFn( - handle, N, dx.ptr_on_device(), incx, batch_count, h_hipblas_result_host); + CHECK_HIPBLAS_ERROR(hipblasSetPointerMode(handle, HIPBLAS_POINTER_MODE_DEVICE)); + CHECK_HIPBLAS_ERROR( + hipblasAsumBatchedFn(handle, N, dx.ptr_on_device(), incx, batch_count, d_hipblas_result)); - if((status_1 != HIPBLAS_STATUS_SUCCESS) - || (status_2 != HIPBLAS_STATUS_SUCCESS || (status_3 != HIPBLAS_STATUS_SUCCESS) - || (status_4 != HIPBLAS_STATUS_SUCCESS))) - { - hipblasDestroy(handle); - if(status_1 != HIPBLAS_STATUS_SUCCESS) - return status_1; - if(status_2 != HIPBLAS_STATUS_SUCCESS) - return status_2; - if(status_3 != HIPBLAS_STATUS_SUCCESS) - return status_3; - if(status_4 != HIPBLAS_STATUS_SUCCESS) - return status_4; - } + CHECK_HIPBLAS_ERROR(hipblasSetPointerMode(handle, HIPBLAS_POINTER_MODE_HOST)); + CHECK_HIPBLAS_ERROR(hipblasAsumBatchedFn( + handle, N, dx.ptr_on_device(), incx, batch_count, h_hipblas_result_host)); CHECK_HIP_ERROR(hipMemcpy(h_hipblas_result_device, d_hipblas_result, @@ -121,16 +100,8 @@ hipblasStatus_t testing_asum_batched(const Arguments& argus) if(argus.timing) { hipStream_t stream; - status_1 = hipblasGetStream(handle, &stream); - status_2 = hipblasSetPointerMode(handle, HIPBLAS_POINTER_MODE_DEVICE); - if(status_1 != HIPBLAS_STATUS_SUCCESS || status_2 != HIPBLAS_STATUS_SUCCESS) - { - hipblasDestroy(handle); - if(status_1 != HIPBLAS_STATUS_SUCCESS) - return status_1; - if(status_2 != HIPBLAS_STATUS_SUCCESS) - return status_2; - } + CHECK_HIPBLAS_ERROR(hipblasGetStream(handle, &stream)); + CHECK_HIPBLAS_ERROR(hipblasSetPointerMode(handle, HIPBLAS_POINTER_MODE_DEVICE)); int runs = argus.cold_iters + argus.iters; for(int iter = 0; iter < runs; iter++) @@ -138,14 +109,8 @@ hipblasStatus_t testing_asum_batched(const Arguments& argus) if(iter == 
argus.cold_iters) gpu_time_used = get_time_us_sync(stream); - status_1 = hipblasAsumBatchedFn( - handle, N, dx.ptr_on_device(), incx, batch_count, d_hipblas_result); - - if(status_1 != HIPBLAS_STATUS_SUCCESS) - { - hipblasDestroy(handle); - return status_1; - } + CHECK_HIPBLAS_ERROR(hipblasAsumBatchedFn( + handle, N, dx.ptr_on_device(), incx, batch_count, d_hipblas_result)); } gpu_time_used = get_time_us_sync(stream) - gpu_time_used; @@ -158,6 +123,5 @@ hipblasStatus_t testing_asum_batched(const Arguments& argus) hipblas_error_device); } - hipblasDestroy(handle); return HIPBLAS_STATUS_SUCCESS; } diff --git a/clients/include/testing_asum_strided_batched.hpp b/clients/include/testing_asum_strided_batched.hpp index da2d991ed..d32fdca1a 100644 --- a/clients/include/testing_asum_strided_batched.hpp +++ b/clients/include/testing_asum_strided_batched.hpp @@ -29,11 +29,6 @@ hipblasStatus_t testing_asum_strided_batched(const Arguments& argus) hipblasStride stridex = N * incx * stride_scale; int sizeX = stridex * batch_count; - hipblasStatus_t status_1 = HIPBLAS_STATUS_SUCCESS; - hipblasStatus_t status_2 = HIPBLAS_STATUS_SUCCESS; - hipblasStatus_t status_3 = HIPBLAS_STATUS_SUCCESS; - hipblasStatus_t status_4 = HIPBLAS_STATUS_SUCCESS; - double gpu_time_used, hipblas_error_host, hipblas_error_device; // check to prevent undefined memory allocation error @@ -47,8 +42,7 @@ hipblasStatus_t testing_asum_strided_batched(const Arguments& argus) return HIPBLAS_STATUS_SUCCESS; } - hipblasHandle_t handle; - hipblasCreate(&handle); + hipblasLocalHandle handle(argus); // Naming: dX is in GPU (device) memory. hK is in CPU (host) memory, plz follow this practice host_vector hx(sizeX); @@ -70,27 +64,13 @@ hipblasStatus_t testing_asum_strided_batched(const Arguments& argus) HIPBLAS =================================================================== */ // hipblasAsum accept both dev/host pointer for the scalar - status_1 = hipblasSetPointerMode(handle, HIPBLAS_POINTER_MODE_DEVICE); - status_2 - = hipblasAsumStridedBatchedFn(handle, N, dx, incx, stridex, batch_count, d_hipblas_result); - - status_3 = hipblasSetPointerMode(handle, HIPBLAS_POINTER_MODE_HOST); - status_4 = hipblasAsumStridedBatchedFn( - handle, N, dx, incx, stridex, batch_count, hipblas_result_host); + CHECK_HIPBLAS_ERROR(hipblasSetPointerMode(handle, HIPBLAS_POINTER_MODE_DEVICE)); + CHECK_HIPBLAS_ERROR( + hipblasAsumStridedBatchedFn(handle, N, dx, incx, stridex, batch_count, d_hipblas_result)); - if((status_1 != HIPBLAS_STATUS_SUCCESS) || (status_2 != HIPBLAS_STATUS_SUCCESS) - || (status_3 != HIPBLAS_STATUS_SUCCESS) || (status_4 != HIPBLAS_STATUS_SUCCESS)) - { - hipblasDestroy(handle); - if(status_1 != HIPBLAS_STATUS_SUCCESS) - return status_1; - if(status_2 != HIPBLAS_STATUS_SUCCESS) - return status_2; - if(status_3 != HIPBLAS_STATUS_SUCCESS) - return status_3; - if(status_4 != HIPBLAS_STATUS_SUCCESS) - return status_4; - } + CHECK_HIPBLAS_ERROR(hipblasSetPointerMode(handle, HIPBLAS_POINTER_MODE_HOST)); + CHECK_HIPBLAS_ERROR(hipblasAsumStridedBatchedFn( + handle, N, dx, incx, stridex, batch_count, hipblas_result_host)); CHECK_HIP_ERROR(hipMemcpy( hipblas_result_device, d_hipblas_result, sizeof(Tr) * batch_count, hipMemcpyDeviceToHost)); @@ -123,16 +103,8 @@ hipblasStatus_t testing_asum_strided_batched(const Arguments& argus) if(argus.timing) { hipStream_t stream; - status_1 = hipblasGetStream(handle, &stream); - status_2 = hipblasSetPointerMode(handle, HIPBLAS_POINTER_MODE_DEVICE); - if(status_1 != HIPBLAS_STATUS_SUCCESS || status_2 != 
HIPBLAS_STATUS_SUCCESS) - { - hipblasDestroy(handle); - if(status_1 != HIPBLAS_STATUS_SUCCESS) - return status_1; - if(status_2 != HIPBLAS_STATUS_SUCCESS) - return status_2; - } + CHECK_HIPBLAS_ERROR(hipblasGetStream(handle, &stream)); + CHECK_HIPBLAS_ERROR(hipblasSetPointerMode(handle, HIPBLAS_POINTER_MODE_DEVICE)); int runs = argus.cold_iters + argus.iters; for(int iter = 0; iter < runs; iter++) @@ -140,14 +112,8 @@ hipblasStatus_t testing_asum_strided_batched(const Arguments& argus) if(iter == argus.cold_iters) gpu_time_used = get_time_us_sync(stream); - status_1 = hipblasAsumStridedBatchedFn( - handle, N, dx, incx, stridex, batch_count, d_hipblas_result); - - if(status_1 != HIPBLAS_STATUS_SUCCESS) - { - hipblasDestroy(handle); - return status_1; - } + CHECK_HIPBLAS_ERROR(hipblasAsumStridedBatchedFn( + handle, N, dx, incx, stridex, batch_count, d_hipblas_result)); } gpu_time_used = get_time_us_sync(stream) - gpu_time_used; @@ -160,6 +126,5 @@ hipblasStatus_t testing_asum_strided_batched(const Arguments& argus) hipblas_error_device); } - hipblasDestroy(handle); return HIPBLAS_STATUS_SUCCESS; } diff --git a/clients/include/testing_axpy.hpp b/clients/include/testing_axpy.hpp index cdaabc30d..86892455b 100644 --- a/clients/include/testing_axpy.hpp +++ b/clients/include/testing_axpy.hpp @@ -23,11 +23,6 @@ hipblasStatus_t testing_axpy(const Arguments& argus) int incx = argus.incx; int incy = argus.incy; - hipblasStatus_t status_1 = HIPBLAS_STATUS_SUCCESS; - hipblasStatus_t status_2 = HIPBLAS_STATUS_SUCCESS; - hipblasStatus_t status_3 = HIPBLAS_STATUS_SUCCESS; - hipblasStatus_t status_4 = HIPBLAS_STATUS_SUCCESS; - int abs_incx = incx < 0 ? -incx : incx; int abs_incy = incy < 0 ? -incy : incy; @@ -56,8 +51,7 @@ hipblasStatus_t testing_axpy(const Arguments& argus) double gpu_time_used, hipblas_error_host, hipblas_error_device; - hipblasHandle_t handle; - hipblasCreate(&handle); + hipblasLocalHandle handle(argus); // Initial Data on CPU srand(1); @@ -78,25 +72,11 @@ hipblasStatus_t testing_axpy(const Arguments& argus) /* ===================================================================== HIPBLAS =================================================================== */ - status_1 = hipblasSetPointerMode(handle, HIPBLAS_POINTER_MODE_DEVICE); - status_2 = hipblasAxpyFn(handle, N, d_alpha, dx, incx, dy_device, incy); - - status_3 = hipblasSetPointerMode(handle, HIPBLAS_POINTER_MODE_HOST); - status_4 = hipblasAxpyFn(handle, N, &alpha, dx, incx, dy_host, incy); + CHECK_HIPBLAS_ERROR(hipblasSetPointerMode(handle, HIPBLAS_POINTER_MODE_DEVICE)); + CHECK_HIPBLAS_ERROR(hipblasAxpyFn(handle, N, d_alpha, dx, incx, dy_device, incy)); - if((status_1 != HIPBLAS_STATUS_SUCCESS) || (status_2 != HIPBLAS_STATUS_SUCCESS) - || (status_3 != HIPBLAS_STATUS_SUCCESS) || (status_4 != HIPBLAS_STATUS_SUCCESS)) - { - hipblasDestroy(handle); - if(status_1 != HIPBLAS_STATUS_SUCCESS) - return status_1; - if(status_2 != HIPBLAS_STATUS_SUCCESS) - return status_2; - if(status_3 != HIPBLAS_STATUS_SUCCESS) - return status_3; - if(status_4 != HIPBLAS_STATUS_SUCCESS) - return status_4; - } + CHECK_HIPBLAS_ERROR(hipblasSetPointerMode(handle, HIPBLAS_POINTER_MODE_HOST)); + CHECK_HIPBLAS_ERROR(hipblasAxpyFn(handle, N, &alpha, dx, incx, dy_host, incy)); // copy output from device to CPU CHECK_HIP_ERROR(hipMemcpy(hy_host.data(), dy_host, sizeof(T) * sizeY, hipMemcpyDeviceToHost)); @@ -128,17 +108,8 @@ hipblasStatus_t testing_axpy(const Arguments& argus) if(argus.timing) { hipStream_t stream; - status_1 = hipblasGetStream(handle, &stream); - 
status_2 = hipblasSetPointerMode(handle, HIPBLAS_POINTER_MODE_DEVICE); - - if((status_1 != HIPBLAS_STATUS_SUCCESS) || (status_2 != HIPBLAS_STATUS_SUCCESS)) - { - hipblasDestroy(handle); - if(status_1 != HIPBLAS_STATUS_SUCCESS) - return status_1; - if(status_2 != HIPBLAS_STATUS_SUCCESS) - return status_2; - } + CHECK_HIPBLAS_ERROR(hipblasGetStream(handle, &stream)); + CHECK_HIPBLAS_ERROR(hipblasSetPointerMode(handle, HIPBLAS_POINTER_MODE_DEVICE)); int runs = argus.cold_iters + argus.iters; for(int iter = 0; iter < runs; iter++) @@ -146,13 +117,7 @@ hipblasStatus_t testing_axpy(const Arguments& argus) if(iter == argus.cold_iters) gpu_time_used = get_time_us_sync(stream); - status_1 = hipblasAxpyFn(handle, N, d_alpha, dx, incx, dy_device, incy); - - if(status_1 != HIPBLAS_STATUS_SUCCESS) - { - hipblasDestroy(handle); - return status_1; - } + CHECK_HIPBLAS_ERROR(hipblasAxpyFn(handle, N, d_alpha, dx, incx, dy_device, incy)); } gpu_time_used = get_time_us_sync(stream) - gpu_time_used; @@ -165,6 +130,5 @@ hipblasStatus_t testing_axpy(const Arguments& argus) hipblas_error_device); } - hipblasDestroy(handle); return HIPBLAS_STATUS_SUCCESS; } diff --git a/clients/include/testing_axpy_batched.hpp b/clients/include/testing_axpy_batched.hpp index 2ff4c1e5c..6e2781ef2 100644 --- a/clients/include/testing_axpy_batched.hpp +++ b/clients/include/testing_axpy_batched.hpp @@ -24,13 +24,8 @@ hipblasStatus_t testing_axpy_batched(const Arguments& argus) int incx = argus.incx; int incy = argus.incy; int batch_count = argus.batch_count; - - hipblasStatus_t status_1 = HIPBLAS_STATUS_SUCCESS; - hipblasStatus_t status_2 = HIPBLAS_STATUS_SUCCESS; - hipblasStatus_t status_3 = HIPBLAS_STATUS_SUCCESS; - hipblasStatus_t status_4 = HIPBLAS_STATUS_SUCCESS; - int abs_incx = incx < 0 ? -incx : incx; - int abs_incy = incy < 0 ? -incy : incy; + int abs_incx = incx < 0 ? -incx : incx; + int abs_incy = incy < 0 ? -incy : incy; // argument sanity check, quick return if input parameters are invalid before allocating invalid // memory @@ -49,8 +44,7 @@ hipblasStatus_t testing_axpy_batched(const Arguments& argus) double gpu_time_used, hipblas_error_host, hipblas_error_device; - hipblasHandle_t handle; - hipblasCreate(&handle); + hipblasLocalHandle handle(argus); // Naming: dX is in GPU (device) memory. 
hK is in CPU (host) memory, plz follow this practice host_batch_vector hx(N, incx, batch_count); @@ -80,27 +74,19 @@ hipblasStatus_t testing_axpy_batched(const Arguments& argus) /* ===================================================================== HIPBLAS =================================================================== */ - status_1 = hipblasSetPointerMode(handle, HIPBLAS_POINTER_MODE_DEVICE); - status_2 = hipblasAxpyBatchedFn( - handle, N, d_alpha, dx.ptr_on_device(), incx, dy_device.ptr_on_device(), incy, batch_count); - - status_3 = hipblasSetPointerMode(handle, HIPBLAS_POINTER_MODE_HOST); - status_4 = hipblasAxpyBatchedFn( - handle, N, &alpha, dx.ptr_on_device(), incx, dy_host.ptr_on_device(), incy, batch_count); - - if((status_1 != HIPBLAS_STATUS_SUCCESS) || (status_2 != HIPBLAS_STATUS_SUCCESS) - || (status_3 != HIPBLAS_STATUS_SUCCESS) || (status_4 != HIPBLAS_STATUS_SUCCESS)) - { - hipblasDestroy(handle); - if(status_1 != HIPBLAS_STATUS_SUCCESS) - return status_1; - if(status_2 != HIPBLAS_STATUS_SUCCESS) - return status_2; - if(status_3 != HIPBLAS_STATUS_SUCCESS) - return status_3; - if(status_4 != HIPBLAS_STATUS_SUCCESS) - return status_4; - } + CHECK_HIPBLAS_ERROR(hipblasSetPointerMode(handle, HIPBLAS_POINTER_MODE_DEVICE)); + CHECK_HIPBLAS_ERROR(hipblasAxpyBatchedFn(handle, + N, + d_alpha, + dx.ptr_on_device(), + incx, + dy_device.ptr_on_device(), + incy, + batch_count)); + + CHECK_HIPBLAS_ERROR(hipblasSetPointerMode(handle, HIPBLAS_POINTER_MODE_HOST)); + CHECK_HIPBLAS_ERROR(hipblasAxpyBatchedFn( + handle, N, &alpha, dx.ptr_on_device(), incx, dy_host.ptr_on_device(), incy, batch_count)); CHECK_HIP_ERROR(hy_host.transfer_from(dy_host)); CHECK_HIP_ERROR(hy_device.transfer_from(dy_device)); @@ -133,17 +119,8 @@ hipblasStatus_t testing_axpy_batched(const Arguments& argus) if(argus.timing) { hipStream_t stream; - status_1 = hipblasGetStream(handle, &stream); - status_2 = hipblasSetPointerMode(handle, HIPBLAS_POINTER_MODE_DEVICE); - - if((status_1 != HIPBLAS_STATUS_SUCCESS) || (status_2 != HIPBLAS_STATUS_SUCCESS)) - { - hipblasDestroy(handle); - if(status_1 != HIPBLAS_STATUS_SUCCESS) - return status_1; - if(status_2 != HIPBLAS_STATUS_SUCCESS) - return status_2; - } + CHECK_HIPBLAS_ERROR(hipblasGetStream(handle, &stream)); + CHECK_HIPBLAS_ERROR(hipblasSetPointerMode(handle, HIPBLAS_POINTER_MODE_DEVICE)); int runs = argus.cold_iters + argus.iters; for(int iter = 0; iter < runs; iter++) @@ -151,20 +128,14 @@ hipblasStatus_t testing_axpy_batched(const Arguments& argus) if(iter == argus.cold_iters) gpu_time_used = get_time_us_sync(stream); - status_1 = hipblasAxpyBatchedFn(handle, - N, - d_alpha, - dx.ptr_on_device(), - incx, - dy_device.ptr_on_device(), - incy, - batch_count); - - if(status_1 != HIPBLAS_STATUS_SUCCESS) - { - hipblasDestroy(handle); - return status_1; - } + CHECK_HIPBLAS_ERROR(hipblasAxpyBatchedFn(handle, + N, + d_alpha, + dx.ptr_on_device(), + incx, + dy_device.ptr_on_device(), + incy, + batch_count)); } gpu_time_used = get_time_us_sync(stream) - gpu_time_used; @@ -177,8 +148,5 @@ hipblasStatus_t testing_axpy_batched(const Arguments& argus) hipblas_error_device); } - // BLAS_1_RESULT_PRINT - - hipblasDestroy(handle); return HIPBLAS_STATUS_SUCCESS; } diff --git a/clients/include/testing_axpy_strided_batched.hpp b/clients/include/testing_axpy_strided_batched.hpp index e601e5756..1de44ec19 100644 --- a/clients/include/testing_axpy_strided_batched.hpp +++ b/clients/include/testing_axpy_strided_batched.hpp @@ -35,11 +35,6 @@ hipblasStatus_t testing_axpy_strided_batched(const 
Arguments& argus) int sizeX = stridex * batch_count; int sizeY = stridey * batch_count; - hipblasStatus_t status_1 = HIPBLAS_STATUS_SUCCESS; - hipblasStatus_t status_2 = HIPBLAS_STATUS_SUCCESS; - hipblasStatus_t status_3 = HIPBLAS_STATUS_SUCCESS; - hipblasStatus_t status_4 = HIPBLAS_STATUS_SUCCESS; - // argument sanity check, quick return if input parameters are invalid before allocating invalid // memory if(N < 0 || !incx || !incy || batch_count < 0) @@ -65,8 +60,7 @@ hipblasStatus_t testing_axpy_strided_batched(const Arguments& argus) double gpu_time_used, hipblas_error_host, hipblas_error_device; - hipblasHandle_t handle; - hipblasCreate(&handle); + hipblasLocalHandle handle(argus); // Initial Data on CPU srand(1); @@ -87,27 +81,13 @@ hipblasStatus_t testing_axpy_strided_batched(const Arguments& argus) /* ===================================================================== HIPBLAS =================================================================== */ - status_1 = hipblasSetPointerMode(handle, HIPBLAS_POINTER_MODE_DEVICE); - status_2 = hipblasAxpyStridedBatchedFn( - handle, N, d_alpha, dx, incx, stridex, dy_device, incy, stridey, batch_count); - - status_3 = hipblasSetPointerMode(handle, HIPBLAS_POINTER_MODE_HOST); - status_4 = hipblasAxpyStridedBatchedFn( - handle, N, &alpha, dx, incx, stridex, dy_host, incy, stridey, batch_count); + CHECK_HIPBLAS_ERROR(hipblasSetPointerMode(handle, HIPBLAS_POINTER_MODE_DEVICE)); + CHECK_HIPBLAS_ERROR(hipblasAxpyStridedBatchedFn( + handle, N, d_alpha, dx, incx, stridex, dy_device, incy, stridey, batch_count)); - if((status_1 != HIPBLAS_STATUS_SUCCESS) || (status_2 != HIPBLAS_STATUS_SUCCESS) - || (status_3 != HIPBLAS_STATUS_SUCCESS) || (status_4 != HIPBLAS_STATUS_SUCCESS)) - { - hipblasDestroy(handle); - if(status_1 != HIPBLAS_STATUS_SUCCESS) - return status_1; - if(status_2 != HIPBLAS_STATUS_SUCCESS) - return status_2; - if(status_3 != HIPBLAS_STATUS_SUCCESS) - return status_3; - if(status_4 != HIPBLAS_STATUS_SUCCESS) - return status_4; - } + CHECK_HIPBLAS_ERROR(hipblasSetPointerMode(handle, HIPBLAS_POINTER_MODE_HOST)); + CHECK_HIPBLAS_ERROR(hipblasAxpyStridedBatchedFn( + handle, N, &alpha, dx, incx, stridex, dy_host, incy, stridey, batch_count)); // copy output from device to CPU CHECK_HIP_ERROR(hipMemcpy(hy_host.data(), dy_host, sizeof(T) * sizeX, hipMemcpyDeviceToHost)); @@ -146,17 +126,8 @@ hipblasStatus_t testing_axpy_strided_batched(const Arguments& argus) if(argus.timing) { hipStream_t stream; - status_1 = hipblasGetStream(handle, &stream); - status_2 = hipblasSetPointerMode(handle, HIPBLAS_POINTER_MODE_DEVICE); - - if((status_1 != HIPBLAS_STATUS_SUCCESS) || (status_2 != HIPBLAS_STATUS_SUCCESS)) - { - hipblasDestroy(handle); - if(status_1 != HIPBLAS_STATUS_SUCCESS) - return status_1; - if(status_2 != HIPBLAS_STATUS_SUCCESS) - return status_2; - } + CHECK_HIPBLAS_ERROR(hipblasGetStream(handle, &stream)); + CHECK_HIPBLAS_ERROR(hipblasSetPointerMode(handle, HIPBLAS_POINTER_MODE_DEVICE)); int runs = argus.cold_iters + argus.iters; for(int iter = 0; iter < runs; iter++) @@ -164,14 +135,8 @@ hipblasStatus_t testing_axpy_strided_batched(const Arguments& argus) if(iter == argus.cold_iters) gpu_time_used = get_time_us_sync(stream); - status_1 = hipblasAxpyStridedBatchedFn( - handle, N, d_alpha, dx, incx, stridex, dy_device, incy, stridey, batch_count); - - if(status_1 != HIPBLAS_STATUS_SUCCESS) - { - hipblasDestroy(handle); - return status_1; - } + CHECK_HIPBLAS_ERROR(hipblasAxpyStridedBatchedFn( + handle, N, d_alpha, dx, incx, stridex, dy_device, incy, 
stridey, batch_count)); } gpu_time_used = get_time_us_sync(stream) - gpu_time_used; @@ -185,6 +150,5 @@ hipblasStatus_t testing_axpy_strided_batched(const Arguments& argus) hipblas_error_device); } - hipblasDestroy(handle); return HIPBLAS_STATUS_SUCCESS; } diff --git a/clients/include/testing_copy.hpp b/clients/include/testing_copy.hpp index 192d513f3..a14164e4d 100644 --- a/clients/include/testing_copy.hpp +++ b/clients/include/testing_copy.hpp @@ -23,31 +23,28 @@ hipblasStatus_t testing_copy(const Arguments& argus) int incx = argus.incx; int incy = argus.incy; - int unit_check = argus.unit_check; - int timing = argus.timing; - hipblasStatus_t status = HIPBLAS_STATUS_SUCCESS; + int unit_check = argus.unit_check; + int timing = argus.timing; // argument sanity check, quick return if input parameters are invalid before allocating invalid // memory if(N < 0) { - status = HIPBLAS_STATUS_INVALID_VALUE; - return status; + return HIPBLAS_STATUS_INVALID_VALUE; } else if(incx < 0) { - status = HIPBLAS_STATUS_INVALID_VALUE; - return status; + return HIPBLAS_STATUS_INVALID_VALUE; } int sizeX = N * incx; int sizeY = N * incy; // Naming: dX is in GPU (device) memory. hK is in CPU (host) memory, plz follow this practice - vector hx(sizeX); - vector hy(sizeY); - vector hx_cpu(sizeX); - vector hy_cpu(sizeY); + host_vector hx(sizeX); + host_vector hy(sizeY); + host_vector hx_cpu(sizeX); + host_vector hy_cpu(sizeY); // allocate memory on device device_vector dx(sizeX); @@ -56,9 +53,7 @@ hipblasStatus_t testing_copy(const Arguments& argus) double hipblas_error = 0.0; double gpu_time_used = 0.0; - hipblasHandle_t handle; - - hipblasCreate(&handle); + hipblasLocalHandle handle(argus); // Initial Data on CPU srand(1); @@ -72,14 +67,9 @@ hipblasStatus_t testing_copy(const Arguments& argus) CHECK_HIP_ERROR(hipMemcpy(dy, hy.data(), sizeof(T) * sizeY, hipMemcpyHostToDevice)); /* ===================================================================== - ROCBLAS + HIPBLAS =================================================================== */ - status = hipblasCopyFn(handle, N, dx, incx, dy, incy); - if(status != HIPBLAS_STATUS_SUCCESS) - { - hipblasDestroy(handle); - return status; - } + CHECK_HIPBLAS_ERROR(hipblasCopyFn(handle, N, dx, incx, dy, incy)); // copy output from device to CPU CHECK_HIP_ERROR(hipMemcpy(hx.data(), dx, sizeof(T) * sizeX, hipMemcpyDeviceToHost)); @@ -105,12 +95,7 @@ hipblasStatus_t testing_copy(const Arguments& argus) if(timing) { hipStream_t stream; - status = hipblasGetStream(handle, &stream); - if(status != HIPBLAS_STATUS_SUCCESS) - { - hipblasDestroy(handle); - return status; - } + CHECK_HIPBLAS_ERROR(hipblasGetStream(handle, &stream)); int runs = argus.cold_iters + argus.iters; for(int iter = 0; iter < runs; iter++) @@ -118,13 +103,7 @@ hipblasStatus_t testing_copy(const Arguments& argus) if(iter == argus.cold_iters) gpu_time_used = get_time_us_sync(stream); - status = hipblasCopyFn(handle, N, dx, incx, dy, incy); - - if(status != HIPBLAS_STATUS_SUCCESS) - { - hipblasDestroy(handle); - return status; - } + CHECK_HIPBLAS_ERROR(hipblasCopyFn(handle, N, dx, incx, dy, incy)); } gpu_time_used = get_time_us_sync(stream) - gpu_time_used; @@ -136,6 +115,5 @@ hipblasStatus_t testing_copy(const Arguments& argus) hipblas_error); } - hipblasDestroy(handle); - return status; + return HIPBLAS_STATUS_SUCCESS; } diff --git a/clients/include/testing_copy_batched.hpp b/clients/include/testing_copy_batched.hpp index 41143e52b..0e12ea596 100644 --- a/clients/include/testing_copy_batched.hpp +++ 
b/clients/include/testing_copy_batched.hpp @@ -25,9 +25,8 @@ hipblasStatus_t testing_copy_batched(const Arguments& argus) int incy = argus.incy; int batch_count = argus.batch_count; - int unit_check = argus.unit_check; - int timing = argus.timing; - hipblasStatus_t status = HIPBLAS_STATUS_SUCCESS; + int unit_check = argus.unit_check; + int timing = argus.timing; // argument sanity check, quick return if input parameters are invalid before allocating invalid // memory @@ -46,69 +45,36 @@ hipblasStatus_t testing_copy_batched(const Arguments& argus) double hipblas_error = 0.0; double gpu_time_used = 0.0; - hipblasHandle_t handle; - hipblasCreate(&handle); + hipblasLocalHandle handle(argus); // Naming: dX is in GPU (device) memory. hK is in CPU (host) memory, plz follow this practice - host_vector hx_array[batch_count]; - host_vector hy_array[batch_count]; - host_vector hx_cpu_array[batch_count]; - host_vector hy_cpu_array[batch_count]; + host_batch_vector hx(N, incx, batch_count); + host_batch_vector hy(N, incy, batch_count); + host_batch_vector hx_cpu(N, incx, batch_count); + host_batch_vector hy_cpu(N, incy, batch_count); - device_batch_vector bx_array(batch_count, sizeX); - device_batch_vector by_array(batch_count, sizeY); + device_batch_vector dx(N, incx, batch_count); + device_batch_vector dy(N, incy, batch_count); + CHECK_HIP_ERROR(dx.memcheck()); + CHECK_HIP_ERROR(dy.memcheck()); - device_vector dx_array(batch_count); - device_vector dy_array(batch_count); + hipblas_init(hx, true); + hipblas_init(hy, false); - int last = batch_count - 1; - if(!dx_array || !dy_array || (!bx_array[last] && sizeX) || (!by_array[last] && sizeY)) - { - hipblasDestroy(handle); - return HIPBLAS_STATUS_ALLOC_FAILED; - } - - srand(1); - for(int b = 0; b < batch_count; b++) - { - hx_array[b] = host_vector(sizeX); - hy_array[b] = host_vector(sizeY); - hx_cpu_array[b] = host_vector(sizeX); - hy_cpu_array[b] = host_vector(sizeY); - - srand(1); - hipblas_init(hx_array[b], 1, N, incx); - hipblas_init(hy_array[b], 1, N, incy); - - hx_cpu_array[b] = hx_array[b]; - hy_cpu_array[b] = hy_array[b]; - - CHECK_HIP_ERROR( - hipMemcpy(bx_array[b], hx_array[b], sizeof(T) * sizeX, hipMemcpyHostToDevice)); - CHECK_HIP_ERROR( - hipMemcpy(by_array[b], hy_array[b], sizeof(T) * sizeY, hipMemcpyHostToDevice)); - } - CHECK_HIP_ERROR(hipMemcpy(dx_array, bx_array, batch_count * sizeof(T*), hipMemcpyHostToDevice)); - CHECK_HIP_ERROR(hipMemcpy(dy_array, by_array, batch_count * sizeof(T*), hipMemcpyHostToDevice)); + hx_cpu.copy_from(hx); + hy_cpu.copy_from(hy); + CHECK_HIP_ERROR(dx.transfer_from(hx)); + CHECK_HIP_ERROR(dy.transfer_from(hy)); /* ===================================================================== - ROCBLAS + HIPBLAS =================================================================== */ - status = hipblasCopyBatchedFn(handle, N, dx_array, incx, dy_array, incy, batch_count); - if(status != HIPBLAS_STATUS_SUCCESS) - { - hipblasDestroy(handle); - return status; - } + CHECK_HIPBLAS_ERROR(hipblasCopyBatchedFn( + handle, N, dx.ptr_on_device(), incx, dy.ptr_on_device(), incy, batch_count)); // copy output from device to CPU - for(int b = 0; b < batch_count; b++) - { - CHECK_HIP_ERROR( - hipMemcpy(hx_array[b], bx_array[b], sizeof(T) * sizeX, hipMemcpyDeviceToHost)); - CHECK_HIP_ERROR( - hipMemcpy(hy_array[b], by_array[b], sizeof(T) * sizeY, hipMemcpyDeviceToHost)); - } + CHECK_HIP_ERROR(hx.transfer_from(dx)); + CHECK_HIP_ERROR(hy.transfer_from(dy)); if(unit_check) { @@ -118,14 +84,14 @@ hipblasStatus_t testing_copy_batched(const 
Arguments& argus) for(int b = 0; b < batch_count; b++) { - cblas_copy(N, hx_cpu_array[b], incx, hy_cpu_array[b], incy); + cblas_copy(N, hx_cpu[b], incx, hy_cpu[b], incy); } // enable unit check, notice unit check is not invasive, but norm check is, // unit check and norm check can not be interchanged their order if(argus.unit_check) { - unit_check_general(1, N, batch_count, incy, hy_cpu_array, hy_array); + unit_check_general(1, N, batch_count, incy, hy_cpu, hy); } } // end of if unit check @@ -133,12 +99,7 @@ hipblasStatus_t testing_copy_batched(const Arguments& argus) if(timing) { hipStream_t stream; - status = hipblasGetStream(handle, &stream); - if(status != HIPBLAS_STATUS_SUCCESS) - { - hipblasDestroy(handle); - return status; - } + CHECK_HIPBLAS_ERROR(hipblasGetStream(handle, &stream)); int runs = argus.cold_iters + argus.iters; for(int iter = 0; iter < runs; iter++) @@ -146,13 +107,8 @@ hipblasStatus_t testing_copy_batched(const Arguments& argus) if(iter == argus.cold_iters) gpu_time_used = get_time_us_sync(stream); - status = hipblasCopyBatchedFn(handle, N, dx_array, incx, dy_array, incy, batch_count); - - if(status != HIPBLAS_STATUS_SUCCESS) - { - hipblasDestroy(handle); - return status; - } + CHECK_HIPBLAS_ERROR(hipblasCopyBatchedFn( + handle, N, dx.ptr_on_device(), incx, dy.ptr_on_device(), incy, batch_count)); } gpu_time_used = get_time_us_sync(stream) - gpu_time_used; @@ -164,6 +120,5 @@ hipblasStatus_t testing_copy_batched(const Arguments& argus) hipblas_error); } - hipblasDestroy(handle); - return status; + return HIPBLAS_STATUS_SUCCESS; } diff --git a/clients/include/testing_copy_strided_batched.hpp b/clients/include/testing_copy_strided_batched.hpp index d528d7aef..084f687fc 100644 --- a/clients/include/testing_copy_strided_batched.hpp +++ b/clients/include/testing_copy_strided_batched.hpp @@ -34,8 +34,6 @@ hipblasStatus_t testing_copy_strided_batched(const Arguments& argus) int sizeX = stridex * batch_count; int sizeY = stridey * batch_count; - hipblasStatus_t status = HIPBLAS_STATUS_SUCCESS; - // argument sanity check, quick return if input parameters are invalid before allocating invalid // memory if(N < 0 || incx < 0 || batch_count < 0) @@ -55,8 +53,7 @@ hipblasStatus_t testing_copy_strided_batched(const Arguments& argus) double gpu_time_used = 0.0; double hipblas_error = 0.0; - hipblasHandle_t handle; - hipblasCreate(&handle); + hipblasLocalHandle handle(argus); // Initial Data on CPU srand(1); @@ -70,15 +67,10 @@ hipblasStatus_t testing_copy_strided_batched(const Arguments& argus) CHECK_HIP_ERROR(hipMemcpy(dy, hy.data(), sizeof(T) * sizeY, hipMemcpyHostToDevice)); /* ===================================================================== - ROCBLAS + HIPBLAS =================================================================== */ - status - = hipblasCopyStridedBatchedFn(handle, N, dx, incx, stridex, dy, incy, stridey, batch_count); - if(status != HIPBLAS_STATUS_SUCCESS) - { - hipblasDestroy(handle); - return status; - } + CHECK_HIPBLAS_ERROR( + hipblasCopyStridedBatchedFn(handle, N, dx, incx, stridex, dy, incy, stridey, batch_count)); // copy output from device to CPU CHECK_HIP_ERROR(hipMemcpy(hx.data(), dx, sizeof(T) * sizeX, hipMemcpyDeviceToHost)); @@ -106,12 +98,7 @@ hipblasStatus_t testing_copy_strided_batched(const Arguments& argus) if(timing) { hipStream_t stream; - status = hipblasGetStream(handle, &stream); - if(status != HIPBLAS_STATUS_SUCCESS) - { - hipblasDestroy(handle); - return status; - } + CHECK_HIPBLAS_ERROR(hipblasGetStream(handle, &stream)); int runs = 
argus.cold_iters + argus.iters; for(int iter = 0; iter < runs; iter++) @@ -119,14 +106,8 @@ hipblasStatus_t testing_copy_strided_batched(const Arguments& argus) if(iter == argus.cold_iters) gpu_time_used = get_time_us_sync(stream); - status = hipblasCopyStridedBatchedFn( - handle, N, dx, incx, stridex, dy, incy, stridey, batch_count); - - if(status != HIPBLAS_STATUS_SUCCESS) - { - hipblasDestroy(handle); - return status; - } + CHECK_HIPBLAS_ERROR(hipblasCopyStridedBatchedFn( + handle, N, dx, incx, stridex, dy, incy, stridey, batch_count)); } gpu_time_used = get_time_us_sync(stream) - gpu_time_used; @@ -139,6 +120,5 @@ hipblasStatus_t testing_copy_strided_batched(const Arguments& argus) hipblas_error); } - hipblasDestroy(handle); - return status; + return HIPBLAS_STATUS_SUCCESS; } diff --git a/clients/include/testing_dot.hpp b/clients/include/testing_dot.hpp index 8336007cd..4d4facab4 100644 --- a/clients/include/testing_dot.hpp +++ b/clients/include/testing_dot.hpp @@ -24,33 +24,19 @@ hipblasStatus_t testing_dot(const Arguments& argus) int incx = argus.incx; int incy = argus.incy; - hipblasStatus_t status_1 = HIPBLAS_STATUS_SUCCESS; - hipblasStatus_t status_2 = HIPBLAS_STATUS_SUCCESS; - // argument sanity check, quick return if input parameters are invalid before allocating invalid // memory - if(N < 0) - { - status_1 = HIPBLAS_STATUS_INVALID_VALUE; - return status_1; - } - else if(incx < 0) + if(N < 0 || incx < 0 || incy < 0) { - status_1 = HIPBLAS_STATUS_INVALID_VALUE; - return status_1; - } - else if(incy < 0) - { - status_1 = HIPBLAS_STATUS_INVALID_VALUE; - return status_1; + return HIPBLAS_STATUS_INVALID_VALUE; } int sizeX = N * incx; int sizeY = N * incy; // Naming: dX is in GPU (device) memory. hK is in CPU (host) memory, plz follow this practice - vector hx(sizeX); - vector hy(sizeY); + host_vector hx(sizeX); + host_vector hy(sizeY); T cpu_result, h_hipblas_result_1, h_hipblas_result_2; device_vector dx(sizeX); @@ -59,8 +45,7 @@ hipblasStatus_t testing_dot(const Arguments& argus) double gpu_time_used, hipblas_error_host, hipblas_error_device; - hipblasHandle_t handle; - hipblasCreate(&handle); + hipblasLocalHandle handle(argus); // Initial Data on CPU srand(1); @@ -75,34 +60,17 @@ hipblasStatus_t testing_dot(const Arguments& argus) HIPBLAS =================================================================== */ // hipblasDot accept both dev/host pointer for the scalar - status_1 = hipblasSetPointerMode(handle, HIPBLAS_POINTER_MODE_DEVICE); - status_2 = (hipblasDotFn)(handle, N, dx, incx, dy, incy, d_hipblas_result); - if((status_1 != HIPBLAS_STATUS_SUCCESS) || (status_2 != HIPBLAS_STATUS_SUCCESS)) - { - hipblasDestroy(handle); - if(status_1 != HIPBLAS_STATUS_SUCCESS) - return status_1; - if(status_2 != HIPBLAS_STATUS_SUCCESS) - return status_2; - } + CHECK_HIPBLAS_ERROR(hipblasSetPointerMode(handle, HIPBLAS_POINTER_MODE_DEVICE)); + CHECK_HIPBLAS_ERROR((hipblasDotFn)(handle, N, dx, incx, dy, incy, d_hipblas_result)); - status_1 = hipblasSetPointerMode(handle, HIPBLAS_POINTER_MODE_HOST); - status_2 = (hipblasDotFn)(handle, N, dx, incx, dy, incy, &h_hipblas_result_1); - if((status_1 != HIPBLAS_STATUS_SUCCESS) || (status_2 != HIPBLAS_STATUS_SUCCESS)) - { - hipblasDestroy(handle); - if(status_1 != HIPBLAS_STATUS_SUCCESS) - return status_1; - if(status_2 != HIPBLAS_STATUS_SUCCESS) - return status_2; - } + CHECK_HIPBLAS_ERROR(hipblasSetPointerMode(handle, HIPBLAS_POINTER_MODE_HOST)); + CHECK_HIPBLAS_ERROR((hipblasDotFn)(handle, N, dx, incx, dy, incy, &h_hipblas_result_1)); CHECK_HIP_ERROR( 
hipMemcpy(&h_hipblas_result_2, d_hipblas_result, sizeof(T), hipMemcpyDeviceToHost)); if(argus.unit_check || argus.norm_check) { - /* ===================================================================== CPU BLAS =================================================================== */ @@ -126,17 +94,8 @@ hipblasStatus_t testing_dot(const Arguments& argus) if(argus.timing) { hipStream_t stream; - status_1 = hipblasGetStream(handle, &stream); - status_2 = hipblasSetPointerMode(handle, HIPBLAS_POINTER_MODE_DEVICE); - - if((status_1 != HIPBLAS_STATUS_SUCCESS) || (status_2 != HIPBLAS_STATUS_SUCCESS)) - { - hipblasDestroy(handle); - if(status_1 != HIPBLAS_STATUS_SUCCESS) - return status_1; - if(status_2 != HIPBLAS_STATUS_SUCCESS) - return status_2; - } + CHECK_HIPBLAS_ERROR(hipblasGetStream(handle, &stream)); + CHECK_HIPBLAS_ERROR(hipblasSetPointerMode(handle, HIPBLAS_POINTER_MODE_DEVICE)); int runs = argus.cold_iters + argus.iters; for(int iter = 0; iter < runs; iter++) @@ -144,13 +103,7 @@ hipblasStatus_t testing_dot(const Arguments& argus) if(iter == argus.cold_iters) gpu_time_used = get_time_us_sync(stream); - status_1 = (hipblasDotFn)(handle, N, dx, incx, dy, incy, d_hipblas_result); - - if(status_1 != HIPBLAS_STATUS_SUCCESS) - { - hipblasDestroy(handle); - return status_1; - } + CHECK_HIPBLAS_ERROR((hipblasDotFn)(handle, N, dx, incx, dy, incy, d_hipblas_result)); } gpu_time_used = get_time_us_sync(stream) - gpu_time_used; @@ -163,8 +116,6 @@ hipblasStatus_t testing_dot(const Arguments& argus) hipblas_error_device); } - // BLAS_1_RESULT_PRINT - hipblasDestroy(handle); return HIPBLAS_STATUS_SUCCESS; } diff --git a/clients/include/testing_dot_batched.hpp b/clients/include/testing_dot_batched.hpp index 5eed829ef..0d034d714 100644 --- a/clients/include/testing_dot_batched.hpp +++ b/clients/include/testing_dot_batched.hpp @@ -26,11 +26,6 @@ hipblasStatus_t testing_dot_batched(const Arguments& argus) int incy = argus.incy; int batch_count = argus.batch_count; - hipblasStatus_t status_1 = HIPBLAS_STATUS_SUCCESS; - hipblasStatus_t status_2 = HIPBLAS_STATUS_SUCCESS; - hipblasStatus_t status_3 = HIPBLAS_STATUS_SUCCESS; - hipblasStatus_t status_4 = HIPBLAS_STATUS_SUCCESS; - // argument sanity check, quick return if input parameters are invalid before allocating invalid // memory if(N < 0 || incx < 0 || incy < 0 || batch_count < 0) @@ -47,8 +42,7 @@ hipblasStatus_t testing_dot_batched(const Arguments& argus) double gpu_time_used, hipblas_error_host, hipblas_error_device; - hipblasHandle_t handle; - hipblasCreate(&handle); + hipblasLocalHandle handle(argus); // Naming: dX is in GPU (device) memory. 
hK is in CPU (host) memory, plz follow this practice host_batch_vector hx(N, incx, batch_count); @@ -72,39 +66,25 @@ hipblasStatus_t testing_dot_batched(const Arguments& argus) HIPBLAS =================================================================== */ // hipblasDot accept both dev/host pointer for the scalar - status_1 = hipblasSetPointerMode(handle, HIPBLAS_POINTER_MODE_DEVICE); - status_2 = (hipblasDotBatchedFn)(handle, - N, - dx.ptr_on_device(), - incx, - dy.ptr_on_device(), - incy, - batch_count, - d_hipblas_result); - - status_3 = hipblasSetPointerMode(handle, HIPBLAS_POINTER_MODE_HOST); - status_4 = (hipblasDotBatchedFn)(handle, - N, - dx.ptr_on_device(), - incx, - dy.ptr_on_device(), - incy, - batch_count, - h_hipblas_result1); - - if((status_1 != HIPBLAS_STATUS_SUCCESS) || (status_2 != HIPBLAS_STATUS_SUCCESS) - || (status_3 != HIPBLAS_STATUS_SUCCESS) || (status_4 != HIPBLAS_STATUS_SUCCESS)) - { - hipblasDestroy(handle); - if(status_1 != HIPBLAS_STATUS_SUCCESS) - return status_1; - if(status_2 != HIPBLAS_STATUS_SUCCESS) - return status_2; - if(status_3 != HIPBLAS_STATUS_SUCCESS) - return status_3; - if(status_4 != HIPBLAS_STATUS_SUCCESS) - return status_4; - } + CHECK_HIPBLAS_ERROR(hipblasSetPointerMode(handle, HIPBLAS_POINTER_MODE_DEVICE)); + CHECK_HIPBLAS_ERROR((hipblasDotBatchedFn)(handle, + N, + dx.ptr_on_device(), + incx, + dy.ptr_on_device(), + incy, + batch_count, + d_hipblas_result)); + + CHECK_HIPBLAS_ERROR(hipblasSetPointerMode(handle, HIPBLAS_POINTER_MODE_HOST)); + CHECK_HIPBLAS_ERROR((hipblasDotBatchedFn)(handle, + N, + dx.ptr_on_device(), + incx, + dy.ptr_on_device(), + incy, + batch_count, + h_hipblas_result1)); CHECK_HIP_ERROR(hipMemcpy( h_hipblas_result2, d_hipblas_result, sizeof(T) * batch_count, hipMemcpyDeviceToHost)); @@ -137,16 +117,8 @@ hipblasStatus_t testing_dot_batched(const Arguments& argus) if(argus.timing) { hipStream_t stream; - status_1 = hipblasGetStream(handle, &stream); - status_2 = hipblasSetPointerMode(handle, HIPBLAS_POINTER_MODE_DEVICE); - if(status_1 != HIPBLAS_STATUS_SUCCESS || status_2 != HIPBLAS_STATUS_SUCCESS) - { - hipblasDestroy(handle); - if(status_1 != HIPBLAS_STATUS_SUCCESS) - return status_1; - if(status_2 != HIPBLAS_STATUS_SUCCESS) - return status_2; - } + CHECK_HIPBLAS_ERROR(hipblasGetStream(handle, &stream)); + CHECK_HIPBLAS_ERROR(hipblasSetPointerMode(handle, HIPBLAS_POINTER_MODE_DEVICE)); int runs = argus.cold_iters + argus.iters; for(int iter = 0; iter < runs; iter++) @@ -154,20 +126,14 @@ hipblasStatus_t testing_dot_batched(const Arguments& argus) if(iter == argus.cold_iters) gpu_time_used = get_time_us_sync(stream); - status_1 = (hipblasDotBatchedFn)(handle, - N, - dx.ptr_on_device(), - incx, - dy.ptr_on_device(), - incy, - batch_count, - d_hipblas_result); - - if(status_1 != HIPBLAS_STATUS_SUCCESS) - { - hipblasDestroy(handle); - return status_1; - } + CHECK_HIPBLAS_ERROR((hipblasDotBatchedFn)(handle, + N, + dx.ptr_on_device(), + incx, + dy.ptr_on_device(), + incy, + batch_count, + d_hipblas_result)); } gpu_time_used = get_time_us_sync(stream) - gpu_time_used; @@ -180,7 +146,6 @@ hipblasStatus_t testing_dot_batched(const Arguments& argus) hipblas_error_device); } - hipblasDestroy(handle); return HIPBLAS_STATUS_SUCCESS; } diff --git a/clients/include/testing_dot_strided_batched.hpp b/clients/include/testing_dot_strided_batched.hpp index c71a3550e..2d365c542 100644 --- a/clients/include/testing_dot_strided_batched.hpp +++ b/clients/include/testing_dot_strided_batched.hpp @@ -33,11 +33,6 @@ hipblasStatus_t 
testing_dot_strided_batched(const Arguments& argus) int sizeX = stridex * batch_count; int sizeY = stridey * batch_count; - hipblasStatus_t status_1 = HIPBLAS_STATUS_SUCCESS; - hipblasStatus_t status_2 = HIPBLAS_STATUS_SUCCESS; - hipblasStatus_t status_3 = HIPBLAS_STATUS_SUCCESS; - hipblasStatus_t status_4 = HIPBLAS_STATUS_SUCCESS; - // argument sanity check, quick return if input parameters are invalid before allocating invalid // memory if(N < 0 || incx < 0 || incy < 0 || batch_count < 0) @@ -62,8 +57,7 @@ hipblasStatus_t testing_dot_strided_batched(const Arguments& argus) double gpu_time_used, hipblas_error_host, hipblas_error_device; - hipblasHandle_t handle; - hipblasCreate(&handle); + hipblasLocalHandle handle(argus); // Initial Data on CPU srand(1); @@ -78,27 +72,13 @@ hipblasStatus_t testing_dot_strided_batched(const Arguments& argus) HIPBLAS =================================================================== */ // hipblasDot accept both dev/host pointer for the scalar - status_1 = hipblasSetPointerMode(handle, HIPBLAS_POINTER_MODE_DEVICE); - status_2 = (hipblasDotStridedBatchedFn)( - handle, N, dx, incx, stridex, dy, incy, stridey, batch_count, d_hipblas_result); - - status_3 = hipblasSetPointerMode(handle, HIPBLAS_POINTER_MODE_HOST); - status_4 = (hipblasDotStridedBatchedFn)( - handle, N, dx, incx, stridex, dy, incy, stridey, batch_count, h_hipblas_result1); + CHECK_HIPBLAS_ERROR(hipblasSetPointerMode(handle, HIPBLAS_POINTER_MODE_DEVICE)); + CHECK_HIPBLAS_ERROR((hipblasDotStridedBatchedFn)( + handle, N, dx, incx, stridex, dy, incy, stridey, batch_count, d_hipblas_result)); - if((status_1 != HIPBLAS_STATUS_SUCCESS) || (status_2 != HIPBLAS_STATUS_SUCCESS) - || (status_3 != HIPBLAS_STATUS_SUCCESS) || (status_4 != HIPBLAS_STATUS_SUCCESS)) - { - hipblasDestroy(handle); - if(status_1 != HIPBLAS_STATUS_SUCCESS) - return status_1; - if(status_2 != HIPBLAS_STATUS_SUCCESS) - return status_2; - if(status_3 != HIPBLAS_STATUS_SUCCESS) - return status_3; - if(status_4 != HIPBLAS_STATUS_SUCCESS) - return status_4; - } + CHECK_HIPBLAS_ERROR(hipblasSetPointerMode(handle, HIPBLAS_POINTER_MODE_HOST)); + CHECK_HIPBLAS_ERROR((hipblasDotStridedBatchedFn)( + handle, N, dx, incx, stridex, dy, incy, stridey, batch_count, h_hipblas_result1)); CHECK_HIP_ERROR(hipMemcpy( h_hipblas_result2, d_hipblas_result, sizeof(T) * batch_count, hipMemcpyDeviceToHost)); @@ -136,16 +116,8 @@ hipblasStatus_t testing_dot_strided_batched(const Arguments& argus) if(argus.timing) { hipStream_t stream; - status_1 = hipblasGetStream(handle, &stream); - status_2 = hipblasSetPointerMode(handle, HIPBLAS_POINTER_MODE_DEVICE); - if(status_1 != HIPBLAS_STATUS_SUCCESS || status_2 != HIPBLAS_STATUS_SUCCESS) - { - hipblasDestroy(handle); - if(status_1 != HIPBLAS_STATUS_SUCCESS) - return status_1; - if(status_2 != HIPBLAS_STATUS_SUCCESS) - return status_2; - } + CHECK_HIPBLAS_ERROR(hipblasGetStream(handle, &stream)); + CHECK_HIPBLAS_ERROR(hipblasSetPointerMode(handle, HIPBLAS_POINTER_MODE_DEVICE)); int runs = argus.cold_iters + argus.iters; for(int iter = 0; iter < runs; iter++) @@ -153,14 +125,8 @@ hipblasStatus_t testing_dot_strided_batched(const Arguments& argus) if(iter == argus.cold_iters) gpu_time_used = get_time_us_sync(stream); - status_1 = (hipblasDotStridedBatchedFn)( - handle, N, dx, incx, stridex, dy, incy, stridey, batch_count, d_hipblas_result); - - if(status_1 != HIPBLAS_STATUS_SUCCESS) - { - hipblasDestroy(handle); - return status_1; - } + CHECK_HIPBLAS_ERROR((hipblasDotStridedBatchedFn)( + handle, N, dx, incx, stridex, 
dy, incy, stridey, batch_count, d_hipblas_result)); } gpu_time_used = get_time_us_sync(stream) - gpu_time_used; @@ -174,7 +140,6 @@ hipblasStatus_t testing_dot_strided_batched(const Arguments& argus) hipblas_error_device); } - hipblasDestroy(handle); return HIPBLAS_STATUS_SUCCESS; } diff --git a/clients/include/testing_scal.hpp b/clients/include/testing_scal.hpp index 01cfca1ca..b9fd02783 100644 --- a/clients/include/testing_scal.hpp +++ b/clients/include/testing_scal.hpp @@ -24,35 +24,25 @@ hipblasStatus_t testing_scal(const Arguments& argus) int unit_check = argus.unit_check; int timing = argus.timing; - hipblasStatus_t status = HIPBLAS_STATUS_SUCCESS; - // argument sanity check, quick return if input parameters are invalid before allocating invalid // memory - if(N < 0) - { - status = HIPBLAS_STATUS_INVALID_VALUE; - return status; - } - else if(incx < 0) + if(N < 0 || incx < 0) { - status = HIPBLAS_STATUS_INVALID_VALUE; - return status; + return HIPBLAS_STATUS_INVALID_VALUE; } int sizeX = N * incx; U alpha = argus.alpha; // Naming: dX is in GPU (device) memory. hK is in CPU (host) memory, plz follow this practice - vector hx(sizeX); - vector hz(sizeX); + host_vector hx(sizeX); + host_vector hz(sizeX); device_vector dx(sizeX); double gpu_time_used, cpu_time_used; double hipblas_error = 0.0; - hipblasHandle_t handle; - - hipblasCreate(&handle); + hipblasLocalHandle handle(argus); // Initial Data on CPU srand(1); @@ -65,21 +55,15 @@ hipblasStatus_t testing_scal(const Arguments& argus) CHECK_HIP_ERROR(hipMemcpy(dx, hx.data(), sizeof(T) * sizeX, hipMemcpyHostToDevice)); /* ===================================================================== - ROCBLAS + HIPBLAS =================================================================== */ - status = hipblasScalFn(handle, N, &alpha, dx, incx); - if(status != HIPBLAS_STATUS_SUCCESS) - { - hipblasDestroy(handle); - return status; - } + CHECK_HIPBLAS_ERROR(hipblasScalFn(handle, N, &alpha, dx, incx)); // copy output from device to CPU CHECK_HIP_ERROR(hipMemcpy(hx.data(), dx, sizeof(T) * sizeX, hipMemcpyDeviceToHost)); - if(argus.unit_check) + if(argus.unit_check || argus.norm_check) { - /* ===================================================================== CPU BLAS =================================================================== */ @@ -91,6 +75,10 @@ hipblasStatus_t testing_scal(const Arguments& argus) { unit_check_general(1, N, incx, hz.data(), hx.data()); } + if(argus.norm_check) + { + hipblas_error = norm_check_general('F', 1, N, incx, hz.data(), hx.data()); + } } // end of if unit check @@ -99,12 +87,7 @@ hipblasStatus_t testing_scal(const Arguments& argus) if(timing) { hipStream_t stream; - status = hipblasGetStream(handle, &stream); - if(status != HIPBLAS_STATUS_SUCCESS) - { - hipblasDestroy(handle); - return status; - } + CHECK_HIPBLAS_ERROR(hipblasGetStream(handle, &stream)); int runs = argus.cold_iters + argus.iters; for(int iter = 0; iter < runs; iter++) @@ -112,13 +95,7 @@ hipblasStatus_t testing_scal(const Arguments& argus) if(iter == argus.cold_iters) gpu_time_used = get_time_us_sync(stream); - status = hipblasScalFn(handle, N, &alpha, dx, incx); - - if(status != HIPBLAS_STATUS_SUCCESS) - { - hipblasDestroy(handle); - return status; - } + CHECK_HIPBLAS_ERROR(hipblasScalFn(handle, N, &alpha, dx, incx)); } gpu_time_used = get_time_us_sync(stream) - gpu_time_used; @@ -130,6 +107,5 @@ hipblasStatus_t testing_scal(const Arguments& argus) hipblas_error); } - hipblasDestroy(handle); - return status; + return HIPBLAS_STATUS_SUCCESS; } diff 
--git a/clients/include/testing_scal_batched.hpp b/clients/include/testing_scal_batched.hpp index 92dbaedcd..db7140073 100644 --- a/clients/include/testing_scal_batched.hpp +++ b/clients/include/testing_scal_batched.hpp @@ -24,10 +24,9 @@ hipblasStatus_t testing_scal_batched(const Arguments& argus) int incx = argus.incx; int batch_count = argus.batch_count; int unit_check = argus.unit_check; + int norm_check = argus.norm_check; int timing = argus.timing; - hipblasStatus_t status = HIPBLAS_STATUS_SUCCESS; - // argument sanity check, quick return if input parameters are invalid before allocating invalid // memory if(N < 0 || incx < 0 || batch_count < 0) @@ -44,8 +43,7 @@ hipblasStatus_t testing_scal_batched(const Arguments& argus) double gpu_time_used = 0.0, cpu_time_used = 0.0; double hipblas_error = 0.0; - hipblasHandle_t handle; - hipblasCreate(&handle); + hipblasLocalHandle handle(argus); // Naming: dX is in GPU (device) memory. hK is in CPU (host) memory, plz follow this practice host_batch_vector hx(N, incx, batch_count); @@ -65,17 +63,13 @@ hipblasStatus_t testing_scal_batched(const Arguments& argus) /* ===================================================================== HIPBLAS =================================================================== */ - status = hipblasScalBatchedFn(handle, N, &alpha, dx.ptr_on_device(), incx, batch_count); - if(status != HIPBLAS_STATUS_SUCCESS) - { - hipblasDestroy(handle); - return status; - } + CHECK_HIPBLAS_ERROR( + hipblasScalBatchedFn(handle, N, &alpha, dx.ptr_on_device(), incx, batch_count)); // copy output from device to CPU CHECK_HIP_ERROR(hx.transfer_from(dx)); - if(unit_check) + if(unit_check || norm_check) { /* ===================================================================== CPU BLAS @@ -91,18 +85,17 @@ hipblasStatus_t testing_scal_batched(const Arguments& argus) { unit_check_general(1, N, batch_count, incx, hz, hx); } + if(norm_check) + { + hipblas_error = norm_check_general('F', 1, N, incx, hz, hx, batch_count); + } } // end of if unit check if(timing) { hipStream_t stream; - status = hipblasGetStream(handle, &stream); - if(status != HIPBLAS_STATUS_SUCCESS) - { - hipblasDestroy(handle); - return status; - } + CHECK_HIPBLAS_ERROR(hipblasGetStream(handle, &stream)); int runs = argus.cold_iters + argus.iters; for(int iter = 0; iter < runs; iter++) @@ -110,13 +103,8 @@ hipblasStatus_t testing_scal_batched(const Arguments& argus) if(iter == argus.cold_iters) gpu_time_used = get_time_us_sync(stream); - status = hipblasScalBatchedFn(handle, N, &alpha, dx.ptr_on_device(), incx, batch_count); - - if(status != HIPBLAS_STATUS_SUCCESS) - { - hipblasDestroy(handle); - return status; - } + CHECK_HIPBLAS_ERROR( + hipblasScalBatchedFn(handle, N, &alpha, dx.ptr_on_device(), incx, batch_count)); } gpu_time_used = get_time_us_sync(stream) - gpu_time_used; @@ -127,6 +115,6 @@ hipblasStatus_t testing_scal_batched(const Arguments& argus) scal_gbyte_count(N), hipblas_error); } - hipblasDestroy(handle); - return status; + + return HIPBLAS_STATUS_SUCCESS; } diff --git a/clients/include/testing_scal_strided_batched.hpp b/clients/include/testing_scal_strided_batched.hpp index e1baf79dc..7649eb800 100644 --- a/clients/include/testing_scal_strided_batched.hpp +++ b/clients/include/testing_scal_strided_batched.hpp @@ -32,8 +32,6 @@ hipblasStatus_t testing_scal_strided_batched(const Arguments& argus) U alpha = argus.alpha; - hipblasStatus_t status = HIPBLAS_STATUS_SUCCESS; - // argument sanity check, quick return if input parameters are invalid before allocating 
invalid // memory if(N < 0 || incx < 0 || batch_count < 0) @@ -50,8 +48,7 @@ hipblasStatus_t testing_scal_strided_batched(const Arguments& argus) double gpu_time_used = 0.0, cpu_time_used = 0.0; double hipblas_error = 0.0; - hipblasHandle_t handle; - hipblasCreate(&handle); + hipblasLocalHandle handle(argus); // Initial Data on CPU srand(1); @@ -64,14 +61,10 @@ hipblasStatus_t testing_scal_strided_batched(const Arguments& argus) CHECK_HIP_ERROR(hipMemcpy(dx, hx.data(), sizeof(T) * sizeX, hipMemcpyHostToDevice)); /* ===================================================================== - ROCBLAS + HIPBLAS =================================================================== */ - status = hipblasScalStridedBatchedFn(handle, N, &alpha, dx, incx, stridex, batch_count); - if(status != HIPBLAS_STATUS_SUCCESS) - { - hipblasDestroy(handle); - return status; - } + CHECK_HIPBLAS_ERROR( + hipblasScalStridedBatchedFn(handle, N, &alpha, dx, incx, stridex, batch_count)); // copy output from device to CPU CHECK_HIP_ERROR(hipMemcpy(hx.data(), dx, sizeof(T) * sizeX, hipMemcpyDeviceToHost)); @@ -99,12 +92,7 @@ hipblasStatus_t testing_scal_strided_batched(const Arguments& argus) if(timing) { hipStream_t stream; - status = hipblasGetStream(handle, &stream); - if(status != HIPBLAS_STATUS_SUCCESS) - { - hipblasDestroy(handle); - return status; - } + CHECK_HIPBLAS_ERROR(hipblasGetStream(handle, &stream)); int runs = argus.cold_iters + argus.iters; for(int iter = 0; iter < runs; iter++) @@ -112,13 +100,8 @@ hipblasStatus_t testing_scal_strided_batched(const Arguments& argus) if(iter == argus.cold_iters) gpu_time_used = get_time_us_sync(stream); - status = hipblasScalStridedBatchedFn(handle, N, &alpha, dx, incx, stridex, batch_count); - - if(status != HIPBLAS_STATUS_SUCCESS) - { - hipblasDestroy(handle); - return status; - } + CHECK_HIPBLAS_ERROR( + hipblasScalStridedBatchedFn(handle, N, &alpha, dx, incx, stridex, batch_count)); } gpu_time_used = get_time_us_sync(stream) - gpu_time_used; @@ -131,6 +114,5 @@ hipblasStatus_t testing_scal_strided_batched(const Arguments& argus) hipblas_error); } - hipblasDestroy(handle); - return status; + return HIPBLAS_STATUS_SUCCESS; } diff --git a/clients/include/testing_swap.hpp b/clients/include/testing_swap.hpp index 6d3534de8..3b6174fec 100644 --- a/clients/include/testing_swap.hpp +++ b/clients/include/testing_swap.hpp @@ -23,36 +23,24 @@ hipblasStatus_t testing_swap(const Arguments& argus) int incx = argus.incx; int incy = argus.incy; int unit_check = argus.unit_check; + int norm_check = argus.norm_check; int timing = argus.timing; - hipblasStatus_t status = HIPBLAS_STATUS_SUCCESS; - // argument sanity check, quick return if input parameters are invalid before allocating invalid // memory - if(N < 0) - { - status = HIPBLAS_STATUS_INVALID_VALUE; - return status; - } - else if(incx < 0) + if(N < 0 || incx < 0 || incy < 0) { - status = HIPBLAS_STATUS_INVALID_VALUE; - return status; - } - else if(incy < 0) - { - status = HIPBLAS_STATUS_INVALID_VALUE; - return status; + return HIPBLAS_STATUS_INVALID_VALUE; } int sizeX = N * incx; int sizeY = N * incy; // Naming: dX is in GPU (device) memory. 
hK is in CPU (host) memory, plz follow this practice - vector hx(sizeX); - vector hy(sizeY); - vector hx_cpu(sizeX); - vector hy_cpu(sizeY); + host_vector hx(sizeX); + host_vector hy(sizeY); + host_vector hx_cpu(sizeX); + host_vector hy_cpu(sizeY); // allocate memory on device device_vector dx(sizeX); @@ -62,8 +50,7 @@ hipblasStatus_t testing_swap(const Arguments& argus) double gpu_time_used = 0.0, cpu_time_used = 0.0; double hipblas_error = 0.0; - hipblasHandle_t handle; - hipblasCreate(&handle); + hipblasLocalHandle handle(argus); // Initial Data on CPU srand(1); @@ -77,31 +64,31 @@ hipblasStatus_t testing_swap(const Arguments& argus) CHECK_HIP_ERROR(hipMemcpy(dy, hy.data(), sizeof(T) * sizeY, hipMemcpyHostToDevice)); /* ===================================================================== - ROCBLAS + HIPBLAS =================================================================== */ - status = hipblasSwapFn(handle, N, dx, incx, dy, incy); - - if((status != HIPBLAS_STATUS_SUCCESS)) - { - hipblasDestroy(handle); - return status; - } + CHECK_HIPBLAS_ERROR(hipblasSwapFn(handle, N, dx, incx, dy, incy)); // copy output from device to CPU CHECK_HIP_ERROR(hipMemcpy(hx.data(), dx, sizeof(T) * sizeX, hipMemcpyDeviceToHost)); CHECK_HIP_ERROR(hipMemcpy(hy.data(), dy, sizeof(T) * sizeY, hipMemcpyDeviceToHost)); - if(unit_check) + if(unit_check || norm_check) { - /* ===================================================================== CPU BLAS =================================================================== */ cblas_swap(N, hx.data(), incx, hy.data(), incy); - if(argus.unit_check) + if(unit_check) { unit_check_general(1, N, incx, hx_cpu.data(), hx.data()); + unit_check_general(1, N, incy, hy_cpu.data(), hy.data()); + } + if(norm_check) + { + hipblas_error + = std::max(norm_check_general('F', 1, N, incx, hx_cpu.data(), hx.data()), + norm_check_general('F', 1, N, incy, hy_cpu.data(), hy.data())); } } // end of if unit/norm check @@ -109,12 +96,7 @@ hipblasStatus_t testing_swap(const Arguments& argus) if(timing) { hipStream_t stream; - status = hipblasGetStream(handle, &stream); - if(status != HIPBLAS_STATUS_SUCCESS) - { - hipblasDestroy(handle); - return status; - } + CHECK_HIPBLAS_ERROR(hipblasGetStream(handle, &stream)); int runs = argus.cold_iters + argus.iters; for(int iter = 0; iter < runs; iter++) @@ -122,13 +104,7 @@ hipblasStatus_t testing_swap(const Arguments& argus) if(iter == argus.cold_iters) gpu_time_used = get_time_us_sync(stream); - status = hipblasSwapFn(handle, N, dx, incx, dy, incy); - - if(status != HIPBLAS_STATUS_SUCCESS) - { - hipblasDestroy(handle); - return status; - } + CHECK_HIPBLAS_ERROR(hipblasSwapFn(handle, N, dx, incx, dy, incy)); } gpu_time_used = get_time_us_sync(stream) - gpu_time_used; @@ -140,6 +116,5 @@ hipblasStatus_t testing_swap(const Arguments& argus) hipblas_error); } - hipblasDestroy(handle); - return status; + return HIPBLAS_STATUS_SUCCESS; } diff --git a/clients/include/testing_swap_batched.hpp b/clients/include/testing_swap_batched.hpp index a1f883044..3735dc09e 100644 --- a/clients/include/testing_swap_batched.hpp +++ b/clients/include/testing_swap_batched.hpp @@ -20,14 +20,13 @@ hipblasStatus_t testing_swap_batched(const Arguments& argus) auto hipblasSwapBatchedFn = FORTRAN ? 
hipblasSwapBatched : hipblasSwapBatched; - int N = argus.N; - int incx = argus.incx; - int incy = argus.incy; - int batch_count = argus.batch_count; - int unit_check = argus.unit_check; - int norm_check = argus.norm_check; - int timing = argus.timing; - hipblasStatus_t status = HIPBLAS_STATUS_SUCCESS; + int N = argus.N; + int incx = argus.incx; + int incy = argus.incy; + int batch_count = argus.batch_count; + int unit_check = argus.unit_check; + int norm_check = argus.norm_check; + int timing = argus.timing; // argument sanity check, quick return if input parameters are invalid before allocating invalid // memory @@ -48,71 +47,36 @@ hipblasStatus_t testing_swap_batched(const Arguments& argus) double hipblas_error = 0.0; double gpu_time_used = 0.0; - hipblasHandle_t handle; - hipblasCreate(&handle); + hipblasLocalHandle handle(argus); // Naming: dX is in GPU (device) memory. hK is in CPU (host) memory, plz follow this practice - host_vector hx_array[batch_count]; - host_vector hy_array[batch_count]; - host_vector hx_cpu_array[batch_count]; - host_vector hy_cpu_array[batch_count]; + host_batch_vector hx(N, incx, batch_count); + host_batch_vector hy(N, incy, batch_count); + host_batch_vector hx_cpu(N, incx, batch_count); + host_batch_vector hy_cpu(N, incy, batch_count); - device_batch_vector bx_array(batch_count, sizeX); - device_batch_vector by_array(batch_count, sizeY); + device_batch_vector dx(N, incx, batch_count); + device_batch_vector dy(N, incy, batch_count); - device_vector dx_array(batch_count); - device_vector dy_array(batch_count); - - int last = batch_count - 1; - if(!dx_array || !dy_array || (!bx_array[last] && sizeX) || (!by_array[last] && sizeY)) - { - hipblasDestroy(handle); - return HIPBLAS_STATUS_ALLOC_FAILED; - } + CHECK_HIP_ERROR(dx.memcheck()); + CHECK_HIP_ERROR(dy.memcheck()); // Initial Data on CPU - srand(1); - for(int b = 0; b < batch_count; b++) - { - hx_array[b] = host_vector(sizeX); - hy_array[b] = host_vector(sizeY); - hx_cpu_array[b] = host_vector(sizeX); - hy_cpu_array[b] = host_vector(sizeY); - - srand(1); - hipblas_init(hx_array[b], 1, N, incx); - hipblas_init(hy_array[b], 1, N, incy); - - hx_cpu_array[b] = hx_array[b]; - hy_cpu_array[b] = hy_array[b]; - - CHECK_HIP_ERROR( - hipMemcpy(bx_array[b], hx_array[b], sizeof(T) * sizeX, hipMemcpyHostToDevice)); - CHECK_HIP_ERROR( - hipMemcpy(by_array[b], hy_array[b], sizeof(T) * sizeY, hipMemcpyHostToDevice)); - } - CHECK_HIP_ERROR(hipMemcpy(dx_array, bx_array, batch_count * sizeof(T*), hipMemcpyHostToDevice)); - CHECK_HIP_ERROR(hipMemcpy(dy_array, by_array, batch_count * sizeof(T*), hipMemcpyHostToDevice)); + hipblas_init(hx, true); + hipblas_init(hy, false); + hx_cpu.copy_from(hx); + hy_cpu.copy_from(hy); + CHECK_HIP_ERROR(dx.transfer_from(hx)); + CHECK_HIP_ERROR(dy.transfer_from(hy)); /* ===================================================================== - ROCBLAS + HIPBLAS =================================================================== */ - status = hipblasSwapBatchedFn(handle, N, dx_array, incx, dy_array, incy, batch_count); - - if((status != HIPBLAS_STATUS_SUCCESS)) - { - hipblasDestroy(handle); - return status; - } + CHECK_HIPBLAS_ERROR(hipblasSwapBatchedFn( + handle, N, dx.ptr_on_device(), incx, dy.ptr_on_device(), incy, batch_count)); - for(int b = 0; b < batch_count; b++) - { - // copy output from device to CPU - CHECK_HIP_ERROR( - hipMemcpy(hx_array[b], bx_array[b], sizeof(T) * sizeX, hipMemcpyDeviceToHost)); - CHECK_HIP_ERROR( - hipMemcpy(hy_array[b], by_array[b], sizeof(T) * sizeY, 
hipMemcpyDeviceToHost)); - } + CHECK_HIP_ERROR(hx.transfer_from(dx)); + CHECK_HIP_ERROR(hy.transfer_from(dy)); if(unit_check || norm_check) { @@ -121,12 +85,19 @@ hipblasStatus_t testing_swap_batched(const Arguments& argus) =================================================================== */ for(int b = 0; b < batch_count; b++) { - cblas_swap(N, hx_cpu_array[b], incx, hy_cpu_array[b], incy); + cblas_swap(N, hx_cpu[b], incx, hy_cpu[b], incy); } if(unit_check) { - unit_check_general(1, N, batch_count, incy, hy_cpu_array, hy_array); + unit_check_general(1, N, batch_count, incy, hy_cpu, hy); + unit_check_general(1, N, batch_count, incx, hx_cpu, hx); + } + if(norm_check) + { + hipblas_error + = std::max(norm_check_general('F', 1, N, incx, hx_cpu, hx, batch_count), + norm_check_general('F', 1, N, incy, hy_cpu, hy, batch_count)); } } // end of if unit/norm check @@ -134,12 +105,7 @@ hipblasStatus_t testing_swap_batched(const Arguments& argus) if(timing) { hipStream_t stream; - status = hipblasGetStream(handle, &stream); - if(status != HIPBLAS_STATUS_SUCCESS) - { - hipblasDestroy(handle); - return status; - } + CHECK_HIPBLAS_ERROR(hipblasGetStream(handle, &stream)); int runs = argus.cold_iters + argus.iters; for(int iter = 0; iter < runs; iter++) @@ -147,13 +113,8 @@ hipblasStatus_t testing_swap_batched(const Arguments& argus) if(iter == argus.cold_iters) gpu_time_used = get_time_us_sync(stream); - status = hipblasSwapBatchedFn(handle, N, dx_array, incx, dy_array, incy, batch_count); - - if(status != HIPBLAS_STATUS_SUCCESS) - { - hipblasDestroy(handle); - return status; - } + CHECK_HIPBLAS_ERROR(hipblasSwapBatchedFn( + handle, N, dx.ptr_on_device(), incx, dy.ptr_on_device(), incy, batch_count)); } gpu_time_used = get_time_us_sync(stream) - gpu_time_used; @@ -164,6 +125,6 @@ hipblasStatus_t testing_swap_batched(const Arguments& argus) swap_gbyte_count(N), hipblas_error); } - hipblasDestroy(handle); - return status; + + return HIPBLAS_STATUS_SUCCESS; } diff --git a/clients/include/testing_swap_strided_batched.hpp b/clients/include/testing_swap_strided_batched.hpp index 1dd454587..feb96bee7 100644 --- a/clients/include/testing_swap_strided_batched.hpp +++ b/clients/include/testing_swap_strided_batched.hpp @@ -34,8 +34,6 @@ hipblasStatus_t testing_swap_strided_batched(const Arguments& argus) int sizeX = stridex * batch_count; int sizeY = stridey * batch_count; - hipblasStatus_t status = HIPBLAS_STATUS_SUCCESS; - // argument sanity check, quick return if input parameters are invalid before allocating invalid // memory if(N < 0 || incx < 0 || incy < 0 || batch_count < 0) @@ -57,8 +55,7 @@ hipblasStatus_t testing_swap_strided_batched(const Arguments& argus) double hipblas_error = 0.0; double gpu_time_used = 0.0; - hipblasHandle_t handle; - hipblasCreate(&handle); + hipblasLocalHandle handle(argus); // Initial Data on CPU srand(1); @@ -72,16 +69,10 @@ hipblasStatus_t testing_swap_strided_batched(const Arguments& argus) CHECK_HIP_ERROR(hipMemcpy(dy, hy.data(), sizeof(T) * sizeY, hipMemcpyHostToDevice)); /* ===================================================================== - ROCBLAS + HIPBLAS =================================================================== */ - status - = hipblasSwapStridedBatchedFn(handle, N, dx, incx, stridex, dy, incy, stridey, batch_count); - - if((status != HIPBLAS_STATUS_SUCCESS)) - { - hipblasDestroy(handle); - return status; - } + CHECK_HIPBLAS_ERROR( + hipblasSwapStridedBatchedFn(handle, N, dx, incx, stridex, dy, incy, stridey, batch_count)); // copy output from device to 
CPU CHECK_HIP_ERROR(hipMemcpy(hx.data(), dx, sizeof(T) * sizeX, hipMemcpyDeviceToHost)); @@ -107,12 +98,7 @@ hipblasStatus_t testing_swap_strided_batched(const Arguments& argus) if(timing) { hipStream_t stream; - status = hipblasGetStream(handle, &stream); - if(status != HIPBLAS_STATUS_SUCCESS) - { - hipblasDestroy(handle); - return status; - } + CHECK_HIPBLAS_ERROR(hipblasGetStream(handle, &stream)); int runs = argus.cold_iters + argus.iters; for(int iter = 0; iter < runs; iter++) @@ -120,14 +106,8 @@ hipblasStatus_t testing_swap_strided_batched(const Arguments& argus) if(iter == argus.cold_iters) gpu_time_used = get_time_us_sync(stream); - status = hipblasSwapStridedBatchedFn( - handle, N, dx, incx, stridex, dy, incy, stridey, batch_count); - - if(status != HIPBLAS_STATUS_SUCCESS) - { - hipblasDestroy(handle); - return status; - } + CHECK_HIPBLAS_ERROR(hipblasSwapStridedBatchedFn( + handle, N, dx, incx, stridex, dy, incy, stridey, batch_count)); } gpu_time_used = get_time_us_sync(stream) - gpu_time_used; @@ -140,6 +120,5 @@ hipblasStatus_t testing_swap_strided_batched(const Arguments& argus) hipblas_error); } - hipblasDestroy(handle); - return status; + return HIPBLAS_STATUS_SUCCESS; } From aa26ecfb6373848859cf80801807979fb986860a Mon Sep 17 00:00:00 2001 From: Daine McNiven <51674140+daineAMD@users.noreply.github.com> Date: Thu, 18 Mar 2021 15:37:10 -0600 Subject: [PATCH 4/8] Updating gbmv, gemv, ger, hbmv (#321) --- clients/benchmarks/client.cpp | 62 +++-- clients/gtest/gbmv_batched_gtest.cpp | 6 +- clients/gtest/gbmv_strided_batched_gtest.cpp | 6 +- clients/gtest/gemv_batched_gtest.cpp | 6 +- clients/gtest/gemv_strided_batched_gtest.cpp | 6 +- clients/include/bytes.hpp | 22 ++ clients/include/testing_gbmv.hpp | 119 ++++++--- clients/include/testing_gbmv_batched.hpp | 249 +++++++++--------- .../include/testing_gbmv_strided_batched.hpp | 180 +++++++++---- clients/include/testing_gemv.hpp | 90 +++---- clients/include/testing_gemv_batched.hpp | 167 ++++++------ .../include/testing_gemv_strided_batched.hpp | 85 ++++-- clients/include/testing_ger.hpp | 118 +++++---- clients/include/testing_ger_batched.hpp | 191 ++++++++------ .../include/testing_ger_strided_batched.hpp | 165 ++++++++---- clients/include/testing_hbmv.hpp | 98 ++++--- clients/include/testing_hbmv_batched.hpp | 219 ++++++++------- .../include/testing_hbmv_strided_batched.hpp | 177 +++++++++---- 18 files changed, 1171 insertions(+), 795 deletions(-) diff --git a/clients/benchmarks/client.cpp b/clients/benchmarks/client.cpp index 945254c67..49d12e2e7 100644 --- a/clients/benchmarks/client.cpp +++ b/clients/benchmarks/client.cpp @@ -318,6 +318,7 @@ struct perf_blas{} || std::is_same void operator()(const Arguments& arg) { static const func_map fmap = { + // L1 {"asum", testing_asum}, {"asum_batched", testing_asum_batched}, {"asum_strided_batched", testing_asum_strided_batched}, @@ -345,6 +346,17 @@ struct perf_blas{} || std::is_same {"scal", testing_scal}, {"scal_batched", testing_scal_batched}, {"scal_strided_batched", testing_scal_strided_batched}, + + // L2 + {"gbmv", testing_gbmv}, + {"gbmv_batched", testing_gbmv_batched}, + {"gbmv_strided_batched", testing_gbmv_strided_batched}, + {"gemv", testing_gemv}, + {"gemv_batched", testing_gemv_batched}, + {"gemv_strided_batched", testing_gemv_strided_batched}, + {"ger", testing_ger}, + {"ger_batched", testing_ger_batched}, + {"ger_strided_batched", testing_ger_strided_batched}, /*{"set_get_vector", testing_set_get_vector}, {"set_get_matrix", testing_set_get_matrix}, 
{"set_get_matrix_async", testing_set_get_matrix_async}, @@ -356,20 +368,6 @@ struct perf_blas{} || std::is_same {"rotmg_batched", testing_rotmg_batched}, {"rotmg_strided_batched", testing_rotmg_strided_batched}, // L2 - {"gbmv", testing_gbmv}, - {"gbmv_batched", testing_gbmv_batched}, - {"gbmv_strided_batched", testing_gbmv_strided_batched}, - {"geam", testing_geam}, - {"geam_batched", testing_geam_batched}, - {"geam_strided_batched", testing_geam_strided_batched}, - */ - {"gemv", testing_gemv}, - {"gemv_batched", testing_gemvBatched}, - {"gemv_strided_batched", testing_gemvStridedBatched}, - /* - {"ger", testing_ger}, - {"ger_batched", testing_ger_batched}, - {"ger_strided_batched", testing_ger_strided_batched}, {"spr", testing_spr}, {"spr_batched", testing_spr_batched}, {"spr_strided_batched", testing_spr_strided_batched}, @@ -401,6 +399,9 @@ struct perf_blas{} || std::is_same {"trmv_batched", testing_trmv_batched}, {"trmv_strided_batched", testing_trmv_strided_batched}, // L3 + {"geam", testing_geam}, + {"geam_batched", testing_geam_batched}, + {"geam_strided_batched", testing_geam_strided_batched}, {"dgmm", testing_dgmm}, {"dgmm_batched", testing_dgmm_batched}, {"dgmm_strided_batched", testing_dgmm_strided_batched}, @@ -493,6 +494,7 @@ struct perf_blas< void operator()(const Arguments& arg) { static const func_map map = { + // L1 {"asum", testing_asum}, {"asum_batched", testing_asum_batched}, {"asum_strided_batched", testing_asum_strided_batched}, @@ -523,23 +525,25 @@ struct perf_blas< {"scal", testing_scal}, {"scal_batched", testing_scal_batched}, {"scal_strided_batched", testing_scal_strided_batched}, + + // L2 + {"gemv", testing_gemv}, + {"gemv_batched", testing_gemv_batched}, + {"gemv_strided_batched", testing_gemv_strided_batched}, + {"gbmv", testing_gbmv}, + {"gbmv_batched", testing_gbmv_batched}, + {"gbmv_strided_batched", testing_gbmv_strided_batched}, + {"geru", testing_ger}, + {"geru_batched", testing_ger_batched}, + {"geru_strided_batched", testing_ger_strided_batched}, + {"gerc", testing_ger}, + {"gerc_batched", testing_ger_batched}, + {"gerc_strided_batched", testing_ger_strided_batched}, + {"hbmv", testing_hbmv}, + {"hbmv_batched", testing_hbmv_batched}, + {"hbmv_strided_batched", testing_hbmv_strided_batched}, /* // L2 - {"gbmv", testing_gbmv}, - {"gbmv_batched", testing_gbmv_batched}, - {"gbmv_strided_batched", testing_gbmv_strided_batched}, - {"gemv", testing_gemv}, - {"gemv_batched", testing_gemv_batched}, - {"gemv_strided_batched", testing_gemv_strided_batched}, - {"geru", testing_ger}, - {"geru_batched", testing_ger_batched}, - {"geru_strided_batched", testing_ger_strided_batched}, - {"gerc", testing_ger}, - {"gerc_batched", testing_ger_batched}, - {"gerc_strided_batched", testing_ger_strided_batched}, - {"hbmv", testing_hbmv}, - {"hbmv_batched", testing_hbmv_batched}, - {"hbmv_strided_batched", testing_hbmv_strided_batched}, {"hemv", testing_hemv}, {"hemv_batched", testing_hemv_batched}, {"hemv_strided_batched", testing_hemv_strided_batched}, diff --git a/clients/gtest/gbmv_batched_gtest.cpp b/clients/gtest/gbmv_batched_gtest.cpp index 2d6c8d793..37c5eeb4b 100644 --- a/clients/gtest/gbmv_batched_gtest.cpp +++ b/clients/gtest/gbmv_batched_gtest.cpp @@ -1,5 +1,5 @@ /* ************************************************************************ - * Copyright 2016-2020 Advanced Micro Devices, Inc. + * Copyright 2016-2021 Advanced Micro Devices, Inc. 
* * ************************************************************************ */ @@ -148,7 +148,7 @@ TEST_P(gbmv_gtest_batched, gbmv_gtest_float) { Arguments arg = setup_gbmv_arguments(GetParam()); - hipblasStatus_t status = testing_gbmvBatched(arg); + hipblasStatus_t status = testing_gbmv_batched(arg); // if not success, then the input argument is problematic, so detect the error message if(status != HIPBLAS_STATUS_SUCCESS) @@ -169,7 +169,7 @@ TEST_P(gbmv_gtest_batched, gbmv_gtest_float_complex) { Arguments arg = setup_gbmv_arguments(GetParam()); - hipblasStatus_t status = testing_gbmvBatched(arg); + hipblasStatus_t status = testing_gbmv_batched(arg); // if not success, then the input argument is problematic, so detect the error message if(status != HIPBLAS_STATUS_SUCCESS) diff --git a/clients/gtest/gbmv_strided_batched_gtest.cpp b/clients/gtest/gbmv_strided_batched_gtest.cpp index 9e0e8906a..b0b8fd447 100644 --- a/clients/gtest/gbmv_strided_batched_gtest.cpp +++ b/clients/gtest/gbmv_strided_batched_gtest.cpp @@ -1,5 +1,5 @@ /* ************************************************************************ - * Copyright 2016-2020 Advanced Micro Devices, Inc. + * Copyright 2016-2021 Advanced Micro Devices, Inc. * * ************************************************************************ */ @@ -159,7 +159,7 @@ TEST_P(gbmv_gtest_strided_batched, gbmv_gtest_float) { Arguments arg = setup_gbmv_arguments(GetParam()); - hipblasStatus_t status = testing_gbmvStridedBatched(arg); + hipblasStatus_t status = testing_gbmv_strided_batched(arg); // if not success, then the input argument is problematic, so detect the error message if(status != HIPBLAS_STATUS_SUCCESS) @@ -182,7 +182,7 @@ TEST_P(gbmv_gtest_strided_batched, gbmv_gtest_float_complex) { Arguments arg = setup_gbmv_arguments(GetParam()); - hipblasStatus_t status = testing_gbmvStridedBatched(arg); + hipblasStatus_t status = testing_gbmv_strided_batched(arg); // if not success, then the input argument is problematic, so detect the error message if(status != HIPBLAS_STATUS_SUCCESS) diff --git a/clients/gtest/gemv_batched_gtest.cpp b/clients/gtest/gemv_batched_gtest.cpp index fe643ca8b..debf6bc15 100644 --- a/clients/gtest/gemv_batched_gtest.cpp +++ b/clients/gtest/gemv_batched_gtest.cpp @@ -1,5 +1,5 @@ /* ************************************************************************ - * Copyright 2016-2020 Advanced Micro Devices, Inc. + * Copyright 2016-2021 Advanced Micro Devices, Inc. 
* * ************************************************************************ */ @@ -146,7 +146,7 @@ TEST_P(gemv_gtest_batched, gemv_gtest_float) { Arguments arg = setup_gemv_arguments(GetParam()); - hipblasStatus_t status = testing_gemvBatched(arg); + hipblasStatus_t status = testing_gemv_batched(arg); // if not success, then the input argument is problematic, so detect the error message if(status != HIPBLAS_STATUS_SUCCESS) @@ -178,7 +178,7 @@ TEST_P(gemv_gtest_batched, gemv_gtest_float_complex) { Arguments arg = setup_gemv_arguments(GetParam()); - hipblasStatus_t status = testing_gemvBatched(arg); + hipblasStatus_t status = testing_gemv_batched(arg); // if not success, then the input argument is problematic, so detect the error message if(status != HIPBLAS_STATUS_SUCCESS) diff --git a/clients/gtest/gemv_strided_batched_gtest.cpp b/clients/gtest/gemv_strided_batched_gtest.cpp index e6f25ce7d..b0b5d26d9 100644 --- a/clients/gtest/gemv_strided_batched_gtest.cpp +++ b/clients/gtest/gemv_strided_batched_gtest.cpp @@ -1,5 +1,5 @@ /* ************************************************************************ - * Copyright 2016-2020 Advanced Micro Devices, Inc. + * Copyright 2016-2021 Advanced Micro Devices, Inc. * * ************************************************************************ */ @@ -157,7 +157,7 @@ TEST_P(gemv_gtest_strided_batched, gemv_gtest_float) { Arguments arg = setup_gemv_arguments(GetParam()); - hipblasStatus_t status = testing_gemvStridedBatched(arg); + hipblasStatus_t status = testing_gemv_strided_batched(arg); // if not success, then the input argument is problematic, so detect the error message if(status != HIPBLAS_STATUS_SUCCESS) @@ -195,7 +195,7 @@ TEST_P(gemv_gtest_strided_batched, gemv_gtest_float_complex) { Arguments arg = setup_gemv_arguments(GetParam()); - hipblasStatus_t status = testing_gemvStridedBatched(arg); + hipblasStatus_t status = testing_gemv_strided_batched(arg); // if not success, then the input argument is problematic, so detect the error message if(status != HIPBLAS_STATUS_SUCCESS) diff --git a/clients/include/bytes.hpp b/clients/include/bytes.hpp index 1333ba4da..364ac3b28 100644 --- a/clients/include/bytes.hpp +++ b/clients/include/bytes.hpp @@ -107,6 +107,20 @@ constexpr double gemv_gbyte_count(hipblasOperation_t transA, int m, int n) return (sizeof(T) * (m * n + 2 * (transA == HIPBLAS_OP_N ? n : m))) / 1e9; } +/* \brief byte counts of GBMV */ +template +constexpr double gbmv_gbyte_count(hipblasOperation_t transA, int m, int n, int kl, int ku) +{ + size_t dim_x = transA == HIPBLAS_OP_N ? n : m; + + int k1 = dim_x < kl ? dim_x : kl; + int k2 = dim_x < ku ? dim_x : ku; + int d1 = ((k1 * dim_x) - (k1 * (k1 + 1) / 2)); + int d2 = ((k2 * dim_x) - (k2 * (k2 + 1) / 2)); + double num_els = double(d1 + d2 + dim_x); + return (sizeof(T) * (num_els)) / 1e9; +} + /* \brief byte counts of GER */ template constexpr double ger_gbyte_count(int m, int n) @@ -114,6 +128,14 @@ constexpr double ger_gbyte_count(int m, int n) return (sizeof(T) * (m * n + m + n)) / 1e9; } +/* \brief byte counts of HBMV */ +template +constexpr double hbmv_gbyte_count(int n, int k) +{ + int k1 = k < n ? 
k : n; + return (sizeof(T) * (n * k1 - ((k1 * (k1 + 1)) / 2.0) + 3 * n)) / 1e9; +} + /* \brief byte counts of HPR */ template constexpr double hpr_gbyte_count(int n) diff --git a/clients/include/testing_gbmv.hpp b/clients/include/testing_gbmv.hpp index a0d8363db..6b05e54d1 100644 --- a/clients/include/testing_gbmv.hpp +++ b/clients/include/testing_gbmv.hpp @@ -1,5 +1,5 @@ /* ************************************************************************ - * Copyright 2016-2020 Advanced Micro Devices, Inc. + * Copyright 2016-2021 Advanced Micro Devices, Inc. * * ************************************************************************ */ @@ -33,7 +33,7 @@ hipblasStatus_t testing_gbmv(const Arguments& argus) int Y_size; hipblasOperation_t transA = char2hipblas_operation(argus.transA_option); - // transA = HIPBLAS_OP_T; + if(transA == HIPBLAS_OP_N) { X_size = N; @@ -45,35 +45,33 @@ hipblasStatus_t testing_gbmv(const Arguments& argus) Y_size = N; } - hipblasStatus_t status = HIPBLAS_STATUS_SUCCESS; - // argument sanity check, quick return if input parameters are invalid before allocating invalid // memory if(M < 0 || N < 0 || KL < 0 || KU < 0 || lda < KL + KU + 1 || incx == 0 || incy == 0) { - status = HIPBLAS_STATUS_INVALID_VALUE; - return status; + return HIPBLAS_STATUS_INVALID_VALUE; } // Naming: dK is in GPU (device) memory. hK is in CPU (host) memory host_vector hA(A_size); host_vector hx(X_size * incx); host_vector hy(Y_size * incy); - host_vector hz(Y_size * incy); + host_vector hy_host(Y_size * incy); + host_vector hy_device(Y_size * incy); + host_vector hy_cpu(Y_size * incy); device_vector dA(A_size); device_vector dx(X_size * incx); device_vector dy(Y_size * incy); + device_vector d_alpha(1); + device_vector d_beta(1); - double gpu_time_used, cpu_time_used; - double hipblasGflops, cblas_gflops, hipblasBandwidth; - double rocblas_error; + double gpu_time_used, hipblas_error_host, hipblas_error_device; - T alpha = (T)argus.alpha; - T beta = (T)argus.beta; + T h_alpha = (T)argus.alpha; + T h_beta = (T)argus.beta; - hipblasHandle_t handle; - hipblasCreate(&handle); + hipblasLocalHandle handle(argus); // Initial Data on CPU srand(1); @@ -81,50 +79,97 @@ hipblasStatus_t testing_gbmv(const Arguments& argus) hipblas_init(hx, 1, X_size, incx); hipblas_init(hy, 1, Y_size, incy); - // copy vector is easy in STL; hz = hy: save a copy in hz which will be output of CPU BLAS - hz = hy; + // copy vector is easy in STL; hy_cpu = hy: save a copy in hy_cpu which will be output of CPU BLAS + hy_cpu = hy; // copy data from CPU to device - hipMemcpy(dA, hA.data(), sizeof(T) * lda * N, hipMemcpyHostToDevice); - hipMemcpy(dx, hx.data(), sizeof(T) * X_size * incx, hipMemcpyHostToDevice); - hipMemcpy(dy, hy.data(), sizeof(T) * Y_size * incy, hipMemcpyHostToDevice); + CHECK_HIP_ERROR(hipMemcpy(dA, hA.data(), sizeof(T) * lda * N, hipMemcpyHostToDevice)); + CHECK_HIP_ERROR(hipMemcpy(dx, hx.data(), sizeof(T) * X_size * incx, hipMemcpyHostToDevice)); + CHECK_HIP_ERROR(hipMemcpy(dy, hy.data(), sizeof(T) * Y_size * incy, hipMemcpyHostToDevice)); + CHECK_HIP_ERROR(hipMemcpy(d_alpha, &h_alpha, sizeof(T), hipMemcpyHostToDevice)); + CHECK_HIP_ERROR(hipMemcpy(d_beta, &h_beta, sizeof(T), hipMemcpyHostToDevice)); /* ===================================================================== - ROCBLAS + HIPBLAS =================================================================== */ - for(int iter = 0; iter < 1; iter++) - { + CHECK_HIPBLAS_ERROR(hipblasSetPointerMode(handle, HIPBLAS_POINTER_MODE_HOST)); + CHECK_HIPBLAS_ERROR(hipblasGbmvFn( 
+ handle, transA, M, N, KL, KU, (T*)&h_alpha, dA, lda, dx, incx, (T*)&h_beta, dy, incy)); - status = hipblasGbmvFn( - handle, transA, M, N, KL, KU, (T*)&alpha, dA, lda, dx, incx, (T*)&beta, dy, incy); + CHECK_HIP_ERROR( + hipMemcpy(hy_host.data(), dy, sizeof(T) * Y_size * incy, hipMemcpyDeviceToHost)); + CHECK_HIP_ERROR(hipMemcpy(dy, hy.data(), sizeof(T) * Y_size * incy, hipMemcpyHostToDevice)); - if(status != HIPBLAS_STATUS_SUCCESS) - { - hipblasDestroy(handle); - return status; - } - } + CHECK_HIPBLAS_ERROR(hipblasSetPointerMode(handle, HIPBLAS_POINTER_MODE_DEVICE)); + CHECK_HIPBLAS_ERROR( + hipblasGbmvFn(handle, transA, M, N, KL, KU, d_alpha, dA, lda, dx, incx, d_beta, dy, incy)); - // copy output from device to CPU - hipMemcpy(hy.data(), dy, sizeof(T) * Y_size * incy, hipMemcpyDeviceToHost); + CHECK_HIP_ERROR( + hipMemcpy(hy_device.data(), dy, sizeof(T) * Y_size * incy, hipMemcpyDeviceToHost)); - if(argus.unit_check) + if(argus.unit_check || argus.norm_check) { /* ===================================================================== CPU BLAS =================================================================== */ - cblas_gbmv( - transA, M, N, KL, KU, alpha, hA.data(), lda, hx.data(), incx, beta, hz.data(), incy); + cblas_gbmv(transA, + M, + N, + KL, + KU, + h_alpha, + hA.data(), + lda, + hx.data(), + incx, + h_beta, + hy_cpu.data(), + incy); // enable unit check, notice unit check is not invasive, but norm check is, // unit check and norm check can not be interchanged their order if(argus.unit_check) { - unit_check_general(1, Y_size, incy, hz, hy); + unit_check_general(1, Y_size, incy, hy_cpu, hy_host); + unit_check_general(1, Y_size, incy, hy_cpu, hy_device); + } + if(argus.norm_check) + { + hipblas_error_host + = norm_check_general('F', 1, Y_size, incy, hy_cpu.data(), hy_host.data()); + hipblas_error_device + = norm_check_general('F', 1, Y_size, incy, hy_cpu.data(), hy_device.data()); + } + } + + if(argus.timing) + { + CHECK_HIP_ERROR(hipMemcpy(dy, hy.data(), sizeof(T) * Y_size * incy, hipMemcpyHostToDevice)); + hipStream_t stream; + CHECK_HIPBLAS_ERROR(hipblasGetStream(handle, &stream)); + CHECK_HIPBLAS_ERROR(hipblasSetPointerMode(handle, HIPBLAS_POINTER_MODE_DEVICE)); + + int runs = argus.cold_iters + argus.iters; + for(int iter = 0; iter < runs; iter++) + { + if(iter == argus.cold_iters) + gpu_time_used = get_time_us_sync(stream); + + CHECK_HIPBLAS_ERROR(hipblasGbmvFn( + handle, transA, M, N, KL, KU, d_alpha, dA, lda, dx, incx, d_beta, dy, incy)); } + gpu_time_used = get_time_us_sync(stream) - gpu_time_used; + + ArgumentModel{}.log_args( + std::cout, + argus, + gpu_time_used, + gbmv_gflop_count(transA, M, N, KL, KU), + gbmv_gbyte_count(transA, M, N, KL, KU), + hipblas_error_host, + hipblas_error_device); } - hipblasDestroy(handle); return HIPBLAS_STATUS_SUCCESS; } diff --git a/clients/include/testing_gbmv_batched.hpp b/clients/include/testing_gbmv_batched.hpp index a9b599587..3d5eec6e2 100644 --- a/clients/include/testing_gbmv_batched.hpp +++ b/clients/include/testing_gbmv_batched.hpp @@ -1,5 +1,5 @@ /* ************************************************************************ - * Copyright 2016-2020 Advanced Micro Devices, Inc. + * Copyright 2016-2021 Advanced Micro Devices, Inc. 
* * ************************************************************************ */ @@ -15,7 +15,7 @@ using namespace std; /* ============================================================================================ */ template -hipblasStatus_t testing_gbmvBatched(const Arguments& argus) +hipblasStatus_t testing_gbmv_batched(const Arguments& argus) { bool FORTRAN = argus.fortran; auto hipblasGbmvBatchedFn @@ -32,25 +32,21 @@ hipblasStatus_t testing_gbmvBatched(const Arguments& argus) int A_size = lda * N; int X_size; int Y_size; - int X_els; - int Y_els; int batch_count = argus.batch_count; hipblasOperation_t transA = char2hipblas_operation(argus.transA_option); - // transA = HIPBLAS_OP_T; + if(transA == HIPBLAS_OP_N) { - X_els = N; - Y_els = M; + X_size = N; + Y_size = M; } else { - X_els = M; - Y_els = N; + X_size = M; + Y_size = N; } - X_size = X_els * incx; - Y_size = Y_els * incy; hipblasStatus_t status = HIPBLAS_STATUS_SUCCESS; @@ -66,113 +62,87 @@ hipblasStatus_t testing_gbmvBatched(const Arguments& argus) return HIPBLAS_STATUS_SUCCESS; } - hipblasHandle_t handle; - hipblasCreate(&handle); + hipblasLocalHandle handle(argus); - double gpu_time_used, cpu_time_used; - double hipblasGflops, cblas_gflops, hipblasBandwidth; - double rocblas_error; + double gpu_time_used, hipblas_error_host, hipblas_error_device; - T alpha = (T)argus.alpha; - T beta = (T)argus.beta; + T h_alpha = (T)argus.alpha; + T h_beta = (T)argus.beta; // arrays of pointers-to-host on host - host_vector hA_array[batch_count]; - host_vector hx_array[batch_count]; - host_vector hy_array[batch_count]; - host_vector hz_array[batch_count]; + host_batch_vector hA(A_size, 1, batch_count); + host_batch_vector hx(X_size, incx, batch_count); + host_batch_vector hy(Y_size, incy, batch_count); + host_batch_vector hy_host(Y_size, incy, batch_count); + host_batch_vector hy_device(Y_size, incy, batch_count); + host_batch_vector hy_cpu(Y_size, incy, batch_count); // arrays of pointers-to-device on host - device_batch_vector bA_array(batch_count, A_size); - device_batch_vector bx_array(batch_count, X_size); - device_batch_vector by_array(batch_count, Y_size); - - // arrays of pointers-to-device on device - device_vector dA_array(batch_count); - device_vector dx_array(batch_count); - device_vector dy_array(batch_count); - - int last = batch_count - 1; - if(!dA_array || !dx_array || !dy_array || (!bA_array[last] && A_size) - || (!bx_array[last] && X_size) || (!by_array[last] && Y_size)) - { - hipblasDestroy(handle); - return HIPBLAS_STATUS_ALLOC_FAILED; - } + device_batch_vector dA(A_size, 1, batch_count); + device_batch_vector dx(X_size, incx, batch_count); + device_batch_vector dy(Y_size, incy, batch_count); + device_vector d_alpha(1); + device_vector d_beta(1); + + CHECK_HIP_ERROR(dA.memcheck()); + CHECK_HIP_ERROR(dx.memcheck()); + CHECK_HIP_ERROR(dy.memcheck()); // Initial Data on CPU - hipError_t err_A, err_x, err_y; - srand(1); - for(int b = 0; b < batch_count; b++) - { - hA_array[b] = host_vector(A_size); - hx_array[b] = host_vector(X_size); - hy_array[b] = host_vector(Y_size); - hz_array[b] = host_vector(Y_size); - - // initialize matrices on host - srand(1); - hipblas_init(hA_array[b], M, N, lda); - hipblas_init(hx_array[b], 1, X_els, incx); - hipblas_init(hy_array[b], 1, Y_els, incy); - - hz_array[b] = hy_array[b]; - err_A = hipMemcpy(bA_array[b], hA_array[b], sizeof(T) * A_size, hipMemcpyHostToDevice); - err_x = hipMemcpy(bx_array[b], hx_array[b], sizeof(T) * X_size, hipMemcpyHostToDevice); - err_y = hipMemcpy(by_array[b], 
hy_array[b], sizeof(T) * Y_size, hipMemcpyHostToDevice); - - if(err_A != hipSuccess || err_x != hipSuccess || err_y != hipSuccess) - { - hipblasDestroy(handle); - return HIPBLAS_STATUS_MAPPING_ERROR; - } - } + hipblas_init(hA, true); + hipblas_init(hx); + hipblas_init(hy); + hy_cpu.copy_from(hy); - err_A = hipMemcpy(dA_array, bA_array, batch_count * sizeof(T*), hipMemcpyHostToDevice); - err_x = hipMemcpy(dx_array, bx_array, batch_count * sizeof(T*), hipMemcpyHostToDevice); - err_y = hipMemcpy(dy_array, by_array, batch_count * sizeof(T*), hipMemcpyHostToDevice); - if(err_A != hipSuccess || err_x != hipSuccess || err_y != hipSuccess) - { - hipblasDestroy(handle); - return HIPBLAS_STATUS_MAPPING_ERROR; - } + CHECK_HIP_ERROR(dA.transfer_from(hA)); + CHECK_HIP_ERROR(dx.transfer_from(hx)); + CHECK_HIP_ERROR(dy.transfer_from(hy)); + CHECK_HIP_ERROR(hipMemcpy(d_alpha, &h_alpha, sizeof(T), hipMemcpyHostToDevice)); + CHECK_HIP_ERROR(hipMemcpy(d_beta, &h_beta, sizeof(T), hipMemcpyHostToDevice)); /* ===================================================================== - ROCBLAS + HIPBLAS =================================================================== */ - for(int iter = 0; iter < 1; iter++) - { - status = hipblasGbmvBatchedFn(handle, - transA, - M, - N, - KL, - KU, - (T*)&alpha, - dA_array, - lda, - dx_array, - incx, - (T*)&beta, - dy_array, - incy, - batch_count); - - if(status != HIPBLAS_STATUS_SUCCESS) - { - // here in cuda - hipblasDestroy(handle); - return status; - } - } - - // copy output from device to CPU - for(int b = 0; b < batch_count; b++) - { - hipMemcpy(hy_array[b], by_array[b], sizeof(T) * Y_size, hipMemcpyDeviceToHost); - } - - if(argus.unit_check) + CHECK_HIPBLAS_ERROR(hipblasSetPointerMode(handle, HIPBLAS_POINTER_MODE_HOST)); + CHECK_HIPBLAS_ERROR(hipblasGbmvBatchedFn(handle, + transA, + M, + N, + KL, + KU, + (T*)&h_alpha, + dA.ptr_on_device(), + lda, + dx.ptr_on_device(), + incx, + (T*)&h_beta, + dy.ptr_on_device(), + incy, + batch_count)); + + CHECK_HIP_ERROR(hy_host.transfer_from(dy)); + CHECK_HIP_ERROR(dy.transfer_from(hy)); + + CHECK_HIPBLAS_ERROR(hipblasSetPointerMode(handle, HIPBLAS_POINTER_MODE_DEVICE)); + CHECK_HIPBLAS_ERROR(hipblasGbmvBatchedFn(handle, + transA, + M, + N, + KL, + KU, + d_alpha, + dA.ptr_on_device(), + lda, + dx.ptr_on_device(), + incx, + d_beta, + dy.ptr_on_device(), + incy, + batch_count)); + + CHECK_HIP_ERROR(hy_device.transfer_from(dy)); + + if(argus.unit_check || argus.norm_check) { /* ===================================================================== CPU BLAS @@ -180,29 +150,66 @@ hipblasStatus_t testing_gbmvBatched(const Arguments& argus) for(int b = 0; b < batch_count; b++) { - cblas_gbmv(transA, - M, - N, - KL, - KU, - alpha, - hA_array[b], - lda, - hx_array[b], - incx, - beta, - hz_array[b], - incy); + cblas_gbmv( + transA, M, N, KL, KU, h_alpha, hA[b], lda, hx[b], incx, h_beta, hy_cpu[b], incy); } // enable unit check, notice unit check is not invasive, but norm check is, // unit check and norm check can not be interchanged their order if(argus.unit_check) { - unit_check_general(1, Y_size, batch_count, incy, hz_array, hy_array); + unit_check_general(1, Y_size, batch_count, incy, hy_cpu, hy_host); + unit_check_general(1, Y_size, batch_count, incy, hy_cpu, hy_device); + } + if(argus.norm_check) + { + hipblas_error_host + = norm_check_general('F', 1, Y_size, incy, hy_cpu, hy_host, batch_count); + hipblas_error_device + = norm_check_general('F', 1, Y_size, incy, hy_cpu, hy_device, batch_count); + } + } + + if(argus.timing) + { + 
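// Timing protocol used throughout these rewritten tests: the first argus.cold_iters
// launches only warm up the device and are excluded from the measurement; the timer
// starts when the first hot iteration begins, and get_time_us_sync() synchronizes on
// the handle's stream so the measured interval covers completed GPU work. The pointer
// mode is switched to HIPBLAS_POINTER_MODE_DEVICE beforehand, presumably so the timed
// loop reads alpha/beta from device memory rather than from the host on each call.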
CHECK_HIP_ERROR(dy.transfer_from(hy)); + hipStream_t stream; + CHECK_HIPBLAS_ERROR(hipblasGetStream(handle, &stream)); + CHECK_HIPBLAS_ERROR(hipblasSetPointerMode(handle, HIPBLAS_POINTER_MODE_DEVICE)); + + int runs = argus.cold_iters + argus.iters; + for(int iter = 0; iter < runs; iter++) + { + if(iter == argus.cold_iters) + gpu_time_used = get_time_us_sync(stream); + + CHECK_HIPBLAS_ERROR(hipblasGbmvBatchedFn(handle, + transA, + M, + N, + KL, + KU, + d_alpha, + dA.ptr_on_device(), + lda, + dx.ptr_on_device(), + incx, + d_beta, + dy.ptr_on_device(), + incy, + batch_count)); } + gpu_time_used = get_time_us_sync(stream) - gpu_time_used; + + ArgumentModel{}.log_args( + std::cout, + argus, + gpu_time_used, + gbmv_gflop_count(transA, M, N, KL, KU), + gbmv_gbyte_count(transA, M, N, KL, KU), + hipblas_error_host, + hipblas_error_device); } - hipblasDestroy(handle); return HIPBLAS_STATUS_SUCCESS; } diff --git a/clients/include/testing_gbmv_strided_batched.hpp b/clients/include/testing_gbmv_strided_batched.hpp index 32f646305..7124a4574 100644 --- a/clients/include/testing_gbmv_strided_batched.hpp +++ b/clients/include/testing_gbmv_strided_batched.hpp @@ -1,5 +1,5 @@ /* ************************************************************************ - * Copyright 2016-2020 Advanced Micro Devices, Inc. + * Copyright 2016-2021 Advanced Micro Devices, Inc. * * ************************************************************************ */ @@ -15,7 +15,7 @@ using namespace std; /* ============================================================================================ */ template -hipblasStatus_t testing_gbmvStridedBatched(const Arguments& argus) +hipblasStatus_t testing_gbmv_strided_batched(const Arguments& argus) { bool FORTRAN = argus.fortran; auto hipblasGbmvStridedBatchedFn @@ -43,7 +43,7 @@ hipblasStatus_t testing_gbmvStridedBatched(const Arguments& argus) int y_els; hipblasOperation_t transA = char2hipblas_operation(argus.transA_option); - // transA = HIPBLAS_OP_T; + if(transA == HIPBLAS_OP_N) { x_els = N; @@ -74,21 +74,22 @@ hipblasStatus_t testing_gbmvStridedBatched(const Arguments& argus) host_vector hA(A_size); host_vector hx(X_size); host_vector hy(Y_size); - host_vector hz(Y_size); + host_vector hy_host(Y_size); + host_vector hy_device(Y_size); + host_vector hy_cpu(Y_size); device_vector dA(A_size); device_vector dx(X_size); device_vector dy(Y_size); + device_vector d_alpha(1); + device_vector d_beta(1); - double gpu_time_used, cpu_time_used; - double hipblasGflops, cblas_gflops, hipblasBandwidth; - double rocblas_error; + double gpu_time_used, hipblas_error_host, hipblas_error_device; - T alpha = (T)argus.alpha; - T beta = (T)argus.beta; + T h_alpha = (T)argus.alpha; + T h_beta = (T)argus.beta; - hipblasHandle_t handle; - hipblasCreate(&handle); + hipblasLocalHandle handle(argus); // Initial Data on CPU srand(1); @@ -97,47 +98,62 @@ hipblasStatus_t testing_gbmvStridedBatched(const Arguments& argus) hipblas_init(hy, 1, y_els, incy, stride_y, batch_count); // copy vector is easy in STL; hz = hy: save a copy in hz which will be output of CPU BLAS - hz = hy; + hy_cpu = hy; // copy data from CPU to device - hipMemcpy(dA, hA.data(), sizeof(T) * A_size, hipMemcpyHostToDevice); - hipMemcpy(dx, hx.data(), sizeof(T) * X_size, hipMemcpyHostToDevice); - hipMemcpy(dy, hy.data(), sizeof(T) * Y_size, hipMemcpyHostToDevice); + CHECK_HIP_ERROR(hipMemcpy(dA, hA.data(), sizeof(T) * A_size, hipMemcpyHostToDevice)); + CHECK_HIP_ERROR(hipMemcpy(dx, hx.data(), sizeof(T) * X_size, hipMemcpyHostToDevice)); + 
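// alpha and beta are staged in device memory as well (d_alpha, d_beta), so the same
// test can run the routine under both HIPBLAS_POINTER_MODE_HOST and
// HIPBLAS_POINTER_MODE_DEVICE and compare each result (hy_host, hy_device) against
// the CPU reference kept in hy_cpu.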
CHECK_HIP_ERROR(hipMemcpy(dy, hy.data(), sizeof(T) * Y_size, hipMemcpyHostToDevice)); + CHECK_HIP_ERROR(hipMemcpy(d_alpha, &h_alpha, sizeof(T), hipMemcpyHostToDevice)); + CHECK_HIP_ERROR(hipMemcpy(d_beta, &h_beta, sizeof(T), hipMemcpyHostToDevice)); /* ===================================================================== - ROCBLAS + HIPBLAS =================================================================== */ - for(int iter = 0; iter < 1; iter++) - { - status = hipblasGbmvStridedBatchedFn(handle, - transA, - M, - N, - KL, - KU, - (T*)&alpha, - dA, - lda, - stride_A, - dx, - incx, - stride_x, - (T*)&beta, - dy, - incy, - stride_y, - batch_count); - - if(status != HIPBLAS_STATUS_SUCCESS) - { - // here in cuda - hipblasDestroy(handle); - return status; - } - } - - // copy output from device to CPU - hipMemcpy(hy.data(), dy, sizeof(T) * Y_size, hipMemcpyDeviceToHost); + CHECK_HIPBLAS_ERROR(hipblasSetPointerMode(handle, HIPBLAS_POINTER_MODE_HOST)); + CHECK_HIPBLAS_ERROR(hipblasGbmvStridedBatchedFn(handle, + transA, + M, + N, + KL, + KU, + (T*)&h_alpha, + dA, + lda, + stride_A, + dx, + incx, + stride_x, + (T*)&h_beta, + dy, + incy, + stride_y, + batch_count)); + + CHECK_HIP_ERROR(hipMemcpy(hy_host.data(), dy, sizeof(T) * Y_size, hipMemcpyDeviceToHost)); + CHECK_HIP_ERROR(hipMemcpy(dy, hy.data(), sizeof(T) * Y_size, hipMemcpyHostToDevice)); + + CHECK_HIPBLAS_ERROR(hipblasSetPointerMode(handle, HIPBLAS_POINTER_MODE_DEVICE)); + CHECK_HIPBLAS_ERROR(hipblasGbmvStridedBatchedFn(handle, + transA, + M, + N, + KL, + KU, + d_alpha, + dA, + lda, + stride_A, + dx, + incx, + stride_x, + d_beta, + dy, + incy, + stride_y, + batch_count)); + + CHECK_HIP_ERROR(hipMemcpy(hy_device.data(), dy, sizeof(T) * Y_size, hipMemcpyDeviceToHost)); if(argus.unit_check) { @@ -152,13 +168,13 @@ hipblasStatus_t testing_gbmvStridedBatched(const Arguments& argus) N, KL, KU, - alpha, + h_alpha, hA.data() + b * stride_A, lda, hx.data() + b * stride_x, incx, - beta, - hz.data() + b * stride_y, + h_beta, + hy_cpu.data() + b * stride_y, incy); } @@ -166,10 +182,70 @@ hipblasStatus_t testing_gbmvStridedBatched(const Arguments& argus) // unit check and norm check can not be interchanged their order if(argus.unit_check) { - unit_check_general(1, y_els, batch_count, incy, stride_y, hz, hy); + unit_check_general(1, y_els, batch_count, incy, stride_y, hy_cpu, hy_host); + unit_check_general(1, y_els, batch_count, incy, stride_y, hy_cpu, hy_device); + } + if(argus.norm_check) + { + hipblas_error_host = norm_check_general( + 'F', 1, y_els, incy, stride_y, hy_cpu, hy_host, batch_count); + hipblas_error_device = norm_check_general( + 'F', 1, y_els, incy, stride_y, hy_cpu, hy_device, batch_count); + } + } + + if(argus.timing) + { + hipMemcpy(dy, hy.data(), sizeof(T) * Y_size, hipMemcpyHostToDevice); + hipStream_t stream; + CHECK_HIPBLAS_ERROR(hipblasGetStream(handle, &stream)); + CHECK_HIPBLAS_ERROR(hipblasSetPointerMode(handle, HIPBLAS_POINTER_MODE_DEVICE)); + + int runs = argus.cold_iters + argus.iters; + for(int iter = 0; iter < runs; iter++) + { + if(iter == argus.cold_iters) + gpu_time_used = get_time_us_sync(stream); + + CHECK_HIPBLAS_ERROR(hipblasGbmvStridedBatchedFn(handle, + transA, + M, + N, + KL, + KU, + d_alpha, + dA, + lda, + stride_A, + dx, + incx, + stride_x, + d_beta, + dy, + incy, + stride_y, + batch_count)); } + gpu_time_used = get_time_us_sync(stream) - gpu_time_used; + + ArgumentModel{} + .log_args(std::cout, + argus, + gpu_time_used, + gbmv_gflop_count(transA, M, N, KL, KU), + gbmv_gbyte_count(transA, M, N, KL, KU), + 
hipblas_error_host, + hipblas_error_device); } - hipblasDestroy(handle); return HIPBLAS_STATUS_SUCCESS; } diff --git a/clients/include/testing_gemv.hpp b/clients/include/testing_gemv.hpp index 0983e289c..a97f4aaad 100644 --- a/clients/include/testing_gemv.hpp +++ b/clients/include/testing_gemv.hpp @@ -31,7 +31,7 @@ hipblasStatus_t testing_gemv(const Arguments& argus) int Y_size; hipblasOperation_t transA = char2hipblas_operation(argus.transA_option); - // transA = HIPBLAS_OP_T; + if(transA == HIPBLAS_OP_N) { X_size = N; @@ -43,52 +43,31 @@ hipblasStatus_t testing_gemv(const Arguments& argus) Y_size = N; } - hipblasStatus_t status = HIPBLAS_STATUS_SUCCESS; - // argument sanity check, quick return if input parameters are invalid before allocating invalid // memory - if(M < 0) - { - status = HIPBLAS_STATUS_INVALID_VALUE; - return status; - } - if(N < 0) - { - status = HIPBLAS_STATUS_INVALID_VALUE; - return status; - } - else if(lda < 0) - { - status = HIPBLAS_STATUS_INVALID_VALUE; - return status; - } - else if(incx <= 0) - { - status = HIPBLAS_STATUS_INVALID_VALUE; - return status; - } - else if(incy <= 0) + if(M < 0 || N < 0 || lda < 0 || incx <= 0 || incy <= 0) { - status = HIPBLAS_STATUS_INVALID_VALUE; - return status; + return HIPBLAS_STATUS_INVALID_VALUE; } // Naming: dK is in GPU (device) memory. hK is in CPU (host) memory host_vector hA(A_size); host_vector hx(X_size * incx); host_vector hy(Y_size * incy); - host_vector hz(Y_size * incy); + host_vector hy_cpu(Y_size * incy); + host_vector hy_host(Y_size * incy); + host_vector hy_device(Y_size * incy); device_vector dA(A_size); device_vector dx(X_size * incx); device_vector dy(Y_size * incy); + device_vector d_alpha(1); + device_vector d_beta(1); - double gpu_time_used; - double hipblasGflops, cblas_gflops, hipblasBandwidth; - double rocblas_error; + double gpu_time_used, hipblas_error_host, hipblas_error_device; - T alpha = (T)argus.alpha; - T beta = (T)argus.beta; + T h_alpha = (T)argus.alpha; + T h_beta = (T)argus.beta; hipblasLocalHandle handle(argus); @@ -99,40 +78,54 @@ hipblasStatus_t testing_gemv(const Arguments& argus) hipblas_init(hy, 1, Y_size, incy); // copy vector is easy in STL; hz = hy: save a copy in hz which will be output of CPU BLAS - hz = hy; + hy_cpu = hy; // copy data from CPU to device - hipMemcpy(dA, hA.data(), sizeof(T) * lda * N, hipMemcpyHostToDevice); - hipMemcpy(dx, hx.data(), sizeof(T) * X_size * incx, hipMemcpyHostToDevice); - hipMemcpy(dy, hy.data(), sizeof(T) * Y_size * incy, hipMemcpyHostToDevice); + CHECK_HIP_ERROR(hipMemcpy(dA, hA.data(), sizeof(T) * lda * N, hipMemcpyHostToDevice)); + CHECK_HIP_ERROR(hipMemcpy(dx, hx.data(), sizeof(T) * X_size * incx, hipMemcpyHostToDevice)); + CHECK_HIP_ERROR(hipMemcpy(dy, hy.data(), sizeof(T) * Y_size * incy, hipMemcpyHostToDevice)); + CHECK_HIP_ERROR(hipMemcpy(d_alpha, &h_alpha, sizeof(T), hipMemcpyHostToDevice)); + CHECK_HIP_ERROR(hipMemcpy(d_beta, &h_beta, sizeof(T), hipMemcpyHostToDevice)); /* ===================================================================== - ROCBLAS + HIPBLAS =================================================================== */ if(argus.unit_check || argus.norm_check) { + CHECK_HIPBLAS_ERROR(hipblasSetPointerMode(handle, HIPBLAS_POINTER_MODE_HOST)); CHECK_HIPBLAS_ERROR(hipblasGemvFn( - handle, transA, M, N, (T*)&alpha, dA, lda, dx, incx, (T*)&beta, dy, incy)); + handle, transA, M, N, (T*)&h_alpha, dA, lda, dx, incx, (T*)&h_beta, dy, incy)); + + CHECK_HIP_ERROR( + hipMemcpy(hy_host.data(), dy, sizeof(T) * Y_size * incy, 
hipMemcpyDeviceToHost)); + CHECK_HIP_ERROR(hipMemcpy(dy, hy.data(), sizeof(T) * Y_size * incy, hipMemcpyHostToDevice)); + + CHECK_HIPBLAS_ERROR(hipblasSetPointerMode(handle, HIPBLAS_POINTER_MODE_DEVICE)); + CHECK_HIPBLAS_ERROR( + hipblasGemvFn(handle, transA, M, N, d_alpha, dA, lda, dx, incx, d_beta, dy, incy)); + + CHECK_HIP_ERROR( + hipMemcpy(hy_device.data(), dy, sizeof(T) * Y_size * incy, hipMemcpyDeviceToHost)); /* ===================================================================== CPU BLAS =================================================================== */ - cblas_gemv(transA, M, N, alpha, hA.data(), lda, hx.data(), incx, beta, hz.data(), incy); - - // copy output from device to CPU - hipMemcpy(hy.data(), dy, sizeof(T) * Y_size * incy, hipMemcpyDeviceToHost); + cblas_gemv( + transA, M, N, h_alpha, hA.data(), lda, hx.data(), incx, h_beta, hy_cpu.data(), incy); // enable unit check, notice unit check is not invasive, but norm check is, // unit check and norm check can not be interchanged their order if(argus.unit_check) { - unit_check_general(1, Y_size, incy, hz, hy); + unit_check_general(1, Y_size, incy, hy_cpu, hy_host); + unit_check_general(1, Y_size, incy, hy_cpu, hy_device); } if(argus.norm_check) { - rocblas_error = norm_check_general('F', 1, Y_size, incy, hz, hy); + hipblas_error_host = norm_check_general('F', 1, Y_size, incy, hy_cpu, hy_host); + hipblas_error_device = norm_check_general('F', 1, Y_size, incy, hy_cpu, hy_device); } } @@ -140,6 +133,8 @@ hipblasStatus_t testing_gemv(const Arguments& argus) { hipStream_t stream; CHECK_HIPBLAS_ERROR(hipblasGetStream(handle, &stream)); + CHECK_HIPBLAS_ERROR(hipblasSetPointerMode(handle, HIPBLAS_POINTER_MODE_DEVICE)); + hipMemcpy(dy, hy.data(), sizeof(T) * Y_size * incy, hipMemcpyHostToDevice); int runs = argus.cold_iters + argus.iters; for(int iter = 0; iter < runs; iter++) @@ -149,8 +144,8 @@ hipblasStatus_t testing_gemv(const Arguments& argus) gpu_time_used = get_time_us_sync(stream); } - CHECK_HIPBLAS_ERROR(hipblasGemvFn( - handle, transA, M, N, (T*)&alpha, dA, lda, dx, incx, (T*)&beta, dy, incy)); + CHECK_HIPBLAS_ERROR( + hipblasGemvFn(handle, transA, M, N, d_alpha, dA, lda, dx, incx, d_beta, dy, incy)); } gpu_time_used = get_time_us_sync(stream) - gpu_time_used; @@ -160,7 +155,8 @@ hipblasStatus_t testing_gemv(const Arguments& argus) gpu_time_used, gemv_gflop_count(transA, M, N), gemv_gbyte_count(transA, M, N), - rocblas_error); + hipblas_error_host, + hipblas_error_device); } return HIPBLAS_STATUS_SUCCESS; diff --git a/clients/include/testing_gemv_batched.hpp b/clients/include/testing_gemv_batched.hpp index a436e659b..20bf2b617 100644 --- a/clients/include/testing_gemv_batched.hpp +++ b/clients/include/testing_gemv_batched.hpp @@ -15,7 +15,7 @@ using namespace std; /* ============================================================================================ */ template -hipblasStatus_t testing_gemvBatched(const Arguments& argus) +hipblasStatus_t testing_gemv_batched(const Arguments& argus) { bool FORTRAN = argus.fortran; auto hipblasGemvBatchedFn @@ -30,25 +30,21 @@ hipblasStatus_t testing_gemvBatched(const Arguments& argus) int A_size = lda * N; int X_size; int Y_size; - int X_els; - int Y_els; int batch_count = argus.batch_count; hipblasOperation_t transA = char2hipblas_operation(argus.transA_option); - // transA = HIPBLAS_OP_T; + if(transA == HIPBLAS_OP_N) { - X_els = N; - Y_els = M; + X_size = N; + Y_size = M; } else { - X_els = M; - Y_els = N; + X_size = M; + Y_size = N; } - X_size = X_els * incx; - Y_size = Y_els * 
incy; hipblasStatus_t status = HIPBLAS_STATUS_SUCCESS; @@ -65,121 +61,111 @@ hipblasStatus_t testing_gemvBatched(const Arguments& argus) hipblasLocalHandle handle(argus); - double gpu_time_used, cpu_time_used; - double hipblasGflops, cblas_gflops, hipblasBandwidth; - double rocblas_error; + double gpu_time_used, hipblas_error_host, hipblas_error_device; - T alpha = (T)argus.alpha; - T beta = (T)argus.beta; + T h_alpha = (T)argus.alpha; + T h_beta = (T)argus.beta; // arrays of pointers-to-host on host - host_vector hA_array[batch_count]; - host_vector hx_array[batch_count]; - host_vector hy_array[batch_count]; - host_vector hz_array[batch_count]; - - // arrays of pointers-to-device on host - device_batch_vector bA_array(batch_count, A_size); - device_batch_vector bx_array(batch_count, X_size); - device_batch_vector by_array(batch_count, Y_size); - - // arrays of pointers-to-device on device - device_vector dA_array(batch_count); - device_vector dx_array(batch_count); - device_vector dy_array(batch_count); - - int last = batch_count - 1; - if(!dA_array || !dx_array || !dy_array || (!bA_array[last] && A_size) - || (!bx_array[last] && X_size) || (!by_array[last] && Y_size)) - { - return HIPBLAS_STATUS_ALLOC_FAILED; - } + host_batch_vector hA(A_size, 1, batch_count); + host_batch_vector hx(X_size, incx, batch_count); + host_batch_vector hy(Y_size, incy, batch_count); + host_batch_vector hy_cpu(Y_size, incy, batch_count); + host_batch_vector hy_host(Y_size, incy, batch_count); + host_batch_vector hy_device(Y_size, incy, batch_count); + + // device pointers + device_batch_vector dA(A_size, 1, batch_count); + device_batch_vector dx(X_size, incx, batch_count); + device_batch_vector dy(Y_size, incy, batch_count); + device_vector d_alpha(1); + device_vector d_beta(1); + + CHECK_HIP_ERROR(dA.memcheck()); + CHECK_HIP_ERROR(dx.memcheck()); + CHECK_HIP_ERROR(dy.memcheck()); // Initial Data on CPU - hipError_t err_A, err_x, err_y; - srand(1); - for(int b = 0; b < batch_count; b++) - { - hA_array[b] = host_vector(A_size); - hx_array[b] = host_vector(X_size); - hy_array[b] = host_vector(Y_size); - hz_array[b] = host_vector(Y_size); - - // initialize matrices on host - srand(1); - hipblas_init(hA_array[b], M, N, lda); - hipblas_init(hx_array[b], 1, X_els, incx); - hipblas_init(hy_array[b], 1, Y_els, incy); - - hz_array[b] = hy_array[b]; - err_A = hipMemcpy(bA_array[b], hA_array[b], sizeof(T) * A_size, hipMemcpyHostToDevice); - err_x = hipMemcpy(bx_array[b], hx_array[b], sizeof(T) * X_size, hipMemcpyHostToDevice); - err_y = hipMemcpy(by_array[b], hy_array[b], sizeof(T) * Y_size, hipMemcpyHostToDevice); - - if(err_A != hipSuccess || err_x != hipSuccess || err_y != hipSuccess) - { - return HIPBLAS_STATUS_MAPPING_ERROR; - } - } + hipblas_init(hA, true); + hipblas_init(hx); + hipblas_init(hy); + hy_cpu.copy_from(hy); - err_A = hipMemcpy(dA_array, bA_array, batch_count * sizeof(T*), hipMemcpyHostToDevice); - err_x = hipMemcpy(dx_array, bx_array, batch_count * sizeof(T*), hipMemcpyHostToDevice); - err_y = hipMemcpy(dy_array, by_array, batch_count * sizeof(T*), hipMemcpyHostToDevice); - if(err_A != hipSuccess || err_x != hipSuccess || err_y != hipSuccess) - { - return HIPBLAS_STATUS_MAPPING_ERROR; - } + CHECK_HIP_ERROR(dA.transfer_from(hA)); + CHECK_HIP_ERROR(dx.transfer_from(hx)); + CHECK_HIP_ERROR(dy.transfer_from(hy)); + CHECK_HIP_ERROR(hipMemcpy(d_alpha, &h_alpha, sizeof(T), hipMemcpyHostToDevice)); + CHECK_HIP_ERROR(hipMemcpy(d_beta, &h_beta, sizeof(T), hipMemcpyHostToDevice)); /* 
===================================================================== HIPBLAS =================================================================== */ if(argus.unit_check || argus.norm_check) { + CHECK_HIPBLAS_ERROR(hipblasSetPointerMode(handle, HIPBLAS_POINTER_MODE_HOST)); + CHECK_HIPBLAS_ERROR(hipblasGemvBatchedFn(handle, + transA, + M, + N, + (T*)&h_alpha, + dA.ptr_on_device(), + lda, + dx.ptr_on_device(), + incx, + (T*)&h_beta, + dy.ptr_on_device(), + incy, + batch_count)); + + CHECK_HIP_ERROR(hy_host.transfer_from(dy)); + CHECK_HIP_ERROR(dy.transfer_from(hy)); + + CHECK_HIPBLAS_ERROR(hipblasSetPointerMode(handle, HIPBLAS_POINTER_MODE_DEVICE)); CHECK_HIPBLAS_ERROR(hipblasGemvBatchedFn(handle, transA, M, N, - (T*)&alpha, - dA_array, + d_alpha, + dA.ptr_on_device(), lda, - dx_array, + dx.ptr_on_device(), incx, - (T*)&beta, - dy_array, + d_beta, + dy.ptr_on_device(), incy, batch_count)); + CHECK_HIP_ERROR(hy_device.transfer_from(dy)); + /* ===================================================================== CPU BLAS =================================================================== */ for(int b = 0; b < batch_count; b++) { - cblas_gemv( - transA, M, N, alpha, hA_array[b], lda, hx_array[b], incx, beta, hz_array[b], incy); - } - - // copy output from device to CPU - for(int b = 0; b < batch_count; b++) - { - hipMemcpy(hy_array[b], by_array[b], sizeof(T) * Y_size, hipMemcpyDeviceToHost); + cblas_gemv(transA, M, N, h_alpha, hA[b], lda, hx[b], incx, h_beta, hy_cpu[b], incy); } // enable unit check, notice unit check is not invasive, but norm check is, // unit check and norm check can not be interchanged their order if(argus.unit_check) { - unit_check_general(1, Y_size, batch_count, incy, hz_array, hy_array); + unit_check_general(1, Y_size, batch_count, incy, hy_cpu, hy_host); + unit_check_general(1, Y_size, batch_count, incy, hy_cpu, hy_device); } if(argus.norm_check) { - rocblas_error - = norm_check_general('F', 1, Y_size, incy, hz_array, hy_array, batch_count); + hipblas_error_host + = norm_check_general('F', 1, Y_size, incy, hy_cpu, hy_host, batch_count); + hipblas_error_device + = norm_check_general('F', 1, Y_size, incy, hy_cpu, hy_device, batch_count); } } if(argus.timing) { + CHECK_HIPBLAS_ERROR(hipblasSetPointerMode(handle, HIPBLAS_POINTER_MODE_DEVICE)); + CHECK_HIP_ERROR(dy.transfer_from(hy)); hipStream_t stream; CHECK_HIPBLAS_ERROR(hipblasGetStream(handle, &stream)); @@ -194,13 +180,13 @@ hipblasStatus_t testing_gemvBatched(const Arguments& argus) transA, M, N, - (T*)&alpha, - dA_array, + d_alpha, + dA.ptr_on_device(), lda, - dx_array, + dx.ptr_on_device(), incx, - (T*)&beta, - dy_array, + d_beta, + dy.ptr_on_device(), incy, batch_count)); } @@ -220,7 +206,8 @@ hipblasStatus_t testing_gemvBatched(const Arguments& argus) gpu_time_used, gemv_gflop_count(transA, M, N), gemv_gbyte_count(transA, M, N), - rocblas_error); + hipblas_error_host, + hipblas_error_device); } return HIPBLAS_STATUS_SUCCESS; diff --git a/clients/include/testing_gemv_strided_batched.hpp b/clients/include/testing_gemv_strided_batched.hpp index f02774530..04e70886e 100644 --- a/clients/include/testing_gemv_strided_batched.hpp +++ b/clients/include/testing_gemv_strided_batched.hpp @@ -15,7 +15,7 @@ using namespace std; /* ============================================================================================ */ template -hipblasStatus_t testing_gemvStridedBatched(const Arguments& argus) +hipblasStatus_t testing_gemv_strided_batched(const Arguments& argus) { bool FORTRAN = argus.fortran; auto 
hipblasGemvStridedBatchedFn @@ -41,7 +41,7 @@ hipblasStatus_t testing_gemvStridedBatched(const Arguments& argus) int y_els; hipblasOperation_t transA = char2hipblas_operation(argus.transA_option); - // transA = HIPBLAS_OP_T; + if(transA == HIPBLAS_OP_N) { x_els = N; @@ -71,18 +71,20 @@ hipblasStatus_t testing_gemvStridedBatched(const Arguments& argus) host_vector hA(A_size); host_vector hx(X_size); host_vector hy(Y_size); - host_vector hz(Y_size); + host_vector hy_cpu(Y_size); + host_vector hy_host(Y_size); + host_vector hy_device(Y_size); device_vector dA(A_size); device_vector dx(X_size); device_vector dy(Y_size); + device_vector d_alpha(1); + device_vector d_beta(1); - double gpu_time_used, cpu_time_used; - double hipblasGflops, cblas_gflops, hipblasBandwidth; - double rocblas_error; + double gpu_time_used, hipblas_error_host, hipblas_error_device; - T alpha = (T)argus.alpha; - T beta = (T)argus.beta; + T h_alpha = (T)argus.alpha; + T h_beta = (T)argus.beta; hipblasLocalHandle handle(argus); @@ -92,13 +94,15 @@ hipblasStatus_t testing_gemvStridedBatched(const Arguments& argus) hipblas_init(hx, 1, x_els, incx, stride_x, batch_count); hipblas_init(hy, 1, y_els, incy, stride_y, batch_count); - // copy vector is easy in STL; hz = hy: save a copy in hz which will be output of CPU BLAS - hz = hy; + // copy vector is easy in STL; hy_cpu = hy: save a copy in hy_cpu which will be output of CPU BLAS + hy_cpu = hy; // copy data from CPU to device - hipMemcpy(dA, hA.data(), sizeof(T) * A_size, hipMemcpyHostToDevice); - hipMemcpy(dx, hx.data(), sizeof(T) * X_size, hipMemcpyHostToDevice); - hipMemcpy(dy, hy.data(), sizeof(T) * Y_size, hipMemcpyHostToDevice); + CHECK_HIP_ERROR(hipMemcpy(dA, hA.data(), sizeof(T) * A_size, hipMemcpyHostToDevice)); + CHECK_HIP_ERROR(hipMemcpy(dx, hx.data(), sizeof(T) * X_size, hipMemcpyHostToDevice)); + CHECK_HIP_ERROR(hipMemcpy(dy, hy.data(), sizeof(T) * Y_size, hipMemcpyHostToDevice)); + CHECK_HIP_ERROR(hipMemcpy(d_alpha, &h_alpha, sizeof(T), hipMemcpyHostToDevice)); + CHECK_HIP_ERROR(hipMemcpy(d_beta, &h_beta, sizeof(T), hipMemcpyHostToDevice)); if(argus.unit_check || argus.norm_check) { @@ -106,23 +110,47 @@ hipblasStatus_t testing_gemvStridedBatched(const Arguments& argus) HIPBLAS =================================================================== */ + CHECK_HIPBLAS_ERROR(hipblasSetPointerMode(handle, HIPBLAS_POINTER_MODE_HOST)); CHECK_HIPBLAS_ERROR(hipblasGemvStridedBatchedFn(handle, transA, M, N, - (T*)&alpha, + (T*)&h_alpha, dA, lda, stride_A, dx, incx, stride_x, - (T*)&beta, + (T*)&h_beta, dy, incy, stride_y, batch_count)); + CHECK_HIP_ERROR(hipMemcpy(hy_host.data(), dy, sizeof(T) * Y_size, hipMemcpyDeviceToHost)); + CHECK_HIP_ERROR(hipMemcpy(dy, hy.data(), sizeof(T) * Y_size, hipMemcpyDeviceToHost)); + + CHECK_HIPBLAS_ERROR(hipblasSetPointerMode(handle, HIPBLAS_POINTER_MODE_DEVICE)); + CHECK_HIPBLAS_ERROR(hipblasGemvStridedBatchedFn(handle, + transA, + M, + N, + d_alpha, + dA, + lda, + stride_A, + dx, + incx, + stride_x, + d_beta, + dy, + incy, + stride_y, + batch_count)); + + CHECK_HIP_ERROR(hipMemcpy(hy_device.data(), dy, sizeof(T) * Y_size, hipMemcpyDeviceToHost)); + /* ===================================================================== CPU BLAS =================================================================== */ @@ -131,34 +159,36 @@ hipblasStatus_t testing_gemvStridedBatched(const Arguments& argus) cblas_gemv(transA, M, N, - alpha, + h_alpha, hA.data() + b * stride_A, lda, hx.data() + b * stride_x, incx, - beta, - hz.data() + b * stride_y, + h_beta, + 
hy_cpu.data() + b * stride_y, incy); } - // copy output from device to CPU - hipMemcpy(hy.data(), dy, sizeof(T) * Y_size, hipMemcpyDeviceToHost); - // enable unit check, notice unit check is not invasive, but norm check is, // unit check and norm check can not be interchanged their order if(argus.unit_check) { - unit_check_general(1, y_els, batch_count, incy, stride_y, hz, hy); + unit_check_general(1, y_els, batch_count, incy, stride_y, hy_cpu, hy_host); + unit_check_general(1, y_els, batch_count, incy, stride_y, hy_cpu, hy_device); } if(argus.norm_check) { - rocblas_error - = norm_check_general('F', 1, y_els, incy, stride_y, hz, hy, batch_count); + hipblas_error_host = norm_check_general( + 'F', 1, y_els, incy, stride_y, hy_cpu, hy_host, batch_count); + hipblas_error_device = norm_check_general( + 'F', 1, y_els, incy, stride_y, hy_cpu, hy_device, batch_count); } } if(argus.timing) { + CHECK_HIPBLAS_ERROR(hipblasSetPointerMode(handle, HIPBLAS_POINTER_MODE_DEVICE)); + hipMemcpy(dy, hy.data(), sizeof(T) * Y_size, hipMemcpyHostToDevice); hipStream_t stream; CHECK_HIPBLAS_ERROR(hipblasGetStream(handle, &stream)); @@ -173,14 +203,14 @@ hipblasStatus_t testing_gemvStridedBatched(const Arguments& argus) transA, M, N, - (T*)&alpha, + d_alpha, dA, lda, stride_A, dx, incx, stride_x, - (T*)&beta, + d_beta, dy, incy, stride_y, @@ -206,7 +236,8 @@ hipblasStatus_t testing_gemvStridedBatched(const Arguments& argus) gpu_time_used, gemv_gflop_count(transA, M, N), gemv_gbyte_count(transA, M, N), - rocblas_error); + hipblas_error_host, + hipblas_error_device); } return HIPBLAS_STATUS_SUCCESS; diff --git a/clients/include/testing_ger.hpp b/clients/include/testing_ger.hpp index 8bcc19516..57d2c5fc1 100644 --- a/clients/include/testing_ger.hpp +++ b/clients/include/testing_ger.hpp @@ -1,5 +1,5 @@ /* ************************************************************************ - * Copyright 2016-2020 Advanced Micro Devices, Inc. + * Copyright 2016-2021 Advanced Micro Devices, Inc. * * ************************************************************************ */ @@ -29,54 +29,31 @@ hipblasStatus_t testing_ger(const Arguments& argus) int A_size = lda * N; - hipblasStatus_t status = HIPBLAS_STATUS_SUCCESS; - // argument sanity check, quick return if input parameters are invalid before allocating invalid // memory - if(M < 0) - { - status = HIPBLAS_STATUS_INVALID_VALUE; - return status; - } - if(N < 0) - { - status = HIPBLAS_STATUS_INVALID_VALUE; - return status; - } - else if(lda < 0) - { - status = HIPBLAS_STATUS_INVALID_VALUE; - return status; - } - else if(incx <= 0) + if(M < 0 || N < 0 || lda < 0 || incx <= 0 || incy <= 0) { - status = HIPBLAS_STATUS_INVALID_VALUE; - return status; - } - else if(incy <= 0) - { - status = HIPBLAS_STATUS_INVALID_VALUE; - return status; + return HIPBLAS_STATUS_INVALID_VALUE; } // Naming: dK is in GPU (device) memory. 
hK is in CPU (host) memory host_vector hA(A_size); - host_vector hB(A_size); + host_vector hA_host(A_size); + host_vector hA_device(A_size); + host_vector hA_cpu(A_size); host_vector hx(M * incx); host_vector hy(N * incy); device_vector dA(A_size); device_vector dx(M * incx); device_vector dy(N * incy); + device_vector d_alpha(1); - double gpu_time_used, cpu_time_used; - double hipblasGflops, cblas_gflops, hipblasBandwidth; - double rocblas_error; + double gpu_time_used, hipblas_error_host, hipblas_error_device; - T alpha = argus.get_alpha(); + T h_alpha = argus.get_alpha(); - hipblasHandle_t handle; - hipblasCreate(&handle); + hipblasLocalHandle handle(argus); // Initial Data on CPU srand(1); @@ -85,51 +62,76 @@ hipblasStatus_t testing_ger(const Arguments& argus) hipblas_init(hy, 1, N, incy); // copy matrix is easy in STL; hB = hA: save a copy in hB which will be output of CPU BLAS - hB = hA; + hA_cpu = hA; // copy data from CPU to device - hipMemcpy(dA, hA.data(), sizeof(T) * lda * N, hipMemcpyHostToDevice); - hipMemcpy(dx, hx.data(), sizeof(T) * M * incx, hipMemcpyHostToDevice); - hipMemcpy(dy, hy.data(), sizeof(T) * N * incy, hipMemcpyHostToDevice); + CHECK_HIP_ERROR(hipMemcpy(dA, hA.data(), sizeof(T) * lda * N, hipMemcpyHostToDevice)); + CHECK_HIP_ERROR(hipMemcpy(dx, hx.data(), sizeof(T) * M * incx, hipMemcpyHostToDevice)); + CHECK_HIP_ERROR(hipMemcpy(dy, hy.data(), sizeof(T) * N * incy, hipMemcpyHostToDevice)); + CHECK_HIP_ERROR(hipMemcpy(d_alpha, &h_alpha, sizeof(T), hipMemcpyHostToDevice)); /* ===================================================================== - ROCBLAS + HIPBLAS =================================================================== */ - if(argus.timing) - { - gpu_time_used = get_time_us(); // in microseconds - } + CHECK_HIPBLAS_ERROR(hipblasSetPointerMode(handle, HIPBLAS_POINTER_MODE_HOST)); + CHECK_HIPBLAS_ERROR(hipblasGerFn(handle, M, N, (T*)&h_alpha, dx, incx, dy, incy, dA, lda)); - for(int iter = 0; iter < 1; iter++) - { + CHECK_HIP_ERROR(hipMemcpy(hA_host.data(), dA, sizeof(T) * A_size, hipMemcpyDeviceToHost)); + CHECK_HIP_ERROR(hipMemcpy(dA, hA.data(), sizeof(T) * A_size, hipMemcpyHostToDevice)); - status = hipblasGerFn(handle, M, N, (T*)&alpha, dx, incx, dy, incy, dA, lda); + CHECK_HIPBLAS_ERROR(hipblasSetPointerMode(handle, HIPBLAS_POINTER_MODE_DEVICE)); + CHECK_HIPBLAS_ERROR(hipblasGerFn(handle, M, N, d_alpha, dx, incx, dy, incy, dA, lda)); - if(status != HIPBLAS_STATUS_SUCCESS) - { - hipblasDestroy(handle); - return status; - } - } + CHECK_HIP_ERROR(hipMemcpy(hA_device.data(), dA, sizeof(T) * A_size, hipMemcpyDeviceToHost)); - // copy output from device to CPU - hipMemcpy(hA.data(), dA, sizeof(T) * N * lda, hipMemcpyDeviceToHost); - - if(argus.unit_check) + if(argus.unit_check || argus.norm_check) { /* ===================================================================== CPU BLAS =================================================================== */ - cblas_ger(M, N, alpha, hx.data(), incx, hy.data(), incy, hB.data(), lda); + cblas_ger(M, N, h_alpha, hx.data(), incx, hy.data(), incy, hA_cpu.data(), lda); // enable unit check, notice unit check is not invasive, but norm check is, // unit check and norm check can not be interchanged their order if(argus.unit_check) { - unit_check_general(M, N, lda, hB.data(), hA.data()); + unit_check_general(M, N, lda, hA_cpu.data(), hA_host.data()); + unit_check_general(M, N, lda, hA_cpu.data(), hA_device.data()); + } + if(argus.norm_check) + { + hipblas_error_host + = norm_check_general('F', M, N, lda, hA_cpu.data(), 
hA_host.data()); + hipblas_error_device + = norm_check_general('F', M, N, lda, hA_cpu.data(), hA_device.data()); + } + } + + if(argus.timing) + { + CHECK_HIP_ERROR(hipMemcpy(dA, hA.data(), sizeof(T) * lda * N, hipMemcpyHostToDevice)); + hipStream_t stream; + CHECK_HIPBLAS_ERROR(hipblasGetStream(handle, &stream)); + CHECK_HIPBLAS_ERROR(hipblasSetPointerMode(handle, HIPBLAS_POINTER_MODE_DEVICE)); + + int runs = argus.cold_iters + argus.iters; + for(int iter = 0; iter < runs; iter++) + { + if(iter == argus.cold_iters) + gpu_time_used = get_time_us_sync(stream); + + CHECK_HIPBLAS_ERROR(hipblasGerFn(handle, M, N, d_alpha, dx, incx, dy, incy, dA, lda)); } + gpu_time_used = get_time_us_sync(stream) - gpu_time_used; + + ArgumentModel{}.log_args(std::cout, + argus, + gpu_time_used, + ger_gflop_count(M, N), + ger_gbyte_count(M, N), + hipblas_error_host, + hipblas_error_device); } - hipblasDestroy(handle); return HIPBLAS_STATUS_SUCCESS; } diff --git a/clients/include/testing_ger_batched.hpp b/clients/include/testing_ger_batched.hpp index 089a5ba6e..a0b472075 100644 --- a/clients/include/testing_ger_batched.hpp +++ b/clients/include/testing_ger_batched.hpp @@ -1,5 +1,5 @@ /* ************************************************************************ - * Copyright 2016-2020 Advanced Micro Devices, Inc. + * Copyright 2016-2021 Advanced Micro Devices, Inc. * * ************************************************************************ */ @@ -33,13 +33,9 @@ hipblasStatus_t testing_ger_batched(const Arguments& argus) int x_size = M * incx; int y_size = N * incy; - double gpu_time_used, cpu_time_used; - double hipblasGflops, cblas_gflops, hipblasBandwidth; - double rocblas_error; + double gpu_time_used, hipblas_error_host, hipblas_error_device; - T alpha = argus.get_alpha(); - - hipblasStatus_t status = HIPBLAS_STATUS_SUCCESS; + T h_alpha = argus.get_alpha(); // argument sanity check, quick return if input parameters are invalid before allocating invalid // memory @@ -52,97 +48,134 @@ hipblasStatus_t testing_ger_batched(const Arguments& argus) return HIPBLAS_STATUS_SUCCESS; } - hipblasHandle_t handle; - hipblasCreate(&handle); + hipblasLocalHandle handle(argus); // Naming: dK is in GPU (device) memory. 
hK is in CPU (host) memory - host_vector hA[batch_count]; - host_vector hB[batch_count]; - host_vector hx[batch_count]; - host_vector hy[batch_count]; - - device_batch_vector bA(batch_count, A_size); - device_batch_vector bx(batch_count, x_size); - device_batch_vector by(batch_count, y_size); - - device_vector dA(batch_count); - device_vector dx(batch_count); - device_vector dy(batch_count); - - int last = batch_count - 1; - if(!dA || !dx || !dy || (!bA[last] && A_size) || (!bx[last] && x_size) || (!by[last] && y_size)) - { - hipblasDestroy(handle); - return HIPBLAS_STATUS_ALLOC_FAILED; - } - - // Initial Data on CPU - srand(1); - for(int b = 0; b < batch_count; b++) - { - hA[b] = host_vector(A_size); - hB[b] = host_vector(A_size); - hx[b] = host_vector(x_size); - hy[b] = host_vector(y_size); - - srand(1); - hipblas_init(hA[b], M, N, lda); - hipblas_init(hx[b], 1, M, incx); - hipblas_init(hy[b], 1, N, incy); - hB[b] = hA[b]; - - CHECK_HIP_ERROR(hipMemcpy(bA[b], hA[b], sizeof(T) * A_size, hipMemcpyHostToDevice)); - CHECK_HIP_ERROR(hipMemcpy(bx[b], hx[b], sizeof(T) * x_size, hipMemcpyHostToDevice)); - CHECK_HIP_ERROR(hipMemcpy(by[b], hy[b], sizeof(T) * y_size, hipMemcpyHostToDevice)); - } - CHECK_HIP_ERROR(hipMemcpy(dA, bA, sizeof(T*) * batch_count, hipMemcpyHostToDevice)); - CHECK_HIP_ERROR(hipMemcpy(dx, bx, sizeof(T*) * batch_count, hipMemcpyHostToDevice)); - CHECK_HIP_ERROR(hipMemcpy(dy, by, sizeof(T*) * batch_count, hipMemcpyHostToDevice)); + host_batch_vector hA(A_size, 1, batch_count); + host_batch_vector hA_cpu(A_size, 1, batch_count); + host_batch_vector hA_host(A_size, 1, batch_count); + host_batch_vector hA_device(A_size, 1, batch_count); + host_batch_vector hx(M, incx, batch_count); + host_batch_vector hy(N, incy, batch_count); + + device_batch_vector dA(A_size, 1, batch_count); + device_batch_vector dx(M, incx, batch_count); + device_batch_vector dy(N, incy, batch_count); + device_vector d_alpha(1); + + CHECK_HIP_ERROR(dA.memcheck()); + CHECK_HIP_ERROR(dx.memcheck()); + CHECK_HIP_ERROR(dy.memcheck()); + + hipblas_init(hA, true); + hipblas_init(hx); + hipblas_init(hy); + + hA_cpu.copy_from(hA); + hA_host.copy_from(hA); + hA_device.copy_from(hA); + + CHECK_HIP_ERROR(dA.transfer_from(hA)); + CHECK_HIP_ERROR(dx.transfer_from(hx)); + CHECK_HIP_ERROR(dy.transfer_from(hy)); + CHECK_HIP_ERROR(hipMemcpy(d_alpha, &h_alpha, sizeof(T), hipMemcpyHostToDevice)); /* ===================================================================== - ROCBLAS + HIPBLAS =================================================================== */ - if(argus.timing) - { - gpu_time_used = get_time_us(); // in microseconds - } - - for(int iter = 0; iter < 1; iter++) - { - status = hipblasGerBatchedFn( - handle, M, N, (T*)&alpha, dx, incx, dy, incy, dA, lda, batch_count); - - if(status != HIPBLAS_STATUS_SUCCESS) - { - hipblasDestroy(handle); - return status; - } - } - - // copy output from device to CPU - for(int b = 0; b < batch_count; b++) - { - hipMemcpy(hA[b], bA[b], sizeof(T) * A_size, hipMemcpyDeviceToHost); - } - - if(argus.unit_check) + CHECK_HIPBLAS_ERROR(hipblasSetPointerMode(handle, HIPBLAS_POINTER_MODE_HOST)); + CHECK_HIPBLAS_ERROR(hipblasGerBatchedFn(handle, + M, + N, + (T*)&h_alpha, + dx.ptr_on_device(), + incx, + dy.ptr_on_device(), + incy, + dA.ptr_on_device(), + lda, + batch_count)); + + CHECK_HIP_ERROR(hA_host.transfer_from(dA)); + CHECK_HIP_ERROR(dA.transfer_from(hA)); + + CHECK_HIPBLAS_ERROR(hipblasSetPointerMode(handle, HIPBLAS_POINTER_MODE_DEVICE)); + CHECK_HIPBLAS_ERROR(hipblasGerBatchedFn(handle, 
+ M, + N, + d_alpha, + dx.ptr_on_device(), + incx, + dy.ptr_on_device(), + incy, + dA.ptr_on_device(), + lda, + batch_count)); + + CHECK_HIP_ERROR(hA_device.transfer_from(dA)); + + if(argus.unit_check || argus.norm_check) { /* ===================================================================== CPU BLAS =================================================================== */ for(int b = 0; b < batch_count; b++) { - cblas_ger(M, N, alpha, hx[b], incx, hy[b], incy, hB[b], lda); + cblas_ger(M, N, h_alpha, hx[b], incx, hy[b], incy, hA_cpu[b], lda); } // enable unit check, notice unit check is not invasive, but norm check is, // unit check and norm check can not be interchanged their order if(argus.unit_check) { - unit_check_general(M, N, batch_count, lda, hB, hA); + unit_check_general(M, N, batch_count, lda, hA_cpu, hA_host); + unit_check_general(M, N, batch_count, lda, hA_cpu, hA_device); + } + if(argus.norm_check) + { + hipblas_error_host + = norm_check_general('F', M, N, lda, hA_cpu, hA_host, batch_count); + hipblas_error_device + = norm_check_general('F', M, N, lda, hA_cpu, hA_device, batch_count); + } + } + + if(argus.timing) + { + CHECK_HIP_ERROR(dA.transfer_from(hA)); + hipStream_t stream; + CHECK_HIPBLAS_ERROR(hipblasGetStream(handle, &stream)); + CHECK_HIPBLAS_ERROR(hipblasSetPointerMode(handle, HIPBLAS_POINTER_MODE_DEVICE)); + + int runs = argus.cold_iters + argus.iters; + for(int iter = 0; iter < runs; iter++) + { + if(iter == argus.cold_iters) + gpu_time_used = get_time_us_sync(stream); + + CHECK_HIPBLAS_ERROR(hipblasGerBatchedFn(handle, + M, + N, + d_alpha, + dx.ptr_on_device(), + incx, + dy.ptr_on_device(), + incy, + dA.ptr_on_device(), + lda, + batch_count)); } + gpu_time_used = get_time_us_sync(stream) - gpu_time_used; + + ArgumentModel{}.log_args( + std::cout, + argus, + gpu_time_used, + ger_gflop_count(M, N), + ger_gbyte_count(M, N), + hipblas_error_host, + hipblas_error_device); } - hipblasDestroy(handle); return HIPBLAS_STATUS_SUCCESS; } diff --git a/clients/include/testing_ger_strided_batched.hpp b/clients/include/testing_ger_strided_batched.hpp index e539b0d27..026d916b9 100644 --- a/clients/include/testing_ger_strided_batched.hpp +++ b/clients/include/testing_ger_strided_batched.hpp @@ -1,5 +1,5 @@ /* ************************************************************************ - * Copyright 2016-2020 Advanced Micro Devices, Inc. + * Copyright 2016-2021 Advanced Micro Devices, Inc. * * ************************************************************************ */ @@ -38,8 +38,6 @@ hipblasStatus_t testing_ger_strided_batched(const Arguments& argus) int x_size = stride_x * batch_count; int y_size = stride_y * batch_count; - hipblasStatus_t status = HIPBLAS_STATUS_SUCCESS; - // argument sanity check, quick return if input parameters are invalid before allocating invalid // memory if(M < 0 || N < 0 || lda < 0 || incx <= 0 || incy <= 0 || batch_count < 0) @@ -53,22 +51,22 @@ hipblasStatus_t testing_ger_strided_batched(const Arguments& argus) // Naming: dK is in GPU (device) memory. 
hK is in CPU (host) memory host_vector hA(A_size); - host_vector hB(A_size); + host_vector hA_cpu(A_size); + host_vector hA_host(A_size); + host_vector hA_device(A_size); host_vector hx(x_size); host_vector hy(y_size); device_vector dA(A_size); device_vector dx(x_size); device_vector dy(y_size); + device_vector d_alpha(1); - double gpu_time_used, cpu_time_used; - double hipblasGflops, cblas_gflops, hipblasBandwidth; - double rocblas_error; + double gpu_time_used, hipblas_error_host, hipblas_error_device; - T alpha = (T)argus.alpha; + T h_alpha = (T)argus.alpha; - hipblasHandle_t handle; - hipblasCreate(&handle); + hipblasLocalHandle handle(argus); // Initial Data on CPU srand(1); @@ -77,50 +75,55 @@ hipblasStatus_t testing_ger_strided_batched(const Arguments& argus) hipblas_init(hy, 1, N, incy, stride_y, batch_count); // copy matrix is easy in STL; hB = hA: save a copy in hB which will be output of CPU BLAS - hB = hA; + hA_cpu = hA; // copy data from CPU to device - hipMemcpy(dA, hA.data(), sizeof(T) * A_size, hipMemcpyHostToDevice); - hipMemcpy(dx, hx.data(), sizeof(T) * x_size, hipMemcpyHostToDevice); - hipMemcpy(dy, hy.data(), sizeof(T) * y_size, hipMemcpyHostToDevice); + CHECK_HIP_ERROR(hipMemcpy(dA, hA.data(), sizeof(T) * A_size, hipMemcpyHostToDevice)); + CHECK_HIP_ERROR(hipMemcpy(dx, hx.data(), sizeof(T) * x_size, hipMemcpyHostToDevice)); + CHECK_HIP_ERROR(hipMemcpy(dy, hy.data(), sizeof(T) * y_size, hipMemcpyHostToDevice)); + CHECK_HIP_ERROR(hipMemcpy(d_alpha, &h_alpha, sizeof(T), hipMemcpyHostToDevice)); /* ===================================================================== - ROCBLAS + HIPBLAS =================================================================== */ - if(argus.timing) - { - gpu_time_used = get_time_us(); // in microseconds - } - - for(int iter = 0; iter < 1; iter++) - { - - status = hipblasGerStridedBatchedFn(handle, - M, - N, - (T*)&alpha, - dx, - incx, - stride_x, - dy, - incy, - stride_y, - dA, - lda, - stride_A, - batch_count); - - if(status != HIPBLAS_STATUS_SUCCESS) - { - hipblasDestroy(handle); - return status; - } - } - - // copy output from device to CPU - hipMemcpy(hA.data(), dA, sizeof(T) * A_size, hipMemcpyDeviceToHost); - - if(argus.unit_check) + CHECK_HIPBLAS_ERROR(hipblasSetPointerMode(handle, HIPBLAS_POINTER_MODE_HOST)); + CHECK_HIPBLAS_ERROR(hipblasGerStridedBatchedFn(handle, + M, + N, + (T*)&h_alpha, + dx, + incx, + stride_x, + dy, + incy, + stride_y, + dA, + lda, + stride_A, + batch_count)); + + CHECK_HIP_ERROR(hipMemcpy(hA_host.data(), dA, sizeof(T) * A_size, hipMemcpyDeviceToHost)); + CHECK_HIP_ERROR(hipMemcpy(dA, hA.data(), sizeof(T) * A_size, hipMemcpyHostToDevice)); + + CHECK_HIPBLAS_ERROR(hipblasSetPointerMode(handle, HIPBLAS_POINTER_MODE_DEVICE)); + CHECK_HIPBLAS_ERROR(hipblasGerStridedBatchedFn(handle, + M, + N, + d_alpha, + dx, + incx, + stride_x, + dy, + incy, + stride_y, + dA, + lda, + stride_A, + batch_count)); + + CHECK_HIP_ERROR(hipMemcpy(hA_device.data(), dA, sizeof(T) * A_size, hipMemcpyDeviceToHost)); + + if(argus.unit_check || argus.norm_check) { /* ===================================================================== CPU BLAS @@ -129,12 +132,12 @@ hipblasStatus_t testing_ger_strided_batched(const Arguments& argus) { cblas_ger(M, N, - alpha, + h_alpha, hx.data() + b * stride_x, incx, hy.data() + b * stride_y, incy, - hB.data() + b * stride_A, + hA_cpu.data() + b * stride_A, lda); } @@ -142,10 +145,66 @@ hipblasStatus_t testing_ger_strided_batched(const Arguments& argus) // unit check and norm check can not be interchanged their 
order if(argus.unit_check) { - unit_check_general(M, N, batch_count, lda, stride_A, hB.data(), hA.data()); + unit_check_general(M, N, batch_count, lda, stride_A, hA_cpu.data(), hA_host.data()); + unit_check_general( + M, N, batch_count, lda, stride_A, hA_cpu.data(), hA_device.data()); + } + if(argus.norm_check) + { + hipblas_error_host = norm_check_general( + 'F', M, N, lda, stride_A, hA_cpu.data(), hA_host.data(), batch_count); + hipblas_error_device = norm_check_general( + 'F', M, N, lda, stride_A, hA_cpu.data(), hA_device.data(), batch_count); + } + } + + if(argus.timing) + { + CHECK_HIP_ERROR(hipMemcpy(dA, hA.data(), sizeof(T) * lda * N, hipMemcpyHostToDevice)); + hipStream_t stream; + CHECK_HIPBLAS_ERROR(hipblasGetStream(handle, &stream)); + CHECK_HIPBLAS_ERROR(hipblasSetPointerMode(handle, HIPBLAS_POINTER_MODE_DEVICE)); + + int runs = argus.cold_iters + argus.iters; + for(int iter = 0; iter < runs; iter++) + { + if(iter == argus.cold_iters) + gpu_time_used = get_time_us_sync(stream); + + CHECK_HIPBLAS_ERROR(hipblasGerStridedBatchedFn(handle, + M, + N, + d_alpha, + dx, + incx, + stride_x, + dy, + incy, + stride_y, + dA, + lda, + stride_A, + batch_count)); } + gpu_time_used = get_time_us_sync(stream) - gpu_time_used; + + ArgumentModel{} + .log_args(std::cout, + argus, + gpu_time_used, + ger_gflop_count(M, N), + ger_gbyte_count(M, N), + hipblas_error_host, + hipblas_error_device); } - hipblasDestroy(handle); return HIPBLAS_STATUS_SUCCESS; } diff --git a/clients/include/testing_hbmv.hpp b/clients/include/testing_hbmv.hpp index df1d4fb79..144dd4b53 100644 --- a/clients/include/testing_hbmv.hpp +++ b/clients/include/testing_hbmv.hpp @@ -1,5 +1,5 @@ /* ************************************************************************ - * Copyright 2016-2020 Advanced Micro Devices, Inc. + * Copyright 2016-2021 Advanced Micro Devices, Inc. * * ************************************************************************ */ @@ -28,36 +28,35 @@ hipblasStatus_t testing_hbmv(const Arguments& argus) int A_size = lda * N; - hipblasFillMode_t uplo = char2hipblas_fill(argus.uplo_option); - hipblasStatus_t status = HIPBLAS_STATUS_SUCCESS; + hipblasFillMode_t uplo = char2hipblas_fill(argus.uplo_option); // argument sanity check, quick return if input parameters are invalid before allocating invalid // memory if(N < 0 || K < 0 || lda < K + 1 || incx == 0 || incy == 0) { - status = HIPBLAS_STATUS_INVALID_VALUE; - return status; + return HIPBLAS_STATUS_INVALID_VALUE; } // Naming: dK is in GPU (device) memory. 
hK is in CPU (host) memory host_vector hA(A_size); host_vector hx(N * incx); host_vector hy(N * incy); - host_vector hz(N * incy); + host_vector hy_cpu(N * incy); + host_vector hy_host(N * incy); + host_vector hy_device(N * incy); device_vector dA(A_size); device_vector dx(N * incx); device_vector dy(N * incy); + device_vector d_alpha(1); + device_vector d_beta(1); - double gpu_time_used, cpu_time_used; - double hipblasGflops, cblas_gflops, hipblasBandwidth; - double rocblas_error; + double gpu_time_used, hipblas_error_host, hipblas_error_device; - T alpha = argus.get_alpha(); - T beta = argus.get_beta(); + T h_alpha = argus.get_alpha(); + T h_beta = argus.get_beta(); - hipblasHandle_t handle; - hipblasCreate(&handle); + hipblasLocalHandle handle(argus); // Initial Data on CPU srand(1); @@ -66,48 +65,79 @@ hipblasStatus_t testing_hbmv(const Arguments& argus) hipblas_init(hy, 1, N, incy); // copy vector is easy in STL; hz = hy: save a copy in hz which will be output of CPU BLAS - hz = hy; + hy_cpu = hy; // copy data from CPU to device - hipMemcpy(dA, hA.data(), sizeof(T) * lda * N, hipMemcpyHostToDevice); - hipMemcpy(dx, hx.data(), sizeof(T) * N * incx, hipMemcpyHostToDevice); - hipMemcpy(dy, hy.data(), sizeof(T) * N * incy, hipMemcpyHostToDevice); + CHECK_HIP_ERROR(hipMemcpy(dA, hA.data(), sizeof(T) * lda * N, hipMemcpyHostToDevice)); + CHECK_HIP_ERROR(hipMemcpy(dx, hx.data(), sizeof(T) * N * incx, hipMemcpyHostToDevice)); + CHECK_HIP_ERROR(hipMemcpy(dy, hy.data(), sizeof(T) * N * incy, hipMemcpyHostToDevice)); + CHECK_HIP_ERROR(hipMemcpy(d_alpha, &h_alpha, sizeof(T), hipMemcpyHostToDevice)); + CHECK_HIP_ERROR(hipMemcpy(d_beta, &h_beta, sizeof(T), hipMemcpyHostToDevice)); /* ===================================================================== - ROCBLAS + HIPBLAS =================================================================== */ - for(int iter = 0; iter < 1; iter++) - { + CHECK_HIPBLAS_ERROR(hipblasSetPointerMode(handle, HIPBLAS_POINTER_MODE_HOST)); + CHECK_HIPBLAS_ERROR( + hipblasHbmvFn(handle, uplo, N, K, (T*)&h_alpha, dA, lda, dx, incx, (T*)&h_beta, dy, incy)); - status - = hipblasHbmvFn(handle, uplo, N, K, (T*)&alpha, dA, lda, dx, incx, (T*)&beta, dy, incy); + CHECK_HIP_ERROR(hipMemcpy(hy_host.data(), dy, sizeof(T) * N * incy, hipMemcpyDeviceToHost)); + CHECK_HIP_ERROR(hipMemcpy(dy, hy.data(), sizeof(T) * N * incy, hipMemcpyHostToDevice)); - if(status != HIPBLAS_STATUS_SUCCESS) - { - hipblasDestroy(handle); - return status; - } - } + CHECK_HIPBLAS_ERROR(hipblasSetPointerMode(handle, HIPBLAS_POINTER_MODE_DEVICE)); + CHECK_HIPBLAS_ERROR( + hipblasHbmvFn(handle, uplo, N, K, d_alpha, dA, lda, dx, incx, d_beta, dy, incy)); - // copy output from device to CPU - hipMemcpy(hy.data(), dy, sizeof(T) * N * incy, hipMemcpyDeviceToHost); + CHECK_HIP_ERROR(hipMemcpy(hy_device.data(), dy, sizeof(T) * N * incy, hipMemcpyDeviceToHost)); - if(argus.unit_check) + if(argus.unit_check || argus.norm_check) { /* ===================================================================== CPU BLAS =================================================================== */ - cblas_hbmv(uplo, N, K, alpha, hA.data(), lda, hx.data(), incx, beta, hz.data(), incy); + cblas_hbmv( + uplo, N, K, h_alpha, hA.data(), lda, hx.data(), incx, h_beta, hy_cpu.data(), incy); // enable unit check, notice unit check is not invasive, but norm check is, // unit check and norm check can not be interchanged their order if(argus.unit_check) { - unit_check_general(1, N, incy, hz, hy); + unit_check_general(1, N, incy, hy_cpu, hy_host); + 
unit_check_general(1, N, incy, hy_cpu, hy_device); + } + if(argus.norm_check) + { + hipblas_error_host = norm_check_general('F', 1, N, incy, hy_cpu, hy_host); + hipblas_error_device = norm_check_general('F', 1, N, incy, hy_cpu, hy_device); } } - hipblasDestroy(handle); + if(argus.timing) + { + CHECK_HIP_ERROR(hipMemcpy(dy, hy.data(), sizeof(T) * N * incy, hipMemcpyHostToDevice)); + hipStream_t stream; + CHECK_HIPBLAS_ERROR(hipblasGetStream(handle, &stream)); + CHECK_HIPBLAS_ERROR(hipblasSetPointerMode(handle, HIPBLAS_POINTER_MODE_DEVICE)); + + int runs = argus.cold_iters + argus.iters; + for(int iter = 0; iter < runs; iter++) + { + if(iter == argus.cold_iters) + gpu_time_used = get_time_us_sync(stream); + + CHECK_HIPBLAS_ERROR( + hipblasHbmvFn(handle, uplo, N, K, d_alpha, dA, lda, dx, incx, d_beta, dy, incy)); + } + gpu_time_used = get_time_us_sync(stream) - gpu_time_used; + + ArgumentModel{}.log_args(std::cout, + argus, + gpu_time_used, + hbmv_gflop_count(N, K), + hbmv_gbyte_count(N, K), + hipblas_error_host, + hipblas_error_device); + } return HIPBLAS_STATUS_SUCCESS; } diff --git a/clients/include/testing_hbmv_batched.hpp b/clients/include/testing_hbmv_batched.hpp index 57b5ca734..26ecd0d21 100644 --- a/clients/include/testing_hbmv_batched.hpp +++ b/clients/include/testing_hbmv_batched.hpp @@ -1,5 +1,5 @@ /* ************************************************************************ - * Copyright 2016-2020 Advanced Micro Devices, Inc. + * Copyright 2016-2021 Advanced Micro Devices, Inc. * * ************************************************************************ */ @@ -37,8 +37,6 @@ hipblasStatus_t testing_hbmv_batched(const Arguments& argus) X_size = N * incx; Y_size = N * incy; - hipblasStatus_t status = HIPBLAS_STATUS_SUCCESS; - // argument sanity check, quick return if input parameters are invalid before allocating invalid // memory if(N < 0 || K < 0 || lda < K + 1 || incx == 0 || incy == 0 || batch_count < 0) @@ -50,111 +48,83 @@ hipblasStatus_t testing_hbmv_batched(const Arguments& argus) return HIPBLAS_STATUS_SUCCESS; } - hipblasHandle_t handle; - hipblasCreate(&handle); + hipblasLocalHandle handle(argus); - double gpu_time_used, cpu_time_used; - double hipblasGflops, cblas_gflops, hipblasBandwidth; - double rocblas_error; + double gpu_time_used, hipblas_error_host, hipblas_error_device; - T alpha = argus.get_alpha(); - T beta = argus.get_beta(); + T h_alpha = argus.get_alpha(); + T h_beta = argus.get_beta(); // arrays of pointers-to-host on host - host_vector hA_array[batch_count]; - host_vector hx_array[batch_count]; - host_vector hy_array[batch_count]; - host_vector hz_array[batch_count]; - - // arrays of pointers-to-device on host - device_batch_vector bA_array(batch_count, A_size); - device_batch_vector bx_array(batch_count, X_size); - device_batch_vector by_array(batch_count, Y_size); - - // arrays of pointers-to-device on device - device_vector dA_array(batch_count); - device_vector dx_array(batch_count); - device_vector dy_array(batch_count); - - int last = batch_count - 1; - if(!dA_array || !dx_array || !dy_array || (!bA_array[last] && A_size) - || (!bx_array[last] && X_size) || (!by_array[last] && Y_size)) - { - hipblasDestroy(handle); - return HIPBLAS_STATUS_ALLOC_FAILED; - } + host_batch_vector hA(A_size, 1, batch_count); + host_batch_vector hx(N, incx, batch_count); + host_batch_vector hy(N, incy, batch_count); + host_batch_vector hy_cpu(N, incy, batch_count); + host_batch_vector hy_host(N, incy, batch_count); + host_batch_vector hy_device(N, incy, batch_count); + + // 
device arrays + device_batch_vector dA(A_size, 1, batch_count); + device_batch_vector dx(N, incx, batch_count); + device_batch_vector dy(N, incy, batch_count); + device_vector d_alpha(1); + device_vector d_beta(1); + + CHECK_HIP_ERROR(dA.memcheck()); + CHECK_HIP_ERROR(dx.memcheck()); + CHECK_HIP_ERROR(dy.memcheck()); // Initial Data on CPU - hipError_t err_A, err_x, err_y; - srand(1); - for(int b = 0; b < batch_count; b++) - { - hA_array[b] = host_vector(A_size); - hx_array[b] = host_vector(X_size); - hy_array[b] = host_vector(Y_size); - hz_array[b] = host_vector(Y_size); - - // initialize matrices on host - srand(1); - hipblas_init(hA_array[b], K, N, lda); - hipblas_init(hx_array[b], 1, N, incx); - hipblas_init(hy_array[b], 1, N, incy); - - hz_array[b] = hy_array[b]; - err_A = hipMemcpy(bA_array[b], hA_array[b], sizeof(T) * A_size, hipMemcpyHostToDevice); - err_x = hipMemcpy(bx_array[b], hx_array[b], sizeof(T) * X_size, hipMemcpyHostToDevice); - err_y = hipMemcpy(by_array[b], hy_array[b], sizeof(T) * Y_size, hipMemcpyHostToDevice); - - if(err_A != hipSuccess || err_x != hipSuccess || err_y != hipSuccess) - { - hipblasDestroy(handle); - return HIPBLAS_STATUS_MAPPING_ERROR; - } - } + hipblas_init(hA, true); + hipblas_init(hx); + hipblas_init(hy); + hy_cpu.copy_from(hy); - err_A = hipMemcpy(dA_array, bA_array, batch_count * sizeof(T*), hipMemcpyHostToDevice); - err_x = hipMemcpy(dx_array, bx_array, batch_count * sizeof(T*), hipMemcpyHostToDevice); - err_y = hipMemcpy(dy_array, by_array, batch_count * sizeof(T*), hipMemcpyHostToDevice); - if(err_A != hipSuccess || err_x != hipSuccess || err_y != hipSuccess) - { - hipblasDestroy(handle); - return HIPBLAS_STATUS_MAPPING_ERROR; - } + CHECK_HIP_ERROR(dA.transfer_from(hA)); + CHECK_HIP_ERROR(dx.transfer_from(hx)); + CHECK_HIP_ERROR(dy.transfer_from(hy)); + CHECK_HIP_ERROR(hipMemcpy(d_alpha, &h_alpha, sizeof(T), hipMemcpyHostToDevice)); + CHECK_HIP_ERROR(hipMemcpy(d_beta, &h_beta, sizeof(T), hipMemcpyHostToDevice)); /* ===================================================================== - ROCBLAS + HIPBLAS =================================================================== */ - for(int iter = 0; iter < 1; iter++) - { - status = hipblasHbmvBatchedFn(handle, - uplo, - N, - K, - (T*)&alpha, - dA_array, - lda, - dx_array, - incx, - (T*)&beta, - dy_array, - incy, - batch_count); - - if(status != HIPBLAS_STATUS_SUCCESS) - { - // here in cuda - hipblasDestroy(handle); - return status; - } - } - - // copy output from device to CPU - for(int b = 0; b < batch_count; b++) - { - hipMemcpy(hy_array[b], by_array[b], sizeof(T) * Y_size, hipMemcpyDeviceToHost); - } - - if(argus.unit_check) + CHECK_HIPBLAS_ERROR(hipblasSetPointerMode(handle, HIPBLAS_POINTER_MODE_HOST)); + CHECK_HIPBLAS_ERROR(hipblasHbmvBatchedFn(handle, + uplo, + N, + K, + (T*)&h_alpha, + dA.ptr_on_device(), + lda, + dx.ptr_on_device(), + incx, + (T*)&h_beta, + dy.ptr_on_device(), + incy, + batch_count)); + + CHECK_HIP_ERROR(hy_host.transfer_from(dy)); + CHECK_HIP_ERROR(dy.transfer_from(hy)); + + CHECK_HIPBLAS_ERROR(hipblasSetPointerMode(handle, HIPBLAS_POINTER_MODE_DEVICE)); + CHECK_HIPBLAS_ERROR(hipblasHbmvBatchedFn(handle, + uplo, + N, + K, + d_alpha, + dA.ptr_on_device(), + lda, + dx.ptr_on_device(), + incx, + d_beta, + dy.ptr_on_device(), + incy, + batch_count)); + + CHECK_HIP_ERROR(hy_device.transfer_from(dy)); + + if(argus.unit_check || argus.norm_check) { /* ===================================================================== CPU BLAS @@ -162,18 +132,63 @@ hipblasStatus_t 
testing_hbmv_batched(const Arguments& argus) for(int b = 0; b < batch_count; b++) { - cblas_hbmv( - uplo, N, K, alpha, hA_array[b], lda, hx_array[b], incx, beta, hz_array[b], incy); + cblas_hbmv(uplo, N, K, h_alpha, hA[b], lda, hx[b], incx, h_beta, hy_cpu[b], incy); } // enable unit check, notice unit check is not invasive, but norm check is, // unit check and norm check can not be interchanged their order if(argus.unit_check) { - unit_check_general(1, Y_size, batch_count, incy, hz_array, hy_array); + unit_check_general(1, Y_size, batch_count, incy, hy_cpu, hy_host); + unit_check_general(1, Y_size, batch_count, incy, hy_cpu, hy_device); + } + if(argus.norm_check) + { + hipblas_error_host + = norm_check_general('F', 1, N, incy, hy_cpu, hy_host, batch_count); + hipblas_error_device + = norm_check_general('F', 1, N, incy, hy_cpu, hy_device, batch_count); + } + } + + if(argus.timing) + { + CHECK_HIP_ERROR(dy.transfer_from(hy)); + hipStream_t stream; + CHECK_HIPBLAS_ERROR(hipblasGetStream(handle, &stream)); + CHECK_HIPBLAS_ERROR(hipblasSetPointerMode(handle, HIPBLAS_POINTER_MODE_DEVICE)); + + int runs = argus.cold_iters + argus.iters; + for(int iter = 0; iter < runs; iter++) + { + if(iter == argus.cold_iters) + gpu_time_used = get_time_us_sync(stream); + + CHECK_HIPBLAS_ERROR(hipblasHbmvBatchedFn(handle, + uplo, + N, + K, + d_alpha, + dA.ptr_on_device(), + lda, + dx.ptr_on_device(), + incx, + d_beta, + dy.ptr_on_device(), + incy, + batch_count)); } + gpu_time_used = get_time_us_sync(stream) - gpu_time_used; + + ArgumentModel{}.log_args( + std::cout, + argus, + gpu_time_used, + hbmv_gflop_count(N, K), + hbmv_gbyte_count(N, K), + hipblas_error_host, + hipblas_error_device); } - hipblasDestroy(handle); return HIPBLAS_STATUS_SUCCESS; } diff --git a/clients/include/testing_hbmv_strided_batched.hpp b/clients/include/testing_hbmv_strided_batched.hpp index d77b04b84..40a2edfd1 100644 --- a/clients/include/testing_hbmv_strided_batched.hpp +++ b/clients/include/testing_hbmv_strided_batched.hpp @@ -1,5 +1,5 @@ /* ************************************************************************ - * Copyright 2016-2020 Advanced Micro Devices, Inc. + * Copyright 2016-2021 Advanced Micro Devices, Inc. 
* * ************************************************************************ */ @@ -43,8 +43,6 @@ hipblasStatus_t testing_hbmv_strided_batched(const Arguments& argus) X_size = stride_x * batch_count; Y_size = stride_y * batch_count; - hipblasStatus_t status = HIPBLAS_STATUS_SUCCESS; - // argument sanity check, quick return if input parameters are invalid before allocating invalid // memory if(N < 0 || K < 0 || lda < K + 1 || incx == 0 || incy == 0 || batch_count < 0) @@ -56,70 +54,84 @@ hipblasStatus_t testing_hbmv_strided_batched(const Arguments& argus) host_vector hA(A_size); host_vector hx(X_size); host_vector hy(Y_size); - host_vector hz(Y_size); + host_vector hy_cpu(Y_size); + host_vector hy_host(Y_size); + host_vector hy_device(Y_size); device_vector dA(A_size); device_vector dx(X_size); device_vector dy(Y_size); + device_vector d_alpha(1); + device_vector d_beta(1); - double gpu_time_used, cpu_time_used; - double hipblasGflops, cblas_gflops, hipblasBandwidth; - double rocblas_error; + double gpu_time_used, hipblas_error_host, hipblas_error_device; - T alpha = argus.get_alpha(); - T beta = argus.get_beta(); + T h_alpha = argus.get_alpha(); + T h_beta = argus.get_beta(); - hipblasHandle_t handle; - hipblasCreate(&handle); + hipblasLocalHandle handle(argus); // Initial Data on CPU srand(1); - hipblas_init(hA, N, N, lda, stride_A, batch_count); + hipblas_init(hA, K, N, lda, stride_A, batch_count); hipblas_init(hx, 1, N, incx, stride_x, batch_count); hipblas_init(hy, 1, N, incy, stride_y, batch_count); - // copy vector is easy in STL; hz = hy: save a copy in hz which will be output of CPU BLAS - hz = hy; + // copy vector is easy in STL; hy_cpu = hy: save a copy in hy_cpu which will be output of CPU BLAS + hy_cpu = hy; // copy data from CPU to device - hipMemcpy(dA, hA.data(), sizeof(T) * A_size, hipMemcpyHostToDevice); - hipMemcpy(dx, hx.data(), sizeof(T) * X_size, hipMemcpyHostToDevice); - hipMemcpy(dy, hy.data(), sizeof(T) * Y_size, hipMemcpyHostToDevice); + CHECK_HIP_ERROR(hipMemcpy(dA, hA.data(), sizeof(T) * A_size, hipMemcpyHostToDevice)); + CHECK_HIP_ERROR(hipMemcpy(dx, hx.data(), sizeof(T) * X_size, hipMemcpyHostToDevice)); + CHECK_HIP_ERROR(hipMemcpy(dy, hy.data(), sizeof(T) * Y_size, hipMemcpyHostToDevice)); + CHECK_HIP_ERROR(hipMemcpy(d_alpha, &h_alpha, sizeof(T), hipMemcpyHostToDevice)); + CHECK_HIP_ERROR(hipMemcpy(d_beta, &h_beta, sizeof(T), hipMemcpyHostToDevice)); /* ===================================================================== - ROCBLAS + HIPBLAS =================================================================== */ - for(int iter = 0; iter < 1; iter++) - { - status = hipblasHbmvStridedBatchedFn(handle, - uplo, - N, - K, - (T*)&alpha, - dA, - lda, - stride_A, - dx, - incx, - stride_x, - (T*)&beta, - dy, - incy, - stride_y, - batch_count); - - if(status != HIPBLAS_STATUS_SUCCESS) - { - // here in cuda - hipblasDestroy(handle); - return status; - } - } - - // copy output from device to CPU - hipMemcpy(hy.data(), dy, sizeof(T) * Y_size, hipMemcpyDeviceToHost); - - if(argus.unit_check) + CHECK_HIPBLAS_ERROR(hipblasSetPointerMode(handle, HIPBLAS_POINTER_MODE_HOST)); + CHECK_HIPBLAS_ERROR(hipblasHbmvStridedBatchedFn(handle, + uplo, + N, + K, + (T*)&h_alpha, + dA, + lda, + stride_A, + dx, + incx, + stride_x, + (T*)&h_beta, + dy, + incy, + stride_y, + batch_count)); + + CHECK_HIP_ERROR(hipMemcpy(hy_host.data(), dy, sizeof(T) * Y_size, hipMemcpyDeviceToHost)); + CHECK_HIP_ERROR(hipMemcpy(dy, hy.data(), sizeof(T) * Y_size, hipMemcpyHostToDevice)); + + 
CHECK_HIPBLAS_ERROR(hipblasSetPointerMode(handle, HIPBLAS_POINTER_MODE_DEVICE)); + CHECK_HIPBLAS_ERROR(hipblasHbmvStridedBatchedFn(handle, + uplo, + N, + K, + d_alpha, + dA, + lda, + stride_A, + dx, + incx, + stride_x, + d_beta, + dy, + incy, + stride_y, + batch_count)); + + CHECK_HIP_ERROR(hipMemcpy(hy_device.data(), dy, sizeof(T) * Y_size, hipMemcpyDeviceToHost)); + + if(argus.unit_check || argus.norm_check) { /* ===================================================================== CPU BLAS @@ -130,13 +142,13 @@ hipblasStatus_t testing_hbmv_strided_batched(const Arguments& argus) cblas_hbmv(uplo, N, K, - alpha, + h_alpha, hA.data() + b * stride_A, lda, hx.data() + b * stride_x, incx, - beta, - hz.data() + b * stride_y, + h_beta, + hy_cpu.data() + b * stride_y, incy); } @@ -144,10 +156,67 @@ hipblasStatus_t testing_hbmv_strided_batched(const Arguments& argus) // unit check and norm check can not be interchanged their order if(argus.unit_check) { - unit_check_general(1, N, batch_count, incy, stride_y, hz, hy); + unit_check_general(1, N, batch_count, incy, stride_y, hy_cpu, hy_host); + unit_check_general(1, N, batch_count, incy, stride_y, hy_cpu, hy_device); + } + if(argus.norm_check) + { + hipblas_error_host + = norm_check_general('F', 1, N, incy, stride_y, hy_cpu, hy_host, batch_count); + hipblas_error_device + = norm_check_general('F', 1, N, incy, stride_y, hy_cpu, hy_device, batch_count); + } + } + + if(argus.timing) + { + CHECK_HIP_ERROR(hipMemcpy(dy, hy.data(), sizeof(T) * Y_size, hipMemcpyHostToDevice)); + hipStream_t stream; + CHECK_HIPBLAS_ERROR(hipblasGetStream(handle, &stream)); + CHECK_HIPBLAS_ERROR(hipblasSetPointerMode(handle, HIPBLAS_POINTER_MODE_DEVICE)); + + int runs = argus.cold_iters + argus.iters; + for(int iter = 0; iter < runs; iter++) + { + if(iter == argus.cold_iters) + gpu_time_used = get_time_us_sync(stream); + + CHECK_HIPBLAS_ERROR(hipblasHbmvStridedBatchedFn(handle, + uplo, + N, + K, + d_alpha, + dA, + lda, + stride_A, + dx, + incx, + stride_x, + d_beta, + dy, + incy, + stride_y, + batch_count)); } + gpu_time_used = get_time_us_sync(stream) - gpu_time_used; + + ArgumentModel{} + .log_args(std::cout, + argus, + gpu_time_used, + hbmv_gflop_count(N, K), + hbmv_gbyte_count(N, K), + hipblas_error_host, + hipblas_error_device); } - hipblasDestroy(handle); return HIPBLAS_STATUS_SUCCESS; } From 5b938a7dce7a49445dff281ad9ce61a780d5a8f2 Mon Sep 17 00:00:00 2001 From: Daine McNiven <51674140+daineAMD@users.noreply.github.com> Date: Wed, 24 Mar 2021 19:55:17 -0600 Subject: [PATCH 5/8] Fix for nrm2_batched test. (#323) --- clients/include/testing_nrm2_batched.hpp | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/clients/include/testing_nrm2_batched.hpp b/clients/include/testing_nrm2_batched.hpp index 834fb201c..d5e60536d 100644 --- a/clients/include/testing_nrm2_batched.hpp +++ b/clients/include/testing_nrm2_batched.hpp @@ -40,8 +40,7 @@ hipblasStatus_t testing_nrm2_batched(const Arguments& argus) double gpu_time_used; double hipblas_error_host = 0, hipblas_error_device = 0; - hipblasHandle_t handle; - hipblasCreate(&handle); + hipblasLocalHandle handle(argus); // Naming: dX is in GPU (device) memory. 
hK is in CPU (host) memory, plz follow this practice host_batch_vector hx(N, incx, batch_count); From 6c07dbf8d29bf52c8c17c71d873334fb38be6ef1 Mon Sep 17 00:00:00 2001 From: Daine McNiven <51674140+daineAMD@users.noreply.github.com> Date: Thu, 25 Mar 2021 13:02:50 -0600 Subject: [PATCH 6/8] Updating hemv, her, her2, hpmv, hpr, hpr2 tests (#322) --- clients/benchmarks/client.cpp | 36 +-- clients/gtest/hemv_batched_gtest.cpp | 4 +- clients/gtest/hemv_strided_batched_gtest.cpp | 4 +- clients/gtest/her_gtest.cpp | 14 +- clients/gtest/hpr_gtest.cpp | 14 +- clients/include/bytes.hpp | 14 ++ clients/include/testing_gbmv.hpp | 6 +- clients/include/testing_gbmv_batched.hpp | 20 +- .../include/testing_gbmv_strided_batched.hpp | 6 +- clients/include/testing_gemv.hpp | 4 +- clients/include/testing_gemv_batched.hpp | 4 +- .../include/testing_gemv_strided_batched.hpp | 4 +- clients/include/testing_ger.hpp | 15 +- clients/include/testing_ger_batched.hpp | 2 +- .../include/testing_ger_strided_batched.hpp | 3 +- clients/include/testing_hbmv.hpp | 15 +- clients/include/testing_hbmv_batched.hpp | 16 +- .../include/testing_hbmv_strided_batched.hpp | 2 + clients/include/testing_hemv.hpp | 101 +++++--- clients/include/testing_hemv_batched.hpp | 217 ++++++++++-------- .../include/testing_hemv_strided_batched.hpp | 174 +++++++++----- clients/include/testing_her.hpp | 94 +++++--- clients/include/testing_her2.hpp | 94 +++++--- clients/include/testing_her2_batched.hpp | 185 ++++++++------- .../include/testing_her2_strided_batched.hpp | 166 +++++++++----- clients/include/testing_her_batched.hpp | 156 +++++++------ .../include/testing_her_strided_batched.hpp | 108 ++++++--- clients/include/testing_hpmv.hpp | 99 +++++--- clients/include/testing_hpmv_batched.hpp | 206 +++++++++-------- .../include/testing_hpmv_strided_batched.hpp | 167 ++++++++++---- clients/include/testing_hpr.hpp | 94 +++++--- clients/include/testing_hpr2.hpp | 95 +++++--- clients/include/testing_hpr2_batched.hpp | 182 ++++++++------- .../include/testing_hpr2_strided_batched.hpp | 162 ++++++++----- clients/include/testing_hpr_batched.hpp | 139 ++++++----- .../include/testing_hpr_strided_batched.hpp | 103 ++++++--- 36 files changed, 1667 insertions(+), 1058 deletions(-) diff --git a/clients/benchmarks/client.cpp b/clients/benchmarks/client.cpp index 49d12e2e7..2c68fbbb7 100644 --- a/clients/benchmarks/client.cpp +++ b/clients/benchmarks/client.cpp @@ -542,26 +542,26 @@ struct perf_blas< {"hbmv", testing_hbmv}, {"hbmv_batched", testing_hbmv_batched}, {"hbmv_strided_batched", testing_hbmv_strided_batched}, + {"hemv", testing_hemv}, + {"hemv_batched", testing_hemv_batched}, + {"hemv_strided_batched", testing_hemv_strided_batched}, + {"her", testing_her}, + {"her_batched", testing_her_batched}, + {"her_strided_batched", testing_her_strided_batched}, + {"her2", testing_her2}, + {"her2_batched", testing_her2_batched}, + {"her2_strided_batched", testing_her2_strided_batched}, + {"hpmv", testing_hpmv}, + {"hpmv_batched", testing_hpmv_batched}, + {"hpmv_strided_batched", testing_hpmv_strided_batched}, + {"hpr", testing_hpr}, + {"hpr_batched", testing_hpr_batched}, + {"hpr_strided_batched", testing_hpr_strided_batched}, + {"hpr2", testing_hpr2}, + {"hpr2_batched", testing_hpr2_batched}, + {"hpr2_strided_batched", testing_hpr2_strided_batched}, /* // L2 - {"hemv", testing_hemv}, - {"hemv_batched", testing_hemv_batched}, - {"hemv_strided_batched", testing_hemv_strided_batched}, - {"her", testing_her}, - {"her_batched", testing_her_batched}, - 
{"her_strided_batched", testing_her_strided_batched}, - {"her2", testing_her2}, - {"her2_batched", testing_her2_batched}, - {"her2_strided_batched", testing_her2_strided_batched}, - {"hpmv", testing_hpmv}, - {"hpmv_batched", testing_hpmv_batched}, - {"hpmv_strided_batched", testing_hpmv_strided_batched}, - {"hpr", testing_hpr}, - {"hpr_batched", testing_hpr_batched}, - {"hpr_strided_batched", testing_hpr_strided_batched}, - {"hpr2", testing_hpr2}, - {"hpr2_batched", testing_hpr2_batched}, - {"hpr2_strided_batched", testing_hpr2_strided_batched}, {"spr", testing_spr}, {"spr_batched", testing_spr_batched}, {"spr_strided_batched", testing_spr_strided_batched}, diff --git a/clients/gtest/hemv_batched_gtest.cpp b/clients/gtest/hemv_batched_gtest.cpp index efc80c068..d5dfe93b5 100644 --- a/clients/gtest/hemv_batched_gtest.cpp +++ b/clients/gtest/hemv_batched_gtest.cpp @@ -1,5 +1,5 @@ /* ************************************************************************ - * Copyright 2016-2020 Advanced Micro Devices, Inc. + * Copyright 2016-2021 Advanced Micro Devices, Inc. * * ************************************************************************ */ @@ -145,7 +145,7 @@ TEST_P(hemv_gtest_batched, hemv_gtest_float_complex) { Arguments arg = setup_hemv_arguments(GetParam()); - hipblasStatus_t status = testing_hemvBatched(arg); + hipblasStatus_t status = testing_hemv_batched(arg); // if not success, then the input argument is problematic, so detect the error message if(status != HIPBLAS_STATUS_SUCCESS) diff --git a/clients/gtest/hemv_strided_batched_gtest.cpp b/clients/gtest/hemv_strided_batched_gtest.cpp index ee37b382f..048431873 100644 --- a/clients/gtest/hemv_strided_batched_gtest.cpp +++ b/clients/gtest/hemv_strided_batched_gtest.cpp @@ -1,5 +1,5 @@ /* ************************************************************************ - * Copyright 2016-2020 Advanced Micro Devices, Inc. + * Copyright 2016-2021 Advanced Micro Devices, Inc. * * ************************************************************************ */ @@ -156,7 +156,7 @@ TEST_P(hemv_gtest_strided_batched, hemv_gtest_float_complex) { Arguments arg = setup_hemv_arguments(GetParam()); - hipblasStatus_t status = testing_hemvStridedBatched(arg); + hipblasStatus_t status = testing_hemv_strided_batched(arg); // if not success, then the input argument is problematic, so detect the error message if(status != HIPBLAS_STATUS_SUCCESS) diff --git a/clients/gtest/her_gtest.cpp b/clients/gtest/her_gtest.cpp index cc1a2ab65..ca4d56f60 100644 --- a/clients/gtest/her_gtest.cpp +++ b/clients/gtest/her_gtest.cpp @@ -1,5 +1,5 @@ /* ************************************************************************ - * Copyright 2016-2020 Advanced Micro Devices, Inc. + * Copyright 2016-2021 Advanced Micro Devices, Inc. 
* * ************************************************************************ */ @@ -131,7 +131,7 @@ TEST_P(blas2_her_gtest, her_gtest_float) Arguments arg = setup_her_arguments(GetParam()); - hipblasStatus_t status = testing_her(arg); + hipblasStatus_t status = testing_her(arg); // if not success, then the input argument is problematic, so detect the error message if(status != HIPBLAS_STATUS_SUCCESS) @@ -156,7 +156,7 @@ TEST_P(blas2_her_gtest, her_gtest_double) Arguments arg = setup_her_arguments(GetParam()); - hipblasStatus_t status = testing_her(arg); + hipblasStatus_t status = testing_her(arg); // if not success, then the input argument is problematic, so detect the error message if(status != HIPBLAS_STATUS_SUCCESS) @@ -182,7 +182,7 @@ TEST_P(blas2_her_gtest, her_batched_gtest_float) Arguments arg = setup_her_arguments(GetParam()); - hipblasStatus_t status = testing_her_batched(arg); + hipblasStatus_t status = testing_her_batched(arg); // if not success, then the input argument is problematic, so detect the error message if(status != HIPBLAS_STATUS_SUCCESS) @@ -207,7 +207,7 @@ TEST_P(blas2_her_gtest, her_batched_gtest_double) Arguments arg = setup_her_arguments(GetParam()); - hipblasStatus_t status = testing_her_batched(arg); + hipblasStatus_t status = testing_her_batched(arg); // if not success, then the input argument is problematic, so detect the error message if(status != HIPBLAS_STATUS_SUCCESS) @@ -233,7 +233,7 @@ TEST_P(blas2_her_gtest, her_strided_batched_gtest_float) Arguments arg = setup_her_arguments(GetParam()); - hipblasStatus_t status = testing_her_strided_batched(arg); + hipblasStatus_t status = testing_her_strided_batched(arg); // if not success, then the input argument is problematic, so detect the error message if(status != HIPBLAS_STATUS_SUCCESS) @@ -258,7 +258,7 @@ TEST_P(blas2_her_gtest, her_strided_batched_gtest_double) Arguments arg = setup_her_arguments(GetParam()); - hipblasStatus_t status = testing_her_strided_batched(arg); + hipblasStatus_t status = testing_her_strided_batched(arg); // if not success, then the input argument is problematic, so detect the error message if(status != HIPBLAS_STATUS_SUCCESS) diff --git a/clients/gtest/hpr_gtest.cpp b/clients/gtest/hpr_gtest.cpp index bdadae77e..b7ecb54da 100644 --- a/clients/gtest/hpr_gtest.cpp +++ b/clients/gtest/hpr_gtest.cpp @@ -1,5 +1,5 @@ /* ************************************************************************ - * Copyright 2016-2020 Advanced Micro Devices, Inc. + * Copyright 2016-2021 Advanced Micro Devices, Inc. 
* * ************************************************************************ */ @@ -113,7 +113,7 @@ TEST_P(blas2_hpr_gtest, hpr_gtest_float) Arguments arg = setup_hpr_arguments(GetParam()); - hipblasStatus_t status = testing_hpr(arg); + hipblasStatus_t status = testing_hpr(arg); // if not success, then the input argument is problematic, so detect the error message if(status != HIPBLAS_STATUS_SUCCESS) @@ -138,7 +138,7 @@ TEST_P(blas2_hpr_gtest, hpr_gtest_double) Arguments arg = setup_hpr_arguments(GetParam()); - hipblasStatus_t status = testing_hpr(arg); + hipblasStatus_t status = testing_hpr(arg); // if not success, then the input argument is problematic, so detect the error message if(status != HIPBLAS_STATUS_SUCCESS) @@ -164,7 +164,7 @@ TEST_P(blas2_hpr_gtest, hpr_batched_gtest_float) Arguments arg = setup_hpr_arguments(GetParam()); - hipblasStatus_t status = testing_hpr_batched(arg); + hipblasStatus_t status = testing_hpr_batched(arg); // if not success, then the input argument is problematic, so detect the error message if(status != HIPBLAS_STATUS_SUCCESS) @@ -189,7 +189,7 @@ TEST_P(blas2_hpr_gtest, hpr_batched_gtest_double) Arguments arg = setup_hpr_arguments(GetParam()); - hipblasStatus_t status = testing_hpr_batched(arg); + hipblasStatus_t status = testing_hpr_batched(arg); // if not success, then the input argument is problematic, so detect the error message if(status != HIPBLAS_STATUS_SUCCESS) @@ -215,7 +215,7 @@ TEST_P(blas2_hpr_gtest, hpr_strided_batched_gtest_float) Arguments arg = setup_hpr_arguments(GetParam()); - hipblasStatus_t status = testing_hpr_strided_batched(arg); + hipblasStatus_t status = testing_hpr_strided_batched(arg); // if not success, then the input argument is problematic, so detect the error message if(status != HIPBLAS_STATUS_SUCCESS) @@ -240,7 +240,7 @@ TEST_P(blas2_hpr_gtest, hpr_strided_batched_gtest_double) Arguments arg = setup_hpr_arguments(GetParam()); - hipblasStatus_t status = testing_hpr_strided_batched(arg); + hipblasStatus_t status = testing_hpr_strided_batched(arg); // if not success, then the input argument is problematic, so detect the error message if(status != HIPBLAS_STATUS_SUCCESS) diff --git a/clients/include/bytes.hpp b/clients/include/bytes.hpp index 364ac3b28..77392afaf 100644 --- a/clients/include/bytes.hpp +++ b/clients/include/bytes.hpp @@ -136,6 +136,20 @@ constexpr double hbmv_gbyte_count(int n, int k) return (sizeof(T) * (n * k1 - ((k1 * (k1 + 1)) / 2.0) + 3 * n)) / 1e9; } +/* \brief byte counts of HEMV */ +template +constexpr double hemv_gbyte_count(int n) +{ + return (sizeof(T) * (((n * (n + 1.0)) / 2.0) + 3.0 * n)) / 1e9; +} + +/* \brief byte counts of HPMV */ +template +constexpr double hpmv_gbyte_count(int n) +{ + return (sizeof(T) * ((n * (n + 1.0)) / 2.0) + 3.0 * n) / 1e9; +} + /* \brief byte counts of HPR */ template constexpr double hpr_gbyte_count(int n) diff --git a/clients/include/testing_gbmv.hpp b/clients/include/testing_gbmv.hpp index 6b05e54d1..bc88d1e1c 100644 --- a/clients/include/testing_gbmv.hpp +++ b/clients/include/testing_gbmv.hpp @@ -68,8 +68,8 @@ hipblasStatus_t testing_gbmv(const Arguments& argus) double gpu_time_used, hipblas_error_host, hipblas_error_device; - T h_alpha = (T)argus.alpha; - T h_beta = (T)argus.beta; + T h_alpha = argus.get_alpha(); + T h_beta = argus.get_beta(); hipblasLocalHandle handle(argus); @@ -161,7 +161,7 @@ hipblasStatus_t testing_gbmv(const Arguments& argus) } gpu_time_used = get_time_us_sync(stream) - gpu_time_used; - ArgumentModel{}.log_args( + ArgumentModel{}.log_args( 
std::cout, argus, gpu_time_used, diff --git a/clients/include/testing_gbmv_batched.hpp b/clients/include/testing_gbmv_batched.hpp index 3d5eec6e2..7fff174ec 100644 --- a/clients/include/testing_gbmv_batched.hpp +++ b/clients/include/testing_gbmv_batched.hpp @@ -66,8 +66,8 @@ hipblasStatus_t testing_gbmv_batched(const Arguments& argus) double gpu_time_used, hipblas_error_host, hipblas_error_device; - T h_alpha = (T)argus.alpha; - T h_beta = (T)argus.beta; + T h_alpha = argus.get_alpha(); + T h_beta = argus.get_beta(); // arrays of pointers-to-host on host host_batch_vector hA(A_size, 1, batch_count); @@ -201,14 +201,14 @@ hipblasStatus_t testing_gbmv_batched(const Arguments& argus) } gpu_time_used = get_time_us_sync(stream) - gpu_time_used; - ArgumentModel{}.log_args( - std::cout, - argus, - gpu_time_used, - gbmv_gflop_count(transA, M, N, KL, KU), - gbmv_gbyte_count(transA, M, N, KL, KU), - hipblas_error_host, - hipblas_error_device); + ArgumentModel{} + .log_args(std::cout, + argus, + gpu_time_used, + gbmv_gflop_count(transA, M, N, KL, KU), + gbmv_gbyte_count(transA, M, N, KL, KU), + hipblas_error_host, + hipblas_error_device); } return HIPBLAS_STATUS_SUCCESS; diff --git a/clients/include/testing_gbmv_strided_batched.hpp b/clients/include/testing_gbmv_strided_batched.hpp index 7124a4574..743afc32b 100644 --- a/clients/include/testing_gbmv_strided_batched.hpp +++ b/clients/include/testing_gbmv_strided_batched.hpp @@ -86,8 +86,8 @@ hipblasStatus_t testing_gbmv_strided_batched(const Arguments& argus) double gpu_time_used, hipblas_error_host, hipblas_error_device; - T h_alpha = (T)argus.alpha; - T h_beta = (T)argus.beta; + T h_alpha = argus.get_alpha(); + T h_beta = argus.get_beta(); hipblasLocalHandle handle(argus); @@ -232,10 +232,12 @@ hipblasStatus_t testing_gbmv_strided_batched(const Arguments& argus) e_N, e_KL, e_KU, + e_alpha, e_lda, e_stride_a, e_incx, e_stride_x, + e_beta, e_incy, e_stride_y>{} .log_args(std::cout, diff --git a/clients/include/testing_gemv.hpp b/clients/include/testing_gemv.hpp index a97f4aaad..6b23ca832 100644 --- a/clients/include/testing_gemv.hpp +++ b/clients/include/testing_gemv.hpp @@ -66,8 +66,8 @@ hipblasStatus_t testing_gemv(const Arguments& argus) double gpu_time_used, hipblas_error_host, hipblas_error_device; - T h_alpha = (T)argus.alpha; - T h_beta = (T)argus.beta; + T h_alpha = argus.get_alpha(); + T h_beta = argus.get_beta(); hipblasLocalHandle handle(argus); diff --git a/clients/include/testing_gemv_batched.hpp b/clients/include/testing_gemv_batched.hpp index 20bf2b617..dcf1b8e60 100644 --- a/clients/include/testing_gemv_batched.hpp +++ b/clients/include/testing_gemv_batched.hpp @@ -63,8 +63,8 @@ hipblasStatus_t testing_gemv_batched(const Arguments& argus) double gpu_time_used, hipblas_error_host, hipblas_error_device; - T h_alpha = (T)argus.alpha; - T h_beta = (T)argus.beta; + T h_alpha = argus.get_alpha(); + T h_beta = argus.get_beta(); // arrays of pointers-to-host on host host_batch_vector hA(A_size, 1, batch_count); diff --git a/clients/include/testing_gemv_strided_batched.hpp b/clients/include/testing_gemv_strided_batched.hpp index 04e70886e..986f49733 100644 --- a/clients/include/testing_gemv_strided_batched.hpp +++ b/clients/include/testing_gemv_strided_batched.hpp @@ -83,8 +83,8 @@ hipblasStatus_t testing_gemv_strided_batched(const Arguments& argus) double gpu_time_used, hipblas_error_host, hipblas_error_device; - T h_alpha = (T)argus.alpha; - T h_beta = (T)argus.beta; + T h_alpha = argus.get_alpha(); + T h_beta = argus.get_beta(); 
hipblasLocalHandle handle(argus); diff --git a/clients/include/testing_ger.hpp b/clients/include/testing_ger.hpp index 57d2c5fc1..b6004de8e 100644 --- a/clients/include/testing_ger.hpp +++ b/clients/include/testing_ger.hpp @@ -124,13 +124,14 @@ hipblasStatus_t testing_ger(const Arguments& argus) } gpu_time_used = get_time_us_sync(stream) - gpu_time_used; - ArgumentModel{}.log_args(std::cout, - argus, - gpu_time_used, - ger_gflop_count(M, N), - ger_gbyte_count(M, N), - hipblas_error_host, - hipblas_error_device); + ArgumentModel{}.log_args( + std::cout, + argus, + gpu_time_used, + ger_gflop_count(M, N), + ger_gbyte_count(M, N), + hipblas_error_host, + hipblas_error_device); } return HIPBLAS_STATUS_SUCCESS; diff --git a/clients/include/testing_ger_batched.hpp b/clients/include/testing_ger_batched.hpp index a0b472075..19c84d814 100644 --- a/clients/include/testing_ger_batched.hpp +++ b/clients/include/testing_ger_batched.hpp @@ -167,7 +167,7 @@ hipblasStatus_t testing_ger_batched(const Arguments& argus) } gpu_time_used = get_time_us_sync(stream) - gpu_time_used; - ArgumentModel{}.log_args( + ArgumentModel{}.log_args( std::cout, argus, gpu_time_used, diff --git a/clients/include/testing_ger_strided_batched.hpp b/clients/include/testing_ger_strided_batched.hpp index 026d916b9..322228565 100644 --- a/clients/include/testing_ger_strided_batched.hpp +++ b/clients/include/testing_ger_strided_batched.hpp @@ -64,7 +64,7 @@ hipblasStatus_t testing_ger_strided_batched(const Arguments& argus) double gpu_time_used, hipblas_error_host, hipblas_error_device; - T h_alpha = (T)argus.alpha; + T h_alpha = argus.get_alpha(); hipblasLocalHandle handle(argus); @@ -190,6 +190,7 @@ hipblasStatus_t testing_ger_strided_batched(const Arguments& argus) ArgumentModel{}.log_args(std::cout, - argus, - gpu_time_used, - hbmv_gflop_count(N, K), - hbmv_gbyte_count(N, K), - hipblas_error_host, - hipblas_error_device); + ArgumentModel{}.log_args( + std::cout, + argus, + gpu_time_used, + hbmv_gflop_count(N, K), + hbmv_gbyte_count(N, K), + hipblas_error_host, + hipblas_error_device); } return HIPBLAS_STATUS_SUCCESS; } diff --git a/clients/include/testing_hbmv_batched.hpp b/clients/include/testing_hbmv_batched.hpp index 26ecd0d21..638525a67 100644 --- a/clients/include/testing_hbmv_batched.hpp +++ b/clients/include/testing_hbmv_batched.hpp @@ -180,14 +180,14 @@ hipblasStatus_t testing_hbmv_batched(const Arguments& argus) } gpu_time_used = get_time_us_sync(stream) - gpu_time_used; - ArgumentModel{}.log_args( - std::cout, - argus, - gpu_time_used, - hbmv_gflop_count(N, K), - hbmv_gbyte_count(N, K), - hipblas_error_host, - hipblas_error_device); + ArgumentModel{} + .log_args(std::cout, + argus, + gpu_time_used, + hbmv_gflop_count(N, K), + hbmv_gbyte_count(N, K), + hipblas_error_host, + hipblas_error_device); } return HIPBLAS_STATUS_SUCCESS; diff --git a/clients/include/testing_hbmv_strided_batched.hpp b/clients/include/testing_hbmv_strided_batched.hpp index 40a2edfd1..7179c4233 100644 --- a/clients/include/testing_hbmv_strided_batched.hpp +++ b/clients/include/testing_hbmv_strided_batched.hpp @@ -202,10 +202,12 @@ hipblasStatus_t testing_hbmv_strided_batched(const Arguments& argus) ArgumentModel{} diff --git a/clients/include/testing_hemv.hpp b/clients/include/testing_hemv.hpp index 47573bd13..59d75aab7 100644 --- a/clients/include/testing_hemv.hpp +++ b/clients/include/testing_hemv.hpp @@ -1,5 +1,5 @@ /* ************************************************************************ - * Copyright 2016-2020 Advanced Micro Devices, Inc. 
+ * Copyright 2016-2021 Advanced Micro Devices, Inc. * * ************************************************************************ */ @@ -27,36 +27,35 @@ hipblasStatus_t testing_hemv(const Arguments& argus) int A_size = lda * N; - hipblasFillMode_t uplo = char2hipblas_fill(argus.uplo_option); - hipblasStatus_t status = HIPBLAS_STATUS_SUCCESS; + hipblasFillMode_t uplo = char2hipblas_fill(argus.uplo_option); // argument sanity check, quick return if input parameters are invalid before allocating invalid // memory if(N < 0 || lda < N || incx == 0 || incy == 0) { - status = HIPBLAS_STATUS_INVALID_VALUE; - return status; + return HIPBLAS_STATUS_INVALID_VALUE; } // Naming: dK is in GPU (device) memory. hK is in CPU (host) memory host_vector hA(A_size); host_vector hx(N * incx); host_vector hy(N * incy); - host_vector hz(N * incy); + host_vector hy_cpu(N * incy); + host_vector hy_host(N * incy); + host_vector hy_device(N * incy); device_vector dA(A_size); device_vector dx(N * incx); device_vector dy(N * incy); + device_vector d_alpha(1); + device_vector d_beta(1); - double gpu_time_used, cpu_time_used; - double hipblasGflops, cblas_gflops, hipblasBandwidth; - double rocblas_error; + double gpu_time_used, hipblas_error_host, hipblas_error_device; - T alpha = (T)argus.alpha; - T beta = (T)argus.beta; + T h_alpha = argus.get_alpha(); + T h_beta = argus.get_beta(); - hipblasHandle_t handle; - hipblasCreate(&handle); + hipblasLocalHandle handle(argus); // Initial Data on CPU srand(1); @@ -64,48 +63,82 @@ hipblasStatus_t testing_hemv(const Arguments& argus) hipblas_init(hx, 1, N, incx); hipblas_init(hy, 1, N, incy); - // copy vector is easy in STL; hz = hy: save a copy in hz which will be output of CPU BLAS - hz = hy; + // copy vector is easy in STL; hy_cpu = hy: save a copy in hy_cpu which will be output of CPU BLAS + hy_cpu = hy; // copy data from CPU to device - hipMemcpy(dA, hA.data(), sizeof(T) * lda * N, hipMemcpyHostToDevice); - hipMemcpy(dx, hx.data(), sizeof(T) * N * incx, hipMemcpyHostToDevice); - hipMemcpy(dy, hy.data(), sizeof(T) * N * incy, hipMemcpyHostToDevice); + CHECK_HIP_ERROR(hipMemcpy(dA, hA.data(), sizeof(T) * lda * N, hipMemcpyHostToDevice)); + CHECK_HIP_ERROR(hipMemcpy(dx, hx.data(), sizeof(T) * N * incx, hipMemcpyHostToDevice)); + CHECK_HIP_ERROR(hipMemcpy(dy, hy.data(), sizeof(T) * N * incy, hipMemcpyHostToDevice)); + CHECK_HIP_ERROR(hipMemcpy(d_alpha, &h_alpha, sizeof(T), hipMemcpyHostToDevice)); + CHECK_HIP_ERROR(hipMemcpy(d_beta, &h_beta, sizeof(T), hipMemcpyHostToDevice)); /* ===================================================================== - ROCBLAS + HIPBLAS =================================================================== */ - for(int iter = 0; iter < 1; iter++) - { + CHECK_HIPBLAS_ERROR(hipblasSetPointerMode(handle, HIPBLAS_POINTER_MODE_HOST)); + CHECK_HIPBLAS_ERROR( + hipblasHemvFn(handle, uplo, N, (T*)&h_alpha, dA, lda, dx, incx, (T*)&h_beta, dy, incy)); - status = hipblasHemvFn(handle, uplo, N, (T*)&alpha, dA, lda, dx, incx, (T*)&beta, dy, incy); + CHECK_HIP_ERROR(hipMemcpy(hy_host.data(), dy, sizeof(T) * N * incy, hipMemcpyDeviceToHost)); + CHECK_HIP_ERROR(hipMemcpy(dy, hy.data(), sizeof(T) * N * incy, hipMemcpyHostToDevice)); - if(status != HIPBLAS_STATUS_SUCCESS) - { - hipblasDestroy(handle); - return status; - } - } + CHECK_HIPBLAS_ERROR(hipblasSetPointerMode(handle, HIPBLAS_POINTER_MODE_DEVICE)); + CHECK_HIPBLAS_ERROR( + hipblasHemvFn(handle, uplo, N, d_alpha, dA, lda, dx, incx, d_beta, dy, incy)); - // copy output from device to CPU - hipMemcpy(hy.data(), 
dy, sizeof(T) * N * incy, hipMemcpyDeviceToHost); + CHECK_HIP_ERROR(hipMemcpy(hy_device.data(), dy, sizeof(T) * N * incy, hipMemcpyDeviceToHost)); - if(argus.unit_check) + if(argus.unit_check || argus.norm_check) { /* ===================================================================== CPU BLAS =================================================================== */ - cblas_hemv(uplo, N, alpha, hA.data(), lda, hx.data(), incx, beta, hz.data(), incy); + cblas_hemv( + uplo, N, h_alpha, hA.data(), lda, hx.data(), incx, h_beta, hy_cpu.data(), incy); // enable unit check, notice unit check is not invasive, but norm check is, // unit check and norm check can not be interchanged their order if(argus.unit_check) { - unit_check_general(1, N, incy, hz, hy); + unit_check_general(1, N, incy, hy_cpu, hy_host); + unit_check_general(1, N, incy, hy_cpu, hy_device); + } + if(argus.norm_check) + { + hipblas_error_host = norm_check_general('F', 1, N, incy, hy_cpu, hy_host); + hipblas_error_device = norm_check_general('F', 1, N, incy, hy_cpu, hy_device); + } + } + + if(argus.timing) + { + CHECK_HIP_ERROR(hipMemcpy(dy, hy.data(), sizeof(T) * N * incy, hipMemcpyHostToDevice)); + hipStream_t stream; + CHECK_HIPBLAS_ERROR(hipblasGetStream(handle, &stream)); + CHECK_HIPBLAS_ERROR(hipblasSetPointerMode(handle, HIPBLAS_POINTER_MODE_DEVICE)); + + int runs = argus.cold_iters + argus.iters; + for(int iter = 0; iter < runs; iter++) + { + if(iter == argus.cold_iters) + gpu_time_used = get_time_us_sync(stream); + + CHECK_HIPBLAS_ERROR( + hipblasHemvFn(handle, uplo, N, d_alpha, dA, lda, dx, incx, d_beta, dy, incy)); } + gpu_time_used = get_time_us_sync(stream) - gpu_time_used; + + ArgumentModel{}.log_args( + std::cout, + argus, + gpu_time_used, + hemv_gflop_count(N), + hemv_gbyte_count(N), + hipblas_error_host, + hipblas_error_device); } - hipblasDestroy(handle); return HIPBLAS_STATUS_SUCCESS; } diff --git a/clients/include/testing_hemv_batched.hpp b/clients/include/testing_hemv_batched.hpp index 59b4ab7e5..972bcfb24 100644 --- a/clients/include/testing_hemv_batched.hpp +++ b/clients/include/testing_hemv_batched.hpp @@ -1,5 +1,5 @@ /* ************************************************************************ - * Copyright 2016-2020 Advanced Micro Devices, Inc. + * Copyright 2016-2021 Advanced Micro Devices, Inc. 
* * ************************************************************************ */ @@ -15,7 +15,7 @@ using namespace std; /* ============================================================================================ */ template -hipblasStatus_t testing_hemvBatched(const Arguments& argus) +hipblasStatus_t testing_hemv_batched(const Arguments& argus) { bool FORTRAN = argus.fortran; auto hipblasHemvBatchedFn @@ -36,8 +36,6 @@ hipblasStatus_t testing_hemvBatched(const Arguments& argus) X_size = N * incx; Y_size = N * incy; - hipblasStatus_t status = HIPBLAS_STATUS_SUCCESS; - // argument sanity check, quick return if input parameters are invalid before allocating invalid // memory if(N < 0 || lda < N || incx <= 0 || incy <= 0 || batch_count < 0) @@ -49,110 +47,81 @@ hipblasStatus_t testing_hemvBatched(const Arguments& argus) return HIPBLAS_STATUS_SUCCESS; } - hipblasHandle_t handle; - hipblasCreate(&handle); + hipblasLocalHandle handle(argus); - double gpu_time_used, cpu_time_used; - double hipblasGflops, cblas_gflops, hipblasBandwidth; - double rocblas_error; + double gpu_time_used, hipblas_error_host, hipblas_error_device; - T alpha = (T)argus.alpha; - T beta = (T)argus.beta; + T h_alpha = argus.get_alpha(); + T h_beta = argus.get_beta(); // arrays of pointers-to-host on host - host_vector hA_array[batch_count]; - host_vector hx_array[batch_count]; - host_vector hy_array[batch_count]; - host_vector hz_array[batch_count]; - - // arrays of pointers-to-device on host - device_batch_vector bA_array(batch_count, A_size); - device_batch_vector bx_array(batch_count, X_size); - device_batch_vector by_array(batch_count, Y_size); - - // arrays of pointers-to-device on device - device_vector dA_array(batch_count); - device_vector dx_array(batch_count); - device_vector dy_array(batch_count); - - int last = batch_count - 1; - if(!dA_array || !dx_array || !dy_array || (!bA_array[last] && A_size) - || (!bx_array[last] && X_size) || (!by_array[last] && Y_size)) - { - hipblasDestroy(handle); - return HIPBLAS_STATUS_ALLOC_FAILED; - } + host_batch_vector hA(A_size, 1, batch_count); + host_batch_vector hx(N, incx, batch_count); + host_batch_vector hy(N, incy, batch_count); + host_batch_vector hy_host(N, incy, batch_count); + host_batch_vector hy_device(N, incy, batch_count); + host_batch_vector hy_cpu(N, incy, batch_count); + + // device arrays + device_batch_vector dA(A_size, 1, batch_count); + device_batch_vector dx(N, incx, batch_count); + device_batch_vector dy(N, incy, batch_count); + device_vector d_alpha(1); + device_vector d_beta(1); + + CHECK_HIP_ERROR(dA.memcheck()); + CHECK_HIP_ERROR(dx.memcheck()); + CHECK_HIP_ERROR(dy.memcheck()); // Initial Data on CPU - hipError_t err_A, err_x, err_y; - srand(1); - for(int b = 0; b < batch_count; b++) - { - hA_array[b] = host_vector(A_size); - hx_array[b] = host_vector(X_size); - hy_array[b] = host_vector(Y_size); - hz_array[b] = host_vector(Y_size); - - // initialize matrices on host - srand(1); - hipblas_init(hA_array[b], N, N, lda); - hipblas_init(hx_array[b], 1, N, incx); - hipblas_init(hy_array[b], 1, N, incy); - - hz_array[b] = hy_array[b]; - err_A = hipMemcpy(bA_array[b], hA_array[b], sizeof(T) * A_size, hipMemcpyHostToDevice); - err_x = hipMemcpy(bx_array[b], hx_array[b], sizeof(T) * X_size, hipMemcpyHostToDevice); - err_y = hipMemcpy(by_array[b], hy_array[b], sizeof(T) * Y_size, hipMemcpyHostToDevice); - - if(err_A != hipSuccess || err_x != hipSuccess || err_y != hipSuccess) - { - hipblasDestroy(handle); - return HIPBLAS_STATUS_MAPPING_ERROR; - } - } + 
hipblas_init(hA, true); + hipblas_init(hx); + hipblas_init(hy); + hy_cpu.copy_from(hy); - err_A = hipMemcpy(dA_array, bA_array, batch_count * sizeof(T*), hipMemcpyHostToDevice); - err_x = hipMemcpy(dx_array, bx_array, batch_count * sizeof(T*), hipMemcpyHostToDevice); - err_y = hipMemcpy(dy_array, by_array, batch_count * sizeof(T*), hipMemcpyHostToDevice); - if(err_A != hipSuccess || err_x != hipSuccess || err_y != hipSuccess) - { - hipblasDestroy(handle); - return HIPBLAS_STATUS_MAPPING_ERROR; - } + CHECK_HIP_ERROR(dA.transfer_from(hA)); + CHECK_HIP_ERROR(dx.transfer_from(hx)); + CHECK_HIP_ERROR(dy.transfer_from(hy)); + CHECK_HIP_ERROR(hipMemcpy(d_alpha, &h_alpha, sizeof(T), hipMemcpyHostToDevice)); + CHECK_HIP_ERROR(hipMemcpy(d_beta, &h_beta, sizeof(T), hipMemcpyHostToDevice)); /* ===================================================================== - ROCBLAS + HIPBLAS =================================================================== */ - for(int iter = 0; iter < 1; iter++) - { - status = hipblasHemvBatchedFn(handle, - uplo, - N, - (T*)&alpha, - dA_array, - lda, - dx_array, - incx, - (T*)&beta, - dy_array, - incy, - batch_count); - - if(status != HIPBLAS_STATUS_SUCCESS) - { - // here in cuda - hipblasDestroy(handle); - return status; - } - } - - // copy output from device to CPU - for(int b = 0; b < batch_count; b++) - { - hipMemcpy(hy_array[b], by_array[b], sizeof(T) * Y_size, hipMemcpyDeviceToHost); - } - - if(argus.unit_check) + CHECK_HIPBLAS_ERROR(hipblasSetPointerMode(handle, HIPBLAS_POINTER_MODE_HOST)); + CHECK_HIPBLAS_ERROR(hipblasHemvBatchedFn(handle, + uplo, + N, + (T*)&h_alpha, + dA.ptr_on_device(), + lda, + dx.ptr_on_device(), + incx, + (T*)&h_beta, + dy.ptr_on_device(), + incy, + batch_count)); + + CHECK_HIP_ERROR(hy_host.transfer_from(dy)); + CHECK_HIP_ERROR(dy.transfer_from(hy)); + + CHECK_HIPBLAS_ERROR(hipblasSetPointerMode(handle, HIPBLAS_POINTER_MODE_DEVICE)); + CHECK_HIPBLAS_ERROR(hipblasHemvBatchedFn(handle, + uplo, + N, + d_alpha, + dA.ptr_on_device(), + lda, + dx.ptr_on_device(), + incx, + d_beta, + dy.ptr_on_device(), + incy, + batch_count)); + + CHECK_HIP_ERROR(hy_device.transfer_from(dy)); + + if(argus.unit_check || argus.norm_check) { /* ===================================================================== CPU BLAS @@ -160,18 +129,62 @@ hipblasStatus_t testing_hemvBatched(const Arguments& argus) for(int b = 0; b < batch_count; b++) { - cblas_hemv( - uplo, N, alpha, hA_array[b], lda, hx_array[b], incx, beta, hz_array[b], incy); + cblas_hemv(uplo, N, h_alpha, hA[b], lda, hx[b], incx, h_beta, hy_cpu[b], incy); } // enable unit check, notice unit check is not invasive, but norm check is, // unit check and norm check can not be interchanged their order if(argus.unit_check) { - unit_check_general(1, Y_size, batch_count, incy, hz_array, hy_array); + unit_check_general(1, Y_size, batch_count, incy, hy_cpu, hy_host); + unit_check_general(1, Y_size, batch_count, incy, hy_cpu, hy_device); + } + if(argus.norm_check) + { + hipblas_error_host + = norm_check_general('F', 1, N, incy, hy_cpu, hy_host, batch_count); + hipblas_error_device + = norm_check_general('F', 1, N, incy, hy_cpu, hy_device, batch_count); + } + } + + if(argus.timing) + { + CHECK_HIP_ERROR(dy.transfer_from(hy)); + hipStream_t stream; + CHECK_HIPBLAS_ERROR(hipblasGetStream(handle, &stream)); + CHECK_HIPBLAS_ERROR(hipblasSetPointerMode(handle, HIPBLAS_POINTER_MODE_DEVICE)); + + int runs = argus.cold_iters + argus.iters; + for(int iter = 0; iter < runs; iter++) + { + if(iter == argus.cold_iters) + gpu_time_used 
= get_time_us_sync(stream); + + CHECK_HIPBLAS_ERROR(hipblasHemvBatchedFn(handle, + uplo, + N, + d_alpha, + dA.ptr_on_device(), + lda, + dx.ptr_on_device(), + incx, + d_beta, + dy.ptr_on_device(), + incy, + batch_count)); } + gpu_time_used = get_time_us_sync(stream) - gpu_time_used; + + ArgumentModel{}.log_args( + std::cout, + argus, + gpu_time_used, + hemv_gflop_count(N), + hemv_gbyte_count(N), + hipblas_error_host, + hipblas_error_device); } - hipblasDestroy(handle); return HIPBLAS_STATUS_SUCCESS; } diff --git a/clients/include/testing_hemv_strided_batched.hpp b/clients/include/testing_hemv_strided_batched.hpp index 44d01614a..67c576432 100644 --- a/clients/include/testing_hemv_strided_batched.hpp +++ b/clients/include/testing_hemv_strided_batched.hpp @@ -1,5 +1,5 @@ /* ************************************************************************ - * Copyright 2016-2020 Advanced Micro Devices, Inc. + * Copyright 2016-2021 Advanced Micro Devices, Inc. * * ************************************************************************ */ @@ -15,7 +15,7 @@ using namespace std; /* ============================================================================================ */ template -hipblasStatus_t testing_hemvStridedBatched(const Arguments& argus) +hipblasStatus_t testing_hemv_strided_batched(const Arguments& argus) { bool FORTRAN = argus.fortran; auto hipblasHemvStridedBatchedFn @@ -42,8 +42,6 @@ hipblasStatus_t testing_hemvStridedBatched(const Arguments& argus) X_size = stride_x * batch_count; Y_size = stride_y * batch_count; - hipblasStatus_t status = HIPBLAS_STATUS_SUCCESS; - // argument sanity check, quick return if input parameters are invalid before allocating invalid // memory if(N < 0 || lda < N || incx <= 0 || incy <= 0 || batch_count < 0) @@ -55,21 +53,22 @@ hipblasStatus_t testing_hemvStridedBatched(const Arguments& argus) host_vector hA(A_size); host_vector hx(X_size); host_vector hy(Y_size); - host_vector hz(Y_size); + host_vector hy_cpu(Y_size); + host_vector hy_host(Y_size); + host_vector hy_device(Y_size); device_vector dA(A_size); device_vector dx(X_size); device_vector dy(Y_size); + device_vector d_alpha(1); + device_vector d_beta(1); - double gpu_time_used, cpu_time_used; - double hipblasGflops, cblas_gflops, hipblasBandwidth; - double rocblas_error; + double gpu_time_used, hipblas_error_host, hipblas_error_device; - T alpha = (T)argus.alpha; - T beta = (T)argus.beta; + T h_alpha = argus.get_alpha(); + T h_beta = argus.get_beta(); - hipblasHandle_t handle; - hipblasCreate(&handle); + hipblasLocalHandle handle(argus); // Initial Data on CPU srand(1); @@ -77,47 +76,59 @@ hipblasStatus_t testing_hemvStridedBatched(const Arguments& argus) hipblas_init(hx, 1, N, incx, stride_x, batch_count); hipblas_init(hy, 1, N, incy, stride_y, batch_count); - // copy vector is easy in STL; hz = hy: save a copy in hz which will be output of CPU BLAS - hz = hy; + // copy vector is easy in STL; hy_cpu = hy: save a copy in hy_cpu which will be output of CPU BLAS + hy_cpu = hy; // copy data from CPU to device - hipMemcpy(dA, hA.data(), sizeof(T) * A_size, hipMemcpyHostToDevice); - hipMemcpy(dx, hx.data(), sizeof(T) * X_size, hipMemcpyHostToDevice); - hipMemcpy(dy, hy.data(), sizeof(T) * Y_size, hipMemcpyHostToDevice); + CHECK_HIP_ERROR(hipMemcpy(dA, hA.data(), sizeof(T) * A_size, hipMemcpyHostToDevice)); + CHECK_HIP_ERROR(hipMemcpy(dx, hx.data(), sizeof(T) * X_size, hipMemcpyHostToDevice)); + CHECK_HIP_ERROR(hipMemcpy(dy, hy.data(), sizeof(T) * Y_size, hipMemcpyHostToDevice)); + 
CHECK_HIP_ERROR(hipMemcpy(d_alpha, &h_alpha, sizeof(T), hipMemcpyHostToDevice)); + CHECK_HIP_ERROR(hipMemcpy(d_beta, &h_beta, sizeof(T), hipMemcpyHostToDevice)); /* ===================================================================== - ROCBLAS + HIPBLAS =================================================================== */ - for(int iter = 0; iter < 1; iter++) - { - status = hipblasHemvStridedBatchedFn(handle, - uplo, - N, - (T*)&alpha, - dA, - lda, - stride_A, - dx, - incx, - stride_x, - (T*)&beta, - dy, - incy, - stride_y, - batch_count); - - if(status != HIPBLAS_STATUS_SUCCESS) - { - // here in cuda - hipblasDestroy(handle); - return status; - } - } - - // copy output from device to CPU - hipMemcpy(hy.data(), dy, sizeof(T) * Y_size, hipMemcpyDeviceToHost); - - if(argus.unit_check) + CHECK_HIPBLAS_ERROR(hipblasSetPointerMode(handle, HIPBLAS_POINTER_MODE_HOST)); + CHECK_HIPBLAS_ERROR(hipblasHemvStridedBatchedFn(handle, + uplo, + N, + (T*)&h_alpha, + dA, + lda, + stride_A, + dx, + incx, + stride_x, + (T*)&h_beta, + dy, + incy, + stride_y, + batch_count)); + + CHECK_HIP_ERROR(hipMemcpy(hy_host.data(), dy, sizeof(T) * Y_size, hipMemcpyDeviceToHost)); + CHECK_HIP_ERROR(hipMemcpy(dy, hy.data(), sizeof(T) * Y_size, hipMemcpyHostToDevice)); + + CHECK_HIPBLAS_ERROR(hipblasSetPointerMode(handle, HIPBLAS_POINTER_MODE_DEVICE)); + CHECK_HIPBLAS_ERROR(hipblasHemvStridedBatchedFn(handle, + uplo, + N, + d_alpha, + dA, + lda, + stride_A, + dx, + incx, + stride_x, + d_beta, + dy, + incy, + stride_y, + batch_count)); + + CHECK_HIP_ERROR(hipMemcpy(hy_device.data(), dy, sizeof(T) * Y_size, hipMemcpyDeviceToHost)); + + if(argus.unit_check || argus.norm_check) { /* ===================================================================== CPU BLAS @@ -127,13 +138,13 @@ hipblasStatus_t testing_hemvStridedBatched(const Arguments& argus) { cblas_hemv(uplo, N, - alpha, + h_alpha, hA.data() + b * stride_A, lda, hx.data() + b * stride_x, incx, - beta, - hz.data() + b * stride_y, + h_beta, + hy_cpu.data() + b * stride_y, incy); } @@ -141,10 +152,67 @@ hipblasStatus_t testing_hemvStridedBatched(const Arguments& argus) // unit check and norm check can not be interchanged their order if(argus.unit_check) { - unit_check_general(1, N, batch_count, incy, stride_y, hz, hy); + unit_check_general(1, N, batch_count, incy, stride_y, hy_cpu, hy_host); + unit_check_general(1, N, batch_count, incy, stride_y, hy_cpu, hy_device); + } + if(argus.norm_check) + { + hipblas_error_host + = norm_check_general('F', 1, N, incy, stride_y, hy_cpu, hy_host, batch_count); + hipblas_error_device + = norm_check_general('F', 1, N, incy, stride_y, hy_cpu, hy_device, batch_count); + } + } + + if(argus.timing) + { + CHECK_HIP_ERROR(hipMemcpy(dy, hy.data(), sizeof(T) * Y_size, hipMemcpyHostToDevice)); + hipStream_t stream; + CHECK_HIPBLAS_ERROR(hipblasGetStream(handle, &stream)); + CHECK_HIPBLAS_ERROR(hipblasSetPointerMode(handle, HIPBLAS_POINTER_MODE_DEVICE)); + + int runs = argus.cold_iters + argus.iters; + for(int iter = 0; iter < runs; iter++) + { + if(iter == argus.cold_iters) + gpu_time_used = get_time_us_sync(stream); + + CHECK_HIPBLAS_ERROR(hipblasHemvStridedBatchedFn(handle, + uplo, + N, + d_alpha, + dA, + lda, + stride_A, + dx, + incx, + stride_x, + d_beta, + dy, + incy, + stride_y, + batch_count)); } + gpu_time_used = get_time_us_sync(stream) - gpu_time_used; + + ArgumentModel{} + .log_args(std::cout, + argus, + gpu_time_used, + hemv_gflop_count(N), + hemv_gbyte_count(N), + hipblas_error_host, + hipblas_error_device); } - 
hipblasDestroy(handle); return HIPBLAS_STATUS_SUCCESS; } diff --git a/clients/include/testing_her.hpp b/clients/include/testing_her.hpp index 9a7fa1e60..16218d780 100644 --- a/clients/include/testing_her.hpp +++ b/clients/include/testing_her.hpp @@ -1,5 +1,5 @@ /* ************************************************************************ - * Copyright 2016-2020 Advanced Micro Devices, Inc. + * Copyright 2016-2021 Advanced Micro Devices, Inc. * * ************************************************************************ */ @@ -14,9 +14,10 @@ using namespace std; /* ============================================================================================ */ -template +template hipblasStatus_t testing_her(const Arguments& argus) { + using U = real_t; bool FORTRAN = argus.fortran; auto hipblasHerFn = FORTRAN ? hipblasHer : hipblasHer; @@ -27,8 +28,6 @@ hipblasStatus_t testing_her(const Arguments& argus) int A_size = lda * N; hipblasFillMode_t uplo = char2hipblas_fill(argus.uplo_option); - hipblasStatus_t status = HIPBLAS_STATUS_SUCCESS; - // argument sanity check, quick return if input parameters are invalid before allocating invalid // memory if(N < 0 || lda < N || incx == 0) @@ -38,71 +37,94 @@ hipblasStatus_t testing_her(const Arguments& argus) // Naming: dK is in GPU (device) memory. hK is in CPU (host) memory host_vector hA(A_size); - host_vector hB(A_size); + host_vector hA_cpu(A_size); + host_vector hA_host(A_size); + host_vector hA_device(A_size); host_vector hx(N * incx); device_vector dA(A_size); device_vector dx(N * incx); + device_vector d_alpha(1); - double gpu_time_used, cpu_time_used; - double hipblasGflops, cblas_gflops, hipblasBandwidth; - double rocblas_error; + double gpu_time_used, hipblas_error_host, hipblas_error_device; - U alpha = argus.get_alpha(); + U h_alpha = argus.get_alpha(); - hipblasHandle_t handle; - hipblasCreate(&handle); + hipblasLocalHandle handle(argus); // Initial Data on CPU srand(1); hipblas_init(hA, N, N, lda); hipblas_init(hx, 1, N, incx); - // copy matrix is easy in STL; hB = hA: save a copy in hB which will be output of CPU BLAS - hB = hA; + // copy matrix is easy in STL; hA_cpu = hA: save a copy in hA_cpu which will be output of CPU BLAS + hA_cpu = hA; // copy data from CPU to device - hipMemcpy(dA, hA.data(), sizeof(T) * lda * N, hipMemcpyHostToDevice); - hipMemcpy(dx, hx.data(), sizeof(T) * N * incx, hipMemcpyHostToDevice); + CHECK_HIP_ERROR(hipMemcpy(dA, hA.data(), sizeof(T) * lda * N, hipMemcpyHostToDevice)); + CHECK_HIP_ERROR(hipMemcpy(dx, hx.data(), sizeof(T) * N * incx, hipMemcpyHostToDevice)); + CHECK_HIP_ERROR(hipMemcpy(d_alpha, &h_alpha, sizeof(U), hipMemcpyHostToDevice)); /* ===================================================================== - ROCBLAS + HIPBLAS =================================================================== */ - if(argus.timing) - { - gpu_time_used = get_time_us(); // in microseconds - } - - for(int iter = 0; iter < 1; iter++) - { + CHECK_HIPBLAS_ERROR(hipblasSetPointerMode(handle, HIPBLAS_POINTER_MODE_HOST)); + CHECK_HIPBLAS_ERROR(hipblasHerFn(handle, uplo, N, (U*)&h_alpha, dx, incx, dA, lda)); - status = hipblasHerFn(handle, uplo, N, (U*)&alpha, dx, incx, dA, lda); + CHECK_HIP_ERROR(hipMemcpy(hA_host.data(), dA, sizeof(T) * N * lda, hipMemcpyDeviceToHost)); + CHECK_HIP_ERROR(hipMemcpy(dA, hA.data(), sizeof(T) * N * lda, hipMemcpyHostToDevice)); - if(status != HIPBLAS_STATUS_SUCCESS) - { - hipblasDestroy(handle); - return status; - } - } + CHECK_HIPBLAS_ERROR(hipblasSetPointerMode(handle, 
HIPBLAS_POINTER_MODE_DEVICE)); + CHECK_HIPBLAS_ERROR(hipblasHerFn(handle, uplo, N, d_alpha, dx, incx, dA, lda)); - // copy output from device to CPU - hipMemcpy(hA.data(), dA, sizeof(T) * N * lda, hipMemcpyDeviceToHost); + CHECK_HIP_ERROR(hipMemcpy(hA_device.data(), dA, sizeof(T) * N * lda, hipMemcpyDeviceToHost)); - if(argus.unit_check) + if(argus.unit_check || argus.norm_check) { /* ===================================================================== CPU BLAS =================================================================== */ - cblas_her(uplo, N, alpha, hx.data(), incx, hB.data(), lda); + cblas_her(uplo, N, h_alpha, hx.data(), incx, hA_cpu.data(), lda); // enable unit check, notice unit check is not invasive, but norm check is, // unit check and norm check can not be interchanged their order if(argus.unit_check) { - unit_check_general(N, N, lda, hB.data(), hA.data()); + unit_check_general(N, N, lda, hA_cpu.data(), hA_host.data()); + unit_check_general(N, N, lda, hA_cpu.data(), hA_device.data()); + } + if(argus.norm_check) + { + hipblas_error_host = norm_check_general('F', N, N, lda, hA_cpu, hA_host); + hipblas_error_device = norm_check_general('F', N, N, lda, hA_cpu, hA_device); + } + } + + if(argus.timing) + { + CHECK_HIP_ERROR(hipMemcpy(dA, hA.data(), sizeof(T) * lda * N, hipMemcpyHostToDevice)); + hipStream_t stream; + CHECK_HIPBLAS_ERROR(hipblasGetStream(handle, &stream)); + CHECK_HIPBLAS_ERROR(hipblasSetPointerMode(handle, HIPBLAS_POINTER_MODE_DEVICE)); + + int runs = argus.cold_iters + argus.iters; + for(int iter = 0; iter < runs; iter++) + { + if(iter == argus.cold_iters) + gpu_time_used = get_time_us_sync(stream); + + CHECK_HIPBLAS_ERROR(hipblasHerFn(handle, uplo, N, d_alpha, dx, incx, dA, lda)); } + gpu_time_used = get_time_us_sync(stream) - gpu_time_used; + + ArgumentModel{}.log_args(std::cout, + argus, + gpu_time_used, + her_gflop_count(N), + her_gbyte_count(N), + hipblas_error_host, + hipblas_error_device); } - hipblasDestroy(handle); return HIPBLAS_STATUS_SUCCESS; } diff --git a/clients/include/testing_her2.hpp b/clients/include/testing_her2.hpp index f5a867138..64c3d2329 100644 --- a/clients/include/testing_her2.hpp +++ b/clients/include/testing_her2.hpp @@ -1,5 +1,5 @@ /* ************************************************************************ - * Copyright 2016-2020 Advanced Micro Devices, Inc. + * Copyright 2016-2021 Advanced Micro Devices, Inc. * * ************************************************************************ */ @@ -28,8 +28,6 @@ hipblasStatus_t testing_her2(const Arguments& argus) int A_size = lda * N; hipblasFillMode_t uplo = char2hipblas_fill(argus.uplo_option); - hipblasStatus_t status = HIPBLAS_STATUS_SUCCESS; - // argument sanity check, quick return if input parameters are invalid before allocating invalid // memory if(N < 0 || lda < N || incx == 0 || incy == 0) @@ -39,22 +37,22 @@ hipblasStatus_t testing_her2(const Arguments& argus) // Naming: dK is in GPU (device) memory. 
hK is in CPU (host) memory host_vector hA(A_size); - host_vector hB(A_size); + host_vector hA_cpu(A_size); + host_vector hA_host(A_size); + host_vector hA_device(A_size); host_vector hx(N * incx); host_vector hy(N * incy); device_vector dA(A_size); device_vector dx(N * incx); device_vector dy(N * incy); + device_vector d_alpha(1); - double gpu_time_used, cpu_time_used; - double hipblasGflops, cblas_gflops, hipblasBandwidth; - double rocblas_error; + double gpu_time_used, hipblas_error_host, hipblas_error_device; - T alpha = argus.get_alpha(); + T h_alpha = argus.get_alpha(); - hipblasHandle_t handle; - hipblasCreate(&handle); + hipblasLocalHandle handle(argus); // Initial Data on CPU srand(1); @@ -62,52 +60,76 @@ hipblasStatus_t testing_her2(const Arguments& argus) hipblas_init(hx, 1, N, incx); hipblas_init(hy, 1, N, incy); - // copy matrix is easy in STL; hB = hA: save a copy in hB which will be output of CPU BLAS - hB = hA; + // copy matrix is easy in STL; hA_cpu = hA: save a copy in hA_cpu which will be output of CPU BLAS + hA_cpu = hA; // copy data from CPU to device - hipMemcpy(dA, hA.data(), sizeof(T) * lda * N, hipMemcpyHostToDevice); - hipMemcpy(dx, hx.data(), sizeof(T) * N * incx, hipMemcpyHostToDevice); - hipMemcpy(dy, hy.data(), sizeof(T) * N * incy, hipMemcpyHostToDevice); + CHECK_HIP_ERROR(hipMemcpy(dA, hA.data(), sizeof(T) * lda * N, hipMemcpyHostToDevice)); + CHECK_HIP_ERROR(hipMemcpy(dx, hx.data(), sizeof(T) * N * incx, hipMemcpyHostToDevice)); + CHECK_HIP_ERROR(hipMemcpy(dy, hy.data(), sizeof(T) * N * incy, hipMemcpyHostToDevice)); + CHECK_HIP_ERROR(hipMemcpy(d_alpha, &h_alpha, sizeof(T), hipMemcpyHostToDevice)); /* ===================================================================== - ROCBLAS + HIPBLAS =================================================================== */ - if(argus.timing) - { - gpu_time_used = get_time_us(); // in microseconds - } - - for(int iter = 0; iter < 1; iter++) - { + CHECK_HIPBLAS_ERROR(hipblasSetPointerMode(handle, HIPBLAS_POINTER_MODE_HOST)); + CHECK_HIPBLAS_ERROR(hipblasHer2Fn(handle, uplo, N, (T*)&h_alpha, dx, incx, dy, incy, dA, lda)); - status = hipblasHer2Fn(handle, uplo, N, (T*)&alpha, dx, incx, dy, incy, dA, lda); + CHECK_HIP_ERROR(hipMemcpy(hA_host.data(), dA, sizeof(T) * A_size, hipMemcpyDeviceToHost)); + CHECK_HIP_ERROR(hipMemcpy(dA, hA.data(), sizeof(T) * A_size, hipMemcpyHostToDevice)); - if(status != HIPBLAS_STATUS_SUCCESS) - { - hipblasDestroy(handle); - return status; - } - } + CHECK_HIPBLAS_ERROR(hipblasSetPointerMode(handle, HIPBLAS_POINTER_MODE_DEVICE)); + CHECK_HIPBLAS_ERROR(hipblasHer2Fn(handle, uplo, N, d_alpha, dx, incx, dy, incy, dA, lda)); - // copy output from device to CPU - hipMemcpy(hA.data(), dA, sizeof(T) * N * lda, hipMemcpyDeviceToHost); + CHECK_HIP_ERROR(hipMemcpy(hA_device.data(), dA, sizeof(T) * A_size, hipMemcpyDeviceToHost)); - if(argus.unit_check) + if(argus.unit_check || argus.norm_check) { /* ===================================================================== CPU BLAS =================================================================== */ - cblas_her2(uplo, N, alpha, hx.data(), incx, hy.data(), incy, hB.data(), lda); + cblas_her2(uplo, N, h_alpha, hx.data(), incx, hy.data(), incy, hA_cpu.data(), lda); // enable unit check, notice unit check is not invasive, but norm check is, // unit check and norm check can not be interchanged their order if(argus.unit_check) { - unit_check_general(N, N, lda, hB.data(), hA.data()); + unit_check_general(N, N, lda, hA_cpu.data(), hA_host.data()); + 
unit_check_general(N, N, lda, hA_cpu.data(), hA_device.data()); + } + if(argus.norm_check) + { + hipblas_error_host = norm_check_general('F', N, N, lda, hA_cpu, hA_host); + hipblas_error_device = norm_check_general('F', N, N, lda, hA_cpu, hA_device); + } + } + + if(argus.timing) + { + CHECK_HIP_ERROR(hipMemcpy(dA, hA.data(), sizeof(T) * lda * N, hipMemcpyHostToDevice)); + hipStream_t stream; + CHECK_HIPBLAS_ERROR(hipblasGetStream(handle, &stream)); + CHECK_HIPBLAS_ERROR(hipblasSetPointerMode(handle, HIPBLAS_POINTER_MODE_DEVICE)); + + int runs = argus.cold_iters + argus.iters; + for(int iter = 0; iter < runs; iter++) + { + if(iter == argus.cold_iters) + gpu_time_used = get_time_us_sync(stream); + + CHECK_HIPBLAS_ERROR( + hipblasHer2Fn(handle, uplo, N, d_alpha, dx, incx, dy, incy, dA, lda)); } + gpu_time_used = get_time_us_sync(stream) - gpu_time_used; + + ArgumentModel{}.log_args(std::cout, + argus, + gpu_time_used, + her2_gflop_count(N), + her2_gbyte_count(N), + hipblas_error_host, + hipblas_error_device); } - hipblasDestroy(handle); return HIPBLAS_STATUS_SUCCESS; } diff --git a/clients/include/testing_her2_batched.hpp b/clients/include/testing_her2_batched.hpp index 515491723..e7520eefe 100644 --- a/clients/include/testing_her2_batched.hpp +++ b/clients/include/testing_her2_batched.hpp @@ -1,5 +1,5 @@ /* ************************************************************************ - * Copyright 2016-2020 Advanced Micro Devices, Inc. + * Copyright 2016-2021 Advanced Micro Devices, Inc. * * ************************************************************************ */ @@ -32,13 +32,9 @@ hipblasStatus_t testing_her2_batched(const Arguments& argus) int y_size = N * incy; hipblasFillMode_t uplo = char2hipblas_fill(argus.uplo_option); - double gpu_time_used, cpu_time_used; - double hipblasGflops, cblas_gflops, hipblasBandwidth; - double rocblas_error; + double gpu_time_used, hipblas_error_host, hipblas_error_device; - T alpha = argus.get_alpha(); - - hipblasStatus_t status = HIPBLAS_STATUS_SUCCESS; + T h_alpha = argus.get_alpha(); // argument sanity check, quick return if input parameters are invalid before allocating invalid // memory @@ -51,97 +47,132 @@ hipblasStatus_t testing_her2_batched(const Arguments& argus) return HIPBLAS_STATUS_SUCCESS; } - hipblasHandle_t handle; - hipblasCreate(&handle); + hipblasLocalHandle handle(argus); // Naming: dK is in GPU (device) memory. 
hK is in CPU (host) memory - host_vector hA[batch_count]; - host_vector hB[batch_count]; - host_vector hx[batch_count]; - host_vector hy[batch_count]; - - device_batch_vector bA(batch_count, A_size); - device_batch_vector bx(batch_count, x_size); - device_batch_vector by(batch_count, y_size); - - device_vector dA(batch_count); - device_vector dx(batch_count); - device_vector dy(batch_count); - - int last = batch_count - 1; - if(!dA || !dx || !dy || (!bA[last] && A_size) || (!bx[last] && x_size) || (!by[last] && y_size)) - { - hipblasDestroy(handle); - return HIPBLAS_STATUS_ALLOC_FAILED; - } + host_batch_vector hA(A_size, 1, batch_count); + host_batch_vector hA_cpu(A_size, 1, batch_count); + host_batch_vector hA_host(A_size, 1, batch_count); + host_batch_vector hA_device(A_size, 1, batch_count); + host_batch_vector hx(N, incx, batch_count); + host_batch_vector hy(N, incy, batch_count); + + device_batch_vector dA(A_size, 1, batch_count); + device_batch_vector dx(N, incx, batch_count); + device_batch_vector dy(N, incy, batch_count); + device_vector d_alpha(1); + + CHECK_HIP_ERROR(dA.memcheck()); + CHECK_HIP_ERROR(dx.memcheck()); + CHECK_HIP_ERROR(dy.memcheck()); // Initial Data on CPU - srand(1); - for(int b = 0; b < batch_count; b++) - { - hA[b] = host_vector(A_size); - hB[b] = host_vector(A_size); - hx[b] = host_vector(x_size); - hy[b] = host_vector(y_size); - - srand(1); - hipblas_init(hA[b], N, N, lda); - hipblas_init(hx[b], 1, N, incx); - hipblas_init(hy[b], 1, N, incy); - hB[b] = hA[b]; - - CHECK_HIP_ERROR(hipMemcpy(bA[b], hA[b], sizeof(T) * A_size, hipMemcpyHostToDevice)); - CHECK_HIP_ERROR(hipMemcpy(bx[b], hx[b], sizeof(T) * x_size, hipMemcpyHostToDevice)); - CHECK_HIP_ERROR(hipMemcpy(by[b], hy[b], sizeof(T) * y_size, hipMemcpyHostToDevice)); - } - CHECK_HIP_ERROR(hipMemcpy(dA, bA, sizeof(T*) * batch_count, hipMemcpyHostToDevice)); - CHECK_HIP_ERROR(hipMemcpy(dx, bx, sizeof(T*) * batch_count, hipMemcpyHostToDevice)); - CHECK_HIP_ERROR(hipMemcpy(dy, by, sizeof(T*) * batch_count, hipMemcpyHostToDevice)); + hipblas_init(hA, true); + hipblas_init(hx); + hipblas_init(hy); + + hA_cpu.copy_from(hA); + CHECK_HIP_ERROR(dA.transfer_from(hA)); + CHECK_HIP_ERROR(dx.transfer_from(hx)); + CHECK_HIP_ERROR(dy.transfer_from(hy)); + CHECK_HIP_ERROR(hipMemcpy(d_alpha, &h_alpha, sizeof(T), hipMemcpyHostToDevice)); /* ===================================================================== - ROCBLAS + HIPBLAS =================================================================== */ - if(argus.timing) - { - gpu_time_used = get_time_us(); // in microseconds - } - - for(int iter = 0; iter < 1; iter++) - { - status = hipblasHer2BatchedFn( - handle, uplo, N, (T*)&alpha, dx, incx, dy, incy, dA, lda, batch_count); - - if(status != HIPBLAS_STATUS_SUCCESS) - { - hipblasDestroy(handle); - return status; - } - } - - // copy output from device to CPU - for(int b = 0; b < batch_count; b++) - { - hipMemcpy(hA[b], bA[b], sizeof(T) * A_size, hipMemcpyDeviceToHost); - } - - if(argus.unit_check) + CHECK_HIPBLAS_ERROR(hipblasSetPointerMode(handle, HIPBLAS_POINTER_MODE_HOST)); + CHECK_HIPBLAS_ERROR(hipblasHer2BatchedFn(handle, + uplo, + N, + (T*)&h_alpha, + dx.ptr_on_device(), + incx, + dy.ptr_on_device(), + incy, + dA.ptr_on_device(), + lda, + batch_count)); + + CHECK_HIP_ERROR(hA_host.transfer_from(dA)); + CHECK_HIP_ERROR(dA.transfer_from(hA)); + + CHECK_HIPBLAS_ERROR(hipblasSetPointerMode(handle, HIPBLAS_POINTER_MODE_DEVICE)); + CHECK_HIPBLAS_ERROR(hipblasHer2BatchedFn(handle, + uplo, + N, + d_alpha, + dx.ptr_on_device(), + 
incx, + dy.ptr_on_device(), + incy, + dA.ptr_on_device(), + lda, + batch_count)); + + CHECK_HIP_ERROR(hA_device.transfer_from(dA)); + + if(argus.unit_check || argus.norm_check) { /* ===================================================================== CPU BLAS =================================================================== */ for(int b = 0; b < batch_count; b++) { - cblas_her2(uplo, N, alpha, hx[b], incx, hy[b], incy, hB[b], lda); + cblas_her2(uplo, N, h_alpha, hx[b], incx, hy[b], incy, hA_cpu[b], lda); } // enable unit check, notice unit check is not invasive, but norm check is, // unit check and norm check can not be interchanged their order if(argus.unit_check) { - unit_check_general(N, N, batch_count, lda, hB, hA); + unit_check_general(N, N, batch_count, lda, hA_cpu, hA_host); + unit_check_general(N, N, batch_count, lda, hA_cpu, hA_device); + } + if(argus.norm_check) + { + hipblas_error_host + = norm_check_general('F', N, N, lda, hA_cpu, hA_host, batch_count); + hipblas_error_device + = norm_check_general('F', N, N, lda, hA_cpu, hA_device, batch_count); + } + } + + if(argus.timing) + { + CHECK_HIP_ERROR(dA.transfer_from(hA)); + hipStream_t stream; + CHECK_HIPBLAS_ERROR(hipblasGetStream(handle, &stream)); + CHECK_HIPBLAS_ERROR(hipblasSetPointerMode(handle, HIPBLAS_POINTER_MODE_DEVICE)); + + int runs = argus.cold_iters + argus.iters; + for(int iter = 0; iter < runs; iter++) + { + if(iter == argus.cold_iters) + gpu_time_used = get_time_us_sync(stream); + + CHECK_HIPBLAS_ERROR(hipblasHer2BatchedFn(handle, + uplo, + N, + d_alpha, + dx.ptr_on_device(), + incx, + dy.ptr_on_device(), + incy, + dA.ptr_on_device(), + lda, + batch_count)); } + gpu_time_used = get_time_us_sync(stream) - gpu_time_used; + + ArgumentModel{}.log_args( + std::cout, + argus, + gpu_time_used, + her2_gflop_count(N), + her2_gbyte_count(N), + hipblas_error_host, + hipblas_error_device); } - hipblasDestroy(handle); return HIPBLAS_STATUS_SUCCESS; } diff --git a/clients/include/testing_her2_strided_batched.hpp b/clients/include/testing_her2_strided_batched.hpp index a510ef8a1..f34ef185f 100644 --- a/clients/include/testing_her2_strided_batched.hpp +++ b/clients/include/testing_her2_strided_batched.hpp @@ -1,5 +1,5 @@ /* ************************************************************************ - * Copyright 2016-2020 Advanced Micro Devices, Inc. + * Copyright 2016-2021 Advanced Micro Devices, Inc. * * ************************************************************************ */ @@ -36,8 +36,6 @@ hipblasStatus_t testing_her2_strided_batched(const Arguments& argus) int y_size = stride_y * batch_count; hipblasFillMode_t uplo = char2hipblas_fill(argus.uplo_option); - hipblasStatus_t status = HIPBLAS_STATUS_SUCCESS; - // argument sanity check, quick return if input parameters are invalid before allocating invalid // memory if(N < 0 || lda < N || incx == 0 || incy == 0 || batch_count < 0) @@ -51,22 +49,22 @@ hipblasStatus_t testing_her2_strided_batched(const Arguments& argus) // Naming: dK is in GPU (device) memory. 
hK is in CPU (host) memory host_vector hA(A_size); - host_vector hB(A_size); + host_vector hA_cpu(A_size); + host_vector hA_host(A_size); + host_vector hA_device(A_size); host_vector hx(x_size); host_vector hy(y_size); device_vector dA(A_size); device_vector dx(x_size); device_vector dy(y_size); + device_vector d_alpha(1); - double gpu_time_used, cpu_time_used; - double hipblasGflops, cblas_gflops, hipblasBandwidth; - double rocblas_error; + double gpu_time_used, hipblas_error_host, hipblas_error_device; - T alpha = argus.get_alpha(); + T h_alpha = argus.get_alpha(); - hipblasHandle_t handle; - hipblasCreate(&handle); + hipblasLocalHandle handle(argus); // Initial Data on CPU srand(1); @@ -74,50 +72,56 @@ hipblasStatus_t testing_her2_strided_batched(const Arguments& argus) hipblas_init(hx, 1, N, incx, stride_x, batch_count); hipblas_init(hy, 1, N, incy, stride_y, batch_count); - // copy matrix is easy in STL; hB = hA: save a copy in hB which will be output of CPU BLAS - hB = hA; + // copy matrix is easy in STL; hA_cpu = hA: save a copy in hA_cpu which will be output of CPU BLAS + hA_cpu = hA; // copy data from CPU to device - hipMemcpy(dA, hA.data(), sizeof(T) * A_size, hipMemcpyHostToDevice); - hipMemcpy(dx, hx.data(), sizeof(T) * x_size, hipMemcpyHostToDevice); - hipMemcpy(dy, hy.data(), sizeof(T) * y_size, hipMemcpyHostToDevice); + CHECK_HIP_ERROR(hipMemcpy(dA, hA.data(), sizeof(T) * A_size, hipMemcpyHostToDevice)); + CHECK_HIP_ERROR(hipMemcpy(dx, hx.data(), sizeof(T) * x_size, hipMemcpyHostToDevice)); + CHECK_HIP_ERROR(hipMemcpy(dy, hy.data(), sizeof(T) * y_size, hipMemcpyHostToDevice)); + CHECK_HIP_ERROR(hipMemcpy(d_alpha, &h_alpha, sizeof(T), hipMemcpyHostToDevice)); /* ===================================================================== - ROCBLAS + HIPBLAS =================================================================== */ - if(argus.timing) - { - gpu_time_used = get_time_us(); // in microseconds - } - - for(int iter = 0; iter < 1; iter++) - { - status = hipblasHer2StridedBatchedFn(handle, - uplo, - N, - (T*)&alpha, - dx, - incx, - stride_x, - dy, - incy, - stride_y, - dA, - lda, - stride_A, - batch_count); - - if(status != HIPBLAS_STATUS_SUCCESS) - { - hipblasDestroy(handle); - return status; - } - } - - // copy output from device to CPU - hipMemcpy(hA.data(), dA, sizeof(T) * A_size, hipMemcpyDeviceToHost); - - if(argus.unit_check) + CHECK_HIPBLAS_ERROR(hipblasSetPointerMode(handle, HIPBLAS_POINTER_MODE_HOST)); + CHECK_HIPBLAS_ERROR(hipblasHer2StridedBatchedFn(handle, + uplo, + N, + (T*)&h_alpha, + dx, + incx, + stride_x, + dy, + incy, + stride_y, + dA, + lda, + stride_A, + batch_count)); + + CHECK_HIP_ERROR(hipMemcpy(hA_host.data(), dA, sizeof(T) * A_size, hipMemcpyDeviceToHost)); + CHECK_HIP_ERROR(hipMemcpy(dA, hA.data(), sizeof(T) * A_size, hipMemcpyHostToDevice)); + + CHECK_HIPBLAS_ERROR(hipblasSetPointerMode(handle, HIPBLAS_POINTER_MODE_DEVICE)); + CHECK_HIPBLAS_ERROR(hipblasHer2StridedBatchedFn(handle, + uplo, + N, + d_alpha, + dx, + incx, + stride_x, + dy, + incy, + stride_y, + dA, + lda, + stride_A, + batch_count)); + + CHECK_HIP_ERROR(hipMemcpy(hA_device.data(), dA, sizeof(T) * A_size, hipMemcpyDeviceToHost)); + + if(argus.unit_check || argus.norm_check) { /* ===================================================================== CPU BLAS @@ -126,12 +130,12 @@ hipblasStatus_t testing_her2_strided_batched(const Arguments& argus) { cblas_her2(uplo, N, - alpha, + h_alpha, hx.data() + b * stride_x, incx, hy.data() + b * stride_y, incy, - hB.data() + b * stride_A, + 
hA_cpu.data() + b * stride_A, lda); } @@ -139,10 +143,66 @@ hipblasStatus_t testing_her2_strided_batched(const Arguments& argus) // unit check and norm check can not be interchanged their order if(argus.unit_check) { - unit_check_general(N, N, batch_count, lda, stride_A, hB.data(), hA.data()); + unit_check_general(N, N, batch_count, lda, stride_A, hA_cpu.data(), hA_host.data()); + unit_check_general( + N, N, batch_count, lda, stride_A, hA_cpu.data(), hA_device.data()); + } + if(argus.norm_check) + { + hipblas_error_host = norm_check_general( + 'F', N, N, lda, stride_A, hA_cpu.data(), hA_host.data(), batch_count); + hipblas_error_device = norm_check_general( + 'F', N, N, lda, stride_A, hA_cpu.data(), hA_device.data(), batch_count); + } + } + + if(argus.timing) + { + CHECK_HIP_ERROR(hipMemcpy(dA, hA.data(), sizeof(T) * A_size, hipMemcpyHostToDevice)); + hipStream_t stream; + CHECK_HIPBLAS_ERROR(hipblasGetStream(handle, &stream)); + CHECK_HIPBLAS_ERROR(hipblasSetPointerMode(handle, HIPBLAS_POINTER_MODE_DEVICE)); + + int runs = argus.cold_iters + argus.iters; + for(int iter = 0; iter < runs; iter++) + { + if(iter == argus.cold_iters) + gpu_time_used = get_time_us_sync(stream); + + CHECK_HIPBLAS_ERROR(hipblasHer2StridedBatchedFn(handle, + uplo, + N, + d_alpha, + dx, + incx, + stride_x, + dy, + incy, + stride_y, + dA, + lda, + stride_A, + batch_count)); } + gpu_time_used = get_time_us_sync(stream) - gpu_time_used; + + ArgumentModel{} + .log_args(std::cout, + argus, + gpu_time_used, + her2_gflop_count(N), + her2_gbyte_count(N), + hipblas_error_host, + hipblas_error_device); } - hipblasDestroy(handle); return HIPBLAS_STATUS_SUCCESS; } diff --git a/clients/include/testing_her_batched.hpp b/clients/include/testing_her_batched.hpp index 066f6009c..40cc061fe 100644 --- a/clients/include/testing_her_batched.hpp +++ b/clients/include/testing_her_batched.hpp @@ -1,5 +1,5 @@ /* ************************************************************************ - * Copyright 2016-2020 Advanced Micro Devices, Inc. + * Copyright 2016-2021 Advanced Micro Devices, Inc. * * ************************************************************************ */ @@ -14,9 +14,10 @@ using namespace std; /* ============================================================================================ */ -template +template hipblasStatus_t testing_her_batched(const Arguments& argus) { + using U = real_t; bool FORTRAN = argus.fortran; auto hipblasHerBatchedFn = FORTRAN ? hipblasHerBatched : hipblasHerBatched; @@ -30,13 +31,9 @@ hipblasStatus_t testing_her_batched(const Arguments& argus) int x_size = N * incx; hipblasFillMode_t uplo = char2hipblas_fill(argus.uplo_option); - double gpu_time_used, cpu_time_used; - double hipblasGflops, cblas_gflops, hipblasBandwidth; - double rocblas_error; + double gpu_time_used, hipblas_error_host, hipblas_error_device; - U alpha = argus.get_alpha(); - - hipblasStatus_t status = HIPBLAS_STATUS_SUCCESS; + U h_alpha = argus.get_alpha(); // argument sanity check, quick return if input parameters are invalid before allocating invalid // memory @@ -49,89 +46,114 @@ hipblasStatus_t testing_her_batched(const Arguments& argus) return HIPBLAS_STATUS_SUCCESS; } - hipblasHandle_t handle; - hipblasCreate(&handle); + hipblasLocalHandle handle(argus); // Naming: dK is in GPU (device) memory. 
hK is in CPU (host) memory - host_vector hA[batch_count]; - host_vector hB[batch_count]; - host_vector hx[batch_count]; - - device_batch_vector bA(batch_count, A_size); - device_batch_vector bx(batch_count, x_size); + host_batch_vector hA(A_size, 1, batch_count); + host_batch_vector hA_cpu(A_size, 1, batch_count); + host_batch_vector hA_host(A_size, 1, batch_count); + host_batch_vector hA_device(A_size, 1, batch_count); + host_batch_vector hx(N, incx, batch_count); - device_vector dA(batch_count); - device_vector dx(batch_count); + device_batch_vector dA(A_size, 1, batch_count); + device_batch_vector dx(N, incx, batch_count); + device_vector d_alpha(1); - int last = batch_count - 1; - if(!dA || !dx || (!bA[last] && A_size) || (!bx[last] && x_size)) - { - hipblasDestroy(handle); - return HIPBLAS_STATUS_ALLOC_FAILED; - } + CHECK_HIP_ERROR(dA.memcheck()); + CHECK_HIP_ERROR(dx.memcheck()); // Initial Data on CPU - srand(1); - for(int b = 0; b < batch_count; b++) - { - hA[b] = host_vector(A_size); - hB[b] = host_vector(A_size); - hx[b] = host_vector(x_size); - - srand(1); - hipblas_init(hA[b], N, N, lda); - hipblas_init(hx[b], 1, N, incx); - hB[b] = hA[b]; + hipblas_init(hA, true); + hipblas_init(hx); - CHECK_HIP_ERROR(hipMemcpy(bA[b], hA[b], sizeof(T) * A_size, hipMemcpyHostToDevice)); - CHECK_HIP_ERROR(hipMemcpy(bx[b], hx[b], sizeof(T) * x_size, hipMemcpyHostToDevice)); - } - CHECK_HIP_ERROR(hipMemcpy(dA, bA, sizeof(T*) * batch_count, hipMemcpyHostToDevice)); - CHECK_HIP_ERROR(hipMemcpy(dx, bx, sizeof(T*) * batch_count, hipMemcpyHostToDevice)); + hA_cpu.copy_from(hA); + CHECK_HIP_ERROR(dA.transfer_from(hA)); + CHECK_HIP_ERROR(dx.transfer_from(hx)); + CHECK_HIP_ERROR(hipMemcpy(d_alpha, &h_alpha, sizeof(U), hipMemcpyHostToDevice)); /* ===================================================================== - ROCBLAS + HIPBLAS =================================================================== */ - if(argus.timing) - { - gpu_time_used = get_time_us(); // in microseconds - } - - for(int iter = 0; iter < 1; iter++) - { - status = hipblasHerBatchedFn(handle, uplo, N, (U*)&alpha, dx, incx, dA, lda, batch_count); - - if(status != HIPBLAS_STATUS_SUCCESS) - { - hipblasDestroy(handle); - return status; - } - } - - // copy output from device to CPU - for(int b = 0; b < batch_count; b++) - { - hipMemcpy(hA[b], bA[b], sizeof(T) * A_size, hipMemcpyDeviceToHost); - } - - if(argus.unit_check) + CHECK_HIPBLAS_ERROR(hipblasSetPointerMode(handle, HIPBLAS_POINTER_MODE_HOST)); + CHECK_HIPBLAS_ERROR(hipblasHerBatchedFn(handle, + uplo, + N, + (U*)&h_alpha, + dx.ptr_on_device(), + incx, + dA.ptr_on_device(), + lda, + batch_count)); + + CHECK_HIP_ERROR(hA_host.transfer_from(dA)); + CHECK_HIP_ERROR(dA.transfer_from(hA)); + + CHECK_HIPBLAS_ERROR(hipblasSetPointerMode(handle, HIPBLAS_POINTER_MODE_DEVICE)); + CHECK_HIPBLAS_ERROR(hipblasHerBatchedFn( + handle, uplo, N, d_alpha, dx.ptr_on_device(), incx, dA.ptr_on_device(), lda, batch_count)); + + CHECK_HIP_ERROR(hA_device.transfer_from(dA)); + + if(argus.unit_check || argus.norm_check) { /* ===================================================================== CPU BLAS =================================================================== */ for(int b = 0; b < batch_count; b++) { - cblas_her(uplo, N, alpha, hx[b], incx, hB[b], lda); + cblas_her(uplo, N, h_alpha, hx[b], incx, hA_cpu[b], lda); } // enable unit check, notice unit check is not invasive, but norm check is, // unit check and norm check can not be interchanged their order if(argus.unit_check) { - unit_check_general(N, 
N, batch_count, lda, hB, hA); + unit_check_general(N, N, batch_count, lda, hA_cpu, hA_host); + unit_check_general(N, N, batch_count, lda, hA_cpu, hA_host); + } + if(argus.norm_check) + { + hipblas_error_host + = norm_check_general('F', N, N, lda, hA_cpu, hA_host, batch_count); + hipblas_error_device + = norm_check_general('F', N, N, lda, hA_cpu, hA_device, batch_count); + } + } + + if(argus.timing) + { + CHECK_HIP_ERROR(dA.transfer_from(hA)); + hipStream_t stream; + CHECK_HIPBLAS_ERROR(hipblasGetStream(handle, &stream)); + CHECK_HIPBLAS_ERROR(hipblasSetPointerMode(handle, HIPBLAS_POINTER_MODE_DEVICE)); + + int runs = argus.cold_iters + argus.iters; + for(int iter = 0; iter < runs; iter++) + { + if(iter == argus.cold_iters) + gpu_time_used = get_time_us_sync(stream); + + CHECK_HIPBLAS_ERROR(hipblasHerBatchedFn(handle, + uplo, + N, + d_alpha, + dx.ptr_on_device(), + incx, + dA.ptr_on_device(), + lda, + batch_count)); } + gpu_time_used = get_time_us_sync(stream) - gpu_time_used; + + ArgumentModel{}.log_args( + std::cout, + argus, + gpu_time_used, + her_gflop_count(N), + her_gbyte_count(N), + hipblas_error_host, + hipblas_error_device); } - hipblasDestroy(handle); return HIPBLAS_STATUS_SUCCESS; } diff --git a/clients/include/testing_her_strided_batched.hpp b/clients/include/testing_her_strided_batched.hpp index d8a40f7c8..4bd07aedb 100644 --- a/clients/include/testing_her_strided_batched.hpp +++ b/clients/include/testing_her_strided_batched.hpp @@ -1,5 +1,5 @@ /* ************************************************************************ - * Copyright 2016-2020 Advanced Micro Devices, Inc. + * Copyright 2016-2021 Advanced Micro Devices, Inc. * * ************************************************************************ */ @@ -14,9 +14,10 @@ using namespace std; /* ============================================================================================ */ -template +template hipblasStatus_t testing_her_strided_batched(const Arguments& argus) { + using U = real_t; bool FORTRAN = argus.fortran; auto hipblasHerStridedBatchedFn = FORTRAN ? hipblasHerStridedBatched : hipblasHerStridedBatched; @@ -33,8 +34,6 @@ hipblasStatus_t testing_her_strided_batched(const Arguments& argus) int x_size = stride_x * batch_count; hipblasFillMode_t uplo = char2hipblas_fill(argus.uplo_option); - hipblasStatus_t status = HIPBLAS_STATUS_SUCCESS; - // argument sanity check, quick return if input parameters are invalid before allocating invalid // memory if(N < 0 || lda < N || incx == 0 || batch_count < 0) @@ -48,75 +47,110 @@ hipblasStatus_t testing_her_strided_batched(const Arguments& argus) // Naming: dK is in GPU (device) memory. 
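// The bare hipMemcpy() calls and the repeated "if(status != HIPBLAS_STATUS_SUCCESS) destroy and
// return" blocks are replaced by CHECK_HIP_ERROR / CHECK_HIPBLAS_ERROR throughout. Such macros
// report the failing call with file/line context and stop the test; the definitions below are an
// illustrative approximation only (the actual macros in the client headers may differ, e.g. by
// throwing instead of exiting) and assume <cstdio> and <cstdlib> are available.
#define CHECK_HIP_ERROR(cmd)                                              \
    do                                                                    \
    {                                                                     \
        hipError_t e_ = (cmd);                                            \
        if(e_ != hipSuccess)                                              \
        {                                                                 \
            fprintf(stderr, "HIP error %s at %s:%d\n",                    \
                    hipGetErrorString(e_), __FILE__, __LINE__);           \
            exit(EXIT_FAILURE);                                           \
        }                                                                 \
    } while(0)

#define CHECK_HIPBLAS_ERROR(cmd)                                          \
    do                                                                    \
    {                                                                     \
        hipblasStatus_t s_ = (cmd);                                       \
        if(s_ != HIPBLAS_STATUS_SUCCESS)                                  \
        {                                                                 \
            fprintf(stderr, "hipBLAS error %s at %s:%d\n",                \
                    hipblasStatusToString(s_), __FILE__, __LINE__);       \
            exit(EXIT_FAILURE);                                           \
        }                                                                 \
    } while(0)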
hK is in CPU (host) memory host_vector hA(A_size); - host_vector hB(A_size); + host_vector hA_cpu(A_size); + host_vector hA_host(A_size); + host_vector hA_device(A_size); host_vector hx(x_size); device_vector dA(A_size); device_vector dx(x_size); + device_vector d_alpha(1); - double gpu_time_used, cpu_time_used; - double hipblasGflops, cblas_gflops, hipblasBandwidth; - double rocblas_error; + double gpu_time_used, hipblas_error_host, hipblas_error_device; - U alpha = argus.get_alpha(); + U h_alpha = argus.get_alpha(); - hipblasHandle_t handle; - hipblasCreate(&handle); + hipblasLocalHandle handle(argus); // Initial Data on CPU srand(1); hipblas_init(hA, N, N, lda, stride_A, batch_count); hipblas_init(hx, 1, N, incx, stride_x, batch_count); - // copy matrix is easy in STL; hB = hA: save a copy in hB which will be output of CPU BLAS - hB = hA; + // copy matrix is easy in STL; hA_cpu = hA: save a copy in hA_cpu which will be output of CPU BLAS + hA_cpu = hA; // copy data from CPU to device - hipMemcpy(dA, hA.data(), sizeof(T) * A_size, hipMemcpyHostToDevice); - hipMemcpy(dx, hx.data(), sizeof(T) * x_size, hipMemcpyHostToDevice); + CHECK_HIP_ERROR(hipMemcpy(dA, hA.data(), sizeof(T) * A_size, hipMemcpyHostToDevice)); + CHECK_HIP_ERROR(hipMemcpy(dx, hx.data(), sizeof(T) * x_size, hipMemcpyHostToDevice)); + CHECK_HIP_ERROR(hipMemcpy(d_alpha, &h_alpha, sizeof(U), hipMemcpyHostToDevice)); /* ===================================================================== - ROCBLAS + HIPBLAS =================================================================== */ - if(argus.timing) - { - gpu_time_used = get_time_us(); // in microseconds - } + CHECK_HIPBLAS_ERROR(hipblasSetPointerMode(handle, HIPBLAS_POINTER_MODE_HOST)); + CHECK_HIPBLAS_ERROR(hipblasHerStridedBatchedFn( + handle, uplo, N, (U*)&h_alpha, dx, incx, stride_x, dA, lda, stride_A, batch_count)); - for(int iter = 0; iter < 1; iter++) - { - status = hipblasHerStridedBatchedFn( - handle, uplo, N, (U*)&alpha, dx, incx, stride_x, dA, lda, stride_A, batch_count); + CHECK_HIP_ERROR(hipMemcpy(hA_host.data(), dA, sizeof(T) * A_size, hipMemcpyDeviceToHost)); + CHECK_HIP_ERROR(hipMemcpy(dA, hA.data(), sizeof(T) * A_size, hipMemcpyHostToDevice)); - if(status != HIPBLAS_STATUS_SUCCESS) - { - hipblasDestroy(handle); - return status; - } - } + CHECK_HIPBLAS_ERROR(hipblasSetPointerMode(handle, HIPBLAS_POINTER_MODE_DEVICE)); + CHECK_HIPBLAS_ERROR(hipblasHerStridedBatchedFn( + handle, uplo, N, d_alpha, dx, incx, stride_x, dA, lda, stride_A, batch_count)); - // copy output from device to CPU - hipMemcpy(hA.data(), dA, sizeof(T) * A_size, hipMemcpyDeviceToHost); + CHECK_HIP_ERROR(hipMemcpy(hA_device.data(), dA, sizeof(T) * A_size, hipMemcpyDeviceToHost)); - if(argus.unit_check) + if(argus.unit_check || argus.norm_check) { /* ===================================================================== CPU BLAS =================================================================== */ for(int b = 0; b < batch_count; b++) { - cblas_her( - uplo, N, alpha, hx.data() + b * stride_x, incx, hB.data() + b * stride_A, lda); + cblas_her(uplo, + N, + h_alpha, + hx.data() + b * stride_x, + incx, + hA_cpu.data() + b * stride_A, + lda); } // enable unit check, notice unit check is not invasive, but norm check is, // unit check and norm check can not be interchanged their order if(argus.unit_check) { - unit_check_general(N, N, batch_count, lda, stride_A, hB.data(), hA.data()); + unit_check_general(N, N, batch_count, lda, stride_A, hA_cpu.data(), hA_host.data()); + unit_check_general( + N, N, 
batch_count, lda, stride_A, hA_cpu.data(), hA_device.data()); + } + if(argus.norm_check) + { + hipblas_error_host = norm_check_general( + 'F', N, N, lda, stride_A, hA_cpu.data(), hA_host.data(), batch_count); + hipblas_error_device = norm_check_general( + 'F', N, N, lda, stride_A, hA_cpu.data(), hA_device.data(), batch_count); + } + } + + if(argus.timing) + { + CHECK_HIP_ERROR(hipMemcpy(dA, hA.data(), sizeof(T) * A_size, hipMemcpyHostToDevice)); + hipStream_t stream; + CHECK_HIPBLAS_ERROR(hipblasGetStream(handle, &stream)); + CHECK_HIPBLAS_ERROR(hipblasSetPointerMode(handle, HIPBLAS_POINTER_MODE_DEVICE)); + + int runs = argus.cold_iters + argus.iters; + for(int iter = 0; iter < runs; iter++) + { + if(iter == argus.cold_iters) + gpu_time_used = get_time_us_sync(stream); + + CHECK_HIPBLAS_ERROR(hipblasHerStridedBatchedFn( + handle, uplo, N, d_alpha, dx, incx, stride_x, dA, lda, stride_A, batch_count)); } + gpu_time_used = get_time_us_sync(stream) - gpu_time_used; + + ArgumentModel{} + .log_args(std::cout, + argus, + gpu_time_used, + her_gflop_count(N), + her_gbyte_count(N), + hipblas_error_host, + hipblas_error_device); } - hipblasDestroy(handle); return HIPBLAS_STATUS_SUCCESS; } diff --git a/clients/include/testing_hpmv.hpp b/clients/include/testing_hpmv.hpp index 2d49c1254..59883a9de 100644 --- a/clients/include/testing_hpmv.hpp +++ b/clients/include/testing_hpmv.hpp @@ -1,5 +1,5 @@ /* ************************************************************************ - * Copyright 2016-2020 Advanced Micro Devices, Inc. + * Copyright 2016-2021 Advanced Micro Devices, Inc. * * ************************************************************************ */ @@ -26,36 +26,35 @@ hipblasStatus_t testing_hpmv(const Arguments& argus) int A_size = N * (N + 1) / 2; - hipblasFillMode_t uplo = char2hipblas_fill(argus.uplo_option); - hipblasStatus_t status = HIPBLAS_STATUS_SUCCESS; + hipblasFillMode_t uplo = char2hipblas_fill(argus.uplo_option); // argument sanity check, quick return if input parameters are invalid before allocating invalid // memory if(N < 0 || incx == 0 || incy == 0) { - status = HIPBLAS_STATUS_INVALID_VALUE; - return status; + return HIPBLAS_STATUS_INVALID_VALUE; } // Naming: dK is in GPU (device) memory. 
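// hipblas_error_host and hipblas_error_device above hold the result of norm_check_general('F', ...),
// i.e. a Frobenius-norm measure of how far each GPU result (host-pointer run and device-pointer run)
// is from the CPU reference. A minimal single-matrix, real-valued sketch of that measure follows;
// the name relative_frobenius_error is hypothetical, and the real helper additionally walks
// lda/stride_A and every batch instance.
#include <cmath>
template <typename T>
double relative_frobenius_error(int M, int N, int lda, const T* ref, const T* gpu)
{
    double diff = 0.0, refn = 0.0;
    for(int j = 0; j < N; ++j)
        for(int i = 0; i < M; ++i)
        {
            double d = double(ref[i + j * lda]) - double(gpu[i + j * lda]);
            diff += d * d;
            refn += double(ref[i + j * lda]) * double(ref[i + j * lda]);
        }
    // Relative error; a value on the order of machine epsilon indicates agreement.
    return refn > 0.0 ? std::sqrt(diff / refn) : std::sqrt(diff);
}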
hK is in CPU (host) memory host_vector hA(A_size); host_vector hx(N * incx); host_vector hy(N * incy); - host_vector hz(N * incy); + host_vector hy_cpu(N * incy); + host_vector hy_host(N * incy); + host_vector hy_device(N * incy); device_vector dA(A_size); device_vector dx(N * incx); device_vector dy(N * incy); + device_vector d_alpha(1); + device_vector d_beta(1); - double gpu_time_used, cpu_time_used; - double hipblasGflops, cblas_gflops, hipblasBandwidth; - double rocblas_error; + double gpu_time_used, hipblas_error_host, hipblas_error_device; - T alpha = argus.get_alpha(); - T beta = argus.get_beta(); + T h_alpha = argus.get_alpha(); + T h_beta = argus.get_beta(); - hipblasHandle_t handle; - hipblasCreate(&handle); + hipblasLocalHandle handle(argus); // Initial Data on CPU srand(1); @@ -63,47 +62,79 @@ hipblasStatus_t testing_hpmv(const Arguments& argus) hipblas_init(hx, 1, N, incx); hipblas_init(hy, 1, N, incy); - // copy vector is easy in STL; hz = hy: save a copy in hz which will be output of CPU BLAS - hz = hy; + // copy vector is easy in STL; hy_cpu = hy: save a copy in hy_cpu which will be output of CPU BLAS + hy_cpu = hy; // copy data from CPU to device - hipMemcpy(dA, hA.data(), sizeof(T) * A_size, hipMemcpyHostToDevice); - hipMemcpy(dx, hx.data(), sizeof(T) * N * incx, hipMemcpyHostToDevice); - hipMemcpy(dy, hy.data(), sizeof(T) * N * incy, hipMemcpyHostToDevice); + CHECK_HIP_ERROR(hipMemcpy(dA, hA.data(), sizeof(T) * A_size, hipMemcpyHostToDevice)); + CHECK_HIP_ERROR(hipMemcpy(dx, hx.data(), sizeof(T) * N * incx, hipMemcpyHostToDevice)); + CHECK_HIP_ERROR(hipMemcpy(dy, hy.data(), sizeof(T) * N * incy, hipMemcpyHostToDevice)); + CHECK_HIP_ERROR(hipMemcpy(d_alpha, &h_alpha, sizeof(T), hipMemcpyHostToDevice)); + CHECK_HIP_ERROR(hipMemcpy(d_beta, &h_beta, sizeof(T), hipMemcpyHostToDevice)); /* ===================================================================== - ROCBLAS + HIPBLAS =================================================================== */ - for(int iter = 0; iter < 1; iter++) - { - status = hipblasHpmvFn(handle, uplo, N, (T*)&alpha, dA, dx, incx, (T*)&beta, dy, incy); + CHECK_HIPBLAS_ERROR(hipblasSetPointerMode(handle, HIPBLAS_POINTER_MODE_HOST)); + CHECK_HIPBLAS_ERROR( + hipblasHpmvFn(handle, uplo, N, (T*)&h_alpha, dA, dx, incx, (T*)&h_beta, dy, incy)); - if(status != HIPBLAS_STATUS_SUCCESS) - { - hipblasDestroy(handle); - return status; - } - } + CHECK_HIP_ERROR(hipMemcpy(hy_host.data(), dy, sizeof(T) * N * incy, hipMemcpyDeviceToHost)); + CHECK_HIP_ERROR(hipMemcpy(dy, hy.data(), sizeof(T) * N * incy, hipMemcpyHostToDevice)); - // copy output from device to CPU - hipMemcpy(hy.data(), dy, sizeof(T) * N * incy, hipMemcpyDeviceToHost); + CHECK_HIPBLAS_ERROR(hipblasSetPointerMode(handle, HIPBLAS_POINTER_MODE_DEVICE)); + CHECK_HIPBLAS_ERROR(hipblasHpmvFn(handle, uplo, N, d_alpha, dA, dx, incx, d_beta, dy, incy)); - if(argus.unit_check) + CHECK_HIP_ERROR(hipMemcpy(hy_device.data(), dy, sizeof(T) * N * incy, hipMemcpyDeviceToHost)); + + if(argus.unit_check || argus.norm_check) { /* ===================================================================== CPU BLAS =================================================================== */ - cblas_hpmv(uplo, N, alpha, hA.data(), hx.data(), incx, beta, hz.data(), incy); + cblas_hpmv(uplo, N, h_alpha, hA.data(), hx.data(), incx, h_beta, hy_cpu.data(), incy); // enable unit check, notice unit check is not invasive, but norm check is, // unit check and norm check can not be interchanged their order if(argus.unit_check) { - 
unit_check_general(1, N, incy, hz, hy); + unit_check_general(1, N, incy, hy_cpu, hy_host); + unit_check_general(1, N, incy, hy_cpu, hy_device); + } + if(argus.norm_check) + { + hipblas_error_host = norm_check_general('F', 1, N, incy, hy_cpu, hy_host); + hipblas_error_device = norm_check_general('F', 1, N, incy, hy_cpu, hy_device); + } + } + + if(argus.timing) + { + CHECK_HIP_ERROR(hipMemcpy(dy, hy.data(), sizeof(T) * N * incy, hipMemcpyHostToDevice)); + hipStream_t stream; + CHECK_HIPBLAS_ERROR(hipblasGetStream(handle, &stream)); + CHECK_HIPBLAS_ERROR(hipblasSetPointerMode(handle, HIPBLAS_POINTER_MODE_DEVICE)); + + int runs = argus.cold_iters + argus.iters; + for(int iter = 0; iter < runs; iter++) + { + if(iter == argus.cold_iters) + gpu_time_used = get_time_us_sync(stream); + + CHECK_HIPBLAS_ERROR( + hipblasHpmvFn(handle, uplo, N, d_alpha, dA, dx, incx, d_beta, dy, incy)); } + gpu_time_used = get_time_us_sync(stream) - gpu_time_used; + + ArgumentModel{}.log_args(std::cout, + argus, + gpu_time_used, + hpmv_gflop_count(N), + hpmv_gbyte_count(N), + hipblas_error_host, + hipblas_error_device); } - hipblasDestroy(handle); return HIPBLAS_STATUS_SUCCESS; } diff --git a/clients/include/testing_hpmv_batched.hpp b/clients/include/testing_hpmv_batched.hpp index 39ff3d36c..5df4a9bea 100644 --- a/clients/include/testing_hpmv_batched.hpp +++ b/clients/include/testing_hpmv_batched.hpp @@ -1,5 +1,5 @@ /* ************************************************************************ - * Copyright 2016-2020 Advanced Micro Devices, Inc. + * Copyright 2016-2021 Advanced Micro Devices, Inc. * * ************************************************************************ */ @@ -35,8 +35,6 @@ hipblasStatus_t testing_hpmv_batched(const Arguments& argus) X_size = N * incx; Y_size = N * incy; - hipblasStatus_t status = HIPBLAS_STATUS_SUCCESS; - // argument sanity check, quick return if input parameters are invalid before allocating invalid // memory if(N < 0 || incx <= 0 || incy <= 0 || batch_count < 0) @@ -48,109 +46,79 @@ hipblasStatus_t testing_hpmv_batched(const Arguments& argus) return HIPBLAS_STATUS_SUCCESS; } - hipblasHandle_t handle; - hipblasCreate(&handle); + hipblasLocalHandle handle(argus); - double gpu_time_used, cpu_time_used; - double hipblasGflops, cblas_gflops, hipblasBandwidth; - double rocblas_error; + double gpu_time_used, hipblas_error_host, hipblas_error_device; - T alpha = argus.get_alpha(); - T beta = argus.get_beta(); + T h_alpha = argus.get_alpha(); + T h_beta = argus.get_beta(); // arrays of pointers-to-host on host - host_vector hA_array[batch_count]; - host_vector hx_array[batch_count]; - host_vector hy_array[batch_count]; - host_vector hz_array[batch_count]; + host_batch_vector hA(A_size, 1, batch_count); + host_batch_vector hx(N, incx, batch_count); + host_batch_vector hy(N, incy, batch_count); + host_batch_vector hy_cpu(N, incy, batch_count); + host_batch_vector hy_host(N, incy, batch_count); + host_batch_vector hy_device(N, incy, batch_count); // arrays of pointers-to-device on host - device_batch_vector bA_array(batch_count, A_size); - device_batch_vector bx_array(batch_count, X_size); - device_batch_vector by_array(batch_count, Y_size); - - // arrays of pointers-to-device on device - device_vector dA_array(batch_count); - device_vector dx_array(batch_count); - device_vector dy_array(batch_count); - - int last = batch_count - 1; - if(!dA_array || !dx_array || !dy_array || (!bA_array[last] && A_size) - || (!bx_array[last] && X_size) || (!by_array[last] && Y_size)) - { - 
hipblasDestroy(handle); - return HIPBLAS_STATUS_ALLOC_FAILED; - } + device_batch_vector dA(A_size, 1, batch_count); + device_batch_vector dx(N, incx, batch_count); + device_batch_vector dy(N, incy, batch_count); + device_vector d_alpha(1); + device_vector d_beta(1); + + CHECK_HIP_ERROR(dA.memcheck()); + CHECK_HIP_ERROR(dx.memcheck()); + CHECK_HIP_ERROR(dy.memcheck()); // Initial Data on CPU - hipError_t err_A, err_x, err_y; - srand(1); - for(int b = 0; b < batch_count; b++) - { - hA_array[b] = host_vector(A_size); - hx_array[b] = host_vector(X_size); - hy_array[b] = host_vector(Y_size); - hz_array[b] = host_vector(Y_size); - - // initialize matrices on host - srand(1); - hipblas_init(hA_array[b], 1, A_size, 1); - hipblas_init(hx_array[b], 1, N, incx); - hipblas_init(hy_array[b], 1, N, incy); - - hz_array[b] = hy_array[b]; - err_A = hipMemcpy(bA_array[b], hA_array[b], sizeof(T) * A_size, hipMemcpyHostToDevice); - err_x = hipMemcpy(bx_array[b], hx_array[b], sizeof(T) * X_size, hipMemcpyHostToDevice); - err_y = hipMemcpy(by_array[b], hy_array[b], sizeof(T) * Y_size, hipMemcpyHostToDevice); - - if(err_A != hipSuccess || err_x != hipSuccess || err_y != hipSuccess) - { - hipblasDestroy(handle); - return HIPBLAS_STATUS_MAPPING_ERROR; - } - } + hipblas_init(hA, true); + hipblas_init(hx); + hipblas_init(hy); + hy_cpu.copy_from(hy); - err_A = hipMemcpy(dA_array, bA_array, batch_count * sizeof(T*), hipMemcpyHostToDevice); - err_x = hipMemcpy(dx_array, bx_array, batch_count * sizeof(T*), hipMemcpyHostToDevice); - err_y = hipMemcpy(dy_array, by_array, batch_count * sizeof(T*), hipMemcpyHostToDevice); - if(err_A != hipSuccess || err_x != hipSuccess || err_y != hipSuccess) - { - hipblasDestroy(handle); - return HIPBLAS_STATUS_MAPPING_ERROR; - } + CHECK_HIP_ERROR(dA.transfer_from(hA)); + CHECK_HIP_ERROR(dx.transfer_from(hx)); + CHECK_HIP_ERROR(dy.transfer_from(hy)); + CHECK_HIP_ERROR(hipMemcpy(d_alpha, &h_alpha, sizeof(T), hipMemcpyHostToDevice)); + CHECK_HIP_ERROR(hipMemcpy(d_beta, &h_beta, sizeof(T), hipMemcpyHostToDevice)); /* ===================================================================== - ROCBLAS + HIPBLAS =================================================================== */ - for(int iter = 0; iter < 1; iter++) - { - status = hipblasHpmvBatchedFn(handle, - uplo, - N, - (T*)&alpha, - dA_array, - dx_array, - incx, - (T*)&beta, - dy_array, - incy, - batch_count); - - if(status != HIPBLAS_STATUS_SUCCESS) - { - // here in cuda - hipblasDestroy(handle); - return status; - } - } - - // copy output from device to CPU - for(int b = 0; b < batch_count; b++) - { - hipMemcpy(hy_array[b], by_array[b], sizeof(T) * Y_size, hipMemcpyDeviceToHost); - } - - if(argus.unit_check) + CHECK_HIPBLAS_ERROR(hipblasSetPointerMode(handle, HIPBLAS_POINTER_MODE_HOST)); + CHECK_HIPBLAS_ERROR(hipblasHpmvBatchedFn(handle, + uplo, + N, + (T*)&h_alpha, + dA.ptr_on_device(), + dx.ptr_on_device(), + incx, + (T*)&h_beta, + dy.ptr_on_device(), + incy, + batch_count)); + + CHECK_HIP_ERROR(hy_host.transfer_from(dy)); + CHECK_HIP_ERROR(dy.transfer_from(hy)); + + CHECK_HIPBLAS_ERROR(hipblasSetPointerMode(handle, HIPBLAS_POINTER_MODE_DEVICE)); + CHECK_HIPBLAS_ERROR(hipblasHpmvBatchedFn(handle, + uplo, + N, + d_alpha, + dA.ptr_on_device(), + dx.ptr_on_device(), + incx, + d_beta, + dy.ptr_on_device(), + incy, + batch_count)); + + CHECK_HIP_ERROR(hy_device.transfer_from(dy)); + + if(argus.unit_check || argus.norm_check) { /* ===================================================================== CPU BLAS @@ -158,17 +126,61 @@ 
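// The batched tests above drop the hand-rolled per-batch hipMemcpy loops and pointer-array copies
// in favour of host_batch_vector / device_batch_vector containers with memcheck(), transfer_from()
// and ptr_on_device(). Conceptually, the device container owns one device buffer per batch instance
// plus a device-side array of those pointers that the *Batched APIs consume. The struct below is a
// hypothetical sketch of that idea (name and layout are assumptions; the real container also tracks
// length, increment and batch count, and surfaces allocation failures through memcheck()).
#include <vector>
template <typename T>
struct device_batch_vector_sketch
{
    std::vector<T*> per_batch; // device buffers, one per batch, addressable from host code
    T**             d_ptrs;    // device copy of per_batch, what ptr_on_device() would return

    device_batch_vector_sketch(size_t elems, int batch_count)
        : per_batch(batch_count)
    {
        for(auto& p : per_batch)
            (void)hipMalloc((void**)&p, elems * sizeof(T));
        (void)hipMalloc((void**)&d_ptrs, batch_count * sizeof(T*));
        (void)hipMemcpy(d_ptrs, per_batch.data(), batch_count * sizeof(T*), hipMemcpyHostToDevice);
    }
    ~device_batch_vector_sketch()
    {
        for(auto p : per_batch)
            (void)hipFree(p);
        (void)hipFree(d_ptrs);
    }
    T** ptr_on_device() { return d_ptrs; }
};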
hipblasStatus_t testing_hpmv_batched(const Arguments& argus) for(int b = 0; b < batch_count; b++) { - cblas_hpmv(uplo, N, alpha, hA_array[b], hx_array[b], incx, beta, hz_array[b], incy); + cblas_hpmv(uplo, N, h_alpha, hA[b], hx[b], incx, h_beta, hy_cpu[b], incy); } // enable unit check, notice unit check is not invasive, but norm check is, // unit check and norm check can not be interchanged their order if(argus.unit_check) { - unit_check_general(1, Y_size, batch_count, incy, hz_array, hy_array); + unit_check_general(1, Y_size, batch_count, incy, hy_cpu, hy_host); + unit_check_general(1, Y_size, batch_count, incy, hy_cpu, hy_device); + } + if(argus.norm_check) + { + hipblas_error_host + = norm_check_general('F', 1, N, incy, hy_cpu, hy_host, batch_count); + hipblas_error_device + = norm_check_general('F', 1, N, incy, hy_cpu, hy_device, batch_count); + } + } + + if(argus.timing) + { + CHECK_HIP_ERROR(dy.transfer_from(hy)); + hipStream_t stream; + CHECK_HIPBLAS_ERROR(hipblasGetStream(handle, &stream)); + CHECK_HIPBLAS_ERROR(hipblasSetPointerMode(handle, HIPBLAS_POINTER_MODE_DEVICE)); + + int runs = argus.cold_iters + argus.iters; + for(int iter = 0; iter < runs; iter++) + { + if(iter == argus.cold_iters) + gpu_time_used = get_time_us_sync(stream); + + CHECK_HIPBLAS_ERROR(hipblasHpmvBatchedFn(handle, + uplo, + N, + d_alpha, + dA.ptr_on_device(), + dx.ptr_on_device(), + incx, + d_beta, + dy.ptr_on_device(), + incy, + batch_count)); } + gpu_time_used = get_time_us_sync(stream) - gpu_time_used; + + ArgumentModel{}.log_args( + std::cout, + argus, + gpu_time_used, + hpmv_gflop_count(N), + hpmv_gbyte_count(N), + hipblas_error_host, + hipblas_error_device); } - hipblasDestroy(handle); return HIPBLAS_STATUS_SUCCESS; } diff --git a/clients/include/testing_hpmv_strided_batched.hpp b/clients/include/testing_hpmv_strided_batched.hpp index 8b9b9b263..21be9d1d2 100644 --- a/clients/include/testing_hpmv_strided_batched.hpp +++ b/clients/include/testing_hpmv_strided_batched.hpp @@ -1,5 +1,5 @@ /* ************************************************************************ - * Copyright 2016-2020 Advanced Micro Devices, Inc. + * Copyright 2016-2021 Advanced Micro Devices, Inc. 
* * ************************************************************************ */ @@ -42,8 +42,6 @@ hipblasStatus_t testing_hpmv_strided_batched(const Arguments& argus) X_size = stride_x * batch_count; Y_size = stride_y * batch_count; - hipblasStatus_t status = HIPBLAS_STATUS_SUCCESS; - // argument sanity check, quick return if input parameters are invalid before allocating invalid // memory if(N < 0 || incx <= 0 || incy <= 0 || batch_count < 0) @@ -55,21 +53,22 @@ hipblasStatus_t testing_hpmv_strided_batched(const Arguments& argus) host_vector hA(A_size); host_vector hx(X_size); host_vector hy(Y_size); - host_vector hz(Y_size); + host_vector hy_cpu(Y_size); + host_vector hy_host(Y_size); + host_vector hy_device(Y_size); device_vector dA(A_size); device_vector dx(X_size); device_vector dy(Y_size); + device_vector d_alpha(1); + device_vector d_beta(1); - double gpu_time_used, cpu_time_used; - double hipblasGflops, cblas_gflops, hipblasBandwidth; - double rocblas_error; + double gpu_time_used, hipblas_error_host, hipblas_error_device; - T alpha = argus.get_alpha(); - T beta = argus.get_beta(); + T h_alpha = argus.get_alpha(); + T h_beta = argus.get_beta(); - hipblasHandle_t handle; - hipblasCreate(&handle); + hipblasLocalHandle handle(argus); // Initial Data on CPU srand(1); @@ -77,46 +76,57 @@ hipblasStatus_t testing_hpmv_strided_batched(const Arguments& argus) hipblas_init(hx, 1, N, incx, stride_x, batch_count); hipblas_init(hy, 1, N, incy, stride_y, batch_count); - // copy vector is easy in STL; hz = hy: save a copy in hz which will be output of CPU BLAS - hz = hy; + // copy vector is easy in STL; hy_cpu = hy: save a copy in hy_cpu which will be output of CPU BLAS + hy_cpu = hy; // copy data from CPU to device - hipMemcpy(dA, hA.data(), sizeof(T) * A_size, hipMemcpyHostToDevice); - hipMemcpy(dx, hx.data(), sizeof(T) * X_size, hipMemcpyHostToDevice); - hipMemcpy(dy, hy.data(), sizeof(T) * Y_size, hipMemcpyHostToDevice); + CHECK_HIP_ERROR(hipMemcpy(dA, hA.data(), sizeof(T) * A_size, hipMemcpyHostToDevice)); + CHECK_HIP_ERROR(hipMemcpy(dx, hx.data(), sizeof(T) * X_size, hipMemcpyHostToDevice)); + CHECK_HIP_ERROR(hipMemcpy(dy, hy.data(), sizeof(T) * Y_size, hipMemcpyHostToDevice)); + CHECK_HIP_ERROR(hipMemcpy(d_alpha, &h_alpha, sizeof(T), hipMemcpyHostToDevice)); + CHECK_HIP_ERROR(hipMemcpy(d_beta, &h_beta, sizeof(T), hipMemcpyHostToDevice)); /* ===================================================================== - ROCBLAS + HIPBLAS =================================================================== */ - for(int iter = 0; iter < 1; iter++) - { - status = hipblasHpmvStridedBatchedFn(handle, - uplo, - N, - (T*)&alpha, - dA, - stride_A, - dx, - incx, - stride_x, - (T*)&beta, - dy, - incy, - stride_y, - batch_count); - - if(status != HIPBLAS_STATUS_SUCCESS) - { - // here in cuda - hipblasDestroy(handle); - return status; - } - } - - // copy output from device to CPU - hipMemcpy(hy.data(), dy, sizeof(T) * Y_size, hipMemcpyDeviceToHost); - - if(argus.unit_check) + CHECK_HIPBLAS_ERROR(hipblasSetPointerMode(handle, HIPBLAS_POINTER_MODE_HOST)); + CHECK_HIPBLAS_ERROR(hipblasHpmvStridedBatchedFn(handle, + uplo, + N, + (T*)&h_alpha, + dA, + stride_A, + dx, + incx, + stride_x, + (T*)&h_beta, + dy, + incy, + stride_y, + batch_count)); + + CHECK_HIP_ERROR(hipMemcpy(hy_host.data(), dy, sizeof(T) * Y_size, hipMemcpyDeviceToHost)); + CHECK_HIP_ERROR(hipMemcpy(dy, hy.data(), sizeof(T) * Y_size, hipMemcpyHostToDevice)); + + CHECK_HIPBLAS_ERROR(hipblasSetPointerMode(handle, HIPBLAS_POINTER_MODE_DEVICE)); + 
CHECK_HIPBLAS_ERROR(hipblasHpmvStridedBatchedFn(handle, + uplo, + N, + d_alpha, + dA, + stride_A, + dx, + incx, + stride_x, + d_beta, + dy, + incy, + stride_y, + batch_count)); + + CHECK_HIP_ERROR(hipMemcpy(hy_device.data(), dy, sizeof(T) * Y_size, hipMemcpyDeviceToHost)); + + if(argus.unit_check || argus.norm_check) { /* ===================================================================== CPU BLAS @@ -126,12 +136,12 @@ hipblasStatus_t testing_hpmv_strided_batched(const Arguments& argus) { cblas_hpmv(uplo, N, - alpha, + h_alpha, hA.data() + b * stride_A, hx.data() + b * stride_x, incx, - beta, - hz.data() + b * stride_y, + h_beta, + hy_cpu.data() + b * stride_y, incy); } @@ -139,10 +149,65 @@ hipblasStatus_t testing_hpmv_strided_batched(const Arguments& argus) // unit check and norm check can not be interchanged their order if(argus.unit_check) { - unit_check_general(1, N, batch_count, incy, stride_y, hz, hy); + unit_check_general(1, N, batch_count, incy, stride_y, hy_cpu, hy_host); + unit_check_general(1, N, batch_count, incy, stride_y, hy_cpu, hy_device); + } + if(argus.norm_check) + { + hipblas_error_host + = norm_check_general('F', 1, N, incy, stride_y, hy_cpu, hy_host, batch_count); + hipblas_error_device + = norm_check_general('F', 1, N, incy, stride_y, hy_cpu, hy_device, batch_count); + } + } + + if(argus.timing) + { + CHECK_HIP_ERROR(hipMemcpy(dy, hy.data(), sizeof(T) * Y_size, hipMemcpyHostToDevice)); + hipStream_t stream; + CHECK_HIPBLAS_ERROR(hipblasGetStream(handle, &stream)); + CHECK_HIPBLAS_ERROR(hipblasSetPointerMode(handle, HIPBLAS_POINTER_MODE_DEVICE)); + + int runs = argus.cold_iters + argus.iters; + for(int iter = 0; iter < runs; iter++) + { + if(iter == argus.cold_iters) + gpu_time_used = get_time_us_sync(stream); + + CHECK_HIPBLAS_ERROR(hipblasHpmvStridedBatchedFn(handle, + uplo, + N, + d_alpha, + dA, + stride_A, + dx, + incx, + stride_x, + d_beta, + dy, + incy, + stride_y, + batch_count)); } + gpu_time_used = get_time_us_sync(stream) - gpu_time_used; + + ArgumentModel{} + .log_args(std::cout, + argus, + gpu_time_used, + hpmv_gflop_count(N), + hpmv_gbyte_count(N), + hipblas_error_host, + hipblas_error_device); } - hipblasDestroy(handle); return HIPBLAS_STATUS_SUCCESS; } diff --git a/clients/include/testing_hpr.hpp b/clients/include/testing_hpr.hpp index 779ff7436..e78226416 100644 --- a/clients/include/testing_hpr.hpp +++ b/clients/include/testing_hpr.hpp @@ -1,5 +1,5 @@ /* ************************************************************************ - * Copyright 2016-2020 Advanced Micro Devices, Inc. + * Copyright 2016-2021 Advanced Micro Devices, Inc. * * ************************************************************************ */ @@ -14,9 +14,10 @@ using namespace std; /* ============================================================================================ */ -template +template hipblasStatus_t testing_hpr(const Arguments& argus) { + using U = real_t; bool FORTRAN = argus.fortran; auto hipblasHprFn = FORTRAN ? hipblasHpr : hipblasHpr; @@ -26,8 +27,6 @@ hipblasStatus_t testing_hpr(const Arguments& argus) int A_size = N * (N + 1) / 2; hipblasFillMode_t uplo = char2hipblas_fill(argus.uplo_option); - hipblasStatus_t status = HIPBLAS_STATUS_SUCCESS; - // argument sanity check, quick return if input parameters are invalid before allocating invalid // memory if(N < 0 || incx == 0) @@ -37,71 +36,94 @@ hipblasStatus_t testing_hpr(const Arguments& argus) // Naming: dK is in GPU (device) memory. 
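// The timing blocks above call get_time_us_sync(stream) before and after the hot iterations and
// skip the first argus.cold_iters warm-up runs, so the reported time reflects completed GPU work
// rather than launch overhead. A hypothetical stream-synchronized microsecond timer in the same
// spirit (the real helper lives in the client utility header and may use a different clock):
#include <chrono>
inline double get_time_us_sync_sketch(hipStream_t stream)
{
    (void)hipStreamSynchronize(stream); // drain all work queued on the stream first
    auto us = std::chrono::duration<double, std::micro>(
        std::chrono::steady_clock::now().time_since_epoch());
    return us.count();
}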
hK is in CPU (host) memory host_vector hA(A_size); - host_vector hB(A_size); + host_vector hA_cpu(A_size); + host_vector hA_host(A_size); + host_vector hA_device(A_size); host_vector hx(N * incx); device_vector dA(A_size); device_vector dx(N * incx); + device_vector d_alpha(1); - double gpu_time_used, cpu_time_used; - double hipblasGflops, cblas_gflops, hipblasBandwidth; - double rocblas_error; + double gpu_time_used, hipblas_error_host, hipblas_error_device; - U alpha = argus.get_alpha(); + U h_alpha = argus.get_alpha(); - hipblasHandle_t handle; - hipblasCreate(&handle); + hipblasLocalHandle handle(argus); // Initial Data on CPU srand(1); hipblas_init(hA, 1, A_size, 1); hipblas_init(hx, 1, N, incx); - // copy matrix is easy in STL; hB = hA: save a copy in hB which will be output of CPU BLAS - hB = hA; + // copy matrix is easy in STL; hA_cpu = hA: save a copy in hA_cpu which will be output of CPU BLAS + hA_cpu = hA; // copy data from CPU to device - hipMemcpy(dA, hA.data(), sizeof(T) * A_size, hipMemcpyHostToDevice); - hipMemcpy(dx, hx.data(), sizeof(T) * N * incx, hipMemcpyHostToDevice); + CHECK_HIP_ERROR(hipMemcpy(dA, hA.data(), sizeof(T) * A_size, hipMemcpyHostToDevice)); + CHECK_HIP_ERROR(hipMemcpy(dx, hx.data(), sizeof(T) * N * incx, hipMemcpyHostToDevice)); + CHECK_HIP_ERROR(hipMemcpy(d_alpha, &h_alpha, sizeof(U), hipMemcpyHostToDevice)); /* ===================================================================== - ROCBLAS + HIPBLAS =================================================================== */ - if(argus.timing) - { - gpu_time_used = get_time_us(); // in microseconds - } - - for(int iter = 0; iter < 1; iter++) - { + CHECK_HIPBLAS_ERROR(hipblasSetPointerMode(handle, HIPBLAS_POINTER_MODE_HOST)); + CHECK_HIPBLAS_ERROR(hipblasHprFn(handle, uplo, N, (U*)&h_alpha, dx, incx, dA)); - status = hipblasHprFn(handle, uplo, N, (U*)&alpha, dx, incx, dA); + CHECK_HIP_ERROR(hipMemcpy(hA_host.data(), dA, sizeof(T) * A_size, hipMemcpyDeviceToHost)); + CHECK_HIP_ERROR(hipMemcpy(dA, hA.data(), sizeof(T) * A_size, hipMemcpyHostToDevice)); - if(status != HIPBLAS_STATUS_SUCCESS) - { - hipblasDestroy(handle); - return status; - } - } + CHECK_HIPBLAS_ERROR(hipblasSetPointerMode(handle, HIPBLAS_POINTER_MODE_DEVICE)); + CHECK_HIPBLAS_ERROR(hipblasHprFn(handle, uplo, N, d_alpha, dx, incx, dA)); - // copy output from device to CPU - hipMemcpy(hA.data(), dA, sizeof(T) * A_size, hipMemcpyDeviceToHost); + CHECK_HIP_ERROR(hipMemcpy(hA_device.data(), dA, sizeof(T) * A_size, hipMemcpyDeviceToHost)); - if(argus.unit_check) + if(argus.unit_check || argus.norm_check) { /* ===================================================================== CPU BLAS =================================================================== */ - cblas_hpr(uplo, N, alpha, hx.data(), incx, hB.data()); + cblas_hpr(uplo, N, h_alpha, hx.data(), incx, hA_cpu.data()); // enable unit check, notice unit check is not invasive, but norm check is, // unit check and norm check can not be interchanged their order if(argus.unit_check) { - unit_check_general(1, A_size, 1, hB.data(), hA.data()); + unit_check_general(1, A_size, 1, hA_cpu.data(), hA_host.data()); + unit_check_general(1, A_size, 1, hA_cpu.data(), hA_device.data()); + } + if(argus.norm_check) + { + hipblas_error_host = norm_check_general('F', 1, A_size, 1, hA_cpu, hA_host); + hipblas_error_device = norm_check_general('F', 1, A_size, 1, hA_cpu, hA_device); + } + } + + if(argus.timing) + { + CHECK_HIP_ERROR(hipMemcpy(dA, hA.data(), sizeof(T) * A_size, hipMemcpyHostToDevice)); + hipStream_t 
stream; + CHECK_HIPBLAS_ERROR(hipblasGetStream(handle, &stream)); + CHECK_HIPBLAS_ERROR(hipblasSetPointerMode(handle, HIPBLAS_POINTER_MODE_DEVICE)); + + int runs = argus.cold_iters + argus.iters; + for(int iter = 0; iter < runs; iter++) + { + if(iter == argus.cold_iters) + gpu_time_used = get_time_us_sync(stream); + + CHECK_HIPBLAS_ERROR(hipblasHprFn(handle, uplo, N, d_alpha, dx, incx, dA)); } + gpu_time_used = get_time_us_sync(stream) - gpu_time_used; + + ArgumentModel{}.log_args(std::cout, + argus, + gpu_time_used, + hpr_gflop_count(N), + hpr_gbyte_count(N), + hipblas_error_host, + hipblas_error_device); } - hipblasDestroy(handle); return HIPBLAS_STATUS_SUCCESS; } diff --git a/clients/include/testing_hpr2.hpp b/clients/include/testing_hpr2.hpp index c6fb42cb9..7a0b9e20f 100644 --- a/clients/include/testing_hpr2.hpp +++ b/clients/include/testing_hpr2.hpp @@ -1,5 +1,5 @@ /* ************************************************************************ - * Copyright 2016-2020 Advanced Micro Devices, Inc. + * Copyright 2016-2021 Advanced Micro Devices, Inc. * * ************************************************************************ */ @@ -27,8 +27,6 @@ hipblasStatus_t testing_hpr2(const Arguments& argus) int A_size = N * (N + 1) / 2; hipblasFillMode_t uplo = char2hipblas_fill(argus.uplo_option); - hipblasStatus_t status = HIPBLAS_STATUS_SUCCESS; - // argument sanity check, quick return if input parameters are invalid before allocating invalid // memory if(N < 0 || incx == 0 || incy == 0) @@ -38,22 +36,22 @@ hipblasStatus_t testing_hpr2(const Arguments& argus) // Naming: dK is in GPU (device) memory. hK is in CPU (host) memory host_vector hA(A_size); - host_vector hB(A_size); + host_vector hA_cpu(A_size); + host_vector hA_host(A_size); + host_vector hA_device(A_size); host_vector hx(N * incx); host_vector hy(N * incy); device_vector dA(A_size); device_vector dx(N * incx); device_vector dy(N * incy); + device_vector d_alpha(1); - double gpu_time_used, cpu_time_used; - double hipblasGflops, cblas_gflops, hipblasBandwidth; - double rocblas_error; + double gpu_time_used, hipblas_error_host, hipblas_error_device; - T alpha = argus.get_alpha(); + T h_alpha = argus.get_alpha(); - hipblasHandle_t handle; - hipblasCreate(&handle); + hipblasLocalHandle handle(argus); // Initial Data on CPU srand(1); @@ -61,52 +59,77 @@ hipblasStatus_t testing_hpr2(const Arguments& argus) hipblas_init(hx, 1, N, incx); hipblas_init(hy, 1, N, incy); - // copy matrix is easy in STL; hB = hA: save a copy in hB which will be output of CPU BLAS - hB = hA; + // copy matrix is easy in STL; hA_cpu = hA: save a copy in hA_cpu which will be output of CPU BLAS + hA_cpu = hA; // copy data from CPU to device - hipMemcpy(dA, hA.data(), sizeof(T) * A_size, hipMemcpyHostToDevice); - hipMemcpy(dx, hx.data(), sizeof(T) * N * incx, hipMemcpyHostToDevice); - hipMemcpy(dy, hy.data(), sizeof(T) * N * incy, hipMemcpyHostToDevice); + CHECK_HIP_ERROR(hipMemcpy(dA, hA.data(), sizeof(T) * A_size, hipMemcpyHostToDevice)); + CHECK_HIP_ERROR(hipMemcpy(dx, hx.data(), sizeof(T) * N * incx, hipMemcpyHostToDevice)); + CHECK_HIP_ERROR(hipMemcpy(dy, hy.data(), sizeof(T) * N * incy, hipMemcpyHostToDevice)); + CHECK_HIP_ERROR(hipMemcpy(d_alpha, &h_alpha, sizeof(T), hipMemcpyHostToDevice)); /* ===================================================================== - ROCBLAS + HIPBLAS =================================================================== */ - if(argus.timing) - { - gpu_time_used = get_time_us(); // in microseconds - } - - for(int iter = 0; iter < 1; 
iter++) - { + CHECK_HIPBLAS_ERROR(hipblasSetPointerMode(handle, HIPBLAS_POINTER_MODE_HOST)); + CHECK_HIPBLAS_ERROR(hipblasHpr2Fn(handle, uplo, N, (T*)&h_alpha, dx, incx, dy, incy, dA)); - status = hipblasHpr2Fn(handle, uplo, N, (T*)&alpha, dx, incx, dy, incy, dA); + CHECK_HIP_ERROR(hipMemcpy(hA_host.data(), dA, sizeof(T) * A_size, hipMemcpyDeviceToHost)); + CHECK_HIP_ERROR(hipMemcpy(dA, hA.data(), sizeof(T) * A_size, hipMemcpyHostToDevice)); - if(status != HIPBLAS_STATUS_SUCCESS) - { - hipblasDestroy(handle); - return status; - } - } + CHECK_HIPBLAS_ERROR(hipblasSetPointerMode(handle, HIPBLAS_POINTER_MODE_DEVICE)); + CHECK_HIPBLAS_ERROR(hipblasHpr2Fn(handle, uplo, N, d_alpha, dx, incx, dy, incy, dA)); - // copy output from device to CPU - hipMemcpy(hA.data(), dA, sizeof(T) * A_size, hipMemcpyDeviceToHost); + CHECK_HIP_ERROR(hipMemcpy(hA_device.data(), dA, sizeof(T) * A_size, hipMemcpyDeviceToHost)); - if(argus.unit_check) + if(argus.unit_check || argus.norm_check) { /* ===================================================================== CPU BLAS =================================================================== */ - cblas_hpr2(uplo, N, alpha, hx.data(), incx, hy.data(), incy, hB.data()); + cblas_hpr2(uplo, N, h_alpha, hx.data(), incx, hy.data(), incy, hA_cpu.data()); // enable unit check, notice unit check is not invasive, but norm check is, // unit check and norm check can not be interchanged their order if(argus.unit_check) { - unit_check_general(1, A_size, 1, hB.data(), hA.data()); + unit_check_general(1, A_size, 1, hA_cpu.data(), hA_host.data()); + unit_check_general(1, A_size, 1, hA_cpu.data(), hA_device.data()); + } + if(argus.norm_check) + { + hipblas_error_host + = norm_check_general('F', 1, A_size, 1, hA_cpu.data(), hA_host.data()); + hipblas_error_device + = norm_check_general('F', 1, A_size, 1, hA_cpu.data(), hA_device.data()); + } + } + + if(argus.timing) + { + CHECK_HIP_ERROR(hipMemcpy(dA, hA.data(), sizeof(T) * A_size, hipMemcpyHostToDevice)); + hipStream_t stream; + CHECK_HIPBLAS_ERROR(hipblasGetStream(handle, &stream)); + CHECK_HIPBLAS_ERROR(hipblasSetPointerMode(handle, HIPBLAS_POINTER_MODE_DEVICE)); + + int runs = argus.cold_iters + argus.iters; + for(int iter = 0; iter < runs; iter++) + { + if(iter == argus.cold_iters) + gpu_time_used = get_time_us_sync(stream); + + CHECK_HIPBLAS_ERROR(hipblasHpr2Fn(handle, uplo, N, d_alpha, dx, incx, dy, incy, dA)); } + gpu_time_used = get_time_us_sync(stream) - gpu_time_used; + + ArgumentModel{}.log_args(std::cout, + argus, + gpu_time_used, + hpr2_gflop_count(N), + hpr2_gbyte_count(N), + hipblas_error_host, + hipblas_error_device); } - hipblasDestroy(handle); return HIPBLAS_STATUS_SUCCESS; } diff --git a/clients/include/testing_hpr2_batched.hpp b/clients/include/testing_hpr2_batched.hpp index c6bc1b50f..8e9ad6ac3 100644 --- a/clients/include/testing_hpr2_batched.hpp +++ b/clients/include/testing_hpr2_batched.hpp @@ -1,5 +1,5 @@ /* ************************************************************************ - * Copyright 2016-2020 Advanced Micro Devices, Inc. + * Copyright 2016-2021 Advanced Micro Devices, Inc. 
* * ************************************************************************ */ @@ -31,13 +31,9 @@ hipblasStatus_t testing_hpr2_batched(const Arguments& argus) int y_size = N * incy; hipblasFillMode_t uplo = char2hipblas_fill(argus.uplo_option); - double gpu_time_used, cpu_time_used; - double hipblasGflops, cblas_gflops, hipblasBandwidth; - double rocblas_error; + double gpu_time_used, hipblas_error_host, hipblas_error_device; - T alpha = argus.get_alpha(); - - hipblasStatus_t status = HIPBLAS_STATUS_SUCCESS; + T h_alpha = argus.get_alpha(); // argument sanity check, quick return if input parameters are invalid before allocating invalid // memory @@ -50,97 +46,129 @@ hipblasStatus_t testing_hpr2_batched(const Arguments& argus) return HIPBLAS_STATUS_SUCCESS; } - hipblasHandle_t handle; - hipblasCreate(&handle); + hipblasLocalHandle handle(argus); // Naming: dK is in GPU (device) memory. hK is in CPU (host) memory - host_vector hA[batch_count]; - host_vector hB[batch_count]; - host_vector hx[batch_count]; - host_vector hy[batch_count]; - - device_batch_vector bA(batch_count, A_size); - device_batch_vector bx(batch_count, x_size); - device_batch_vector by(batch_count, y_size); - - device_vector dA(batch_count); - device_vector dx(batch_count); - device_vector dy(batch_count); - - int last = batch_count - 1; - if(!dA || !dx || !dy || (!bA[last] && A_size) || (!bx[last] && x_size) || (!by[last] && y_size)) - { - hipblasDestroy(handle); - return HIPBLAS_STATUS_ALLOC_FAILED; - } + host_batch_vector hA(A_size, 1, batch_count); + host_batch_vector hA_cpu(A_size, 1, batch_count); + host_batch_vector hA_host(A_size, 1, batch_count); + host_batch_vector hA_device(A_size, 1, batch_count); + host_batch_vector hx(N, incx, batch_count); + host_batch_vector hy(N, incy, batch_count); + + device_batch_vector dA(A_size, 1, batch_count); + device_batch_vector dx(N, incx, batch_count); + device_batch_vector dy(N, incy, batch_count); + device_vector d_alpha(1); + + CHECK_HIP_ERROR(dA.memcheck()); + CHECK_HIP_ERROR(dx.memcheck()); + CHECK_HIP_ERROR(dy.memcheck()); // Initial Data on CPU - srand(1); - for(int b = 0; b < batch_count; b++) - { - hA[b] = host_vector(A_size); - hB[b] = host_vector(A_size); - hx[b] = host_vector(x_size); - hy[b] = host_vector(y_size); - - srand(1); - hipblas_init(hA[b], 1, A_size, 1); - hipblas_init(hx[b], 1, N, incx); - hipblas_init(hy[b], 1, N, incy); - hB[b] = hA[b]; - - CHECK_HIP_ERROR(hipMemcpy(bA[b], hA[b], sizeof(T) * A_size, hipMemcpyHostToDevice)); - CHECK_HIP_ERROR(hipMemcpy(bx[b], hx[b], sizeof(T) * x_size, hipMemcpyHostToDevice)); - CHECK_HIP_ERROR(hipMemcpy(by[b], hy[b], sizeof(T) * y_size, hipMemcpyHostToDevice)); - } - CHECK_HIP_ERROR(hipMemcpy(dA, bA, sizeof(T*) * batch_count, hipMemcpyHostToDevice)); - CHECK_HIP_ERROR(hipMemcpy(dx, bx, sizeof(T*) * batch_count, hipMemcpyHostToDevice)); - CHECK_HIP_ERROR(hipMemcpy(dy, by, sizeof(T*) * batch_count, hipMemcpyHostToDevice)); + hipblas_init(hA, true); + hipblas_init(hx); + hipblas_init(hy); + + hA_cpu.copy_from(hA); + CHECK_HIP_ERROR(dA.transfer_from(hA)); + CHECK_HIP_ERROR(dx.transfer_from(hx)); + CHECK_HIP_ERROR(dy.transfer_from(hy)); + CHECK_HIP_ERROR(hipMemcpy(d_alpha, &h_alpha, sizeof(T), hipMemcpyHostToDevice)); /* ===================================================================== - ROCBLAS + HIPBLAS =================================================================== */ - if(argus.timing) - { - gpu_time_used = get_time_us(); // in microseconds - } - - for(int iter = 0; iter < 1; iter++) - { - status = 
hipblasHpr2BatchedFn( - handle, uplo, N, (T*)&alpha, dx, incx, dy, incy, dA, batch_count); - - if(status != HIPBLAS_STATUS_SUCCESS) - { - hipblasDestroy(handle); - return status; - } - } - - // copy output from device to CPU - for(int b = 0; b < batch_count; b++) - { - hipMemcpy(hA[b], bA[b], sizeof(T) * A_size, hipMemcpyDeviceToHost); - } - - if(argus.unit_check) + CHECK_HIPBLAS_ERROR(hipblasSetPointerMode(handle, HIPBLAS_POINTER_MODE_HOST)); + CHECK_HIPBLAS_ERROR(hipblasHpr2BatchedFn(handle, + uplo, + N, + (T*)&h_alpha, + dx.ptr_on_device(), + incx, + dy.ptr_on_device(), + incy, + dA.ptr_on_device(), + batch_count)); + + CHECK_HIP_ERROR(hA_host.transfer_from(dA)); + CHECK_HIP_ERROR(dA.transfer_from(hA)); + + CHECK_HIPBLAS_ERROR(hipblasSetPointerMode(handle, HIPBLAS_POINTER_MODE_DEVICE)); + CHECK_HIPBLAS_ERROR(hipblasHpr2BatchedFn(handle, + uplo, + N, + d_alpha, + dx.ptr_on_device(), + incx, + dy.ptr_on_device(), + incy, + dA.ptr_on_device(), + batch_count)); + + CHECK_HIP_ERROR(hA_device.transfer_from(dA)); + + if(argus.unit_check || argus.norm_check) { /* ===================================================================== CPU BLAS =================================================================== */ for(int b = 0; b < batch_count; b++) { - cblas_hpr2(uplo, N, alpha, hx[b], incx, hy[b], incy, hB[b]); + cblas_hpr2(uplo, N, h_alpha, hx[b], incx, hy[b], incy, hA_cpu[b]); } // enable unit check, notice unit check is not invasive, but norm check is, // unit check and norm check can not be interchanged their order if(argus.unit_check) { - unit_check_general(1, A_size, batch_count, 1, hB, hA); + unit_check_general(1, A_size, batch_count, 1, hA_cpu, hA_host); + unit_check_general(1, A_size, batch_count, 1, hA_cpu, hA_device); + } + if(argus.norm_check) + { + hipblas_error_host + = norm_check_general('F', 1, A_size, 1, hA_cpu, hA_host, batch_count); + hipblas_error_device + = norm_check_general('F', 1, A_size, 1, hA_cpu, hA_device, batch_count); + } + } + + if(argus.timing) + { + CHECK_HIP_ERROR(dA.transfer_from(hA)); + hipStream_t stream; + CHECK_HIPBLAS_ERROR(hipblasGetStream(handle, &stream)); + CHECK_HIPBLAS_ERROR(hipblasSetPointerMode(handle, HIPBLAS_POINTER_MODE_DEVICE)); + + int runs = argus.cold_iters + argus.iters; + for(int iter = 0; iter < runs; iter++) + { + if(iter == argus.cold_iters) + gpu_time_used = get_time_us_sync(stream); + + CHECK_HIPBLAS_ERROR(hipblasHpr2BatchedFn(handle, + uplo, + N, + d_alpha, + dx.ptr_on_device(), + incx, + dy.ptr_on_device(), + incy, + dA.ptr_on_device(), + batch_count)); } + gpu_time_used = get_time_us_sync(stream) - gpu_time_used; + + ArgumentModel{}.log_args( + std::cout, + argus, + gpu_time_used, + hpr2_gflop_count(N), + hpr2_gbyte_count(N), + hipblas_error_host, + hipblas_error_device); } - hipblasDestroy(handle); return HIPBLAS_STATUS_SUCCESS; } diff --git a/clients/include/testing_hpr2_strided_batched.hpp b/clients/include/testing_hpr2_strided_batched.hpp index 773595680..fd4ba16e6 100644 --- a/clients/include/testing_hpr2_strided_batched.hpp +++ b/clients/include/testing_hpr2_strided_batched.hpp @@ -1,5 +1,5 @@ /* ************************************************************************ - * Copyright 2016-2020 Advanced Micro Devices, Inc. + * Copyright 2016-2021 Advanced Micro Devices, Inc. 
* * ************************************************************************ */ @@ -36,8 +36,6 @@ hipblasStatus_t testing_hpr2_strided_batched(const Arguments& argus) int y_size = stride_y * batch_count; hipblasFillMode_t uplo = char2hipblas_fill(argus.uplo_option); - hipblasStatus_t status = HIPBLAS_STATUS_SUCCESS; - // argument sanity check, quick return if input parameters are invalid before allocating invalid // memory if(N < 0 || incx == 0 || incy == 0 || batch_count < 0) @@ -51,22 +49,22 @@ hipblasStatus_t testing_hpr2_strided_batched(const Arguments& argus) // Naming: dK is in GPU (device) memory. hK is in CPU (host) memory host_vector hA(A_size); - host_vector hB(A_size); + host_vector hA_cpu(A_size); + host_vector hA_host(A_size); + host_vector hA_device(A_size); host_vector hx(x_size); host_vector hy(y_size); device_vector dA(A_size); device_vector dx(x_size); device_vector dy(y_size); + device_vector d_alpha(1); - double gpu_time_used, cpu_time_used; - double hipblasGflops, cblas_gflops, hipblasBandwidth; - double rocblas_error; + double gpu_time_used, hipblas_error_host, hipblas_error_device; - T alpha = argus.get_alpha(); + T h_alpha = argus.get_alpha(); - hipblasHandle_t handle; - hipblasCreate(&handle); + hipblasLocalHandle handle(argus); // Initial Data on CPU srand(1); @@ -74,49 +72,54 @@ hipblasStatus_t testing_hpr2_strided_batched(const Arguments& argus) hipblas_init(hx, 1, N, incx, stride_x, batch_count); hipblas_init(hy, 1, N, incy, stride_y, batch_count); - // copy matrix is easy in STL; hB = hA: save a copy in hB which will be output of CPU BLAS - hB = hA; + // copy matrix is easy in STL; hA_cpu = hA: save a copy in hA_cpu which will be output of CPU BLAS + hA_cpu = hA; // copy data from CPU to device - hipMemcpy(dA, hA.data(), sizeof(T) * A_size, hipMemcpyHostToDevice); - hipMemcpy(dx, hx.data(), sizeof(T) * x_size, hipMemcpyHostToDevice); - hipMemcpy(dy, hy.data(), sizeof(T) * y_size, hipMemcpyHostToDevice); + CHECK_HIP_ERROR(hipMemcpy(dA, hA.data(), sizeof(T) * A_size, hipMemcpyHostToDevice)); + CHECK_HIP_ERROR(hipMemcpy(dx, hx.data(), sizeof(T) * x_size, hipMemcpyHostToDevice)); + CHECK_HIP_ERROR(hipMemcpy(dy, hy.data(), sizeof(T) * y_size, hipMemcpyHostToDevice)); + CHECK_HIP_ERROR(hipMemcpy(d_alpha, &h_alpha, sizeof(T), hipMemcpyHostToDevice)); /* ===================================================================== - ROCBLAS + HIPBLAS =================================================================== */ - if(argus.timing) - { - gpu_time_used = get_time_us(); // in microseconds - } - - for(int iter = 0; iter < 1; iter++) - { - status = hipblasHpr2StridedBatchedFn(handle, - uplo, - N, - (T*)&alpha, - dx, - incx, - stride_x, - dy, - incy, - stride_y, - dA, - stride_A, - batch_count); - - if(status != HIPBLAS_STATUS_SUCCESS) - { - hipblasDestroy(handle); - return status; - } - } - - // copy output from device to CPU - hipMemcpy(hA.data(), dA, sizeof(T) * A_size, hipMemcpyDeviceToHost); - - if(argus.unit_check) + CHECK_HIPBLAS_ERROR(hipblasSetPointerMode(handle, HIPBLAS_POINTER_MODE_HOST)); + CHECK_HIPBLAS_ERROR(hipblasHpr2StridedBatchedFn(handle, + uplo, + N, + (T*)&h_alpha, + dx, + incx, + stride_x, + dy, + incy, + stride_y, + dA, + stride_A, + batch_count)); + + CHECK_HIP_ERROR(hipMemcpy(hA_host.data(), dA, sizeof(T) * A_size, hipMemcpyDeviceToHost)); + CHECK_HIP_ERROR(hipMemcpy(dA, hA.data(), sizeof(T) * A_size, hipMemcpyHostToDevice)); + + CHECK_HIPBLAS_ERROR(hipblasSetPointerMode(handle, HIPBLAS_POINTER_MODE_DEVICE)); + 
CHECK_HIPBLAS_ERROR(hipblasHpr2StridedBatchedFn(handle, + uplo, + N, + d_alpha, + dx, + incx, + stride_x, + dy, + incy, + stride_y, + dA, + stride_A, + batch_count)); + + CHECK_HIP_ERROR(hipMemcpy(hA_device.data(), dA, sizeof(T) * A_size, hipMemcpyDeviceToHost)); + + if(argus.unit_check || argus.norm_check) { /* ===================================================================== CPU BLAS @@ -125,22 +128,77 @@ hipblasStatus_t testing_hpr2_strided_batched(const Arguments& argus) { cblas_hpr2(uplo, N, - alpha, + h_alpha, hx.data() + b * stride_x, incx, hy.data() + b * stride_y, incy, - hB.data() + b * stride_A); + hA_cpu.data() + b * stride_A); } // enable unit check, notice unit check is not invasive, but norm check is, // unit check and norm check can not be interchanged their order if(argus.unit_check) { - unit_check_general(1, dim_A, batch_count, 1, stride_A, hB.data(), hA.data()); + unit_check_general( + 1, dim_A, batch_count, 1, stride_A, hA_cpu.data(), hA_host.data()); + unit_check_general( + 1, dim_A, batch_count, 1, stride_A, hA_cpu.data(), hA_device.data()); + } + if(argus.norm_check) + { + hipblas_error_host = norm_check_general( + 'F', 1, dim_A, 1, stride_A, hA_cpu.data(), hA_host.data(), batch_count); + hipblas_error_device = norm_check_general( + 'F', 1, dim_A, 1, stride_A, hA_cpu.data(), hA_device.data(), batch_count); + } + } + + if(argus.timing) + { + CHECK_HIP_ERROR(hipMemcpy(dA, hA.data(), sizeof(T) * A_size, hipMemcpyHostToDevice)); + hipStream_t stream; + CHECK_HIPBLAS_ERROR(hipblasGetStream(handle, &stream)); + CHECK_HIPBLAS_ERROR(hipblasSetPointerMode(handle, HIPBLAS_POINTER_MODE_DEVICE)); + + int runs = argus.cold_iters + argus.iters; + for(int iter = 0; iter < runs; iter++) + { + if(iter == argus.cold_iters) + gpu_time_used = get_time_us_sync(stream); + + CHECK_HIPBLAS_ERROR(hipblasHpr2StridedBatchedFn(handle, + uplo, + N, + d_alpha, + dx, + incx, + stride_x, + dy, + incy, + stride_y, + dA, + stride_A, + batch_count)); } + gpu_time_used = get_time_us_sync(stream) - gpu_time_used; + + ArgumentModel{} + .log_args(std::cout, + argus, + gpu_time_used, + hpr2_gflop_count(N), + hpr2_gbyte_count(N), + hipblas_error_host, + hipblas_error_device); } - hipblasDestroy(handle); return HIPBLAS_STATUS_SUCCESS; } diff --git a/clients/include/testing_hpr_batched.hpp b/clients/include/testing_hpr_batched.hpp index fe55bab49..35802c259 100644 --- a/clients/include/testing_hpr_batched.hpp +++ b/clients/include/testing_hpr_batched.hpp @@ -1,5 +1,5 @@ /* ************************************************************************ - * Copyright 2016-2020 Advanced Micro Devices, Inc. + * Copyright 2016-2021 Advanced Micro Devices, Inc. * * ************************************************************************ */ @@ -14,9 +14,10 @@ using namespace std; /* ============================================================================================ */ -template +template hipblasStatus_t testing_hpr_batched(const Arguments& argus) { + using U = real_t; bool FORTRAN = argus.fortran; auto hipblasHprBatchedFn = FORTRAN ? 
hipblasHprBatched : hipblasHprBatched; @@ -29,13 +30,9 @@ hipblasStatus_t testing_hpr_batched(const Arguments& argus) int x_size = N * incx; hipblasFillMode_t uplo = char2hipblas_fill(argus.uplo_option); - double gpu_time_used, cpu_time_used; - double hipblasGflops, cblas_gflops, hipblasBandwidth; - double rocblas_error; + double gpu_time_used, hipblas_error_host, hipblas_error_device; - U alpha = argus.get_alpha(); - - hipblasStatus_t status = HIPBLAS_STATUS_SUCCESS; + U h_alpha = argus.get_alpha(); // argument sanity check, quick return if input parameters are invalid before allocating invalid // memory @@ -48,89 +45,105 @@ hipblasStatus_t testing_hpr_batched(const Arguments& argus) return HIPBLAS_STATUS_SUCCESS; } - hipblasHandle_t handle; - hipblasCreate(&handle); + hipblasLocalHandle handle(argus); // Naming: dK is in GPU (device) memory. hK is in CPU (host) memory - host_vector hA[batch_count]; - host_vector hB[batch_count]; - host_vector hx[batch_count]; - - device_batch_vector bA(batch_count, A_size); - device_batch_vector bx(batch_count, x_size); + host_batch_vector hA(A_size, 1, batch_count); + host_batch_vector hA_cpu(A_size, 1, batch_count); + host_batch_vector hA_host(A_size, 1, batch_count); + host_batch_vector hA_device(A_size, 1, batch_count); + host_batch_vector hx(N, incx, batch_count); - device_vector dA(batch_count); - device_vector dx(batch_count); + device_batch_vector dA(A_size, 1, batch_count); + device_batch_vector dx(N, incx, batch_count); + device_vector d_alpha(1); - int last = batch_count - 1; - if(!dA || !dx || (!bA[last] && A_size) || (!bx[last] && x_size)) - { - hipblasDestroy(handle); - return HIPBLAS_STATUS_ALLOC_FAILED; - } + CHECK_HIP_ERROR(dA.memcheck()); + CHECK_HIP_ERROR(dx.memcheck()); // Initial Data on CPU - srand(1); - for(int b = 0; b < batch_count; b++) - { - hA[b] = host_vector(A_size); - hB[b] = host_vector(A_size); - hx[b] = host_vector(x_size); - - srand(1); - hipblas_init(hA[b], 1, A_size, 1); - hipblas_init(hx[b], 1, N, incx); - hB[b] = hA[b]; + hipblas_init(hA, true); + hipblas_init(hx); - CHECK_HIP_ERROR(hipMemcpy(bA[b], hA[b], sizeof(T) * A_size, hipMemcpyHostToDevice)); - CHECK_HIP_ERROR(hipMemcpy(bx[b], hx[b], sizeof(T) * x_size, hipMemcpyHostToDevice)); - } - CHECK_HIP_ERROR(hipMemcpy(dA, bA, sizeof(T*) * batch_count, hipMemcpyHostToDevice)); - CHECK_HIP_ERROR(hipMemcpy(dx, bx, sizeof(T*) * batch_count, hipMemcpyHostToDevice)); + hA_cpu.copy_from(hA); + CHECK_HIP_ERROR(dA.transfer_from(hA)); + CHECK_HIP_ERROR(dx.transfer_from(hx)); + CHECK_HIP_ERROR(hipMemcpy(d_alpha, &h_alpha, sizeof(U), hipMemcpyHostToDevice)); /* ===================================================================== - ROCBLAS + HIPBLAS =================================================================== */ - if(argus.timing) - { - gpu_time_used = get_time_us(); // in microseconds - } + CHECK_HIPBLAS_ERROR(hipblasSetPointerMode(handle, HIPBLAS_POINTER_MODE_HOST)); + CHECK_HIPBLAS_ERROR(hipblasHprBatchedFn( + handle, uplo, N, (U*)&h_alpha, dx.ptr_on_device(), incx, dA.ptr_on_device(), batch_count)); - for(int iter = 0; iter < 1; iter++) - { - status = hipblasHprBatchedFn(handle, uplo, N, (U*)&alpha, dx, incx, dA, batch_count); + CHECK_HIP_ERROR(hA_host.transfer_from(dA)); + CHECK_HIP_ERROR(dA.transfer_from(hA)); - if(status != HIPBLAS_STATUS_SUCCESS) - { - hipblasDestroy(handle); - return status; - } - } + CHECK_HIPBLAS_ERROR(hipblasSetPointerMode(handle, HIPBLAS_POINTER_MODE_DEVICE)); + CHECK_HIPBLAS_ERROR(hipblasHprBatchedFn( + handle, uplo, N, d_alpha, 
dx.ptr_on_device(), incx, dA.ptr_on_device(), batch_count)); - // copy output from device to CPU - for(int b = 0; b < batch_count; b++) - { - hipMemcpy(hA[b], bA[b], sizeof(T) * A_size, hipMemcpyDeviceToHost); - } + CHECK_HIP_ERROR(hA_device.transfer_from(dA)); - if(argus.unit_check) + if(argus.unit_check || argus.norm_check) { /* ===================================================================== CPU BLAS =================================================================== */ for(int b = 0; b < batch_count; b++) { - cblas_hpr(uplo, N, alpha, hx[b], incx, hB[b]); + cblas_hpr(uplo, N, h_alpha, hx[b], incx, hA_cpu[b]); } // enable unit check, notice unit check is not invasive, but norm check is, // unit check and norm check can not be interchanged their order if(argus.unit_check) { - unit_check_general(1, A_size, batch_count, 1, hB, hA); + unit_check_general(1, A_size, batch_count, 1, hA_cpu, hA_host); + unit_check_general(1, A_size, batch_count, 1, hA_cpu, hA_device); + } + if(argus.norm_check) + { + hipblas_error_host + = norm_check_general('F', 1, A_size, 1, hA_cpu, hA_host, batch_count); + hipblas_error_device + = norm_check_general('F', 1, A_size, 1, hA_cpu, hA_device, batch_count); + } + } + + if(argus.timing) + { + CHECK_HIP_ERROR(dA.transfer_from(hA)); + hipStream_t stream; + CHECK_HIPBLAS_ERROR(hipblasGetStream(handle, &stream)); + CHECK_HIPBLAS_ERROR(hipblasSetPointerMode(handle, HIPBLAS_POINTER_MODE_DEVICE)); + + int runs = argus.cold_iters + argus.iters; + for(int iter = 0; iter < runs; iter++) + { + if(iter == argus.cold_iters) + gpu_time_used = get_time_us_sync(stream); + + CHECK_HIPBLAS_ERROR(hipblasHprBatchedFn(handle, + uplo, + N, + d_alpha, + dx.ptr_on_device(), + incx, + dA.ptr_on_device(), + batch_count)); } + gpu_time_used = get_time_us_sync(stream) - gpu_time_used; + + ArgumentModel{}.log_args(std::cout, + argus, + gpu_time_used, + hpr_gflop_count(N), + hpr_gbyte_count(N), + hipblas_error_host, + hipblas_error_device); } - hipblasDestroy(handle); return HIPBLAS_STATUS_SUCCESS; } diff --git a/clients/include/testing_hpr_strided_batched.hpp b/clients/include/testing_hpr_strided_batched.hpp index 2a0426c29..1a5ffe098 100644 --- a/clients/include/testing_hpr_strided_batched.hpp +++ b/clients/include/testing_hpr_strided_batched.hpp @@ -1,5 +1,5 @@ /* ************************************************************************ - * Copyright 2016-2020 Advanced Micro Devices, Inc. + * Copyright 2016-2021 Advanced Micro Devices, Inc. * * ************************************************************************ */ @@ -14,9 +14,10 @@ using namespace std; /* ============================================================================================ */ -template +template hipblasStatus_t testing_hpr_strided_batched(const Arguments& argus) { + using U = real_t; bool FORTRAN = argus.fortran; auto hipblasHprStridedBatchedFn = FORTRAN ? hipblasHprStridedBatched : hipblasHprStridedBatched; @@ -33,8 +34,6 @@ hipblasStatus_t testing_hpr_strided_batched(const Arguments& argus) int x_size = stride_x * batch_count; hipblasFillMode_t uplo = char2hipblas_fill(argus.uplo_option); - hipblasStatus_t status = HIPBLAS_STATUS_SUCCESS; - // argument sanity check, quick return if input parameters are invalid before allocating invalid // memory if(N < 0 || incx == 0 || batch_count < 0) @@ -48,74 +47,106 @@ hipblasStatus_t testing_hpr_strided_batched(const Arguments& argus) // Naming: dK is in GPU (device) memory. 
hK is in CPU (host) memory host_vector hA(A_size); - host_vector hB(A_size); + host_vector hA_cpu(A_size); + host_vector hA_host(A_size); + host_vector hA_device(A_size); host_vector hx(x_size); device_vector dA(A_size); device_vector dx(x_size); + device_vector d_alpha(1); - double gpu_time_used, cpu_time_used; - double hipblasGflops, cblas_gflops, hipblasBandwidth; - double rocblas_error; + double gpu_time_used, hipblas_error_host, hipblas_error_device; - U alpha = argus.get_alpha(); + U h_alpha = argus.get_alpha(); - hipblasHandle_t handle; - hipblasCreate(&handle); + hipblasLocalHandle handle(argus); // Initial Data on CPU srand(1); hipblas_init(hA, 1, dim_A, 1, stride_A, batch_count); hipblas_init(hx, 1, N, incx, stride_x, batch_count); - // copy matrix is easy in STL; hB = hA: save a copy in hB which will be output of CPU BLAS - hB = hA; + // copy matrix is easy in STL; hA_cpu = hA: save a copy in hA_cpu which will be output of CPU BLAS + hA_cpu = hA; // copy data from CPU to device - hipMemcpy(dA, hA.data(), sizeof(T) * A_size, hipMemcpyHostToDevice); - hipMemcpy(dx, hx.data(), sizeof(T) * x_size, hipMemcpyHostToDevice); + CHECK_HIP_ERROR(hipMemcpy(dA, hA.data(), sizeof(T) * A_size, hipMemcpyHostToDevice)); + CHECK_HIP_ERROR(hipMemcpy(dx, hx.data(), sizeof(T) * x_size, hipMemcpyHostToDevice)); + CHECK_HIP_ERROR(hipMemcpy(d_alpha, &h_alpha, sizeof(U), hipMemcpyHostToDevice)); /* ===================================================================== - ROCBLAS + HIPBLAS =================================================================== */ - if(argus.timing) - { - gpu_time_used = get_time_us(); // in microseconds - } + CHECK_HIPBLAS_ERROR(hipblasSetPointerMode(handle, HIPBLAS_POINTER_MODE_HOST)); + CHECK_HIPBLAS_ERROR(hipblasHprStridedBatchedFn( + handle, uplo, N, (U*)&h_alpha, dx, incx, stride_x, dA, stride_A, batch_count)); - for(int iter = 0; iter < 1; iter++) - { - status = hipblasHprStridedBatchedFn( - handle, uplo, N, (U*)&alpha, dx, incx, stride_x, dA, stride_A, batch_count); + CHECK_HIP_ERROR(hipMemcpy(hA_host.data(), dA, sizeof(T) * A_size, hipMemcpyDeviceToHost)); + CHECK_HIP_ERROR(hipMemcpy(dA, hA.data(), sizeof(T) * A_size, hipMemcpyHostToDevice)); - if(status != HIPBLAS_STATUS_SUCCESS) - { - hipblasDestroy(handle); - return status; - } - } + CHECK_HIPBLAS_ERROR(hipblasSetPointerMode(handle, HIPBLAS_POINTER_MODE_DEVICE)); + CHECK_HIPBLAS_ERROR(hipblasHprStridedBatchedFn( + handle, uplo, N, d_alpha, dx, incx, stride_x, dA, stride_A, batch_count)); - // copy output from device to CPU - hipMemcpy(hA.data(), dA, sizeof(T) * A_size, hipMemcpyDeviceToHost); + CHECK_HIP_ERROR(hipMemcpy(hA_device.data(), dA, sizeof(T) * A_size, hipMemcpyDeviceToHost)); - if(argus.unit_check) + if(argus.unit_check || argus.norm_check) { /* ===================================================================== CPU BLAS =================================================================== */ for(int b = 0; b < batch_count; b++) { - cblas_hpr(uplo, N, alpha, hx.data() + b * stride_x, incx, hB.data() + b * stride_A); + cblas_hpr( + uplo, N, h_alpha, hx.data() + b * stride_x, incx, hA_cpu.data() + b * stride_A); } // enable unit check, notice unit check is not invasive, but norm check is, // unit check and norm check can not be interchanged their order if(argus.unit_check) { - unit_check_general(1, dim_A, batch_count, 1, stride_A, hB.data(), hA.data()); + unit_check_general( + 1, dim_A, batch_count, 1, stride_A, hA_cpu.data(), hA_host.data()); + unit_check_general( + 1, dim_A, batch_count, 1, stride_A, 
hA_cpu.data(), hA_device.data()); + } + if(argus.norm_check) + { + hipblas_error_host = norm_check_general( + 'F', 1, dim_A, 1, stride_A, hA_cpu.data(), hA_host.data(), batch_count); + hipblas_error_device = norm_check_general( + 'F', 1, dim_A, 1, stride_A, hA_cpu.data(), hA_device.data(), batch_count); + } + } + + if(argus.timing) + { + CHECK_HIP_ERROR(hipMemcpy(dA, hA.data(), sizeof(T) * A_size, hipMemcpyHostToDevice)); + hipStream_t stream; + CHECK_HIPBLAS_ERROR(hipblasGetStream(handle, &stream)); + CHECK_HIPBLAS_ERROR(hipblasSetPointerMode(handle, HIPBLAS_POINTER_MODE_DEVICE)); + + int runs = argus.cold_iters + argus.iters; + for(int iter = 0; iter < runs; iter++) + { + if(iter == argus.cold_iters) + gpu_time_used = get_time_us_sync(stream); + + CHECK_HIPBLAS_ERROR(hipblasHprStridedBatchedFn( + handle, uplo, N, d_alpha, dx, incx, stride_x, dA, stride_A, batch_count)); } + gpu_time_used = get_time_us_sync(stream) - gpu_time_used; + + ArgumentModel{}.log_args( + std::cout, + argus, + gpu_time_used, + hpr_gflop_count(N), + hpr_gbyte_count(N), + hipblas_error_host, + hipblas_error_device); } - hipblasDestroy(handle); return HIPBLAS_STATUS_SUCCESS; } From 4deab90893e7ab81192f9a728d3e2a4681bed041 Mon Sep 17 00:00:00 2001 From: Daine McNiven <51674140+daineAMD@users.noreply.github.com> Date: Fri, 26 Mar 2021 16:06:14 -0600 Subject: [PATCH 7/8] Removing unsupported tests on cuBLAS (#325) --- clients/gtest/axpy_ex_gtest.cpp | 14 +- clients/gtest/blas1_gtest.cpp | 175 ++++++++++++------ clients/gtest/dgmm_gtest.cpp | 14 +- clients/gtest/dot_ex_gtest.cpp | 22 ++- clients/gtest/gbmv_batched_gtest.cpp | 8 +- clients/gtest/gbmv_strided_batched_gtest.cpp | 10 +- clients/gtest/gemv_batched_gtest.cpp | 7 +- clients/gtest/gemv_strided_batched_gtest.cpp | 7 +- clients/gtest/geqrf_gtest.cpp | 14 +- clients/gtest/geqrf_strided_batched_gtest.cpp | 14 +- clients/gtest/ger_gtest.cpp | 18 +- clients/gtest/getrf_gtest.cpp | 14 +- clients/gtest/getrf_strided_batched_gtest.cpp | 14 +- clients/gtest/getrs_gtest.cpp | 14 +- clients/gtest/getrs_strided_batched_gtest.cpp | 14 +- clients/gtest/hbmv_gtest.cpp | 10 +- clients/gtest/hemm_gtest.cpp | 10 +- clients/gtest/hemv_batched_gtest.cpp | 6 +- clients/gtest/hemv_strided_batched_gtest.cpp | 7 +- clients/gtest/her2_gtest.cpp | 14 +- clients/gtest/her2k_gtest.cpp | 14 +- clients/gtest/her_gtest.cpp | 12 +- clients/gtest/herk_gtest.cpp | 14 +- clients/gtest/herkx_gtest.cpp | 14 +- clients/gtest/hpmv_gtest.cpp | 10 +- clients/gtest/hpr2_gtest.cpp | 14 +- clients/gtest/hpr_gtest.cpp | 12 +- clients/gtest/nrm2_ex_gtest.cpp | 10 +- clients/gtest/rot_ex_gtest.cpp | 10 +- clients/gtest/sbmv_gtest.cpp | 10 +- clients/gtest/scal_ex_gtest.cpp | 12 +- clients/gtest/spmv_gtest.cpp | 10 +- clients/gtest/spr2_gtest.cpp | 10 +- clients/gtest/spr_gtest.cpp | 16 +- clients/gtest/symm_gtest.cpp | 14 +- clients/gtest/symv_gtest.cpp | 10 +- clients/gtest/syr2_gtest.cpp | 14 +- clients/gtest/syr2k_gtest.cpp | 14 +- clients/gtest/syr_gtest.cpp | 14 +- clients/gtest/syrk_gtest.cpp | 14 +- clients/gtest/syrkx_gtest.cpp | 14 +- clients/gtest/tbmv_gtest.cpp | 10 +- clients/gtest/tbsv_gtest.cpp | 10 +- clients/gtest/tpmv_gtest.cpp | 10 +- clients/gtest/tpsv_gtest.cpp | 14 +- clients/gtest/trmm_gtest.cpp | 12 +- clients/gtest/trmv_gtest.cpp | 10 +- clients/gtest/trsm_ex_gtest.cpp | 18 +- clients/gtest/trsm_gtest.cpp | 10 +- clients/gtest/trsv_gtest.cpp | 14 +- clients/gtest/trtri_gtest.cpp | 12 +- clients/include/testing_her.hpp | 7 +- clients/include/testing_her2.hpp | 7 +- 53 files changed, 535 
insertions(+), 267 deletions(-) diff --git a/clients/gtest/axpy_ex_gtest.cpp b/clients/gtest/axpy_ex_gtest.cpp index 36b587d3f..f1557fc12 100644 --- a/clients/gtest/axpy_ex_gtest.cpp +++ b/clients/gtest/axpy_ex_gtest.cpp @@ -1,5 +1,5 @@ /* ************************************************************************ - * dotright 2016 Advanced Micro Devices, Inc. + * Copyright 2016-2021 Advanced Micro Devices, Inc. * * ************************************************************************ */ @@ -74,9 +74,11 @@ const int batch_count_range[] = {-1, 0, 1, 2, 10}; // Supported rocBLAS configs const vector> precisions{ - // No cuBLAS support +// No cuBLAS support +#ifndef __HIP_PLATFORM_NVCC__ {HIPBLAS_R_16F, HIPBLAS_R_16F, HIPBLAS_R_16F, HIPBLAS_R_16F}, {HIPBLAS_R_16F, HIPBLAS_R_16F, HIPBLAS_R_16F, HIPBLAS_R_32F}, +#endif {HIPBLAS_R_32F, HIPBLAS_R_16F, HIPBLAS_R_16F, HIPBLAS_R_32F}, {HIPBLAS_R_32F, HIPBLAS_R_32F, HIPBLAS_R_32F, HIPBLAS_R_32F}, @@ -144,6 +146,8 @@ TEST_P(axpy_ex_gtest, axpy_ex) } } +#ifndef __HIP_PLATFORM_NVCC__ + TEST_P(axpy_ex_gtest, axpy_batched_ex) { Arguments arg = setup_axpy_ex_arguments(GetParam()); @@ -157,7 +161,7 @@ TEST_P(axpy_ex_gtest, axpy_batched_ex) } else { - EXPECT_EQ(HIPBLAS_STATUS_NOT_SUPPORTED, status); // for CUDA + EXPECT_EQ(HIPBLAS_STATUS_SUCCESS, status); // fail } } } @@ -175,11 +179,13 @@ TEST_P(axpy_ex_gtest, axpy_strided_batched_ex) } else { - EXPECT_EQ(HIPBLAS_STATUS_NOT_SUPPORTED, status); // for CUDA + EXPECT_EQ(HIPBLAS_STATUS_SUCCESS, status); // fail } } } +#endif + // Values is for a single item; ValuesIn is for an array // notice we are using vector of vector // so each elment in xxx_range is a avector, diff --git a/clients/gtest/blas1_gtest.cpp b/clients/gtest/blas1_gtest.cpp index 1225da1e3..a72b2aa13 100644 --- a/clients/gtest/blas1_gtest.cpp +++ b/clients/gtest/blas1_gtest.cpp @@ -1,5 +1,5 @@ /* ************************************************************************ - * dotright 2016 Advanced Micro Devices, Inc. + * Copyright 2016-2021 Advanced Micro Devices, Inc. 
* * ************************************************************************ */ @@ -157,6 +157,8 @@ Arguments setup_blas1_arguments(blas1_tuple tup) } // axpy + +#ifndef __HIP_PLATFORM_NVCC__ TEST_P(blas1_gtest, axpy_half) { Arguments arg = setup_blas1_arguments(GetParam()); @@ -174,10 +176,11 @@ TEST_P(blas1_gtest, axpy_half) } else { - EXPECT_EQ(HIPBLAS_STATUS_NOT_SUPPORTED, status); // for cuda + EXPECT_EQ(HIPBLAS_STATUS_SUCCESS, status); // fail } } } +#endif TEST_P(blas1_gtest, axpy_float) { @@ -223,6 +226,8 @@ TEST_P(blas1_gtest, axpy_float_complex) } } +#ifndef __HIP_PLATFORM_NVCC__ + // axpy_batched TEST_P(blas1_gtest, axpy_batched_float) { @@ -245,7 +250,7 @@ TEST_P(blas1_gtest, axpy_batched_float) } else { - EXPECT_EQ(HIPBLAS_STATUS_NOT_SUPPORTED, status); // for cuda + EXPECT_EQ(HIPBLAS_STATUS_SUCCESS, status); // fail } } } @@ -271,7 +276,7 @@ TEST_P(blas1_gtest, axpy_batched_float_complex) } else { - EXPECT_EQ(HIPBLAS_STATUS_NOT_SUPPORTED, status); // for cuda + EXPECT_EQ(HIPBLAS_STATUS_SUCCESS, status); // fail } } } @@ -298,7 +303,7 @@ TEST_P(blas1_gtest, axpy_strided_batched_float) } else { - EXPECT_EQ(HIPBLAS_STATUS_NOT_SUPPORTED, status); // for cuda + EXPECT_EQ(HIPBLAS_STATUS_SUCCESS, status); // fail } } } @@ -324,11 +329,13 @@ TEST_P(blas1_gtest, axpy_strided_batched_float_complex) } else { - EXPECT_EQ(HIPBLAS_STATUS_NOT_SUPPORTED, status); // for cuda + EXPECT_EQ(HIPBLAS_STATUS_SUCCESS, status); // fail } } } +#endif + // copy tests TEST_P(blas1_gtest, copy_float) { @@ -374,6 +381,8 @@ TEST_P(blas1_gtest, copy_float_complex) } } +#ifndef __HIP_PLATFORM_NVCC__ + // copy_batched tests TEST_P(blas1_gtest, copy_batched_float) { @@ -396,7 +405,7 @@ TEST_P(blas1_gtest, copy_batched_float) } else { - EXPECT_EQ(HIPBLAS_STATUS_NOT_SUPPORTED, status); // for cuda + EXPECT_EQ(HIPBLAS_STATUS_SUCCESS, status); // fail } } } @@ -422,7 +431,7 @@ TEST_P(blas1_gtest, copy_batched_float_complex) } else { - EXPECT_EQ(HIPBLAS_STATUS_NOT_SUPPORTED, status); // for cuda + EXPECT_EQ(HIPBLAS_STATUS_SUCCESS, status); // fail } } } @@ -449,7 +458,7 @@ TEST_P(blas1_gtest, copy_strided_batched_float) } else { - EXPECT_EQ(HIPBLAS_STATUS_NOT_SUPPORTED, status); // for cuda + EXPECT_EQ(HIPBLAS_STATUS_SUCCESS, status); // fail } } } @@ -475,11 +484,13 @@ TEST_P(blas1_gtest, copy_strided_batched_float_complex) } else { - EXPECT_EQ(HIPBLAS_STATUS_NOT_SUPPORTED, status); // for cuda + EXPECT_EQ(HIPBLAS_STATUS_SUCCESS, status); // fail } } } +#endif + // scal tests TEST_P(blas1_gtest, scal_float) { @@ -560,6 +571,8 @@ TEST_P(blas1_gtest, scal_float_complex_float) } } +#ifndef __HIP_PLATFORM_NVCC__ + // scal_batched tests TEST_P(blas1_gtest, scal_batched_float) { @@ -748,6 +761,8 @@ TEST_P(blas1_gtest, scal_strided_batched_float_complex_float) } } +#endif + // swap tests TEST_P(blas1_gtest, swap_float) { @@ -793,6 +808,8 @@ TEST_P(blas1_gtest, swap_float_complex) } } +#ifndef __HIP_PLATFORM_NVCC__ + // swap_batched tests TEST_P(blas1_gtest, swap_batched_float) { @@ -815,7 +832,7 @@ TEST_P(blas1_gtest, swap_batched_float) } else { - EXPECT_EQ(HIPBLAS_STATUS_NOT_SUPPORTED, status); // for cuda + EXPECT_EQ(HIPBLAS_STATUS_SUCCESS, status); } } } @@ -841,7 +858,7 @@ TEST_P(blas1_gtest, swap_batched_float_complex) } else { - EXPECT_EQ(HIPBLAS_STATUS_NOT_SUPPORTED, status); // for cuda + EXPECT_EQ(HIPBLAS_STATUS_SUCCESS, status); } } } @@ -868,7 +885,7 @@ TEST_P(blas1_gtest, swap_strided_batched_float) } else { - EXPECT_EQ(HIPBLAS_STATUS_NOT_SUPPORTED, status); // for cuda + EXPECT_EQ(HIPBLAS_STATUS_SUCCESS, 
status); } } } @@ -894,12 +911,15 @@ TEST_P(blas1_gtest, swap_strided_batched_float_complex) } else { - EXPECT_EQ(HIPBLAS_STATUS_NOT_SUPPORTED, status); // for cuda + EXPECT_EQ(HIPBLAS_STATUS_SUCCESS, status); } } } +#endif + // dot tests +#ifndef __HIP_PLATFORM_NVCC__ TEST_P(blas1_gtest, dot_half) { // GetParam return a tuple. Tee setup routine unpack the tuple @@ -926,7 +946,7 @@ TEST_P(blas1_gtest, dot_half) } else { - EXPECT_EQ(HIPBLAS_STATUS_NOT_SUPPORTED, status); // for cuda + EXPECT_EQ(HIPBLAS_STATUS_SUCCESS, status); } } } @@ -957,10 +977,11 @@ TEST_P(blas1_gtest, dot_bfloat16) } else { - EXPECT_EQ(HIPBLAS_STATUS_NOT_SUPPORTED, status); // for cuda + EXPECT_EQ(HIPBLAS_STATUS_SUCCESS, status); } } } +#endif TEST_P(blas1_gtest, dot_float) { @@ -987,7 +1008,7 @@ TEST_P(blas1_gtest, dot_float) } else { - EXPECT_EQ(HIPBLAS_STATUS_NOT_SUPPORTED, status); // for cuda + EXPECT_EQ(HIPBLAS_STATUS_SUCCESS, status); } } } @@ -1044,6 +1065,8 @@ TEST_P(blas1_gtest, dotc_float_complex) } } +#ifndef __HIP_PLATFORM_NVCC__ + // dot_batched tests TEST_P(blas1_gtest, dot_batched_half) { @@ -1075,7 +1098,7 @@ TEST_P(blas1_gtest, dot_batched_half) } else { - EXPECT_EQ(HIPBLAS_STATUS_NOT_SUPPORTED, status); // for cuda + EXPECT_EQ(HIPBLAS_STATUS_SUCCESS, status); } } } @@ -1110,7 +1133,7 @@ TEST_P(blas1_gtest, dot_batched_bfloat16) } else { - EXPECT_EQ(HIPBLAS_STATUS_NOT_SUPPORTED, status); // for cuda + EXPECT_EQ(HIPBLAS_STATUS_SUCCESS, status); } } } @@ -1144,7 +1167,7 @@ TEST_P(blas1_gtest, dot_batched_float) } else { - EXPECT_EQ(HIPBLAS_STATUS_NOT_SUPPORTED, status); // for cuda + EXPECT_EQ(HIPBLAS_STATUS_SUCCESS, status); } } } @@ -1174,7 +1197,7 @@ TEST_P(blas1_gtest, dotu_batched_float_complex) } else { - EXPECT_EQ(HIPBLAS_STATUS_NOT_SUPPORTED, status); // for cuda + EXPECT_EQ(HIPBLAS_STATUS_SUCCESS, status); } } } @@ -1204,7 +1227,7 @@ TEST_P(blas1_gtest, dotc_batched_float_complex) } else { - EXPECT_EQ(HIPBLAS_STATUS_NOT_SUPPORTED, status); // for cuda + EXPECT_EQ(HIPBLAS_STATUS_SUCCESS, status); } } } @@ -1240,7 +1263,7 @@ TEST_P(blas1_gtest, dot_strided_batched_half) } else { - EXPECT_EQ(HIPBLAS_STATUS_NOT_SUPPORTED, status); // for cuda + EXPECT_EQ(HIPBLAS_STATUS_SUCCESS, status); } } } @@ -1275,7 +1298,7 @@ TEST_P(blas1_gtest, dot_strided_batched_bfloat16) } else { - EXPECT_EQ(HIPBLAS_STATUS_NOT_SUPPORTED, status); // for cuda + EXPECT_EQ(HIPBLAS_STATUS_SUCCESS, status); } } } @@ -1309,7 +1332,7 @@ TEST_P(blas1_gtest, dot_strided_batched_float) } else { - EXPECT_EQ(HIPBLAS_STATUS_NOT_SUPPORTED, status); // for cuda + EXPECT_EQ(HIPBLAS_STATUS_SUCCESS, status); } } } @@ -1339,7 +1362,7 @@ TEST_P(blas1_gtest, dotu_strided_batched_float_complex) } else { - EXPECT_EQ(HIPBLAS_STATUS_NOT_SUPPORTED, status); // for cuda + EXPECT_EQ(HIPBLAS_STATUS_SUCCESS, status); } } } @@ -1369,11 +1392,13 @@ TEST_P(blas1_gtest, dotc_strided_batched_float_complex) } else { - EXPECT_EQ(HIPBLAS_STATUS_NOT_SUPPORTED, status); // for cuda + EXPECT_EQ(HIPBLAS_STATUS_SUCCESS, status); } } } +#endif + // nrm2 tests TEST_P(blas1_gtest, nrm2_float) { @@ -1427,6 +1452,8 @@ TEST_P(blas1_gtest, nrm2_float_complex) } } +#ifndef __HIP_PLATFORM_NVCC__ + // nrm2_batched tests TEST_P(blas1_gtest, nrm2_batched_float) { @@ -1453,7 +1480,7 @@ TEST_P(blas1_gtest, nrm2_batched_float) } else { - EXPECT_EQ(HIPBLAS_STATUS_NOT_SUPPORTED, status); // for cuda + EXPECT_EQ(HIPBLAS_STATUS_SUCCESS, status); } } } @@ -1483,7 +1510,7 @@ TEST_P(blas1_gtest, nrm2_batched_float_complex) } else { - EXPECT_EQ(HIPBLAS_STATUS_NOT_SUPPORTED, status); // 
for cuda + EXPECT_EQ(HIPBLAS_STATUS_SUCCESS, status); // fail } } } @@ -1514,7 +1541,7 @@ TEST_P(blas1_gtest, nrm2_strided_batched_float) } else { - EXPECT_EQ(HIPBLAS_STATUS_NOT_SUPPORTED, status); // for cuda + EXPECT_EQ(HIPBLAS_STATUS_SUCCESS, status); // fail } } } @@ -1544,11 +1571,13 @@ TEST_P(blas1_gtest, nrm2_strided_batched_float_complex) } else { - EXPECT_EQ(HIPBLAS_STATUS_NOT_SUPPORTED, status); // for cuda + EXPECT_EQ(HIPBLAS_STATUS_SUCCESS, status); // fail } } } +#endif + // rot TEST_P(blas1_gtest, rot_float) { @@ -1595,6 +1624,8 @@ TEST_P(blas1_gtest, rot_float_complex_float) } } +#ifndef __HIP_PLATFORM_NVCC__ + // rot_batched TEST_P(blas1_gtest, rot_batched_float) { @@ -1613,7 +1644,7 @@ TEST_P(blas1_gtest, rot_batched_float) } else { - EXPECT_EQ(HIPBLAS_STATUS_NOT_SUPPORTED, status); // for cuda + EXPECT_EQ(HIPBLAS_STATUS_SUCCESS, status); // fail } } } @@ -1635,7 +1666,7 @@ TEST_P(blas1_gtest, rot_batched_float_complex) } else { - EXPECT_EQ(HIPBLAS_STATUS_NOT_SUPPORTED, status); // for cuda + EXPECT_EQ(HIPBLAS_STATUS_SUCCESS, status); // fail } } } @@ -1657,7 +1688,7 @@ TEST_P(blas1_gtest, rot_batched_float_complex_float) } else { - EXPECT_EQ(HIPBLAS_STATUS_NOT_SUPPORTED, status); // for cuda + EXPECT_EQ(HIPBLAS_STATUS_SUCCESS, status); // fail } } } @@ -1680,7 +1711,7 @@ TEST_P(blas1_gtest, rot_strided_batched_float) } else { - EXPECT_EQ(HIPBLAS_STATUS_NOT_SUPPORTED, status); // for cuda + EXPECT_EQ(HIPBLAS_STATUS_SUCCESS, status); // fail } } } @@ -1702,7 +1733,7 @@ TEST_P(blas1_gtest, rot_strided_batched_float_complex) } else { - EXPECT_EQ(HIPBLAS_STATUS_NOT_SUPPORTED, status); // for cuda + EXPECT_EQ(HIPBLAS_STATUS_SUCCESS, status); // fail } } } @@ -1724,11 +1755,13 @@ TEST_P(blas1_gtest, rot_strided_batched_float_complex_float) } else { - EXPECT_EQ(HIPBLAS_STATUS_NOT_SUPPORTED, status); // for cuda + EXPECT_EQ(HIPBLAS_STATUS_SUCCESS, status); // fail } } } +#endif + // rotg TEST_P(blas1_gtest, rotg_float) { @@ -1752,6 +1785,8 @@ TEST_P(blas1_gtest, rotg_float_complex) } } +#ifndef __HIP_PLATFORM_NVCC__ + // rotg_batched TEST_P(blas1_gtest, rotg_batched_float) { @@ -1766,7 +1801,7 @@ TEST_P(blas1_gtest, rotg_batched_float) } else { - EXPECT_EQ(HIPBLAS_STATUS_NOT_SUPPORTED, status); // for cuda + EXPECT_EQ(HIPBLAS_STATUS_SUCCESS, status); // fail } } } @@ -1784,7 +1819,7 @@ TEST_P(blas1_gtest, rotg_batched_float_complex) } else { - EXPECT_EQ(HIPBLAS_STATUS_NOT_SUPPORTED, status); // for cuda + EXPECT_EQ(HIPBLAS_STATUS_SUCCESS, status); // fail } } } @@ -1803,7 +1838,7 @@ TEST_P(blas1_gtest, rotg_strided_batched_float) } else { - EXPECT_EQ(HIPBLAS_STATUS_NOT_SUPPORTED, status); // for cuda + EXPECT_EQ(HIPBLAS_STATUS_SUCCESS, status); // fail } } } @@ -1821,11 +1856,13 @@ TEST_P(blas1_gtest, rotg_strided_batched_float_complex) } else { - EXPECT_EQ(HIPBLAS_STATUS_NOT_SUPPORTED, status); // for cuda + EXPECT_EQ(HIPBLAS_STATUS_SUCCESS, status); // fail } } } +#endif + // rotm TEST_P(blas1_gtest, rotm_float) { @@ -1838,6 +1875,8 @@ TEST_P(blas1_gtest, rotm_float) } } +#ifndef __HIP_PLATFORM_NVCC__ + // rotm_batched TEST_P(blas1_gtest, rotm_batched_float) { @@ -1852,7 +1891,7 @@ TEST_P(blas1_gtest, rotm_batched_float) } else { - EXPECT_EQ(HIPBLAS_STATUS_NOT_SUPPORTED, status); // for cuda + EXPECT_EQ(HIPBLAS_STATUS_SUCCESS, status); // fail } } } @@ -1871,11 +1910,13 @@ TEST_P(blas1_gtest, rotm_strided_batched_float) } else { - EXPECT_EQ(HIPBLAS_STATUS_NOT_SUPPORTED, status); // for cuda + EXPECT_EQ(HIPBLAS_STATUS_SUCCESS, status); // fail } } } +#endif + // rotmg 
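// Every hunk in this commit applies the same change: batched and
// strided-batched tests that cuBLAS does not implement are compiled out
// under __HIP_PLATFORM_NVCC__ instead of being excused at runtime with
// HIPBLAS_STATUS_NOT_SUPPORTED. A minimal sketch of the resulting shape,
// assuming a hypothetical testing_foo_batched<T>() helper with the same
// signature as the real testing_* routines (the exact argument-error
// condition varies per routine):
#ifndef __HIP_PLATFORM_NVCC__
TEST_P(blas1_gtest, foo_batched_float)
{
    Arguments arg = setup_blas1_arguments(GetParam());

    hipblasStatus_t status = testing_foo_batched<float>(arg);

    if(status != HIPBLAS_STATUS_SUCCESS)
    {
        if(arg.N < 0 || arg.incx <= 0 || arg.batch_count < 0)
        {
            // invalid sizes are still rejected on every backend
            EXPECT_EQ(HIPBLAS_STATUS_INVALID_VALUE, status);
        }
        else
        {
            // no cuBLAS escape hatch any more: any other status fails the test
            EXPECT_EQ(HIPBLAS_STATUS_SUCCESS, status); // fail
        }
    }
}
#endif
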
TEST_P(blas1_gtest, rotmg_float) { @@ -1888,6 +1929,8 @@ TEST_P(blas1_gtest, rotmg_float) } } +#ifndef __HIP_PLATFORM_NVCC__ + // rotmg_batched TEST_P(blas1_gtest, rotmg_batched_float) { @@ -1902,7 +1945,7 @@ TEST_P(blas1_gtest, rotmg_batched_float) } else { - EXPECT_EQ(HIPBLAS_STATUS_NOT_SUPPORTED, status); // for cuda + EXPECT_EQ(HIPBLAS_STATUS_SUCCESS, status); // fail } } } @@ -1921,11 +1964,13 @@ TEST_P(blas1_gtest, rotmg_strided_batched_float) } else { - EXPECT_EQ(HIPBLAS_STATUS_NOT_SUPPORTED, status); // for cuda + EXPECT_EQ(HIPBLAS_STATUS_SUCCESS, status); // fail } } } +#endif + // asum TEST_P(blas1_gtest, asum_float) { @@ -1997,6 +2042,8 @@ TEST_P(blas1_gtest, asum_double_complex) } } +#ifndef __HIP_PLATFORM_NVCC__ + // asum_batched TEST_P(blas1_gtest, asum_batched_float) { @@ -2019,7 +2066,7 @@ TEST_P(blas1_gtest, asum_batched_float) } else { - EXPECT_EQ(HIPBLAS_STATUS_NOT_SUPPORTED, status); // for cuda + EXPECT_EQ(HIPBLAS_STATUS_SUCCESS, status); // fail } } } @@ -2045,7 +2092,7 @@ TEST_P(blas1_gtest, asum_batched_float_complex) } else { - EXPECT_EQ(HIPBLAS_STATUS_NOT_SUPPORTED, status); // for cuda + EXPECT_EQ(HIPBLAS_STATUS_SUCCESS, status); // fail } } } @@ -2071,7 +2118,7 @@ TEST_P(blas1_gtest, asum_batched_double_complex) } else { - EXPECT_EQ(HIPBLAS_STATUS_NOT_SUPPORTED, status); // for cuda + EXPECT_EQ(HIPBLAS_STATUS_SUCCESS, status); // fail } } } @@ -2098,7 +2145,7 @@ TEST_P(blas1_gtest, asum_strided_batched_float) } else { - EXPECT_EQ(HIPBLAS_STATUS_NOT_SUPPORTED, status); // for cuda + EXPECT_EQ(HIPBLAS_STATUS_SUCCESS, status); // fail } } } @@ -2124,7 +2171,7 @@ TEST_P(blas1_gtest, asum_strided_batched_float_complex) } else { - EXPECT_EQ(HIPBLAS_STATUS_NOT_SUPPORTED, status); // for cuda + EXPECT_EQ(HIPBLAS_STATUS_SUCCESS, status); // fail } } } @@ -2150,11 +2197,13 @@ TEST_P(blas1_gtest, asum_strided_batched_double_complex) } else { - EXPECT_EQ(HIPBLAS_STATUS_NOT_SUPPORTED, status); // for cuda + EXPECT_EQ(HIPBLAS_STATUS_SUCCESS, status); // fail } } } +#endif + // amax TEST_P(blas1_gtest, amax_float) { @@ -2184,6 +2233,8 @@ TEST_P(blas1_gtest, amax_float_complex) EXPECT_EQ(HIPBLAS_STATUS_SUCCESS, status); } +#ifndef __HIP_PLATFORM_NVCC__ + // amax_batched TEST_P(blas1_gtest, amax_batched_float) { @@ -2198,7 +2249,7 @@ TEST_P(blas1_gtest, amax_batched_float) } else { - EXPECT_EQ(HIPBLAS_STATUS_NOT_SUPPORTED, status); // for cuda + EXPECT_EQ(HIPBLAS_STATUS_SUCCESS, status); // fail } } } @@ -2216,7 +2267,7 @@ TEST_P(blas1_gtest, amax_batched_float_complex) } else { - EXPECT_EQ(HIPBLAS_STATUS_NOT_SUPPORTED, status); // for cuda + EXPECT_EQ(HIPBLAS_STATUS_SUCCESS, status); // fail } } } @@ -2235,7 +2286,7 @@ TEST_P(blas1_gtest, amax_strided_batched_float) } else { - EXPECT_EQ(HIPBLAS_STATUS_NOT_SUPPORTED, status); // for cuda + EXPECT_EQ(HIPBLAS_STATUS_SUCCESS, status); // fail } } } @@ -2253,11 +2304,13 @@ TEST_P(blas1_gtest, amax_strided_batched_float_complex) } else { - EXPECT_EQ(HIPBLAS_STATUS_NOT_SUPPORTED, status); // for cuda + EXPECT_EQ(HIPBLAS_STATUS_SUCCESS, status); // fail } } } +#endif + // amin TEST_P(blas1_gtest, amin_float) { @@ -2277,6 +2330,8 @@ TEST_P(blas1_gtest, amin_float_complex) EXPECT_EQ(HIPBLAS_STATUS_SUCCESS, status); } +#ifndef __HIP_PLATFORM_NVCC__ + // amin_batched TEST_P(blas1_gtest, amin_batched_float) { @@ -2291,7 +2346,7 @@ TEST_P(blas1_gtest, amin_batched_float) } else { - EXPECT_EQ(HIPBLAS_STATUS_NOT_SUPPORTED, status); // for cuda + EXPECT_EQ(HIPBLAS_STATUS_SUCCESS, status); // fail } } } @@ -2309,7 +2364,7 @@ TEST_P(blas1_gtest, 
amin_batched_float_complex) } else { - EXPECT_EQ(HIPBLAS_STATUS_NOT_SUPPORTED, status); // for cuda + EXPECT_EQ(HIPBLAS_STATUS_SUCCESS, status); // fail } } } @@ -2328,7 +2383,7 @@ TEST_P(blas1_gtest, amin_strided_batched_float) } else { - EXPECT_EQ(HIPBLAS_STATUS_NOT_SUPPORTED, status); // for cuda + EXPECT_EQ(HIPBLAS_STATUS_SUCCESS, status); // fail } } } @@ -2346,11 +2401,13 @@ TEST_P(blas1_gtest, amin_strided_batched_float_complex) } else { - EXPECT_EQ(HIPBLAS_STATUS_NOT_SUPPORTED, status); // for cuda + EXPECT_EQ(HIPBLAS_STATUS_SUCCESS, status); // fail } } } +#endif + // Values is for a single item; ValuesIn is for an array // notice we are using vector of vector // so each elment in xxx_range is a avector, diff --git a/clients/gtest/dgmm_gtest.cpp b/clients/gtest/dgmm_gtest.cpp index a85b6f989..c823f633b 100644 --- a/clients/gtest/dgmm_gtest.cpp +++ b/clients/gtest/dgmm_gtest.cpp @@ -1,5 +1,5 @@ /* ************************************************************************ - * Copyright 2016-2020 Advanced Micro Devices, Inc. + * Copyright 2016-2021 Advanced Micro Devices, Inc. * * ************************************************************************ */ @@ -165,6 +165,8 @@ TEST_P(dgmm_gtest, dgmm_gtest_float_complex) } } +#ifndef __HIP_PLATFORM_NVCC__ + TEST_P(dgmm_gtest, dgmm_batched_gtest_float) { // GetParam return a tuple. Tee setup routine unpack the tuple @@ -186,7 +188,7 @@ TEST_P(dgmm_gtest, dgmm_batched_gtest_float) } else { - EXPECT_EQ(HIPBLAS_STATUS_NOT_SUPPORTED, status); // for cuda + EXPECT_EQ(HIPBLAS_STATUS_SUCCESS, status); // fail } } } @@ -212,7 +214,7 @@ TEST_P(dgmm_gtest, dgmm_batched_gtest_float_complex) } else { - EXPECT_EQ(HIPBLAS_STATUS_NOT_SUPPORTED, status); // for cuda + EXPECT_EQ(HIPBLAS_STATUS_SUCCESS, status); // fail } } } @@ -238,7 +240,7 @@ TEST_P(dgmm_gtest, dgmm_strided_batched_gtest_float) } else { - EXPECT_EQ(HIPBLAS_STATUS_NOT_SUPPORTED, status); // for cuda + EXPECT_EQ(HIPBLAS_STATUS_SUCCESS, status); // fail } } } @@ -264,11 +266,13 @@ TEST_P(dgmm_gtest, dgmm_strided_batched_gtest_float_complex) } else { - EXPECT_EQ(HIPBLAS_STATUS_NOT_SUPPORTED, status); // for cuda + EXPECT_EQ(HIPBLAS_STATUS_SUCCESS, status); // fail } } } +#endif + // notice we are using vector of vector // so each elment in xxx_range is a avector, // ValuesIn take each element (a vector) and combine them and feed them to test_p diff --git a/clients/gtest/dot_ex_gtest.cpp b/clients/gtest/dot_ex_gtest.cpp index 0fef7150a..4c9b1e0c4 100644 --- a/clients/gtest/dot_ex_gtest.cpp +++ b/clients/gtest/dot_ex_gtest.cpp @@ -1,5 +1,5 @@ /* ************************************************************************ - * dotright 2016 Advanced Micro Devices, Inc. + * Copyright 2016-2021 Advanced Micro Devices, Inc. 
* * ************************************************************************ */ @@ -68,9 +68,11 @@ const double stride_scale_range[] = {1.0, 2.5}; const int batch_count_range[] = {-1, 0, 1, 2, 10}; const vector> precisions{ - // Not supported in cuBLAS +// Not supported in cuBLAS +#ifndef __HIP_PLATFORM_NVCC__ {HIPBLAS_R_16B, HIPBLAS_R_16B, HIPBLAS_R_16B, HIPBLAS_R_32F}, {HIPBLAS_R_16F, HIPBLAS_R_16F, HIPBLAS_R_16F, HIPBLAS_R_16F}, +#endif // Supported in both rocBLAS and cuBLAS {HIPBLAS_R_16F, HIPBLAS_R_16F, HIPBLAS_R_16F, HIPBLAS_R_32F}, @@ -135,11 +137,13 @@ TEST_P(dot_ex_gtest, dot_ex) } else { - EXPECT_EQ(HIPBLAS_STATUS_NOT_SUPPORTED, status); // for cuda + EXPECT_EQ(HIPBLAS_STATUS_SUCCESS, status); // fail } } } +#ifndef __HIP_PLATFORM_NVCC__ + TEST_P(dot_ex_gtest, dot_batched_ex) { // GetParam return a tuple. Tee setup routine unpack the tuple @@ -157,7 +161,7 @@ TEST_P(dot_ex_gtest, dot_batched_ex) } else { - EXPECT_EQ(HIPBLAS_STATUS_NOT_SUPPORTED, status); // for cuda + EXPECT_EQ(HIPBLAS_STATUS_SUCCESS, status); // fail } } } @@ -179,7 +183,7 @@ TEST_P(dot_ex_gtest, dot_strided_batched_ex) } else { - EXPECT_EQ(HIPBLAS_STATUS_NOT_SUPPORTED, status); // for cuda + EXPECT_EQ(HIPBLAS_STATUS_SUCCESS, status); // fail } } } @@ -202,7 +206,7 @@ TEST_P(dot_ex_gtest, dotc_ex) } else { - EXPECT_EQ(HIPBLAS_STATUS_NOT_SUPPORTED, status); // for cuda + EXPECT_EQ(HIPBLAS_STATUS_SUCCESS, status); // fail } } } @@ -224,7 +228,7 @@ TEST_P(dot_ex_gtest, dotc_batched_ex) } else { - EXPECT_EQ(HIPBLAS_STATUS_NOT_SUPPORTED, status); // for cuda + EXPECT_EQ(HIPBLAS_STATUS_SUCCESS, status); // fail } } } @@ -246,11 +250,13 @@ TEST_P(dot_ex_gtest, dotc_strided_batched_ex) } else { - EXPECT_EQ(HIPBLAS_STATUS_NOT_SUPPORTED, status); // for cuda + EXPECT_EQ(HIPBLAS_STATUS_SUCCESS, status); // fail } } } +#endif + // Values is for a single item; ValuesIn is for an array // notice we are using vector of vector // so each elment in xxx_range is a avector, diff --git a/clients/gtest/gbmv_batched_gtest.cpp b/clients/gtest/gbmv_batched_gtest.cpp index 37c5eeb4b..85268c3cc 100644 --- a/clients/gtest/gbmv_batched_gtest.cpp +++ b/clients/gtest/gbmv_batched_gtest.cpp @@ -144,6 +144,8 @@ class gbmv_gtest_batched : public ::TestWithParam virtual void TearDown() {} }; +#ifndef __HIP_PLATFORM_NVCC__ + TEST_P(gbmv_gtest_batched, gbmv_gtest_float) { Arguments arg = setup_gbmv_arguments(GetParam()); @@ -160,7 +162,7 @@ TEST_P(gbmv_gtest_batched, gbmv_gtest_float) } else { - EXPECT_EQ(HIPBLAS_STATUS_NOT_SUPPORTED, status); // for cuda + EXPECT_EQ(HIPBLAS_STATUS_SUCCESS, status); // fail } } } @@ -181,11 +183,13 @@ TEST_P(gbmv_gtest_batched, gbmv_gtest_float_complex) } else { - EXPECT_EQ(HIPBLAS_STATUS_NOT_SUPPORTED, status); // for cuda + EXPECT_EQ(HIPBLAS_STATUS_SUCCESS, status); // fail } } } +#endif + // notice we are using vector of vector // so each elment in xxx_range is a avector, // ValuesIn take each element (a vector) and combine them and feed them to test_p diff --git a/clients/gtest/gbmv_strided_batched_gtest.cpp b/clients/gtest/gbmv_strided_batched_gtest.cpp index b0b8fd447..4ff157a08 100644 --- a/clients/gtest/gbmv_strided_batched_gtest.cpp +++ b/clients/gtest/gbmv_strided_batched_gtest.cpp @@ -155,6 +155,8 @@ class gbmv_gtest_strided_batched : public ::TestWithParam virtual void TearDown() {} }; +#ifndef __HIP_PLATFORM_NVCC__ + TEST_P(gbmv_gtest_strided_batched, gbmv_gtest_float) { Arguments arg = setup_gbmv_arguments(GetParam()); @@ -172,8 +174,7 @@ TEST_P(gbmv_gtest_strided_batched, gbmv_gtest_float) } else { 
- // for cuda - EXPECT_EQ(HIPBLAS_STATUS_NOT_SUPPORTED, status); + EXPECT_EQ(HIPBLAS_STATUS_SUCCESS, status); // fail } } } @@ -194,12 +195,13 @@ TEST_P(gbmv_gtest_strided_batched, gbmv_gtest_float_complex) } else { - // for cuda - EXPECT_EQ(HIPBLAS_STATUS_NOT_SUPPORTED, status); + EXPECT_EQ(HIPBLAS_STATUS_SUCCESS, status); // fail } } } +#endif + // notice we are using vector of vector // so each elment in xxx_range is a avector, // ValuesIn take each element (a vector) and combine them and feed them to test_p diff --git a/clients/gtest/gemv_batched_gtest.cpp b/clients/gtest/gemv_batched_gtest.cpp index debf6bc15..fd7eb5a2a 100644 --- a/clients/gtest/gemv_batched_gtest.cpp +++ b/clients/gtest/gemv_batched_gtest.cpp @@ -142,6 +142,8 @@ class gemv_gtest_batched : public ::TestWithParam virtual void TearDown() {} }; +#ifndef __HIP_PLATFORM_NVCC__ + TEST_P(gemv_gtest_batched, gemv_gtest_float) { Arguments arg = setup_gemv_arguments(GetParam()); @@ -205,12 +207,13 @@ TEST_P(gemv_gtest_batched, gemv_gtest_float_complex) } else { - // for cuda - EXPECT_EQ(HIPBLAS_STATUS_NOT_SUPPORTED, status); + EXPECT_EQ(HIPBLAS_STATUS_SUCCESS, status); } } } +#endif + // notice we are using vector of vector // so each elment in xxx_range is a avector, // ValuesIn take each element (a vector) and combine them and feed them to test_p diff --git a/clients/gtest/gemv_strided_batched_gtest.cpp b/clients/gtest/gemv_strided_batched_gtest.cpp index b0b5d26d9..e309753e5 100644 --- a/clients/gtest/gemv_strided_batched_gtest.cpp +++ b/clients/gtest/gemv_strided_batched_gtest.cpp @@ -153,6 +153,8 @@ class gemv_gtest_strided_batched : public ::TestWithParam virtual void TearDown() {} }; +#ifndef __HIP_PLATFORM_NVCC__ + TEST_P(gemv_gtest_strided_batched, gemv_gtest_float) { Arguments arg = setup_gemv_arguments(GetParam()); @@ -185,8 +187,7 @@ TEST_P(gemv_gtest_strided_batched, gemv_gtest_float) } else { - // for cuda - EXPECT_EQ(HIPBLAS_STATUS_NOT_SUPPORTED, status); + EXPECT_EQ(HIPBLAS_STATUS_SUCCESS, status); } } } @@ -224,6 +225,8 @@ TEST_P(gemv_gtest_strided_batched, gemv_gtest_float_complex) } } +#endif + // notice we are using vector of vector // so each elment in xxx_range is a avector, // ValuesIn take each element (a vector) and combine them and feed them to test_p diff --git a/clients/gtest/geqrf_gtest.cpp b/clients/gtest/geqrf_gtest.cpp index 4e2f1f386..6cabc63eb 100644 --- a/clients/gtest/geqrf_gtest.cpp +++ b/clients/gtest/geqrf_gtest.cpp @@ -1,5 +1,5 @@ /* ************************************************************************ - * Copyright 2016-2020 Advanced Micro Devices, Inc. + * Copyright 2016-2021 Advanced Micro Devices, Inc. * * ************************************************************************ */ @@ -58,6 +58,8 @@ class geqrf_gtest : public ::TestWithParam virtual void TearDown() {} }; +#ifndef __HIP_PLATFORM_NVCC__ + TEST_P(geqrf_gtest, geqrf_gtest_float) { // GetParam returns a tuple. 
The setup routine unpacks the tuple @@ -75,7 +77,7 @@ TEST_P(geqrf_gtest, geqrf_gtest_float) } else { - EXPECT_EQ(HIPBLAS_STATUS_NOT_SUPPORTED, status); // for cuda + EXPECT_EQ(HIPBLAS_STATUS_SUCCESS, status); // fail } } } @@ -97,7 +99,7 @@ TEST_P(geqrf_gtest, geqrf_gtest_double) } else { - EXPECT_EQ(HIPBLAS_STATUS_NOT_SUPPORTED, status); // for cuda + EXPECT_EQ(HIPBLAS_STATUS_SUCCESS, status); // fail } } } @@ -119,7 +121,7 @@ TEST_P(geqrf_gtest, geqrf_gtest_float_complex) } else { - EXPECT_EQ(HIPBLAS_STATUS_NOT_SUPPORTED, status); // for cuda + EXPECT_EQ(HIPBLAS_STATUS_SUCCESS, status); // fail } } } @@ -141,11 +143,13 @@ TEST_P(geqrf_gtest, geqrf_gtest_double_complex) } else { - EXPECT_EQ(HIPBLAS_STATUS_NOT_SUPPORTED, status); // for cuda + EXPECT_EQ(HIPBLAS_STATUS_SUCCESS, status); // fail } } } +#endif + // notice we are using vector of vector // so each elment in xxx_range is a vector, // ValuesIn takes each element (a vector), combines them, and feeds them to test_p diff --git a/clients/gtest/geqrf_strided_batched_gtest.cpp b/clients/gtest/geqrf_strided_batched_gtest.cpp index 2909fa57a..4b3cf6c06 100644 --- a/clients/gtest/geqrf_strided_batched_gtest.cpp +++ b/clients/gtest/geqrf_strided_batched_gtest.cpp @@ -1,5 +1,5 @@ /* ************************************************************************ - * Copyright 2016-2020 Advanced Micro Devices, Inc. + * Copyright 2016-2021 Advanced Micro Devices, Inc. * * ************************************************************************ */ @@ -58,6 +58,8 @@ class geqrf_strided_batched_gtest : public ::TestWithParam virtual void TearDown() {} }; +#ifndef __HIP_PLATFORM_NVCC__ + TEST_P(getrf_gtest, getrf_gtest_float) { // GetParam returns a tuple. The setup routine unpacks the tuple @@ -78,7 +80,7 @@ TEST_P(getrf_gtest, getrf_gtest_float) } else { - EXPECT_EQ(HIPBLAS_STATUS_NOT_SUPPORTED, status); // for cuda + EXPECT_EQ(HIPBLAS_STATUS_SUCCESS, status); // fail } } } @@ -100,7 +102,7 @@ TEST_P(getrf_gtest, getrf_gtest_double) } else { - EXPECT_EQ(HIPBLAS_STATUS_NOT_SUPPORTED, status); // for cuda + EXPECT_EQ(HIPBLAS_STATUS_SUCCESS, status); // fail } } } @@ -122,7 +124,7 @@ TEST_P(getrf_gtest, getrf_gtest_float_complex) } else { - EXPECT_EQ(HIPBLAS_STATUS_NOT_SUPPORTED, status); // for cuda + EXPECT_EQ(HIPBLAS_STATUS_SUCCESS, status); // fail } } } @@ -144,11 +146,13 @@ TEST_P(getrf_gtest, getrf_gtest_double_complex) } else { - EXPECT_EQ(HIPBLAS_STATUS_NOT_SUPPORTED, status); // for cuda + EXPECT_EQ(HIPBLAS_STATUS_SUCCESS, status); // fail } } } +#endif + // notice we are using vector of vector // so each elment in xxx_range is a vector, // ValuesIn takes each element (a vector), combines them, and feeds them to test_p diff --git a/clients/gtest/getrf_strided_batched_gtest.cpp b/clients/gtest/getrf_strided_batched_gtest.cpp index 4fa71c8ac..013e8b09b 100644 --- a/clients/gtest/getrf_strided_batched_gtest.cpp +++ b/clients/gtest/getrf_strided_batched_gtest.cpp @@ -1,5 +1,5 @@ /* ************************************************************************ - * Copyright 2016-2020 Advanced Micro Devices, Inc. + * Copyright 2016-2021 Advanced Micro Devices, Inc. * * ************************************************************************ */ @@ -61,6 +61,8 @@ class getrf_strided_batched_gtest : public ::TestWithParam virtual void TearDown() {} }; +#ifndef __HIP_PLATFORM_NVCC__ + TEST_P(getrs_gtest, getrs_gtest_float) { // GetParam returns a tuple. 
The setup routine unpacks the tuple @@ -74,7 +76,7 @@ TEST_P(getrs_gtest, getrs_gtest_float) } else { - EXPECT_EQ(HIPBLAS_STATUS_NOT_SUPPORTED, status); // for cuda + EXPECT_EQ(HIPBLAS_STATUS_SUCCESS, status); // fail } } } @@ -96,7 +98,7 @@ TEST_P(getrs_gtest, getrs_gtest_double) } else { - EXPECT_EQ(HIPBLAS_STATUS_NOT_SUPPORTED, status); // for cuda + EXPECT_EQ(HIPBLAS_STATUS_SUCCESS, status); // fail } } } @@ -118,7 +120,7 @@ TEST_P(getrs_gtest, getrs_gtest_float_complex) } else { - EXPECT_EQ(HIPBLAS_STATUS_NOT_SUPPORTED, status); // for cuda + EXPECT_EQ(HIPBLAS_STATUS_SUCCESS, status); // fail } } } @@ -140,11 +142,13 @@ TEST_P(getrs_gtest, getrs_gtest_double_complex) } else { - EXPECT_EQ(HIPBLAS_STATUS_NOT_SUPPORTED, status); // for cuda + EXPECT_EQ(HIPBLAS_STATUS_SUCCESS, status); // fail } } } +#endif + // notice we are using vector of vector // so each elment in xxx_range is a vector, // ValuesIn takes each element (a vector), combines them, and feeds them to test_p diff --git a/clients/gtest/getrs_strided_batched_gtest.cpp b/clients/gtest/getrs_strided_batched_gtest.cpp index aa972988c..bbf43462c 100644 --- a/clients/gtest/getrs_strided_batched_gtest.cpp +++ b/clients/gtest/getrs_strided_batched_gtest.cpp @@ -1,5 +1,5 @@ /* ************************************************************************ - * Copyright 2016-2020 Advanced Micro Devices, Inc. + * Copyright 2016-2021 Advanced Micro Devices, Inc. * * ************************************************************************ */ @@ -57,6 +57,8 @@ class getrs_strided_batched_gtest : public ::TestWithParam virtual void TearDown() {} }; +#ifndef __HIP_PLATFORM_NVCC__ + TEST_P(hemv_gtest_batched, hemv_gtest_float_complex) { Arguments arg = setup_hemv_arguments(GetParam()); @@ -157,11 +159,13 @@ TEST_P(hemv_gtest_batched, hemv_gtest_float_complex) } else { - EXPECT_EQ(HIPBLAS_STATUS_NOT_SUPPORTED, status); // for cuda + EXPECT_EQ(HIPBLAS_STATUS_SUCCESS, status); } } } +#endif + // notice we are using vector of vector // so each elment in xxx_range is a avector, // ValuesIn take each element (a vector) and combine them and feed them to test_p diff --git a/clients/gtest/hemv_strided_batched_gtest.cpp b/clients/gtest/hemv_strided_batched_gtest.cpp index 048431873..3253f51b3 100644 --- a/clients/gtest/hemv_strided_batched_gtest.cpp +++ b/clients/gtest/hemv_strided_batched_gtest.cpp @@ -152,6 +152,8 @@ class hemv_gtest_strided_batched : public ::TestWithParam virtual void TearDown() {} }; +#ifndef __HIP_PLATFORM_NVCC__ + TEST_P(hemv_gtest_strided_batched, hemv_gtest_float_complex) { Arguments arg = setup_hemv_arguments(GetParam()); @@ -168,12 +170,13 @@ TEST_P(hemv_gtest_strided_batched, hemv_gtest_float_complex) } else { - // for cuda - EXPECT_EQ(HIPBLAS_STATUS_NOT_SUPPORTED, status); + EXPECT_EQ(HIPBLAS_STATUS_SUCCESS, status); // fail } } } +#endif + // notice we are using vector of vector // so each elment in xxx_range is a avector, // ValuesIn take each element (a vector) and combine them and feed them to test_p diff --git a/clients/gtest/her2_gtest.cpp b/clients/gtest/her2_gtest.cpp index 000f13e73..02042674f 100644 --- a/clients/gtest/her2_gtest.cpp +++ b/clients/gtest/her2_gtest.cpp @@ -1,5 +1,5 @@ /* ************************************************************************ - * Copyright 2016-2020 Advanced Micro Devices, Inc. + * Copyright 2016-2021 Advanced Micro Devices, Inc. 
* * ************************************************************************ */ @@ -174,6 +174,8 @@ TEST_P(blas2_her2_gtest, her2_gtest_double) } } +#ifndef __HIP_PLATFORM_NVCC__ + // her2_batched TEST_P(blas2_her2_gtest, her2_batched_gtest_float) { @@ -195,7 +197,7 @@ TEST_P(blas2_her2_gtest, her2_batched_gtest_float) } else { - EXPECT_EQ(HIPBLAS_STATUS_NOT_SUPPORTED, status); // for cuda + EXPECT_EQ(HIPBLAS_STATUS_SUCCESS, status); // fail } } } @@ -220,7 +222,7 @@ TEST_P(blas2_her2_gtest, her2_batched_gtest_double) } else { - EXPECT_EQ(HIPBLAS_STATUS_NOT_SUPPORTED, status); // for cuda + EXPECT_EQ(HIPBLAS_STATUS_SUCCESS, status); // fail } } } @@ -246,7 +248,7 @@ TEST_P(blas2_her2_gtest, her2_strided_batched_gtest_float) } else { - EXPECT_EQ(HIPBLAS_STATUS_NOT_SUPPORTED, status); // for cuda + EXPECT_EQ(HIPBLAS_STATUS_SUCCESS, status); // fail } } } @@ -271,11 +273,13 @@ TEST_P(blas2_her2_gtest, her2_strided_batched_gtest_double) } else { - EXPECT_EQ(HIPBLAS_STATUS_NOT_SUPPORTED, status); // for cuda + EXPECT_EQ(HIPBLAS_STATUS_SUCCESS, status); // fail } } } +#endif + // notice we are using vector of vector // so each elment in xxx_range is a avector, // ValuesIn take each element (a vector) and combine them and feed them to test_p diff --git a/clients/gtest/her2k_gtest.cpp b/clients/gtest/her2k_gtest.cpp index 5bfb23f1e..261207e9d 100644 --- a/clients/gtest/her2k_gtest.cpp +++ b/clients/gtest/her2k_gtest.cpp @@ -1,5 +1,5 @@ /* ************************************************************************ - * Copyright 2016-2020 Advanced Micro Devices, Inc. + * Copyright 2016-2021 Advanced Micro Devices, Inc. * * ************************************************************************ */ @@ -183,6 +183,8 @@ TEST_P(blas2_her2k_gtest, her2k_gtest_double) } } +#ifndef __HIP_PLATFORM_NVCC__ + // her2k_batched TEST_P(blas2_her2k_gtest, her2k_batched_gtest_float) { @@ -207,7 +209,7 @@ TEST_P(blas2_her2k_gtest, her2k_batched_gtest_float) } else { - EXPECT_EQ(HIPBLAS_STATUS_NOT_SUPPORTED, status); // for cuda + EXPECT_EQ(HIPBLAS_STATUS_SUCCESS, status); // fail } } } @@ -235,7 +237,7 @@ TEST_P(blas2_her2k_gtest, her2k_batched_gtest_double) } else { - EXPECT_EQ(HIPBLAS_STATUS_NOT_SUPPORTED, status); // for cuda + EXPECT_EQ(HIPBLAS_STATUS_SUCCESS, status); // fail } } } @@ -264,7 +266,7 @@ TEST_P(blas2_her2k_gtest, her2k_strided_batched_gtest_float) } else { - EXPECT_EQ(HIPBLAS_STATUS_NOT_SUPPORTED, status); // for cuda + EXPECT_EQ(HIPBLAS_STATUS_SUCCESS, status); // fail } } } @@ -292,11 +294,13 @@ TEST_P(blas2_her2k_gtest, her2k_strided_batched_gtest_double) } else { - EXPECT_EQ(HIPBLAS_STATUS_NOT_SUPPORTED, status); // for cuda + EXPECT_EQ(HIPBLAS_STATUS_SUCCESS, status); // fail } } } +#endif + // notice we are using vector of vector // so each elment in xxx_range is a avector, // ValuesIn take each element (a vector) and combine them and feed them to test_p diff --git a/clients/gtest/her_gtest.cpp b/clients/gtest/her_gtest.cpp index ca4d56f60..a604d3ee8 100644 --- a/clients/gtest/her_gtest.cpp +++ b/clients/gtest/her_gtest.cpp @@ -172,6 +172,8 @@ TEST_P(blas2_her_gtest, her_gtest_double) } } +#ifndef __HIP_PLATFORM_NVCC__ + // her_batched TEST_P(blas2_her_gtest, her_batched_gtest_float) { @@ -193,7 +195,7 @@ TEST_P(blas2_her_gtest, her_batched_gtest_float) } else { - EXPECT_EQ(HIPBLAS_STATUS_NOT_SUPPORTED, status); // for cuda + EXPECT_EQ(HIPBLAS_STATUS_SUCCESS, status); // fail } } } @@ -218,7 +220,7 @@ TEST_P(blas2_her_gtest, her_batched_gtest_double) } else { - 
EXPECT_EQ(HIPBLAS_STATUS_NOT_SUPPORTED, status); // for cuda + EXPECT_EQ(HIPBLAS_STATUS_SUCCESS, status); // fail } } } @@ -244,7 +246,7 @@ TEST_P(blas2_her_gtest, her_strided_batched_gtest_float) } else { - EXPECT_EQ(HIPBLAS_STATUS_NOT_SUPPORTED, status); // for cuda + EXPECT_EQ(HIPBLAS_STATUS_SUCCESS, status); // fail } } } @@ -269,11 +271,13 @@ TEST_P(blas2_her_gtest, her_strided_batched_gtest_double) } else { - EXPECT_EQ(HIPBLAS_STATUS_NOT_SUPPORTED, status); // for cuda + EXPECT_EQ(HIPBLAS_STATUS_SUCCESS, status); // fail } } } +#endif + // notice we are using vector of vector // so each elment in xxx_range is a avector, // ValuesIn take each element (a vector) and combine them and feed them to test_p diff --git a/clients/gtest/herk_gtest.cpp b/clients/gtest/herk_gtest.cpp index d12f81a17..395b084db 100644 --- a/clients/gtest/herk_gtest.cpp +++ b/clients/gtest/herk_gtest.cpp @@ -1,5 +1,5 @@ /* ************************************************************************ - * Copyright 2016-2020 Advanced Micro Devices, Inc. + * Copyright 2016-2021 Advanced Micro Devices, Inc. * * ************************************************************************ */ @@ -179,6 +179,8 @@ TEST_P(blas2_herk_gtest, herk_gtest_double) } } +#ifndef __HIP_PLATFORM_NVCC__ + // herk_batched TEST_P(blas2_herk_gtest, herk_batched_gtest_float) { @@ -202,7 +204,7 @@ TEST_P(blas2_herk_gtest, herk_batched_gtest_float) } else { - EXPECT_EQ(HIPBLAS_STATUS_NOT_SUPPORTED, status); // for cuda + EXPECT_EQ(HIPBLAS_STATUS_SUCCESS, status); // fail } } } @@ -229,7 +231,7 @@ TEST_P(blas2_herk_gtest, herk_batched_gtest_double) } else { - EXPECT_EQ(HIPBLAS_STATUS_NOT_SUPPORTED, status); // for cuda + EXPECT_EQ(HIPBLAS_STATUS_SUCCESS, status); // fail } } } @@ -257,7 +259,7 @@ TEST_P(blas2_herk_gtest, herk_strided_batched_gtest_float) } else { - EXPECT_EQ(HIPBLAS_STATUS_NOT_SUPPORTED, status); // for cuda + EXPECT_EQ(HIPBLAS_STATUS_SUCCESS, status); // fail } } } @@ -284,11 +286,13 @@ TEST_P(blas2_herk_gtest, herk_strided_batched_gtest_double) } else { - EXPECT_EQ(HIPBLAS_STATUS_NOT_SUPPORTED, status); // for cuda + EXPECT_EQ(HIPBLAS_STATUS_SUCCESS, status); // fail } } } +#endif + // notice we are using vector of vector // so each elment in xxx_range is a avector, // ValuesIn take each element (a vector) and combine them and feed them to test_p diff --git a/clients/gtest/herkx_gtest.cpp b/clients/gtest/herkx_gtest.cpp index c4cc083fe..b555e7897 100644 --- a/clients/gtest/herkx_gtest.cpp +++ b/clients/gtest/herkx_gtest.cpp @@ -1,5 +1,5 @@ /* ************************************************************************ - * Copyright 2016-2020 Advanced Micro Devices, Inc. + * Copyright 2016-2021 Advanced Micro Devices, Inc. 
* * ************************************************************************ */ @@ -185,6 +185,8 @@ TEST_P(blas2_herkx_gtest, herkx_gtest_double) // } } +#ifndef __HIP_PLATFORM_NVCC__ + // herkx_batched TEST_P(blas2_herkx_gtest, herkx_batched_gtest_float) { @@ -209,7 +211,7 @@ TEST_P(blas2_herkx_gtest, herkx_batched_gtest_float) // } // else // { - // EXPECT_EQ(HIPBLAS_STATUS_NOT_SUPPORTED, status); // for cuda + // EXPECT_EQ(HIPBLAS_STATUS_SUCCESS, status); // fail // } // } } @@ -237,7 +239,7 @@ TEST_P(blas2_herkx_gtest, herkx_batched_gtest_double) // } // else // { - // EXPECT_EQ(HIPBLAS_STATUS_NOT_SUPPORTED, status); // for cuda + // EXPECT_EQ(HIPBLAS_STATUS_SUCCESS, status); // fail // } // } } @@ -266,7 +268,7 @@ TEST_P(blas2_herkx_gtest, herkx_strided_batched_gtest_float) // } // else // { - // EXPECT_EQ(HIPBLAS_STATUS_NOT_SUPPORTED, status); // for cuda + // EXPECT_EQ(HIPBLAS_STATUS_SUCCESS, status); // fail // } // } } @@ -294,11 +296,13 @@ TEST_P(blas2_herkx_gtest, herkx_strided_batched_gtest_double) // } // else // { - // EXPECT_EQ(HIPBLAS_STATUS_NOT_SUPPORTED, status); // for cuda + // EXPECT_EQ(HIPBLAS_STATUS_SUCCESS, status); // fail // } // } } +#endif + // notice we are using vector of vector // so each elment in xxx_range is a avector, // ValuesIn take each element (a vector) and combine them and feed them to test_p diff --git a/clients/gtest/hpmv_gtest.cpp b/clients/gtest/hpmv_gtest.cpp index 38b617c78..ab4dc9fcd 100644 --- a/clients/gtest/hpmv_gtest.cpp +++ b/clients/gtest/hpmv_gtest.cpp @@ -1,5 +1,5 @@ /* ************************************************************************ - * Copyright 2016-2020 Advanced Micro Devices, Inc. + * Copyright 2016-2021 Advanced Micro Devices, Inc. * * ************************************************************************ */ @@ -165,6 +165,8 @@ TEST_P(hpmv_gtest, hpmv_gtest_float_complex) } } +#ifndef __HIP_PLATFORM_NVCC__ + TEST_P(hpmv_gtest, hpmv_batched_gtest_float_complex) { // GetParam return a tuple. Tee setup routine unpack the tuple @@ -185,7 +187,7 @@ TEST_P(hpmv_gtest, hpmv_batched_gtest_float_complex) } else { - EXPECT_EQ(HIPBLAS_STATUS_NOT_SUPPORTED, status); // for cuda + EXPECT_EQ(HIPBLAS_STATUS_SUCCESS, status); // fail } } } @@ -210,11 +212,13 @@ TEST_P(hpmv_gtest, hpmv_strided_batched_gtest_float_complex) } else { - EXPECT_EQ(HIPBLAS_STATUS_NOT_SUPPORTED, status); // for cuda + EXPECT_EQ(HIPBLAS_STATUS_SUCCESS, status); // fail } } } +#endif + // notice we are using vector of vector // so each elment in xxx_range is a avector, // ValuesIn take each element (a vector) and combine them and feed them to test_p diff --git a/clients/gtest/hpr2_gtest.cpp b/clients/gtest/hpr2_gtest.cpp index fbf601461..1f85613bd 100644 --- a/clients/gtest/hpr2_gtest.cpp +++ b/clients/gtest/hpr2_gtest.cpp @@ -1,5 +1,5 @@ /* ************************************************************************ - * Copyright 2016-2020 Advanced Micro Devices, Inc. + * Copyright 2016-2021 Advanced Micro Devices, Inc. 
* * ************************************************************************ */ @@ -173,6 +173,8 @@ TEST_P(blas2_hpr2_gtest, hpr2_gtest_double) } } +#ifndef __HIP_PLATFORM_NVCC__ + // hpr2_batched TEST_P(blas2_hpr2_gtest, hpr2_batched_gtest_float) { @@ -194,7 +196,7 @@ TEST_P(blas2_hpr2_gtest, hpr2_batched_gtest_float) } else { - EXPECT_EQ(HIPBLAS_STATUS_NOT_SUPPORTED, status); // for cuda + EXPECT_EQ(HIPBLAS_STATUS_SUCCESS, status); // fail } } } @@ -219,7 +221,7 @@ TEST_P(blas2_hpr2_gtest, hpr2_batched_gtest_double) } else { - EXPECT_EQ(HIPBLAS_STATUS_NOT_SUPPORTED, status); // for cuda + EXPECT_EQ(HIPBLAS_STATUS_SUCCESS, status); // fail } } } @@ -245,7 +247,7 @@ TEST_P(blas2_hpr2_gtest, hpr2_strided_batched_gtest_float) } else { - EXPECT_EQ(HIPBLAS_STATUS_NOT_SUPPORTED, status); // for cuda + EXPECT_EQ(HIPBLAS_STATUS_SUCCESS, status); // fail } } } @@ -270,11 +272,13 @@ TEST_P(blas2_hpr2_gtest, hpr2_strided_batched_gtest_double) } else { - EXPECT_EQ(HIPBLAS_STATUS_NOT_SUPPORTED, status); // for cuda + EXPECT_EQ(HIPBLAS_STATUS_SUCCESS, status); // fail } } } +#endif + // notice we are using vector of vector // so each elment in xxx_range is a avector, // ValuesIn take each element (a vector) and combine them and feed them to test_p diff --git a/clients/gtest/hpr_gtest.cpp b/clients/gtest/hpr_gtest.cpp index b7ecb54da..701d96f9c 100644 --- a/clients/gtest/hpr_gtest.cpp +++ b/clients/gtest/hpr_gtest.cpp @@ -154,6 +154,8 @@ TEST_P(blas2_hpr_gtest, hpr_gtest_double) } } +#ifndef __HIP_PLATFORM_NVCC__ + // hpr_batched TEST_P(blas2_hpr_gtest, hpr_batched_gtest_float) { @@ -175,7 +177,7 @@ TEST_P(blas2_hpr_gtest, hpr_batched_gtest_float) } else { - EXPECT_EQ(HIPBLAS_STATUS_NOT_SUPPORTED, status); // for cuda + EXPECT_EQ(HIPBLAS_STATUS_SUCCESS, status); // fail } } } @@ -200,7 +202,7 @@ TEST_P(blas2_hpr_gtest, hpr_batched_gtest_double) } else { - EXPECT_EQ(HIPBLAS_STATUS_NOT_SUPPORTED, status); // for cuda + EXPECT_EQ(HIPBLAS_STATUS_SUCCESS, status); // fail } } } @@ -226,7 +228,7 @@ TEST_P(blas2_hpr_gtest, hpr_strided_batched_gtest_float) } else { - EXPECT_EQ(HIPBLAS_STATUS_NOT_SUPPORTED, status); // for cuda + EXPECT_EQ(HIPBLAS_STATUS_SUCCESS, status); // fail } } } @@ -251,11 +253,13 @@ TEST_P(blas2_hpr_gtest, hpr_strided_batched_gtest_double) } else { - EXPECT_EQ(HIPBLAS_STATUS_NOT_SUPPORTED, status); // for cuda + EXPECT_EQ(HIPBLAS_STATUS_SUCCESS, status); // fail } } } +#endif + // notice we are using vector of vector // so each elment in xxx_range is a avector, // ValuesIn take each element (a vector) and combine them and feed them to test_p diff --git a/clients/gtest/nrm2_ex_gtest.cpp b/clients/gtest/nrm2_ex_gtest.cpp index f8c132824..c7763840a 100644 --- a/clients/gtest/nrm2_ex_gtest.cpp +++ b/clients/gtest/nrm2_ex_gtest.cpp @@ -1,5 +1,5 @@ /* ************************************************************************ - * dotright 2016 Advanced Micro Devices, Inc. + * Copyright 2016-2021 Advanced Micro Devices, Inc. * * ************************************************************************ */ @@ -125,6 +125,8 @@ TEST_P(nrm2_ex_gtest, nrm2_ex) } } +#ifndef __HIP_PLATFORM_NVCC__ + TEST_P(nrm2_ex_gtest, nrm2_batched_ex) { // GetParam return a tuple. 
Tee setup routine unpack the tuple @@ -142,7 +144,7 @@ TEST_P(nrm2_ex_gtest, nrm2_batched_ex) } else { - EXPECT_EQ(HIPBLAS_STATUS_NOT_SUPPORTED, status); // for CUDA + EXPECT_EQ(HIPBLAS_STATUS_SUCCESS, status); // fail } } } @@ -164,11 +166,13 @@ TEST_P(nrm2_ex_gtest, nrm2_strided_batched_ex) } else { - EXPECT_EQ(HIPBLAS_STATUS_NOT_SUPPORTED, status); // for CUDA + EXPECT_EQ(HIPBLAS_STATUS_SUCCESS, status); // fail } } } +#endif + // Values is for a single item; ValuesIn is for an array // notice we are using vector of vector // so each elment in xxx_range is a avector, diff --git a/clients/gtest/rot_ex_gtest.cpp b/clients/gtest/rot_ex_gtest.cpp index 227da0be2..797800af8 100644 --- a/clients/gtest/rot_ex_gtest.cpp +++ b/clients/gtest/rot_ex_gtest.cpp @@ -1,5 +1,5 @@ /* ************************************************************************ - * dotright 2016 Advanced Micro Devices, Inc. + * Copyright 2016-2021 Advanced Micro Devices, Inc. * * ************************************************************************ */ @@ -131,6 +131,8 @@ TEST_P(rot_ex_gtest, rot_ex) } } +#ifndef __HIP_PLATFORM_NVCC__ + TEST_P(rot_ex_gtest, rot_batched_ex) { // GetParam return a tuple. Tee setup routine unpack the tuple @@ -148,7 +150,7 @@ TEST_P(rot_ex_gtest, rot_batched_ex) } else { - EXPECT_EQ(HIPBLAS_STATUS_NOT_SUPPORTED, status); // for CUDA + EXPECT_EQ(HIPBLAS_STATUS_SUCCESS, status); // fail } } } @@ -170,11 +172,13 @@ TEST_P(rot_ex_gtest, rot_strided_batched_ex) } else { - EXPECT_EQ(HIPBLAS_STATUS_NOT_SUPPORTED, status); // for CUDA + EXPECT_EQ(HIPBLAS_STATUS_SUCCESS, status); // fail } } } +#endif + // Values is for a single item; ValuesIn is for an array // notice we are using vector of vector // so each elment in xxx_range is a avector, diff --git a/clients/gtest/sbmv_gtest.cpp b/clients/gtest/sbmv_gtest.cpp index d2908c30e..8f4ec3b43 100644 --- a/clients/gtest/sbmv_gtest.cpp +++ b/clients/gtest/sbmv_gtest.cpp @@ -1,5 +1,5 @@ /* ************************************************************************ - * Copyright 2016-2020 Advanced Micro Devices, Inc. + * Copyright 2016-2021 Advanced Micro Devices, Inc. * * ************************************************************************ */ @@ -179,6 +179,8 @@ TEST_P(blas2_sbmv_gtest, sbmv_double) } } +#ifndef __HIP_PLATFORM_NVCC__ + TEST_P(blas2_sbmv_gtest, sbmv_batched_float) { Arguments arg = setup_sbmv_arguments(GetParam()); @@ -194,7 +196,7 @@ TEST_P(blas2_sbmv_gtest, sbmv_batched_float) } else { - EXPECT_EQ(HIPBLAS_STATUS_NOT_SUPPORTED, status); // for cuda + EXPECT_EQ(HIPBLAS_STATUS_SUCCESS, status); // fail } } } @@ -214,11 +216,13 @@ TEST_P(blas2_sbmv_gtest, sbmv_strided_batched_float) } else { - EXPECT_EQ(HIPBLAS_STATUS_NOT_SUPPORTED, status); // for cuda + EXPECT_EQ(HIPBLAS_STATUS_SUCCESS, status); // fail } } } +#endif + // notice we are using vector of vector // so each elment in xxx_range is a avector, // ValuesIn take each element (a vector) and combine them and feed them to test_p diff --git a/clients/gtest/scal_ex_gtest.cpp b/clients/gtest/scal_ex_gtest.cpp index 0593f2e25..a2fbd2529 100644 --- a/clients/gtest/scal_ex_gtest.cpp +++ b/clients/gtest/scal_ex_gtest.cpp @@ -1,5 +1,5 @@ /* ************************************************************************ - * dotright 2016 Advanced Micro Devices, Inc. + * Copyright 2016-2021 Advanced Micro Devices, Inc. 
* * ************************************************************************ */ @@ -64,10 +64,12 @@ const int batch_count_range[] = {-1, 0, 1, 2, 10}; // Supported rocBLAS configs const vector> precisions{// Not supported in cuBLAS +#ifndef __HIP_PLATFORM_NVCC__ {HIPBLAS_R_16F, HIPBLAS_R_16F, HIPBLAS_R_16F}, {HIPBLAS_R_16F, HIPBLAS_R_16F, HIPBLAS_R_32F}, {HIPBLAS_R_32F, HIPBLAS_C_32F, HIPBLAS_C_32F}, {HIPBLAS_R_64F, HIPBLAS_C_64F, HIPBLAS_C_64F}, +#endif // Supported in both rocBLAS and cuBLAS {HIPBLAS_R_32F, HIPBLAS_R_16F, HIPBLAS_R_32F}, @@ -141,6 +143,8 @@ TEST_P(scal_ex_gtest, scal_ex) } } +#ifndef __HIP_PLATFORM_NVCC__ + TEST_P(scal_ex_gtest, scal_batched_ex) { // GetParam return a tuple. Tee setup routine unpack the tuple @@ -158,7 +162,7 @@ TEST_P(scal_ex_gtest, scal_batched_ex) } else { - EXPECT_EQ(HIPBLAS_STATUS_NOT_SUPPORTED, status); // for CUDA + EXPECT_EQ(HIPBLAS_STATUS_SUCCESS, status); // fail } } } @@ -180,11 +184,13 @@ TEST_P(scal_ex_gtest, scal_strided_batched_ex) } else { - EXPECT_EQ(HIPBLAS_STATUS_NOT_SUPPORTED, status); // for CUDA + EXPECT_EQ(HIPBLAS_STATUS_SUCCESS, status); // fail } } } +#endif + // Values is for a single item; ValuesIn is for an array // notice we are using vector of vector // so each elment in xxx_range is a avector, diff --git a/clients/gtest/spmv_gtest.cpp b/clients/gtest/spmv_gtest.cpp index f58cf3ac6..9605e5a7c 100644 --- a/clients/gtest/spmv_gtest.cpp +++ b/clients/gtest/spmv_gtest.cpp @@ -1,5 +1,5 @@ /* ************************************************************************ - * Copyright 2016-2020 Advanced Micro Devices, Inc. + * Copyright 2016-2021 Advanced Micro Devices, Inc. * * ************************************************************************ */ @@ -177,6 +177,8 @@ TEST_P(blas2_spmv_gtest, spmv_double) } } +#ifndef __HIP_PLATFORM_NVCC__ + TEST_P(blas2_spmv_gtest, spmv_batched_float) { Arguments arg = setup_spmv_arguments(GetParam()); @@ -192,7 +194,7 @@ TEST_P(blas2_spmv_gtest, spmv_batched_float) } else { - EXPECT_EQ(HIPBLAS_STATUS_NOT_SUPPORTED, status); // for cuda + EXPECT_EQ(HIPBLAS_STATUS_SUCCESS, status); // fail } } } @@ -212,11 +214,13 @@ TEST_P(blas2_spmv_gtest, spmv_strided_batched_float) } else { - EXPECT_EQ(HIPBLAS_STATUS_NOT_SUPPORTED, status); // for cuda + EXPECT_EQ(HIPBLAS_STATUS_SUCCESS, status); // fail } } } +#endif + // notice we are using vector of vector // so each elment in xxx_range is a avector, // ValuesIn take each element (a vector) and combine them and feed them to test_p diff --git a/clients/gtest/spr2_gtest.cpp b/clients/gtest/spr2_gtest.cpp index b7ecb9e43..d464d9a02 100644 --- a/clients/gtest/spr2_gtest.cpp +++ b/clients/gtest/spr2_gtest.cpp @@ -1,5 +1,5 @@ /* ************************************************************************ - * Copyright 2016-2020 Advanced Micro Devices, Inc. + * Copyright 2016-2021 Advanced Micro Devices, Inc. 
* * ************************************************************************ */ @@ -140,6 +140,8 @@ TEST_P(blas2_spr2_gtest, spr2_gtest_float) } } +#ifndef __HIP_PLATFORM_NVCC__ + // spr2_batched TEST_P(blas2_spr2_gtest, spr2_batched_gtest_float) { @@ -161,7 +163,7 @@ TEST_P(blas2_spr2_gtest, spr2_batched_gtest_float) } else { - EXPECT_EQ(HIPBLAS_STATUS_NOT_SUPPORTED, status); // for cuda + EXPECT_EQ(HIPBLAS_STATUS_SUCCESS, status); // fail } } } @@ -187,11 +189,13 @@ TEST_P(blas2_spr2_gtest, spr2_strided_batched_gtest_float) } else { - EXPECT_EQ(HIPBLAS_STATUS_NOT_SUPPORTED, status); // for cuda + EXPECT_EQ(HIPBLAS_STATUS_SUCCESS, status); // fail } } } +#endif + // notice we are using vector of vector // so each elment in xxx_range is a avector, // ValuesIn take each element (a vector) and combine them and feed them to test_p diff --git a/clients/gtest/spr_gtest.cpp b/clients/gtest/spr_gtest.cpp index 6b1d75254..cb6b7a704 100644 --- a/clients/gtest/spr_gtest.cpp +++ b/clients/gtest/spr_gtest.cpp @@ -1,5 +1,5 @@ /* ************************************************************************ - * Copyright 2016-2020 Advanced Micro Devices, Inc. + * Copyright 2016-2021 Advanced Micro Devices, Inc. * * ************************************************************************ */ @@ -128,6 +128,8 @@ TEST_P(blas2_spr_gtest, spr_gtest_float) } } +#ifndef __HIP_PLATFORM_NVCC__ + TEST_P(blas2_spr_gtest, spr_gtest_float_complex) { // GetParam return a tuple. Tee setup routine unpack the tuple @@ -148,7 +150,7 @@ TEST_P(blas2_spr_gtest, spr_gtest_float_complex) } else { - EXPECT_EQ(HIPBLAS_STATUS_NOT_SUPPORTED, status); // for cuda + EXPECT_EQ(HIPBLAS_STATUS_SUCCESS, status); // fail } } } @@ -174,7 +176,7 @@ TEST_P(blas2_spr_gtest, spr_batched_gtest_float) } else { - EXPECT_EQ(HIPBLAS_STATUS_NOT_SUPPORTED, status); // for cuda + EXPECT_EQ(HIPBLAS_STATUS_SUCCESS, status); // fail } } } @@ -199,7 +201,7 @@ TEST_P(blas2_spr_gtest, spr_batched_gtest_float_complex) } else { - EXPECT_EQ(HIPBLAS_STATUS_NOT_SUPPORTED, status); // for cuda + EXPECT_EQ(HIPBLAS_STATUS_SUCCESS, status); // fail } } } @@ -225,7 +227,7 @@ TEST_P(blas2_spr_gtest, spr_strided_batched_gtest_float) } else { - EXPECT_EQ(HIPBLAS_STATUS_NOT_SUPPORTED, status); // for cuda + EXPECT_EQ(HIPBLAS_STATUS_SUCCESS, status); // fail } } } @@ -250,11 +252,13 @@ TEST_P(blas2_spr_gtest, spr_strided_batched_gtest_float_complex) } else { - EXPECT_EQ(HIPBLAS_STATUS_NOT_SUPPORTED, status); // for cuda + EXPECT_EQ(HIPBLAS_STATUS_SUCCESS, status); // fail } } } +#endif + // notice we are using vector of vector // so each elment in xxx_range is a avector, // ValuesIn take each element (a vector) and combine them and feed them to test_p diff --git a/clients/gtest/symm_gtest.cpp b/clients/gtest/symm_gtest.cpp index 0c2044ac9..2af0c9738 100644 --- a/clients/gtest/symm_gtest.cpp +++ b/clients/gtest/symm_gtest.cpp @@ -1,5 +1,5 @@ /* ************************************************************************ - * Copyright 2016-2020 Advanced Micro Devices, Inc. + * Copyright 2016-2021 Advanced Micro Devices, Inc. * * ************************************************************************ */ @@ -171,6 +171,8 @@ TEST_P(symm_gtest, symm_gtest_float) } } +#ifndef __HIP_PLATFORM_NVCC__ + TEST_P(symm_gtest, symm_gtest_double_complex) { // GetParam return a tuple. 
Tee setup routine unpack the tuple @@ -220,7 +222,7 @@ TEST_P(symm_gtest, symm_batched_gtest_float) } else { - EXPECT_EQ(HIPBLAS_STATUS_NOT_SUPPORTED, status); // for cuda + EXPECT_EQ(HIPBLAS_STATUS_SUCCESS, status); // fail } } } @@ -247,7 +249,7 @@ TEST_P(symm_gtest, symm_batched_gtest_double_complex) } else { - EXPECT_EQ(HIPBLAS_STATUS_NOT_SUPPORTED, status); // for cuda + EXPECT_EQ(HIPBLAS_STATUS_SUCCESS, status); // fail } } } @@ -274,7 +276,7 @@ TEST_P(symm_gtest, symm_strided_batched_gtest_float) } else { - EXPECT_EQ(HIPBLAS_STATUS_NOT_SUPPORTED, status); // for cuda + EXPECT_EQ(HIPBLAS_STATUS_SUCCESS, status); // fail } } } @@ -301,11 +303,13 @@ TEST_P(symm_gtest, symm_strided_batched_gtest_double_complex) } else { - EXPECT_EQ(HIPBLAS_STATUS_NOT_SUPPORTED, status); // for cuda + EXPECT_EQ(HIPBLAS_STATUS_SUCCESS, status); // fail } } } +#endif + // notice we are using vector of vector // so each elment in xxx_range is a avector, // ValuesIn take each element (a vector) and combine them and feed them to test_p diff --git a/clients/gtest/symv_gtest.cpp b/clients/gtest/symv_gtest.cpp index 0e26defc9..768b2d5bb 100644 --- a/clients/gtest/symv_gtest.cpp +++ b/clients/gtest/symv_gtest.cpp @@ -1,5 +1,5 @@ /* ************************************************************************ - * Copyright 2016-2020 Advanced Micro Devices, Inc. + * Copyright 2016-2021 Advanced Micro Devices, Inc. * * ************************************************************************ */ @@ -178,6 +178,8 @@ TEST_P(blas2_symv_gtest, symv_double) } } +#ifndef __HIP_PLATFORM_NVCC__ + TEST_P(blas2_symv_gtest, symv_batched_float) { Arguments arg = setup_symv_arguments(GetParam()); @@ -193,7 +195,7 @@ TEST_P(blas2_symv_gtest, symv_batched_float) } else { - EXPECT_EQ(HIPBLAS_STATUS_NOT_SUPPORTED, status); // for cuda + EXPECT_EQ(HIPBLAS_STATUS_SUCCESS, status); // fail } } } @@ -213,11 +215,13 @@ TEST_P(blas2_symv_gtest, symv_strided_batched_float) } else { - EXPECT_EQ(HIPBLAS_STATUS_NOT_SUPPORTED, status); // for cuda + EXPECT_EQ(HIPBLAS_STATUS_SUCCESS, status); // fail } } } +#endif + // notice we are using vector of vector // so each elment in xxx_range is a avector, // ValuesIn take each element (a vector) and combine them and feed them to test_p diff --git a/clients/gtest/syr2_gtest.cpp b/clients/gtest/syr2_gtest.cpp index efea70685..0f6f81a94 100644 --- a/clients/gtest/syr2_gtest.cpp +++ b/clients/gtest/syr2_gtest.cpp @@ -1,5 +1,5 @@ /* ************************************************************************ - * Copyright 2016-2020 Advanced Micro Devices, Inc. + * Copyright 2016-2021 Advanced Micro Devices, Inc. 
* * ************************************************************************ */ @@ -166,6 +166,8 @@ TEST_P(blas2_syr2_gtest, syr2_gtest_float_complex) } } +#ifndef __HIP_PLATFORM_NVCC__ + // syr2_batched TEST_P(blas2_syr2_gtest, syr2_batched_gtest_float) { @@ -187,7 +189,7 @@ TEST_P(blas2_syr2_gtest, syr2_batched_gtest_float) } else { - EXPECT_EQ(HIPBLAS_STATUS_NOT_SUPPORTED, status); // for cuda + EXPECT_EQ(HIPBLAS_STATUS_SUCCESS, status); // fail } } } @@ -212,7 +214,7 @@ TEST_P(blas2_syr2_gtest, syr2_batched_gtest_float_complex) } else { - EXPECT_EQ(HIPBLAS_STATUS_NOT_SUPPORTED, status); // for cuda + EXPECT_EQ(HIPBLAS_STATUS_SUCCESS, status); // fail } } } @@ -238,7 +240,7 @@ TEST_P(blas2_syr2_gtest, syr2_strided_batched_gtest_float) } else { - EXPECT_EQ(HIPBLAS_STATUS_NOT_SUPPORTED, status); // for cuda + EXPECT_EQ(HIPBLAS_STATUS_SUCCESS, status); // fail } } } @@ -263,11 +265,13 @@ TEST_P(blas2_syr2_gtest, syr2_strided_batched_gtest_float_complex) } else { - EXPECT_EQ(HIPBLAS_STATUS_NOT_SUPPORTED, status); // for cuda + EXPECT_EQ(HIPBLAS_STATUS_SUCCESS, status); // fail } } } +#endif + // notice we are using vector of vector // so each elment in xxx_range is a avector, // ValuesIn take each element (a vector) and combine them and feed them to test_p diff --git a/clients/gtest/syr2k_gtest.cpp b/clients/gtest/syr2k_gtest.cpp index 3da0f738a..e4ab16deb 100644 --- a/clients/gtest/syr2k_gtest.cpp +++ b/clients/gtest/syr2k_gtest.cpp @@ -1,5 +1,5 @@ /* ************************************************************************ - * Copyright 2016-2020 Advanced Micro Devices, Inc. + * Copyright 2016-2021 Advanced Micro Devices, Inc. * * ************************************************************************ */ @@ -183,6 +183,8 @@ TEST_P(blas2_syrkx_gtest, syrkx_gtest_double_complex) } } +#ifndef __HIP_PLATFORM_NVCC__ + // syrkx_batched TEST_P(blas2_syrkx_gtest, syrkx_batched_gtest_float) { @@ -207,7 +209,7 @@ TEST_P(blas2_syrkx_gtest, syrkx_batched_gtest_float) } else { - EXPECT_EQ(HIPBLAS_STATUS_NOT_SUPPORTED, status); // for cuda + EXPECT_EQ(HIPBLAS_STATUS_SUCCESS, status); // fail } } } @@ -235,7 +237,7 @@ TEST_P(blas2_syrkx_gtest, syrkx_batched_gtest_double_complex) } else { - EXPECT_EQ(HIPBLAS_STATUS_NOT_SUPPORTED, status); // for cuda + EXPECT_EQ(HIPBLAS_STATUS_SUCCESS, status); // fail } } } @@ -264,7 +266,7 @@ TEST_P(blas2_syrkx_gtest, syrkx_strided_batched_gtest_float) } else { - EXPECT_EQ(HIPBLAS_STATUS_NOT_SUPPORTED, status); // for cuda + EXPECT_EQ(HIPBLAS_STATUS_SUCCESS, status); // fail } } } @@ -292,11 +294,13 @@ TEST_P(blas2_syrkx_gtest, syrkx_strided_batched_gtest_double_complex) } else { - EXPECT_EQ(HIPBLAS_STATUS_NOT_SUPPORTED, status); // for cuda + EXPECT_EQ(HIPBLAS_STATUS_SUCCESS, status); // fail } } } +#endif + // notice we are using vector of vector // so each elment in xxx_range is a avector, // ValuesIn take each element (a vector) and combine them and feed them to test_p diff --git a/clients/gtest/syr_gtest.cpp b/clients/gtest/syr_gtest.cpp index 259c23b89..4370f925b 100644 --- a/clients/gtest/syr_gtest.cpp +++ b/clients/gtest/syr_gtest.cpp @@ -1,5 +1,5 @@ /* ************************************************************************ - * Copyright 2016-2020 Advanced Micro Devices, Inc. + * Copyright 2016-2021 Advanced Micro Devices, Inc. 
* * ************************************************************************ */ @@ -174,6 +174,8 @@ TEST_P(blas2_syr_gtest, syr_gtest_float_complex) } } +#ifndef __HIP_PLATFORM_NVCC__ + // syr_batched TEST_P(blas2_syr_gtest, syr_batched_gtest_float) { @@ -196,7 +198,7 @@ TEST_P(blas2_syr_gtest, syr_batched_gtest_float) } else { - EXPECT_EQ(HIPBLAS_STATUS_NOT_SUPPORTED, status); // for cuda + EXPECT_EQ(HIPBLAS_STATUS_SUCCESS, status); // fail } } } @@ -222,7 +224,7 @@ TEST_P(blas2_syr_gtest, syr_batched_gtest_float_complex) } else { - EXPECT_EQ(HIPBLAS_STATUS_NOT_SUPPORTED, status); // for cuda + EXPECT_EQ(HIPBLAS_STATUS_SUCCESS, status); // fail } } } @@ -249,7 +251,7 @@ TEST_P(blas2_syr_gtest, syr_strided_batched_gtest_float) } else { - EXPECT_EQ(HIPBLAS_STATUS_NOT_SUPPORTED, status); // for cuda + EXPECT_EQ(HIPBLAS_STATUS_SUCCESS, status); // fail } } } @@ -275,11 +277,13 @@ TEST_P(blas2_syr_gtest, syr_strided_batched_gtest_float_complex) } else { - EXPECT_EQ(HIPBLAS_STATUS_NOT_SUPPORTED, status); // for cuda + EXPECT_EQ(HIPBLAS_STATUS_SUCCESS, status); // fail } } } +#endif + // notice we are using vector of vector // so each elment in xxx_range is a avector, // ValuesIn take each element (a vector) and combine them and feed them to test_p diff --git a/clients/gtest/syrk_gtest.cpp b/clients/gtest/syrk_gtest.cpp index 34b1203b3..c4ed77280 100644 --- a/clients/gtest/syrk_gtest.cpp +++ b/clients/gtest/syrk_gtest.cpp @@ -1,5 +1,5 @@ /* ************************************************************************ - * Copyright 2016-2020 Advanced Micro Devices, Inc. + * Copyright 2016-2021 Advanced Micro Devices, Inc. * * ************************************************************************ */ @@ -181,6 +181,8 @@ TEST_P(blas2_syrk_gtest, syrk_gtest_double_complex) } } +#ifndef __HIP_PLATFORM_NVCC__ + // syrk_batched TEST_P(blas2_syrk_gtest, syrk_batched_gtest_float) { @@ -206,7 +208,7 @@ TEST_P(blas2_syrk_gtest, syrk_batched_gtest_float) } else { - EXPECT_EQ(HIPBLAS_STATUS_NOT_SUPPORTED, status); // for cuda + EXPECT_EQ(HIPBLAS_STATUS_SUCCESS, status); // fail } } } @@ -233,7 +235,7 @@ TEST_P(blas2_syrk_gtest, syrk_batched_gtest_double_complex) } else { - EXPECT_EQ(HIPBLAS_STATUS_NOT_SUPPORTED, status); // for cuda + EXPECT_EQ(HIPBLAS_STATUS_SUCCESS, status); // fail } } } @@ -263,7 +265,7 @@ TEST_P(blas2_syrk_gtest, syrk_strided_batched_gtest_float) } else { - EXPECT_EQ(HIPBLAS_STATUS_NOT_SUPPORTED, status); // for cuda + EXPECT_EQ(HIPBLAS_STATUS_SUCCESS, status); // fail } } } @@ -290,11 +292,13 @@ TEST_P(blas2_syrk_gtest, syrk_strided_batched_gtest_double_complex) } else { - EXPECT_EQ(HIPBLAS_STATUS_NOT_SUPPORTED, status); // for cuda + EXPECT_EQ(HIPBLAS_STATUS_SUCCESS, status); // fail } } } +#endif + // notice we are using vector of vector // so each elment in xxx_range is a avector, // ValuesIn take each element (a vector) and combine them and feed them to test_p diff --git a/clients/gtest/syrkx_gtest.cpp b/clients/gtest/syrkx_gtest.cpp index f171f94b0..0e57291e5 100644 --- a/clients/gtest/syrkx_gtest.cpp +++ b/clients/gtest/syrkx_gtest.cpp @@ -1,5 +1,5 @@ /* ************************************************************************ - * Copyright 2016-2020 Advanced Micro Devices, Inc. + * Copyright 2016-2021 Advanced Micro Devices, Inc. 
* * ************************************************************************ */ @@ -183,6 +183,8 @@ TEST_P(blas2_syr2k_gtest, syr2k_gtest_double_complex) } } +#ifndef __HIP_PLATFORM_NVCC__ + // syr2k_batched TEST_P(blas2_syr2k_gtest, syr2k_batched_gtest_float) { @@ -207,7 +209,7 @@ TEST_P(blas2_syr2k_gtest, syr2k_batched_gtest_float) } else { - EXPECT_EQ(HIPBLAS_STATUS_NOT_SUPPORTED, status); // for cuda + EXPECT_EQ(HIPBLAS_STATUS_SUCCESS, status); // fail } } } @@ -235,7 +237,7 @@ TEST_P(blas2_syr2k_gtest, syr2k_batched_gtest_double_complex) } else { - EXPECT_EQ(HIPBLAS_STATUS_NOT_SUPPORTED, status); // for cuda + EXPECT_EQ(HIPBLAS_STATUS_SUCCESS, status); // fail } } } @@ -264,7 +266,7 @@ TEST_P(blas2_syr2k_gtest, syr2k_strided_batched_gtest_float) } else { - EXPECT_EQ(HIPBLAS_STATUS_NOT_SUPPORTED, status); // for cuda + EXPECT_EQ(HIPBLAS_STATUS_SUCCESS, status); // fail } } } @@ -292,11 +294,13 @@ TEST_P(blas2_syr2k_gtest, syr2k_strided_batched_gtest_double_complex) } else { - EXPECT_EQ(HIPBLAS_STATUS_NOT_SUPPORTED, status); // for cuda + EXPECT_EQ(HIPBLAS_STATUS_SUCCESS, status); // fail } } } +#endif + // notice we are using vector of vector // so each elment in xxx_range is a avector, // ValuesIn take each element (a vector) and combine them and feed them to test_p diff --git a/clients/gtest/tbmv_gtest.cpp b/clients/gtest/tbmv_gtest.cpp index f1a68aa4f..d1e39d1cd 100644 --- a/clients/gtest/tbmv_gtest.cpp +++ b/clients/gtest/tbmv_gtest.cpp @@ -1,5 +1,5 @@ /* ************************************************************************ - * Copyright 2016-2020 Advanced Micro Devices, Inc. + * Copyright 2016-2021 Advanced Micro Devices, Inc. * * ************************************************************************ */ @@ -168,6 +168,8 @@ TEST_P(blas2_tbmv_gtest, tbmv_double) } } +#ifndef __HIP_PLATFORM_NVCC__ + TEST_P(blas2_tbmv_gtest, tbmv_batched_float) { Arguments arg = setup_tbmv_arguments(GetParam()); @@ -183,7 +185,7 @@ TEST_P(blas2_tbmv_gtest, tbmv_batched_float) } else { - EXPECT_EQ(HIPBLAS_STATUS_NOT_SUPPORTED, status); // for cuda + EXPECT_EQ(HIPBLAS_STATUS_SUCCESS, status); // fail } } } @@ -203,11 +205,13 @@ TEST_P(blas2_tbmv_gtest, tbmv_strided_batched_float) } else { - EXPECT_EQ(HIPBLAS_STATUS_NOT_SUPPORTED, status); // for cuda + EXPECT_EQ(HIPBLAS_STATUS_SUCCESS, status); // fail } } } +#endif + // notice we are using vector of vector // so each elment in xxx_range is a avector, // ValuesIn take each element (a vector) and combine them and feed them to test_p diff --git a/clients/gtest/tbsv_gtest.cpp b/clients/gtest/tbsv_gtest.cpp index 3b8969464..f9214fd6b 100644 --- a/clients/gtest/tbsv_gtest.cpp +++ b/clients/gtest/tbsv_gtest.cpp @@ -1,5 +1,5 @@ /* ************************************************************************ - * Copyright 2016-2020 Advanced Micro Devices, Inc. + * Copyright 2016-2021 Advanced Micro Devices, Inc. 
* * ************************************************************************ */ @@ -168,6 +168,8 @@ TEST_P(blas2_tbsv_gtest, tbsv_double) } } +#ifndef __HIP_PLATFORM_NVCC__ + TEST_P(blas2_tbsv_gtest, tbsv_batched_float) { Arguments arg = setup_tbsv_arguments(GetParam()); @@ -183,7 +185,7 @@ TEST_P(blas2_tbsv_gtest, tbsv_batched_float) } else { - EXPECT_EQ(HIPBLAS_STATUS_NOT_SUPPORTED, status); // for cuda + EXPECT_EQ(HIPBLAS_STATUS_SUCCESS, status); // fail } } } @@ -203,11 +205,13 @@ TEST_P(blas2_tbsv_gtest, tbsv_strided_batched_float) } else { - EXPECT_EQ(HIPBLAS_STATUS_NOT_SUPPORTED, status); // for cuda + EXPECT_EQ(HIPBLAS_STATUS_SUCCESS, status); // fail } } } +#endif + // notice we are using vector of vector // so each elment in xxx_range is a avector, // ValuesIn take each element (a vector) and combine them and feed them to test_p diff --git a/clients/gtest/tpmv_gtest.cpp b/clients/gtest/tpmv_gtest.cpp index fe8aa9625..1d7a1ad9b 100644 --- a/clients/gtest/tpmv_gtest.cpp +++ b/clients/gtest/tpmv_gtest.cpp @@ -1,5 +1,5 @@ /* ************************************************************************ - * Copyright 2016-2020 Advanced Micro Devices, Inc. + * Copyright 2016-2021 Advanced Micro Devices, Inc. * * ************************************************************************ */ @@ -166,6 +166,8 @@ TEST_P(blas2_tpmv_gtest, tpmv_double) } } +#ifndef __HIP_PLATFORM_NVCC__ + TEST_P(blas2_tpmv_gtest, tpmv_batched_float) { Arguments arg = setup_tpmv_arguments(GetParam()); @@ -181,7 +183,7 @@ TEST_P(blas2_tpmv_gtest, tpmv_batched_float) } else { - EXPECT_EQ(HIPBLAS_STATUS_NOT_SUPPORTED, status); // for cuda + EXPECT_EQ(HIPBLAS_STATUS_SUCCESS, status); // fail } } } @@ -201,11 +203,13 @@ TEST_P(blas2_tpmv_gtest, tpmv_strided_batched_float) } else { - EXPECT_EQ(HIPBLAS_STATUS_NOT_SUPPORTED, status); // for cuda + EXPECT_EQ(HIPBLAS_STATUS_SUCCESS, status); // fail } } } +#endif + // notice we are using vector of vector // so each elment in xxx_range is a avector, // ValuesIn take each element (a vector) and combine them and feed them to test_p diff --git a/clients/gtest/tpsv_gtest.cpp b/clients/gtest/tpsv_gtest.cpp index f5f4bb63d..3f06cd435 100644 --- a/clients/gtest/tpsv_gtest.cpp +++ b/clients/gtest/tpsv_gtest.cpp @@ -1,5 +1,5 @@ /* ************************************************************************ - * Copyright 2016-2020 Advanced Micro Devices, Inc. + * Copyright 2016-2021 Advanced Micro Devices, Inc. 
* * ************************************************************************ */ @@ -154,6 +154,8 @@ TEST_P(blas2_tpsv_gtest, tpsv_double_complex) } } +#ifndef __HIP_PLATFORM_NVCC__ + TEST_P(blas2_tpsv_gtest, tpsv_batched_float) { Arguments arg = setup_tpsv_arguments(GetParam()); @@ -169,7 +171,7 @@ TEST_P(blas2_tpsv_gtest, tpsv_batched_float) } else { - EXPECT_EQ(HIPBLAS_STATUS_NOT_SUPPORTED, status); // for cuda + EXPECT_EQ(HIPBLAS_STATUS_SUCCESS, status); // fail } } } @@ -189,7 +191,7 @@ TEST_P(blas2_tpsv_gtest, tpsv_batched_double_complex) } else { - EXPECT_EQ(HIPBLAS_STATUS_NOT_SUPPORTED, status); // for cuda + EXPECT_EQ(HIPBLAS_STATUS_SUCCESS, status); // fail } } } @@ -209,7 +211,7 @@ TEST_P(blas2_tpsv_gtest, tpsv_strided_batched_float) } else { - EXPECT_EQ(HIPBLAS_STATUS_NOT_SUPPORTED, status); // for cuda + EXPECT_EQ(HIPBLAS_STATUS_SUCCESS, status); // fail } } } @@ -229,11 +231,13 @@ TEST_P(blas2_tpsv_gtest, tpsv_strided_batched_double_complex) } else { - EXPECT_EQ(HIPBLAS_STATUS_NOT_SUPPORTED, status); // for cuda + EXPECT_EQ(HIPBLAS_STATUS_SUCCESS, status); // fail } } } +#endif + // notice we are using vector of vector // so each elment in xxx_range is a avector, // ValuesIn take each element (a vector) and combine them and feed them to test_p diff --git a/clients/gtest/trmm_gtest.cpp b/clients/gtest/trmm_gtest.cpp index e435d6490..368e51da8 100644 --- a/clients/gtest/trmm_gtest.cpp +++ b/clients/gtest/trmm_gtest.cpp @@ -213,6 +213,8 @@ TEST_P(trmm_gtest, trmm_gtest_double_complex) } } +#ifndef __HIP_PLATFORM_NVCC__ + TEST_P(trmm_gtest, trmm_batched_gtest_float) { // GetParam return a tuple. Tee setup routine unpack the tuple @@ -235,7 +237,7 @@ TEST_P(trmm_gtest, trmm_batched_gtest_float) } else { - EXPECT_EQ(HIPBLAS_STATUS_NOT_SUPPORTED, status); // for cuda + EXPECT_EQ(HIPBLAS_STATUS_SUCCESS, status); // fail } } } @@ -262,7 +264,7 @@ TEST_P(trmm_gtest, trmm_batched_gtest_double_complex) } else { - EXPECT_EQ(HIPBLAS_STATUS_NOT_SUPPORTED, status); // for cuda + EXPECT_EQ(HIPBLAS_STATUS_SUCCESS, status); // fail } } } @@ -289,7 +291,7 @@ TEST_P(trmm_gtest, trmm_strided_batched_gtest_float) } else { - EXPECT_EQ(HIPBLAS_STATUS_NOT_SUPPORTED, status); // for cuda + EXPECT_EQ(HIPBLAS_STATUS_SUCCESS, status); // fail } } } @@ -316,11 +318,13 @@ TEST_P(trmm_gtest, trmm_strided_batched_gtest_double_complex) } else { - EXPECT_EQ(HIPBLAS_STATUS_NOT_SUPPORTED, status); // for cuda + EXPECT_EQ(HIPBLAS_STATUS_SUCCESS, status); // fail } } } +#endif + // notice we are using vector of vector // so each elment in xxx_range is a avector, // ValuesIn take each element (a vector) and combine them and feed them to test_p diff --git a/clients/gtest/trmv_gtest.cpp b/clients/gtest/trmv_gtest.cpp index acfd6dca3..d76337714 100644 --- a/clients/gtest/trmv_gtest.cpp +++ b/clients/gtest/trmv_gtest.cpp @@ -1,5 +1,5 @@ /* ************************************************************************ - * Copyright 2016-2020 Advanced Micro Devices, Inc. + * Copyright 2016-2021 Advanced Micro Devices, Inc. 
* * ************************************************************************ */ @@ -167,6 +167,8 @@ TEST_P(blas2_trmv_gtest, trmv_double) } } +#ifndef __HIP_PLATFORM_NVCC__ + TEST_P(blas2_trmv_gtest, trmv_batched_float) { Arguments arg = setup_trmv_arguments(GetParam()); @@ -182,7 +184,7 @@ TEST_P(blas2_trmv_gtest, trmv_batched_float) } else { - EXPECT_EQ(HIPBLAS_STATUS_NOT_SUPPORTED, status); // for cuda + EXPECT_EQ(HIPBLAS_STATUS_SUCCESS, status); // fail } } } @@ -202,11 +204,13 @@ TEST_P(blas2_trmv_gtest, trmv_strided_batched_float) } else { - EXPECT_EQ(HIPBLAS_STATUS_NOT_SUPPORTED, status); // for cuda + EXPECT_EQ(HIPBLAS_STATUS_SUCCESS, status); // fail } } } +#endif + // notice we are using vector of vector // so each elment in xxx_range is a avector, // ValuesIn take each element (a vector) and combine them and feed them to test_p diff --git a/clients/gtest/trsm_ex_gtest.cpp b/clients/gtest/trsm_ex_gtest.cpp index ddc505394..6b830f645 100644 --- a/clients/gtest/trsm_ex_gtest.cpp +++ b/clients/gtest/trsm_ex_gtest.cpp @@ -1,5 +1,5 @@ /* ************************************************************************ - * Copyright 2016-2020 Advanced Micro Devices, Inc. + * Copyright 2016-2021 Advanced Micro Devices, Inc. * * ************************************************************************ */ @@ -133,6 +133,8 @@ class trsm_ex_gtest : public ::TestWithParam virtual void TearDown() {} }; +#ifndef __HIP_PLATFORM_NVCC__ + TEST_P(trsm_ex_gtest, trsm_ex_gtest_float) { // GetParam return a tuple. Tee setup routine unpack the tuple @@ -163,7 +165,7 @@ TEST_P(trsm_ex_gtest, trsm_ex_gtest_float) } else { - EXPECT_EQ(HIPBLAS_STATUS_NOT_SUPPORTED, status); // for cuda + EXPECT_EQ(HIPBLAS_STATUS_SUCCESS, status); // fail } } } @@ -191,7 +193,7 @@ TEST_P(trsm_ex_gtest, trsm_gtest_ex_double_complex) } else { - EXPECT_EQ(HIPBLAS_STATUS_NOT_SUPPORTED, status); // for cuda + EXPECT_EQ(HIPBLAS_STATUS_SUCCESS, status); // fail } } } @@ -219,7 +221,7 @@ TEST_P(trsm_ex_gtest, trsm_batched_ex_gtest_float) } else { - EXPECT_EQ(HIPBLAS_STATUS_NOT_SUPPORTED, status); // for cuda + EXPECT_EQ(HIPBLAS_STATUS_SUCCESS, status); // fail } } } @@ -247,7 +249,7 @@ TEST_P(trsm_ex_gtest, trsm_batched_ex_gtest_double_complex) } else { - EXPECT_EQ(HIPBLAS_STATUS_NOT_SUPPORTED, status); // for cuda + EXPECT_EQ(HIPBLAS_STATUS_SUCCESS, status); // fail } } } @@ -274,7 +276,7 @@ TEST_P(trsm_ex_gtest, trsm_strided_batched_ex_gtest_float) } else { - EXPECT_EQ(HIPBLAS_STATUS_NOT_SUPPORTED, status); // for cuda + EXPECT_EQ(HIPBLAS_STATUS_SUCCESS, status); // fail } } } @@ -301,11 +303,13 @@ TEST_P(trsm_ex_gtest, trsm_strided_batched_ex_gtest_double_complex) } else { - EXPECT_EQ(HIPBLAS_STATUS_NOT_SUPPORTED, status); // for cuda + EXPECT_EQ(HIPBLAS_STATUS_SUCCESS, status); // fail } } } +#endif + // notice we are using vector of vector // so each elment in xxx_range is a avector, // ValuesIn take each element (a vector) and combine them and feed them to test_p diff --git a/clients/gtest/trsm_gtest.cpp b/clients/gtest/trsm_gtest.cpp index ad1e4e97b..c3b6d330c 100644 --- a/clients/gtest/trsm_gtest.cpp +++ b/clients/gtest/trsm_gtest.cpp @@ -1,5 +1,5 @@ /* ************************************************************************ - * Copyright 2016-2020 Advanced Micro Devices, Inc. + * Copyright 2016-2021 Advanced Micro Devices, Inc. 
* * ************************************************************************ */ @@ -274,6 +274,8 @@ TEST_P(trsm_gtest, trsm_batched_gtest_double_complex) } } +#ifndef __HIP_PLATFORM_NVCC__ + TEST_P(trsm_gtest, trsm_strided_batched_gtest_float) { // GetParam return a tuple. Tee setup routine unpack the tuple @@ -295,7 +297,7 @@ TEST_P(trsm_gtest, trsm_strided_batched_gtest_float) } else { - EXPECT_EQ(HIPBLAS_STATUS_NOT_SUPPORTED, status); // for cuda + EXPECT_EQ(HIPBLAS_STATUS_SUCCESS, status); // fail } } } @@ -321,11 +323,13 @@ TEST_P(trsm_gtest, trsm_strided_batched_gtest_double_complex) } else { - EXPECT_EQ(HIPBLAS_STATUS_NOT_SUPPORTED, status); // for cuda + EXPECT_EQ(HIPBLAS_STATUS_SUCCESS, status); // fail } } } +#endif + // notice we are using vector of vector // so each elment in xxx_range is a avector, // ValuesIn take each element (a vector) and combine them and feed them to test_p diff --git a/clients/gtest/trsv_gtest.cpp b/clients/gtest/trsv_gtest.cpp index 128cce76d..438b03c15 100644 --- a/clients/gtest/trsv_gtest.cpp +++ b/clients/gtest/trsv_gtest.cpp @@ -1,5 +1,5 @@ /* ************************************************************************ - * Copyright 2016-2020 Advanced Micro Devices, Inc. + * Copyright 2016-2021 Advanced Micro Devices, Inc. * * ************************************************************************ */ @@ -155,6 +155,8 @@ TEST_P(blas2_trsv_gtest, trsv_double_complex) } } +#ifndef __HIP_PLATFORM_NVCC__ + TEST_P(blas2_trsv_gtest, trsv_batched_float) { Arguments arg = setup_trsv_arguments(GetParam()); @@ -170,7 +172,7 @@ TEST_P(blas2_trsv_gtest, trsv_batched_float) } else { - EXPECT_EQ(HIPBLAS_STATUS_NOT_SUPPORTED, status); // for cuda + EXPECT_EQ(HIPBLAS_STATUS_SUCCESS, status); // fail } } } @@ -190,7 +192,7 @@ TEST_P(blas2_trsv_gtest, trsv_batched_double_complex) } else { - EXPECT_EQ(HIPBLAS_STATUS_NOT_SUPPORTED, status); // for cuda + EXPECT_EQ(HIPBLAS_STATUS_SUCCESS, status); // fail } } } @@ -210,7 +212,7 @@ TEST_P(blas2_trsv_gtest, trsv_strided_batched_float) } else { - EXPECT_EQ(HIPBLAS_STATUS_NOT_SUPPORTED, status); // for cuda + EXPECT_EQ(HIPBLAS_STATUS_SUCCESS, status); // fail } } } @@ -230,11 +232,13 @@ TEST_P(blas2_trsv_gtest, trsv_strided_batched_double_complex) } else { - EXPECT_EQ(HIPBLAS_STATUS_NOT_SUPPORTED, status); // for cuda + EXPECT_EQ(HIPBLAS_STATUS_SUCCESS, status); // fail } } } +#endif + // notice we are using vector of vector // so each elment in xxx_range is a avector, // ValuesIn take each element (a vector) and combine them and feed them to test_p diff --git a/clients/gtest/trtri_gtest.cpp b/clients/gtest/trtri_gtest.cpp index 8b0f1466f..9a2b51980 100644 --- a/clients/gtest/trtri_gtest.cpp +++ b/clients/gtest/trtri_gtest.cpp @@ -1,5 +1,5 @@ /* ************************************************************************ - * Copyright 2016-2020 Advanced Micro Devices, Inc. + * Copyright 2016-2021 Advanced Micro Devices, Inc. * * ************************************************************************ */ @@ -101,6 +101,8 @@ class trtri_gtest : public ::TestWithParam virtual void TearDown() {} }; +#ifndef __HIP_PLATFORM_NVCC__ + TEST_P(trtri_gtest, trtri_float) { // GetParam return a tuple. 
Tee setup routine unpack the tuple @@ -121,7 +123,7 @@ TEST_P(trtri_gtest, trtri_float) } else { - EXPECT_EQ(HIPBLAS_STATUS_NOT_SUPPORTED, status); // for cuda + EXPECT_EQ(HIPBLAS_STATUS_SUCCESS, status); // fail } } } @@ -147,7 +149,7 @@ TEST_P(trtri_gtest, trtri_batched_float) } else { - EXPECT_EQ(HIPBLAS_STATUS_NOT_SUPPORTED, status); // for cuda + EXPECT_EQ(HIPBLAS_STATUS_SUCCESS, status); // fail } } } @@ -173,11 +175,13 @@ TEST_P(trtri_gtest, trtri_strided_batched_float) } else { - EXPECT_EQ(HIPBLAS_STATUS_NOT_SUPPORTED, status); // for cuda + EXPECT_EQ(HIPBLAS_STATUS_SUCCESS, status); // fail } } } +#endif + // notice we are using vector of vector for matrix size, and vector for uplo, diag // ValuesIn take each element (a vector or a char) and combine them and feed them to test_p // The combinations are { {N, lda}, uplo, diag } diff --git a/clients/include/testing_her.hpp b/clients/include/testing_her.hpp index 16218d780..3af028256 100644 --- a/clients/include/testing_her.hpp +++ b/clients/include/testing_her.hpp @@ -91,7 +91,12 @@ hipblasStatus_t testing_her(const Arguments& argus) if(argus.unit_check) { unit_check_general(N, N, lda, hA_cpu.data(), hA_host.data()); - unit_check_general(N, N, lda, hA_cpu.data(), hA_device.data()); + + // NOTE: on cuBLAS, with alpha == 0 and alpha on the device, there is not a quick-return, + // instead, the imaginary part of the diagonal elements are set to 0. in rocBLAS, we are quick-returning + // as well as in our reference code. For this reason, I've disabled the check here. + if(h_alpha) + unit_check_general(N, N, lda, hA_cpu.data(), hA_device.data()); } if(argus.norm_check) { diff --git a/clients/include/testing_her2.hpp b/clients/include/testing_her2.hpp index 64c3d2329..03cad6a23 100644 --- a/clients/include/testing_her2.hpp +++ b/clients/include/testing_her2.hpp @@ -95,7 +95,12 @@ hipblasStatus_t testing_her2(const Arguments& argus) if(argus.unit_check) { unit_check_general(N, N, lda, hA_cpu.data(), hA_host.data()); - unit_check_general(N, N, lda, hA_cpu.data(), hA_device.data()); + + // NOTE: on cuBLAS, with alpha == 0 and alpha on the device, there is not a quick-return, + // instead, the imaginary part of the diagonal elements are set to 0. in rocBLAS, we are quick-returning + // as well as in our reference code. For this reason, I've disabled the check here. + if(h_alpha != 0) + unit_check_general(N, N, lda, hA_cpu.data(), hA_device.data()); } if(argus.norm_check) { From 8003b0b32bbde34291067d83b11b8bb3306e614f Mon Sep 17 00:00:00 2001 From: daineAMD Date: Mon, 29 Mar 2021 12:23:29 -0600 Subject: [PATCH 8/8] Revert "Merge pull request #320 from ROCmSoftwarePlatform/revert-319-master" This reverts commit 00b0358511b0975af4ac87a0a3af0219c203479f, reversing changes made to f0399256d8fd41f0e11bbecbed4655cbadee0305. 
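
Note on the testing_her / testing_her2 hunks above: the device-pointer result is only
compared against the CPU reference when alpha is nonzero because, with alpha == 0, the
two backends leave the Hermitian matrix A in different states. A minimal sketch of that
difference, using std::complex and made-up helper names purely for illustration (this is
not code from either library):

    #include <complex>
    #include <vector>

    // rocBLAS and the CBLAS reference path: alpha == 0 is a quick return,
    // so A is left exactly as the caller passed it in.
    void her_quick_return(std::vector<std::complex<float>>& A, float alpha)
    {
        if(alpha == 0)
            return;
        // ... the rank-1 Hermitian update would happen here ...
    }

    // cuBLAS with the scalar in device memory: no quick return; the imaginary
    // part of each diagonal element of A is still forced to zero, so the two
    // results no longer match element-for-element.
    void her_zero_diag_imag(std::vector<std::complex<float>>& A, int N, int lda)
    {
        for(int i = 0; i < N; i++)
            A[i + i * static_cast<size_t>(lda)].imag(0);
    }

Hence the if(h_alpha) / if(h_alpha != 0) guards around the device-pointer
unit_check_general calls in those two tests.
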
--- CHANGELOG.md | 4 + CMakeLists.txt | 2 +- clients/benchmarks/client.cpp | 93 +++++----- clients/common/utility.cpp | 42 +++++ clients/gtest/blas1_gtest.cpp | 18 +- clients/gtest/trmm_gtest.cpp | 4 +- clients/include/bytes.hpp | 7 + clients/include/flops.hpp | 33 +--- clients/include/hipblas_vector.hpp | 40 ++++- clients/include/norm.h | 38 ++++ clients/include/testing_asum.hpp | 124 ++++++++----- clients/include/testing_asum_batched.hpp | 128 +++++++++----- .../include/testing_asum_strided_batched.hpp | 115 ++++++++---- clients/include/testing_axpy.hpp | 108 ++++++++--- clients/include/testing_axpy_batched.hpp | 167 +++++++++++------- .../include/testing_axpy_strided_batched.hpp | 114 +++++++++--- clients/include/testing_dot.hpp | 110 +++++++----- clients/include/testing_dot_batched.hpp | 167 ++++++++++-------- .../include/testing_dot_strided_batched.hpp | 96 ++++++---- clients/include/testing_gemv.hpp | 34 +--- clients/include/testing_scal_batched_ex.hpp | 60 ++++++- clients/include/testing_scal_ex.hpp | 61 +++++-- .../testing_scal_strided_batched_ex.hpp | 59 ++++++- clients/include/testing_trmm.hpp | 59 ++++++- clients/include/testing_trmm_batched.hpp | 97 +++++++--- .../include/testing_trmm_strided_batched.hpp | 117 +++++++++--- clients/include/type_dispatch.hpp | 15 +- clients/include/utility.h | 41 +++++ library/CMakeLists.txt | 4 +- library/include/hipblas.h | 16 +- library/src/CMakeLists.txt | 3 +- library/src/hipblas_auxiliary.cpp | 29 +++ 32 files changed, 1417 insertions(+), 588 deletions(-) create mode 100644 library/src/hipblas_auxiliary.cpp diff --git a/CHANGELOG.md b/CHANGELOG.md index fb58da4a9..8e0151368 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,9 @@ # Change Log for hipBLAS +## [hipBLAS 0.45.0 for ROCm 4.3.0] +### Added +- Added hipblasStatusToString + ## [hipBLAS 0.44.0 for ROCm 4.2.0] ### Added - Made necessary changes to work with rocBLAS' gemm_ex changes. 
When using rocBLAS backend, hipBLAS will query the preferable diff --git a/CMakeLists.txt b/CMakeLists.txt index da9cb52ee..d103c2f91 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -49,7 +49,7 @@ include( ROCMInstallTargets ) include( ROCMPackageConfigHelpers ) include( ROCMInstallSymlinks ) -set ( VERSION_STRING "0.44.0" ) +set ( VERSION_STRING "0.45.0" ) rocm_setup_version( VERSION ${VERSION_STRING} ) if( NOT DEFINED $ENV{HIP_PATH}) diff --git a/clients/benchmarks/client.cpp b/clients/benchmarks/client.cpp index 686587da5..3016aecdc 100644 --- a/clients/benchmarks/client.cpp +++ b/clients/benchmarks/client.cpp @@ -62,10 +62,10 @@ #include "testing_rotmg_strided_batched.hpp" #include "testing_scal.hpp" #include "testing_scal_batched.hpp" -// #include "testing_scal_batched_ex.hpp" -// #include "testing_scal_ex.hpp" +#include "testing_scal_batched_ex.hpp" +#include "testing_scal_ex.hpp" #include "testing_scal_strided_batched.hpp" -// #include "testing_scal_strided_batched_ex.hpp" +#include "testing_scal_strided_batched_ex.hpp" #include "testing_swap.hpp" #include "testing_swap_batched.hpp" #include "testing_swap_strided_batched.hpp" @@ -318,9 +318,18 @@ struct perf_blas{} || std::is_same void operator()(const Arguments& arg) { static const func_map fmap = { + {"asum", testing_asum}, + {"asum_batched", testing_asum_batched}, + {"asum_strided_batched", testing_asum_strided_batched}, + {"axpy", testing_axpy}, + {"axpy_batched", testing_axpy_batched}, + {"axpy_strided_batched", testing_axpy_strided_batched}, {"copy", testing_copy}, {"copy_batched", testing_copy_batched}, {"copy_strided_batched", testing_copy_strided_batched}, + {"dot", testing_dot}, + {"dot_batched", testing_dot_batched}, + {"dot_strided_batched", testing_dot_strided_batched}, {"swap", testing_swap}, {"swap_batched", testing_swap_batched}, {"swap_strided_batched", testing_swap_strided_batched}, @@ -331,15 +340,6 @@ struct perf_blas{} || std::is_same {"set_get_matrix", testing_set_get_matrix}, {"set_get_matrix_async", testing_set_get_matrix_async}, // L1 - {"asum", testing_asum}, - {"asum_batched", testing_asum_batched}, - {"asum_strided_batched", testing_asum_strided_batched}, - {"axpy", testing_axpy}, - {"axpy_batched", testing_axpy_batched}, - {"axpy_strided_batched", testing_axpy_strided_batched}, - {"dot", testing_dot}, - {"dot_batched", testing_dot_batched}, - {"dot_strided_batched", testing_dot_strided_batched}, {"iamax", testing_iamax}, {"iamax_batched", testing_iamax_batched}, {"iamax_strided_batched", testing_iamax_strided_batched}, @@ -416,13 +416,14 @@ struct perf_blas{} || std::is_same {"syrkx", testing_syr2k}, {"syrkx_batched", testing_syr2k_batched}, {"syrkx_strided_batched", testing_syr2k_strided_batched}, - {"trmm", testing_trmm}, - {"trmm_batched", testing_trmm_batched}, - {"trmm_strided_batched", testing_trmm_strided_batched}, + {"trtri", testing_trtri}, {"trtri_batched", testing_trtri_batched}, {"trtri_strided_batched", testing_trtri_strided_batched}, */ + {"trmm", testing_trmm}, + {"trmm_batched", testing_trmm_batched}, + {"trmm_strided_batched", testing_trmm_strided_batched}, {"gemm", testing_gemm}, {"gemm_batched", testing_gemm_batched}, {"gemm_strided_batched", testing_gemm_strided_batched}, @@ -453,11 +454,9 @@ struct perf_blas{}>> : h void operator()(const Arguments& arg) { static const func_map map = { - /* {"dot", testing_dot}, {"dot_batched", testing_dot_batched}, {"dot_strided_batched", testing_dot_strided_batched}, - */ }; run_function(map, arg); } @@ -469,13 +468,12 @@ struct perf_blas{}>> : 
hipbl void operator()(const Arguments& arg) { static const func_map map = { - /*{"axpy", testing_axpy}, - {"axpy_batched", testing_axpy_batched}, - {"axpy_strided_batched", testing_axpy_strided_batched}, - {"dot", testing_dot}, - {"dot_batched", testing_dot_batched}, - {"dot_strided_batched", testing_dot_strided_batched}, - */ + {"axpy", testing_axpy}, + {"axpy_batched", testing_axpy_batched}, + {"axpy_strided_batched", testing_axpy_strided_batched}, + {"dot", testing_dot}, + {"dot_batched", testing_dot_batched}, + {"dot_strided_batched", testing_dot_strided_batched}, {"gemm", testing_gemm}, {"gemm_batched", testing_gemm_batched}, {"gemm_strided_batched", testing_gemm_strided_batched}, @@ -495,27 +493,28 @@ struct perf_blas< void operator()(const Arguments& arg) { static const func_map map = { + {"asum", testing_asum}, + {"asum_batched", testing_asum_batched}, + {"asum_strided_batched", testing_asum_strided_batched}, + {"axpy", testing_axpy}, + {"axpy_batched", testing_axpy_batched}, + {"axpy_strided_batched", testing_axpy_strided_batched}, {"copy", testing_copy}, {"copy_batched", testing_copy_batched}, {"copy_strided_batched", testing_copy_strided_batched}, + {"dot", testing_dot}, + {"dot_batched", testing_dot_batched}, + {"dot_strided_batched", testing_dot_strided_batched}, + {"dotc", testing_dotc}, + {"dotc_batched", testing_dotc_batched}, + {"dotc_strided_batched", testing_dotc_strided_batched}, {"swap", testing_swap}, {"swap_batched", testing_swap_batched}, {"swap_strided_batched", testing_swap_strided_batched}, {"scal", testing_scal}, {"scal_batched", testing_scal_batched}, {"scal_strided_batched", testing_scal_strided_batched}, - /* {"asum", testing_asum}, - {"asum_batched", testing_asum_batched}, - {"asum_strided_batched", testing_asum_strided_batched}, - {"axpy", testing_axpy}, - {"axpy_batched", testing_axpy_batched}, - {"axpy_strided_batched", testing_axpy_strided_batched}, - {"dot", testing_dot}, - {"dot_batched", testing_dot_batched}, - {"dot_strided_batched", testing_dot_strided_batched}, - {"dotc", testing_dotc}, - {"dotc_batched", testing_dotc_batched}, - {"dotc_strided_batched", testing_dotc_strided_batched}, + /* {"iamax", testing_iamax}, {"iamax_batched", testing_iamax_batched}, {"iamax_strided_batched", testing_iamax_strided_batched}, @@ -634,10 +633,9 @@ struct perf_blas< {"trsv", testing_trsv}, {"trsv_batched", testing_trsv_batched}, {"trsv_strided_batched", testing_trsv_strided_batched}, - /*{"trmm", testing_trmm}, + {"trmm", testing_trmm}, {"trmm_batched", testing_trmm_batched}, {"trmm_strided_batched", testing_trmm_strided_batched}, - */ }; run_function(map, arg); } @@ -758,6 +756,7 @@ struct perf_blas_scal_ex< || (std::is_same{} && std::is_same{} && std::is_same{}) || (std::is_same{} && std::is_same{} && std::is_same{}) + || (std::is_same{} && std::is_same{} && std::is_same{}) || (std::is_same{} && std::is_same{} && std::is_same{}) || (std::is_same{} && std::is_same{} @@ -766,9 +765,9 @@ struct perf_blas_scal_ex< void operator()(const Arguments& arg) { static const func_map map = { - // {"scal_ex", testing_scal_ex}, - // {"scal_batched_ex", testing_scal_batched_ex}, - // {"scal_strided_batched_ex", testing_scal_strided_batched_ex}, + {"scal_ex", testing_scal_ex_template}, + {"scal_batched_ex", testing_scal_batched_ex_template}, + {"scal_strided_batched_ex", testing_scal_strided_batched_ex_template}, }; run_function(map, arg); } @@ -962,6 +961,9 @@ int run_bench_test(Arguments& arg) } else { + if(!strcmp(function, "scal_ex") || !strcmp(function, "scal_batched_ex") 
+ || !strcmp(function, "scal_strided_batched_ex")) + hipblas_blas1_ex_dispatch(arg); /* if(!strcmp(function, "scal") || !strcmp(function, "scal_batched") || !strcmp(function, "scal_strided_batched")) @@ -974,13 +976,10 @@ int run_bench_test(Arguments& arg) hipblas_blas1_dispatch(arg); else if(!strcmp(function, "axpy_ex") || !strcmp(function, "axpy_batched_ex") || !strcmp(function, "axpy_strided_batched_ex")) - hipblas_blas1_ex_dispatch(arg); - else if(!strcmp(function, "scal_ex") || !strcmp(function, "scal_batched_ex") - || !strcmp(function, "scal_strided_batched_ex")) - hipblas_blas1_ex_dispatch(arg); + hipblas_blas1_ex_dispatch(arg);*/ + else - */ - hipblas_simple_dispatch(arg); + hipblas_simple_dispatch(arg); } return 0; } diff --git a/clients/common/utility.cpp b/clients/common/utility.cpp index cf8fa579d..bab72b7c1 100644 --- a/clients/common/utility.cpp +++ b/clients/common/utility.cpp @@ -75,6 +75,48 @@ std::string hipblas_exepath() return pathstr; } +/***************** + * local handles * + *****************/ + +hipblasLocalHandle::hipblasLocalHandle() +{ + auto status = hipblasCreate(&m_handle); + if(status != HIPBLAS_STATUS_SUCCESS) + throw std::runtime_error(hipblasStatusToString(status)); +} + +hipblasLocalHandle::hipblasLocalHandle(const Arguments& arg) + : hipblasLocalHandle() +{ + // for future customization of handle based on arguments, example from rocblas below + + /* + auto status = rocblas_set_atomics_mode(m_handle, arg.atomics_mode); + + if(status == rocblas_status_success) + { + // If the test specifies user allocated workspace, allocate and use it + if(arg.user_allocated_workspace) + { + if((hipMalloc)(&m_memory, arg.user_allocated_workspace) != hipSuccess) + throw std::bad_alloc(); + status = rocblas_set_workspace(m_handle, m_memory, arg.user_allocated_workspace); + } + } + + if(status != rocblas_status_success) + throw std::runtime_error(rocblas_status_to_string(status)); + */ +} + +hipblasLocalHandle::~hipblasLocalHandle() +{ + if(m_memory) + (hipFree)(m_memory); + hipblasDestroy(m_handle); +} + #ifdef __cplusplus extern "C" { #endif diff --git a/clients/gtest/blas1_gtest.cpp b/clients/gtest/blas1_gtest.cpp index 2c1c48b2a..c2c56d49d 100644 --- a/clients/gtest/blas1_gtest.cpp +++ b/clients/gtest/blas1_gtest.cpp @@ -1934,7 +1934,7 @@ TEST_P(blas1_gtest, asum_float) // The Arguments data struture have physical meaning associated. // while the tuple is non-intuitive. 
Arguments arg = setup_blas1_arguments(GetParam()); - hipblasStatus_t status = testing_asum(arg); + hipblasStatus_t status = testing_asum(arg); // if not success, then the input argument is problematic, so detect the error message if(status != HIPBLAS_STATUS_SUCCESS) { @@ -1956,7 +1956,7 @@ TEST_P(blas1_gtest, asum_float) TEST_P(blas1_gtest, asum_float_complex) { Arguments arg = setup_blas1_arguments(GetParam()); - hipblasStatus_t status = testing_asum(arg); + hipblasStatus_t status = testing_asum(arg); // if not success, then the input argument is problematic, so detect the error message if(status != HIPBLAS_STATUS_SUCCESS) { @@ -1978,7 +1978,7 @@ TEST_P(blas1_gtest, asum_float_complex) TEST_P(blas1_gtest, asum_double_complex) { Arguments arg = setup_blas1_arguments(GetParam()); - hipblasStatus_t status = testing_asum(arg); + hipblasStatus_t status = testing_asum(arg); // if not success, then the input argument is problematic, so detect the error message if(status != HIPBLAS_STATUS_SUCCESS) { @@ -2001,7 +2001,7 @@ TEST_P(blas1_gtest, asum_double_complex) TEST_P(blas1_gtest, asum_batched_float) { Arguments arg = setup_blas1_arguments(GetParam()); - hipblasStatus_t status = testing_asum_batched(arg); + hipblasStatus_t status = testing_asum_batched(arg); // if not success, then the input argument is problematic, so detect the error message if(status != HIPBLAS_STATUS_SUCCESS) { @@ -2027,7 +2027,7 @@ TEST_P(blas1_gtest, asum_batched_float) TEST_P(blas1_gtest, asum_batched_float_complex) { Arguments arg = setup_blas1_arguments(GetParam()); - hipblasStatus_t status = testing_asum_batched(arg); + hipblasStatus_t status = testing_asum_batched(arg); // if not success, then the input argument is problematic, so detect the error message if(status != HIPBLAS_STATUS_SUCCESS) { @@ -2053,7 +2053,7 @@ TEST_P(blas1_gtest, asum_batched_float_complex) TEST_P(blas1_gtest, asum_batched_double_complex) { Arguments arg = setup_blas1_arguments(GetParam()); - hipblasStatus_t status = testing_asum_batched(arg); + hipblasStatus_t status = testing_asum_batched(arg); // if not success, then the input argument is problematic, so detect the error message if(status != HIPBLAS_STATUS_SUCCESS) { @@ -2080,7 +2080,7 @@ TEST_P(blas1_gtest, asum_batched_double_complex) TEST_P(blas1_gtest, asum_strided_batched_float) { Arguments arg = setup_blas1_arguments(GetParam()); - hipblasStatus_t status = testing_asum_strided_batched(arg); + hipblasStatus_t status = testing_asum_strided_batched(arg); // if not success, then the input argument is problematic, so detect the error message if(status != HIPBLAS_STATUS_SUCCESS) { @@ -2106,7 +2106,7 @@ TEST_P(blas1_gtest, asum_strided_batched_float) TEST_P(blas1_gtest, asum_strided_batched_float_complex) { Arguments arg = setup_blas1_arguments(GetParam()); - hipblasStatus_t status = testing_asum_strided_batched(arg); + hipblasStatus_t status = testing_asum_strided_batched(arg); // if not success, then the input argument is problematic, so detect the error message if(status != HIPBLAS_STATUS_SUCCESS) { @@ -2132,7 +2132,7 @@ TEST_P(blas1_gtest, asum_strided_batched_float_complex) TEST_P(blas1_gtest, asum_strided_batched_double_complex) { Arguments arg = setup_blas1_arguments(GetParam()); - hipblasStatus_t status = testing_asum_strided_batched(arg); + hipblasStatus_t status = testing_asum_strided_batched(arg); // if not success, then the input argument is problematic, so detect the error message if(status != HIPBLAS_STATUS_SUCCESS) { diff --git a/clients/gtest/trmm_gtest.cpp 
b/clients/gtest/trmm_gtest.cpp index 51e28222a..e435d6490 100644 --- a/clients/gtest/trmm_gtest.cpp +++ b/clients/gtest/trmm_gtest.cpp @@ -1,5 +1,5 @@ /* ************************************************************************ - * Copyright 2016-2020 Advanced Micro Devices, Inc. + * Copyright 2016-2021 Advanced Micro Devices, Inc. * * ************************************************************************ */ @@ -140,7 +140,7 @@ Arguments setup_trmm_arguments(trmm_tuple tup) arg.transA_option = side_uplo_transA_diag[2]; arg.diag_option = side_uplo_transA_diag[3]; - arg.timing = 1; + arg.timing = 0; arg.stride_scale = stride_scale; arg.batch_count = batch_count; diff --git a/clients/include/bytes.hpp b/clients/include/bytes.hpp index 422b7a75f..b9993675e 100644 --- a/clients/include/bytes.hpp +++ b/clients/include/bytes.hpp @@ -199,6 +199,13 @@ constexpr double gemm_gbyte_count(int m, int n, int k) return (sizeof(T) * (m * k + n * k + m * n)) / 1e9; } +/* \brief byte counts of TRMM */ +template +constexpr double trmm_gbyte_count(int m, int n, int k) +{ + return (sizeof(T) * (m * n * 2 + k * k / 2)) / 1e9; +} + /* \brief byte counts of TRSM */ template constexpr double trsm_gbyte_count(int m, int n, int k) diff --git a/clients/include/flops.hpp b/clients/include/flops.hpp index 644b34de5..fb5ee1694 100644 --- a/clients/include/flops.hpp +++ b/clients/include/flops.hpp @@ -720,42 +720,21 @@ constexpr double syrkx_gflop_count(int n, int k) /* \brief floating point counts of TRSM */ template -constexpr double trmm_gflop_count(int m, int n, hipblasSideMode_t side) +constexpr double trmm_gflop_count(int m, int n, int k) { - if(HIPBLAS_SIDE_LEFT == side) - { - return (1.0 * m * n * (m + 1)) / 1e9; - } - else - { - return (1.0 * m * n * (n + 1)) / 1e9; - } + return (1.0 * m * n * k) / 1e9; } template <> -constexpr double trmm_gflop_count(int m, int n, hipblasSideMode_t side) +constexpr double trmm_gflop_count(int m, int n, int k) { - if(HIPBLAS_SIDE_LEFT == side) - { - return 4 * (1.0 * m * n * (m + 1)) / 1e9; - } - else - { - return 4 * (1.0 * m * n * (n + 1)) / 1e9; - } + return 4 * (1.0 * m * n * k) / 1e9; } template <> -constexpr double trmm_gflop_count(int m, int n, hipblasSideMode_t side) +constexpr double trmm_gflop_count(int m, int n, int k) { - if(HIPBLAS_SIDE_LEFT == side) - { - return (1.0 * m * n * (m + 1)) / 1e9; - } - else - { - return (1.0 * m * n * (n + 1)) / 1e9; - } + return 4 * (1.0 * m * n * k) / 1e9; } /* \brief floating point counts of TRSM */ diff --git a/clients/include/hipblas_vector.hpp b/clients/include/hipblas_vector.hpp index 2e37e8164..d04314052 100644 --- a/clients/include/hipblas_vector.hpp +++ b/clients/include/hipblas_vector.hpp @@ -1,5 +1,5 @@ /* ************************************************************************ - * Copyright 2018-2020 Advanced Micro Devices, Inc. + * Copyright 2018-2021 Advanced Micro Devices, Inc. * ************************************************************************ */ #ifndef HIPBLAS_VECTOR_H_ @@ -114,4 +114,42 @@ inline void hipblas_init(host_batch_vector& that, bool seedReset = false) hipblas_init_template(that, random_generator, seedReset); } +//! +//! @brief Template for initializing a host (non_batched|batched|strided_batched)vector. +//! @param that That vector. +//! @param rand_gen The random number generator for odd elements +//! @param rand_gen_alt The random number generator for even elements +//! @param seedReset Reset the seed if true, do not reset the seed otherwise. +//! 
+template +void hipblas_init_alternating_template(U& that, T rand_gen(), T rand_gen_alt(), bool seedReset) +{ + if(seedReset) + hipblas_seedrand(); + + for(int b = 0; b < that.batch_count(); ++b) + { + auto* batched_data = that[b]; + ptrdiff_t inc = that.inc(); + auto n = that.n(); + if(inc < 0) + batched_data -= (n - 1) * inc; + + for(int i = 0; i < n; ++i) + { + if(i % 2) + batched_data[i * inc] = rand_gen(); + else + batched_data[i * inc] = rand_gen_alt(); + } + } +} + +template +void hipblas_init_alternating_sign(host_batch_vector& that, bool seedReset = false) +{ + hipblas_init_alternating_template( + that, random_generator, random_generator_negative, seedReset); +} + #endif diff --git a/clients/include/norm.h b/clients/include/norm.h index 1812c3c85..ab4b22684 100644 --- a/clients/include/norm.h +++ b/clients/include/norm.h @@ -106,6 +106,44 @@ double norm_check_general( return cumulative_error; } +template +double norm_check_general(char norm_type, + int M, + int N, + int lda, + host_batch_vector& hCPU, + host_batch_vector& hGPU, + int batch_count) +{ + // norm type can be O', 'I', 'F', 'o', 'i', 'f' for one, infinity or Frobenius norm + // one norm is max column sum + // infinity norm is max row sum + // Frobenius is l2 norm of matrix entries + // + // use triangle inequality ||a+b|| <= ||a|| + ||b|| to calculate upper limit for Frobenius norm + // of strided batched matrix + + double cumulative_error = 0.0; + + for(int i = 0; i < batch_count; i++) + { + auto index = i; + + auto error = norm_check_general(norm_type, M, N, lda, hCPU[index], hGPU[index]); + + if(norm_type == 'F' || norm_type == 'f') + { + cumulative_error += error; + } + else if(norm_type == 'O' || norm_type == 'o' || norm_type == 'I' || norm_type == 'i') + { + cumulative_error = cumulative_error > error ? cumulative_error : error; + } + } + + return cumulative_error; +} + template double vector_norm_1(int M, int incx, T* hx_gold, T* hx) { diff --git a/clients/include/testing_asum.hpp b/clients/include/testing_asum.hpp index d0e85df81..41c74312a 100644 --- a/clients/include/testing_asum.hpp +++ b/clients/include/testing_asum.hpp @@ -1,5 +1,5 @@ /* ************************************************************************ - * Copyright 2016-2020 Advanced Micro Devices, Inc. + * Copyright 2016-2021 Advanced Micro Devices, Inc. * * ************************************************************************ */ @@ -13,15 +13,20 @@ using namespace std; /* ============================================================================================ */ -template +template hipblasStatus_t testing_asum(const Arguments& argus) { + using Tr = real_t; + bool FORTRAN = argus.fortran; + auto hipblasAsumFn = FORTRAN ? hipblasAsum : hipblasAsum; int N = argus.N; int incx = argus.incx; hipblasStatus_t status_1 = HIPBLAS_STATUS_SUCCESS; hipblasStatus_t status_2 = HIPBLAS_STATUS_SUCCESS; + hipblasStatus_t status_3 = HIPBLAS_STATUS_SUCCESS; + hipblasStatus_t status_4 = HIPBLAS_STATUS_SUCCESS; // check to prevent undefined memory allocation error if(N < 0 || incx < 0) @@ -33,87 +38,114 @@ hipblasStatus_t testing_asum(const Arguments& argus) int sizeX = N * incx; // Naming: dX is in GPU (device) memory. 
hK is in CPU (host) memory, plz follow this practice - vector hx(sizeX); + host_vector hx(sizeX); - T1* dx; - T2* d_rocblas_result; - T2 cpu_result, rocblas_result; + device_vector dx(sizeX); + device_vector d_hipblas_result(1); + Tr cpu_result, hipblas_result_host, hipblas_result_device; - int device_pointer = 1; - - double gpu_time_used, cpu_time_used; - double rocblas_error; + double gpu_time_used, hipblas_error_host = 0, hipblas_error_device = 0; hipblasHandle_t handle; hipblasCreate(&handle); - // allocate memory on device - CHECK_HIP_ERROR(hipMalloc(&dx, sizeX * sizeof(T1))); - CHECK_HIP_ERROR(hipMalloc(&d_rocblas_result, sizeof(T2))); - // Initial Data on CPU srand(1); - hipblas_init(hx, 1, N, incx); + hipblas_init(hx, 1, N, incx); // copy data from CPU to device, does not work for incx != 1 - CHECK_HIP_ERROR(hipMemcpy(dx, hx.data(), sizeof(T1) * N * incx, hipMemcpyHostToDevice)); + CHECK_HIP_ERROR(hipMemcpy(dx, hx.data(), sizeof(T) * N * incx, hipMemcpyHostToDevice)); /* ===================================================================== - ROCBLAS - =================================================================== */ - /* ===================================================================== - CPU BLAS + HIPBLAS =================================================================== */ // hipblasAsum accept both dev/host pointer for the scalar - if(device_pointer) - { + status_1 = hipblasSetPointerMode(handle, HIPBLAS_POINTER_MODE_DEVICE); + status_2 = hipblasAsumFn(handle, N, dx, incx, d_hipblas_result); - status_1 = hipblasSetPointerMode(handle, HIPBLAS_POINTER_MODE_DEVICE); + status_3 = hipblasSetPointerMode(handle, HIPBLAS_POINTER_MODE_HOST); + status_4 = hipblasAsumFn(handle, N, dx, incx, &hipblas_result_host); - status_2 = hipblasAsum(handle, N, dx, incx, d_rocblas_result); - } - else + if((status_1 != HIPBLAS_STATUS_SUCCESS) || (status_2 != HIPBLAS_STATUS_SUCCESS) + || (status_3 != HIPBLAS_STATUS_SUCCESS) || (status_4 != HIPBLAS_STATUS_SUCCESS)) { - - status_1 = hipblasSetPointerMode(handle, HIPBLAS_POINTER_MODE_HOST); - - status_2 = hipblasAsum(handle, N, dx, incx, &rocblas_result); - } - - if((status_1 != HIPBLAS_STATUS_SUCCESS) || (status_2 != HIPBLAS_STATUS_SUCCESS)) - { - CHECK_HIP_ERROR(hipFree(dx)); - CHECK_HIP_ERROR(hipFree(d_rocblas_result)); hipblasDestroy(handle); if(status_1 != HIPBLAS_STATUS_SUCCESS) return status_1; if(status_2 != HIPBLAS_STATUS_SUCCESS) return status_2; + if(status_3 != HIPBLAS_STATUS_SUCCESS) + return status_3; + if(status_4 != HIPBLAS_STATUS_SUCCESS) + return status_4; } - if(device_pointer) - CHECK_HIP_ERROR( - hipMemcpy(&rocblas_result, d_rocblas_result, sizeof(T2), hipMemcpyDeviceToHost)); + CHECK_HIP_ERROR( + hipMemcpy(&hipblas_result_device, d_hipblas_result, sizeof(Tr), hipMemcpyDeviceToHost)); - if(argus.unit_check) + if(argus.unit_check || argus.norm_check) { - /* ===================================================================== CPU BLAS =================================================================== */ - cblas_asum(N, hx.data(), incx, &cpu_result); + cblas_asum(N, hx.data(), incx, &cpu_result); if(argus.unit_check) { - unit_check_general(1, 1, 1, &cpu_result, &rocblas_result); + unit_check_general(1, 1, 1, &cpu_result, &hipblas_result_host); + unit_check_general(1, 1, 1, &cpu_result, &hipblas_result_device); + } + if(argus.norm_check) + { + hipblas_error_host + = norm_check_general('M', 1, 1, 1, &cpu_result, &hipblas_result_host); + hipblas_error_device + = norm_check_general('M', 1, 1, 1, &cpu_result, 
&hipblas_result_device); } } // end of if unit/norm check - // BLAS_1_RESULT_PRINT - CHECK_HIP_ERROR(hipFree(dx)); - CHECK_HIP_ERROR(hipFree(d_rocblas_result)); + if(argus.timing) + { + hipStream_t stream; + status_1 = hipblasGetStream(handle, &stream); + status_2 = hipblasSetPointerMode(handle, HIPBLAS_POINTER_MODE_DEVICE); + + if((status_1 != HIPBLAS_STATUS_SUCCESS) || (status_2 != HIPBLAS_STATUS_SUCCESS)) + { + hipblasDestroy(handle); + if(status_1 != HIPBLAS_STATUS_SUCCESS) + return status_1; + if(status_2 != HIPBLAS_STATUS_SUCCESS) + return status_2; + } + + int runs = argus.cold_iters + argus.iters; + for(int iter = 0; iter < runs; iter++) + { + if(iter == argus.cold_iters) + gpu_time_used = get_time_us_sync(stream); + + status_1 = hipblasAsumFn(handle, N, dx, incx, d_hipblas_result); + + if(status_1 != HIPBLAS_STATUS_SUCCESS) + { + hipblasDestroy(handle); + return status_1; + } + } + gpu_time_used = get_time_us_sync(stream) - gpu_time_used; + + ArgumentModel{}.log_args(std::cout, + argus, + gpu_time_used, + asum_gflop_count(N), + asum_gbyte_count(N), + hipblas_error_host, + hipblas_error_device); + } + hipblasDestroy(handle); return HIPBLAS_STATUS_SUCCESS; } diff --git a/clients/include/testing_asum_batched.hpp b/clients/include/testing_asum_batched.hpp index a3418f29a..db2f34fcf 100644 --- a/clients/include/testing_asum_batched.hpp +++ b/clients/include/testing_asum_batched.hpp @@ -1,5 +1,5 @@ /* ************************************************************************ - * Copyright 2016-2020 Advanced Micro Devices, Inc. + * Copyright 2016-2021 Advanced Micro Devices, Inc. * * ************************************************************************ */ @@ -13,9 +13,14 @@ using namespace std; /* ============================================================================================ */ -template +template hipblasStatus_t testing_asum_batched(const Arguments& argus) { + using Tr = real_t; + bool FORTRAN = argus.fortran; + auto hipblasAsumBatchedFn + = FORTRAN ? hipblasAsumBatched : hipblasAsumBatched; + int N = argus.N; int incx = argus.incx; int batch_count = argus.batch_count; @@ -37,56 +42,36 @@ hipblasStatus_t testing_asum_batched(const Arguments& argus) int sizeX = N * incx; - double gpu_time_used, cpu_time_used; - double rocblas_error; + double gpu_time_used, hipblas_error_host, hipblas_error_device; hipblasHandle_t handle; hipblasCreate(&handle); // Naming: dX is in GPU (device) memory. 
hK is in CPU (host) memory, plz follow this practice - host_vector hx_array[batch_count]; - host_vector h_rocblas_result1(batch_count); - host_vector h_rocblas_result2(batch_count); - host_vector h_cpu_result(batch_count); - - device_batch_vector bx_array(batch_count, sizeX); - device_vector dx_array(batch_count); - device_vector d_rocblas_result(batch_count); + host_batch_vector hx(N, incx, batch_count); + host_vector h_hipblas_result_host(batch_count); + host_vector h_hipblas_result_device(batch_count); + host_vector h_cpu_result(batch_count); - int device_pointer = 1; - int host_pointer = 1; + device_batch_vector dx(N, incx, batch_count); + device_vector d_hipblas_result(batch_count); + CHECK_HIP_ERROR(dx.memcheck()); // Initial Data on CPU - srand(1); - for(int b = 0; b < batch_count; b++) - { - hx_array[b] = host_vector(sizeX); - - srand(1); - hipblas_init(hx_array[b], 1, N, incx); - - CHECK_HIP_ERROR( - hipMemcpy(bx_array[b], hx_array[b], sizeof(T1) * sizeX, hipMemcpyHostToDevice)); - } - CHECK_HIP_ERROR( - hipMemcpy(dx_array, bx_array, sizeof(T1*) * batch_count, hipMemcpyHostToDevice)); + hipblas_init(hx, true); + CHECK_HIP_ERROR(dx.transfer_from(hx)); /* ===================================================================== - ROCBLAS + HIPBLAS =================================================================== */ // hipblasAsum accept both dev/host pointer for the scalar - if(device_pointer) - { - status_1 = hipblasSetPointerMode(handle, HIPBLAS_POINTER_MODE_DEVICE); - status_2 - = hipblasAsumBatched(handle, N, dx_array, incx, batch_count, d_rocblas_result); - } - if(host_pointer) - { - status_3 = hipblasSetPointerMode(handle, HIPBLAS_POINTER_MODE_HOST); - status_4 - = hipblasAsumBatched(handle, N, dx_array, incx, batch_count, h_rocblas_result1); - } + status_1 = hipblasSetPointerMode(handle, HIPBLAS_POINTER_MODE_DEVICE); + status_2 + = hipblasAsumBatchedFn(handle, N, dx.ptr_on_device(), incx, batch_count, d_hipblas_result); + + status_3 = hipblasSetPointerMode(handle, HIPBLAS_POINTER_MODE_HOST); + status_4 = hipblasAsumBatchedFn( + handle, N, dx.ptr_on_device(), incx, batch_count, h_hipblas_result_host); if((status_1 != HIPBLAS_STATUS_SUCCESS) || (status_2 != HIPBLAS_STATUS_SUCCESS || (status_3 != HIPBLAS_STATUS_SUCCESS) @@ -103,29 +88,76 @@ hipblasStatus_t testing_asum_batched(const Arguments& argus) return status_4; } - if(device_pointer) - CHECK_HIP_ERROR(hipMemcpy( - h_rocblas_result2, d_rocblas_result, sizeof(T2) * batch_count, hipMemcpyDeviceToHost)); + CHECK_HIP_ERROR(hipMemcpy(h_hipblas_result_device, + d_hipblas_result, + sizeof(Tr) * batch_count, + hipMemcpyDeviceToHost)); - if(argus.unit_check) + if(argus.unit_check || argus.norm_check) { /* ===================================================================== CPU BLAS =================================================================== */ for(int b = 0; b < batch_count; b++) { - cblas_asum(N, hx_array[b], incx, &(h_cpu_result[b])); + cblas_asum(N, hx[b], incx, &(h_cpu_result[b])); } if(argus.unit_check) { - unit_check_general(1, batch_count, 1, h_cpu_result, h_rocblas_result1); - unit_check_general(1, batch_count, 1, h_cpu_result, h_rocblas_result2); + unit_check_general(1, batch_count, 1, h_cpu_result, h_hipblas_result_host); + unit_check_general(1, batch_count, 1, h_cpu_result, h_hipblas_result_device); + } + if(argus.norm_check) + { + hipblas_error_host = norm_check_general( + 'F', 1, batch_count, 1, h_cpu_result, h_hipblas_result_host); + hipblas_error_device = norm_check_general( + 'F', 1, batch_count, 1, 
h_cpu_result, h_hipblas_result_device); } } // end of if unit/norm check - // BLAS_1_RESULT_PRINT + if(argus.timing) + { + hipStream_t stream; + status_1 = hipblasGetStream(handle, &stream); + status_2 = hipblasSetPointerMode(handle, HIPBLAS_POINTER_MODE_DEVICE); + if(status_1 != HIPBLAS_STATUS_SUCCESS || status_2 != HIPBLAS_STATUS_SUCCESS) + { + hipblasDestroy(handle); + if(status_1 != HIPBLAS_STATUS_SUCCESS) + return status_1; + if(status_2 != HIPBLAS_STATUS_SUCCESS) + return status_2; + } + + int runs = argus.cold_iters + argus.iters; + for(int iter = 0; iter < runs; iter++) + { + if(iter == argus.cold_iters) + gpu_time_used = get_time_us_sync(stream); + + status_1 = hipblasAsumBatchedFn( + handle, N, dx.ptr_on_device(), incx, batch_count, d_hipblas_result); + + if(status_1 != HIPBLAS_STATUS_SUCCESS) + { + hipblasDestroy(handle); + return status_1; + } + } + gpu_time_used = get_time_us_sync(stream) - gpu_time_used; + + ArgumentModel{}.log_args(std::cout, + argus, + gpu_time_used, + asum_gflop_count(N), + asum_gbyte_count(N), + hipblas_error_host, + hipblas_error_device); + } + hipblasDestroy(handle); return HIPBLAS_STATUS_SUCCESS; } diff --git a/clients/include/testing_asum_strided_batched.hpp b/clients/include/testing_asum_strided_batched.hpp index c84832fc0..da2d991ed 100644 --- a/clients/include/testing_asum_strided_batched.hpp +++ b/clients/include/testing_asum_strided_batched.hpp @@ -1,5 +1,5 @@ /* ************************************************************************ - * Copyright 2016-2020 Advanced Micro Devices, Inc. + * Copyright 2016-2021 Advanced Micro Devices, Inc. * * ************************************************************************ */ @@ -13,9 +13,14 @@ using namespace std; /* ============================================================================================ */ -template +template hipblasStatus_t testing_asum_strided_batched(const Arguments& argus) { + using Tr = real_t; + bool FORTRAN = argus.fortran; + auto hipblasAsumStridedBatchedFn = FORTRAN ? hipblasAsumStridedBatched + : hipblasAsumStridedBatched; + int N = argus.N; int incx = argus.incx; double stride_scale = argus.stride_scale; @@ -29,8 +34,7 @@ hipblasStatus_t testing_asum_strided_batched(const Arguments& argus) hipblasStatus_t status_3 = HIPBLAS_STATUS_SUCCESS; hipblasStatus_t status_4 = HIPBLAS_STATUS_SUCCESS; - double gpu_time_used, cpu_time_used; - double rocblas_error; + double gpu_time_used, hipblas_error_host, hipblas_error_device; // check to prevent undefined memory allocation error if(N < 0 || incx < 0 || batch_count < 0) @@ -39,7 +43,7 @@ hipblasStatus_t testing_asum_strided_batched(const Arguments& argus) } if(batch_count == 0) { - // return early so we don't get invalid_value from rocblas because of bad result pointer + // return early so we don't get invalid_value from hipblas because of bad result pointer return HIPBLAS_STATUS_SUCCESS; } @@ -47,42 +51,32 @@ hipblasStatus_t testing_asum_strided_batched(const Arguments& argus) hipblasCreate(&handle); // Naming: dX is in GPU (device) memory. 
hK is in CPU (host) memory, plz follow this practice - host_vector hx(sizeX); - host_vector cpu_result(batch_count); - host_vector rocblas_result1(batch_count); - host_vector rocblas_result2(batch_count); - - device_vector dx(sizeX); - device_vector d_rocblas_result(batch_count); + host_vector hx(sizeX); + host_vector cpu_result(batch_count); + host_vector hipblas_result_host(batch_count); + host_vector hipblas_result_device(batch_count); - int device_pointer = 1; - int host_pointer = 1; + device_vector dx(sizeX); + device_vector d_hipblas_result(batch_count); // Initial Data on CPU srand(1); - hipblas_init(hx, 1, N, incx, stridex, batch_count); + hipblas_init(hx, 1, N, incx, stridex, batch_count); // copy data from CPU to device, does not work for incx != 1 - CHECK_HIP_ERROR(hipMemcpy(dx, hx.data(), sizeof(T1) * sizeX, hipMemcpyHostToDevice)); + CHECK_HIP_ERROR(hipMemcpy(dx, hx.data(), sizeof(T) * sizeX, hipMemcpyHostToDevice)); /* ===================================================================== - ROCBLAS + HIPBLAS =================================================================== */ // hipblasAsum accept both dev/host pointer for the scalar + status_1 = hipblasSetPointerMode(handle, HIPBLAS_POINTER_MODE_DEVICE); + status_2 + = hipblasAsumStridedBatchedFn(handle, N, dx, incx, stridex, batch_count, d_hipblas_result); - if(device_pointer) - { - status_1 = hipblasSetPointerMode(handle, HIPBLAS_POINTER_MODE_DEVICE); - status_2 = hipblasAsumStridedBatched( - handle, N, dx, incx, stridex, batch_count, d_rocblas_result); - } - - if(host_pointer) - { - status_3 = hipblasSetPointerMode(handle, HIPBLAS_POINTER_MODE_HOST); - status_4 = hipblasAsumStridedBatched( - handle, N, dx, incx, stridex, batch_count, rocblas_result1); - } + status_3 = hipblasSetPointerMode(handle, HIPBLAS_POINTER_MODE_HOST); + status_4 = hipblasAsumStridedBatchedFn( + handle, N, dx, incx, stridex, batch_count, hipblas_result_host); if((status_1 != HIPBLAS_STATUS_SUCCESS) || (status_2 != HIPBLAS_STATUS_SUCCESS) || (status_3 != HIPBLAS_STATUS_SUCCESS) || (status_4 != HIPBLAS_STATUS_SUCCESS)) @@ -98,29 +92,74 @@ hipblasStatus_t testing_asum_strided_batched(const Arguments& argus) return status_4; } - if(device_pointer) - CHECK_HIP_ERROR(hipMemcpy( - rocblas_result2, d_rocblas_result, sizeof(T2) * batch_count, hipMemcpyDeviceToHost)); + CHECK_HIP_ERROR(hipMemcpy( + hipblas_result_device, d_hipblas_result, sizeof(Tr) * batch_count, hipMemcpyDeviceToHost)); - if(argus.unit_check) + if(argus.unit_check || argus.norm_check) { /* ===================================================================== CPU BLAS =================================================================== */ for(int b = 0; b < batch_count; b++) { - cblas_asum(N, hx.data() + b * stridex, incx, &cpu_result[b]); + cblas_asum(N, hx.data() + b * stridex, incx, &cpu_result[b]); } if(argus.unit_check) { - unit_check_general(1, batch_count, 1, cpu_result, rocblas_result1); - unit_check_general(1, batch_count, 1, cpu_result, rocblas_result2); + unit_check_general(1, batch_count, 1, cpu_result, hipblas_result_host); + unit_check_general(1, batch_count, 1, cpu_result, hipblas_result_device); + } + if(argus.norm_check) + { + hipblas_error_host + = norm_check_general('F', 1, batch_count, 1, cpu_result, hipblas_result_host); + hipblas_error_device + = norm_check_general('F', 1, batch_count, 1, cpu_result, hipblas_result_device); } } // end of if unit/norm check - // BLAS_1_RESULT_PRINT + if(argus.timing) + { + hipStream_t stream; + status_1 = hipblasGetStream(handle, 
&stream); + status_2 = hipblasSetPointerMode(handle, HIPBLAS_POINTER_MODE_DEVICE); + if(status_1 != HIPBLAS_STATUS_SUCCESS || status_2 != HIPBLAS_STATUS_SUCCESS) + { + hipblasDestroy(handle); + if(status_1 != HIPBLAS_STATUS_SUCCESS) + return status_1; + if(status_2 != HIPBLAS_STATUS_SUCCESS) + return status_2; + } + + int runs = argus.cold_iters + argus.iters; + for(int iter = 0; iter < runs; iter++) + { + if(iter == argus.cold_iters) + gpu_time_used = get_time_us_sync(stream); + + status_1 = hipblasAsumStridedBatchedFn( + handle, N, dx, incx, stridex, batch_count, d_hipblas_result); + + if(status_1 != HIPBLAS_STATUS_SUCCESS) + { + hipblasDestroy(handle); + return status_1; + } + } + gpu_time_used = get_time_us_sync(stream) - gpu_time_used; + + ArgumentModel{}.log_args(std::cout, + argus, + gpu_time_used, + asum_gflop_count(N), + asum_gbyte_count(N), + hipblas_error_host, + hipblas_error_device); + } + hipblasDestroy(handle); return HIPBLAS_STATUS_SUCCESS; } diff --git a/clients/include/testing_axpy.hpp b/clients/include/testing_axpy.hpp index 91b2d099e..cdaabc30d 100644 --- a/clients/include/testing_axpy.hpp +++ b/clients/include/testing_axpy.hpp @@ -1,5 +1,5 @@ /* ************************************************************************ - * Copyright 2016-2020 Advanced Micro Devices, Inc. + * Copyright 2016-2021 Advanced Micro Devices, Inc. * * ************************************************************************ */ @@ -23,7 +23,10 @@ hipblasStatus_t testing_axpy(const Arguments& argus) int incx = argus.incx; int incy = argus.incy; - hipblasStatus_t status = HIPBLAS_STATUS_SUCCESS; + hipblasStatus_t status_1 = HIPBLAS_STATUS_SUCCESS; + hipblasStatus_t status_2 = HIPBLAS_STATUS_SUCCESS; + hipblasStatus_t status_3 = HIPBLAS_STATUS_SUCCESS; + hipblasStatus_t status_4 = HIPBLAS_STATUS_SUCCESS; int abs_incx = incx < 0 ? -incx : incx; int abs_incy = incy < 0 ? -incy : incy; @@ -41,15 +44,17 @@ hipblasStatus_t testing_axpy(const Arguments& argus) // Naming: dX is in GPU (device) memory. 
hK is in CPU (host) memory, plz follow this practice host_vector hx(sizeX); - host_vector hy(sizeY); + host_vector hy_host(sizeY); + host_vector hy_device(sizeY); host_vector hx_cpu(sizeX); host_vector hy_cpu(sizeY); device_vector dx(sizeX); - device_vector dy(sizeX); + device_vector dy_host(sizeY); + device_vector dy_device(sizeY); + device_vector d_alpha(1); - double gpu_time_used, cpu_time_used; - double rocblas_error = 0.0; + double gpu_time_used, hipblas_error_host, hipblas_error_device; hipblasHandle_t handle; hipblasCreate(&handle); @@ -57,47 +62,108 @@ hipblasStatus_t testing_axpy(const Arguments& argus) // Initial Data on CPU srand(1); hipblas_init(hx, 1, N, abs_incx); - hipblas_init(hy, 1, N, abs_incy); + hipblas_init(hy_host, 1, N, abs_incy); + hy_device = hy_host; // copy vector is easy in STL; hx_cpu = hx: save a copy in hx_cpu which will be output of CPU BLAS hx_cpu = hx; - hy_cpu = hy; + hy_cpu = hy_host; CHECK_HIP_ERROR(hipMemcpy(dx, hx.data(), sizeof(T) * sizeX, hipMemcpyHostToDevice)); - CHECK_HIP_ERROR(hipMemcpy(dy, hy.data(), sizeof(T) * sizeY, hipMemcpyHostToDevice)); + CHECK_HIP_ERROR(hipMemcpy(dy_host, hy_host.data(), sizeof(T) * sizeY, hipMemcpyHostToDevice)); + CHECK_HIP_ERROR( + hipMemcpy(dy_device, hy_device.data(), sizeof(T) * sizeY, hipMemcpyHostToDevice)); + CHECK_HIP_ERROR(hipMemcpy(d_alpha, &alpha, sizeof(T), hipMemcpyHostToDevice)); /* ===================================================================== - ROCBLAS + HIPBLAS =================================================================== */ - status = hipblasAxpyFn(handle, N, &alpha, dx, incx, dy, incy); - if(status != HIPBLAS_STATUS_SUCCESS) + status_1 = hipblasSetPointerMode(handle, HIPBLAS_POINTER_MODE_DEVICE); + status_2 = hipblasAxpyFn(handle, N, d_alpha, dx, incx, dy_device, incy); + + status_3 = hipblasSetPointerMode(handle, HIPBLAS_POINTER_MODE_HOST); + status_4 = hipblasAxpyFn(handle, N, &alpha, dx, incx, dy_host, incy); + + if((status_1 != HIPBLAS_STATUS_SUCCESS) || (status_2 != HIPBLAS_STATUS_SUCCESS) + || (status_3 != HIPBLAS_STATUS_SUCCESS) || (status_4 != HIPBLAS_STATUS_SUCCESS)) { hipblasDestroy(handle); - return status; + if(status_1 != HIPBLAS_STATUS_SUCCESS) + return status_1; + if(status_2 != HIPBLAS_STATUS_SUCCESS) + return status_2; + if(status_3 != HIPBLAS_STATUS_SUCCESS) + return status_3; + if(status_4 != HIPBLAS_STATUS_SUCCESS) + return status_4; } // copy output from device to CPU - CHECK_HIP_ERROR(hipMemcpy(hx.data(), dx, sizeof(T) * sizeX, hipMemcpyDeviceToHost)); - CHECK_HIP_ERROR(hipMemcpy(hy.data(), dy, sizeof(T) * sizeY, hipMemcpyDeviceToHost)); + CHECK_HIP_ERROR(hipMemcpy(hy_host.data(), dy_host, sizeof(T) * sizeY, hipMemcpyDeviceToHost)); + CHECK_HIP_ERROR( + hipMemcpy(hy_device.data(), dy_device, sizeof(T) * sizeY, hipMemcpyDeviceToHost)); - if(argus.unit_check) + if(argus.unit_check || argus.norm_check) { /* ===================================================================== CPU BLAS =================================================================== */ cblas_axpy(N, alpha, hx_cpu.data(), incx, hy_cpu.data(), incy); - // enable unit check, notice unit check is not invasive, but norm check is, - // unit check and norm check can not be interchanged their order if(argus.unit_check) { - unit_check_general(1, N, abs_incx, hx_cpu.data(), hx.data()); - unit_check_general(1, N, abs_incy, hy_cpu.data(), hy.data()); + unit_check_general(1, N, abs_incx, hy_cpu.data(), hy_host.data()); + unit_check_general(1, N, abs_incy, hy_cpu.data(), hy_device.data()); + } + 
if(argus.norm_check) + { + hipblas_error_host + = norm_check_general('F', 1, N, abs_incy, hy_cpu.data(), hy_host.data()); + hipblas_error_device + = norm_check_general('F', 1, N, abs_incy, hy_cpu.data(), hy_device.data()); } } // end of if unit check - // BLAS_1_RESULT_PRINT + if(argus.timing) + { + hipStream_t stream; + status_1 = hipblasGetStream(handle, &stream); + status_2 = hipblasSetPointerMode(handle, HIPBLAS_POINTER_MODE_DEVICE); + + if((status_1 != HIPBLAS_STATUS_SUCCESS) || (status_2 != HIPBLAS_STATUS_SUCCESS)) + { + hipblasDestroy(handle); + if(status_1 != HIPBLAS_STATUS_SUCCESS) + return status_1; + if(status_2 != HIPBLAS_STATUS_SUCCESS) + return status_2; + } + + int runs = argus.cold_iters + argus.iters; + for(int iter = 0; iter < runs; iter++) + { + if(iter == argus.cold_iters) + gpu_time_used = get_time_us_sync(stream); + + status_1 = hipblasAxpyFn(handle, N, d_alpha, dx, incx, dy_device, incy); + + if(status_1 != HIPBLAS_STATUS_SUCCESS) + { + hipblasDestroy(handle); + return status_1; + } + } + gpu_time_used = get_time_us_sync(stream) - gpu_time_used; + + ArgumentModel{}.log_args(std::cout, + argus, + gpu_time_used, + axpy_gflop_count(N), + axpy_gbyte_count(N), + hipblas_error_host, + hipblas_error_device); + } hipblasDestroy(handle); return HIPBLAS_STATUS_SUCCESS; diff --git a/clients/include/testing_axpy_batched.hpp b/clients/include/testing_axpy_batched.hpp index 8c7c3c6f9..2ff4c1e5c 100644 --- a/clients/include/testing_axpy_batched.hpp +++ b/clients/include/testing_axpy_batched.hpp @@ -25,7 +25,10 @@ hipblasStatus_t testing_axpy_batched(const Arguments& argus) int incy = argus.incy; int batch_count = argus.batch_count; - hipblasStatus_t status = HIPBLAS_STATUS_SUCCESS; + hipblasStatus_t status_1 = HIPBLAS_STATUS_SUCCESS; + hipblasStatus_t status_2 = HIPBLAS_STATUS_SUCCESS; + hipblasStatus_t status_3 = HIPBLAS_STATUS_SUCCESS; + hipblasStatus_t status_4 = HIPBLAS_STATUS_SUCCESS; int abs_incx = incx < 0 ? -incx : incx; int abs_incy = incy < 0 ? -incy : incy; @@ -44,94 +47,136 @@ hipblasStatus_t testing_axpy_batched(const Arguments& argus) int sizeY = N * abs_incy; T alpha = argus.alpha; - double gpu_time_used, cpu_time_used; - double rocblas_error = 0.0; + double gpu_time_used, hipblas_error_host, hipblas_error_device; hipblasHandle_t handle; hipblasCreate(&handle); // Naming: dX is in GPU (device) memory. 
hK is in CPU (host) memory, plz follow this practice - host_vector hx_array[batch_count]; - host_vector hy_array[batch_count]; - host_vector hx_cpu_array[batch_count]; - host_vector hy_cpu_array[batch_count]; - - device_batch_vector bx_array(batch_count, sizeX); - device_batch_vector by_array(batch_count, sizeY); + host_batch_vector hx(N, incx, batch_count); + host_batch_vector hy_host(N, incy, batch_count); + host_batch_vector hy_device(N, incy, batch_count); + host_batch_vector hx_cpu(N, incx, batch_count); + host_batch_vector hy_cpu(N, incy, batch_count); + + device_batch_vector dx(N, incx, batch_count); + device_batch_vector dy_host(N, incy, batch_count); + device_batch_vector dy_device(N, incy, batch_count); + device_vector d_alpha(1); + CHECK_HIP_ERROR(dx.memcheck()); + CHECK_HIP_ERROR(dy_host.memcheck()); + CHECK_HIP_ERROR(dy_device.memcheck()); + + hipblas_init(hx, true); + hipblas_init(hy_host, false); + hy_device.copy_from(hy_host); + hx_cpu.copy_from(hx); + hy_cpu.copy_from(hy_host); + + CHECK_HIP_ERROR(dx.transfer_from(hx)); + CHECK_HIP_ERROR(dy_host.transfer_from(hy_host)); + CHECK_HIP_ERROR(dy_device.transfer_from(hy_device)); + CHECK_HIP_ERROR(hipMemcpy(d_alpha, &alpha, sizeof(T), hipMemcpyHostToDevice)); + /* ===================================================================== + HIPBLAS + =================================================================== */ + status_1 = hipblasSetPointerMode(handle, HIPBLAS_POINTER_MODE_DEVICE); + status_2 = hipblasAxpyBatchedFn( + handle, N, d_alpha, dx.ptr_on_device(), incx, dy_device.ptr_on_device(), incy, batch_count); - device_vector dx_array(batch_count); - device_vector dy_array(batch_count); + status_3 = hipblasSetPointerMode(handle, HIPBLAS_POINTER_MODE_HOST); + status_4 = hipblasAxpyBatchedFn( + handle, N, &alpha, dx.ptr_on_device(), incx, dy_host.ptr_on_device(), incy, batch_count); - int last = batch_count - 1; - if(!dx_array || !dy_array || (!bx_array[last] && sizeX) || (!by_array[last] && sizeY)) + if((status_1 != HIPBLAS_STATUS_SUCCESS) || (status_2 != HIPBLAS_STATUS_SUCCESS) + || (status_3 != HIPBLAS_STATUS_SUCCESS) || (status_4 != HIPBLAS_STATUS_SUCCESS)) { hipblasDestroy(handle); - return HIPBLAS_STATUS_ALLOC_FAILED; + if(status_1 != HIPBLAS_STATUS_SUCCESS) + return status_1; + if(status_2 != HIPBLAS_STATUS_SUCCESS) + return status_2; + if(status_3 != HIPBLAS_STATUS_SUCCESS) + return status_3; + if(status_4 != HIPBLAS_STATUS_SUCCESS) + return status_4; } - // Initial Data on CPU - srand(1); - for(int b = 0; b < batch_count; b++) - { - hx_array[b] = host_vector(sizeX); - hy_array[b] = host_vector(sizeY); - hx_cpu_array[b] = host_vector(sizeX); - hy_cpu_array[b] = host_vector(sizeY); - - srand(1); - hipblas_init(hx_array[b], 1, N, abs_incx); - hipblas_init(hy_array[b], 1, N, abs_incy); - - hx_cpu_array[b] = hx_array[b]; - hy_cpu_array[b] = hy_array[b]; - - CHECK_HIP_ERROR( - hipMemcpy(bx_array[b], hx_array[b], sizeof(T) * sizeX, hipMemcpyHostToDevice)); - CHECK_HIP_ERROR( - hipMemcpy(by_array[b], hy_array[b], sizeof(T) * sizeY, hipMemcpyHostToDevice)); - } - CHECK_HIP_ERROR(hipMemcpy(dx_array, bx_array, sizeof(T*) * batch_count, hipMemcpyHostToDevice)); - CHECK_HIP_ERROR(hipMemcpy(dy_array, by_array, sizeof(T*) * batch_count, hipMemcpyHostToDevice)); + CHECK_HIP_ERROR(hy_host.transfer_from(dy_host)); + CHECK_HIP_ERROR(hy_device.transfer_from(dy_device)); - /* ===================================================================== - ROCBLAS - =================================================================== */ - status = 
hipblasAxpyBatchedFn(handle, N, &alpha, dx_array, incx, dy_array, incy, batch_count); - if(status != HIPBLAS_STATUS_SUCCESS) - { - hipblasDestroy(handle); - return status; - } - - // copy output from device to CPU - for(int b = 0; b < batch_count; b++) - { - CHECK_HIP_ERROR( - hipMemcpy(hx_array[b], bx_array[b], sizeof(T) * sizeX, hipMemcpyDeviceToHost)); - CHECK_HIP_ERROR( - hipMemcpy(hy_array[b], by_array[b], sizeof(T) * sizeY, hipMemcpyDeviceToHost)); - } - - if(argus.unit_check) + if(argus.unit_check || argus.norm_check) { /* ===================================================================== CPU BLAS =================================================================== */ for(int b = 0; b < batch_count; b++) { - cblas_axpy(N, alpha, hx_cpu_array[b].data(), incx, hy_cpu_array[b].data(), incy); + cblas_axpy(N, alpha, hx_cpu[b], incx, hy_cpu[b], incy); } // enable unit check, notice unit check is not invasive, but norm check is, // unit check and norm check can not be interchanged their order if(argus.unit_check) { - unit_check_general(1, N, batch_count, abs_incx, hx_cpu_array, hx_array); - unit_check_general(1, N, batch_count, abs_incy, hy_cpu_array, hy_array); + unit_check_general(1, N, batch_count, abs_incx, hy_cpu, hy_host); + unit_check_general(1, N, batch_count, abs_incy, hy_cpu, hy_device); + } + if(argus.norm_check) + { + norm_check_general('F', 1, N, abs_incy, hy_cpu, hy_host, batch_count); + norm_check_general('F', 1, N, abs_incy, hy_cpu, hy_device, batch_count); } } // end of if unit check + if(argus.timing) + { + hipStream_t stream; + status_1 = hipblasGetStream(handle, &stream); + status_2 = hipblasSetPointerMode(handle, HIPBLAS_POINTER_MODE_DEVICE); + + if((status_1 != HIPBLAS_STATUS_SUCCESS) || (status_2 != HIPBLAS_STATUS_SUCCESS)) + { + hipblasDestroy(handle); + if(status_1 != HIPBLAS_STATUS_SUCCESS) + return status_1; + if(status_2 != HIPBLAS_STATUS_SUCCESS) + return status_2; + } + + int runs = argus.cold_iters + argus.iters; + for(int iter = 0; iter < runs; iter++) + { + if(iter == argus.cold_iters) + gpu_time_used = get_time_us_sync(stream); + + status_1 = hipblasAxpyBatchedFn(handle, + N, + d_alpha, + dx.ptr_on_device(), + incx, + dy_device.ptr_on_device(), + incy, + batch_count); + + if(status_1 != HIPBLAS_STATUS_SUCCESS) + { + hipblasDestroy(handle); + return status_1; + } + } + gpu_time_used = get_time_us_sync(stream) - gpu_time_used; + + ArgumentModel{}.log_args(std::cout, + argus, + gpu_time_used, + axpy_gflop_count(N), + axpy_gbyte_count(N), + hipblas_error_host, + hipblas_error_device); + } + // BLAS_1_RESULT_PRINT hipblasDestroy(handle); diff --git a/clients/include/testing_axpy_strided_batched.hpp b/clients/include/testing_axpy_strided_batched.hpp index f2f301a26..e601e5756 100644 --- a/clients/include/testing_axpy_strided_batched.hpp +++ b/clients/include/testing_axpy_strided_batched.hpp @@ -1,5 +1,5 @@ /* ************************************************************************ - * Copyright 2016-2020 Advanced Micro Devices, Inc. + * Copyright 2016-2021 Advanced Micro Devices, Inc. 
* * ************************************************************************ */ @@ -35,7 +35,10 @@ hipblasStatus_t testing_axpy_strided_batched(const Arguments& argus) int sizeX = stridex * batch_count; int sizeY = stridey * batch_count; - hipblasStatus_t status = HIPBLAS_STATUS_SUCCESS; + hipblasStatus_t status_1 = HIPBLAS_STATUS_SUCCESS; + hipblasStatus_t status_2 = HIPBLAS_STATUS_SUCCESS; + hipblasStatus_t status_3 = HIPBLAS_STATUS_SUCCESS; + hipblasStatus_t status_4 = HIPBLAS_STATUS_SUCCESS; // argument sanity check, quick return if input parameters are invalid before allocating invalid // memory @@ -50,15 +53,17 @@ hipblasStatus_t testing_axpy_strided_batched(const Arguments& argus) // Naming: dX is in GPU (device) memory. hK is in CPU (host) memory, plz follow this practice host_vector hx(sizeX); - host_vector hy(sizeY); + host_vector hy_host(sizeY); + host_vector hy_device(sizeY); host_vector hx_cpu(sizeX); host_vector hy_cpu(sizeY); device_vector dx(sizeX); - device_vector dy(sizeY); + device_vector dy_host(sizeY); + device_vector dy_device(sizeY); + device_vector d_alpha(1); - double gpu_time_used, cpu_time_used; - double rocblas_error = 0.0; + double gpu_time_used, hipblas_error_host, hipblas_error_device; hipblasHandle_t handle; hipblasCreate(&handle); @@ -66,31 +71,50 @@ hipblasStatus_t testing_axpy_strided_batched(const Arguments& argus) // Initial Data on CPU srand(1); hipblas_init(hx, 1, N, abs_incx, stridex, batch_count); - hipblas_init(hy, 1, N, abs_incy, stridey, batch_count); + hipblas_init(hy_host, 1, N, abs_incy, stridey, batch_count); + hy_device = hy_host; // copy vector is easy in STL; hx_cpu = hx: save a copy in hx_cpu which will be output of CPU BLAS hx_cpu = hx; - hy_cpu = hy; + hy_cpu = hy_host; CHECK_HIP_ERROR(hipMemcpy(dx, hx.data(), sizeof(T) * sizeX, hipMemcpyHostToDevice)); - CHECK_HIP_ERROR(hipMemcpy(dy, hy.data(), sizeof(T) * sizeY, hipMemcpyHostToDevice)); + CHECK_HIP_ERROR(hipMemcpy(dy_host, hy_host.data(), sizeof(T) * sizeY, hipMemcpyHostToDevice)); + CHECK_HIP_ERROR( + hipMemcpy(dy_device, hy_device.data(), sizeof(T) * sizeY, hipMemcpyHostToDevice)); + CHECK_HIP_ERROR(hipMemcpy(d_alpha, &alpha, sizeof(T), hipMemcpyHostToDevice)); /* ===================================================================== - ROCBLAS + HIPBLAS =================================================================== */ - status = hipblasAxpyStridedBatchedFn( - handle, N, &alpha, dx, incx, stridex, dy, incy, stridey, batch_count); - if(status != HIPBLAS_STATUS_SUCCESS) + status_1 = hipblasSetPointerMode(handle, HIPBLAS_POINTER_MODE_DEVICE); + status_2 = hipblasAxpyStridedBatchedFn( + handle, N, d_alpha, dx, incx, stridex, dy_device, incy, stridey, batch_count); + + status_3 = hipblasSetPointerMode(handle, HIPBLAS_POINTER_MODE_HOST); + status_4 = hipblasAxpyStridedBatchedFn( + handle, N, &alpha, dx, incx, stridex, dy_host, incy, stridey, batch_count); + + if((status_1 != HIPBLAS_STATUS_SUCCESS) || (status_2 != HIPBLAS_STATUS_SUCCESS) + || (status_3 != HIPBLAS_STATUS_SUCCESS) || (status_4 != HIPBLAS_STATUS_SUCCESS)) { hipblasDestroy(handle); - return status; + if(status_1 != HIPBLAS_STATUS_SUCCESS) + return status_1; + if(status_2 != HIPBLAS_STATUS_SUCCESS) + return status_2; + if(status_3 != HIPBLAS_STATUS_SUCCESS) + return status_3; + if(status_4 != HIPBLAS_STATUS_SUCCESS) + return status_4; } // copy output from device to CPU - CHECK_HIP_ERROR(hipMemcpy(hx.data(), dx, sizeof(T) * sizeX, hipMemcpyDeviceToHost)); - CHECK_HIP_ERROR(hipMemcpy(hy.data(), dy, sizeof(T) * sizeY, 
hipMemcpyDeviceToHost)); + CHECK_HIP_ERROR(hipMemcpy(hy_host.data(), dy_host, sizeof(T) * sizeX, hipMemcpyDeviceToHost)); + CHECK_HIP_ERROR( + hipMemcpy(hy_device.data(), dy_device, sizeof(T) * sizeY, hipMemcpyDeviceToHost)); - if(argus.unit_check) + if(argus.unit_check || argus.norm_check) { /* ===================================================================== CPU BLAS @@ -105,13 +129,61 @@ hipblasStatus_t testing_axpy_strided_batched(const Arguments& argus) // unit check and norm check can not be interchanged their order if(argus.unit_check) { - unit_check_general(1, N, batch_count, abs_incx, stridex, hx_cpu.data(), hx.data()); - unit_check_general(1, N, batch_count, abs_incy, stridey, hy_cpu.data(), hy.data()); + unit_check_general( + 1, N, batch_count, abs_incx, stridex, hy_cpu.data(), hy_host.data()); + unit_check_general( + 1, N, batch_count, abs_incy, stridey, hy_cpu.data(), hy_device.data()); + } + if(argus.norm_check) + { + hipblas_error_host = norm_check_general( + 'F', 1, N, 1, stridey, hy_cpu.data(), hy_host.data(), batch_count); + hipblas_error_device = norm_check_general( + 'F', 1, N, 1, stridey, hy_cpu.data(), hy_device.data(), batch_count); } - } // end of if unit check - // BLAS_1_RESULT_PRINT + if(argus.timing) + { + hipStream_t stream; + status_1 = hipblasGetStream(handle, &stream); + status_2 = hipblasSetPointerMode(handle, HIPBLAS_POINTER_MODE_DEVICE); + + if((status_1 != HIPBLAS_STATUS_SUCCESS) || (status_2 != HIPBLAS_STATUS_SUCCESS)) + { + hipblasDestroy(handle); + if(status_1 != HIPBLAS_STATUS_SUCCESS) + return status_1; + if(status_2 != HIPBLAS_STATUS_SUCCESS) + return status_2; + } + + int runs = argus.cold_iters + argus.iters; + for(int iter = 0; iter < runs; iter++) + { + if(iter == argus.cold_iters) + gpu_time_used = get_time_us_sync(stream); + + status_1 = hipblasAxpyStridedBatchedFn( + handle, N, d_alpha, dx, incx, stridex, dy_device, incy, stridey, batch_count); + + if(status_1 != HIPBLAS_STATUS_SUCCESS) + { + hipblasDestroy(handle); + return status_1; + } + } + gpu_time_used = get_time_us_sync(stream) - gpu_time_used; + + ArgumentModel{}.log_args( + std::cout, + argus, + gpu_time_used, + axpy_gflop_count(N), + axpy_gbyte_count(N), + hipblas_error_host, + hipblas_error_device); + } hipblasDestroy(handle); return HIPBLAS_STATUS_SUCCESS; diff --git a/clients/include/testing_dot.hpp b/clients/include/testing_dot.hpp index 95fc13ac8..8336007cd 100644 --- a/clients/include/testing_dot.hpp +++ b/clients/include/testing_dot.hpp @@ -1,5 +1,5 @@ /* ************************************************************************ - * Copyright 2016-2020 Advanced Micro Devices, Inc. + * Copyright 2016-2021 Advanced Micro Devices, Inc. 
* * ************************************************************************ */ @@ -52,57 +52,44 @@ hipblasStatus_t testing_dot(const Arguments& argus) vector hx(sizeX); vector hy(sizeY); - T cpu_result, rocblas_result; - T * dx, *dy, *d_rocblas_result; - int device_pointer = 1; + T cpu_result, h_hipblas_result_1, h_hipblas_result_2; + device_vector dx(sizeX); + device_vector dy(sizeY); + device_vector d_hipblas_result(1); - double gpu_time_used, cpu_time_used; - double rocblas_error; + double gpu_time_used, hipblas_error_host, hipblas_error_device; hipblasHandle_t handle; hipblasCreate(&handle); - // allocate memory on device - CHECK_HIP_ERROR(hipMalloc(&dx, sizeX * sizeof(T))); - CHECK_HIP_ERROR(hipMalloc(&dy, sizeY * sizeof(T))); - CHECK_HIP_ERROR(hipMalloc(&d_rocblas_result, sizeof(T))); - // Initial Data on CPU srand(1); hipblas_init_alternating_sign(hx, 1, N, incx); hipblas_init(hy, 1, N, incy); // copy data from CPU to device, does not work for incx != 1 - CHECK_HIP_ERROR(hipMemcpy(dx, hx.data(), sizeof(T) * N * incx, hipMemcpyHostToDevice)); - CHECK_HIP_ERROR(hipMemcpy(dy, hy.data(), sizeof(T) * N * incy, hipMemcpyHostToDevice)); + CHECK_HIP_ERROR(hipMemcpy(dx, hx.data(), sizeof(T) * sizeX, hipMemcpyHostToDevice)); + CHECK_HIP_ERROR(hipMemcpy(dy, hy.data(), sizeof(T) * sizeY, hipMemcpyHostToDevice)); /* ===================================================================== - ROCBLAS - =================================================================== */ - /* ===================================================================== - CPU BLAS + HIPBLAS =================================================================== */ // hipblasDot accept both dev/host pointer for the scalar - if(device_pointer) - { - - status_1 = hipblasSetPointerMode(handle, HIPBLAS_POINTER_MODE_DEVICE); - - status_2 = (hipblasDotFn)(handle, N, dx, incx, dy, incy, d_rocblas_result); - } - else + status_1 = hipblasSetPointerMode(handle, HIPBLAS_POINTER_MODE_DEVICE); + status_2 = (hipblasDotFn)(handle, N, dx, incx, dy, incy, d_hipblas_result); + if((status_1 != HIPBLAS_STATUS_SUCCESS) || (status_2 != HIPBLAS_STATUS_SUCCESS)) { - - status_1 = hipblasSetPointerMode(handle, HIPBLAS_POINTER_MODE_HOST); - - status_2 = (hipblasDotFn)(handle, N, dx, incx, dy, incy, &rocblas_result); + hipblasDestroy(handle); + if(status_1 != HIPBLAS_STATUS_SUCCESS) + return status_1; + if(status_2 != HIPBLAS_STATUS_SUCCESS) + return status_2; } + status_1 = hipblasSetPointerMode(handle, HIPBLAS_POINTER_MODE_HOST); + status_2 = (hipblasDotFn)(handle, N, dx, incx, dy, incy, &h_hipblas_result_1); if((status_1 != HIPBLAS_STATUS_SUCCESS) || (status_2 != HIPBLAS_STATUS_SUCCESS)) { - CHECK_HIP_ERROR(hipFree(dx)); - CHECK_HIP_ERROR(hipFree(dy)); - CHECK_HIP_ERROR(hipFree(d_rocblas_result)); hipblasDestroy(handle); if(status_1 != HIPBLAS_STATUS_SUCCESS) return status_1; @@ -110,9 +97,8 @@ hipblasStatus_t testing_dot(const Arguments& argus) return status_2; } - if(device_pointer) - CHECK_HIP_ERROR( - hipMemcpy(&rocblas_result, d_rocblas_result, sizeof(T), hipMemcpyDeviceToHost)); + CHECK_HIP_ERROR( + hipMemcpy(&h_hipblas_result_2, d_hipblas_result, sizeof(T), hipMemcpyDeviceToHost)); if(argus.unit_check || argus.norm_check) { @@ -124,16 +110,60 @@ hipblasStatus_t testing_dot(const Arguments& argus) if(argus.unit_check) { - unit_check_general(1, 1, 1, &cpu_result, &rocblas_result); + unit_check_general(1, 1, 1, &cpu_result, &h_hipblas_result_1); + unit_check_general(1, 1, 1, &cpu_result, &h_hipblas_result_2); + } + if(argus.norm_check) + { + 
hipblas_error_host + = norm_check_general('F', 1, 1, 1, &cpu_result, &h_hipblas_result_1); + hipblas_error_device + = norm_check_general('F', 1, 1, 1, &cpu_result, &h_hipblas_result_2); } } // end of if unit/norm check - // BLAS_1_RESULT_PRINT + if(argus.timing) + { + hipStream_t stream; + status_1 = hipblasGetStream(handle, &stream); + status_2 = hipblasSetPointerMode(handle, HIPBLAS_POINTER_MODE_DEVICE); + + if((status_1 != HIPBLAS_STATUS_SUCCESS) || (status_2 != HIPBLAS_STATUS_SUCCESS)) + { + hipblasDestroy(handle); + if(status_1 != HIPBLAS_STATUS_SUCCESS) + return status_1; + if(status_2 != HIPBLAS_STATUS_SUCCESS) + return status_2; + } - CHECK_HIP_ERROR(hipFree(dx)); - CHECK_HIP_ERROR(hipFree(dy)); - CHECK_HIP_ERROR(hipFree(d_rocblas_result)); + int runs = argus.cold_iters + argus.iters; + for(int iter = 0; iter < runs; iter++) + { + if(iter == argus.cold_iters) + gpu_time_used = get_time_us_sync(stream); + + status_1 = (hipblasDotFn)(handle, N, dx, incx, dy, incy, d_hipblas_result); + + if(status_1 != HIPBLAS_STATUS_SUCCESS) + { + hipblasDestroy(handle); + return status_1; + } + } + gpu_time_used = get_time_us_sync(stream) - gpu_time_used; + + ArgumentModel{}.log_args(std::cout, + argus, + gpu_time_used, + dot_gflop_count(N), + dot_gbyte_count(N), + hipblas_error_host, + hipblas_error_device); + } + + // BLAS_1_RESULT_PRINT hipblasDestroy(handle); return HIPBLAS_STATUS_SUCCESS; } diff --git a/clients/include/testing_dot_batched.hpp b/clients/include/testing_dot_batched.hpp index 42c15a281..5eed829ef 100644 --- a/clients/include/testing_dot_batched.hpp +++ b/clients/include/testing_dot_batched.hpp @@ -1,5 +1,5 @@ /* ************************************************************************ - * Copyright 2016-2020 Advanced Micro Devices, Inc. + * Copyright 2016-2021 Advanced Micro Devices, Inc. * * ************************************************************************ */ @@ -45,78 +45,52 @@ hipblasStatus_t testing_dot_batched(const Arguments& argus) int sizeX = N * incx; int sizeY = N * incy; - double gpu_time_used, cpu_time_used; - double rocblas_error; + double gpu_time_used, hipblas_error_host, hipblas_error_device; hipblasHandle_t handle; hipblasCreate(&handle); // Naming: dX is in GPU (device) memory. hK is in CPU (host) memory, plz follow this practice - host_vector hx_array[batch_count]; - host_vector hy_array[batch_count]; - host_vector h_cpu_result(batch_count); - host_vector h_rocblas_result1(batch_count); - host_vector h_rocblas_result2(batch_count); - - device_batch_vector bx_array(batch_count, sizeX); - device_batch_vector by_array(batch_count, sizeY); - - device_vector dx_array(batch_count); - device_vector dy_array(batch_count); - device_vector d_rocblas_result(batch_count); - - int device_pointer = 1; - - // TODO: change to 1 when rocBLAS is fixed. 
- int host_pointer = 0; - - int last = batch_count - 1; - if(!dx_array || !dy_array || !d_rocblas_result || (!bx_array[last] && sizeX) - || (!by_array[last] && sizeY)) - { - hipblasDestroy(handle); - return HIPBLAS_STATUS_ALLOC_FAILED; - } - - // Initial Data on CPU - srand(1); - for(int b = 0; b < batch_count; b++) - { - hx_array[b] = host_vector(sizeX); - hy_array[b] = host_vector(sizeY); - - srand(1); - hipblas_init_alternating_sign(hx_array[b], 1, N, incx); - hipblas_init(hy_array[b], 1, N, incy); - - CHECK_HIP_ERROR( - hipMemcpy(bx_array[b], hx_array[b], sizeof(T) * sizeX, hipMemcpyHostToDevice)); - CHECK_HIP_ERROR( - hipMemcpy(by_array[b], hy_array[b], sizeof(T) * sizeY, hipMemcpyHostToDevice)); - } - CHECK_HIP_ERROR(hipMemcpy(dx_array, bx_array, batch_count * sizeof(T*), hipMemcpyHostToDevice)); - CHECK_HIP_ERROR(hipMemcpy(dy_array, by_array, batch_count * sizeof(T*), hipMemcpyHostToDevice)); + host_batch_vector hx(N, incx, batch_count); + host_batch_vector hy(N, incy, batch_count); + host_vector h_cpu_result(batch_count); + host_vector h_hipblas_result1(batch_count); + host_vector h_hipblas_result2(batch_count); + + device_batch_vector dx(N, incx, batch_count); + device_batch_vector dy(N, incy, batch_count); + device_vector d_hipblas_result(batch_count); + CHECK_HIP_ERROR(dx.memcheck()); + CHECK_HIP_ERROR(dy.memcheck()); + + hipblas_init_alternating_sign(hx, true); + hipblas_init(hy, false); + CHECK_HIP_ERROR(dx.transfer_from(hx)); + CHECK_HIP_ERROR(dy.transfer_from(hy)); /* ===================================================================== - ROCBLAS + HIPBLAS =================================================================== */ // hipblasDot accept both dev/host pointer for the scalar - if(device_pointer) - { - - status_1 = hipblasSetPointerMode(handle, HIPBLAS_POINTER_MODE_DEVICE); - - status_2 = (hipblasDotBatchedFn)( - handle, N, dx_array, incx, dy_array, incy, batch_count, d_rocblas_result); - } - if(host_pointer) - { - - status_3 = hipblasSetPointerMode(handle, HIPBLAS_POINTER_MODE_HOST); - - status_3 = (hipblasDotBatchedFn)( - handle, N, dx_array, incx, dy_array, incy, batch_count, h_rocblas_result2); - } + status_1 = hipblasSetPointerMode(handle, HIPBLAS_POINTER_MODE_DEVICE); + status_2 = (hipblasDotBatchedFn)(handle, + N, + dx.ptr_on_device(), + incx, + dy.ptr_on_device(), + incy, + batch_count, + d_hipblas_result); + + status_3 = hipblasSetPointerMode(handle, HIPBLAS_POINTER_MODE_HOST); + status_4 = (hipblasDotBatchedFn)(handle, + N, + dx.ptr_on_device(), + incx, + dy.ptr_on_device(), + incy, + batch_count, + h_hipblas_result1); if((status_1 != HIPBLAS_STATUS_SUCCESS) || (status_2 != HIPBLAS_STATUS_SUCCESS) || (status_3 != HIPBLAS_STATUS_SUCCESS) || (status_4 != HIPBLAS_STATUS_SUCCESS)) @@ -132,9 +106,8 @@ hipblasStatus_t testing_dot_batched(const Arguments& argus) return status_4; } - if(device_pointer) - CHECK_HIP_ERROR(hipMemcpy( - h_rocblas_result1, d_rocblas_result, sizeof(T) * batch_count, hipMemcpyDeviceToHost)); + CHECK_HIP_ERROR(hipMemcpy( + h_hipblas_result2, d_hipblas_result, sizeof(T) * batch_count, hipMemcpyDeviceToHost)); if(argus.unit_check || argus.norm_check) { @@ -143,19 +116,69 @@ hipblasStatus_t testing_dot_batched(const Arguments& argus) =================================================================== */ for(int b = 0; b < batch_count; b++) { - (CONJ ? cblas_dotc - : cblas_dot)(N, hx_array[b], incx, hy_array[b], incy, &(h_cpu_result[b])); + (CONJ ? 
cblas_dotc : cblas_dot)(N, hx[b], incx, hy[b], incy, &(h_cpu_result[b])); } if(argus.unit_check) { - unit_check_general(1, batch_count, 1, h_cpu_result, h_rocblas_result1); - // unit_check_general(1, batch_count, 1, h_cpu_result, h_rocblas_result2); + unit_check_general(1, batch_count, 1, h_cpu_result, h_hipblas_result1); + unit_check_general(1, batch_count, 1, h_cpu_result, h_hipblas_result2); + } + if(argus.norm_check) + { + hipblas_error_host + = norm_check_general('F', 1, batch_count, 1, h_cpu_result, h_hipblas_result1); + hipblas_error_device + = norm_check_general('F', 1, batch_count, 1, h_cpu_result, h_hipblas_result2); } } // end of if unit/norm check - // BLAS_1_RESULT_PRINT + if(argus.timing) + { + hipStream_t stream; + status_1 = hipblasGetStream(handle, &stream); + status_2 = hipblasSetPointerMode(handle, HIPBLAS_POINTER_MODE_DEVICE); + if(status_1 != HIPBLAS_STATUS_SUCCESS || status_2 != HIPBLAS_STATUS_SUCCESS) + { + hipblasDestroy(handle); + if(status_1 != HIPBLAS_STATUS_SUCCESS) + return status_1; + if(status_2 != HIPBLAS_STATUS_SUCCESS) + return status_2; + } + + int runs = argus.cold_iters + argus.iters; + for(int iter = 0; iter < runs; iter++) + { + if(iter == argus.cold_iters) + gpu_time_used = get_time_us_sync(stream); + + status_1 = (hipblasDotBatchedFn)(handle, + N, + dx.ptr_on_device(), + incx, + dy.ptr_on_device(), + incy, + batch_count, + d_hipblas_result); + + if(status_1 != HIPBLAS_STATUS_SUCCESS) + { + hipblasDestroy(handle); + return status_1; + } + } + gpu_time_used = get_time_us_sync(stream) - gpu_time_used; + + ArgumentModel{}.log_args(std::cout, + argus, + gpu_time_used, + dot_gflop_count(N), + dot_gbyte_count(N), + hipblas_error_host, + hipblas_error_device); + } hipblasDestroy(handle); return HIPBLAS_STATUS_SUCCESS; diff --git a/clients/include/testing_dot_strided_batched.hpp b/clients/include/testing_dot_strided_batched.hpp index 647961d12..c71a3550e 100644 --- a/clients/include/testing_dot_strided_batched.hpp +++ b/clients/include/testing_dot_strided_batched.hpp @@ -1,5 +1,5 @@ /* ************************************************************************ - * Copyright 2016-2020 Advanced Micro Devices, Inc. + * Copyright 2016-2021 Advanced Micro Devices, Inc. * * ************************************************************************ */ @@ -52,21 +52,15 @@ hipblasStatus_t testing_dot_strided_batched(const Arguments& argus) // Naming: dX is in GPU (device) memory. hK is in CPU (host) memory, plz follow this practice host_vector hx(sizeX); host_vector hy(sizeY); - host_vector h_rocblas_result1(batch_count); - host_vector h_rocblas_result2(batch_count); + host_vector h_hipblas_result1(batch_count); + host_vector h_hipblas_result2(batch_count); host_vector h_cpu_result(batch_count); device_vector dx(sizeX); device_vector dy(sizeY); - device_vector d_rocblas_result(batch_count); + device_vector d_hipblas_result(batch_count); - int device_pointer = 1; - - // TODO: Change to 1 when rocBLAS is fixed. 
- int host_pointer = 0; - - double gpu_time_used, cpu_time_used; - double rocblas_error; + double gpu_time_used, hipblas_error_host, hipblas_error_device; hipblasHandle_t handle; hipblasCreate(&handle); @@ -81,25 +75,16 @@ hipblasStatus_t testing_dot_strided_batched(const Arguments& argus) CHECK_HIP_ERROR(hipMemcpy(dy, hy.data(), sizeof(T) * sizeY, hipMemcpyHostToDevice)); /* ===================================================================== - ROCBLAS + HIPBLAS =================================================================== */ // hipblasDot accept both dev/host pointer for the scalar - if(device_pointer) - { - - status_1 = hipblasSetPointerMode(handle, HIPBLAS_POINTER_MODE_DEVICE); - - status_2 = (hipblasDotStridedBatchedFn)( - handle, N, dx, incx, stridex, dy, incy, stridey, batch_count, d_rocblas_result); - } - if(host_pointer) - { + status_1 = hipblasSetPointerMode(handle, HIPBLAS_POINTER_MODE_DEVICE); + status_2 = (hipblasDotStridedBatchedFn)( + handle, N, dx, incx, stridex, dy, incy, stridey, batch_count, d_hipblas_result); - status_3 = hipblasSetPointerMode(handle, HIPBLAS_POINTER_MODE_HOST); - - status_4 = (hipblasDotStridedBatchedFn)( - handle, N, dx, incx, stridex, dy, incy, stridey, batch_count, h_rocblas_result2); - } + status_3 = hipblasSetPointerMode(handle, HIPBLAS_POINTER_MODE_HOST); + status_4 = (hipblasDotStridedBatchedFn)( + handle, N, dx, incx, stridex, dy, incy, stridey, batch_count, h_hipblas_result1); if((status_1 != HIPBLAS_STATUS_SUCCESS) || (status_2 != HIPBLAS_STATUS_SUCCESS) || (status_3 != HIPBLAS_STATUS_SUCCESS) || (status_4 != HIPBLAS_STATUS_SUCCESS)) @@ -115,9 +100,8 @@ hipblasStatus_t testing_dot_strided_batched(const Arguments& argus) return status_4; } - if(device_pointer) - CHECK_HIP_ERROR(hipMemcpy( - h_rocblas_result1, d_rocblas_result, sizeof(T) * batch_count, hipMemcpyDeviceToHost)); + CHECK_HIP_ERROR(hipMemcpy( + h_hipblas_result2, d_hipblas_result, sizeof(T) * batch_count, hipMemcpyDeviceToHost)); if(argus.unit_check || argus.norm_check) { @@ -136,13 +120,59 @@ hipblasStatus_t testing_dot_strided_batched(const Arguments& argus) if(argus.unit_check) { - unit_check_general(1, batch_count, 1, h_cpu_result, h_rocblas_result1); - // unit_check_general(1, batch_count, 1, h_cpu_result, h_rocblas_result2); + unit_check_general(1, batch_count, 1, h_cpu_result, h_hipblas_result1); + unit_check_general(1, batch_count, 1, h_cpu_result, h_hipblas_result2); + } + if(argus.norm_check) + { + hipblas_error_host + = norm_check_general('F', 1, batch_count, 1, h_cpu_result, h_hipblas_result1); + hipblas_error_device + = norm_check_general('F', 1, batch_count, 1, h_cpu_result, h_hipblas_result2); } } // end of if unit/norm check - // BLAS_1_RESULT_PRINT + if(argus.timing) + { + hipStream_t stream; + status_1 = hipblasGetStream(handle, &stream); + status_2 = hipblasSetPointerMode(handle, HIPBLAS_POINTER_MODE_DEVICE); + if(status_1 != HIPBLAS_STATUS_SUCCESS || status_2 != HIPBLAS_STATUS_SUCCESS) + { + hipblasDestroy(handle); + if(status_1 != HIPBLAS_STATUS_SUCCESS) + return status_1; + if(status_2 != HIPBLAS_STATUS_SUCCESS) + return status_2; + } + + int runs = argus.cold_iters + argus.iters; + for(int iter = 0; iter < runs; iter++) + { + if(iter == argus.cold_iters) + gpu_time_used = get_time_us_sync(stream); + + status_1 = (hipblasDotStridedBatchedFn)( + handle, N, dx, incx, stridex, dy, incy, stridey, batch_count, d_hipblas_result); + + if(status_1 != HIPBLAS_STATUS_SUCCESS) + { + hipblasDestroy(handle); + return status_1; + } + } + gpu_time_used = 
get_time_us_sync(stream) - gpu_time_used; + + ArgumentModel{}.log_args( + std::cout, + argus, + gpu_time_used, + dot_gflop_count(N), + dot_gbyte_count(N), + hipblas_error_host, + hipblas_error_device); + } hipblasDestroy(handle); return HIPBLAS_STATUS_SUCCESS; diff --git a/clients/include/testing_gemv.hpp b/clients/include/testing_gemv.hpp index d4db9d87d..0983e289c 100644 --- a/clients/include/testing_gemv.hpp +++ b/clients/include/testing_gemv.hpp @@ -1,5 +1,5 @@ /* ************************************************************************ - * Copyright 2016-2020 Advanced Micro Devices, Inc. + * Copyright 2016-2021 Advanced Micro Devices, Inc. * * ************************************************************************ */ @@ -90,8 +90,7 @@ hipblasStatus_t testing_gemv(const Arguments& argus) T alpha = (T)argus.alpha; T beta = (T)argus.beta; - hipblasHandle_t handle; - hipblasCreate(&handle); + hipblasLocalHandle handle(argus); // Initial Data on CPU srand(1); @@ -113,14 +112,8 @@ hipblasStatus_t testing_gemv(const Arguments& argus) if(argus.unit_check || argus.norm_check) { - status = hipblasGemvFn( - handle, transA, M, N, (T*)&alpha, dA, lda, dx, incx, (T*)&beta, dy, incy); - - if(status != HIPBLAS_STATUS_SUCCESS) - { - hipblasDestroy(handle); - return status; - } + CHECK_HIPBLAS_ERROR(hipblasGemvFn( + handle, transA, M, N, (T*)&alpha, dA, lda, dx, incx, (T*)&beta, dy, incy)); /* ===================================================================== CPU BLAS @@ -146,12 +139,8 @@ hipblasStatus_t testing_gemv(const Arguments& argus) if(argus.timing) { hipStream_t stream; - status = hipblasGetStream(handle, &stream); - if(status != HIPBLAS_STATUS_SUCCESS) - { - hipblasDestroy(handle); - return status; - } + CHECK_HIPBLAS_ERROR(hipblasGetStream(handle, &stream)); + int runs = argus.cold_iters + argus.iters; for(int iter = 0; iter < runs; iter++) { @@ -160,14 +149,8 @@ hipblasStatus_t testing_gemv(const Arguments& argus) gpu_time_used = get_time_us_sync(stream); } - status = hipblasGemvFn( - handle, transA, M, N, (T*)&alpha, dA, lda, dx, incx, (T*)&beta, dy, incy); - - if(status != HIPBLAS_STATUS_SUCCESS) - { - hipblasDestroy(handle); - return status; - } + CHECK_HIPBLAS_ERROR(hipblasGemvFn( + handle, transA, M, N, (T*)&alpha, dA, lda, dx, incx, (T*)&beta, dy, incy)); } gpu_time_used = get_time_us_sync(stream) - gpu_time_used; @@ -180,6 +163,5 @@ hipblasStatus_t testing_gemv(const Arguments& argus) rocblas_error); } - hipblasDestroy(handle); return HIPBLAS_STATUS_SUCCESS; } diff --git a/clients/include/testing_scal_batched_ex.hpp b/clients/include/testing_scal_batched_ex.hpp index 9083f8f49..4851d0253 100644 --- a/clients/include/testing_scal_batched_ex.hpp +++ b/clients/include/testing_scal_batched_ex.hpp @@ -14,7 +14,7 @@ using namespace std; /* ============================================================================================ */ template -hipblasStatus_t testing_scal_batched_ex_template(Arguments argus) +hipblasStatus_t testing_scal_batched_ex_template(const Arguments& argus) { bool FORTRAN = argus.fortran; auto hipblasScalBatchedExFn = FORTRAN ? 
hipblasScalBatchedExFortran : hipblasScalBatchedEx; @@ -22,11 +22,14 @@ hipblasStatus_t testing_scal_batched_ex_template(Arguments argus) int N = argus.N; int incx = argus.incx; int batch_count = argus.batch_count; + int unit_check = argus.unit_check; + int timing = argus.timing; + int norm_check = argus.norm_check; hipblasStatus_t status = HIPBLAS_STATUS_SUCCESS; int sizeX = N * incx; - Ta alpha = argus.alpha; + Ta alpha = argus.get_alpha(); // argument sanity check, quick return if input parameters are invalid before allocating invalid // memory @@ -59,8 +62,8 @@ hipblasStatus_t testing_scal_batched_ex_template(Arguments argus) return HIPBLAS_STATUS_ALLOC_FAILED; } - double gpu_time_used, cpu_time_used; - double rocblas_error = 0.0; + double gpu_time_used = 0.0, cpu_time_used = 0.0; + double hipblas_error = 0.0; hipblasHandle_t handle; hipblasCreate(&handle); @@ -85,6 +88,7 @@ hipblasStatus_t testing_scal_batched_ex_template(Arguments argus) =================================================================== */ status = hipblasScalBatchedExFn( handle, N, &alpha, alphaType, dx, xType, incx, batch_count, executionType); + if(status != HIPBLAS_STATUS_SUCCESS) { hipblasDestroy(handle); @@ -97,7 +101,7 @@ hipblasStatus_t testing_scal_batched_ex_template(Arguments argus) CHECK_HIP_ERROR(hipMemcpy(hx[b], bx[b], sizeof(Tx) * sizeX, hipMemcpyDeviceToHost)); } - if(argus.unit_check) + if(unit_check || norm_check) { /* ===================================================================== CPU BLAS @@ -109,20 +113,58 @@ hipblasStatus_t testing_scal_batched_ex_template(Arguments argus) // enable unit check, notice unit check is not invasive, but norm check is, // unit check and norm check can not be interchanged their order - if(argus.unit_check) + if(unit_check) { unit_check_general(1, N, batch_count, incx, hz, hx); } + if(norm_check) + { + hipblas_error = norm_check_general('F', 1, N, incx, hz, hx, batch_count); + } + } // end of if unit check - // BLAS_1_RESULT_PRINT + if(timing) + { + hipStream_t stream; + status = hipblasGetStream(handle, &stream); + if(status != HIPBLAS_STATUS_SUCCESS) + { + hipblasDestroy(handle); + return status; + } + + int runs = argus.cold_iters + argus.iters; + for(int iter = 0; iter < runs; iter++) + { + if(iter == argus.cold_iters) + gpu_time_used = get_time_us_sync(stream); + + status = hipblasScalBatchedExFn( + handle, N, &alpha, alphaType, dx, xType, incx, batch_count, executionType); + if(status != HIPBLAS_STATUS_SUCCESS) + { + hipblasDestroy(handle); + return status; + } + } + gpu_time_used = get_time_us_sync(stream) - gpu_time_used; + + ArgumentModel{}.log_args( + std::cout, + argus, + gpu_time_used, + scal_gflop_count(N), + scal_gbyte_count(N), + hipblas_error); + } hipblasDestroy(handle); - return HIPBLAS_STATUS_SUCCESS; + return status; } -hipblasStatus_t testing_scal_batched_ex(Arguments argus) +hipblasStatus_t testing_scal_batched_ex(const Arguments& argus) { hipblasDatatype_t alphaType = argus.a_type; hipblasDatatype_t xType = argus.b_type; diff --git a/clients/include/testing_scal_ex.hpp b/clients/include/testing_scal_ex.hpp index 493f81fb4..50f38dc45 100644 --- a/clients/include/testing_scal_ex.hpp +++ b/clients/include/testing_scal_ex.hpp @@ -14,18 +14,21 @@ using namespace std; /* ============================================================================================ */ template -hipblasStatus_t testing_scal_ex_template(Arguments argus) +hipblasStatus_t testing_scal_ex_template(const Arguments& argus) { bool FORTRAN = argus.fortran; auto 
hipblasScalExFn = FORTRAN ? hipblasScalExFortran : hipblasScalEx; - int N = argus.N; - int incx = argus.incx; + int N = argus.N; + int incx = argus.incx; + int unit_check = argus.unit_check; + int timing = argus.timing; + int norm_check = argus.norm_check; hipblasStatus_t status = HIPBLAS_STATUS_SUCCESS; int sizeX = N * incx; - Ta alpha = argus.alpha; + Ta alpha = argus.get_alpha(); // argument sanity check, quick return if input parameters are invalid before allocating invalid // memory @@ -43,8 +46,8 @@ hipblasStatus_t testing_scal_ex_template(Arguments argus) host_vector hz(sizeX); device_vector dx(sizeX); - double gpu_time_used, cpu_time_used; - double rocblas_error = 0.0; + double gpu_time_used = 0.0, cpu_time_used = 0.0; + double hipblas_error = 0.0; hipblasHandle_t handle; hipblasCreate(&handle); @@ -72,7 +75,7 @@ hipblasStatus_t testing_scal_ex_template(Arguments argus) // copy output from device to CPU CHECK_HIP_ERROR(hipMemcpy(hx.data(), dx, sizeof(Tx) * N * incx, hipMemcpyDeviceToHost)); - if(argus.unit_check) + if(unit_check || norm_check) { /* ===================================================================== CPU BLAS @@ -81,20 +84,56 @@ hipblasStatus_t testing_scal_ex_template(Arguments argus) // enable unit check, notice unit check is not invasive, but norm check is, // unit check and norm check can not be interchanged their order - if(argus.unit_check) + if(unit_check) { unit_check_general(1, N, incx, hz.data(), hx.data()); } + if(norm_check) + { + hipblas_error = norm_check_general('F', 1, N, incx, hz.data(), hx.data()); + } + } // end of if unit check - // BLAS_1_RESULT_PRINT + if(timing) + { + hipStream_t stream; + status = hipblasGetStream(handle, &stream); + if(status != HIPBLAS_STATUS_SUCCESS) + { + hipblasDestroy(handle); + return status; + } + + int runs = argus.cold_iters + argus.iters; + for(int iter = 0; iter < runs; iter++) + { + if(iter == argus.cold_iters) + gpu_time_used = get_time_us_sync(stream); + + status = hipblasScalExFn(handle, N, &alpha, alphaType, dx, xType, incx, executionType); + if(status != HIPBLAS_STATUS_SUCCESS) + { + hipblasDestroy(handle); + return status; + } + } + gpu_time_used = get_time_us_sync(stream) - gpu_time_used; + + ArgumentModel{}.log_args(std::cout, + argus, + gpu_time_used, + scal_gflop_count(N), + scal_gbyte_count(N), + hipblas_error); + } hipblasDestroy(handle); - return HIPBLAS_STATUS_SUCCESS; + return status; } -hipblasStatus_t testing_scal_ex(Arguments argus) +hipblasStatus_t testing_scal_ex(const Arguments& argus) { hipblasDatatype_t alphaType = argus.a_type; hipblasDatatype_t xType = argus.b_type; diff --git a/clients/include/testing_scal_strided_batched_ex.hpp b/clients/include/testing_scal_strided_batched_ex.hpp index 3405319fa..f1b8efb53 100644 --- a/clients/include/testing_scal_strided_batched_ex.hpp +++ b/clients/include/testing_scal_strided_batched_ex.hpp @@ -14,7 +14,7 @@ using namespace std; /* ============================================================================================ */ template -hipblasStatus_t testing_scal_strided_batched_ex_template(Arguments argus) +hipblasStatus_t testing_scal_strided_batched_ex_template(const Arguments& argus) { bool FORTRAN = argus.fortran; auto hipblasScalStridedBatchedExFn @@ -24,13 +24,16 @@ hipblasStatus_t testing_scal_strided_batched_ex_template(Arguments argus) int incx = argus.incx; double stride_scale = argus.stride_scale; int batch_count = argus.batch_count; + int unit_check = argus.unit_check; + int timing = argus.timing; + int norm_check = argus.norm_check; 
hipblasStride stridex = N * incx * stride_scale; int sizeX = stridex * batch_count; hipblasStatus_t status = HIPBLAS_STATUS_SUCCESS; - Ta alpha = argus.alpha; + Ta alpha = argus.get_alpha(); // argument sanity check, quick return if input parameters are invalid before allocating invalid // memory @@ -52,8 +55,8 @@ hipblasStatus_t testing_scal_strided_batched_ex_template(Arguments argus) host_vector hz(sizeX); device_vector dx(sizeX); - double gpu_time_used, cpu_time_used; - double rocblas_error = 0.0; + double gpu_time_used = 0.0, cpu_time_used = 0.0; + double hipblas_error = 0.0; hipblasHandle_t handle; hipblasCreate(&handle); @@ -82,7 +85,7 @@ hipblasStatus_t testing_scal_strided_batched_ex_template(Arguments argus) // copy output from device to CPU CHECK_HIP_ERROR(hipMemcpy(hx.data(), dx, sizeof(Tx) * sizeX, hipMemcpyDeviceToHost)); - if(argus.unit_check) + if(unit_check || norm_check) { /* ===================================================================== CPU BLAS @@ -94,20 +97,58 @@ hipblasStatus_t testing_scal_strided_batched_ex_template(Arguments argus) // enable unit check, notice unit check is not invasive, but norm check is, // unit check and norm check can not be interchanged their order - if(argus.unit_check) + if(unit_check) { unit_check_general(1, N, batch_count, incx, stridex, hz, hx); } + if(norm_check) + { + hipblas_error = norm_check_general('F', 1, N, incx, stridex, hz, hx, batch_count); + } + } // end of if unit check - // BLAS_1_RESULT_PRINT + if(timing) + { + hipStream_t stream; + status = hipblasGetStream(handle, &stream); + if(status != HIPBLAS_STATUS_SUCCESS) + { + hipblasDestroy(handle); + return status; + } + + int runs = argus.cold_iters + argus.iters; + for(int iter = 0; iter < runs; iter++) + { + if(iter == argus.cold_iters) + gpu_time_used = get_time_us_sync(stream); + + status = hipblasScalStridedBatchedExFn( + handle, N, &alpha, alphaType, dx, xType, incx, stridex, batch_count, executionType); + if(status != HIPBLAS_STATUS_SUCCESS) + { + hipblasDestroy(handle); + return status; + } + } + gpu_time_used = get_time_us_sync(stream) - gpu_time_used; + + ArgumentModel{}.log_args( + std::cout, + argus, + gpu_time_used, + scal_gflop_count(N), + scal_gbyte_count(N), + hipblas_error); + } hipblasDestroy(handle); - return HIPBLAS_STATUS_SUCCESS; + return status; } -hipblasStatus_t testing_scal_strided_batched_ex(Arguments argus) +hipblasStatus_t testing_scal_strided_batched_ex(const Arguments& argus) { hipblasDatatype_t alphaType = argus.a_type; hipblasDatatype_t xType = argus.b_type; diff --git a/clients/include/testing_trmm.hpp b/clients/include/testing_trmm.hpp index a2bfde923..237a877eb 100644 --- a/clients/include/testing_trmm.hpp +++ b/clients/include/testing_trmm.hpp @@ -1,5 +1,5 @@ /* ************************************************************************ - * Copyright 2016-2020 Advanced Micro Devices, Inc. + * Copyright 2016-2021 Advanced Micro Devices, Inc. 
* * ************************************************************************ */ @@ -29,7 +29,7 @@ hipblasStatus_t testing_trmm(const Arguments& argus) char char_uplo = argus.uplo_option; char char_transA = argus.transA_option; char char_diag = argus.diag_option; - T alpha = argus.alpha; + T alpha = argus.get_alpha(); hipblasSideMode_t side = char2hipblas_side(char_side); hipblasFillMode_t uplo = char2hipblas_fill(char_uplo); @@ -55,8 +55,8 @@ hipblasStatus_t testing_trmm(const Arguments& argus) device_vector dA(A_size); device_vector dB(B_size); - double gpu_time_used, cpu_time_used; - double hipblasGflops, cblas_gflops; + double gpu_time_used = 0.0; + double hipblas_error = 0.0; hipblasHandle_t handle; hipblasCreate(&handle); @@ -72,7 +72,7 @@ hipblasStatus_t testing_trmm(const Arguments& argus) CHECK_HIP_ERROR(hipMemcpy(dB, hB.data(), sizeof(T) * B_size, hipMemcpyHostToDevice)); /* ===================================================================== - ROCBLAS + HIPBLAS =================================================================== */ status = hipblasTrmmFn(handle, side, uplo, transA, diag, M, N, &alpha, dA, lda, dB, ldb); @@ -85,7 +85,7 @@ hipblasStatus_t testing_trmm(const Arguments& argus) // copy output from device to CPU CHECK_HIP_ERROR(hipMemcpy(hB.data(), dB, sizeof(T) * B_size, hipMemcpyDeviceToHost)); - if(argus.unit_check) + if(argus.unit_check || argus.norm_check) { /* ===================================================================== CPU BLAS @@ -99,6 +99,53 @@ hipblasStatus_t testing_trmm(const Arguments& argus) { unit_check_general(M, N, ldb, hB_copy.data(), hB.data()); } + if(argus.norm_check) + { + hipblas_error = norm_check_general('F', M, N, ldb, hB_copy.data(), hB.data()); + } + } + + if(argus.timing) + { + hipStream_t stream; + status = hipblasGetStream(handle, &stream); + if(status != HIPBLAS_STATUS_SUCCESS) + { + hipblasDestroy(handle); + return status; + } + + int runs = argus.cold_iters + argus.iters; + for(int iter = 0; iter < runs; iter++) + { + if(iter == argus.cold_iters) + gpu_time_used = get_time_us_sync(stream); + + status + = hipblasTrmmFn(handle, side, uplo, transA, diag, M, N, &alpha, dA, lda, dB, ldb); + + if(status != HIPBLAS_STATUS_SUCCESS) + { + hipblasDestroy(handle); + return status; + } + } + gpu_time_used = get_time_us_sync(stream) - gpu_time_used; + + ArgumentModel{} + .log_args(std::cout, + argus, + gpu_time_used, + trmm_gflop_count(M, N, K), + trmm_gbyte_count(M, N, K), + hipblas_error); } hipblasDestroy(handle); diff --git a/clients/include/testing_trmm_batched.hpp b/clients/include/testing_trmm_batched.hpp index 4af2dd2dd..d017f1897 100644 --- a/clients/include/testing_trmm_batched.hpp +++ b/clients/include/testing_trmm_batched.hpp @@ -1,5 +1,5 @@ /* ************************************************************************ - * Copyright 2016-2020 Advanced Micro Devices, Inc. + * Copyright 2016-2021 Advanced Micro Devices, Inc. 
* * ************************************************************************ */ @@ -52,9 +52,8 @@ hipblasStatus_t testing_trmm_batched(const Arguments& argus) hipblasHandle_t handle; hipblasCreate(&handle); - double gpu_time_used, cpu_time_used; - double hipblasGflops, cblas_gflops, hipblasBandwidth; - double rocblas_error; + double gpu_time_used = 0.0; + double hipblas_error = 0.0; T alpha = argus.get_alpha(); T beta = argus.get_beta(); @@ -114,29 +113,15 @@ hipblasStatus_t testing_trmm_batched(const Arguments& argus) } /* ===================================================================== - ROCBLAS + HIPBLAS =================================================================== */ - for(int iter = 0; iter < 1; iter++) - { - status = hipblasTrmmBatchedFn(handle, - side, - uplo, - transA, - diag, - M, - N, - &alpha, - dA_array, - lda, - dB_array, - ldb, - batch_count); + status = hipblasTrmmBatchedFn( + handle, side, uplo, transA, diag, M, N, &alpha, dA_array, lda, dB_array, ldb, batch_count); - if(status != HIPBLAS_STATUS_SUCCESS) - { - hipblasDestroy(handle); - return status; - } + if(status != HIPBLAS_STATUS_SUCCESS) + { + hipblasDestroy(handle); + return status; } // copy output from device to CPU @@ -145,7 +130,7 @@ hipblasStatus_t testing_trmm_batched(const Arguments& argus) hipMemcpy(hB_copy_array[b], bB_array[b], sizeof(T) * B_size, hipMemcpyDeviceToHost); } - if(argus.unit_check) + if(argus.unit_check || argus.norm_check) { /* ===================================================================== CPU BLAS @@ -163,6 +148,66 @@ hipblasStatus_t testing_trmm_batched(const Arguments& argus) { unit_check_general(M, N, batch_count, ldb, hB_array, hB_copy_array); } + if(argus.norm_check) + { + hipblas_error + = norm_check_general('F', M, N, ldb, hB_copy_array, hB_array, batch_count); + } + } + + if(argus.timing) + { + hipStream_t stream; + status = hipblasGetStream(handle, &stream); + if(status != HIPBLAS_STATUS_SUCCESS) + { + hipblasDestroy(handle); + return status; + } + + int runs = argus.cold_iters + argus.iters; + for(int iter = 0; iter < runs; iter++) + { + if(iter == argus.cold_iters) + gpu_time_used = get_time_us_sync(stream); + + status = hipblasTrmmBatchedFn(handle, + side, + uplo, + transA, + diag, + M, + N, + &alpha, + dA_array, + lda, + dB_array, + ldb, + batch_count); + + if(status != HIPBLAS_STATUS_SUCCESS) + { + hipblasDestroy(handle); + return status; + } + } + gpu_time_used = get_time_us_sync(stream) - gpu_time_used; + + ArgumentModel{} + .log_args(std::cout, + argus, + gpu_time_used, + trmm_gflop_count(M, N, K), + trmm_gbyte_count(M, N, K), + hipblas_error); } hipblasDestroy(handle); diff --git a/clients/include/testing_trmm_strided_batched.hpp b/clients/include/testing_trmm_strided_batched.hpp index df63a639c..5546c6d06 100644 --- a/clients/include/testing_trmm_strided_batched.hpp +++ b/clients/include/testing_trmm_strided_batched.hpp @@ -1,5 +1,5 @@ /* ************************************************************************ - * Copyright 2016-2020 Advanced Micro Devices, Inc. + * Copyright 2016-2021 Advanced Micro Devices, Inc. 
* * ************************************************************************ */ @@ -57,9 +57,8 @@ hipblasStatus_t testing_trmm_strided_batched(const Arguments& argus) device_vector dA(A_size); device_vector dB(B_size); - double gpu_time_used, cpu_time_used; - double hipblasGflops, cblas_gflops, hipblasBandwidth; - double rocblas_error; + double gpu_time_used = 0.0; + double hipblas_error = 0.0; T alpha = argus.get_alpha(); T beta = argus.get_beta(); @@ -78,38 +77,35 @@ hipblasStatus_t testing_trmm_strided_batched(const Arguments& argus) hipMemcpy(dB, hB.data(), sizeof(T) * B_size, hipMemcpyHostToDevice); /* ===================================================================== - ROCBLAS + HIPBLAS =================================================================== */ - for(int iter = 0; iter < 1; iter++) + status = hipblasTrmmStridedBatchedFn(handle, + side, + uplo, + transA, + diag, + M, + N, + &alpha, + dA, + lda, + stride_A, + dB, + ldb, + stride_B, + batch_count); + + if(status != HIPBLAS_STATUS_SUCCESS) { - status = hipblasTrmmStridedBatchedFn(handle, - side, - uplo, - transA, - diag, - M, - N, - &alpha, - dA, - lda, - stride_A, - dB, - ldb, - stride_B, - batch_count); - - if(status != HIPBLAS_STATUS_SUCCESS) - { - // here in cuda - hipblasDestroy(handle); - return status; - } + // here in cuda + hipblasDestroy(handle); + return status; } // copy output from device to CPU hipMemcpy(hB_copy.data(), dB, sizeof(T) * B_size, hipMemcpyDeviceToHost); - if(argus.unit_check) + if(argus.unit_check || argus.norm_check) { /* ===================================================================== CPU BLAS @@ -136,6 +132,69 @@ hipblasStatus_t testing_trmm_strided_batched(const Arguments& argus) { unit_check_general(M, N, batch_count, ldb, stride_B, hB, hB_copy); } + if(argus.norm_check) + { + hipblas_error = norm_check_general( + 'F', M, N, ldb, stride_B, hB_copy.data(), hB.data(), batch_count); + } + } + + if(argus.timing) + { + hipStream_t stream; + status = hipblasGetStream(handle, &stream); + if(status != HIPBLAS_STATUS_SUCCESS) + { + hipblasDestroy(handle); + return status; + } + + int runs = argus.cold_iters + argus.iters; + for(int iter = 0; iter < runs; iter++) + { + if(iter == argus.cold_iters) + gpu_time_used = get_time_us_sync(stream); + + status = hipblasTrmmStridedBatchedFn(handle, + side, + uplo, + transA, + diag, + M, + N, + &alpha, + dA, + lda, + stride_A, + dB, + ldb, + stride_B, + batch_count); + + if(status != HIPBLAS_STATUS_SUCCESS) + { + hipblasDestroy(handle); + return status; + } + } + gpu_time_used = get_time_us_sync(stream) - gpu_time_used; + + ArgumentModel{} + .log_args(std::cout, + argus, + gpu_time_used, + trmm_gflop_count(M, N, K), + trmm_gbyte_count(M, N, K), + hipblas_error); } hipblasDestroy(handle); diff --git a/clients/include/type_dispatch.hpp b/clients/include/type_dispatch.hpp index 40caa02a7..8af81d7bf 100644 --- a/clients/include/type_dispatch.hpp +++ b/clients/include/type_dispatch.hpp @@ -89,12 +89,20 @@ auto hipblas_blas1_ex_dispatch(const Arguments& arg) // Currently for axpy we're only supporting a limited number of variants, // specifically alpha_type == x_type == y_type, however I'm trying to leave // this open to expansion. 
- const auto Ta = arg.a_type, Tx = arg.b_type, Ty = arg.c_type, Tex = arg.compute_type; + const auto Ta = arg.a_type, Tx = arg.b_type, Ty = arg.c_type, Tex = arg.compute_type; + const std::string function = arg.function; + const bool is_scal = function == "scal_ex" || function == "scal_batched_ex" + || function == "scal_strided_batched_ex"; if(Ta == Tx && Tx == Ty && Ty == Tex) { return hipblas_simple_dispatch(arg); // Ta == Tx == Ty == Tex } + else if(Ta == Tx && Tx == Tex && is_scal) + { + // hscal with f16_r compute (scal doesn't care about Ty) + return hipblas_simple_dispatch(arg); + } else if(Ta == Tx && Tx == Ty && Ta == HIPBLAS_R_16F && Tex == HIPBLAS_R_32F) { return TEST{}(arg); @@ -104,6 +112,11 @@ auto hipblas_blas1_ex_dispatch(const Arguments& arg) // scal half return TEST{}(arg); } + else if(Ta == HIPBLAS_R_32F && Tx == HIPBLAS_R_16F && Tex == HIPBLAS_R_32F && is_scal) + { + // scal half with float alpha + return TEST{}(arg); + } else if(Ta == HIPBLAS_R_32F && Tx == HIPBLAS_C_32F && Tex == HIPBLAS_C_32F) { // csscal diff --git a/clients/include/utility.h b/clients/include/utility.h index c9f43f487..b8c5b15c4 100644 --- a/clients/include/utility.h +++ b/clients/include/utility.h @@ -45,6 +45,15 @@ #ifdef __cplusplus +#ifndef CHECK_HIPBLAS_ERROR +#define CHECK_HIPBLAS_ERROR(error) \ + if(error != HIPBLAS_STATUS_SUCCESS) \ + { \ + fprintf(stderr, "hipBLAS error: %s\n", hipblasStatusToString(error)); \ + return (error); \ + } +#endif + #define BLAS_1_RESULT_PRINT \ do \ { \ @@ -690,6 +699,38 @@ double get_time_us_sync(hipStream_t stream); #ifdef __cplusplus +struct Arguments; + +/* ============================================================================================ */ +/*! \brief local handle which is automatically created and destroyed */ +class hipblasLocalHandle +{ + hipblasHandle_t m_handle; + void* m_memory = nullptr; + +public: + hipblasLocalHandle(); + + explicit hipblasLocalHandle(const Arguments& arg); + + ~hipblasLocalHandle(); + + hipblasLocalHandle(const hipblasLocalHandle&) = delete; + hipblasLocalHandle(hipblasLocalHandle&&) = delete; + hipblasLocalHandle& operator=(const hipblasLocalHandle&) = delete; + hipblasLocalHandle& operator=(hipblasLocalHandle&&) = delete; + + // Allow hipblasLocalHandle to be used anywhere hipblas_handle is expected + operator hipblasHandle_t&() + { + return m_handle; + } + operator const hipblasHandle_t&() const + { + return m_handle; + } +}; + #include "hipblas_arguments.hpp" #endif // __cplusplus diff --git a/library/CMakeLists.txt b/library/CMakeLists.txt index fe631f56d..20a0f29e7 100644 --- a/library/CMakeLists.txt +++ b/library/CMakeLists.txt @@ -61,8 +61,8 @@ add_subdirectory( src ) # Package specific CPACK vars if( NOT USE_CUDA ) - set( CPACK_DEBIAN_PACKAGE_DEPENDS "rocblas (>= 2.38.0), rocsolver (>= 3.12.0)" ) - set( CPACK_RPM_PACKAGE_REQUIRES "rocblas >= 2.38.0, rocsolver >= 3.12.0" ) + set( CPACK_DEBIAN_PACKAGE_DEPENDS "rocblas (>= 2.39.0), rocsolver (>= 3.12.0)" ) + set( CPACK_RPM_PACKAGE_REQUIRES "rocblas >= 2.39.0, rocsolver >= 3.12.0" ) endif( ) set( CPACK_RESOURCE_FILE_LICENSE "${CMAKE_CURRENT_SOURCE_DIR}/../LICENSE.md" ) diff --git a/library/include/hipblas.h b/library/include/hipblas.h index c71df5f92..285d33c80 100644 --- a/library/include/hipblas.h +++ b/library/include/hipblas.h @@ -1,5 +1,5 @@ /* ************************************************************************ - * Copyright 2016-2020 Advanced Micro Devices, Inc. + * Copyright 2016-2021 Advanced Micro Devices, Inc. 
 * ************************************************************************ */
 
 //! HIP = Heterogeneous-compute Interface for Portability
 
@@ -7409,6 +7409,20 @@ HIPBLAS_EXPORT hipblasStatus_t hipblasScalStridedBatchedEx(hipblasHandle_t   han
                                                             int               batch_count,
                                                             hipblasDatatype_t executionType);
 
+/*! HIPBLAS Auxiliary API
+
+    \details
+    hipblasStatusToString
+
+    Returns a string representing the hipblasStatus_t value
+
+    @param[in]
+    status  [hipblasStatus_t]
+            hipBLAS status to convert to string
+*/
+
+HIPBLAS_EXPORT const char* hipblasStatusToString(hipblasStatus_t status);
+
 #ifdef __cplusplus
 }
 #endif
diff --git a/library/src/CMakeLists.txt b/library/src/CMakeLists.txt
index 4583db291..1d92640eb 100755
--- a/library/src/CMakeLists.txt
+++ b/library/src/CMakeLists.txt
@@ -1,5 +1,5 @@
 # ########################################################################
-# Copyright 2016-2020 Advanced Micro Devices, Inc.
+# Copyright 2016-2021 Advanced Micro Devices, Inc.
 # ########################################################################
 
 # ########################################################################
@@ -36,6 +36,7 @@ add_library(hipblas_fortran ${hipblas_f90_source})
 add_library( hipblas
   ${hipblas_source}
+  ${CMAKE_CURRENT_SOURCE_DIR}/hipblas_auxiliary.cpp
   ${relative_hipblas_headers_public}
 )
 
 add_library( roc::hipblas ALIAS hipblas )
diff --git a/library/src/hipblas_auxiliary.cpp b/library/src/hipblas_auxiliary.cpp
new file mode 100644
index 000000000..9368cec0d
--- /dev/null
+++ b/library/src/hipblas_auxiliary.cpp
@@ -0,0 +1,29 @@
+/* ************************************************************************
+ * Copyright 2021 Advanced Micro Devices, Inc.
+ * ************************************************************************ */
+#include <hipblas.h>
+
+// Convert hipblasStatus_t to string
+extern "C" const char* hipblasStatusToString(hipblasStatus_t status)
+{
+#define CASE(x) \
+    case x:     \
+        return #x
+    switch(status)
+    {
+        CASE(HIPBLAS_STATUS_SUCCESS);
+        CASE(HIPBLAS_STATUS_NOT_INITIALIZED);
+        CASE(HIPBLAS_STATUS_ALLOC_FAILED);
+        CASE(HIPBLAS_STATUS_INVALID_VALUE);
+        CASE(HIPBLAS_STATUS_MAPPING_ERROR);
+        CASE(HIPBLAS_STATUS_EXECUTION_FAILED);
+        CASE(HIPBLAS_STATUS_INTERNAL_ERROR);
+        CASE(HIPBLAS_STATUS_NOT_SUPPORTED);
+        CASE(HIPBLAS_STATUS_ARCH_MISMATCH);
+        CASE(HIPBLAS_STATUS_HANDLE_IS_NULLPTR);
+    }
+#undef CASE
+    // We don't use default: so that the compiler warns us if any valid enums are missing
+    // from our switch. If the value is not a valid hipblasStatus_t, we return this string.
+    return "<undefined hipblasStatus_t value>";
+}
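
For reference, a minimal sketch of how the new auxiliary export is intended to be consumed: hipblasStatusToString turns any hipblasStatus_t into its enum name, which is what the CHECK_HIPBLAS_ERROR macro added to clients/include/utility.h relies on for its error message. The standalone program below is a hypothetical example only (file name and program structure are illustrative, not part of the patch); it assumes nothing beyond the public hipBLAS C API.

// status_to_string_example.cpp -- hypothetical client program, not part of this patch
#include <cstdio>
#include <hipblas.h>

int main()
{
    hipblasHandle_t handle;

    // Every hipBLAS call returns a hipblasStatus_t; the new helper makes it printable.
    hipblasStatus_t status = hipblasCreate(&handle);
    if(status != HIPBLAS_STATUS_SUCCESS)
    {
        std::fprintf(stderr, "hipBLAS error: %s\n", hipblasStatusToString(status));
        return 1;
    }

    std::printf("hipblasCreate returned %s\n", hipblasStatusToString(status));

    hipblasDestroy(handle);
    return 0;
}

Inside the test clients the same create/destroy pair is wrapped by the hipblasLocalHandle class declared in utility.h, which (per its "automatically created and destroyed" comment) manages the handle's lifetime so the testing functions no longer need explicit hipblasDestroy calls on every error path.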