Merge pull request #520 from daineAMD/master

Merge staging into master for ROCm 5.4
ROCm · Sep 14, 2022 · b80975b · b80975b
2 parents 587bfbe + 5f9a3b3
commit b80975b
Show file tree

Hide file tree

Showing 11 changed files with 209 additions and 26 deletions.
diff --git a/.github/workflows/docs.yaml b/.github/workflows/docs.yaml
@@ -0,0 +1,58 @@
+name: Upload to the upload server
+
+# Controls when the workflow will run
+on:
+  push:
+    branches: [develop, master]
+    tags:
+      - rocm-5.*
+  release:
+    types: [published]
+
+  # Allows you to run this workflow manually from the Actions tab
+  workflow_dispatch:
+
+# A workflow run is made up of one or more jobs that can run sequentially or in parallel
+jobs:
+  # This workflow contains a single job called "build"
+  build:
+    # The type of runner that the job will run on
+    runs-on: ubuntu-latest
+
+    # Steps represent a sequence of tasks that will be executed as part of the job
+    steps:
+      # Checks-out your repository under $GITHUB_WORKSPACE, so your job can access it
+      - uses: actions/checkout@v2
+
+      - name: getting branch name
+        shell: bash
+        run: echo "branch=${GITHUB_REF#refs/heads/}" >> "$GITHUB_OUTPUT"  # GITHUB_REF with a leading refs/heads/ stripped
+        id: branch_name  # read by later steps as steps.branch_name.outputs.branch
+      - name: getting tag name
+        shell: bash
+        run: echo "tag=${GITHUB_REF_NAME}" >> "$GITHUB_OUTPUT"  # value is used in the archive file name
+        id: tag_name  # read by later steps as steps.tag_name.outputs.tag
+      - name: zipping files
+        run: zip -r ${{ github.event.repository.name }}_${{ steps.tag_name.outputs.tag }}.zip . -x '*.git*' '*.idea*'
+      - name: echo-step
+        run: echo "${{ github.event.release.target_commitish }}"
+      - name: uploading archive to prod
+        if: ${{ steps.branch_name.outputs.branch == 'master' || github.event.release.target_commitish == 'master'}}
+        uses: wlixcc/SFTP-Deploy-Action@v1.2.1  # NOTE(review): ref restored from e-mail-obfuscated text; confirm the version tag
+        with:
+          username: ${{ secrets.USERNAME }}
+          server: ${{ secrets.SERVER }}
+          ssh_private_key: ${{ secrets.SSH_PRIVATE_KEY }}
+          local_path: ${{ github.event.repository.name }}_${{ steps.tag_name.outputs.tag }}.zip
+          remote_path: '${{ secrets.PROD_UPLOAD_URL }}'
+          args: '-o ConnectTimeout=5'
+      - name: uploading archive to staging
+        if: ${{ steps.branch_name.outputs.branch == 'develop' || github.event.release.target_commitish == 'develop' }}
+        uses: wlixcc/SFTP-Deploy-Action@v1.2.1  # NOTE(review): ref restored from e-mail-obfuscated text; confirm the version tag
+        with:
+          username: ${{ secrets.USERNAME }}
+          server: ${{ secrets.SERVER }}
+          ssh_private_key: ${{ secrets.SSH_PRIVATE_KEY }}
+          local_path: ${{ github.event.repository.name }}_${{ steps.tag_name.outputs.tag }}.zip
+          remote_path: '${{ secrets.STG_UPLOAD_URL }}'
+          args: '-o ConnectTimeout=5'
diff --git a/.jenkins/staticanalysis.groovy b/.jenkins/staticanalysis.groovy
@@ -29,6 +29,6 @@ ci: {
 
     properties(auxiliary.addCommonProperties([pipelineTriggers([cron('0 1 * * 2')])]))
     stage(urlJobName) {
-        runCI([ubuntu18:['any']], urlJobName)
+        runCI([ubuntu20:['any']], urlJobName)
     }
 }
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -1,5 +1,12 @@
 # Change Log for hipBLAS
 
+## (Unreleased) hipBLAS 0.53.0
+### Added
+- Allow for selection of int8 datatype
+- Added support for hipblasXgels and hipblasXgelsStridedBatched operations (with s,d,c,z precisions),
+  only supported with rocBLAS backend
+- Added support for hipblasXgelsBatched operations (with s,d,c,z precisions)
+
 ## (Unreleased) hipBLAS 0.52.0
 ### Added
 - Added --cudapath option to install.sh to allow user to specify which cuda build they would like to use.

diff --git a/CMakeLists.txt b/CMakeLists.txt
@@ -158,11 +158,15 @@ if( BUILD_CLIENTS_SAMPLES OR BUILD_CLIENTS_TESTS OR BUILD_CLIENTS_BENCHMARKS )
     string(TOLOWER "${CLIENTS_OS}" CLIENTS_OS)
     rocm_read_os_release(CLIENTS_OS_VERSION VERSION_ID)
   endif()
-  set(GFORTRAN_PKG "gcc-gfortran")
-  if(CLIENTS_OS STREQUAL "sles")
-    set(GFORTRAN_PKG "gcc-fortran")
-  elseif(CLIENTS_OS STREQUAL "centos" AND CLIENTS_OS_VERSION EQUAL 7)
-    set(GFORTRAN_PKG "devtoolset-7-gcc-gfortran")
+  message(STATUS "OS: ${CLIENTS_OS} ${CLIENTS_OS_VERSION}")
+  # libgfortran package names used as DEB/RPM dependencies of the client components; libgfortran4 unless overridden.
+  set(GFORTRAN_DEB "libgfortran4")
+  set(GFORTRAN_RPM "libgfortran4")
+  if(CLIENTS_OS STREQUAL "ubuntu" AND NOT CLIENTS_OS_VERSION VERSION_LESS "20.04")
+    set(GFORTRAN_DEB "libgfortran5")  # Ubuntu 20.04 and newer
+  elseif((CLIENTS_OS STREQUAL "centos" OR CLIENTS_OS STREQUAL "rhel")
+         AND NOT CLIENTS_OS_VERSION VERSION_LESS "8")  # parentheses needed: AND/OR evaluate left to right
+    set(GFORTRAN_RPM "libgfortran")  # CentOS/RHEL 8 and newer
   endif()
   rocm_package_setup_component(clients)
   rocm_package_setup_client_component(clients-common)
@@ -171,16 +175,16 @@ if( BUILD_CLIENTS_SAMPLES OR BUILD_CLIENTS_TESTS OR BUILD_CLIENTS_BENCHMARKS )
       tests
       DEPENDS
         COMPONENT clients-common
-        DEB "gfortran"
-        RPM "${GFORTRAN_PKG}")
+        DEB "${GFORTRAN_DEB}"
+        RPM "${GFORTRAN_RPM}")
   endif()
   if(BUILD_CLIENTS_BENCHMARKS)
     rocm_package_setup_client_component(
       benchmarks
       DEPENDS
         COMPONENT clients-common
-        DEB "gfortran"
-        RPM "${GFORTRAN_PKG}")
+        DEB "${GFORTRAN_DEB}"
+        RPM "${GFORTRAN_RPM}")
   endif()
   add_subdirectory( clients )
 endif( )

diff --git a/bump_staging_version.sh b/bump_staging_version.sh
@@ -5,14 +5,14 @@
 # - run this script in master branch
 # - after running this script merge master into develop
 
-OLD_HIPBLAS_VERSION="0.52.0"
-NEW_HIPBLAS_VERSION="0.53.0"
+OLD_HIPBLAS_VERSION="0.53.0"
+NEW_HIPBLAS_VERSION="0.54.0"
 
-OLD_MINIMUM_ROCBLAS_VERSION="2.45.0"
-NEW_MINIMUM_ROCBLAS_VERSION="2.46.0"
+OLD_MINIMUM_ROCBLAS_VERSION="2.46.0"
+NEW_MINIMUM_ROCBLAS_VERSION="2.47.0"
 
-OLD_MINIMUM_ROCSOLVER_VERSION="3.19.0"
-NEW_MINIMUM_ROCSOLVER_VERSION="3.20.0"
+OLD_MINIMUM_ROCSOLVER_VERSION="3.20.0"
+NEW_MINIMUM_ROCSOLVER_VERSION="3.21.0"
 
 sed -i "s/${OLD_HIPBLAS_VERSION}/${NEW_HIPBLAS_VERSION}/g" CMakeLists.txt
 sed -i "s/${OLD_MINIMUM_ROCBLAS_VERSION}/${NEW_MINIMUM_ROCBLAS_VERSION}/g" CMakeLists.txt

diff --git a/clients/common/near.cpp b/clients/common/near.cpp
@@ -76,7 +76,8 @@
 
 #endif
 
-#define NEAR_ASSERT_HALF(a, b, err) ASSERT_NEAR(float(a), float(b), err)
+#define NEAR_ASSERT_HALF(a, b, err) ASSERT_NEAR(half_to_float(a), half_to_float(b), err)
+#define NEAR_ASSERT_BF16(a, b, err) ASSERT_NEAR(bfloat16_to_float(a), bfloat16_to_float(b), err)
 
 #define NEAR_ASSERT_COMPLEX(a, b, err)          \
     do                                          \
@@ -105,6 +106,13 @@ void near_check_general(
     NEAR_CHECK(M, N, 1, lda, 0, hCPU, hGPU, abs_error, NEAR_ASSERT_HALF);
 }
 
+template <> // explicit specialization for hipblasBfloat16 data (plain pointers, no batch parameters)
+void near_check_general(
+    int M, int N, int lda, hipblasBfloat16* hCPU, hipblasBfloat16* hGPU, double abs_error)
+{
+    NEAR_CHECK(M, N, 1, lda, 0, hCPU, hGPU, abs_error, NEAR_ASSERT_BF16); // 1 and 0 fill the slots where the strided overload passes batch_count and strideA
+}
+
 template <>
 void near_check_general(
     int M, int N, int lda, hipblasComplex* hCPU, hipblasComplex* hGPU, double abs_error)
@@ -160,6 +168,19 @@ void near_check_general(int           M,
     NEAR_CHECK(M, N, batch_count, lda, strideA, hCPU, hGPU, abs_error, NEAR_ASSERT_HALF);
 }
 
+template <> // explicit specialization for hipblasBfloat16 data, taking batch_count and strideA
+void near_check_general(int              M,
+                        int              N,
+                        int              batch_count,
+                        int              lda,
+                        hipblasStride    strideA,
+                        hipblasBfloat16* hCPU,
+                        hipblasBfloat16* hGPU,
+                        double           abs_error)
+{
+    NEAR_CHECK(M, N, batch_count, lda, strideA, hCPU, hGPU, abs_error, NEAR_ASSERT_BF16); // NEAR_ASSERT_BF16 compares bfloat16_to_float() of both operands
+}
+
 template <>
 void near_check_general(int             M,
                         int             N,
@@ -200,6 +221,18 @@ void near_check_general(int                      M,
     NEAR_CHECK_B(M, N, batch_count, lda, hCPU, hGPU, abs_error, NEAR_ASSERT_HALF);
 }
 
+template <> // explicit specialization for arrays of host_vector<hipblasBfloat16>
+void near_check_general(int                          M,
+                        int                          N,
+                        int                          batch_count,
+                        int                          lda,
+                        host_vector<hipblasBfloat16> hCPU[],
+                        host_vector<hipblasBfloat16> hGPU[],
+                        double                       abs_error)
+{
+    NEAR_CHECK_B(M, N, batch_count, lda, hCPU, hGPU, abs_error, NEAR_ASSERT_BF16); // NEAR_ASSERT_BF16 compares bfloat16_to_float() of both operands
+}
+
 template <>
 void near_check_general(int                M,
                         int                N,
@@ -262,6 +295,18 @@ void near_check_general(int          M,
     NEAR_CHECK_B(M, N, batch_count, lda, hCPU, hGPU, abs_error, NEAR_ASSERT_HALF);
 }
 
+template <> // explicit specialization for arrays of hipblasBfloat16 pointers
+void near_check_general(int              M,
+                        int              N,
+                        int              batch_count,
+                        int              lda,
+                        hipblasBfloat16* hCPU[],
+                        hipblasBfloat16* hGPU[],
+                        double           abs_error)
+{
+    NEAR_CHECK_B(M, N, batch_count, lda, hCPU, hGPU, abs_error, NEAR_ASSERT_BF16); // NEAR_ASSERT_BF16 compares bfloat16_to_float() of both operands
+}
+
 template <>
 void near_check_general(
     int M, int N, int batch_count, int lda, float* hCPU[], float* hGPU[], double abs_error)

diff --git a/clients/include/near.h b/clients/include/near.h
@@ -79,4 +79,12 @@ void near_check_general(int            M,
                         host_vector<T> hGPU[],
                         double         abs_error);
 
+// currently only used for half-precision comparisons in dot_ex tests; every other type gets 0.0
+template <class T>
+HIPBLAS_CLANG_STATIC constexpr double error_tolerance = 0.0;
+
+// 2 ^ -14 (0.00006103515625, truncated in the literal below), smallest positive normal number for IEEE 754 half precision
+template <>
+HIPBLAS_CLANG_STATIC constexpr double error_tolerance<hipblasHalf> = 0.000061035;
+
 #endif
diff --git a/clients/include/testing_dot_batched_ex.hpp b/clients/include/testing_dot_batched_ex.hpp
@@ -105,7 +105,7 @@ hipblasStatus_t testing_dot_batched_ex_template(const Arguments& argus)
     double gpu_time_used, hipblas_error_host, hipblas_error_device;
 
     // Initial Data on CPU
-    hipblas_init(hy, true, true);
+    hipblas_init(hy, true, false);
     hipblas_init_alternating_sign(hx);
     CHECK_HIP_ERROR(dx.transfer_from(hx));
     CHECK_HIP_ERROR(dy.transfer_from(hy));
@@ -159,8 +159,31 @@ hipblasStatus_t testing_dot_batched_ex_template(const Arguments& argus)
 
         if(argus.unit_check)
         {
-            unit_check_general<Tr>(1, batch_count, 1, h_cpu_result, h_hipblas_result_host);
-            unit_check_general<Tr>(1, batch_count, 1, h_cpu_result, h_hipblas_result_device);
+            if(std::is_same<Tr, hipblasHalf>{})
+            {
+                double tol = error_tolerance<Tr> * N;
+                near_check_general(1,
+                                   1,
+                                   batch_count,
+                                   1,
+                                   1,
+                                   h_cpu_result.data(),
+                                   h_hipblas_result_host.data(),
+                                   tol);
+                near_check_general(1,
+                                   1,
+                                   batch_count,
+                                   1,
+                                   1,
+                                   h_cpu_result.data(),
+                                   h_hipblas_result_device.data(),
+                                   tol);
+            }
+            else
+            {
+                unit_check_general<Tr>(1, batch_count, 1, h_cpu_result, h_hipblas_result_host);
+                unit_check_general<Tr>(1, batch_count, 1, h_cpu_result, h_hipblas_result_device);
+            }
         }
         if(argus.norm_check)
         {

diff --git a/clients/include/testing_dot_ex.hpp b/clients/include/testing_dot_ex.hpp
@@ -147,8 +147,17 @@ hipblasStatus_t testing_dot_ex_template(const Arguments& argus)
 
         if(argus.unit_check)
         {
-            unit_check_general<Tr>(1, 1, 1, &cpu_result, &hipblas_result_host);
-            unit_check_general<Tr>(1, 1, 1, &cpu_result, &hipblas_result_device);
+            if(std::is_same<Tr, hipblasHalf>{})
+            {
+                double tol = error_tolerance<Tr> * N;
+                near_check_general(1, 1, 1, &cpu_result, &hipblas_result_host, tol);
+                near_check_general(1, 1, 1, &cpu_result, &hipblas_result_device, tol);
+            }
+            else
+            {
+                unit_check_general<Tr>(1, 1, 1, &cpu_result, &hipblas_result_host);
+                unit_check_general<Tr>(1, 1, 1, &cpu_result, &hipblas_result_device);
+            }
         }
         if(argus.norm_check)
         {

diff --git a/clients/include/testing_dot_strided_batched_ex.hpp b/clients/include/testing_dot_strided_batched_ex.hpp
@@ -180,8 +180,31 @@ hipblasStatus_t testing_dot_strided_batched_ex_template(const Arguments& argus)
 
         if(argus.unit_check)
         {
-            unit_check_general<Tr>(1, batch_count, 1, h_cpu_result, h_hipblas_result_host);
-            unit_check_general<Tr>(1, batch_count, 1, h_cpu_result, h_hipblas_result_device);
+            if(std::is_same<Tr, hipblasHalf>{})
+            {
+                double tol = error_tolerance<Tr> * N;
+                near_check_general(1,
+                                   1,
+                                   batch_count,
+                                   1,
+                                   1,
+                                   h_cpu_result.data(),
+                                   h_hipblas_result_host.data(),
+                                   tol);
+                near_check_general(1,
+                                   1,
+                                   batch_count,
+                                   1,
+                                   1,
+                                   h_cpu_result.data(),
+                                   h_hipblas_result_device.data(),
+                                   tol);
+            }
+            else
+            {
+                unit_check_general<Tr>(1, batch_count, 1, h_cpu_result, h_hipblas_result_host);
+                unit_check_general<Tr>(1, batch_count, 1, h_cpu_result, h_hipblas_result_device);
+            }
         }
         if(argus.norm_check)
         {

diff --git a/scripts/performance/blas/commandrunner.py b/scripts/performance/blas/commandrunner.py
@@ -1,5 +1,5 @@
 #!/usr/bin/env python3
-"""Copyright (C) 2018-2020 Advanced Micro Devices, Inc. All rights reserved.
+"""Copyright (C) 2018-2022 Advanced Micro Devices, Inc. All rights reserved.
 
   Permission is hereby granted, free of charge, to any person obtaining a copy
   of this software and associated documentation files (the "Software"), to deal
@@ -73,6 +73,7 @@
 import subprocess
 import sys
 import time
+from decimal import Decimal
 
 import getspecs
 
@@ -113,8 +114,13 @@ def import_rocm_smi(install_path):
     global smi_imported
     if not smi_imported:
         smi_imported = True
+        host_rocm_ver = Decimal('.'.join(getspecs.getrocmversion().split('.')[0:2])) # get host's rocm major.minor version
+        rocm_5_2_ver = Decimal('5.2')
         try:
-            sys.path.append(os.path.join(install_path, 'bin'))
+            if rocm_5_2_ver.compare(host_rocm_ver) == 1:
+                sys.path.append(os.path.join(install_path, 'bin')) # For versions below ROCm 5.2
+            else:
+                sys.path.append(os.path.join(install_path, 'libexec/rocm_smi')) # For versions equal or above ROCm 5.2
             import rocm_smi
             smi = rocm_smi