From 234c6a96bfb4ad36e2478b4c8f6545d82b85ed8e Mon Sep 17 00:00:00 2001 From: Sam Hatfield Date: Tue, 1 Oct 2024 16:55:45 +0000 Subject: [PATCH 1/2] Workaround for ASYNC statements in TRMTOL for AMD GPUs Co-authored-by: Paul Mullowney --- src/trans/gpu/internal/trmtol_mod.F90 | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/src/trans/gpu/internal/trmtol_mod.F90 b/src/trans/gpu/internal/trmtol_mod.F90 index 4fc67d55..1ccc9ab3 100755 --- a/src/trans/gpu/internal/trmtol_mod.F90 +++ b/src/trans/gpu/internal/trmtol_mod.F90 @@ -158,7 +158,11 @@ SUBROUTINE TRMTOL(ALLOCATOR,HTRMTOL,PFBUF_IN,PFBUF,KF_LEG) #ifdef OMPGPU #endif #ifdef ACCGPU +#ifdef __HIP_PLATFORM_AMD__ # Workaround for AMD GPUs - ASYNC execution of this kernel gives numerical errors + !$ACC KERNELS DEFAULT(NONE) PRESENT(PFBUF,PFBUF_IN) COPYIN(FROM_RECV,TO_RECV,FROM_SEND,TO_SEND) +#else !$ACC KERNELS ASYNC(1) DEFAULT(NONE) PRESENT(PFBUF,PFBUF_IN) COPYIN(FROM_RECV,TO_RECV,FROM_SEND,TO_SEND) +#endif #endif PFBUF(FROM_RECV:TO_RECV) = PFBUF_IN(FROM_SEND:TO_SEND) #ifdef OMPGPU @@ -213,7 +217,9 @@ SUBROUTINE TRMTOL(ALLOCATOR,HTRMTOL,PFBUF_IN,PFBUF,KF_LEG) CALL GSTATS(421,1) #ifdef ACCGPU +#ifndef __HIP_PLATFORM_AMD__ # Workaround for AMD GPUs - ASYNC execution of this kernel gives numerical errors !$ACC WAIT(1) +#endif #endif CALL GSTATS(807,1) ELSE From 6840754f2c9e8cd003da391012ee8ffa177c0bf5 Mon Sep 17 00:00:00 2001 From: Sam Hatfield Date: Wed, 2 Oct 2024 12:43:38 +0300 Subject: [PATCH 2/2] Workaround for supporting GPU-aware MPI on Cray/AMD platforms Co-authored-by: Paul Mullowney Co-authored-by: Willem Deconinck --- src/trans/gpu/internal/trgtol_mod.F90 | 1 + src/trans/gpu/internal/trltog_mod.F90 | 1 + src/trans/gpu/internal/trltom_mod.F90 | 3 ++- src/trans/gpu/internal/trmtol_mod.F90 | 7 +++++-- 4 files changed, 9 insertions(+), 3 deletions(-) diff --git a/src/trans/gpu/internal/trgtol_mod.F90 b/src/trans/gpu/internal/trgtol_mod.F90 index 6ec388d7..7ec495ef 100755 --- a/src/trans/gpu/internal/trgtol_mod.F90 +++ b/src/trans/gpu/internal/trgtol_mod.F90 @@ -115,6 +115,7 @@ SUBROUTINE TRGTOL(ALLOCATOR,HTRGTOL,PREEL_REAL,KF_FS,KF_GP,KF_UV_G,KF_SCALARS_G, USE OML_MOD, ONLY: OML_MY_THREAD #if ECTRANS_HAVE_MPI USE MPI_F08, ONLY: MPI_COMM, MPI_REQUEST, MPI_FLOAT, MPI_DOUBLE + ! Missing: MPI_ISEND, MPI_IRECV on purpose due to cray-mpi bug (see https://github.com/ecmwf-ifs/ectrans/pull/157) #endif USE TPM_STATS, ONLY: GSTATS => GSTATS_NVTX USE TPM_TRANS, ONLY: NPROMA diff --git a/src/trans/gpu/internal/trltog_mod.F90 b/src/trans/gpu/internal/trltog_mod.F90 index 1c0a1990..46dec874 100755 --- a/src/trans/gpu/internal/trltog_mod.F90 +++ b/src/trans/gpu/internal/trltog_mod.F90 @@ -116,6 +116,7 @@ SUBROUTINE TRLTOG(ALLOCATOR,HTRLTOG,PREEL_REAL,KF_FS,KF_GP,KF_UV_G,KF_SCALARS_G, USE ABORT_TRANS_MOD, ONLY: ABORT_TRANS #if ECTRANS_HAVE_MPI USE MPI_F08, ONLY: MPI_COMM, MPI_REQUEST, MPI_FLOAT, MPI_DOUBLE + ! Missing: MPI_ISEND, MPI_IRECV on purpose due to cray-mpi bug (see https://github.com/ecmwf-ifs/ectrans/pull/157) #endif USE TPM_STATS, ONLY: GSTATS => GSTATS_NVTX USE TPM_TRANS, ONLY: LDIVGP, LSCDERS, LUVDER, LVORGP, NPROMA diff --git a/src/trans/gpu/internal/trltom_mod.F90 b/src/trans/gpu/internal/trltom_mod.F90 index ab355613..09596eba 100755 --- a/src/trans/gpu/internal/trltom_mod.F90 +++ b/src/trans/gpu/internal/trltom_mod.F90 @@ -94,7 +94,8 @@ SUBROUTINE TRLTOM(ALLOCATOR,HTRLTOM,PFBUF_IN,PFBUF,KF_FS) USE TPM_DISTR, ONLY: D, NPRTRW, NPROC, MYPROC, MYSETW USE TPM_GEN, ONLY: LSYNC_TRANS, NERR #if ECTRANS_HAVE_MPI - USE MPI_F08, ONLY: MPI_COMM, MPI_FLOAT, MPI_DOUBLE, MPI_ALLTOALLV + USE MPI_F08, ONLY: MPI_COMM, MPI_FLOAT, MPI_DOUBLE + ! Missing: MPI_ALLTOALLV on purpose due to cray-mpi bug (see https://github.com/ecmwf-ifs/ectrans/pull/157) #endif USE TPM_STATS, ONLY: GSTATS => GSTATS_NVTX USE BUFFERED_ALLOCATOR_MOD, ONLY: BUFFERED_ALLOCATOR, ASSIGN_PTR, GET_ALLOCATION diff --git a/src/trans/gpu/internal/trmtol_mod.F90 b/src/trans/gpu/internal/trmtol_mod.F90 index 1ccc9ab3..2b8ca897 100755 --- a/src/trans/gpu/internal/trmtol_mod.F90 +++ b/src/trans/gpu/internal/trmtol_mod.F90 @@ -95,6 +95,7 @@ SUBROUTINE TRMTOL(ALLOCATOR,HTRMTOL,PFBUF_IN,PFBUF,KF_LEG) USE TPM_GEN, ONLY: LSYNC_TRANS, NERR #if ECTRANS_HAVE_MPI USE MPI_F08, ONLY: MPI_COMM, MPI_FLOAT, MPI_DOUBLE + ! Missing: MPI_ALLTOALLV on purpose due to cray-mpi bug (see https://github.com/ecmwf-ifs/ectrans/pull/157) #endif USE BUFFERED_ALLOCATOR_MOD, ONLY: BUFFERED_ALLOCATOR, ASSIGN_PTR, GET_ALLOCATION USE TPM_STATS, ONLY: GSTATS => GSTATS_NVTX @@ -158,7 +159,8 @@ SUBROUTINE TRMTOL(ALLOCATOR,HTRMTOL,PFBUF_IN,PFBUF,KF_LEG) #ifdef OMPGPU #endif #ifdef ACCGPU -#ifdef __HIP_PLATFORM_AMD__ # Workaround for AMD GPUs - ASYNC execution of this kernel gives numerical errors +#ifdef __HIP_PLATFORM_AMD__ + ! Workaround for AMD GPUs - ASYNC execution of this kernel gives numerical errors !$ACC KERNELS DEFAULT(NONE) PRESENT(PFBUF,PFBUF_IN) COPYIN(FROM_RECV,TO_RECV,FROM_SEND,TO_SEND) #else !$ACC KERNELS ASYNC(1) DEFAULT(NONE) PRESENT(PFBUF,PFBUF_IN) COPYIN(FROM_RECV,TO_RECV,FROM_SEND,TO_SEND) @@ -217,7 +219,8 @@ SUBROUTINE TRMTOL(ALLOCATOR,HTRMTOL,PFBUF_IN,PFBUF,KF_LEG) CALL GSTATS(421,1) #ifdef ACCGPU -#ifndef __HIP_PLATFORM_AMD__ # Workaround for AMD GPUs - ASYNC execution of this kernel gives numerical errors +#ifndef __HIP_PLATFORM_AMD__ + ! Workaround for AMD GPUs - ASYNC execution of this kernel gives numerical errors !$ACC WAIT(1) #endif #endif