Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Make in-place FFT optional #155

Merged
merged 2 commits into from
Oct 8, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 7 additions & 0 deletions src/trans/gpu/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -27,13 +27,15 @@ if( HAVE_HIP )
algor/hicblas_gemm.hip.cpp
algor/hicfft.hip.cpp
)
ecbuild_info("warn: IN_PLACE_FFT not defined for hipFFT")
elseif( HAVE_CUDA )
set( GPU_RUNTIME "CUDA" )
set( ECTRANS_GPU_HIP_LIBRARIES CUDA::cufft CUDA::cublas nvhpcwrapnvtx CUDA::cudart )
list( APPEND trans_gpu_common_src
algor/hicblas_gemm.cuda.cu
algor/hicfft.cuda.cu
)
ecbuild_info("warn: IN_PLACE_FFT defined for cuFFT")
else()
ecbuild_info("warn: HIP and CUDA not found")
endif()
Expand Down Expand Up @@ -166,6 +168,11 @@ foreach( prec dp sp )
target_compile_definitions( ectrans_gpu_${prec} PRIVATE TRANS_SINGLE PARKINDTRANS_SINGLE )
endif()

# cuFFT can do in-place FFT, hipFFT cannot
if( HAVE_CUDA )
target_compile_definitions( ectrans_gpu_${prec} PRIVATE IN_PLACE_FFT )
endif()

if( HAVE_OMP AND CMAKE_Fortran_COMPILER_ID MATCHES Cray )
# Propagate flags as link options for downstream targets. Only required for Cray
target_link_options( ectrans_gpu_${prec} INTERFACE
Expand Down
2 changes: 1 addition & 1 deletion src/trans/gpu/internal/dir_trans_ctl_mod.F90
Original file line number Diff line number Diff line change
Expand Up @@ -146,7 +146,7 @@ SUBROUTINE DIR_TRANS_CTL(KF_UV_G,KF_SCALARS_G,KF_GP,KF_FS,KF_UV,KF_SCALARS,&
ALLOCATOR = MAKE_BUFFERED_ALLOCATOR()
HTRGTOL = PREPARE_TRGTOL(ALLOCATOR,KF_GP,KF_FS)
IF (KF_FS > 0) THEN
HFTDIR = PREPARE_FTDIR()
HFTDIR = PREPARE_FTDIR(ALLOCATOR,KF_FS)
HTRLTOM_PACK = PREPARE_TRLTOM_PACK(ALLOCATOR, KF_FS)
HTRLTOM = PREPARE_TRLTOM(ALLOCATOR, KF_FS)
HTRLTOM_UNPACK = PREPARE_TRLTOM_UNPACK(ALLOCATOR, KF_FS)
Expand Down
30 changes: 26 additions & 4 deletions src/trans/gpu/internal/ftdir_mod.F90
Original file line number Diff line number Diff line change
Expand Up @@ -10,19 +10,35 @@
!

MODULE FTDIR_MOD
USE BUFFERED_ALLOCATOR_MOD ,ONLY : ALLOCATION_RESERVATION_HANDLE
IMPLICIT NONE

PRIVATE
PUBLIC :: FTDIR, FTDIR_HANDLE, PREPARE_FTDIR

TYPE FTDIR_HANDLE
TYPE(ALLOCATION_RESERVATION_HANDLE) :: HREEL_COMPLEX
END TYPE
CONTAINS

FUNCTION PREPARE_FTDIR() RESULT(HFTDIR)
FUNCTION PREPARE_FTDIR(ALLOCATOR,KF_FS) RESULT(HFTDIR)
USE PARKIND_ECTRANS, ONLY: JPIM, JPRBT
USE TPM_DISTR, ONLY: D
USE BUFFERED_ALLOCATOR_MOD, ONLY: BUFFERED_ALLOCATOR, RESERVE
USE ISO_C_BINDING, ONLY: C_SIZE_T

IMPLICIT NONE

TYPE(BUFFERED_ALLOCATOR), INTENT(INOUT) :: ALLOCATOR
INTEGER(KIND=JPIM), INTENT(IN) :: KF_FS
TYPE(FTDIR_HANDLE) :: HFTDIR
END FUNCTION

REAL(KIND=JPRBT) :: DUMMY

#ifndef IN_PLACE_FFT
HFTDIR%HREEL_COMPLEX = RESERVE(ALLOCATOR, INT(KF_FS*D%NLENGTF*SIZEOF(DUMMY), KIND=C_SIZE_T))
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

SIZEOF is a non-standard extension. Some compilers could complain.
Standard in F2008 is STORAGE_SIZE, which gives you bits (not bytes!) and C_SIZEOF, which gives you bytes.

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

SIZEOF occurs in a few places in the GPU tree. @lukasm91 any strong feelings about switching to STORAGE_SIZE(DUMMY)/8 (STORAGE_SIZE returns bits, so divide by 8 to get bytes)?

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thanks for letting me know... for me, Fortran is learning by doing - I simply didn't realize that this is not in the standard, so this is good to know, and I'm happy to learn the right way to do it :)

#endif
END FUNCTION PREPARE_FTDIR

SUBROUTINE FTDIR(ALLOCATOR,HFTDIR,PREEL_REAL,PREEL_COMPLEX,KFIELD)
!**** *FTDIR - Direct Fourier transform
Expand Down Expand Up @@ -60,12 +76,13 @@ SUBROUTINE FTDIR(ALLOCATOR,HFTDIR,PREEL_REAL,PREEL_COMPLEX,KFIELD)
USE TPM_GEN, ONLY: LSYNC_TRANS
USE PARKIND_ECTRANS, ONLY: JPIM, JPRBT
USE TPM_DISTR, ONLY: MYSETW, MYPROC, NPROC, D_NSTAGT0B, D_NSTAGTF,D_NPTRLS, &
& D_NPNTGTB0, D_NPROCM, D_NDGL_FS
& D_NPNTGTB0, D_NPROCM, D_NDGL_FS, D
USE TPM_GEOMETRY, ONLY: G_NMEN, G_NLOEN
USE BUFFERED_ALLOCATOR_MOD, ONLY: BUFFERED_ALLOCATOR
USE BUFFERED_ALLOCATOR_MOD, ONLY: BUFFERED_ALLOCATOR, ASSIGN_PTR, GET_ALLOCATION
USE TPM_HICFFT, ONLY: EXECUTE_DIR_FFT
USE MPL_MODULE, ONLY: MPL_BARRIER,MPL_ALL_MS_COMM
USE TPM_STATS, ONLY: GSTATS => GSTATS_NVTX
USE ISO_C_BINDING, ONLY: C_SIZE_T

IMPLICIT NONE

Expand All @@ -77,7 +94,12 @@ SUBROUTINE FTDIR(ALLOCATOR,HFTDIR,PREEL_REAL,PREEL_COMPLEX,KFIELD)

INTEGER(KIND=JPIM) :: KGL

#ifdef IN_PLACE_FFT
PREEL_COMPLEX => PREEL_REAL
#else
CALL ASSIGN_PTR(PREEL_COMPLEX, GET_ALLOCATION(ALLOCATOR, HFTDIR%HREEL_COMPLEX),&
& 1_C_SIZE_T, INT(KFIELD*D%NLENGTF*SIZEOF(PREEL_COMPLEX(1)),KIND=C_SIZE_T))
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

also here

#endif

#ifdef ACCGPU
!$ACC DATA PRESENT(PREEL_REAL, PREEL_COMPLEX, &
Expand Down
38 changes: 29 additions & 9 deletions src/trans/gpu/internal/ftinv_mod.F90
Original file line number Diff line number Diff line change
Expand Up @@ -10,20 +10,33 @@
!

MODULE FTINV_MOD
USE BUFFERED_ALLOCATOR_MOD ,ONLY : BUFFERED_ALLOCATOR
USE BUFFERED_ALLOCATOR_MOD ,ONLY : BUFFERED_ALLOCATOR, ALLOCATION_RESERVATION_HANDLE
IMPLICIT NONE

PRIVATE
PUBLIC :: FTINV, FTINV_HANDLE, PREPARE_FTINV

TYPE FTINV_HANDLE
TYPE(ALLOCATION_RESERVATION_HANDLE) :: HREEL_REAL
END TYPE
CONTAINS
FUNCTION PREPARE_FTINV(ALLOCATOR) RESULT(HFTINV)
FUNCTION PREPARE_FTINV(ALLOCATOR,KF_FS) RESULT(HFTINV)
USE PARKIND_ECTRANS, ONLY: JPIM, JPRBT
USE TPM_DISTR, ONLY: D
USE BUFFERED_ALLOCATOR_MOD, ONLY: BUFFERED_ALLOCATOR, RESERVE
USE ISO_C_BINDING, ONLY: C_SIZE_T

IMPLICIT NONE

TYPE(BUFFERED_ALLOCATOR), INTENT(INOUT) :: ALLOCATOR
INTEGER(KIND=JPIM), INTENT(IN) :: KF_FS
TYPE(FTINV_HANDLE) :: HFTINV

REAL(KIND=JPRBT) :: DUMMY

#ifndef IN_PLACE_FFT
HFTINV%HREEL_REAL = RESERVE(ALLOCATOR, INT(D%NLENGTF*KF_FS*SIZEOF(DUMMY),KIND=C_SIZE_T))
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

and here

#endif
END FUNCTION

SUBROUTINE FTINV(ALLOCATOR,HFTINV,PREEL_COMPLEX,PREEL_REAL,KFIELD)
Expand Down Expand Up @@ -59,13 +72,15 @@ SUBROUTINE FTINV(ALLOCATOR,HFTINV,PREEL_COMPLEX,PREEL_REAL,KFIELD)
! G. Mozdzynski (Jun 2015): Support alternative FFTs to FFTW
! ------------------------------------------------------------------

USE TPM_GEN, ONLY: LSYNC_TRANS
USE PARKIND_ECTRANS, ONLY: JPIM, JPRBT
USE TPM_DISTR, ONLY: MYSETW, D_NPTRLS, D_NDGL_FS, D_NSTAGTF
USE TPM_GEOMETRY, ONLY: G_NLOEN
USE TPM_HICFFT, ONLY: EXECUTE_INV_FFT
USE MPL_MODULE, ONLY: MPL_BARRIER,MPL_ALL_MS_COMM
USE TPM_STATS, ONLY: GSTATS => GSTATS_NVTX
USE TPM_GEN, ONLY: LSYNC_TRANS
USE PARKIND_ECTRANS, ONLY: JPIM, JPRBT
USE TPM_DISTR, ONLY: MYSETW, D_NPTRLS, D_NDGL_FS, D_NSTAGTF, D
USE TPM_GEOMETRY, ONLY: G_NLOEN
USE TPM_HICFFT, ONLY: EXECUTE_INV_FFT
USE MPL_MODULE, ONLY: MPL_BARRIER,MPL_ALL_MS_COMM
USE TPM_STATS, ONLY: GSTATS => GSTATS_NVTX
USE BUFFERED_ALLOCATOR_MOD, ONLY: ASSIGN_PTR, GET_ALLOCATION
USE ISO_C_BINDING, ONLY: C_SIZE_T

IMPLICIT NONE

Expand All @@ -77,7 +92,12 @@ SUBROUTINE FTINV(ALLOCATOR,HFTINV,PREEL_COMPLEX,PREEL_REAL,KFIELD)

INTEGER(KIND=JPIM) :: KGL

#ifdef IN_PLACE_FFT
PREEL_REAL => PREEL_COMPLEX
#else
CALL ASSIGN_PTR(PREEL_REAL, GET_ALLOCATION(ALLOCATOR, HFTINV%HREEL_REAL),&
& 1_C_SIZE_T, INT(KFIELD*D%NLENGTF*SIZEOF(PREEL_REAL(1)),KIND=C_SIZE_T))
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

and here..
Perhaps there are more instances in the code. This seems like a typical copy-paste-edit line.

#endif

#ifdef OMPGPU
#endif
Expand Down
2 changes: 1 addition & 1 deletion src/trans/gpu/internal/inv_trans_ctl_mod.F90
Original file line number Diff line number Diff line change
Expand Up @@ -202,7 +202,7 @@ SUBROUTINE INV_TRANS_CTL(KF_UV_G,KF_SCALARS_G,KF_GP,KF_FS,KF_OUT_LT,&
HTRMTOL = PREPARE_TRMTOL(ALLOCATOR,IF_LEG)
HTRMTOL_UNPACK = PREPARE_TRMTOL_UNPACK(ALLOCATOR,IF_FOURIER)
HFSC = PREPARE_FSC(ALLOCATOR)
HFTINV = PREPARE_FTINV(ALLOCATOR)
HFTINV = PREPARE_FTINV(ALLOCATOR,IF_FOURIER)
ENDIF
HTRLTOG = PREPARE_TRLTOG(ALLOCATOR,IF_FOURIER,KF_GP)

Expand Down
Loading