From 207a686a7894d8a8dc46f71d46687cabbe15cbe6 Mon Sep 17 00:00:00 2001 From: Sam Hatfield Date: Tue, 16 Apr 2024 09:35:28 +0000 Subject: [PATCH 1/2] Add logic to avoid reallocating ZCOMBUF[RS] at each call This is taken from orphaned ifs-source PR https://git.ecmwf.int/projects/IFS/repos/ifs-source/pull-requests/1083/overview. Co-authored-by: Olivier Marsden --- src/trans/internal/trgtol_mod.F90 | 53 +++++++++++++++++++++++-------- src/trans/internal/trltog_mod.F90 | 37 ++++++++++++++++----- 2 files changed, 68 insertions(+), 22 deletions(-) diff --git a/src/trans/internal/trgtol_mod.F90 b/src/trans/internal/trgtol_mod.F90 index 064da76a2..0d04d2889 100644 --- a/src/trans/internal/trgtol_mod.F90 +++ b/src/trans/internal/trgtol_mod.F90 @@ -292,23 +292,44 @@ SUBROUTINE TRGTOL_COMM_HEAP(PGLAT,KF_FS,KF_GP,KF_SCALARS_G,KVSET,& REAL(KIND=JPRB),OPTIONAL,INTENT(IN) :: PGP3B(:,:,:,:) REAL(KIND=JPRB),OPTIONAL,INTENT(IN) :: PGP2(:,:,:) -REAL(KIND=JPRB), ALLOCATABLE :: ZCOMBUFS(:,:) -REAL(KIND=JPRB), ALLOCATABLE :: ZCOMBUFR(:,:) +REAL(KIND=JPRB), ALLOCATABLE, SAVE :: ZCOMBUFS(:,:) +REAL(KIND=JPRB), ALLOCATABLE, SAVE :: ZCOMBUFR(:,:) +INTEGER(KIND=JPIM), SAVE :: INRECV_PREV = -1 +INTEGER(KIND=JPIM), SAVE :: INSEND_PREV = -1 +INTEGER(KIND=JPIM), SAVE :: IRECVCOUNT_PREV = -1 +INTEGER(KIND=JPIM), SAVE :: ISENDCOUNT_PREV = -1 + +IF ( .NOT. ALLOCATED(ZCOMBUFS) ) THEN + ALLOCATE(ZCOMBUFS(-1:KSENDCOUNT,KNSEND)) + ISENDCOUNT_PREV = KSENDCOUNT + INSEND_PREV = KNSEND +ELSEIF ( KSENDCOUNT .NE. ISENDCOUNT_PREV .OR. KNSEND .NE. INSEND_PREV ) THEN + DEALLOCATE(ZCOMBUFS) + ALLOCATE(ZCOMBUFS(-1:KSENDCOUNT,KNSEND)) + ISENDCOUNT_PREV = KSENDCOUNT + INSEND_PREV = KNSEND +ENDIF -ALLOCATE(ZCOMBUFS(-1:KSENDCOUNT,KNSEND)) ! Now, force the OS to allocate this shared array right now, not when it starts to be used which is ! an OPEN-MP loop, that would cause a threads synchronization lock : -IF (KNSEND > 0 .AND. KSENDCOUNT >=-1) ZCOMBUFS(-1,1)=HUGE(1._JPRB) -ALLOCATE(ZCOMBUFR(-1:KRECVCOUNT,KNRECV)) +IF (KNSEND > 0 .AND. KSENDCOUNT >= -1) ZCOMBUFS(-1,1) = HUGE(1._JPRB) + +IF ( .NOT. ALLOCATED(ZCOMBUFR) ) THEN + ALLOCATE(ZCOMBUFR(-1:KRECVCOUNT,KNRECV)) + IRECVCOUNT_PREV = KRECVCOUNT + INRECV_PREV = KNRECV +ELSEIF ( KRECVCOUNT .NE. IRECVCOUNT_PREV .OR. KNRECV .NE. INRECV_PREV ) THEN + DEALLOCATE(ZCOMBUFR) + ALLOCATE(ZCOMBUFR(-1:KRECVCOUNT,KNRECV)) + IRECVCOUNT_PREV = KRECVCOUNT + INRECV_PREV = KNRECV +ENDIF CALL TRGTOL_COMM(PGLAT,KF_FS,KF_GP,KF_SCALARS_G,KVSET,& & KSENDCOUNT,KRECVCOUNT,KNSEND,KNRECV,KSENDTOT,KRECVTOT,KSEND,KRECV,KINDEX,KNDOFF,KGPTRSEND,& & ZCOMBUFS,ZCOMBUFR, & & KPTRGP,PGP,PGPUV,PGP3A,PGP3B,PGP2) -DEALLOCATE(ZCOMBUFR) -DEALLOCATE(ZCOMBUFS) - END SUBROUTINE TRGTOL_COMM_HEAP SUBROUTINE TRGTOL_COMM_STACK(PGLAT,KF_FS,KF_GP,KF_SCALARS_G,KVSET,& @@ -796,8 +817,9 @@ SUBROUTINE TRGTOL_COMM(PGLAT,KF_FS,KF_GP,KF_SCALARS_G,KVSET,& !....Pack+send loop......................................................... -!$OMP PARALLEL DO SCHEDULE(STATIC) PRIVATE(JBLK,IFIRST,ILAST,& -!$OMP& INS,ISEND,ISETA,ISETB,ISETV,IFLD,IPOS,JFLD) +!$OMP PARALLEL PRIVATE(JBLK,IFIRST,ILAST,ISEND_FLD_START,ISEND_FLD_END,INS,ISEND,ISETA,ISETB,& +!$OMP& ISETV,IFLD,IFLDT,IPOS,JFLD,JK,JJ,JI) +!$OMP DO SCHEDULE(STATIC) DO INS=1,KNSEND ISEND=KSEND(INS) CALL PE2SET(ISEND,ISETA,ISETB,ISETW(INS),ISETV) @@ -825,8 +847,7 @@ SUBROUTINE TRGTOL_COMM(PGLAT,KF_FS,KF_GP,KF_SCALARS_G,KVSET,& PCOMBUFS(0,INS) = IFLD ENDDO -!$OMP END PARALLEL DO - +!$OMP END DO DO INS=1,KNSEND ISEND=KSEND(INS) IPOS=IPOSPLUS(INS) @@ -834,7 +855,7 @@ SUBROUTINE TRGTOL_COMM(PGLAT,KF_FS,KF_GP,KF_SCALARS_G,KVSET,& ISEND_FLD_START=PCOMBUFS(-1,INS) ISEND_FLD_END = PCOMBUFS(0,INS) -!$OMP PARALLEL DO SCHEDULE(STATIC) PRIVATE(IFLDT,JBLK,IFIRST,ILAST,JK,JJ,JI) +!$OMP DO SCHEDULE(STATIC) DO JJ=ISEND_FLD_START,ISEND_FLD_END IFLDT=IFLDA(JJ,INS) DO JBLK=1,NGPBLKS @@ -877,8 +898,12 @@ SUBROUTINE TRGTOL_COMM(PGLAT,KF_FS,KF_GP,KF_SCALARS_G,KVSET,& ENDIF ENDDO ENDDO -!$OMP END PARALLEL DO +!$OMP END DO +ENDDO +!$OMP END PARALLEL +DO INS=1,KNSEND + ISEND=KSEND(INS) IF (NTRANS_SYNC_LEVEL <= 1) THEN CALL MPL_SEND(PCOMBUFS(-1:KSENDTOT(ISEND),INS),KDEST=NPRCIDS(ISEND), & & KMP_TYPE=JP_NON_BLOCKING_STANDARD,KREQUEST=IREQ_SEND(INS), & diff --git a/src/trans/internal/trltog_mod.F90 b/src/trans/internal/trltog_mod.F90 index b50f55ede..4dbe2f90b 100644 --- a/src/trans/internal/trltog_mod.F90 +++ b/src/trans/internal/trltog_mod.F90 @@ -315,14 +315,38 @@ SUBROUTINE TRLTOG_COMM_HEAP(PGLAT,KF_FS,KF_GP,KF_SCALARS_G,KVSET,& INTEGER(KIND=JPIM), INTENT(IN) :: KSETWL(NPROC) INTEGER(KIND=JPIM), INTENT(IN) :: KSETVL(NPROC) -REAL(KIND=JPRB), ALLOCATABLE :: ZCOMBUFS(:,:) -REAL(KIND=JPRB), ALLOCATABLE :: ZCOMBUFR(:,:) +REAL(KIND=JPRB), ALLOCATABLE, SAVE :: ZCOMBUFS(:,:) +REAL(KIND=JPRB), ALLOCATABLE, SAVE :: ZCOMBUFR(:,:) +INTEGER(KIND=JPIM), SAVE :: INRECV_PREV = -1 +INTEGER(KIND=JPIM), SAVE :: INSEND_PREV = -1 +INTEGER(KIND=JPIM), SAVE :: IRECVCOUNT_PREV = -1 +INTEGER(KIND=JPIM), SAVE :: ISENDCOUNT_PREV = -1 + +IF ( .NOT. ALLOCATED(ZCOMBUFS) ) THEN + ALLOCATE(ZCOMBUFS(-1:KSENDCOUNT,KNSEND)) + ISENDCOUNT_PREV = KSENDCOUNT + INSEND_PREV = KNSEND +ELSEIF ( KSENDCOUNT .NE. ISENDCOUNT_PREV .OR. KNSEND .NE. INSEND_PREV ) THEN + DEALLOCATE(ZCOMBUFS) + ALLOCATE(ZCOMBUFS(-1:KSENDCOUNT,KNSEND)) + ISENDCOUNT_PREV = KSENDCOUNT + INSEND_PREV = KNSEND +ENDIF -ALLOCATE(ZCOMBUFS(-1:KSENDCOUNT,KNSEND)) ! Now, force the OS to allocate this shared array right now, not when it starts to be used which is ! an OPEN-MP loop, that would cause a threads synchronization lock : -IF (KNSEND > 0 .AND. KSENDCOUNT >=-1) ZCOMBUFS(-1,1)=HUGE(1._JPRB) -ALLOCATE(ZCOMBUFR(-1:KRECVCOUNT,KNRECV)) +IF (KNSEND > 0 .AND. KSENDCOUNT >= -1) ZCOMBUFS(-1,1) = HUGE(1._JPRB) + +IF ( .NOT. ALLOCATED(ZCOMBUFR) ) THEN + ALLOCATE(ZCOMBUFR(-1:KRECVCOUNT,KNRECV)) + IRECVCOUNT_PREV = KRECVCOUNT + INRECV_PREV = KNRECV +ELSEIF ( KRECVCOUNT .NE. IRECVCOUNT_PREV .OR. KNRECV .NE. INRECV_PREV ) THEN + DEALLOCATE(ZCOMBUFR) + ALLOCATE(ZCOMBUFR(-1:KRECVCOUNT,KNRECV)) + IRECVCOUNT_PREV = KRECVCOUNT + INRECV_PREV = KNRECV +ENDIF CALL TRLTOG_COMM(PGLAT,KF_FS,KF_GP,KF_SCALARS_G,KVSET,& & KSENDCOUNT,KRECVCOUNT,KNSEND,KNRECV,KSENDTOT,KRECVTOT,KSEND,KRECV,KINDEX,KNDOFF,KGPTRSEND,& @@ -330,9 +354,6 @@ SUBROUTINE TRLTOG_COMM_HEAP(PGLAT,KF_FS,KF_GP,KF_SCALARS_G,KVSET,& & KPTRGP,PGP,PGPUV,PGP3A,PGP3B,PGP2, & & KSETAL, KSETBL,KSETWL,KSETVL) -DEALLOCATE(ZCOMBUFR) -DEALLOCATE(ZCOMBUFS) - END SUBROUTINE TRLTOG_COMM_HEAP SUBROUTINE TRLTOG_COMM_STACK(PGLAT,KF_FS,KF_GP,KF_SCALARS_G,KVSET,& From 9000e8e354390b74779c381c3eef75ba898693c3 Mon Sep 17 00:00:00 2001 From: Sam Hatfield Date: Tue, 7 May 2024 09:07:08 +0000 Subject: [PATCH 2/2] Add _STACK and _HEAP qualifiers to GTOL and LTOG work buffers --- src/trans/internal/trgtol_mod.F90 | 30 +++++++++++++++--------------- src/trans/internal/trltog_mod.F90 | 30 +++++++++++++++--------------- 2 files changed, 30 insertions(+), 30 deletions(-) diff --git a/src/trans/internal/trgtol_mod.F90 b/src/trans/internal/trgtol_mod.F90 index 0d04d2889..9ee2da359 100644 --- a/src/trans/internal/trgtol_mod.F90 +++ b/src/trans/internal/trgtol_mod.F90 @@ -292,42 +292,42 @@ SUBROUTINE TRGTOL_COMM_HEAP(PGLAT,KF_FS,KF_GP,KF_SCALARS_G,KVSET,& REAL(KIND=JPRB),OPTIONAL,INTENT(IN) :: PGP3B(:,:,:,:) REAL(KIND=JPRB),OPTIONAL,INTENT(IN) :: PGP2(:,:,:) -REAL(KIND=JPRB), ALLOCATABLE, SAVE :: ZCOMBUFS(:,:) -REAL(KIND=JPRB), ALLOCATABLE, SAVE :: ZCOMBUFR(:,:) +REAL(KIND=JPRB), ALLOCATABLE, SAVE :: ZCOMBUFS_HEAP(:,:) +REAL(KIND=JPRB), ALLOCATABLE, SAVE :: ZCOMBUFR_HEAP(:,:) INTEGER(KIND=JPIM), SAVE :: INRECV_PREV = -1 INTEGER(KIND=JPIM), SAVE :: INSEND_PREV = -1 INTEGER(KIND=JPIM), SAVE :: IRECVCOUNT_PREV = -1 INTEGER(KIND=JPIM), SAVE :: ISENDCOUNT_PREV = -1 -IF ( .NOT. ALLOCATED(ZCOMBUFS) ) THEN - ALLOCATE(ZCOMBUFS(-1:KSENDCOUNT,KNSEND)) +IF ( .NOT. ALLOCATED(ZCOMBUFS_HEAP) ) THEN + ALLOCATE(ZCOMBUFS_HEAP(-1:KSENDCOUNT,KNSEND)) ISENDCOUNT_PREV = KSENDCOUNT INSEND_PREV = KNSEND ELSEIF ( KSENDCOUNT .NE. ISENDCOUNT_PREV .OR. KNSEND .NE. INSEND_PREV ) THEN - DEALLOCATE(ZCOMBUFS) - ALLOCATE(ZCOMBUFS(-1:KSENDCOUNT,KNSEND)) + DEALLOCATE(ZCOMBUFS_HEAP) + ALLOCATE(ZCOMBUFS_HEAP(-1:KSENDCOUNT,KNSEND)) ISENDCOUNT_PREV = KSENDCOUNT INSEND_PREV = KNSEND ENDIF ! Now, force the OS to allocate this shared array right now, not when it starts to be used which is ! an OPEN-MP loop, that would cause a threads synchronization lock : -IF (KNSEND > 0 .AND. KSENDCOUNT >= -1) ZCOMBUFS(-1,1) = HUGE(1._JPRB) +IF (KNSEND > 0 .AND. KSENDCOUNT >= -1) ZCOMBUFS_HEAP(-1,1) = HUGE(1._JPRB) -IF ( .NOT. ALLOCATED(ZCOMBUFR) ) THEN - ALLOCATE(ZCOMBUFR(-1:KRECVCOUNT,KNRECV)) +IF ( .NOT. ALLOCATED(ZCOMBUFR_HEAP) ) THEN + ALLOCATE(ZCOMBUFR_HEAP(-1:KRECVCOUNT,KNRECV)) IRECVCOUNT_PREV = KRECVCOUNT INRECV_PREV = KNRECV ELSEIF ( KRECVCOUNT .NE. IRECVCOUNT_PREV .OR. KNRECV .NE. INRECV_PREV ) THEN - DEALLOCATE(ZCOMBUFR) - ALLOCATE(ZCOMBUFR(-1:KRECVCOUNT,KNRECV)) + DEALLOCATE(ZCOMBUFR_HEAP) + ALLOCATE(ZCOMBUFR_HEAP(-1:KRECVCOUNT,KNRECV)) IRECVCOUNT_PREV = KRECVCOUNT INRECV_PREV = KNRECV ENDIF CALL TRGTOL_COMM(PGLAT,KF_FS,KF_GP,KF_SCALARS_G,KVSET,& & KSENDCOUNT,KRECVCOUNT,KNSEND,KNRECV,KSENDTOT,KRECVTOT,KSEND,KRECV,KINDEX,KNDOFF,KGPTRSEND,& - & ZCOMBUFS,ZCOMBUFR, & + & ZCOMBUFS_HEAP,ZCOMBUFR_HEAP, & & KPTRGP,PGP,PGPUV,PGP3A,PGP3B,PGP2) END SUBROUTINE TRGTOL_COMM_HEAP @@ -364,12 +364,12 @@ SUBROUTINE TRGTOL_COMM_STACK(PGLAT,KF_FS,KF_GP,KF_SCALARS_G,KVSET,& REAL(KIND=JPRB),OPTIONAL,INTENT(IN) :: PGP3B(:,:,:,:) REAL(KIND=JPRB),OPTIONAL,INTENT(IN) :: PGP2(:,:,:) -REAL(KIND=JPRB) :: ZCOMBUFS(-1:KSENDCOUNT,KNSEND) -REAL(KIND=JPRB) :: ZCOMBUFR(-1:KRECVCOUNT,KNRECV) +REAL(KIND=JPRB) :: ZCOMBUFS_STACK(-1:KSENDCOUNT,KNSEND) +REAL(KIND=JPRB) :: ZCOMBUFR_STACK(-1:KRECVCOUNT,KNRECV) CALL TRGTOL_COMM(PGLAT,KF_FS,KF_GP,KF_SCALARS_G,KVSET,& & KSENDCOUNT,KRECVCOUNT,KNSEND,KNRECV,KSENDTOT,KRECVTOT,KSEND,KRECV,KINDEX,KNDOFF,KGPTRSEND,& - & ZCOMBUFS,ZCOMBUFR, & + & ZCOMBUFS_STACK,ZCOMBUFR_STACK, & & KPTRGP,PGP,PGPUV,PGP3A,PGP3B,PGP2) END SUBROUTINE TRGTOL_COMM_STACK diff --git a/src/trans/internal/trltog_mod.F90 b/src/trans/internal/trltog_mod.F90 index 4dbe2f90b..ef0e06c59 100644 --- a/src/trans/internal/trltog_mod.F90 +++ b/src/trans/internal/trltog_mod.F90 @@ -315,42 +315,42 @@ SUBROUTINE TRLTOG_COMM_HEAP(PGLAT,KF_FS,KF_GP,KF_SCALARS_G,KVSET,& INTEGER(KIND=JPIM), INTENT(IN) :: KSETWL(NPROC) INTEGER(KIND=JPIM), INTENT(IN) :: KSETVL(NPROC) -REAL(KIND=JPRB), ALLOCATABLE, SAVE :: ZCOMBUFS(:,:) -REAL(KIND=JPRB), ALLOCATABLE, SAVE :: ZCOMBUFR(:,:) +REAL(KIND=JPRB), ALLOCATABLE, SAVE :: ZCOMBUFS_HEAP(:,:) +REAL(KIND=JPRB), ALLOCATABLE, SAVE :: ZCOMBUFR_HEAP(:,:) INTEGER(KIND=JPIM), SAVE :: INRECV_PREV = -1 INTEGER(KIND=JPIM), SAVE :: INSEND_PREV = -1 INTEGER(KIND=JPIM), SAVE :: IRECVCOUNT_PREV = -1 INTEGER(KIND=JPIM), SAVE :: ISENDCOUNT_PREV = -1 -IF ( .NOT. ALLOCATED(ZCOMBUFS) ) THEN - ALLOCATE(ZCOMBUFS(-1:KSENDCOUNT,KNSEND)) +IF ( .NOT. ALLOCATED(ZCOMBUFS_HEAP) ) THEN + ALLOCATE(ZCOMBUFS_HEAP(-1:KSENDCOUNT,KNSEND)) ISENDCOUNT_PREV = KSENDCOUNT INSEND_PREV = KNSEND ELSEIF ( KSENDCOUNT .NE. ISENDCOUNT_PREV .OR. KNSEND .NE. INSEND_PREV ) THEN - DEALLOCATE(ZCOMBUFS) - ALLOCATE(ZCOMBUFS(-1:KSENDCOUNT,KNSEND)) + DEALLOCATE(ZCOMBUFS_HEAP) + ALLOCATE(ZCOMBUFS_HEAP(-1:KSENDCOUNT,KNSEND)) ISENDCOUNT_PREV = KSENDCOUNT INSEND_PREV = KNSEND ENDIF ! Now, force the OS to allocate this shared array right now, not when it starts to be used which is ! an OPEN-MP loop, that would cause a threads synchronization lock : -IF (KNSEND > 0 .AND. KSENDCOUNT >= -1) ZCOMBUFS(-1,1) = HUGE(1._JPRB) +IF (KNSEND > 0 .AND. KSENDCOUNT >= -1) ZCOMBUFS_HEAP(-1,1) = HUGE(1._JPRB) -IF ( .NOT. ALLOCATED(ZCOMBUFR) ) THEN - ALLOCATE(ZCOMBUFR(-1:KRECVCOUNT,KNRECV)) +IF ( .NOT. ALLOCATED(ZCOMBUFR_HEAP) ) THEN + ALLOCATE(ZCOMBUFR_HEAP(-1:KRECVCOUNT,KNRECV)) IRECVCOUNT_PREV = KRECVCOUNT INRECV_PREV = KNRECV ELSEIF ( KRECVCOUNT .NE. IRECVCOUNT_PREV .OR. KNRECV .NE. INRECV_PREV ) THEN - DEALLOCATE(ZCOMBUFR) - ALLOCATE(ZCOMBUFR(-1:KRECVCOUNT,KNRECV)) + DEALLOCATE(ZCOMBUFR_HEAP) + ALLOCATE(ZCOMBUFR_HEAP(-1:KRECVCOUNT,KNRECV)) IRECVCOUNT_PREV = KRECVCOUNT INRECV_PREV = KNRECV ENDIF CALL TRLTOG_COMM(PGLAT,KF_FS,KF_GP,KF_SCALARS_G,KVSET,& & KSENDCOUNT,KRECVCOUNT,KNSEND,KNRECV,KSENDTOT,KRECVTOT,KSEND,KRECV,KINDEX,KNDOFF,KGPTRSEND,& - & ZCOMBUFS,ZCOMBUFR, & + & ZCOMBUFS_HEAP,ZCOMBUFR_HEAP, & & KPTRGP,PGP,PGPUV,PGP3A,PGP3B,PGP2, & & KSETAL, KSETBL,KSETWL,KSETVL) @@ -393,12 +393,12 @@ SUBROUTINE TRLTOG_COMM_STACK(PGLAT,KF_FS,KF_GP,KF_SCALARS_G,KVSET,& INTEGER(KIND=JPIM), INTENT(IN) :: KSETWL(NPROC) INTEGER(KIND=JPIM), INTENT(IN) :: KSETVL(NPROC) -REAL(KIND=JPRB) :: ZCOMBUFS(-1:KSENDCOUNT,KNSEND) -REAL(KIND=JPRB) :: ZCOMBUFR(-1:KRECVCOUNT,KNRECV) +REAL(KIND=JPRB) :: ZCOMBUFS_STACK(-1:KSENDCOUNT,KNSEND) +REAL(KIND=JPRB) :: ZCOMBUFR_STACK(-1:KRECVCOUNT,KNRECV) CALL TRLTOG_COMM(PGLAT,KF_FS,KF_GP,KF_SCALARS_G,KVSET,& & KSENDCOUNT,KRECVCOUNT,KNSEND,KNRECV,KSENDTOT,KRECVTOT,KSEND,KRECV,KINDEX,KNDOFF,KGPTRSEND,& - & ZCOMBUFS,ZCOMBUFR, & + & ZCOMBUFS_STACK,ZCOMBUFR_STACK, & & KPTRGP,PGP,PGPUV,PGP3A,PGP3B,PGP2, & & KSETAL, KSETBL,KSETWL,KSETVL)