From b1335bd83ad11cff4e9e1fe714be168f9bf6e75d Mon Sep 17 00:00:00 2001 From: Daniel Peter Date: Wed, 26 Apr 2023 09:41:44 +0200 Subject: [PATCH 01/11] updates github workflow checkout version --- .github/workflows/CI.yml | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/.github/workflows/CI.yml b/.github/workflows/CI.yml index 6eecc1f61..80dac582c 100644 --- a/.github/workflows/CI.yml +++ b/.github/workflows/CI.yml @@ -11,7 +11,7 @@ jobs: runs-on: ubuntu-latest steps: - - uses: actions/checkout@v2 + - uses: actions/checkout@v3 - name: Check for changes id: diff @@ -75,7 +75,7 @@ jobs: needs: changesCheck steps: - - uses: actions/checkout@v2 + - uses: actions/checkout@v3 - name: Install dependencies run: | @@ -128,7 +128,7 @@ jobs: os: [ubuntu-latest,ubuntu-20.04] steps: - - uses: actions/checkout@v2 + - uses: actions/checkout@v3 - name: Install packages run: ./.github/scripts/run_install.sh @@ -147,7 +147,7 @@ jobs: needs: changesCheck steps: - - uses: actions/checkout@v2 + - uses: actions/checkout@v3 - name: Cache Intel oneapi packages id: cache-intel-oneapi @@ -224,7 +224,7 @@ jobs: needs: [linuxCheck] steps: - - uses: actions/checkout@v2 + - uses: actions/checkout@v3 - name: Install packages run: ./.github/scripts/run_install.sh @@ -242,7 +242,7 @@ jobs: needs: [linuxCheck] steps: - - uses: actions/checkout@v2 + - uses: actions/checkout@v3 - name: Install packages run: ./.github/scripts/run_install.sh @@ -267,7 +267,7 @@ jobs: needs: [linuxCheck] steps: - - uses: actions/checkout@v2 + - uses: actions/checkout@v3 - name: Install packages run: ./.github/scripts/run_install.sh @@ -291,7 +291,7 @@ jobs: needs: [linuxCheck] steps: - - uses: actions/checkout@v2 + - uses: actions/checkout@v3 - name: Install packages run: ./.github/scripts/run_install.sh @@ -315,7 +315,7 @@ jobs: needs: [linuxCheck] steps: - - uses: actions/checkout@v2 + - uses: actions/checkout@v3 - name: Install packages run: ./.github/scripts/run_install.sh @@ 
-340,7 +340,7 @@ jobs: needs: [linuxCheck] steps: - - uses: actions/checkout@v2 + - uses: actions/checkout@v3 - name: Install packages run: ./.github/scripts/run_install.sh @@ -364,7 +364,7 @@ jobs: needs: [linuxCheck] steps: - - uses: actions/checkout@v2 + - uses: actions/checkout@v3 - name: Install packages run: ./.github/scripts/run_install.sh From 01d2ddde6a0ea88d97dea895332286f3704dd508 Mon Sep 17 00:00:00 2001 From: Daniel Peter Date: Wed, 26 Apr 2023 09:49:38 +0200 Subject: [PATCH 02/11] updates default CUDA compute code for Maxwell cards (Quadro M6000, GeForce 900, GTX-970, GTX-980, GTX Titan X) --- Makefile.in | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/Makefile.in b/Makefile.in index 5cf182b5f..9fe5484de 100644 --- a/Makefile.in +++ b/Makefile.in @@ -150,6 +150,7 @@ GENCODE_30 = -gencode=arch=compute_30,code=\"sm_30,compute_30\" GENCODE_35 = -gencode=arch=compute_35,code=\"sm_35,compute_35\" GENCODE_37 = -gencode=arch=compute_37,code=\"sm_37\" GENCODE_50 = -gencode=arch=compute_50,code=\"sm_50,compute_50\" +GENCODE_52 = -gencode=arch=compute_52,code=\"sm_52,compute_52\" GENCODE_60 = -gencode=arch=compute_60,code=\"sm_60,compute_60\" GENCODE_70 = -gencode=arch=compute_70,code=\"sm_70,compute_70\" GENCODE_75 = -gencode=arch=compute_75,code=\"sm_75,compute_75\" @@ -165,7 +166,7 @@ GENCODE_80 = -gencode=arch=compute_80,code=\"sm_80,compute_80\" # CUDA version 8.0 @COND_CUDA_TRUE@@COND_CUDA8_TRUE@GENCODE = $(GENCODE_60) $(FC_DEFINE)GPU_DEVICE_Pascal # CUDA version 7.x -@COND_CUDA_TRUE@@COND_CUDA7_TRUE@GENCODE = $(GENCODE_50) $(FC_DEFINE)GPU_DEVICE_Maxwell +@COND_CUDA_TRUE@@COND_CUDA7_TRUE@GENCODE = $(GENCODE_52) $(FC_DEFINE)GPU_DEVICE_Maxwell # CUDA version 6.5 @COND_CUDA_TRUE@@COND_CUDA6_TRUE@GENCODE = $(GENCODE_37) $(FC_DEFINE)GPU_DEVICE_K80 # CUDA version 5.x @@ -235,7 +236,7 @@ GENCODE_AMD_MI250 = --amdgpu-target=gfx90a @COND_HIP_TRUE@@COND_HIP_CUDA5_TRUE@GENCODE_HIP = $(GENCODE_35) # --with-hip=cuda5 .. 
@COND_HIP_TRUE@@COND_HIP_CUDA6_TRUE@GENCODE_HIP = $(GENCODE_37) # --with-hip=cuda6 .. -@COND_HIP_TRUE@@COND_HIP_CUDA7_TRUE@GENCODE_HIP = $(GENCODE_50) # --with-hip=cuda7 .. +@COND_HIP_TRUE@@COND_HIP_CUDA7_TRUE@GENCODE_HIP = $(GENCODE_52) # --with-hip=cuda7 .. @COND_HIP_TRUE@@COND_HIP_CUDA8_TRUE@GENCODE_HIP = $(GENCODE_60) # --with-hip=cuda8 .. @COND_HIP_TRUE@@COND_HIP_CUDA9_TRUE@GENCODE_HIP = $(GENCODE_70) # --with-hip=cuda9 .. @COND_HIP_TRUE@@COND_HIP_CUDA10_TRUE@GENCODE_HIP = $(GENCODE_75) # --with-hip=cuda10 .. From a8e5bfee922cb4b2c42f8cf3bfa04a74ca647a0c Mon Sep 17 00:00:00 2001 From: Daniel Peter Date: Wed, 3 May 2023 11:18:31 +0200 Subject: [PATCH 03/11] updates github workflow --- .github/workflows/CI.yml | 17 ++++++++++++++--- 1 file changed, 14 insertions(+), 3 deletions(-) diff --git a/.github/workflows/CI.yml b/.github/workflows/CI.yml index 80dac582c..c8e5c4cc7 100644 --- a/.github/workflows/CI.yml +++ b/.github/workflows/CI.yml @@ -29,9 +29,20 @@ jobs: export DIFF=$( git diff --name-only ${{ github.event.before }} $GITHUB_SHA ) echo " diff between ${{ github.event.before }} and $GITHUB_SHA" fi - echo "$DIFF" + echo "***"; echo "$DIFF"; echo "***" # Escape newlines (replace \n with %0A) - echo "::set-output name=diff::$( echo "$DIFF" | sed ':a;N;$!ba;s/\n/%0A/g' )" + # deprecated: + #echo "::set-output name=diff::$( echo "$DIFF" | sed ':a;N;$!ba;s/\n/%0A/g' )" + # new: + # replace new line with %0A - will result finding only one file with a very long name... + #echo "diff=$( echo "$DIFF" | sed ':a;N;$!ba;s/\n/%0A/g' )" >> $GITHUB_OUTPUT + # doesn't work... 
+ #echo "diff=\"$DIFF\"" >> "$GITHUB_OUTPUT" + # new multi-line format: + # (https://docs.github.com/en/actions/using-workflows/workflow-commands-for-github-actions#multiline-strings) + echo "diff<<EOF" >> $GITHUB_OUTPUT + echo "$DIFF" >> $GITHUB_OUTPUT + echo "EOF" >> $GITHUB_OUTPUT - name: Output changes run: echo "${{ steps.diff.outputs.diff }}" @@ -151,7 +162,7 @@ jobs: - name: Cache Intel oneapi packages id: cache-intel-oneapi - uses: actions/cache@v2 + uses: actions/cache@v3 with: path: /opt/intel/oneapi key: install-${{ runner.os }}-all From 33905b3855af9b879f54b4daf0bfd84c49e1edbb Mon Sep 17 00:00:00 2001 From: Daniel Peter Date: Wed, 3 May 2023 13:05:19 +0200 Subject: [PATCH 04/11] updates azure workflow --- .azure-pipelines.yml | 73 ++++++++++++++++++++------- .azure-pipelines/install-template.yml | 61 ++++++++++++++++++---- 2 files changed, 107 insertions(+), 27 deletions(-) diff --git a/.azure-pipelines.yml b/.azure-pipelines.yml index cb93a63e7..c31dcb278 100644 --- a/.azure-pipelines.yml +++ b/.azure-pipelines.yml @@ -72,7 +72,7 @@ jobs: # tests/ directory RUN_CHECKS=1 elif [ "$directory" == .azure-pipelines ]; then - # .travis/ directory + # azure directory RUN_CHECKS=1 fi done <<< "$DIFF" @@ -84,7 +84,8 @@ jobs: displayName: 'Run checks' - job: compilation_default - displayName: 'Compilation Default GCC 9' + # ubuntu-latest: ubuntu-22.04 w/ GCC 11 + displayName: 'Compilation Default GCC' steps: - template: .azure-pipelines/install-template.yml parameters: @@ -95,14 +96,14 @@ jobs: CUDA: false BUILD: true -- job: compilation_default_gcc7 - displayName: 'Compilation Default GCC 7' +- job: compilation_default_gcc9 + displayName: 'Compilation Default GCC 9' pool: - vmImage: 'ubuntu-18.04' + vmImage: 'ubuntu-20.04' variables: - CC: gcc-7 - CXX: g++-7 - FC: gfortran-7 + CC: gcc-9 + CXX: g++-9 + FC: gfortran-9 steps: - template: .azure-pipelines/install-template.yml parameters: @@ -131,27 +132,33 @@ jobs: CUDA: false BUILD: true -- job: compilation_CUDA10 - 
displayName: 'Compilation CUDA 10' +- job: compilation_CUDA11_gcc9 + displayName: 'Compilation CUDA 11 GCC 9' pool: - vmImage: 'ubuntu-18.04' + vmImage: 'ubuntu-20.04' variables: - CC: gcc-7 - CXX: g++-7 - FC: gfortran-7 + CC: gcc-9 + CXX: g++-9 + FC: gfortran-9 steps: - template: .azure-pipelines/install-template.yml parameters: CUDA: true - CUDA_V: '10.2' + CUDA_V: '11.4' - template: .azure-pipelines/configure-template.yml parameters: TESTFLAGS: '--enable-vectorization --with-cuda=cuda10' CUDA: true BUILD: true -- job: compilation_CUDA11 - displayName: 'Compilation CUDA 11' +- job: compilation_CUDA11_gcc10 + displayName: 'Compilation CUDA 11 GCC 10' + pool: + vmImage: 'ubuntu-20.04' + variables: + CC: gcc-10 + CXX: g++-10 + FC: gfortran-10 steps: - template: .azure-pipelines/install-template.yml parameters: @@ -163,6 +170,38 @@ jobs: CUDA: true BUILD: true +- job: compilation_CUDA12 + displayName: 'Compilation CUDA 12 GCC 10' + pool: + vmImage: 'ubuntu-20.04' + variables: + CC: gcc-10 + CXX: g++-10 + FC: gfortran-10 + steps: + - template: .azure-pipelines/install-template.yml + parameters: + CUDA: true + CUDA_V: '12.1' + - template: .azure-pipelines/configure-template.yml + parameters: + TESTFLAGS: '--enable-vectorization --with-cuda=cuda11' + CUDA: true + BUILD: true + +- job: compilation_CUDA12_latest + displayName: 'Compilation CUDA 12' + steps: + - template: .azure-pipelines/install-template.yml + parameters: + CUDA: true + CUDA_V: '12.1' + - template: .azure-pipelines/configure-template.yml + parameters: + TESTFLAGS: '--enable-vectorization --with-cuda=cuda12' + CUDA: true + BUILD: true + - job: test_example_1 displayName: 'Test example 1 - regional_Greece_small' dependsOn: compilation_default diff --git a/.azure-pipelines/install-template.yml b/.azure-pipelines/install-template.yml index dd67ea468..a64f71681 100644 --- a/.azure-pipelines/install-template.yml +++ b/.azure-pipelines/install-template.yml @@ -4,8 +4,10 @@ # software setup on VM nodes # 
ubuntu-18.04: # https://github.com/actions/virtual-environments/blob/main/images/linux/Ubuntu1804-README.md -# ubuntu-20.04 "ubuntu-latest": +# ubuntu-20.04: # https://github.com/actions/virtual-environments/blob/main/images/linux/Ubuntu2004-README.md +# ubuntu-22.04 "ubuntu-latest": +# https://github.com/actions/runner-images/blob/main/images/linux/Ubuntu2204-Readme.md # parameters: - name: CUDA @@ -20,10 +22,27 @@ steps: # fortran/openMPI compiler echo "CC: ${CC} CXX: ${CXX} FC: ${FC}" # updates repository + echo; echo `uname -a`; lsb_release -a; echo sudo apt-key adv --keyserver keyserver.ubuntu.com --recv-keys 6B05F25D762E3157 sudo apt-key adv --keyserver keyserver.ubuntu.com --recv-keys 78BD65473CB3BD13 sudo apt-key adv --keyserver keyserver.ubuntu.com --recv-keys 762E3157 - if [ "${FC}" == "gfortran-10" ]; then + if [ "${FC}" == "gfortran-9" ]; then + echo "gfortran: gfortran-9 update" + # updating gfortran version + sudo add-apt-repository ppa:ubuntu-toolchain-r/test + sudo apt-get update + sudo apt-get install -y --reinstall gcc-9 g++-9 gfortran-9 + # updates alternatives + echo + update-alternatives --query gfortran + echo + sudo update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-9 100 + sudo update-alternatives --install /usr/bin/g++ g++ /usr/bin/g++-9 100 + sudo update-alternatives --install /usr/bin/gfortran gfortran /usr/bin/gfortran-9 100 + echo + update-alternatives --query gfortran + echo + elif [ "${FC}" == "gfortran-10" ]; then echo "gfortran: gfortran-10 update" # updating gfortran version sudo add-apt-repository ppa:ubuntu-toolchain-r/test @@ -94,9 +113,14 @@ steps: ## distribution xenial: from ubuntu 16.04 #UBUNTU_VERSION=ubuntu1604 ## distribution bionic: from ubuntu 18.04 - UBUNTU_VERSION=ubuntu1804 + #UBUNTU_VERSION=ubuntu1804 ## distribution focal: from ubuntu 20.04 #UBUNTU_VERSION=ubuntu2004 + ## distribution jammy: from ubuntu 22.04 + #UBUNTU_VERSION=ubuntu2204 + + # default + UBUNTU_VERSION=ubuntu2004 # CUDA_VERSION - specifies 
CUDA toolkit version echo "CUDA version: $CUDA_V" @@ -110,10 +134,20 @@ steps: elif [ "$CUDA_V" == "10.2" ]; then ## bionic CUDA_VERSION=10.2.89-1 + elif [ "$CUDA_V" == "11.4" ]; then + ## focal + CUDA_VERSION=11.4.0-1 + elif [ "$CUDA_V" == "12.1" ]; then + ## focal + CUDA_VERSION=12.1.1-1 else - # note: on azure VM nodes with ubuntu-latest, default gcc version is 9.3; + # note: - on azure VM nodes with ubuntu 18.04, default gcc version is 9.3; + # needs at least CUDA version 10.x + # - on azure VM nodes with ubuntu 20.04, default gcc version is 10.3; # needs at least CUDA version 11.x - CUDA_VERSION=11.4.0-1 + # - on azure VM nodes with ubuntu-latest (22.04), default gcc version is 11.3; + # needs at least CUDA version 11.7 + CUDA_VERSION=12.1.1-1 fi echo @@ -151,7 +185,12 @@ steps: echo # gets repo - if [ "${CUDA_VERSION}" == "11.4.0-1" ]; then + if [ "${CUDA_VERSION}" == "10.2.89-1" ]; then + # gets packages + INSTALLER=cuda-repo-${UBUNTU_VERSION}_${CUDA_VERSION}_${CUDA_ARCH}.deb + wget http://developer.download.nvidia.com/compute/cuda/repos/${UBUNTU_VERSION}/${CUDA_OS}/${INSTALLER} + sudo dpkg -i ${INSTALLER} + elif [ "${CUDA_VERSION}" == "11.4.0-1" ]; then # new CUDA version 11.4 has no cuda-repo-** file, following instructions from website, # see: https://developer.nvidia.com/cuda-downloads?target_os=Linux&target_arch=x86_64&Distribution=Ubuntu&target_version=18.04&target_type=deb_network wget https://developer.download.nvidia.com/compute/cuda/repos/${UBUNTU_VERSION}/${CUDA_OS}/cuda-${UBUNTU_VERSION}.pin @@ -160,10 +199,12 @@ steps: # adds repo sudo add-apt-repository "deb https://developer.download.nvidia.com/compute/cuda/repos/${UBUNTU_VERSION}/${CUDA_OS}/ /" else - # gets packages - INSTALLER=cuda-repo-${UBUNTU_VERSION}_${CUDA_VERSION}_${CUDA_ARCH}.deb - wget http://developer.download.nvidia.com/compute/cuda/repos/${UBUNTU_VERSION}/${CUDA_OS}/${INSTALLER} - sudo dpkg -i ${INSTALLER} + # new versions + wget 
https://developer.download.nvidia.com/compute/cuda/repos/${UBUNTU_VERSION}/${CUDA_OS}/cuda-${UBUNTU_VERSION}.pin + sudo mv cuda-${UBUNTU_VERSION}.pin /etc/apt/preferences.d/cuda-repository-pin-600 + echo + # adds repo + sudo add-apt-repository "deb https://developer.download.nvidia.com/compute/cuda/repos/${UBUNTU_VERSION}/${CUDA_OS}/ /" fi #echo From 5120147dd71fa3ef177ed127f7984c47290235a3 Mon Sep 17 00:00:00 2001 From: Daniel Peter Date: Wed, 3 May 2023 13:28:31 +0200 Subject: [PATCH 05/11] adds CUDA Hopper support --- Makefile.in | 9 +++++++- configure | 29 ++++++++++++++++++++++++++ configure.ac | 5 ++++- doc/USER_MANUAL/02_getting_started.tex | 1 + src/gpu/mesh_constants_cuda.h | 8 +++++++ src/gpu/mesh_constants_gpu.h | 8 +++++++ src/gpu/rules.mk | 4 +++- 7 files changed, 61 insertions(+), 3 deletions(-) diff --git a/Makefile.in b/Makefile.in index 9fe5484de..a8432819b 100644 --- a/Makefile.in +++ b/Makefile.in @@ -109,6 +109,9 @@ GPU_ELEM_PER_THREAD := 1 @COND_CUDA11_TRUE@CUDA11 = yes @COND_CUDA11_FALSE@CUDA11 = no +@COND_CUDA12_TRUE@CUDA12 = yes +@COND_CUDA12_FALSE@CUDA12 = no + # CUDA compilation with linking @COND_CUDA_PLUS_TRUE@CUDA_PLUS = yes @COND_CUDA_PLUS_FALSE@CUDA_PLUS = no @@ -144,7 +147,7 @@ CUDA_DEBUG = --cudart=shared # Volta (cuda9, V100): -gencode=arch=compute_70,code=sm_70 # Turing (cuda10, T4, GeForce RTX 2080): -gencode=arch=compute_75,code=sm_75 # Ampere (cuda11, A100, GeForce RTX 3080): -gencode=arch=compute_80,code=sm_80 - +# Hopper (cuda12, H100): -gencode=arch=compute_90,code=sm_90 GENCODE_20 = -gencode=arch=compute_20,code=\"sm_20,compute_20\" GENCODE_30 = -gencode=arch=compute_30,code=\"sm_30,compute_30\" GENCODE_35 = -gencode=arch=compute_35,code=\"sm_35,compute_35\" @@ -155,8 +158,11 @@ GENCODE_60 = -gencode=arch=compute_60,code=\"sm_60,compute_60\" GENCODE_70 = -gencode=arch=compute_70,code=\"sm_70,compute_70\" GENCODE_75 = -gencode=arch=compute_75,code=\"sm_75,compute_75\" GENCODE_80 = 
-gencode=arch=compute_80,code=\"sm_80,compute_80\" +GENCODE_90 = -gencode=arch=compute_90,code=\"sm_90,compute_90\" # cuda preprocessor flag +# CUDA version 12.0 +@COND_CUDA_TRUE@@COND_CUDA12_TRUE@GENCODE = $(GENCODE_90) $(FC_DEFINE)GPU_DEVICE_Hopper # CUDA version 11.0 @COND_CUDA_TRUE@@COND_CUDA11_TRUE@GENCODE = $(GENCODE_80) $(FC_DEFINE)GPU_DEVICE_Ampere # CUDA version 10.0 @@ -241,6 +247,7 @@ GENCODE_AMD_MI250 = --amdgpu-target=gfx90a @COND_HIP_TRUE@@COND_HIP_CUDA9_TRUE@GENCODE_HIP = $(GENCODE_70) # --with-hip=cuda9 .. @COND_HIP_TRUE@@COND_HIP_CUDA10_TRUE@GENCODE_HIP = $(GENCODE_75) # --with-hip=cuda10 .. @COND_HIP_TRUE@@COND_HIP_CUDA11_TRUE@GENCODE_HIP = $(GENCODE_80) # --with-hip=cuda11 .. +@COND_HIP_TRUE@@COND_HIP_CUDA12_TRUE@GENCODE_HIP = $(GENCODE_90) # --with-hip=cuda12 .. HIP_FLAGS = @HIP_FLAGS@ HIP_INC = @HIP_CPPFLAGS@ $(CUDA_MPI_FLAG) $(MPI_CPPFLAGS) $(MPI_INCLUDES) diff --git a/configure b/configure index ae560f695..7cf0b7320 100755 --- a/configure +++ b/configure @@ -764,6 +764,8 @@ COND_XSMM_FALSE COND_XSMM_TRUE COND_MIC_FALSE COND_MIC_TRUE +COND_HIP_CUDA12_FALSE +COND_HIP_CUDA12_TRUE COND_HIP_CUDA11_FALSE COND_HIP_CUDA11_TRUE COND_HIP_CUDA10_FALSE @@ -796,6 +798,8 @@ COND_CUDA_AWARE_MPI_FALSE COND_CUDA_AWARE_MPI_TRUE COND_CUDA_PLUS_FALSE COND_CUDA_PLUS_TRUE +COND_CUDA12_FALSE +COND_CUDA12_TRUE COND_CUDA11_FALSE COND_CUDA11_TRUE COND_CUDA10_FALSE @@ -3510,6 +3514,14 @@ else COND_CUDA11_FALSE= fi + if test x"$want_cuda" = xcuda12; then + COND_CUDA12_TRUE= + COND_CUDA12_FALSE='#' +else + COND_CUDA12_TRUE='#' + COND_CUDA12_FALSE= +fi + # cuda linking for cuda 5x and 6x and 7x and 8x and .. 
if test "$want_cuda" = cuda4 \ @@ -3520,6 +3532,7 @@ fi -o "$want_cuda" = cuda9 \ -o "$want_cuda" = cuda10 \ -o "$want_cuda" = cuda11 \ + -o "$want_cuda" = cuda12 \ ; then COND_CUDA_PLUS_TRUE= COND_CUDA_PLUS_FALSE='#' @@ -3696,6 +3709,14 @@ else COND_HIP_CUDA11_FALSE= fi + if test x"$want_hip" = xcuda12; then + COND_HIP_CUDA12_TRUE= + COND_HIP_CUDA12_FALSE='#' +else + COND_HIP_CUDA12_TRUE='#' + COND_HIP_CUDA12_FALSE= +fi + ### ### MIC (Xeon PHI) @@ -11679,6 +11700,10 @@ if test -z "${COND_CUDA11_TRUE}" && test -z "${COND_CUDA11_FALSE}"; then as_fn_error $? "conditional \"COND_CUDA11\" was never defined. Usually this means the macro was only invoked conditionally." "$LINENO" 5 fi +if test -z "${COND_CUDA12_TRUE}" && test -z "${COND_CUDA12_FALSE}"; then + as_fn_error $? "conditional \"COND_CUDA12\" was never defined. +Usually this means the macro was only invoked conditionally." "$LINENO" 5 +fi if test -z "${COND_CUDA_PLUS_TRUE}" && test -z "${COND_CUDA_PLUS_FALSE}"; then as_fn_error $? "conditional \"COND_CUDA_PLUS\" was never defined. Usually this means the macro was only invoked conditionally." "$LINENO" 5 @@ -11743,6 +11768,10 @@ if test -z "${COND_HIP_CUDA11_TRUE}" && test -z "${COND_HIP_CUDA11_FALSE}"; then as_fn_error $? "conditional \"COND_HIP_CUDA11\" was never defined. Usually this means the macro was only invoked conditionally." "$LINENO" 5 fi +if test -z "${COND_HIP_CUDA12_TRUE}" && test -z "${COND_HIP_CUDA12_FALSE}"; then + as_fn_error $? "conditional \"COND_HIP_CUDA12\" was never defined. +Usually this means the macro was only invoked conditionally." "$LINENO" 5 +fi if test -z "${COND_MIC_TRUE}" && test -z "${COND_MIC_FALSE}"; then as_fn_error $? "conditional \"COND_MIC\" was never defined. Usually this means the macro was only invoked conditionally." 
"$LINENO" 5 diff --git a/configure.ac b/configure.ac index 724163cd6..e60898e53 100644 --- a/configure.ac +++ b/configure.ac @@ -112,6 +112,7 @@ AM_CONDITIONAL([COND_CUDA8], [test x"$want_cuda" = xcuda8]) AM_CONDITIONAL([COND_CUDA9], [test x"$want_cuda" = xcuda9]) AM_CONDITIONAL([COND_CUDA10], [test x"$want_cuda" = xcuda10]) AM_CONDITIONAL([COND_CUDA11], [test x"$want_cuda" = xcuda11]) +AM_CONDITIONAL([COND_CUDA12], [test x"$want_cuda" = xcuda12]) # cuda linking for cuda 5x and 6x and 7x and 8x and .. AM_CONDITIONAL([COND_CUDA_PLUS], @@ -122,7 +123,8 @@ AM_CONDITIONAL([COND_CUDA_PLUS], -o "$want_cuda" = cuda8 \ -o "$want_cuda" = cuda9 \ -o "$want_cuda" = cuda10 \ - -o "$want_cuda" = cuda11 \] + -o "$want_cuda" = cuda11 \ + -o "$want_cuda" = cuda12 \] ) # CUDA-aware MPI setting @@ -166,6 +168,7 @@ AM_CONDITIONAL([COND_HIP_CUDA8], [test x"$want_hip" = xcuda8]) AM_CONDITIONAL([COND_HIP_CUDA9], [test x"$want_hip" = xcuda9]) AM_CONDITIONAL([COND_HIP_CUDA10], [test x"$want_hip" = xcuda10]) AM_CONDITIONAL([COND_HIP_CUDA11], [test x"$want_hip" = xcuda11]) +AM_CONDITIONAL([COND_HIP_CUDA12], [test x"$want_hip" = xcuda12]) ### ### MIC (Xeon PHI) diff --git a/doc/USER_MANUAL/02_getting_started.tex b/doc/USER_MANUAL/02_getting_started.tex index 6e48ae8a4..5c384a119 100644 --- a/doc/USER_MANUAL/02_getting_started.tex +++ b/doc/USER_MANUAL/02_getting_started.tex @@ -112,6 +112,7 @@ \section{Using the GPU version of the code} - CUDA 9 for Volta, like V100 - CUDA 10 for Turing, like GeForce RTX 2080 - CUDA 11 for Ampere, like A100 + - CUDA 12 for Hopper, like H100 \end{verbatim} } \noindent diff --git a/src/gpu/mesh_constants_cuda.h b/src/gpu/mesh_constants_cuda.h index 9358a805b..cf6221bb3 100644 --- a/src/gpu/mesh_constants_cuda.h +++ b/src/gpu/mesh_constants_cuda.h @@ -85,6 +85,10 @@ static inline void print_CUDA_error_if_any(cudaError_t err, int num) { #if CUSTOM_REAL == 4 // textures +// textures +// note: texture templates are supported only for CUDA versions <= 11.x +// 
since CUDA 12.x, these are deprecated and texture objects should be used instead +// see: https://developer.nvidia.com/blog/cuda-pro-tip-kepler-texture-objects-improve-performance-and-flexibility/ #if defined(USE_TEXTURES_FIELDS) || defined(USE_TEXTURES_CONSTANTS) typedef texture realw_texture; #endif @@ -95,6 +99,10 @@ typedef float* __restrict__ realw_p; // otherwise use: //typedef float* realw_p; #elif CUSTOM_REAL == 8 // textures +// textures +// note: texture templates are supported only for CUDA versions <= 11.x +// since CUDA 12.x, these are deprecated and texture objects should be used instead +// see: https://developer.nvidia.com/blog/cuda-pro-tip-kepler-texture-objects-improve-performance-and-flexibility/ #if defined(USE_TEXTURES_FIELDS) || defined(USE_TEXTURES_CONSTANTS) typedef texture realw_texture; #endif diff --git a/src/gpu/mesh_constants_gpu.h b/src/gpu/mesh_constants_gpu.h index b40aff45d..af126018d 100644 --- a/src/gpu/mesh_constants_gpu.h +++ b/src/gpu/mesh_constants_gpu.h @@ -374,6 +374,14 @@ typedef double realw; //#define CUDA_SHARED_ASYNC #endif +#ifdef GPU_DEVICE_Hopper +// specifics see: https://docs.nvidia.com/cuda/hopper-tuning-guide/index.html +// register file size 64k 32-bit registers per SM +// shared memory size 228KB per SM (maximum shared memory, 227KB per thread block) +// maximum registers 255 per thread +#undef USE_LAUNCH_BOUNDS +#endif + // CUDA Graphs #if defined (__CUDACC_VER_MAJOR__) && (__CUDACC_VER_MAJOR__ >= 10) // CUDA graphs: (experimental feature) requires compilation with CUDA toolkit versions >= 10.0 diff --git a/src/gpu/rules.mk b/src/gpu/rules.mk index c9487c0f1..6a858f6db 100644 --- a/src/gpu/rules.mk +++ b/src/gpu/rules.mk @@ -153,7 +153,9 @@ ifeq ($(CUDA),yes) ifeq ($(CUDA11),yes) BUILD_VERSION_TXT += (v11) endif - + ifeq ($(CUDA12),yes) + BUILD_VERSION_TXT += (v12) + endif endif ifeq ($(GPU_CUDA_AND_OCL),yes) From 61a8fcbf3ad6280ec836a690388b1beb7ee0dac5 Mon Sep 17 00:00:00 2001 From: Daniel Peter Date: Tue, 
30 May 2023 08:56:00 +0200 Subject: [PATCH 06/11] updates element size in regular point gridding --- src/specfem3D/locate_regular_points.f90 | 25 ++++++++++++++++++------- 1 file changed, 18 insertions(+), 7 deletions(-) diff --git a/src/specfem3D/locate_regular_points.f90 b/src/specfem3D/locate_regular_points.f90 index f4511b0eb..b77bbcfff 100644 --- a/src/specfem3D/locate_regular_points.f90 +++ b/src/specfem3D/locate_regular_points.f90 @@ -188,7 +188,8 @@ subroutine locate_regular_points(npoints_slice_reg,points_slice_reg,GRID, & use shared_parameters, only: R_PLANET - use specfem_par, only: myrank, NEX_XI + use specfem_par, only: myrank, & + NCHUNKS_VAL,NEX_XI_VAL,NEX_ETA_VAL,ANGULAR_WIDTH_XI_IN_DEGREES_VAL,ANGULAR_WIDTH_ETA_IN_DEGREES_VAL use specfem_par_crustmantle, only: kl_reg_grid_variables @@ -222,13 +223,14 @@ subroutine locate_regular_points(npoints_slice_reg,points_slice_reg,GRID, & integer :: ispec_in, ispec, iter_loop, ia, ipoint double precision :: lat, lon, radius, th, ph, x,y,z double precision :: x_target, y_target, z_target - double precision :: distmin_squared,dist_squared,typical_size_squared + double precision :: distmin_squared,dist_squared + double precision :: typical_size_squared,element_size double precision :: xi,eta,gamma,dx,dy,dz,dxi,deta,dgamma double precision :: xix,xiy,xiz double precision :: etax,etay,etaz double precision :: gammax,gammay,gammaz - - logical locate_target + double precision :: ANGULAR_WIDTH_XI_RAD,ANGULAR_WIDTH_ETA_RAD + logical :: locate_target double precision, dimension(NGNOD) :: xelm, yelm, zelm double precision, dimension(NGLLX) :: hxir @@ -240,10 +242,19 @@ subroutine locate_regular_points(npoints_slice_reg,points_slice_reg,GRID, & call hex_nodes_anchor_ijk(anchor_iax,anchor_iay,anchor_iaz) ! compute typical size of elements at the surface - typical_size_squared = TWO_PI * R_UNIT_SPHERE / (4.0 * NEX_XI) + ! (normalized) + if (NCHUNKS_VAL == 6) then + ! 
estimation for global meshes (assuming 90-degree chunks) + element_size = TWO_PI * R_UNIT_SPHERE / (4.d0 * NEX_XI_VAL) + else + ! estimation for 1-chunk meshes + ANGULAR_WIDTH_XI_RAD = ANGULAR_WIDTH_XI_IN_DEGREES_VAL * DEGREES_TO_RADIANS + ANGULAR_WIDTH_ETA_RAD = ANGULAR_WIDTH_ETA_IN_DEGREES_VAL * DEGREES_TO_RADIANS + element_size = max( ANGULAR_WIDTH_XI_RAD/NEX_XI_VAL,ANGULAR_WIDTH_ETA_RAD/NEX_ETA_VAL ) * R_UNIT_SPHERE + endif - ! use 10 times the distance as a criterion for source detection - typical_size_squared = (10.0 * typical_size_squared)**2 + ! use 10 times the distance as a criterion for point detections + typical_size_squared = (10.d0 * element_size)**2 do ipoint = 1, npoints_slice_reg isp = points_slice_reg(ipoint) From 43161bf2dabee72cb8b25d0f0eb2f2ba9f5c9337 Mon Sep 17 00:00:00 2001 From: Daniel Peter Date: Tue, 30 May 2023 12:58:06 +0200 Subject: [PATCH 07/11] updates parameter usage --- src/create_header_file/create_header_file.f90 | 1 - src/meshfem3D/gravity_integrals.F90 | 7 ++++++- src/meshfem3D/meshfem3D_par.f90 | 13 ++++++++++++- src/meshfem3D/write_AVS_DX_output_adios.f90 | 2 +- 4 files changed, 19 insertions(+), 4 deletions(-) diff --git a/src/create_header_file/create_header_file.f90 b/src/create_header_file/create_header_file.f90 index 75188f9f8..9a3c42a34 100644 --- a/src/create_header_file/create_header_file.f90 +++ b/src/create_header_file/create_header_file.f90 @@ -31,7 +31,6 @@ program xcreate_header_file use shared_parameters - use constants implicit none diff --git a/src/meshfem3D/gravity_integrals.F90 b/src/meshfem3D/gravity_integrals.F90 index 28cb2980f..ab0bdb345 100644 --- a/src/meshfem3D/gravity_integrals.F90 +++ b/src/meshfem3D/gravity_integrals.F90 @@ -35,7 +35,11 @@ subroutine gravity_initialize_integrals() ! 
initializes integrals - use constants + use constants, only: myrank,IMAIN,ZERO,CUSTOM_REAL,SIZE_DOUBLE, & + GRAVITY_INTEGRALS,REUSE_EXISTING_OBSERVATION_SURF, & + ONLY_COMPUTE_CENTER_OF_MASS,COMPUTE_CRUST_CONTRIB_ONLY,SHIFT_TO_THIS_CENTER_OF_MASS, & + NX_OBSERVATION,NY_OBSERVATION, & + x_shift,y_shift,z_shift use meshfem_par, only: g_x,g_y,g_z,G_xx,G_yy,G_zz,G_xy,G_xz,G_yz @@ -518,6 +522,7 @@ end subroutine gravity_compute_integrals subroutine finalize_gravity_integrals() + use constants, only: SI_UNITS_TO_EOTVOS,IXR,IYR,ICHUNKR,ONLY_COMPUTE_CENTER_OF_MASS use meshfem_par use meshfem_models_par diff --git a/src/meshfem3D/meshfem3D_par.f90 b/src/meshfem3D/meshfem3D_par.f90 index 7a6b0fd84..1ffd0a0b1 100644 --- a/src/meshfem3D/meshfem3D_par.f90 +++ b/src/meshfem3D/meshfem3D_par.f90 @@ -110,7 +110,18 @@ module meshfem_par ! main parameter module for specfem simulations - use constants + use constants, only: myrank,CUSTOM_REAL,IMAIN,ISTANDARD_OUTPUT,IOUT,OUTPUT_FILES_BASE, & + ZERO,TINYVAL,GRAV,R_UNIT_SPHERE,SIZE_REAL, & + NDIM,NGLLX,NGLLY,NGLLZ, & + NX_OBSERVATION,NY_OBSERVATION,NCHUNKS_MAX, & + NUMFACES_SHARED,NUMCORNERS_SHARED, & + MAX_NUM_REGIONS,NB_SQUARE_CORNERS,NB_SQUARE_EDGES_ONEDIR, & + USE_MESH_COLORING_GPU,MAX_NUMBER_OF_COLORS, & + GRAVITY_INTEGRALS, & + SUPPRESS_CRUSTAL_MESH,SUPPRESS_MOHO_STRETCHING,SUPPRESS_INTERNAL_TOPOGRAPHY, & + IREGION_CRUST_MANTLE,IREGION_OUTER_CORE,IREGION_INNER_CORE,IFLAG_IN_FICTITIOUS_CUBE, & + THREE_D_MODEL_S362ANI,THREE_D_MODEL_S362WMANI,THREE_D_MODEL_S362ANI_PREM,THREE_D_MODEL_S29EA, & + THREE_D_MODEL_MANTLE_SH,THREE_D_MODEL_SPIRAL use shared_parameters diff --git a/src/meshfem3D/write_AVS_DX_output_adios.f90 b/src/meshfem3D/write_AVS_DX_output_adios.f90 index 6f33dcf36..aad362424 100644 --- a/src/meshfem3D/write_AVS_DX_output_adios.f90 +++ b/src/meshfem3D/write_AVS_DX_output_adios.f90 @@ -35,7 +35,7 @@ subroutine write_AVS_DX_output_adios(npointot,iregion_code, & myrank,NGLLX,NGLLY,NGLLZ, & 
RICB,RCMB,RTOPDDOUBLEPRIME,R670,R220,R771,R400,R120,R80,RMOHO, & RMIDDLE_CRUST, & - LOCAL_PATH,IMAIN,ADIOS_TRANSPORT_METHOD + LOCAL_PATH,IMAIN use meshfem_models_par, only: & ELLIPTICITY,MODEL_3D_MANTLE_PERTUBATIONS, & From ab38c310fe32b82b2f45522e38da9766a614ea54 Mon Sep 17 00:00:00 2001 From: Daniel Peter Date: Wed, 31 May 2023 14:04:03 +0200 Subject: [PATCH 08/11] fixes compilation warnings --- src/meshfem3D/model_sh_mars.f90 | 1 + src/specfem3D/prepare_optimized_arrays.F90 | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/src/meshfem3D/model_sh_mars.f90 b/src/meshfem3D/model_sh_mars.f90 index f155b4b61..fab8b0afe 100644 --- a/src/meshfem3D/model_sh_mars.f90 +++ b/src/meshfem3D/model_sh_mars.f90 @@ -222,6 +222,7 @@ subroutine read_SH_mars_model() ! parameter type ! converts all string characters to lowercase (to make user input case-insensitive) + irange = iachar('a') - iachar('A') do i = 1,len_trim(substring) if (lge(substring(i:i),'A') .and. lle(substring(i:i),'Z')) then substring(i:i) = achar(iachar(substring(i:i)) + irange) diff --git a/src/specfem3D/prepare_optimized_arrays.F90 b/src/specfem3D/prepare_optimized_arrays.F90 index 63e0e6538..de08bd23e 100644 --- a/src/specfem3D/prepare_optimized_arrays.F90 +++ b/src/specfem3D/prepare_optimized_arrays.F90 @@ -78,7 +78,7 @@ subroutine prepare_timerun_ibool_inv_tbl() ! local parameters integer :: iphase,ier integer :: num_elements - integer,dimension(1) :: idummy + integer,dimension(1) :: idummy = (/ 0 /) ! inverse arrays use 1D indexing for better compiler vectorization ! 
only used for Deville routines and FORCE_VECTORIZATION) From 71f9313ad1316d357af78d3cfb7956c111bfaaa1 Mon Sep 17 00:00:00 2001 From: Daniel Peter Date: Thu, 1 Jun 2023 09:17:00 +0200 Subject: [PATCH 09/11] avoids inlining mxm5_* routines in compute forces calls by Cray compilers (due to problems w/ -O3 optimization) --- .../compute_forces_crust_mantle_Dev.F90 | 24 +- .../compute_forces_inner_core_Dev.F90 | 36 +- .../compute_forces_outer_core_Dev.F90 | 498 ++++++++++-------- 3 files changed, 322 insertions(+), 236 deletions(-) diff --git a/src/specfem3D/compute_forces_crust_mantle_Dev.F90 b/src/specfem3D/compute_forces_crust_mantle_Dev.F90 index 7a7f1c6f3..c0403cdb7 100644 --- a/src/specfem3D/compute_forces_crust_mantle_Dev.F90 +++ b/src/specfem3D/compute_forces_crust_mantle_Dev.F90 @@ -564,7 +564,11 @@ subroutine mxm5_3comp_singleA(A,n1,B1,B2,B3,C1,C2,C3,n3) !DIR$ ATTRIBUTES FORCEINLINE :: mxm5_3comp_singleA #else ! cray -!DIR$ INLINEALWAYS mxm5_3comp_singleA +! note: with Cray Fortran versions >= 14 on Frontier, inlining this routine together with optimization -O3 leads to problems. +! for now, will avoid inlining by this directive INLINENEVER to allow for default compilation, +! otherwise the compilation flag -hipa0 would need to be added to suppress all inlining as well. +!!DIR$ INLINEALWAYS mxm5_3comp_singleA +!DIR$ INLINENEVER mxm5_3comp_singleA #endif ! 3 different arrays for x/y/z-components, 2-dimensional arrays (25,5)/(5,25), same B matrix for all 3 component arrays @@ -603,7 +607,9 @@ subroutine mxm5_3comp_singleA(A,n1,B1,B2,B3,C1,C2,C3,n3) ! matrix-matrix multiplication do j = 1,n3 !DIR$ IVDEP +#if defined __INTEL_COMPILER !DIR$ SIMD +#endif do i = 1,n1 C1(i,j) = A(i,1) * B1(1,j) & + A(i,2) * B1(2,j) & @@ -637,7 +643,11 @@ subroutine mxm5_3comp_singleB(A1,A2,A3,n1,B,C1,C2,C3,n3) !DIR$ ATTRIBUTES FORCEINLINE :: mxm5_3comp_singleB #else ! cray -!DIR$ INLINEALWAYS mxm5_3comp_singleB +! 
note: with Cray Fortran versions >= 14 on Frontier, inlining this routine together with optimization -O3 leads to problems. +! for now, will avoid inlining by this directive INLINENEVER to allow for default compilation, +! otherwise the compilation flag -hipa0 would need to be added to suppress all inlining as well. +!!DIR$ INLINEALWAYS mxm5_3comp_singleB +!DIR$ INLINENEVER mxm5_3comp_singleB #endif ! 3 different arrays for x/y/z-components, 2-dimensional arrays (25,5)/(5,25), same B matrix for all 3 component arrays @@ -676,7 +686,9 @@ subroutine mxm5_3comp_singleB(A1,A2,A3,n1,B,C1,C2,C3,n3) ! matrix-matrix multiplication do j = 1,n3 !DIR$ IVDEP +#if defined __INTEL_COMPILER !DIR$ SIMD +#endif do i = 1,n1 C1(i,j) = A1(i,1) * B(1,j) & + A1(i,2) * B(2,j) & @@ -710,7 +722,11 @@ subroutine mxm5_3comp_3dmat_singleB(A1,A2,A3,n1,B,n2,C1,C2,C3,n3) !DIR$ ATTRIBUTES FORCEINLINE :: mxm5_3comp_3dmat_singleB #else ! cray -!DIR$ INLINEALWAYS mxm5_3comp_3dmat_singleB +! note: with Cray Fortran versions >= 14 on Frontier, inlining this routine together with optimization -O3 leads to problems. +! for now, will avoid inlining by this directive INLINENEVER to allow for default compilation, +! otherwise the compilation flag -hipa0 would need to be added to suppress all inlining as well. +!!DIR$ INLINEALWAYS mxm5_3comp_3dmat_singleB +!DIR$ INLINENEVER mxm5_3comp_3dmat_singleB #endif ! 
3 different arrays for x/y/z-components, 3-dimensional arrays (5,5,5), same B matrix for all 3 component arrays @@ -774,7 +790,9 @@ subroutine mxm5_3comp_3dmat_singleB(A1,A2,A3,n1,B,n2,C1,C2,C3,n3) do k = 1,n3 do j = 1,n2 !DIR$ IVDEP +#if defined __INTEL_COMPILER !DIR$ SIMD +#endif do i = 1,n1 C1(i,j,k) = A1(i,1,k) * B(1,j) & + A1(i,2,k) * B(2,j) & diff --git a/src/specfem3D/compute_forces_inner_core_Dev.F90 b/src/specfem3D/compute_forces_inner_core_Dev.F90 index b96992499..833ad2d9d 100644 --- a/src/specfem3D/compute_forces_inner_core_Dev.F90 +++ b/src/specfem3D/compute_forces_inner_core_Dev.F90 @@ -431,7 +431,11 @@ subroutine mxm5_3comp_singleA(A,n1,B1,B2,B3,C1,C2,C3,n3) !DIR$ ATTRIBUTES FORCEINLINE :: mxm5_3comp_singleA #else ! cray -!DIR$ INLINEALWAYS mxm5_3comp_singleA +! note: with Cray Fortran versions >= 14 on Frontier, inlining this routine together with optimization -O3 leads to problems. +! for now, will avoid inlining by this directive INLINENEVER to allow for default compilation, +! otherwise the compilation flag -hipa0 would need to be added to suppress all inlining as well. +!!DIR$ INLINEALWAYS mxm5_3comp_singleA +!DIR$ INLINENEVER mxm5_3comp_singleA #endif ! 3 different arrays for x/y/z-components, 2-dimensional arrays (25,5)/(5,25), same B matrix for all 3 component arrays @@ -469,8 +473,10 @@ subroutine mxm5_3comp_singleA(A,n1,B1,B2,B3,C1,C2,C3,n3) ! matrix-matrix multiplication do j = 1,n3 -!dir$ ivdep -!dir$ SIMD +!DIR$ IVDEP +#if defined __INTEL_COMPILER +!DIR$ SIMD +#endif do i = 1,n1 C1(i,j) = A(i,1) * B1(1,j) & + A(i,2) * B1(2,j) & @@ -504,7 +510,11 @@ subroutine mxm5_3comp_singleB(A1,A2,A3,n1,B,C1,C2,C3,n3) !DIR$ ATTRIBUTES FORCEINLINE :: mxm5_3comp_singleB #else ! cray -!DIR$ INLINEALWAYS mxm5_3comp_singleB +! note: with Cray Fortran versions >= 14 on Frontier, inlining this routine together with optimization -O3 leads to problems. +! for now, will avoid inlining by this directive INLINENEVER to allow for default compilation, +! 
otherwise the compilation flag -hipa0 would need to be added to suppress all inlining as well. +!!DIR$ INLINEALWAYS mxm5_3comp_singleB +!DIR$ INLINENEVER mxm5_3comp_singleB #endif ! 3 different arrays for x/y/z-components, 2-dimensional arrays (25,5)/(5,25), same B matrix for all 3 component arrays @@ -542,8 +552,10 @@ subroutine mxm5_3comp_singleB(A1,A2,A3,n1,B,C1,C2,C3,n3) ! matrix-matrix multiplication do j = 1,n3 -!dir$ ivdep -!dir$ SIMD +!DIR$ IVDEP +#if defined __INTEL_COMPILER +!DIR$ SIMD +#endif do i = 1,n1 C1(i,j) = A1(i,1) * B(1,j) & + A1(i,2) * B(2,j) & @@ -577,7 +589,11 @@ subroutine mxm5_3comp_3dmat_singleB(A1,A2,A3,n1,B,n2,C1,C2,C3,n3) !DIR$ ATTRIBUTES FORCEINLINE :: mxm5_3comp_3dmat_singleB #else ! cray -!DIR$ INLINEALWAYS mxm5_3comp_3dmat_singleB +! note: with Cray Fortran versions >= 14 on Frontier, inlining this routine together with optimization -O3 leads to problems. +! for now, will avoid inlining by this directive INLINENEVER to allow for default compilation, +! otherwise the compilation flag -hipa0 would need to be added to suppress all inlining as well. +!!DIR$ INLINEALWAYS mxm5_3comp_3dmat_singleB +!DIR$ INLINENEVER mxm5_3comp_3dmat_singleB #endif ! 3 different arrays for x/y/z-components, 3-dimensional arrays (5,5,5), same B matrix for all 3 component arrays @@ -640,8 +656,10 @@ subroutine mxm5_3comp_3dmat_singleB(A1,A2,A3,n1,B,n2,C1,C2,C3,n3) ! matrix-matrix multiplication do k = 1,n3 do j = 1,n2 -!dir$ ivdep -!dir$ SIMD +!DIR$ IVDEP +#if defined __INTEL_COMPILER +!DIR$ SIMD +#endif do i = 1,n1 C1(i,j,k) = A1(i,1,k) * B(1,j) & + A1(i,2,k) * B(2,j) & diff --git a/src/specfem3D/compute_forces_outer_core_Dev.F90 b/src/specfem3D/compute_forces_outer_core_Dev.F90 index 709b11daa..216819a23 100644 --- a/src/specfem3D/compute_forces_outer_core_Dev.F90 +++ b/src/specfem3D/compute_forces_outer_core_Dev.F90 @@ -211,26 +211,32 @@ subroutine compute_forces_outer_core_Dev(timeval,deltat,two_omega_earth, & ! computes 1. 
matrix multiplication for temp1 ! computes 2. matrix multiplication for temp2 ! computes 3. matrix multiplication for temp3 - select case (NGLLX) - case (5) - call mxm5_single(hprime_xx,m1,chi_elem,temp1,m2) - call mxm5_3dmat_single(chi_elem,m1,hprime_xxT,m1,temp2,NGLLX) - call mxm5_single(chi_elem,m2,hprime_xxT,temp3,m1) - case (6) - call mxm6_single(hprime_xx,m1,chi_elem,temp1,m2) - call mxm6_3dmat_single(chi_elem,m1,hprime_xxT,m1,temp2,NGLLX) - call mxm6_single(chi_elem,m2,hprime_xxT,temp3,m1) - case (7) - call mxm7_single(hprime_xx,m1,chi_elem,temp1,m2) - call mxm7_3dmat_single(chi_elem,m1,hprime_xxT,m1,temp2,NGLLX) - call mxm7_single(chi_elem,m2,hprime_xxT,temp3,m1) - case (8) - call mxm8_single(hprime_xx,m1,chi_elem,temp1,m2) - call mxm8_3dmat_single(chi_elem,m1,hprime_xxT,m1,temp2,NGLLX) - call mxm8_single(chi_elem,m2,hprime_xxT,temp3,m1) - end select - - + call mxm5_single(hprime_xx,m1,chi_elem,temp1,m2) + call mxm5_3dmat_single(chi_elem,m1,hprime_xxT,m1,temp2,NGLLX) + call mxm5_single(chi_elem,m2,hprime_xxT,temp3,m1) + + ! note: this compute_forces_outer_core_Dev() routine is called for USE_DEVILLE_PRODUCTS_VAL == .true. + ! which is only the case for NGLLX == NGLLY == NGLLZ == 5 + ! + ! for more general cases one could do the following: + !select case (NGLLX) + !case (5) + ! call mxm5_single(hprime_xx,m1,chi_elem,temp1,m2) + ! call mxm5_3dmat_single(chi_elem,m1,hprime_xxT,m1,temp2,NGLLX) + ! call mxm5_single(chi_elem,m2,hprime_xxT,temp3,m1) + !case (6) + ! call mxm6_single(hprime_xx,m1,chi_elem,temp1,m2) + ! call mxm6_3dmat_single(chi_elem,m1,hprime_xxT,m1,temp2,NGLLX) + ! call mxm6_single(chi_elem,m2,hprime_xxT,temp3,m1) + !case (7) + ! call mxm7_single(hprime_xx,m1,chi_elem,temp1,m2) + ! call mxm7_3dmat_single(chi_elem,m1,hprime_xxT,m1,temp2,NGLLX) + ! call mxm7_single(chi_elem,m2,hprime_xxT,temp3,m1) + !case (8) + ! call mxm8_single(hprime_xx,m1,chi_elem,temp1,m2) + ! call mxm8_3dmat_single(chi_elem,m1,hprime_xxT,m1,temp2,NGLLX) + ! 
call mxm8_single(chi_elem,m2,hprime_xxT,temp3,m1) + !end select DO_LOOP_IJK ! get derivatives of potential with respect to x, y and z @@ -391,24 +397,32 @@ subroutine compute_forces_outer_core_Dev(timeval,deltat,two_omega_earth, & ! computes 1. matrix multiplication for newtemp1 ! computes 2. matrix multiplication for newtemp2 ! computes 3. matrix multiplication for newtemp3 - select case (NGLLX) - case (5) - call mxm5_single(hprimewgll_xxT,m1,temp1,newtemp1,m2) - call mxm5_3dmat_single(temp2,m1,hprimewgll_xx,m1,newtemp2,NGLLX) - call mxm5_single(temp3,m2,hprimewgll_xx,newtemp3,m1) - case (6) - call mxm6_single(hprimewgll_xxT,m1,temp1,newtemp1,m2) - call mxm6_3dmat_single(temp2,m1,hprimewgll_xx,m1,newtemp2,NGLLX) - call mxm6_single(temp3,m2,hprimewgll_xx,newtemp3,m1) - case (7) - call mxm7_single(hprimewgll_xxT,m1,temp1,newtemp1,m2) - call mxm7_3dmat_single(temp2,m1,hprimewgll_xx,m1,newtemp2,NGLLX) - call mxm7_single(temp3,m2,hprimewgll_xx,newtemp3,m1) - case (8) - call mxm8_single(hprimewgll_xxT,m1,temp1,newtemp1,m2) - call mxm8_3dmat_single(temp2,m1,hprimewgll_xx,m1,newtemp2,NGLLX) - call mxm8_single(temp3,m2,hprimewgll_xx,newtemp3,m1) - end select + call mxm5_single(hprimewgll_xxT,m1,temp1,newtemp1,m2) + call mxm5_3dmat_single(temp2,m1,hprimewgll_xx,m1,newtemp2,NGLLX) + call mxm5_single(temp3,m2,hprimewgll_xx,newtemp3,m1) + + ! note: this compute_forces_outer_core_Dev() routine is called for USE_DEVILLE_PRODUCTS_VAL == .true. + ! which is only the case for NGLLX == NGLLY == NGLLZ == 5 + ! + ! for more general cases one could do the following: + !select case (NGLLX) + !case (5) + ! call mxm5_single(hprimewgll_xxT,m1,temp1,newtemp1,m2) + ! call mxm5_3dmat_single(temp2,m1,hprimewgll_xx,m1,newtemp2,NGLLX) + ! call mxm5_single(temp3,m2,hprimewgll_xx,newtemp3,m1) + !case (6) + ! call mxm6_single(hprimewgll_xxT,m1,temp1,newtemp1,m2) + ! call mxm6_3dmat_single(temp2,m1,hprimewgll_xx,m1,newtemp2,NGLLX) + ! 
call mxm6_single(temp3,m2,hprimewgll_xx,newtemp3,m1) + !case (7) + ! call mxm7_single(hprimewgll_xxT,m1,temp1,newtemp1,m2) + ! call mxm7_3dmat_single(temp2,m1,hprimewgll_xx,m1,newtemp2,NGLLX) + ! call mxm7_single(temp3,m2,hprimewgll_xx,newtemp3,m1) + !case (8) + ! call mxm8_single(hprimewgll_xxT,m1,temp1,newtemp1,m2) + ! call mxm8_3dmat_single(temp2,m1,hprimewgll_xx,m1,newtemp2,NGLLX) + ! call mxm8_single(temp3,m2,hprimewgll_xx,newtemp3,m1) + !end select ! sum contributions from each element to the global mesh and add gravity term DO_LOOP_IJK @@ -500,6 +514,18 @@ subroutine compute_forces_outer_core_Dev(timeval,deltat,two_omega_earth, & subroutine mxm5_single(A,n1,B,C,n3) +! we can force inlining (Intel compiler) +#if defined __INTEL_COMPILER +!DIR$ ATTRIBUTES FORCEINLINE :: mxm5_single +#else +! cray +! note: with Cray Fortran versions >= 14 on Frontier, inlining this routine together with optimization -O3 leads to problems. +! for now, will avoid inlining by this directive INLINENEVER to allow for default compilation, +! otherwise the compilation flag -hipa0 would need to be added to suppress all inlining as well. +!!DIR$ INLINEALWAYS mxm5_single +!DIR$ INLINENEVER mxm5_single +#endif + ! 2-dimensional arrays (25,5)/(5,25) use constants_solver, only: CUSTOM_REAL @@ -554,107 +580,125 @@ end subroutine mxm5_single !------------- - subroutine mxm6_single(A,n1,B,C,n3) - -! two-dimensional arrays (36,6)/(6,36) - - use constants, only: CUSTOM_REAL - - implicit none - - integer,intent(in) :: n1,n3 - real(kind=CUSTOM_REAL),dimension(n1,6),intent(in) :: A - real(kind=CUSTOM_REAL),dimension(6,n3),intent(in) :: B - real(kind=CUSTOM_REAL),dimension(n1,n3),intent(out) :: C - - ! local parameters - integer :: i,j - - ! matrix-matrix multiplication - do j = 1,n3 - do i = 1,n1 - C(i,j) = A(i,1) * B(1,j) & - + A(i,2) * B(2,j) & - + A(i,3) * B(3,j) & - + A(i,4) * B(4,j) & - + A(i,5) * B(5,j) & - + A(i,6) * B(6,j) - enddo - enddo - - end subroutine mxm6_single +! unused so far.. +! 
+! subroutine mxm6_single(A,n1,B,C,n3) +! +!! two-dimensional arrays (36,6)/(6,36) +! +! use constants, only: CUSTOM_REAL +! +! implicit none +! +! integer,intent(in) :: n1,n3 +! real(kind=CUSTOM_REAL),dimension(n1,6),intent(in) :: A +! real(kind=CUSTOM_REAL),dimension(6,n3),intent(in) :: B +! real(kind=CUSTOM_REAL),dimension(n1,n3),intent(out) :: C +! +! ! local parameters +! integer :: i,j +! +! ! matrix-matrix multiplication +! do j = 1,n3 +! do i = 1,n1 +! C(i,j) = A(i,1) * B(1,j) & +! + A(i,2) * B(2,j) & +! + A(i,3) * B(3,j) & +! + A(i,4) * B(4,j) & +! + A(i,5) * B(5,j) & +! + A(i,6) * B(6,j) +! enddo +! enddo +! +! end subroutine mxm6_single !------------- - subroutine mxm7_single(A,n1,B,C,n3) - -! two-dimensional arrays (49,7)/(7,49) - - use constants, only: CUSTOM_REAL - - implicit none - - integer,intent(in) :: n1,n3 - real(kind=CUSTOM_REAL),dimension(n1,7),intent(in) :: A - real(kind=CUSTOM_REAL),dimension(7,n3),intent(in) :: B - real(kind=CUSTOM_REAL),dimension(n1,n3),intent(out) :: C - - ! local parameters - integer :: i,j - - ! matrix-matrix multiplication - do j = 1,n3 - do i = 1,n1 - C(i,j) = A(i,1) * B(1,j) & - + A(i,2) * B(2,j) & - + A(i,3) * B(3,j) & - + A(i,4) * B(4,j) & - + A(i,5) * B(5,j) & - + A(i,6) * B(6,j) & - + A(i,7) * B(7,j) - enddo - enddo - - end subroutine mxm7_single +! unused so far.. +! +! subroutine mxm7_single(A,n1,B,C,n3) +! +!! two-dimensional arrays (49,7)/(7,49) +! +! use constants, only: CUSTOM_REAL +! +! implicit none +! +! integer,intent(in) :: n1,n3 +! real(kind=CUSTOM_REAL),dimension(n1,7),intent(in) :: A +! real(kind=CUSTOM_REAL),dimension(7,n3),intent(in) :: B +! real(kind=CUSTOM_REAL),dimension(n1,n3),intent(out) :: C +! +! ! local parameters +! integer :: i,j +! +! ! matrix-matrix multiplication +! do j = 1,n3 +! do i = 1,n1 +! C(i,j) = A(i,1) * B(1,j) & +! + A(i,2) * B(2,j) & +! + A(i,3) * B(3,j) & +! + A(i,4) * B(4,j) & +! + A(i,5) * B(5,j) & +! + A(i,6) * B(6,j) & +! + A(i,7) * B(7,j) +! enddo +! enddo +! +! 
end subroutine mxm7_single !------------- - subroutine mxm8_single(A,n1,B,C,n3) - -! two-dimensional arrays (64,8)/(8,64) - - use constants, only: CUSTOM_REAL - - implicit none - - integer,intent(in) :: n1,n3 - real(kind=CUSTOM_REAL),dimension(n1,8),intent(in) :: A - real(kind=CUSTOM_REAL),dimension(8,n3),intent(in) :: B - real(kind=CUSTOM_REAL),dimension(n1,n3),intent(out) :: C - - ! local parameters - integer :: i,j - - ! matrix-matrix multiplication - do j = 1,n3 - do i = 1,n1 - C(i,j) = A(i,1) * B(1,j) & - + A(i,2) * B(2,j) & - + A(i,3) * B(3,j) & - + A(i,4) * B(4,j) & - + A(i,5) * B(5,j) & - + A(i,6) * B(6,j) & - + A(i,7) * B(7,j) & - + A(i,8) * B(8,j) - enddo - enddo - - end subroutine mxm8_single +! unused so far.. +! +! subroutine mxm8_single(A,n1,B,C,n3) +! +!! two-dimensional arrays (64,8)/(8,64) +! +! use constants, only: CUSTOM_REAL +! +! implicit none +! +! integer,intent(in) :: n1,n3 +! real(kind=CUSTOM_REAL),dimension(n1,8),intent(in) :: A +! real(kind=CUSTOM_REAL),dimension(8,n3),intent(in) :: B +! real(kind=CUSTOM_REAL),dimension(n1,n3),intent(out) :: C +! +! ! local parameters +! integer :: i,j +! +! ! matrix-matrix multiplication +! do j = 1,n3 +! do i = 1,n1 +! C(i,j) = A(i,1) * B(1,j) & +! + A(i,2) * B(2,j) & +! + A(i,3) * B(3,j) & +! + A(i,4) * B(4,j) & +! + A(i,5) * B(5,j) & +! + A(i,6) * B(6,j) & +! + A(i,7) * B(7,j) & +! + A(i,8) * B(8,j) +! enddo +! enddo +! +! end subroutine mxm8_single !-------------------------------------------------------------------------------------------- subroutine mxm5_3dmat_single(A,n1,B,n2,C,n3) +! we can force inlining (Intel compiler) +#if defined __INTEL_COMPILER +!DIR$ ATTRIBUTES FORCEINLINE :: mxm5_3dmat_single +#else +! cray +! note: with Cray Fortran versions >= 14 on Frontier, inlining this routine together with optimization -O3 leads to problems. +! for now, will avoid inlining by this directive INLINENEVER to allow for default compilation, +! 
otherwise the compilation flag -hipa0 would need to be added to suppress all inlining as well. +!!DIR$ INLINEALWAYS mxm5_3dmat_single +!DIR$ INLINENEVER mxm5_3dmat_single +#endif + ! 3-dimensional arrays (5,5,5) for A and C use constants_solver, only: CUSTOM_REAL @@ -714,108 +758,114 @@ end subroutine mxm5_3dmat_single !------------- - subroutine mxm6_3dmat_single(A,n1,B,n2,C,n3) - -! three-dimensional arrays (6,6,6) for A and C - - use constants, only: CUSTOM_REAL - - implicit none - - integer,intent(in) :: n1,n2,n3 - real(kind=CUSTOM_REAL),dimension(n1,6,n3),intent(in) :: A - real(kind=CUSTOM_REAL),dimension(6,n2),intent(in) :: B - real(kind=CUSTOM_REAL),dimension(n1,n2,n3),intent(out) :: C - - ! local parameters - integer :: i,j,k - - ! matrix-matrix multiplication - do k = 1,n3 - do j = 1,n2 - do i = 1,n1 - C(i,j,k) = A(i,1,k) * B(1,j) & - + A(i,2,k) * B(2,j) & - + A(i,3,k) * B(3,j) & - + A(i,4,k) * B(4,j) & - + A(i,5,k) * B(5,j) & - + A(i,6,k) * B(6,j) - enddo - enddo - enddo - - end subroutine mxm6_3dmat_single +! unused so far.. +! +! subroutine mxm6_3dmat_single(A,n1,B,n2,C,n3) +! +!! three-dimensional arrays (6,6,6) for A and C +! +! use constants, only: CUSTOM_REAL +! +! implicit none +! +! integer,intent(in) :: n1,n2,n3 +! real(kind=CUSTOM_REAL),dimension(n1,6,n3),intent(in) :: A +! real(kind=CUSTOM_REAL),dimension(6,n2),intent(in) :: B +! real(kind=CUSTOM_REAL),dimension(n1,n2,n3),intent(out) :: C +! +! ! local parameters +! integer :: i,j,k +! +! ! matrix-matrix multiplication +! do k = 1,n3 +! do j = 1,n2 +! do i = 1,n1 +! C(i,j,k) = A(i,1,k) * B(1,j) & +! + A(i,2,k) * B(2,j) & +! + A(i,3,k) * B(3,j) & +! + A(i,4,k) * B(4,j) & +! + A(i,5,k) * B(5,j) & +! + A(i,6,k) * B(6,j) +! enddo +! enddo +! enddo +! +! end subroutine mxm6_3dmat_single !------------- - subroutine mxm7_3dmat_single(A,n1,B,n2,C,n3) - -! 
three-dimensional arrays (7,7,7) for A and C - - use constants, only: CUSTOM_REAL - - implicit none - - integer,intent(in) :: n1,n2,n3 - real(kind=CUSTOM_REAL),dimension(n1,7,n3),intent(in) :: A - real(kind=CUSTOM_REAL),dimension(7,n2),intent(in) :: B - real(kind=CUSTOM_REAL),dimension(n1,n2,n3),intent(out) :: C - - ! local parameters - integer :: i,j,k - - ! matrix-matrix multiplication - do k = 1,n3 - do j = 1,n2 - do i = 1,n1 - C(i,j,k) = A(i,1,k) * B(1,j) & - + A(i,2,k) * B(2,j) & - + A(i,3,k) * B(3,j) & - + A(i,4,k) * B(4,j) & - + A(i,5,k) * B(5,j) & - + A(i,6,k) * B(6,j) & - + A(i,7,k) * B(7,j) - enddo - enddo - enddo - - end subroutine mxm7_3dmat_single +! unused so far.. +! +! subroutine mxm7_3dmat_single(A,n1,B,n2,C,n3) +! +!! three-dimensional arrays (7,7,7) for A and C +! +! use constants, only: CUSTOM_REAL +! +! implicit none +! +! integer,intent(in) :: n1,n2,n3 +! real(kind=CUSTOM_REAL),dimension(n1,7,n3),intent(in) :: A +! real(kind=CUSTOM_REAL),dimension(7,n2),intent(in) :: B +! real(kind=CUSTOM_REAL),dimension(n1,n2,n3),intent(out) :: C +! +! ! local parameters +! integer :: i,j,k +! +! ! matrix-matrix multiplication +! do k = 1,n3 +! do j = 1,n2 +! do i = 1,n1 +! C(i,j,k) = A(i,1,k) * B(1,j) & +! + A(i,2,k) * B(2,j) & +! + A(i,3,k) * B(3,j) & +! + A(i,4,k) * B(4,j) & +! + A(i,5,k) * B(5,j) & +! + A(i,6,k) * B(6,j) & +! + A(i,7,k) * B(7,j) +! enddo +! enddo +! enddo +! +! end subroutine mxm7_3dmat_single !------------- - subroutine mxm8_3dmat_single(A,n1,B,n2,C,n3) - -! three-dimensional arrays (8,8,8) for A and C - - use constants, only: CUSTOM_REAL - - implicit none - - integer,intent(in) :: n1,n2,n3 - real(kind=CUSTOM_REAL),dimension(n1,8,n3),intent(in) :: A - real(kind=CUSTOM_REAL),dimension(8,n2),intent(in) :: B - real(kind=CUSTOM_REAL),dimension(n1,n2,n3),intent(out) :: C - - ! local parameters - integer :: i,j,k - - ! 
matrix-matrix multiplication - do k = 1,n3 - do j = 1,n2 - do i = 1,n1 - C(i,j,k) = A(i,1,k) * B(1,j) & - + A(i,2,k) * B(2,j) & - + A(i,3,k) * B(3,j) & - + A(i,4,k) * B(4,j) & - + A(i,5,k) * B(5,j) & - + A(i,6,k) * B(6,j) & - + A(i,7,k) * B(7,j) & - + A(i,8,k) * B(8,j) - enddo - enddo - enddo - - end subroutine mxm8_3dmat_single +! unused so far.. +! +! subroutine mxm8_3dmat_single(A,n1,B,n2,C,n3) +! +!! three-dimensional arrays (8,8,8) for A and C +! +! use constants, only: CUSTOM_REAL +! +! implicit none +! +! integer,intent(in) :: n1,n2,n3 +! real(kind=CUSTOM_REAL),dimension(n1,8,n3),intent(in) :: A +! real(kind=CUSTOM_REAL),dimension(8,n2),intent(in) :: B +! real(kind=CUSTOM_REAL),dimension(n1,n2,n3),intent(out) :: C +! +! ! local parameters +! integer :: i,j,k +! +! ! matrix-matrix multiplication +! do k = 1,n3 +! do j = 1,n2 +! do i = 1,n1 +! C(i,j,k) = A(i,1,k) * B(1,j) & +! + A(i,2,k) * B(2,j) & +! + A(i,3,k) * B(3,j) & +! + A(i,4,k) * B(4,j) & +! + A(i,5,k) * B(5,j) & +! + A(i,6,k) * B(6,j) & +! + A(i,7,k) * B(7,j) & +! + A(i,8,k) * B(8,j) +! enddo +! enddo +! enddo +! +! 
end subroutine mxm8_3dmat_single end subroutine compute_forces_outer_core_Dev From 7317e3d1a08bbc828d5f2eb7147fd0c6942575d4 Mon Sep 17 00:00:00 2001 From: Daniel Peter Date: Fri, 2 Jun 2023 16:59:57 +0200 Subject: [PATCH 10/11] updates initializations --- src/meshfem3D/create_MPI_interfaces.f90 | 133 +++++++++++++----- src/meshfem3D/create_central_cube_buffers.f90 | 2 +- src/meshfem3D/create_regions_mesh.F90 | 2 +- src/meshfem3D/fix_non_blocking_flags.f90 | 23 +-- 4 files changed, 113 insertions(+), 47 deletions(-) diff --git a/src/meshfem3D/create_MPI_interfaces.f90 b/src/meshfem3D/create_MPI_interfaces.f90 index 6de621472..ccefcf45b 100644 --- a/src/meshfem3D/create_MPI_interfaces.f90 +++ b/src/meshfem3D/create_MPI_interfaces.f90 @@ -45,11 +45,14 @@ end subroutine create_MPI_interfaces subroutine cmi_allocate_addressing(iregion_code) - use meshfem_par, only: myrank,ibool, & + use constants, only: CUSTOM_REAL,NUMCORNERS_SHARED,myrank + + use meshfem_par, only: ibool, & NGLOB2DMAX_XMIN_XMAX,NGLOB2DMAX_YMIN_YMAX, & NSPEC2DMAX_XMIN_XMAX,NSPEC2DMAX_YMIN_YMAX, & - NSPEC2D_BOTTOM,NSPEC2D_TOP,NSPEC_REGIONS,NGLOB_REGIONS, & - NGLOB1D_RADIAL,NUMCORNERS_SHARED + NSPEC2D_BOTTOM,NSPEC2D_TOP, & + NSPEC_REGIONS,NGLOB_REGIONS, & + NGLOB1D_RADIAL use MPI_interfaces_par @@ -150,38 +153,81 @@ subroutine cmi_allocate_addressing(iregion_code) allocate(buffer_send_chunkcorn_scalar(NGLOB1D_RADIAL_CM), & buffer_recv_chunkcorn_scalar(NGLOB1D_RADIAL_CM),stat=ier) if (ier /= 0) stop 'Error allocating buffer buffer_send_chunkcorn_scalar,.. arrays' + buffer_send_chunkcorn_scalar(:) = 0.0_CUSTOM_REAL + buffer_recv_chunkcorn_scalar(:) = 0.0_CUSTOM_REAL allocate(buffer_send_chunkcorn_vector(NDIM,NGLOB1D_RADIAL_CM + NGLOB1D_RADIAL_IC), & buffer_recv_chunkcorn_vector(NDIM,NGLOB1D_RADIAL_CM + NGLOB1D_RADIAL_IC),stat=ier) if (ier /= 0) stop 'Error allocating buffer buffer_send_chunkcorn_vector,.. 
arrays' + buffer_send_chunkcorn_vector(:,:) = 0.0_CUSTOM_REAL + buffer_recv_chunkcorn_vector(:,:) = 0.0_CUSTOM_REAL select case (iregion_code) case (IREGION_CRUST_MANTLE) ! crust mantle - allocate(iboolcorner_crust_mantle(NGLOB1D_RADIAL_CM,NUMCORNERS_SHARED)) + allocate(iboolcorner_crust_mantle(NGLOB1D_RADIAL_CM,NUMCORNERS_SHARED),stat=ier) + if (ier /= 0) stop 'Error allocating iboolcorner_crust_mantle array' + iboolcorner_crust_mantle(:,:) = 0 + allocate(iboolleft_xi_crust_mantle(NGLOB2DMAX_XMIN_XMAX_CM), & - iboolright_xi_crust_mantle(NGLOB2DMAX_XMIN_XMAX_CM)) + iboolright_xi_crust_mantle(NGLOB2DMAX_XMIN_XMAX_CM),stat=ier) + if (ier /= 0) stop 'Error allocating iboolcorner_crust_mantle array' + iboolleft_xi_crust_mantle(:) = 0 + iboolright_xi_crust_mantle(:) = 0 + allocate(iboolleft_eta_crust_mantle(NGLOB2DMAX_YMIN_YMAX_CM), & - iboolright_eta_crust_mantle(NGLOB2DMAX_YMIN_YMAX_CM)) - allocate(iboolfaces_crust_mantle(NGLOB2DMAX_XY,NUMFACES_SHARED)) + iboolright_eta_crust_mantle(NGLOB2DMAX_YMIN_YMAX_CM),stat=ier) + if (ier /= 0) stop 'Error allocating iboolcorner_crust_mantle array' + iboolleft_eta_crust_mantle(:) = 0 + iboolright_eta_crust_mantle(:) = 0 + + allocate(iboolfaces_crust_mantle(NGLOB2DMAX_XY,NUMFACES_SHARED),stat=ier) + if (ier /= 0) stop 'Error allocating iboolcorner_crust_mantle array' + iboolfaces_crust_mantle(:,:) = 0 case (IREGION_OUTER_CORE) ! 
outer core - allocate(iboolcorner_outer_core(NGLOB1D_RADIAL_OC,NUMCORNERS_SHARED)) + allocate(iboolcorner_outer_core(NGLOB1D_RADIAL_OC,NUMCORNERS_SHARED),stat=ier) + if (ier /= 0) stop 'Error allocating iboolcorner_crust_mantle array' + iboolcorner_outer_core(:,:) = 0 + allocate(iboolleft_xi_outer_core(NGLOB2DMAX_XMIN_XMAX_OC), & - iboolright_xi_outer_core(NGLOB2DMAX_XMIN_XMAX_OC)) + iboolright_xi_outer_core(NGLOB2DMAX_XMIN_XMAX_OC),stat=ier) + if (ier /= 0) stop 'Error allocating iboolcorner_crust_mantle array' + iboolleft_xi_outer_core(:) = 0 + iboolright_xi_outer_core(:) = 0 + allocate(iboolleft_eta_outer_core(NGLOB2DMAX_YMIN_YMAX_OC), & - iboolright_eta_outer_core(NGLOB2DMAX_YMIN_YMAX_OC)) - allocate(iboolfaces_outer_core(NGLOB2DMAX_XY,NUMFACES_SHARED)) + iboolright_eta_outer_core(NGLOB2DMAX_YMIN_YMAX_OC),stat=ier) + if (ier /= 0) stop 'Error allocating iboolcorner_crust_mantle array' + iboolleft_eta_outer_core(:) = 0 + iboolright_eta_outer_core(:) = 0 + + allocate(iboolfaces_outer_core(NGLOB2DMAX_XY,NUMFACES_SHARED),stat=ier) + if (ier /= 0) stop 'Error allocating iboolcorner_crust_mantle array' + iboolfaces_outer_core(:,:) = 0 case (IREGION_INNER_CORE) ! 
inner core - allocate(iboolcorner_inner_core(NGLOB1D_RADIAL_IC,NUMCORNERS_SHARED)) + allocate(iboolcorner_inner_core(NGLOB1D_RADIAL_IC,NUMCORNERS_SHARED),stat=ier) + if (ier /= 0) stop 'Error allocating iboolcorner_crust_mantle array' + iboolcorner_inner_core(:,:) = 0 + allocate(iboolleft_xi_inner_core(NGLOB2DMAX_XMIN_XMAX_IC), & - iboolright_xi_inner_core(NGLOB2DMAX_XMIN_XMAX_IC)) + iboolright_xi_inner_core(NGLOB2DMAX_XMIN_XMAX_IC),stat=ier) + if (ier /= 0) stop 'Error allocating iboolcorner_crust_mantle array' + iboolleft_xi_inner_core(:) = 0 + iboolright_xi_inner_core(:) = 0 + allocate(iboolleft_eta_inner_core(NGLOB2DMAX_YMIN_YMAX_IC), & - iboolright_eta_inner_core(NGLOB2DMAX_YMIN_YMAX_IC)) - allocate(iboolfaces_inner_core(NGLOB2DMAX_XY,NUMFACES_SHARED)) + iboolright_eta_inner_core(NGLOB2DMAX_YMIN_YMAX_IC),stat=ier) + if (ier /= 0) stop 'Error allocating iboolcorner_crust_mantle array' + iboolleft_eta_inner_core(:) = 0 + iboolright_eta_inner_core(:) = 0 + + allocate(iboolfaces_inner_core(NGLOB2DMAX_XY,NUMFACES_SHARED),stat=ier) + if (ier /= 0) stop 'Error allocating iboolcorner_crust_mantle array' + iboolfaces_inner_core(:,:) = 0 end select @@ -343,14 +389,14 @@ subroutine cmi_get_buffers(iregion_code) call flush_IMAIN() endif call cmi_read_buffer_data(IREGION_INNER_CORE, & - NGLOB2DMAX_XMIN_XMAX(IREGION_INNER_CORE), & - NGLOB2DMAX_YMIN_YMAX(IREGION_INNER_CORE), & - NGLOB1D_RADIAL(IREGION_INNER_CORE), & - iboolleft_xi_inner_core,iboolright_xi_inner_core, & - iboolleft_eta_inner_core,iboolright_eta_inner_core, & - npoin2D_xi_inner_core,npoin2D_eta_inner_core, & - iboolfaces_inner_core,npoin2D_faces_inner_core, & - iboolcorner_inner_core) + NGLOB2DMAX_XMIN_XMAX(IREGION_INNER_CORE), & + NGLOB2DMAX_YMIN_YMAX(IREGION_INNER_CORE), & + NGLOB1D_RADIAL(IREGION_INNER_CORE), & + iboolleft_xi_inner_core,iboolright_xi_inner_core, & + iboolleft_eta_inner_core,iboolright_eta_inner_core, & + npoin2D_xi_inner_core,npoin2D_eta_inner_core, & + 
iboolfaces_inner_core,npoin2D_faces_inner_core, & + iboolcorner_inner_core) ! central cube buffers if (INCLUDE_CENTRAL_CUBE) then @@ -364,12 +410,11 @@ subroutine cmi_get_buffers(iregion_code) ! allocates boundary indexing arrays for central cube allocate(ibelm_xmin_inner_core(NSPEC2DMAX_XMIN_XMAX_IC), & - ibelm_xmax_inner_core(NSPEC2DMAX_XMIN_XMAX_IC), & - ibelm_ymin_inner_core(NSPEC2DMAX_YMIN_YMAX_IC), & - ibelm_ymax_inner_core(NSPEC2DMAX_YMIN_YMAX_IC), & - ibelm_top_inner_core(NSPEC2D_TOP_IC), & - ibelm_bottom_inner_core(NSPEC2D_BOTTOM_IC), & - stat=ier) + ibelm_xmax_inner_core(NSPEC2DMAX_XMIN_XMAX_IC), & + ibelm_ymin_inner_core(NSPEC2DMAX_YMIN_YMAX_IC), & + ibelm_ymax_inner_core(NSPEC2DMAX_YMIN_YMAX_IC), & + ibelm_top_inner_core(NSPEC2D_TOP_IC), & + ibelm_bottom_inner_core(NSPEC2D_BOTTOM_IC),stat=ier) if (ier /= 0 ) call exit_MPI(myrank,'Error allocating central cube index arrays') ! gets coupling arrays for inner core @@ -387,8 +432,8 @@ subroutine cmi_get_buffers(iregion_code) ! compute number of messages to expect in cube as well as their size call comp_central_cube_buffer_size(iproc_xi,iproc_eta,ichunk, & - NPROC_XI,NPROC_ETA,NSPEC2D_BOTTOM(IREGION_INNER_CORE), & - nb_msgs_theor_in_cube,npoin2D_cube_from_slices) + NPROC_XI,NPROC_ETA,NSPEC2D_BOTTOM(IREGION_INNER_CORE), & + nb_msgs_theor_in_cube,npoin2D_cube_from_slices) ! this value is used for dynamic memory allocation, therefore make sure it is never zero if (nb_msgs_theor_in_cube > 0) then @@ -396,6 +441,12 @@ subroutine cmi_get_buffers(iregion_code) else non_zero_nb_msgs_theor_in_cube = 1 endif + if (myrank == 0) then + write(IMAIN,*) ' number of messages in cube : ',nb_msgs_theor_in_cube + write(IMAIN,*) ' number of 2D points in cube: ',npoin2D_cube_from_slices + call flush_IMAIN() + endif + call synchronize_all() ! 
allocate buffers for cube and slices allocate(sender_from_slices_to_cube(non_zero_nb_msgs_theor_in_cube), & @@ -404,6 +455,11 @@ subroutine cmi_get_buffers(iregion_code) buffer_slices2(npoin2D_cube_from_slices,NDIM), & ibool_central_cube(non_zero_nb_msgs_theor_in_cube,npoin2D_cube_from_slices),stat=ier) if (ier /= 0 ) call exit_MPI(myrank,'Error allocating cube buffers') + sender_from_slices_to_cube(:) = -1 + ibool_central_cube(:,:) = -1 + buffer_slices(:,:) = 0.d0 + buffer_slices2(:,:) = 0.d0 + buffer_all_cube_from_slices(:,:,:) = 0.d0 ! handles the communications with the central cube if it was included in the mesh ! create buffers to assemble with the central cube @@ -422,7 +478,12 @@ subroutine cmi_get_buffers(iregion_code) receiver_cube_from_slices,sender_from_slices_to_cube,ibool_central_cube, & buffer_slices,buffer_slices2,buffer_all_cube_from_slices) - if (myrank == 0) write(IMAIN,*) + if (myrank == 0) then + write(IMAIN,*) ' creating central cube done' + write(IMAIN,*) + call flush_IMAIN() + endif + call synchronize_all() ! frees memory deallocate(ibelm_xmin_inner_core,ibelm_xmax_inner_core) @@ -459,10 +520,10 @@ subroutine cmi_get_buffers(iregion_code) if (INCLUDE_CENTRAL_CUBE) then ! updates flags for elements on slice boundaries call fix_non_blocking_central_cube(is_on_a_slice_edge, & - ibool,NSPEC_INNER_CORE,NGLOB_INNER_CORE,nb_msgs_theor_in_cube,ibelm_bottom_inner_core, & - idoubling,npoin2D_cube_from_slices, & - ibool_central_cube,NSPEC2D_BOTTOM(IREGION_INNER_CORE), & - ichunk,NPROC_XI) + ibool,NSPEC_INNER_CORE,NGLOB_INNER_CORE,nb_msgs_theor_in_cube,ibelm_bottom_inner_core, & + idoubling,npoin2D_cube_from_slices, & + ibool_central_cube,NSPEC2D_BOTTOM(IREGION_INNER_CORE), & + ichunk,NPROC_XI) endif ! 
debug: saves element flags diff --git a/src/meshfem3D/create_central_cube_buffers.f90 b/src/meshfem3D/create_central_cube_buffers.f90 index f9a90fbc2..29f83bf12 100644 --- a/src/meshfem3D/create_central_cube_buffers.f90 +++ b/src/meshfem3D/create_central_cube_buffers.f90 @@ -553,7 +553,7 @@ end subroutine create_central_cube_buffers ! subroutine comp_central_cube_buffer_size(iproc_xi,iproc_eta,ichunk,NPROC_XI,NPROC_ETA,NSPEC2D_BOTTOM_INNER_CORE, & - nb_msgs_theor_in_cube,npoin2D_cube_from_slices) + nb_msgs_theor_in_cube,npoin2D_cube_from_slices) !--- compute number of messages to expect in cube as well as their size !--- take into account vertical sides and bottom side diff --git a/src/meshfem3D/create_regions_mesh.F90 b/src/meshfem3D/create_regions_mesh.F90 index 903d1089d..6fcda0876 100644 --- a/src/meshfem3D/create_regions_mesh.F90 +++ b/src/meshfem3D/create_regions_mesh.F90 @@ -1126,7 +1126,7 @@ subroutine crm_setup_indexing(npointot) if (npointot > 0) then if (myrank == 0) then write(IMAIN,*) ' total number of points : ',npointot - write(IMAIN,*) ' array memory required per process : ',dble(npointot) * dble(8) / 1024.d0 / 1024.d0,'MB' + write(IMAIN,*) ' array memory required per process : ',3.d0 * dble(npointot) * dble(8) / 1024.d0 / 1024.d0,'MB' call flush_IMAIN() endif diff --git a/src/meshfem3D/fix_non_blocking_flags.f90 b/src/meshfem3D/fix_non_blocking_flags.f90 index bc63ffb27..c33e8a2f3 100644 --- a/src/meshfem3D/fix_non_blocking_flags.f90 +++ b/src/meshfem3D/fix_non_blocking_flags.f90 @@ -114,22 +114,22 @@ subroutine fix_non_blocking_central_cube(is_on_a_slice_edge, & implicit none - integer :: nspec,nglob,nb_msgs_theor_in_cube,NSPEC2D_BOTTOM_INNER_CORE - integer :: ichunk,npoin2D_cube_from_slices,NPROC_XI + integer,intent(in) :: nspec,nglob,nb_msgs_theor_in_cube,NSPEC2D_BOTTOM_INNER_CORE + integer,intent(in) :: ichunk,npoin2D_cube_from_slices,NPROC_XI - logical, dimension(nspec) :: is_on_a_slice_edge + logical, dimension(nspec),intent(inout) :: 
is_on_a_slice_edge - integer, dimension(NGLLX,NGLLY,NGLLZ,nspec) :: ibool + integer, dimension(NGLLX,NGLLY,NGLLZ,nspec),intent(in) :: ibool - integer, dimension(nb_msgs_theor_in_cube,npoin2D_cube_from_slices) :: ibool_central_cube + integer, dimension(nb_msgs_theor_in_cube,npoin2D_cube_from_slices),intent(in) :: ibool_central_cube - integer, dimension(NSPEC2D_BOTTOM_INNER_CORE) :: ibelm_bottom_inner_core + integer, dimension(NSPEC2D_BOTTOM_INNER_CORE),intent(in) :: ibelm_bottom_inner_core - integer, dimension(nspec) :: idoubling_inner_core + integer, dimension(nspec),intent(in) :: idoubling_inner_core ! local parameters - logical, dimension(nglob) :: mask_ibool - integer :: ipoin,ispec,i,j,k,imsg,ispec2D + logical, dimension(:),allocatable :: mask_ibool + integer :: ipoin,ispec,i,j,k,imsg,ispec2D,ier if (ichunk /= CHUNK_AB .and. ichunk /= CHUNK_AB_ANTIPODE) then do ispec2D = 1,NSPEC2D_BOTTOM_INNER_CORE @@ -149,6 +149,8 @@ subroutine fix_non_blocking_central_cube(is_on_a_slice_edge, & if (ichunk == CHUNK_AB .or. ichunk == CHUNK_AB_ANTIPODE) then ! clean the mask + allocate(mask_ibool(nglob),stat=ier) + if (ier /= 0) stop 'Error allocating mask_ibool array' mask_ibool(:) = .false. do imsg = 1,nb_msgs_theor_in_cube @@ -182,6 +184,9 @@ subroutine fix_non_blocking_central_cube(is_on_a_slice_edge, & 888 continue enddo + ! free array + deallocate(mask_ibool) + endif end subroutine fix_non_blocking_central_cube From 93d15600a4052fcf5381083d27485adb033e38ad Mon Sep 17 00:00:00 2001 From: Daniel Peter Date: Fri, 2 Jun 2023 18:02:20 +0200 Subject: [PATCH 11/11] updates test compilation --- tests/meshfem3D/test_models.f90 | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/meshfem3D/test_models.f90 b/tests/meshfem3D/test_models.f90 index 82289db90..3ab898fa7 100644 --- a/tests/meshfem3D/test_models.f90 +++ b/tests/meshfem3D/test_models.f90 @@ -1,5 +1,6 @@ program test_models + use constants, only: DEGREES_TO_RADIANS,MAX_STRING_LEN,myrank use meshfem_par use manager_adios