diff --git a/.travis.yml b/.travis.yml index 8623d5a68..878ef72b6 100644 --- a/.travis.yml +++ b/.travis.yml @@ -209,7 +209,7 @@ after_success: - | if [ "$TESTCOV" == "1" ]; then gcov --version - echo `pwd` + echo ls -al obj/ fi diff --git a/.travis/run_install.sh b/.travis/run_install.sh index 0dc2547da..ca59cd5ad 100644 --- a/.travis/run_install.sh +++ b/.travis/run_install.sh @@ -185,6 +185,7 @@ echo "export OMPI_MCA_btl_vader_single_copy_mechanism=none" >> $HOME/.tmprc echo "export OMPI_MCA_btl=^openib" >> $HOME/.tmprc echo "" -echo "exports:" -export +# commented this out due to travis reporting security issues in these outputs +#echo "exports:" +#export echo "" diff --git a/.travis/run_tests.sh b/.travis/run_tests.sh index 8984ed40b..9f579072c 100644 --- a/.travis/run_tests.sh +++ b/.travis/run_tests.sh @@ -37,7 +37,9 @@ esac # info #echo $TRAVIS_BUILD_DIR -echo $WORKDIR +# commented this out due to travis reporting security issues in these outputs +#echo $WORKDIR + echo `date` echo echo "**********************************************************" @@ -137,7 +139,7 @@ else # simulation done echo - echo "simulation done: `pwd`" + echo "simulation done: $dir" echo `date` echo @@ -152,7 +154,7 @@ if [[ $? -ne 0 ]]; then exit 1; fi # simulation done echo -echo "test done: `pwd`" +echo "test done: $dir" echo `date` echo diff --git a/Makefile.in b/Makefile.in index a8432819b..8eec80b8c 100644 --- a/Makefile.in +++ b/Makefile.in @@ -234,11 +234,11 @@ GENCODE_AMD_MI250 = --amdgpu-target=gfx90a @COND_HIP_TRUE@@COND_HIP_PLATFORM_NVIDIA_TRUE@HIP_CFLAG_ENDING = -x cu # CUDA compilation or src/gpu/*.c files # specific targets -@COND_HIP_TRUE@@COND_HIP_MI8_TRUE@GENCODE_HIP = $(GENCODE_AMD_MI8) # --with-hip=MI8 .. -@COND_HIP_TRUE@@COND_HIP_MI25_TRUE@GENCODE_HIP = $(GENCODE_AMD_MI25) # --with-hip=MI25 .. -@COND_HIP_TRUE@@COND_HIP_MI50_TRUE@GENCODE_HIP = $(GENCODE_AMD_MI50) # --with-hip=MI50 .. -@COND_HIP_TRUE@@COND_HIP_MI100_TRUE@GENCODE_HIP = $(GENCODE_AMD_MI100) # --with-hip=MI100 .. -@COND_HIP_TRUE@@COND_HIP_MI250_TRUE@GENCODE_HIP = $(GENCODE_AMD_MI250) # --with-hip=MI250 .. +@COND_HIP_TRUE@@COND_HIP_MI8_TRUE@GENCODE_HIP = $(GENCODE_AMD_MI8) $(FC_DEFINE)GPU_DEVICE_MI8 # --with-hip=MI8 .. +@COND_HIP_TRUE@@COND_HIP_MI25_TRUE@GENCODE_HIP = $(GENCODE_AMD_MI25) $(FC_DEFINE)GPU_DEVICE_MI25 # --with-hip=MI25 .. +@COND_HIP_TRUE@@COND_HIP_MI50_TRUE@GENCODE_HIP = $(GENCODE_AMD_MI50) $(FC_DEFINE)GPU_DEVICE_MI50 # --with-hip=MI50 .. +@COND_HIP_TRUE@@COND_HIP_MI100_TRUE@GENCODE_HIP = $(GENCODE_AMD_MI100) $(FC_DEFINE)GPU_DEVICE_MI100 # --with-hip=MI100 .. +@COND_HIP_TRUE@@COND_HIP_MI250_TRUE@GENCODE_HIP = $(GENCODE_AMD_MI250) $(FC_DEFINE)GPU_DEVICE_MI250 # --with-hip=MI250 .. @COND_HIP_TRUE@@COND_HIP_CUDA5_TRUE@GENCODE_HIP = $(GENCODE_35) # --with-hip=cuda5 .. @COND_HIP_TRUE@@COND_HIP_CUDA6_TRUE@GENCODE_HIP = $(GENCODE_37) # --with-hip=cuda6 .. diff --git a/src/gpu/boast/compute_add_sources_kernel.rb b/src/gpu/boast/compute_add_sources_kernel.rb index 9eb82ece9..150d4bcc3 100644 --- a/src/gpu/boast/compute_add_sources_kernel.rb +++ b/src/gpu/boast/compute_add_sources_kernel.rb @@ -3,18 +3,20 @@ def BOAST::compute_add_sources_kernel( ref = true, n_dim = 3, n_gllx = 5 ) push_env( :array_start => 0 ) kernel = CKernel::new function_name = "compute_add_sources_kernel" - accel = Real("accel", :dir => :inout,:dim => [ Dim() ] ) - ibool = Int("ibool", :dir => :in, :dim => [ Dim() ] ) - sourcearrays = Real("sourcearrays", :dir => :in, :dim => [ Dim() ] ) - stf_pre_compute = Real("stf_pre_compute", :size => 8, :dir => :in, :dim => [ Dim() ] ) - myrank = Int("myrank", :dir => :in) - islice_selected_source = Int("islice_selected_source", :dir => :in, :dim => [ Dim() ] ) - ispec_selected_source = Int("ispec_selected_source", :dir => :in, :dim => [ Dim() ] ) - nsources = Int("NSOURCES", :dir => :in) + accel = Real("accel", :dir => :inout,:dim => [ Dim() ] ) + ibool = Int("ibool", :dir => :in, :dim => [ Dim() ] ) + sourcearrays_local = Real("sourcearrays_local", :dir => :in, :dim => [ Dim() ] ) + stf_local = Real("stf_local", :dir => :in, :dim => [ Dim() ] ) + ispec_selected_source_local = Int("ispec_selected_source_local", :dir => :in, :dim => [ Dim() ] ) + nsources_local = Int("nsources_local", :dir => :in) + nstep = Int("NSTEP", :dir => :in) + it = Int("it", :dir => :in) + istage = Int("istage", :dir => :in) ndim = Int("NDIM", :const => n_dim) ngllx = Int("NGLLX", :const => n_gllx) - p = Procedure(function_name, [accel,ibool,sourcearrays,stf_pre_compute,myrank,islice_selected_source,ispec_selected_source,nsources]) + + p = Procedure(function_name, [accel,ibool,sourcearrays_local,stf_local,ispec_selected_source_local,nsources_local,nstep,it,istage]) if (get_lang == CUDA and ref) then get_output.print File::read("references/#{function_name}.cu") elsif(get_lang == CUDA or get_lang == CL or get_lang == HIP) then @@ -39,14 +41,12 @@ def BOAST::compute_add_sources_kernel( ref = true, n_dim = 3, n_gllx = 5 ) print k === get_local_id(2) print isource === get_group_id(0) + get_num_groups(0)*get_group_id(1) - print If(isource < nsources) { - print If(myrank == islice_selected_source[isource]) { - print ispec === ispec_selected_source[isource] - 1 - print stf === stf_pre_compute[isource] - print iglob === ibool[INDEX4(ngllx,ngllx,ngllx,i,j,k,ispec)] - 1 - (0..2).each { |indx| - print atomicAdd(accel+iglob*3+indx, sourcearrays[INDEX5(ndim,ngllx,ngllx,ngllx,indx,i,j,k,isource)]*stf) - } + print If(isource < nsources_local) { + print ispec === ispec_selected_source_local[isource] - 1 + print stf === stf_local[INDEX3(nsources_local,nstep,isource,it,istage)] + print iglob === ibool[INDEX4(ngllx,ngllx,ngllx,i,j,k,ispec)] - 1 + (0..2).each { |indx| + print atomicAdd(accel+iglob*3+indx, sourcearrays_local[INDEX5(ndim,ngllx,ngllx,ngllx,indx,i,j,k,isource)]*stf) } } close p @@ -57,4 +57,66 @@ def BOAST::compute_add_sources_kernel( ref = true, n_dim = 3, n_gllx = 5 ) kernel.procedure = p return kernel end + +# obsolete kernel, left here for reference... +# +# def BOAST::compute_add_sources_kernel( ref = true, n_dim = 3, n_gllx = 5 ) +# push_env( :array_start => 0 ) +# kernel = CKernel::new +# function_name = "compute_add_sources_kernel" +# accel = Real("accel", :dir => :inout,:dim => [ Dim() ] ) +# ibool = Int("ibool", :dir => :in, :dim => [ Dim() ] ) +# sourcearrays = Real("sourcearrays", :dir => :in, :dim => [ Dim() ] ) +# stf_pre_compute = Real("stf_pre_compute", :size => 8, :dir => :in, :dim => [ Dim() ] ) +# myrank = Int("myrank", :dir => :in) +# islice_selected_source = Int("islice_selected_source", :dir => :in, :dim => [ Dim() ] ) +# ispec_selected_source = Int("ispec_selected_source", :dir => :in, :dim => [ Dim() ] ) +# nsources = Int("NSOURCES", :dir => :in) +# +# ndim = Int("NDIM", :const => n_dim) +# ngllx = Int("NGLLX", :const => n_gllx) +# p = Procedure(function_name, [accel,ibool,sourcearrays,stf_pre_compute,myrank,islice_selected_source,ispec_selected_source,nsources]) +# if (get_lang == CUDA and ref) then +# get_output.print File::read("references/#{function_name}.cu") +# elsif(get_lang == CUDA or get_lang == CL or get_lang == HIP) then +# make_specfem3d_header( :ndim => n_dim, :ngllx => n_gllx, :double => true ) +# open p +# ispec = Int( "ispec") +# iglob = Int( "iglob") +# stf = Real("stf") +# isource = Int( "isource") +# i = Int( "i") +# j = Int( "j") +# k = Int( "k") +# decl ispec +# decl iglob +# decl stf +# decl isource +# decl i +# decl j +# decl k +# print i === get_local_id(0) +# print j === get_local_id(1) +# print k === get_local_id(2) +# print isource === get_group_id(0) + get_num_groups(0)*get_group_id(1) +# +# print If(isource < nsources) { +# print If(myrank == islice_selected_source[isource]) { +# print ispec === ispec_selected_source[isource] - 1 +# print stf === stf_pre_compute[isource] +# print iglob === ibool[INDEX4(ngllx,ngllx,ngllx,i,j,k,ispec)] - 1 +# (0..2).each { |indx| +# print atomicAdd(accel+iglob*3+indx, sourcearrays[INDEX5(ndim,ngllx,ngllx,ngllx,indx,i,j,k,isource)]*stf) +# } +# } +# } +# close p +# else +# raise "Unsupported language!" +# end +# pop_env(:array_start) +# kernel.procedure = p +# return kernel +# end + end diff --git a/src/gpu/boast/compute_seismograms_kernel.rb b/src/gpu/boast/compute_seismograms_kernel.rb index 1259dff3b..a03463d66 100644 --- a/src/gpu/boast/compute_seismograms_kernel.rb +++ b/src/gpu/boast/compute_seismograms_kernel.rb @@ -1,5 +1,5 @@ module BOAST - def BOAST::compute_seismograms_kernel(n_gllx = 5, n_gll2 = 25, n_gll3 = 125, n_gll3_padded = 128) + def BOAST::compute_seismograms_kernel(ref = false, n_dim = 3, n_gllx = 5, n_gll2 = 25, n_gll3 = 125, n_gll3_padded = 128) push_env( :array_start => 0 ) kernel = CKernel::new v = [] @@ -15,7 +15,9 @@ def BOAST::compute_seismograms_kernel(n_gllx = 5, n_gll2 = 25, n_gll3 = 125, n_g v.push ispec_selected_rec = Int("ispec_selected_rec", :dir => :in, :dim => [ Dim() ]) v.push number_receiver_global = Int("number_receiver_global", :dir => :in, :dim => [ Dim() ]) v.push scale_displ = Real("scale_displ", :dir => :in) + v.push seismo_current = Int("seismo_current", :dir => :in) + ndim = Int("NDIM", :const => n_dim) ngllx = Int("NGLLX", :const => n_gllx) ngll2 = Int("NGLL2", :const => n_gll2) ngll3 = Int("NGLL3", :const => n_gll3) @@ -23,7 +25,7 @@ def BOAST::compute_seismograms_kernel(n_gllx = 5, n_gll2 = 25, n_gll3 = 125, n_g p = Procedure(function_name, v ) if(get_lang == CL or get_lang == CUDA or get_lang == HIP) then - make_specfem3d_header( :ngllx => n_gllx, :ngll2 => n_gll2, :ngll3 => n_gll3, :ngll3_padded => n_gll3_padded ) + make_specfem3d_header( :ndim => n_dim, :ngllx => n_gllx, :ngll2 => n_gll2, :ngll3 => n_gll3, :ngll3_padded => n_gll3_padded ) open p ispec = Int("ispec") iglob = Int("iglob") @@ -36,6 +38,7 @@ def BOAST::compute_seismograms_kernel(n_gllx = 5, n_gll2 = 25, n_gll3 = 125, n_g k = Int("k") l = Int("l") s = Int("s") + idx = Int("idx") decl ispec decl iglob decl irec_local @@ -47,6 +50,7 @@ def BOAST::compute_seismograms_kernel(n_gllx = 5, n_gll2 = 25, n_gll3 = 125, n_g decl k decl l decl s + decl idx decl sh_dxd = Real("sh_dxd", :local => true, :dim => [Dim(ngll3_padded)] ) decl sh_dyd = Real("sh_dyd", :local => true, :dim => [Dim(ngll3_padded)] ) @@ -96,18 +100,21 @@ def BOAST::compute_seismograms_kernel(n_gllx = 5, n_gll2 = 25, n_gll3 = 125, n_g } comment() + print idx === INDEX3(ndim,nrec_local,0,irec_local,seismo_current) + comment() + print If (tx == 0) { - print seismograms[irec_local*3 + 0] === scale_displ * ( + print seismograms[idx + 0] === scale_displ * ( nu[(irec_local*3)*3 + 0]*sh_dxd[0] + nu[(irec_local*3 + 1)*3 + 0]*sh_dyd[0] + nu[(irec_local*3 + 2)*3 + 0]*sh_dzd[0] ) } print If (tx == 1) { - print seismograms[irec_local*3 + 1] === scale_displ * ( + print seismograms[idx + 1] === scale_displ * ( nu[(irec_local*3)*3 + 1]*sh_dxd[0] + nu[(irec_local*3 + 1)*3 + 1]*sh_dyd[0] + nu[(irec_local*3 + 2)*3 + 1]*sh_dzd[0] ) } print If (tx == 2) { - print seismograms[irec_local*3 + 2] === scale_displ * ( + print seismograms[idx + 2] === scale_displ * ( nu[(irec_local*3)*3 + 2]*sh_dxd[0] + nu[(irec_local*3 + 1)*3 + 2]*sh_dyd[0] + nu[(irec_local*3 + 2)*3 + 2]*sh_dzd[0] ) } diff --git a/src/gpu/boast/inner_core_impl_kernel_forward.rb b/src/gpu/boast/inner_core_impl_kernel_forward.rb index 8621a6223..4e349b7cc 100644 --- a/src/gpu/boast/inner_core_impl_kernel_forward.rb +++ b/src/gpu/boast/inner_core_impl_kernel_forward.rb @@ -1032,7 +1032,7 @@ def BOAST::impl_kernel(type, forward, ref = true, elem_per_thread = 1, mesh_colo comment() # compilation pragma info - if (type == :crust_mantle and get_lang == CUDA and forward) then + if (type == :crust_mantle and (get_lang == CUDA or get_lang == HIP) and forward) then get_output.puts "#ifdef #{manually_unrolled_loops}" get_output.puts "#pragma message (\"\\n\\nCompiling with: #{manually_unrolled_loops} enabled\\n\")" get_output.puts "#endif // #{manually_unrolled_loops}" diff --git a/src/gpu/compute_add_sources_elastic_gpu.c b/src/gpu/compute_add_sources_elastic_gpu.c index 51a7795a0..3f8ba0f1a 100644 --- a/src/gpu/compute_add_sources_elastic_gpu.c +++ b/src/gpu/compute_add_sources_elastic_gpu.c @@ -32,8 +32,8 @@ extern EXTERN_LANG void FC_FUNC_ (compute_add_sources_gpu, COMPUTE_ADD_SOURCES_GPU) (long *Mesh_pointer_f, - int *NSOURCESf, - double *h_stf_pre_compute) { + int *it_f, + int *istage_f) { TRACE ("compute_add_sources_gpu"); @@ -43,13 +43,22 @@ void FC_FUNC_ (compute_add_sources_gpu, // checks if anything to do if (mp->nsources_local == 0) return; - int NSOURCES = *NSOURCESf; + int it = *it_f - 1; // -1 for Fortran -> C indexing differences + int istage = *istage_f - 1; int num_blocks_x, num_blocks_y; - get_blocks_xy (NSOURCES, &num_blocks_x, &num_blocks_y); - + get_blocks_xy (mp->nsources_local, &num_blocks_x, &num_blocks_y); + + // note: we try to avoid these copies `gpuCopy_todevice**` as they synchronize all streams and kernel executions. + // this compute_add_sources_kernel is executing faster for a small number of sources than the next kernel launches, + // thus we can not overlap and compute in an asynchronous way, but need to wait idle for kernel triggering. + // + // to avoid this, we now allocate all local source time function arrays on the GPU in the preparation routine. + // this will need more GPU memory and might need to be re-evaluated if the number of sources and time steps become very large. + // // copies source time function buffer values to GPU - gpuCopy_todevice_double (&mp->d_stf_pre_compute, h_stf_pre_compute, NSOURCES); + //gpuCopy_todevice_double (&mp->d_stf_pre_compute, h_stf_pre_compute, NSOURCES); + #ifdef USE_OPENCL if (run_opencl) { @@ -60,12 +69,13 @@ void FC_FUNC_ (compute_add_sources_gpu, // adds source contributions clCheck (clSetKernelArg (mocl.kernels.compute_add_sources_kernel, idx++, sizeof (cl_mem), (void *) &mp->d_accel_crust_mantle.ocl)); clCheck (clSetKernelArg (mocl.kernels.compute_add_sources_kernel, idx++, sizeof (cl_mem), (void *) &mp->d_ibool_crust_mantle.ocl)); - clCheck (clSetKernelArg (mocl.kernels.compute_add_sources_kernel, idx++, sizeof (cl_mem), (void *) &mp->d_sourcearrays.ocl)); - clCheck (clSetKernelArg (mocl.kernels.compute_add_sources_kernel, idx++, sizeof (cl_mem), (void *) &mp->d_stf_pre_compute.ocl)); - clCheck (clSetKernelArg (mocl.kernels.compute_add_sources_kernel, idx++, sizeof (int), (void *) &mp->myrank)); - clCheck (clSetKernelArg (mocl.kernels.compute_add_sources_kernel, idx++, sizeof (cl_mem), (void *) &mp->d_islice_selected_source.ocl)); - clCheck (clSetKernelArg (mocl.kernels.compute_add_sources_kernel, idx++, sizeof (cl_mem), (void *) &mp->d_ispec_selected_source.ocl)); - clCheck (clSetKernelArg (mocl.kernels.compute_add_sources_kernel, idx++, sizeof (int), (void *) &NSOURCES)); + clCheck (clSetKernelArg (mocl.kernels.compute_add_sources_kernel, idx++, sizeof (cl_mem), (void *) &mp->d_sourcearrays_local.ocl)); + clCheck (clSetKernelArg (mocl.kernels.compute_add_sources_kernel, idx++, sizeof (cl_mem), (void *) &mp->d_stf_local.ocl)); + clCheck (clSetKernelArg (mocl.kernels.compute_add_sources_kernel, idx++, sizeof (cl_mem), (void *) &mp->d_ispec_selected_source_local.ocl)); + clCheck (clSetKernelArg (mocl.kernels.compute_add_sources_kernel, idx++, sizeof (int), (void *) &mp->nsources_local)); + clCheck (clSetKernelArg (mocl.kernels.compute_add_sources_kernel, idx++, sizeof (int), (void *) &mp->NSTEP)); + clCheck (clSetKernelArg (mocl.kernels.compute_add_sources_kernel, idx++, sizeof (int), (void *) &it)); + clCheck (clSetKernelArg (mocl.kernels.compute_add_sources_kernel, idx++, sizeof (int), (void *) &istage)); local_work_size[0] = NGLLX; local_work_size[1] = NGLLX; @@ -85,12 +95,13 @@ void FC_FUNC_ (compute_add_sources_gpu, // adds source contributions compute_add_sources_kernel<<compute_stream>>>(mp->d_accel_crust_mantle.cuda, mp->d_ibool_crust_mantle.cuda, - mp->d_sourcearrays.cuda, - mp->d_stf_pre_compute.cuda, - mp->myrank, - mp->d_islice_selected_source.cuda, - mp->d_ispec_selected_source.cuda, - NSOURCES); + mp->d_sourcearrays_local.cuda, + mp->d_stf_local.cuda, + mp->d_ispec_selected_source_local.cuda, + mp->nsources_local, + mp->NSTEP, + it, + istage); } #endif #ifdef USE_HIP @@ -101,12 +112,13 @@ void FC_FUNC_ (compute_add_sources_gpu, hipLaunchKernelGGL(HIP_KERNEL_NAME(compute_add_sources_kernel), grid, threads, 0, mp->compute_stream, mp->d_accel_crust_mantle.hip, mp->d_ibool_crust_mantle.hip, - mp->d_sourcearrays.hip, - mp->d_stf_pre_compute.hip, - mp->myrank, - mp->d_islice_selected_source.hip, - mp->d_ispec_selected_source.hip, - NSOURCES); + mp->d_sourcearrays_local.hip, + mp->d_stf_local.hip, + mp->d_ispec_selected_source_local.hip, + mp->nsources_local, + mp->NSTEP, + it, + istage); } #endif @@ -120,8 +132,8 @@ void FC_FUNC_ (compute_add_sources_gpu, extern EXTERN_LANG void FC_FUNC_ (compute_add_sources_backward_gpu, COMPUTE_ADD_SOURCES_BACKWARD_GPU) (long *Mesh_pointer_f, - int *NSOURCESf, - double *h_stf_pre_compute) { + int *it_tmp_f, + int *istage_f) { TRACE ("compute_add_sources_backward_gpu"); // debug DEBUG_BACKWARD_SOURCES (); @@ -132,13 +144,44 @@ void FC_FUNC_ (compute_add_sources_backward_gpu, // checks if anything to do if (mp->nsources_local == 0) return; - int NSOURCES = *NSOURCESf; + // indexing: for default backward, we can use the forward stf_local array with corresponding indexing. + // for example, Newmark stepping: + // backward time_t = dble(NSTEP-it_tmp)*DT - t0 with it_tmp 1..NSTEP + // forward time_t = dble(it_tmp-1)*DT - t0 with it_tmp 1..NSTEP + // -> backward indexing: + // it_tmp_backward == 1 -> it_tmp_forward = NSTEP + // it_tmp_backward == 2 -> it_tmp_forward = NSTEP-1 + // .. + // it_tmp_backward == NSTEP -> it_tmp_forward = 1 + // and b_stf_local == stf_local(istage, NSTEP-(it_tmp_backward-1), isource_local) + // only exception is LDDRK and non-undo_attenuation stepping + gpu_realw_mem stf_local; + int it_forward; + + stf_local = mp->d_stf_local; + it_forward = mp->NSTEP - (*it_tmp_f - 1); + + // LDDRK case for non-undo-attenuation stepping + if (mp->use_b_stf){ + stf_local = mp->d_b_stf_local; + it_forward = *it_tmp_f; // no need to reverse order, b_stf_local is stored in same it_tmp 1..NSTEP order for backward stepping + } - int num_blocks_x, num_blocks_y; - get_blocks_xy (NSOURCES, &num_blocks_x, &num_blocks_y); + int it = it_forward - 1; // -1 for Fortran -> C indexing differences + int istage = *istage_f - 1; + int num_blocks_x, num_blocks_y; + get_blocks_xy (mp->nsources_local, &num_blocks_x, &num_blocks_y); + + // note: we try to avoid these copies `gpuCopy_todevice**` as they synchronize all streams and kernel executions. + // this compute_add_sources_kernel is executing faster for a small number of sources than the next kernel launches, + // thus we can not overlap and compute in an asynchronous way, but need to wait idle for kernel triggering. + // + // to avoid this, we now allocate all local source time function arrays on the GPU in the preparation routine. + // this will need more GPU memory and might need to be re-evaluated if the number of sources and time steps become very large. + // // copies source time function buffer values to GPU - gpuCopy_todevice_double (&mp->d_stf_pre_compute, h_stf_pre_compute, NSOURCES); + //gpuCopy_todevice_double (&mp->d_stf_pre_compute, h_stf_pre_compute, NSOURCES); #ifdef USE_OPENCL if (run_opencl) { @@ -149,12 +192,13 @@ void FC_FUNC_ (compute_add_sources_backward_gpu, // adds source contributions clCheck (clSetKernelArg (mocl.kernels.compute_add_sources_kernel, idx++, sizeof (cl_mem), (void *) &mp->d_b_accel_crust_mantle.ocl)); clCheck (clSetKernelArg (mocl.kernels.compute_add_sources_kernel, idx++, sizeof (cl_mem), (void *) &mp->d_ibool_crust_mantle.ocl)); - clCheck (clSetKernelArg (mocl.kernels.compute_add_sources_kernel, idx++, sizeof (cl_mem), (void *) &mp->d_sourcearrays.ocl)); - clCheck (clSetKernelArg (mocl.kernels.compute_add_sources_kernel, idx++, sizeof (cl_mem), (void *) &mp->d_stf_pre_compute.ocl)); - clCheck (clSetKernelArg (mocl.kernels.compute_add_sources_kernel, idx++, sizeof (int), (void *) &mp->myrank)); - clCheck (clSetKernelArg (mocl.kernels.compute_add_sources_kernel, idx++, sizeof (cl_mem), (void *) &mp->d_islice_selected_source.ocl)); - clCheck (clSetKernelArg (mocl.kernels.compute_add_sources_kernel, idx++, sizeof (cl_mem), (void *) &mp->d_ispec_selected_source.ocl)); - clCheck (clSetKernelArg (mocl.kernels.compute_add_sources_kernel, idx++, sizeof (int), (void *) &NSOURCES)); + clCheck (clSetKernelArg (mocl.kernels.compute_add_sources_kernel, idx++, sizeof (cl_mem), (void *) &mp->d_sourcearrays_local.ocl)); + clCheck (clSetKernelArg (mocl.kernels.compute_add_sources_kernel, idx++, sizeof (cl_mem), (void *) &stf_local.ocl)); + clCheck (clSetKernelArg (mocl.kernels.compute_add_sources_kernel, idx++, sizeof (cl_mem), (void *) &mp->d_ispec_selected_source_local.ocl)); + clCheck (clSetKernelArg (mocl.kernels.compute_add_sources_kernel, idx++, sizeof (int), (void *) &mp->nsources_local)); + clCheck (clSetKernelArg (mocl.kernels.compute_add_sources_kernel, idx++, sizeof (int), (void *) &mp->NSTEP)); + clCheck (clSetKernelArg (mocl.kernels.compute_add_sources_kernel, idx++, sizeof (int), (void *) &it)); + clCheck (clSetKernelArg (mocl.kernels.compute_add_sources_kernel, idx++, sizeof (int), (void *) &istage)); local_work_size[0] = NGLLX; local_work_size[1] = NGLLX; @@ -174,12 +218,13 @@ void FC_FUNC_ (compute_add_sources_backward_gpu, compute_add_sources_kernel<<compute_stream>>>(mp->d_b_accel_crust_mantle.cuda, mp->d_ibool_crust_mantle.cuda, - mp->d_sourcearrays.cuda, - mp->d_stf_pre_compute.cuda, - mp->myrank, - mp->d_islice_selected_source.cuda, - mp->d_ispec_selected_source.cuda, - NSOURCES); + mp->d_sourcearrays_local.cuda, + stf_local.cuda, + mp->d_ispec_selected_source_local.cuda, + mp->nsources_local, + mp->NSTEP, + it, + istage); } #endif #ifdef USE_HIP @@ -190,12 +235,13 @@ void FC_FUNC_ (compute_add_sources_backward_gpu, hipLaunchKernelGGL(HIP_KERNEL_NAME(compute_add_sources_kernel), grid, threads, 0, mp->compute_stream, mp->d_b_accel_crust_mantle.hip, mp->d_ibool_crust_mantle.hip, - mp->d_sourcearrays.hip, - mp->d_stf_pre_compute.hip, - mp->myrank, - mp->d_islice_selected_source.hip, - mp->d_ispec_selected_source.hip, - NSOURCES); + mp->d_sourcearrays_local.hip, + stf_local.hip, + mp->d_ispec_selected_source_local.hip, + mp->nsources_local, + mp->NSTEP, + it, + istage); } #endif diff --git a/src/gpu/compute_forces_crust_mantle_gpu.c b/src/gpu/compute_forces_crust_mantle_gpu.c index 70f124da5..67fe3b314 100644 --- a/src/gpu/compute_forces_crust_mantle_gpu.c +++ b/src/gpu/compute_forces_crust_mantle_gpu.c @@ -203,6 +203,10 @@ void crust_mantle (int nb_blocks_to_compute, Mesh *mp, tau_sigmainvval = mp->d_tau_sigmainvval; // only d_tau_sigmainvval } + // kernel timing + gpu_event start,stop; + if (GPU_TIMING){ start_timing_gpu(&start,&stop); } + #ifdef USE_OPENCL if (run_opencl) { size_t global_work_size[2]; @@ -794,6 +798,21 @@ void crust_mantle (int nb_blocks_to_compute, Mesh *mp, } #endif + // kernel timing + if (GPU_TIMING){ + stop_timing_gpu(&start,&stop,"crust_mantle"); + + //realw time; + //stop_timing_gpu_t(&start,&stop,"crust_mantle kernel",&time); + // time in seconds + //time = time / 1000.; + // performance + // see with: nvprof --metrics flops_sp ./xspecfem3D -> using 883146240 FLOPS (Single) floating-point operations + // hand-counts: 89344 * number-of-blocks + //realw flops = 89344 * nb_blocks_to_compute; + //printf(" performance: %f GFlops/s\n", flops/time *(1./1000./1000./1000.)); + } + GPU_ERROR_CHECKING ("crust_mantle"); } diff --git a/src/gpu/compute_forces_inner_core_gpu.c b/src/gpu/compute_forces_inner_core_gpu.c index baf48fc88..968f3b6d5 100644 --- a/src/gpu/compute_forces_inner_core_gpu.c +++ b/src/gpu/compute_forces_inner_core_gpu.c @@ -176,6 +176,10 @@ void inner_core (int nb_blocks_to_compute, Mesh *mp, tau_sigmainvval = mp->d_tau_sigmainvval; // only d_tau_sigmainvval } + // kernel timing + gpu_event start,stop; + if (GPU_TIMING){ start_timing_gpu(&start,&stop); } + #ifdef USE_OPENCL if (run_opencl) { size_t global_work_size[2]; @@ -638,6 +642,9 @@ void inner_core (int nb_blocks_to_compute, Mesh *mp, } #endif + // kernel timing + if (GPU_TIMING){ stop_timing_gpu(&start,&stop,"inner_core"); } + GPU_ERROR_CHECKING ("inner_core"); } diff --git a/src/gpu/compute_forces_outer_core_gpu.c b/src/gpu/compute_forces_outer_core_gpu.c index 92e48b713..15cb68459 100644 --- a/src/gpu/compute_forces_outer_core_gpu.c +++ b/src/gpu/compute_forces_outer_core_gpu.c @@ -109,6 +109,10 @@ void outer_core (int nb_blocks_to_compute, Mesh *mp, B_array_rotation_lddrk = d_b_B_array_rotation_lddrk; } + // kernel timing + gpu_event start,stop; + if (GPU_TIMING){ start_timing_gpu(&start,&stop); } + #ifdef USE_OPENCL if (run_opencl) { size_t global_work_size[2]; @@ -282,6 +286,9 @@ void outer_core (int nb_blocks_to_compute, Mesh *mp, } #endif + // kernel timing + if (GPU_TIMING){ stop_timing_gpu(&start,&stop,"outer_core"); } + GPU_ERROR_CHECKING ("kernel outer_core"); } diff --git a/src/gpu/compute_seismograms_gpu.c b/src/gpu/compute_seismograms_gpu.c index e26fb608b..1eb4de1b5 100644 --- a/src/gpu/compute_seismograms_gpu.c +++ b/src/gpu/compute_seismograms_gpu.c @@ -31,34 +31,36 @@ extern EXTERN_LANG void FC_FUNC_ (compute_seismograms_gpu, - COMPUTE_SEISMOGRAMS_GPU) (long *Mesh_pointer_f, + COMPUTE_SEISMOGRAMS_GPU) (long *Mesh_pointer, realw* seismograms, - int* seismo_currentf, - int* itf, - int* it_endf, - double* scale_displf, - int* NTSTEP_BETWEEN_OUTPUT_SEISMOSf, - int* NSTEPf) { + int* seismo_current_f, + int* it_f, + int* it_end_f, + double* scale_displ_f, + int* nlength_seismogram_f) { TRACE ("compute_seismograms_gpu"); //get Mesh from Fortran integer wrapper - Mesh *mp = (Mesh *) *Mesh_pointer_f; + Mesh *mp = (Mesh *) *Mesh_pointer; // checks if anything to do if (mp->nrec_local == 0) return; - int seismo_current = *seismo_currentf - 1 ; - int NTSTEP_BETWEEN_OUTPUT_SEISMOS = *NTSTEP_BETWEEN_OUTPUT_SEISMOSf; - int NSTEP = *NSTEPf; - int it = *itf; - int it_end = *it_endf; - realw scale_displ = (realw)(*scale_displf); + int seismo_current = *seismo_current_f - 1 ; // -1 for Fortran -> C indexing + + int nlength_seismogram = *nlength_seismogram_f; + int it = *it_f; + int it_end = *it_end_f; + + realw scale_displ = (realw)(*scale_displ_f); int blocksize = NGLL3_PADDED; int num_blocks_x, num_blocks_y; get_blocks_xy (mp->nrec_local, &num_blocks_x, &num_blocks_y); - int size_buffer = 3 * mp->nrec_local * sizeof(realw); + // seismogram array size + int size = mp->nrec_local * nlength_seismogram; + int size_buffer = NDIM * size * sizeof(realw); // determines wavefield depending on simulation type gpu_realw_mem displ; @@ -70,6 +72,15 @@ void FC_FUNC_ (compute_seismograms_gpu, displ = mp->d_displ_crust_mantle; } + // note: due to subsampling, the last time step it == it_end might not be reached, + // but computing seismogram entries might end before. + // thus, both checks + // it%NTSTEP_BETWEEN_OUTPUT_SEISMOS == 0 || it == it_end + // might not be reached. instead we test if the seismogram array is full by + // seismo_current == nlength_seismogram - 1 + // and copy it back whenever. + //printf("debug: gpu seismo: seismo current/lenght %i/%i - it/it_end %i/%i\n",seismo_current,nlength_seismogram,it,it_end); + #ifdef USE_OPENCL if (run_opencl) { size_t global_work_size[2]; @@ -89,6 +100,7 @@ void FC_FUNC_ (compute_seismograms_gpu, clCheck (clSetKernelArg (mocl.kernels.compute_seismograms_kernel, idx++, sizeof (cl_mem), (void *) &mp->d_ispec_selected_rec.ocl)); clCheck (clSetKernelArg (mocl.kernels.compute_seismograms_kernel, idx++, sizeof (cl_mem), (void *) &mp->d_number_receiver_global.ocl)); clCheck (clSetKernelArg (mocl.kernels.compute_seismograms_kernel, idx++, sizeof (realw), (void *) &scale_displ)); + clCheck (clSetKernelArg (mocl.kernels.compute_seismograms_kernel, idx++, sizeof (int), (void *) &seismo_current)); local_work_size[0] = blocksize; local_work_size[1] = 1; @@ -98,21 +110,29 @@ void FC_FUNC_ (compute_seismograms_gpu, clCheck (clEnqueueNDRangeKernel (mocl.command_queue, mocl.kernels.compute_seismograms_kernel, 2, NULL, global_work_size, local_work_size, 0, NULL, &kernel_evt)); - // copies buffer to CPU - if (GPU_ASYNC_COPY && ((seismo_current+1) != NTSTEP_BETWEEN_OUTPUT_SEISMOS) && (it != NSTEP) && (it != it_end) ) { - // waits until kernel is finished before starting async memcpy - clCheck (clFinish (mocl.command_queue)); - - if (mp->has_last_copy_evt) { - clCheck (clReleaseEvent (mp->last_copy_evt)); - } - clCheck (clEnqueueReadBuffer (mocl.copy_queue, mp->d_seismograms.ocl, CL_FALSE, 0, - size_buffer, seismograms + 3*mp->nrec_local*seismo_current, 1, &kernel_evt, &mp->last_copy_evt)); - mp->has_last_copy_evt = 1; - } else { - // blocking copy - clCheck (clEnqueueReadBuffer (mocl.command_queue, mp->d_seismograms.ocl, CL_TRUE, 0, - size_buffer , seismograms + 3*mp->nrec_local*seismo_current, 0, NULL, NULL)); + // copies array to CPU host + if (seismo_current == nlength_seismogram - 1 || it == it_end){ + // intermediate copies could be made asynchronous, + // only the last copy would be a synchronous copy to make sure we have all previous copies done. + // + // however, note that asynchronous copies in the copy stream would also require pinned host memory. + // This is not use here yet for seismograms, thus the copy becomes blocking by default as well. + // we'll leave the if-case here as a todo for future... + //if (GPU_ASYNC_COPY && (it != it_end)) { + // // waits until kernel is finished before starting async memcpy + // clCheck (clFinish (mocl.command_queue)); + // + // if (mp->has_last_copy_evt) { + // clCheck (clReleaseEvent (mp->last_copy_evt)); + // } + // clCheck (clEnqueueReadBuffer (mocl.copy_queue, mp->d_seismograms.ocl, CL_FALSE, 0, + // size_buffer, seismograms, 1, &kernel_evt, &mp->last_copy_evt)); + // mp->has_last_copy_evt = 1; + //} else { + // blocking copy + clCheck (clEnqueueReadBuffer (mocl.command_queue, mp->d_seismograms.ocl, CL_TRUE, 0, + size_buffer , seismograms, 0, NULL, NULL)); + //} } clReleaseEvent (kernel_evt); @@ -132,17 +152,25 @@ void FC_FUNC_ (compute_seismograms_gpu, mp->d_nu.cuda, mp->d_ispec_selected_rec.cuda, mp->d_number_receiver_global.cuda, - scale_displ); - // copies buffer to CPU - if (GPU_ASYNC_COPY && ((seismo_current+1) != NTSTEP_BETWEEN_OUTPUT_SEISMOS) && (it != NSTEP) && (it != it_end) ) { - // waits until kernel is finished before starting async memcpy - cudaStreamSynchronize(mp->compute_stream); - // copies buffer to CPU - cudaMemcpyAsync(seismograms + 3*mp->nrec_local*seismo_current,mp->d_seismograms.cuda,size_buffer,cudaMemcpyDeviceToHost,mp->copy_stream); - } else { - // synchronous copy - print_CUDA_error_if_any(cudaMemcpy(seismograms + 3*mp->nrec_local*seismo_current,mp->d_seismograms.cuda,size_buffer, - cudaMemcpyDeviceToHost),98000); + scale_displ, + seismo_current); + // copies array to CPU host + if (seismo_current == nlength_seismogram - 1 || it == it_end){ + // intermediate copies could be made asynchronous, + // only the last copy would be a synchronous copy to make sure we have all previous copies done. + // + // however, note that asynchronous copies in the copy stream would also require pinned host memory. + // This is not use here yet for seismograms, thus the copy becomes blocking by default as well. + // we'll leave the if-case here as a todo for future... + //if (GPU_ASYNC_COPY && (it != it_end)) { + // // waits until kernel is finished before starting async memcpy + // cudaStreamSynchronize(mp->compute_stream); + // // copies buffer to CPU + // cudaMemcpyAsync(seismograms,mp->d_seismograms.cuda,size_buffer,cudaMemcpyDeviceToHost,mp->copy_stream); + //} else { + // synchronous copy + print_CUDA_error_if_any(cudaMemcpy(seismograms,mp->d_seismograms.cuda,size_buffer,cudaMemcpyDeviceToHost),98000); + //} } } #endif @@ -161,17 +189,26 @@ void FC_FUNC_ (compute_seismograms_gpu, mp->d_nu.hip, mp->d_ispec_selected_rec.hip, mp->d_number_receiver_global.hip, - scale_displ); - // copies buffer to CPU - if (GPU_ASYNC_COPY && ((seismo_current+1) != NTSTEP_BETWEEN_OUTPUT_SEISMOS) && (it != NSTEP) && (it != it_end) ) { - // waits until kernel is finished before starting async memcpy - hipStreamSynchronize(mp->compute_stream); - // copies buffer to CPU - hipMemcpyAsync(seismograms + 3*mp->nrec_local*seismo_current,mp->d_seismograms.hip,size_buffer,hipMemcpyDeviceToHost,mp->copy_stream); - } else { - // synchronous copy - print_HIP_error_if_any(hipMemcpy(seismograms + 3*mp->nrec_local*seismo_current,mp->d_seismograms.hip,size_buffer, - hipMemcpyDeviceToHost),98000); + scale_displ, + seismo_current); + + // copies array to CPU host + if (seismo_current == nlength_seismogram - 1 || it == it_end){ + // intermediate copies could be made asynchronous, + // only the last copy would be a synchronous copy to make sure we have all previous copies done. + // + // however, note that asynchronous copies in the copy stream would also require pinned host memory. + // This is not use here yet for seismograms, thus the copy becomes blocking by default as well. + // we'll leave the if-case here as a todo for future... + //if (GPU_ASYNC_COPY && (it != it_end)) { + // // waits until kernel is finished before starting async memcpy + // hipStreamSynchronize(mp->compute_stream); + // // copies buffer to CPU + // hipMemcpyAsync(seismograms,mp->d_seismograms.hip,size_buffer,hipMemcpyDeviceToHost,mp->copy_stream); + //} else { + // synchronous copy + print_HIP_error_if_any(hipMemcpy(seismograms,mp->d_seismograms.hip,size_buffer,hipMemcpyDeviceToHost),98000); + //} } } #endif diff --git a/src/gpu/gpu_buffer_list.c b/src/gpu/gpu_buffer_list.c index 462354069..f51b16676 100644 --- a/src/gpu/gpu_buffer_list.c +++ b/src/gpu/gpu_buffer_list.c @@ -311,11 +311,19 @@ GPU_REALW_BUFFER (d_b_B_array_rotation); // ------------------------------------------------------------------ // // sources // ------------------------------------------------------------------ // -GPU_REALW_BUFFER (d_sourcearrays); -GPU_DOUBLE_BUFFER (d_stf_pre_compute); -GPU_INT_BUFFER (d_islice_selected_source); +// full NSOURCES arrays not needed anymore... +//GPU_REALW_BUFFER (d_sourcearrays); +//GPU_DOUBLE_BUFFER (d_stf_pre_compute); +//GPU_INT_BUFFER (d_islice_selected_source); +// for pure adjoint simulation cases GPU_INT_BUFFER (d_ispec_selected_source); +// local arrays +GPU_INT_BUFFER (d_ispec_selected_source_local); +GPU_REALW_BUFFER (d_sourcearrays_local); +GPU_REALW_BUFFER (d_stf_local); +GPU_REALW_BUFFER (d_b_stf_local); + // ------------------------------------------------------------------ // // receivers // ------------------------------------------------------------------ // diff --git a/src/gpu/helper_functions_gpu.c b/src/gpu/helper_functions_gpu.c index b6023d3af..1193db2cc 100644 --- a/src/gpu/helper_functions_gpu.c +++ b/src/gpu/helper_functions_gpu.c @@ -491,7 +491,7 @@ void gpuCopy_from_device_realw (gpu_realw_mem *d_array_addr_ptr, realw *h_array, // copy with offset void gpuCopy_from_device_realw_offset (gpu_realw_mem *d_array_addr_ptr, realw *h_array, size_t size, size_t offset) { - TRACE ("gpuCopy_from_device_realw"); + TRACE ("gpuCopy_from_device_realw_offset"); // checks if anything to do if (size == 0){ return; } @@ -515,7 +515,7 @@ void gpuCopy_from_device_realw_offset (gpu_realw_mem *d_array_addr_ptr, realw *h } #endif - GPU_ERROR_CHECKING ("gpuCopy_from_device_realw"); + GPU_ERROR_CHECKING ("gpuCopy_from_device_realw_offset"); } /* ----------------------------------------------------------------------------------------------- */ @@ -1154,7 +1154,7 @@ void start_timing_gpu(gpu_event* start,gpu_event* stop) { #endif } -void stop_timing_gpu(gpu_event* start,gpu_event* stop, char* info_str) { +void stop_timing_gpu(gpu_event* start,gpu_event* stop, const char* info_str) { realw time = 0; // stops events #ifdef USE_OPENCL @@ -1177,7 +1177,40 @@ void stop_timing_gpu(gpu_event* start,gpu_event* stop, char* info_str) { hipEventDestroy( *stop ); #endif // user output - printf("%s: Execution Time = %f ms\n",info_str,time); + printf("GPU_timing %s: Execution Time = %f ms\n",info_str,time); +} + +// note: C doesn't allow for function overloading. +// thus, naming this routine slightly different, with `_t` added, as it returns back time t. + +void stop_timing_gpu_t(gpu_event* start,gpu_event* stop, const char* info_str, realw* t){ + realw time = 0; + // stops events +#ifdef USE_OPENCL +// not fully implemented yet... + clReleaseEvent(*start); + clReleaseEvent(*stop); +#endif +#ifdef USE_CUDA + // stops events + cudaEventRecord( *stop, 0); + cudaEventSynchronize( *stop ); + cudaEventElapsedTime( &time, *start, *stop ); + cudaEventDestroy( *start ); + cudaEventDestroy( *stop ); +#endif +#ifdef USE_HIP + // stops events + hipEventRecord( *stop, 0); + hipEventSynchronize( *stop ); + hipEventElapsedTime( &time, *start, *stop ); + hipEventDestroy( *start ); + hipEventDestroy( *stop ); +#endif + // user output + printf("GPU_timing %s: Execution Time = %f ms\n",info_str,time); + // returns time + *t = time; } /* ----------------------------------------------------------------------------------------------- */ @@ -1331,6 +1364,22 @@ void FC_FUNC_ (pause_for_debug, pause_for_debugger (1); } +/*----------------------------------------------------------------------------------------------- */ + +// external function wrapper to synchronize GPU from Fortran routine + +extern EXTERN_LANG +void FC_FUNC_ (gpu_synchronize, + GPU_SYNCHRONIZE) () { + TRACE ("gpu_synchronize"); + + // synchronize device kernels + gpuSynchronize(); + + // checks for previous errors + exit_on_gpu_error("gpuSynchronize"); +} + /* ----------------------------------------------------------------------------------------------- */ // MPI synchronization /* ----------------------------------------------------------------------------------------------- */ diff --git a/src/gpu/kernels.gen/compute_add_sources_kernel.cpp b/src/gpu/kernels.gen/compute_add_sources_kernel.cpp index 6a52209c6..759b84fda 100644 --- a/src/gpu/kernels.gen/compute_add_sources_kernel.cpp +++ b/src/gpu/kernels.gen/compute_add_sources_kernel.cpp @@ -81,7 +81,7 @@ #define BLOCKSIZE_TRANSFER 256 #endif -__global__ void compute_add_sources_kernel(float * accel, const int * ibool, const float * sourcearrays, const double * stf_pre_compute, const int myrank, const int * islice_selected_source, const int * ispec_selected_source, const int NSOURCES){ +__global__ void compute_add_sources_kernel(float * accel, const int * ibool, const float * sourcearrays_local, const float * stf_local, const int * ispec_selected_source_local, const int nsources_local, const int NSTEP, const int it, const int istage){ int ispec; int iglob; float stf; @@ -93,14 +93,12 @@ __global__ void compute_add_sources_kernel(float * accel, const int * ibool, con j = threadIdx.y; k = threadIdx.z; isource = blockIdx.x + (gridDim.x) * (blockIdx.y); - if (isource < NSOURCES) { - if (myrank == islice_selected_source[isource]) { - ispec = ispec_selected_source[isource] - (1); - stf = stf_pre_compute[isource]; - iglob = ibool[INDEX4(NGLLX, NGLLX, NGLLX, i, j, k, ispec)] - (1); - atomicAdd(accel + (iglob) * (3) + 0, (sourcearrays[INDEX5(NDIM, NGLLX, NGLLX, NGLLX, 0, i, j, k, isource)]) * (stf)); - atomicAdd(accel + (iglob) * (3) + 1, (sourcearrays[INDEX5(NDIM, NGLLX, NGLLX, NGLLX, 1, i, j, k, isource)]) * (stf)); - atomicAdd(accel + (iglob) * (3) + 2, (sourcearrays[INDEX5(NDIM, NGLLX, NGLLX, NGLLX, 2, i, j, k, isource)]) * (stf)); - } + if (isource < nsources_local) { + ispec = ispec_selected_source_local[isource] - (1); + stf = stf_local[INDEX3(nsources_local, NSTEP, isource, it, istage)]; + iglob = ibool[INDEX4(NGLLX, NGLLX, NGLLX, i, j, k, ispec)] - (1); + atomicAdd(accel + (iglob) * (3) + 0, (sourcearrays_local[INDEX5(NDIM, NGLLX, NGLLX, NGLLX, 0, i, j, k, isource)]) * (stf)); + atomicAdd(accel + (iglob) * (3) + 1, (sourcearrays_local[INDEX5(NDIM, NGLLX, NGLLX, NGLLX, 1, i, j, k, isource)]) * (stf)); + atomicAdd(accel + (iglob) * (3) + 2, (sourcearrays_local[INDEX5(NDIM, NGLLX, NGLLX, NGLLX, 2, i, j, k, isource)]) * (stf)); } } diff --git a/src/gpu/kernels.gen/compute_add_sources_kernel.cu b/src/gpu/kernels.gen/compute_add_sources_kernel.cu index 6a52209c6..759b84fda 100644 --- a/src/gpu/kernels.gen/compute_add_sources_kernel.cu +++ b/src/gpu/kernels.gen/compute_add_sources_kernel.cu @@ -81,7 +81,7 @@ #define BLOCKSIZE_TRANSFER 256 #endif -__global__ void compute_add_sources_kernel(float * accel, const int * ibool, const float * sourcearrays, const double * stf_pre_compute, const int myrank, const int * islice_selected_source, const int * ispec_selected_source, const int NSOURCES){ +__global__ void compute_add_sources_kernel(float * accel, const int * ibool, const float * sourcearrays_local, const float * stf_local, const int * ispec_selected_source_local, const int nsources_local, const int NSTEP, const int it, const int istage){ int ispec; int iglob; float stf; @@ -93,14 +93,12 @@ __global__ void compute_add_sources_kernel(float * accel, const int * ibool, con j = threadIdx.y; k = threadIdx.z; isource = blockIdx.x + (gridDim.x) * (blockIdx.y); - if (isource < NSOURCES) { - if (myrank == islice_selected_source[isource]) { - ispec = ispec_selected_source[isource] - (1); - stf = stf_pre_compute[isource]; - iglob = ibool[INDEX4(NGLLX, NGLLX, NGLLX, i, j, k, ispec)] - (1); - atomicAdd(accel + (iglob) * (3) + 0, (sourcearrays[INDEX5(NDIM, NGLLX, NGLLX, NGLLX, 0, i, j, k, isource)]) * (stf)); - atomicAdd(accel + (iglob) * (3) + 1, (sourcearrays[INDEX5(NDIM, NGLLX, NGLLX, NGLLX, 1, i, j, k, isource)]) * (stf)); - atomicAdd(accel + (iglob) * (3) + 2, (sourcearrays[INDEX5(NDIM, NGLLX, NGLLX, NGLLX, 2, i, j, k, isource)]) * (stf)); - } + if (isource < nsources_local) { + ispec = ispec_selected_source_local[isource] - (1); + stf = stf_local[INDEX3(nsources_local, NSTEP, isource, it, istage)]; + iglob = ibool[INDEX4(NGLLX, NGLLX, NGLLX, i, j, k, ispec)] - (1); + atomicAdd(accel + (iglob) * (3) + 0, (sourcearrays_local[INDEX5(NDIM, NGLLX, NGLLX, NGLLX, 0, i, j, k, isource)]) * (stf)); + atomicAdd(accel + (iglob) * (3) + 1, (sourcearrays_local[INDEX5(NDIM, NGLLX, NGLLX, NGLLX, 1, i, j, k, isource)]) * (stf)); + atomicAdd(accel + (iglob) * (3) + 2, (sourcearrays_local[INDEX5(NDIM, NGLLX, NGLLX, NGLLX, 2, i, j, k, isource)]) * (stf)); } } diff --git a/src/gpu/kernels.gen/compute_add_sources_kernel_cl.c b/src/gpu/kernels.gen/compute_add_sources_kernel_cl.c index e56969838..93bd4c2df 100644 --- a/src/gpu/kernels.gen/compute_add_sources_kernel_cl.c +++ b/src/gpu/kernels.gen/compute_add_sources_kernel_cl.c @@ -93,7 +93,7 @@ inline void atomicAdd(volatile __global float *source, const float val) {\n\ #define BLOCKSIZE_TRANSFER 256\n\ #endif\n\ \n\ -__kernel void compute_add_sources_kernel(__global float * accel, const __global int * ibool, const __global float * sourcearrays, const __global double * stf_pre_compute, const int myrank, const __global int * islice_selected_source, const __global int * ispec_selected_source, const int NSOURCES){\n\ +__kernel void compute_add_sources_kernel(__global float * accel, const __global int * ibool, const __global float * sourcearrays_local, const __global float * stf_local, const __global int * ispec_selected_source_local, const int nsources_local, const int NSTEP, const int it, const int istage){\n\ int ispec;\n\ int iglob;\n\ float stf;\n\ @@ -105,15 +105,13 @@ __kernel void compute_add_sources_kernel(__global float * accel, const __global j = get_local_id(1);\n\ k = get_local_id(2);\n\ isource = get_group_id(0) + (get_num_groups(0)) * (get_group_id(1));\n\ - if (isource < NSOURCES) {\n\ - if (myrank == islice_selected_source[isource]) {\n\ - ispec = ispec_selected_source[isource] - (1);\n\ - stf = stf_pre_compute[isource];\n\ - iglob = ibool[INDEX4(NGLLX, NGLLX, NGLLX, i, j, k, ispec)] - (1);\n\ - atomicAdd(accel + (iglob) * (3) + 0, (sourcearrays[INDEX5(NDIM, NGLLX, NGLLX, NGLLX, 0, i, j, k, isource)]) * (stf));\n\ - atomicAdd(accel + (iglob) * (3) + 1, (sourcearrays[INDEX5(NDIM, NGLLX, NGLLX, NGLLX, 1, i, j, k, isource)]) * (stf));\n\ - atomicAdd(accel + (iglob) * (3) + 2, (sourcearrays[INDEX5(NDIM, NGLLX, NGLLX, NGLLX, 2, i, j, k, isource)]) * (stf));\n\ - }\n\ + if (isource < nsources_local) {\n\ + ispec = ispec_selected_source_local[isource] - (1);\n\ + stf = stf_local[INDEX3(nsources_local, NSTEP, isource, it, istage)];\n\ + iglob = ibool[INDEX4(NGLLX, NGLLX, NGLLX, i, j, k, ispec)] - (1);\n\ + atomicAdd(accel + (iglob) * (3) + 0, (sourcearrays_local[INDEX5(NDIM, NGLLX, NGLLX, NGLLX, 0, i, j, k, isource)]) * (stf));\n\ + atomicAdd(accel + (iglob) * (3) + 1, (sourcearrays_local[INDEX5(NDIM, NGLLX, NGLLX, NGLLX, 1, i, j, k, isource)]) * (stf));\n\ + atomicAdd(accel + (iglob) * (3) + 2, (sourcearrays_local[INDEX5(NDIM, NGLLX, NGLLX, NGLLX, 2, i, j, k, isource)]) * (stf));\n\ }\n\ }\n\ "; diff --git a/src/gpu/kernels.gen/compute_seismograms_kernel.cpp b/src/gpu/kernels.gen/compute_seismograms_kernel.cpp index e2f3556ee..e691fb6ba 100644 --- a/src/gpu/kernels.gen/compute_seismograms_kernel.cpp +++ b/src/gpu/kernels.gen/compute_seismograms_kernel.cpp @@ -48,7 +48,7 @@ #define NDIM 3 #endif #ifndef NGLLX -#define NGLLX false +#define NGLLX 5 #endif #ifndef NGLL2 #define NGLL2 25 @@ -81,7 +81,7 @@ #define BLOCKSIZE_TRANSFER 256 #endif -__global__ void compute_seismograms_kernel(const int nrec_local, const float * displ, const int * d_ibool, const float * hxir, const float * hetar, const float * hgammar, float * seismograms, const float * nu, const int * ispec_selected_rec, const int * number_receiver_global, const float scale_displ){ +__global__ void compute_seismograms_kernel(const int nrec_local, const float * displ, const int * d_ibool, const float * hxir, const float * hetar, const float * hgammar, float * seismograms, const float * nu, const int * ispec_selected_rec, const int * number_receiver_global, const float scale_displ, const int seismo_current){ int ispec; int iglob; int irec_local; @@ -93,6 +93,7 @@ __global__ void compute_seismograms_kernel(const int nrec_local, const float * d int k; int l; int s; + int idx; __shared__ float sh_dxd[(NGLL3_PADDED)]; __shared__ float sh_dyd[(NGLL3_PADDED)]; __shared__ float sh_dzd[(NGLL3_PADDED)]; @@ -178,14 +179,16 @@ __global__ void compute_seismograms_kernel(const int nrec_local, const float * d __syncthreads(); l = (l) * (2); + idx = INDEX3(NDIM, nrec_local, 0, irec_local, seismo_current); + if (tx == 0) { - seismograms[(irec_local) * (3) + 0] = (scale_displ) * ((nu[((irec_local) * (3)) * (3) + 0]) * (sh_dxd[0]) + (nu[((irec_local) * (3) + 1) * (3) + 0]) * (sh_dyd[0]) + (nu[((irec_local) * (3) + 2) * (3) + 0]) * (sh_dzd[0])); + seismograms[idx + 0] = (scale_displ) * ((nu[((irec_local) * (3)) * (3) + 0]) * (sh_dxd[0]) + (nu[((irec_local) * (3) + 1) * (3) + 0]) * (sh_dyd[0]) + (nu[((irec_local) * (3) + 2) * (3) + 0]) * (sh_dzd[0])); } if (tx == 1) { - seismograms[(irec_local) * (3) + 1] = (scale_displ) * ((nu[((irec_local) * (3)) * (3) + 1]) * (sh_dxd[0]) + (nu[((irec_local) * (3) + 1) * (3) + 1]) * (sh_dyd[0]) + (nu[((irec_local) * (3) + 2) * (3) + 1]) * (sh_dzd[0])); + seismograms[idx + 1] = (scale_displ) * ((nu[((irec_local) * (3)) * (3) + 1]) * (sh_dxd[0]) + (nu[((irec_local) * (3) + 1) * (3) + 1]) * (sh_dyd[0]) + (nu[((irec_local) * (3) + 2) * (3) + 1]) * (sh_dzd[0])); } if (tx == 2) { - seismograms[(irec_local) * (3) + 2] = (scale_displ) * ((nu[((irec_local) * (3)) * (3) + 2]) * (sh_dxd[0]) + (nu[((irec_local) * (3) + 1) * (3) + 2]) * (sh_dyd[0]) + (nu[((irec_local) * (3) + 2) * (3) + 2]) * (sh_dzd[0])); + seismograms[idx + 2] = (scale_displ) * ((nu[((irec_local) * (3)) * (3) + 2]) * (sh_dxd[0]) + (nu[((irec_local) * (3) + 1) * (3) + 2]) * (sh_dyd[0]) + (nu[((irec_local) * (3) + 2) * (3) + 2]) * (sh_dzd[0])); } } } diff --git a/src/gpu/kernels.gen/compute_seismograms_kernel.cu b/src/gpu/kernels.gen/compute_seismograms_kernel.cu index e2f3556ee..e691fb6ba 100644 --- a/src/gpu/kernels.gen/compute_seismograms_kernel.cu +++ b/src/gpu/kernels.gen/compute_seismograms_kernel.cu @@ -48,7 +48,7 @@ #define NDIM 3 #endif #ifndef NGLLX -#define NGLLX false +#define NGLLX 5 #endif #ifndef NGLL2 #define NGLL2 25 @@ -81,7 +81,7 @@ #define BLOCKSIZE_TRANSFER 256 #endif -__global__ void compute_seismograms_kernel(const int nrec_local, const float * displ, const int * d_ibool, const float * hxir, const float * hetar, const float * hgammar, float * seismograms, const float * nu, const int * ispec_selected_rec, const int * number_receiver_global, const float scale_displ){ +__global__ void compute_seismograms_kernel(const int nrec_local, const float * displ, const int * d_ibool, const float * hxir, const float * hetar, const float * hgammar, float * seismograms, const float * nu, const int * ispec_selected_rec, const int * number_receiver_global, const float scale_displ, const int seismo_current){ int ispec; int iglob; int irec_local; @@ -93,6 +93,7 @@ __global__ void compute_seismograms_kernel(const int nrec_local, const float * d int k; int l; int s; + int idx; __shared__ float sh_dxd[(NGLL3_PADDED)]; __shared__ float sh_dyd[(NGLL3_PADDED)]; __shared__ float sh_dzd[(NGLL3_PADDED)]; @@ -178,14 +179,16 @@ __global__ void compute_seismograms_kernel(const int nrec_local, const float * d __syncthreads(); l = (l) * (2); + idx = INDEX3(NDIM, nrec_local, 0, irec_local, seismo_current); + if (tx == 0) { - seismograms[(irec_local) * (3) + 0] = (scale_displ) * ((nu[((irec_local) * (3)) * (3) + 0]) * (sh_dxd[0]) + (nu[((irec_local) * (3) + 1) * (3) + 0]) * (sh_dyd[0]) + (nu[((irec_local) * (3) + 2) * (3) + 0]) * (sh_dzd[0])); + seismograms[idx + 0] = (scale_displ) * ((nu[((irec_local) * (3)) * (3) + 0]) * (sh_dxd[0]) + (nu[((irec_local) * (3) + 1) * (3) + 0]) * (sh_dyd[0]) + (nu[((irec_local) * (3) + 2) * (3) + 0]) * (sh_dzd[0])); } if (tx == 1) { - seismograms[(irec_local) * (3) + 1] = (scale_displ) * ((nu[((irec_local) * (3)) * (3) + 1]) * (sh_dxd[0]) + (nu[((irec_local) * (3) + 1) * (3) + 1]) * (sh_dyd[0]) + (nu[((irec_local) * (3) + 2) * (3) + 1]) * (sh_dzd[0])); + seismograms[idx + 1] = (scale_displ) * ((nu[((irec_local) * (3)) * (3) + 1]) * (sh_dxd[0]) + (nu[((irec_local) * (3) + 1) * (3) + 1]) * (sh_dyd[0]) + (nu[((irec_local) * (3) + 2) * (3) + 1]) * (sh_dzd[0])); } if (tx == 2) { - seismograms[(irec_local) * (3) + 2] = (scale_displ) * ((nu[((irec_local) * (3)) * (3) + 2]) * (sh_dxd[0]) + (nu[((irec_local) * (3) + 1) * (3) + 2]) * (sh_dyd[0]) + (nu[((irec_local) * (3) + 2) * (3) + 2]) * (sh_dzd[0])); + seismograms[idx + 2] = (scale_displ) * ((nu[((irec_local) * (3)) * (3) + 2]) * (sh_dxd[0]) + (nu[((irec_local) * (3) + 1) * (3) + 2]) * (sh_dyd[0]) + (nu[((irec_local) * (3) + 2) * (3) + 2]) * (sh_dzd[0])); } } } diff --git a/src/gpu/kernels.gen/compute_seismograms_kernel_cl.c b/src/gpu/kernels.gen/compute_seismograms_kernel_cl.c index 24aa921fc..7423577c1 100644 --- a/src/gpu/kernels.gen/compute_seismograms_kernel_cl.c +++ b/src/gpu/kernels.gen/compute_seismograms_kernel_cl.c @@ -92,7 +92,7 @@ inline void atomicAdd(volatile __global float *source, const float val) {\n\ #define BLOCKSIZE_TRANSFER 256\n\ #endif\n\ \n\ -__kernel void compute_seismograms_kernel(const int nrec_local, const __global float * displ, const __global int * d_ibool, const __global float * hxir, const __global float * hetar, const __global float * hgammar, __global float * seismograms, const __global float * nu, const __global int * ispec_selected_rec, const __global int * number_receiver_global, const float scale_displ){\n\ +__kernel void compute_seismograms_kernel(const int nrec_local, const __global float * displ, const __global int * d_ibool, const __global float * hxir, const __global float * hetar, const __global float * hgammar, __global float * seismograms, const __global float * nu, const __global int * ispec_selected_rec, const __global int * number_receiver_global, const float scale_displ, const int seismo_current){\n\ int ispec;\n\ int iglob;\n\ int irec_local;\n\ @@ -104,6 +104,7 @@ __kernel void compute_seismograms_kernel(const int nrec_local, const __global fl int k;\n\ int l;\n\ int s;\n\ + int idx;\n\ __local float sh_dxd[(NGLL3_PADDED)];\n\ __local float sh_dyd[(NGLL3_PADDED)];\n\ __local float sh_dzd[(NGLL3_PADDED)];\n\ @@ -188,15 +189,17 @@ __kernel void compute_seismograms_kernel(const int nrec_local, const __global fl }\n\ barrier(CLK_LOCAL_MEM_FENCE);\n\ l = (l) * (2);\n\ +\n\ + idx = INDEX3(NDIM, nrec_local, 0, irec_local, seismo_current);\n\ \n\ if (tx == 0) {\n\ - seismograms[(irec_local) * (3) + 0] = (scale_displ) * ((nu[((irec_local) * (3)) * (3) + 0]) * (sh_dxd[0]) + (nu[((irec_local) * (3) + 1) * (3) + 0]) * (sh_dyd[0]) + (nu[((irec_local) * (3) + 2) * (3) + 0]) * (sh_dzd[0]));\n\ + seismograms[idx + 0] = (scale_displ) * ((nu[((irec_local) * (3)) * (3) + 0]) * (sh_dxd[0]) + (nu[((irec_local) * (3) + 1) * (3) + 0]) * (sh_dyd[0]) + (nu[((irec_local) * (3) + 2) * (3) + 0]) * (sh_dzd[0]));\n\ }\n\ if (tx == 1) {\n\ - seismograms[(irec_local) * (3) + 1] = (scale_displ) * ((nu[((irec_local) * (3)) * (3) + 1]) * (sh_dxd[0]) + (nu[((irec_local) * (3) + 1) * (3) + 1]) * (sh_dyd[0]) + (nu[((irec_local) * (3) + 2) * (3) + 1]) * (sh_dzd[0]));\n\ + seismograms[idx + 1] = (scale_displ) * ((nu[((irec_local) * (3)) * (3) + 1]) * (sh_dxd[0]) + (nu[((irec_local) * (3) + 1) * (3) + 1]) * (sh_dyd[0]) + (nu[((irec_local) * (3) + 2) * (3) + 1]) * (sh_dzd[0]));\n\ }\n\ if (tx == 2) {\n\ - seismograms[(irec_local) * (3) + 2] = (scale_displ) * ((nu[((irec_local) * (3)) * (3) + 2]) * (sh_dxd[0]) + (nu[((irec_local) * (3) + 1) * (3) + 2]) * (sh_dyd[0]) + (nu[((irec_local) * (3) + 2) * (3) + 2]) * (sh_dzd[0]));\n\ + seismograms[idx + 2] = (scale_displ) * ((nu[((irec_local) * (3)) * (3) + 2]) * (sh_dxd[0]) + (nu[((irec_local) * (3) + 1) * (3) + 2]) * (sh_dyd[0]) + (nu[((irec_local) * (3) + 2) * (3) + 2]) * (sh_dzd[0]));\n\ }\n\ }\n\ }\n\ diff --git a/src/gpu/kernels.gen/crust_mantle_impl_kernel_forward.cpp b/src/gpu/kernels.gen/crust_mantle_impl_kernel_forward.cpp index 14c66d904..ed1f8689d 100644 --- a/src/gpu/kernels.gen/crust_mantle_impl_kernel_forward.cpp +++ b/src/gpu/kernels.gen/crust_mantle_impl_kernel_forward.cpp @@ -82,6 +82,10 @@ #endif +#ifdef MANUALLY_UNROLLED_LOOPS +#pragma message ("\n\nCompiling with: MANUALLY_UNROLLED_LOOPS enabled\n") +#endif // MANUALLY_UNROLLED_LOOPS + static __device__ void compute_element_cm_att_stress(const int tx, const int working_element, const float * R_xx, const float * R_yy, const float * R_xy, const float * R_xz, const float * R_yz, float * sigma_xx, float * sigma_yy, float * sigma_zz, float * sigma_xy, float * sigma_xz, float * sigma_yz){ int offset; diff --git a/src/gpu/kernels.gen/kernel_proto.cu.h b/src/gpu/kernels.gen/kernel_proto.cu.h index 6cf3a6263..d482c0f40 100644 --- a/src/gpu/kernels.gen/kernel_proto.cu.h +++ b/src/gpu/kernels.gen/kernel_proto.cu.h @@ -2,7 +2,7 @@ __global__ void assemble_boundary_accel_on_device(float * d_accel, const float * __global__ void assemble_boundary_potential_on_device(float * d_potential_dot_dot_acoustic, const float * d_send_potential_dot_dot_buffer, const int num_interfaces, const int max_nibool_interfaces, const int * d_nibool_interfaces, const int * d_ibool_interfaces); __global__ void compute_acoustic_kernel(const int * ibool, const float * rhostore, const float * kappastore, const float * hprime_xx, const float * d_xix, const float * d_xiy, const float * d_xiz, const float * d_etax, const float * d_etay, const float * d_etaz, const float * d_gammax, const float * d_gammay, const float * d_gammaz, const float * potential_dot_dot_acoustic, const float * b_potential_acoustic, const float * b_potential_dot_dot_acoustic, float * rho_ac_kl, float * kappa_ac_kl, const float deltat, const int NSPEC); __global__ void compute_add_sources_adjoint_kernel(float * accel, const float * source_adjoint, const float * xir, const float * etar, const float * gammar, const int * ibool, const int * ispec_selected_rec, const int * number_adjsources_global, const int nadj_rec_local); -__global__ void compute_add_sources_kernel(float * accel, const int * ibool, const float * sourcearrays, const double * stf_pre_compute, const int myrank, const int * islice_selected_source, const int * ispec_selected_source, const int NSOURCES); +__global__ void compute_add_sources_kernel(float * accel, const int * ibool, const float * sourcearrays_local, const float * stf_local, const int * ispec_selected_source_local, const int nsources_local, const int NSTEP, const int it, const int istage); __global__ void compute_ani_kernel(const float * epsilondev_xx, const float * epsilondev_yy, const float * epsilondev_xy, const float * epsilondev_xz, const float * epsilondev_yz, const float * epsilon_trace_over_3, const float * b_epsilondev_xx, const float * b_epsilondev_yy, const float * b_epsilondev_xy, const float * b_epsilondev_xz, const float * b_epsilondev_yz, const float * b_epsilon_trace_over_3, float * cijkl_kl, const int NSPEC, const float deltat); __global__ void compute_ani_undoatt_kernel(const float * epsilondev_xx, const float * epsilondev_yy, const float * epsilondev_xy, const float * epsilondev_xz, const float * epsilondev_yz, const float * epsilon_trace_over_3, float * cijkl_kl, const int NSPEC, const float deltat, const int * d_ibool, const float * d_b_displ, const float * d_xix, const float * d_xiy, const float * d_xiz, const float * d_etax, const float * d_etay, const float * d_etaz, const float * d_gammax, const float * d_gammay, const float * d_gammaz, const float * d_hprime_xx); __global__ void compute_coupling_CMB_fluid_kernel(const float * displ_crust_mantle, float * accel_crust_mantle, const float * accel_outer_core, const int * ibool_crust_mantle, const int * ibelm_bottom_crust_mantle, const float * normal_top_outer_core, const float * jacobian2D_top_outer_core, const float * wgllwgll_xy, const int * ibool_outer_core, const int * ibelm_top_outer_core, const float RHO_TOP_OC, const float minus_g_cmb, const int GRAVITY, const int NSPEC2D_BOTTOM_CM); @@ -15,7 +15,7 @@ __global__ void compute_iso_kernel(const float * epsilondev_xx, const float * ep __global__ void compute_iso_undoatt_kernel(const float * epsilondev_xx, const float * epsilondev_yy, const float * epsilondev_xy, const float * epsilondev_xz, const float * epsilondev_yz, const float * epsilon_trace_over_3, float * mu_kl, float * kappa_kl, const int NSPEC, const float deltat, const int * d_ibool, const float * d_b_displ, const float * d_xix, const float * d_xiy, const float * d_xiz, const float * d_etax, const float * d_etay, const float * d_etaz, const float * d_gammax, const float * d_gammay, const float * d_gammaz, const float * d_hprime_xx); __global__ void compute_kappa_mu_hess_kernel(const int * d_ibool, const float * d_veloc, const float * d_b_veloc, const float * d_xix, const float * d_xiy, const float * d_xiz, const float * d_etax, const float * d_etay, const float * d_etaz, const float * d_gammax, const float * d_gammay, const float * d_gammaz, const float * d_hprime_xx, const float deltat, float * hess_rho_kl, float * hess_kappa_kl, float * hess_mu_kl, const int NSPEC, const int USE_SOURCE_RECEIVER_HESSIAN); __global__ void compute_rho_kernel(const int * ibool, const float * accel, const float * b_displ, float * rho_kl, const int NSPEC, const float deltat); -__global__ void compute_seismograms_kernel(const int nrec_local, const float * displ, const int * d_ibool, const float * hxir, const float * hetar, const float * hgammar, float * seismograms, const float * nu, const int * ispec_selected_rec, const int * number_receiver_global, const float scale_displ); +__global__ void compute_seismograms_kernel(const int nrec_local, const float * displ, const int * d_ibool, const float * hxir, const float * hetar, const float * hgammar, float * seismograms, const float * nu, const int * ispec_selected_rec, const int * number_receiver_global, const float scale_displ, const int seismo_current); __global__ void compute_stacey_acoustic_backward_kernel(float * b_potential_dot_dot_acoustic, const float * b_absorb_potential, const int num_abs_boundary_faces, const int * abs_boundary_ispec, const int * abs_boundary_npoin, const int * abs_boundary_ijk, const int * ibool); __global__ void compute_stacey_acoustic_kernel(const float * potential_dot_acoustic, float * potential_dot_dot_acoustic, const int num_abs_boundary_faces, const int * abs_boundary_ispec, const int * abs_boundary_npoin, const int * abs_boundary_ijk, const float * abs_boundary_jacobian2Dw, const int * ibool, const float * vpstore, const int SAVE_STACEY, float * b_absorb_potential); __global__ void compute_stacey_elastic_backward_kernel(float * b_accel, const float * b_absorb_field, const int num_abs_boundary_faces, const int * abs_boundary_ispec, const int * abs_boundary_npoin, const int * abs_boundary_ijk, const int * ibool); diff --git a/src/gpu/mesh_constants_cuda.h b/src/gpu/mesh_constants_cuda.h index cf6221bb3..6525c2dca 100644 --- a/src/gpu/mesh_constants_cuda.h +++ b/src/gpu/mesh_constants_cuda.h @@ -38,6 +38,8 @@ // leads up to ~10% performance increase in OpenCL and ~1% in Cuda #define MANUALLY_UNROLLED_LOOPS // uncomment to use loops +/*----------------------------------------------------------------------------------------------- */ + // definitions typedef cudaEvent_t gpu_event; typedef cudaStream_t gpu_stream; diff --git a/src/gpu/mesh_constants_gpu.h b/src/gpu/mesh_constants_gpu.h index af126018d..9a62b0d2b 100644 --- a/src/gpu/mesh_constants_gpu.h +++ b/src/gpu/mesh_constants_gpu.h @@ -144,6 +144,9 @@ typedef double realw; // debug: outputs maximum and preferred work group size (for OpenCL kernels) #define DEBUG_KERNEL_WORK_GROUP_SIZE 0 +// performance timers +#define GPU_TIMING 0 + // error checking after cuda function calls // (note: this synchronizes many calls, thus e.g. no asynchronous memcpy possible) #define ENABLE_VERY_SLOW_ERROR_CHECKING 0 @@ -939,10 +942,24 @@ typedef struct mesh_ { // sources // ------------------------------------------------------------------ // int nsources_local; - gpu_realw_mem d_sourcearrays; - gpu_double_mem d_stf_pre_compute; - gpu_int_mem d_islice_selected_source; - gpu_int_mem d_ispec_selected_source; + int NSTEP; + int NSTAGES; + gpu_int_mem d_ispec_selected_source_local; + gpu_realw_mem d_sourcearrays_local; + gpu_realw_mem d_stf_local; + + // for LDDRK and non-undo-attenuation cases + int use_b_stf; + gpu_realw_mem d_b_stf_local; + + // full NSOURCES arrays not needed anymore... + // we will move to store only local source arrays with full source time function traces + // to avoid memory copies before the compute_add_sources() kernel, as those memcopies synchronize streams and will slow down performance... + //gpu_realw_mem d_sourcearrays; + //gpu_double_mem d_stf_pre_compute; + //gpu_int_mem d_islice_selected_source; + + gpu_int_mem d_ispec_selected_source; // needed only for pure adjoint simulation cases // ------------------------------------------------------------------ // // receivers @@ -1285,6 +1302,11 @@ void exit_on_error (const char *info); void synchronize_mpi (); double get_time_val (); +// event timing +void start_timing_gpu(gpu_event* start, gpu_event* stop); +void stop_timing_gpu(gpu_event* start, gpu_event* stop, const char* info_str); +void stop_timing_gpu_t(gpu_event* start, gpu_event* stop, const char* info_str, realw* t); + // defined in check_fields_gpu.c void get_free_memory (double *free_db, double *used_db, double *total_db); realw get_device_array_maximum_value (gpu_realw_mem d_array, int size); diff --git a/src/gpu/mesh_constants_hip.h b/src/gpu/mesh_constants_hip.h index 8cc94a489..f9409bbe2 100644 --- a/src/gpu/mesh_constants_hip.h +++ b/src/gpu/mesh_constants_hip.h @@ -35,14 +35,23 @@ #ifdef USE_HIP // (optional) unrolling loops -// not tested yet for hip -#undef MANUALLY_UNROLLED_LOOPS // default to none +// leads up to ~3% performance increase with HIP +#define MANUALLY_UNROLLED_LOOPS // uncomment to use loops // AMD MI100 #ifdef GPU_DEVICE_MI100 #undef USE_LAUNCH_BOUNDS #endif +// AMD MI250X +#ifdef GPU_DEVICE_MI250 +#undef USE_LAUNCH_BOUNDS +//#define USE_LAUNCH_BOUNDS // will slow down kernels... +//#define LAUNCH_MIN_BLOCKS 7 +#endif + +/*----------------------------------------------------------------------------------------------- */ + // definitions typedef hipEvent_t gpu_event; typedef hipStream_t gpu_stream; diff --git a/src/gpu/prepare_mesh_constants_gpu.c b/src/gpu/prepare_mesh_constants_gpu.c index ab1e82743..a902ad7f4 100644 --- a/src/gpu/prepare_mesh_constants_gpu.c +++ b/src/gpu/prepare_mesh_constants_gpu.c @@ -82,9 +82,12 @@ void FC_FUNC_ (prepare_constants_device, int *h_NGLLX, realw *h_hprime_xx, realw *h_hprimewgll_xx, realw *h_wgllwgll_xy, realw *h_wgllwgll_xz, realw *h_wgllwgll_yz, - int *NSOURCES, int *nsources_local, - realw *h_sourcearrays, - int *h_islice_selected_source, int *h_ispec_selected_source, + int *NSOURCES, + int *nsources_local, + realw *h_sourcearrays_local, + realw *h_stf_local, realw *h_b_stf_local, + int *h_ispec_selected_source_local, + int *h_ispec_selected_source, int *nrec, int *nrec_local, int *h_number_receiver_global, int *h_islice_selected_rec, int *h_ispec_selected_rec, @@ -110,11 +113,18 @@ void FC_FUNC_ (prepare_constants_device, realw *deltat_f, int *GPU_ASYNC_COPY_f, double * h_hxir_store,double * h_hetar_store,double * h_hgammar_store,double * h_nu, + int* nlength_seismogram, int *SAVE_SEISMOGRAMS_STRAIN_f, - int *CUSTOM_REAL_f) { + int *CUSTOM_REAL_f, + int *USE_LDDRK_f, + int *NSTEP_f, int *NSTAGES_f) { TRACE ("prepare_constants_device"); + int size,size_padded; + int num_blocks_x,num_blocks_y; + int size_block_norm,size_block_norm_strain; + // allocates mesh parameter structure Mesh *mp = (Mesh *) malloc (sizeof (Mesh)); @@ -305,7 +315,7 @@ void FC_FUNC_ (prepare_constants_device, mp->NSPEC_INNER_CORE_STRAIN_ONLY = *NSPEC_INNER_CORE_STRAIN_ONLY; // simulation flags initialization - mp->use_lddrk = 0; + mp->use_lddrk = *USE_LDDRK_f; mp->save_forward = *SAVE_FORWARD_f; mp->absorbing_conditions = *ABSORBING_CONDITIONS_f; mp->oceans = *OCEANS_f; @@ -353,16 +363,41 @@ void FC_FUNC_ (prepare_constants_device, } // sources - mp->nsources_local = *nsources_local; + mp->nsources_local = 0; if (mp->simulation_type == 1 || mp->simulation_type == 3) { - // not needed in case of pure adjoint simulations (SIMULATION_TYPE == 2) - gpuCreateCopy_todevice_realw (&mp->d_sourcearrays, h_sourcearrays, (*NSOURCES) * NDIM * NGLL3); + // only add CMT source for non-noise simulations and forward/kernel simulations + if (mp->noise_tomography == 0) { + mp->nsources_local = *nsources_local; + } + } + mp->NSTEP = *NSTEP_f; + mp->NSTAGES = *NSTAGES_f; + + // source arrays + // not needed in case of pure adjoint simulations (SIMULATION_TYPE == 2) + // full NSOURCES arrays not needed anymore... + //gpuCreateCopy_todevice_realw (&mp->d_sourcearrays, h_sourcearrays, (*NSOURCES) * NDIM * NGLL3); + //gpuMalloc_double (&mp->d_stf_pre_compute, *NSOURCES); + //gpuCreateCopy_todevice_int (&mp->d_islice_selected_source, h_islice_selected_source, *NSOURCES); + // only needed for pure adjoint simulation cases + if (mp->simulation_type == 2){ + gpuCreateCopy_todevice_int (&mp->d_ispec_selected_source, h_ispec_selected_source, *NSOURCES); + } + + // local sources only... + mp->use_b_stf = 0; + if (mp->nsources_local > 0){ // allocates buffer on GPU for source time function values - gpuMalloc_double (&mp->d_stf_pre_compute, *NSOURCES); + gpuCreateCopy_todevice_realw (&mp->d_sourcearrays_local, h_sourcearrays_local, mp->nsources_local * NDIM * NGLL3); + gpuCreateCopy_todevice_realw (&mp->d_stf_local, h_stf_local, mp->nsources_local * mp->NSTEP * mp->NSTAGES); + gpuCreateCopy_todevice_int (&mp->d_ispec_selected_source_local, h_ispec_selected_source_local, mp->nsources_local); + // additional array for LDDRK and backward stepping + if (mp->simulation_type == 3 && mp->use_lddrk && (! mp->undo_attenuation)){ + mp->use_b_stf = 1; + gpuCreateCopy_todevice_realw (&mp->d_b_stf_local, h_b_stf_local, mp->nsources_local * mp->NSTEP * mp->NSTAGES); + } } - gpuCreateCopy_todevice_int (&mp->d_islice_selected_source, h_islice_selected_source, *NSOURCES); - gpuCreateCopy_todevice_int (&mp->d_ispec_selected_source, h_ispec_selected_source, *NSOURCES); // receiver stations // note that: size (number_receiver_global) = nrec_local @@ -371,13 +406,14 @@ void FC_FUNC_ (prepare_constants_device, mp->nrec_local = *nrec_local; if (mp->nrec_local > 0) { gpuCreateCopy_todevice_int (&mp->d_number_receiver_global, h_number_receiver_global, mp->nrec_local); + // for seismograms if (mp->simulation_type == 1 || mp->simulation_type == 3 ) { // forward/kernel simulations realw * xir = (realw *)malloc(NGLLX * mp->nrec_local*sizeof(realw)); realw * etar = (realw *)malloc(NGLLX * mp->nrec_local*sizeof(realw)); realw * gammar = (realw *)malloc(NGLLX * mp->nrec_local*sizeof(realw)); - // converts to double to realw arrays, assumes NGLLX == NGLLY == NGLLZ + // converts from double to realw arrays, assumes NGLLX == NGLLY == NGLLZ for (int i=0;inrec_local;i++){ xir[i] = (realw)h_hxir_store[i]; etar[i] = (realw)h_hetar_store[i]; @@ -391,7 +427,10 @@ void FC_FUNC_ (prepare_constants_device, free(gammar); // local seismograms - gpuMalloc_realw (&mp->d_seismograms, NDIM * mp->nrec_local); + // full seismograms length (considering sub-sampling) + size = (*nlength_seismogram) * mp->nrec_local; + gpuMalloc_realw (&mp->d_seismograms, NDIM * size); + gpuMemset_realw (&mp->d_seismograms, NDIM * size, 0); // orientation realw* nu; @@ -465,12 +504,7 @@ void FC_FUNC_ (prepare_constants_device, // buffer for norm checking at every timestamp int blocksize = BLOCKSIZE_TRANSFER; - int size,size_padded; - int num_blocks_x,num_blocks_y; - // buffer for crust_mantle arrays has maximum size - int size_block_norm,size_block_norm_strain; - // norm arrays size = mp->NGLOB_CRUST_MANTLE; @@ -626,6 +660,7 @@ void FC_FUNC_ (prepare_constants_adjoint_device, if (mp->h_stf_array_adjoint == NULL) exit_on_error ("h_stf_array_adjoint not allocated\n"); } gpuMalloc_realw (&mp->d_stf_array_adjoint, mp->nadj_rec_local * NDIM ); + gpuMemset_realw (&mp->d_stf_array_adjoint, mp->nadj_rec_local * NDIM, 0); } // synchronizes gpu calls @@ -1042,6 +1077,7 @@ void FC_FUNC_ (prepare_fields_absorb_device, // boundary buffer if (mp->save_stacey) { gpuMalloc_realw (&mp->d_absorb_buffer_crust_mantle, NDIM * NGLLSQUARE * num_abs_boundary_faces); + gpuMemset_realw (&mp->d_absorb_buffer_crust_mantle, NDIM * NGLLSQUARE * num_abs_boundary_faces, 0); } } @@ -1068,6 +1104,7 @@ void FC_FUNC_ (prepare_fields_absorb_device, // boundary buffer if (mp->save_stacey) { gpuMalloc_realw (&mp->d_absorb_buffer_outer_core, NGLLSQUARE * num_abs_boundary_faces); + gpuMemset_realw (&mp->d_absorb_buffer_outer_core, NGLLSQUARE * num_abs_boundary_faces, 0); } } @@ -1344,6 +1381,7 @@ void FC_FUNC_ (prepare_fields_noise_device, // alloc storage for the surface buffer to be copied gpuMalloc_realw (&mp->d_noise_surface_movie, NDIM * NGLL2 * mp->nspec2D_top_crust_mantle); + gpuMemset_realw (&mp->d_noise_surface_movie, NDIM * NGLL2 * mp->nspec2D_top_crust_mantle, 0); } else { // for global mesh: each crust/mantle slice should have at top a free surface @@ -1352,7 +1390,9 @@ void FC_FUNC_ (prepare_fields_noise_device, // prepares noise source array if (mp->noise_tomography == 1) { - gpuCreateCopy_todevice_realw (&mp->d_noise_sourcearray, noise_sourcearray, NDIM*NGLL3 * (*NSTEP)); + // checks with setup NSTEP + if (mp->NSTEP != *NSTEP){ exit_on_error("Error invalid NSTEP setup for prepare_fields_noise_device() routine"); } + gpuCreateCopy_todevice_realw (&mp->d_noise_sourcearray, noise_sourcearray, NDIM * NGLL3 * mp->NSTEP); } // prepares noise directions @@ -1435,8 +1475,8 @@ void FC_FUNC_ (prepare_lddrk_device, Mesh *mp = (Mesh *) *Mesh_pointer_f; size_t size; - // sets flag - mp->use_lddrk = 1; + // checks flag + if (! mp->use_lddrk){ exit_on_error("Flag use_lddrk is not set properly for routine prepare_lddrk_device()"); } // note: we don't support yet reading initial wavefields for re-starting simulations with LDDRK. // this would require to store and copy also the **_lddrk wavefields to the restart files which is not done yet. @@ -2712,12 +2752,14 @@ void FC_FUNC_ (prepare_cleanup_device, //------------------------------------------ // sources //------------------------------------------ - if (mp->simulation_type == 1 || mp->simulation_type == 3) { - gpuFree (&mp->d_sourcearrays); - gpuFree (&mp->d_stf_pre_compute); + if (mp->nsources_local > 0){ + gpuFree (&mp->d_sourcearrays_local); + gpuFree (&mp->d_stf_local); + gpuFree (&mp->d_ispec_selected_source_local); + } + if (mp->simulation_type == 2){ + gpuFree (&mp->d_ispec_selected_source); } - gpuFree (&mp->d_islice_selected_source); - gpuFree (&mp->d_ispec_selected_source); //------------------------------------------ // receivers diff --git a/src/gpu/specfem3D_gpu_method_stubs.c b/src/gpu/specfem3D_gpu_method_stubs.c index c8aad67e6..592c9a0ab 100644 --- a/src/gpu/specfem3D_gpu_method_stubs.c +++ b/src/gpu/specfem3D_gpu_method_stubs.c @@ -110,13 +110,13 @@ void FC_FUNC_ (check_norm_strain_from_device, void FC_FUNC_ (compute_add_sources_gpu, COMPUTE_ADD_SOURCES_GPU) (long *Mesh_pointer_f, - int *NSOURCESf, - double *h_stf_pre_compute) {} + int *it_f, + int *istage_f) {} void FC_FUNC_ (compute_add_sources_backward_gpu, COMPUTE_ADD_SOURCES_BACKWARD_GPU) (long *Mesh_pointer_f, - int *NSOURCESf, - double *h_stf_pre_compute) {} + int *it_tmp_f, + int *istage_f) {} void FC_FUNC_ (compute_add_sources_adjoint_gpu, COMPUTE_ADD_SOURCES_ADJOINT_GPU) (long *Mesh_pointer_f) {} @@ -224,14 +224,13 @@ void FC_FUNC_ (resort_array, // void FC_FUNC_ (compute_seismograms_gpu, - COMPUTE_SEISMOGRAMS_GPU) (long *Mesh_pointer_f, + COMPUTE_SEISMOGRAMS_GPU) (long *Mesh_pointer, realw* seismograms, - int* seismo_currentf, - int* itf, - int* it_endf, - double* scale_displf, - int* NTSTEP_BETWEEN_OUTPUT_SEISMOSf, - int* NSTEPf) {} + int* seismo_current_f, + int* it_f, + int* it_end_f, + double* scale_displ_f, + int* nlength_seismogram_f) {} // @@ -286,6 +285,9 @@ void FC_FUNC_ (compute_strain_gpu, void FC_FUNC_ (pause_for_debug, PAUSE_FOR_DEBUG) () {} +void FC_FUNC_ (gpu_synchronize, + GPU_SYNCHRONIZE) () {} + void FC_FUNC_ (allocate_gpu_buffer, ALLOCATE_GPU_BUFFER) (realw** buffer_f, int* total_size) {} @@ -334,9 +336,12 @@ void FC_FUNC_ (prepare_constants_device, int *h_NGLLX, realw *h_hprime_xx, realw *h_hprimewgll_xx, realw *h_wgllwgll_xy, realw *h_wgllwgll_xz, realw *h_wgllwgll_yz, - int *NSOURCES, int *nsources_local, - realw *h_sourcearrays, - int *h_islice_selected_source, int *h_ispec_selected_source, + int *NSOURCES, + int *nsources_local, + realw *h_sourcearrays_local, + realw *h_stf_local, realw *h_b_stf_local, + int *h_ispec_selected_source_local, + int *h_ispec_selected_source, int *nrec, int *nrec_local, int *h_number_receiver_global, int *h_islice_selected_rec, int *h_ispec_selected_rec, @@ -362,8 +367,11 @@ void FC_FUNC_ (prepare_constants_device, realw *deltat_f, int *GPU_ASYNC_COPY_f, double * h_hxir_store,double * h_hetar_store,double * h_hgammar_store,double * h_nu, + int* nlength_seismogram, int *SAVE_SEISMOGRAMS_STRAIN_f, - int *CUSTOM_REAL_f) {} + int *CUSTOM_REAL_f, + int *USE_LDDRK_f, + int *NSTEP_f, int *NSTAGES_f) {} void FC_FUNC_ (prepare_constants_adjoint_device, PREPARE_CONSTANTS_ADJOINT_DEVICE) (long *Mesh_pointer_f, diff --git a/src/gpu/update_displacement_LDDRK_gpu.c b/src/gpu/update_displacement_LDDRK_gpu.c index 59347e3ce..b157050f4 100644 --- a/src/gpu/update_displacement_LDDRK_gpu.c +++ b/src/gpu/update_displacement_LDDRK_gpu.c @@ -83,6 +83,10 @@ void FC_FUNC_ (update_displ_lddrk_gpu, accel_ic = mp->d_b_accel_inner_core; } + // kernel timing + gpu_event start,stop; + if (GPU_TIMING){ start_timing_gpu(&start,&stop); } + gpuMemset_realw (&accel_cm, size_cm, 0); gpuMemset_realw (&accel_oc, size_oc, 0); gpuMemset_realw (&accel_ic, size_ic, 0); @@ -136,6 +140,9 @@ void FC_FUNC_ (update_displ_lddrk_gpu, #endif */ + // kernel timing + if (GPU_TIMING){ stop_timing_gpu(&start,&stop,"update_displ_lddrk_gpu"); } + GPU_ERROR_CHECKING ("update_displ_lddrk_gpu"); } @@ -196,6 +203,10 @@ void FC_FUNC_ (update_elastic_lddrk_gpu, deltat = mp->b_deltat; } + // kernel timing + gpu_event start,stop; + if (GPU_TIMING){ start_timing_gpu(&start,&stop); } + #ifdef USE_OPENCL if (run_opencl) { size_t global_work_size[2]; @@ -383,6 +394,9 @@ void FC_FUNC_ (update_elastic_lddrk_gpu, } #endif + // kernel timing + if (GPU_TIMING){ stop_timing_gpu(&start,&stop,"update_elastic_lddrk_gpu"); } + GPU_ERROR_CHECKING ("after update_elastic_lddrk_gpu"); } @@ -439,6 +453,10 @@ void FC_FUNC_ (update_acoustic_lddrk_gpu, deltat = mp->b_deltat; } + // kernel timing + gpu_event start,stop; + if (GPU_TIMING){ start_timing_gpu(&start,&stop); } + #ifdef USE_OPENCL if (run_opencl) { size_t global_work_size[2]; @@ -534,5 +552,8 @@ void FC_FUNC_ (update_acoustic_lddrk_gpu, } #endif + // kernel timing + if (GPU_TIMING){ stop_timing_gpu(&start,&stop,"update_acoustic_lddrk_gpu"); } + GPU_ERROR_CHECKING ("after update_acoustic_lddrk_gpu"); } diff --git a/src/gpu/update_displacement_gpu.c b/src/gpu/update_displacement_gpu.c index 5033a3114..908b7e13b 100644 --- a/src/gpu/update_displacement_gpu.c +++ b/src/gpu/update_displacement_gpu.c @@ -92,6 +92,10 @@ void FC_FUNC_ (update_displacement_ic_gpu, accel = mp->d_b_accel_inner_core; } + // kernel timing + gpu_event start,stop; + if (GPU_TIMING){ start_timing_gpu(&start,&stop); } + #ifdef USE_OPENCL if (run_opencl) { size_t global_work_size[2]; @@ -142,6 +146,9 @@ void FC_FUNC_ (update_displacement_ic_gpu, } #endif + // kernel timing + if (GPU_TIMING){ stop_timing_gpu(&start,&stop,"update_displacement_ic_gpu"); } + GPU_ERROR_CHECKING ("update_displacement_ic_gpu"); } @@ -210,6 +217,10 @@ void FC_FUNC_ (update_displacement_cm_gpu, accel = mp->d_b_accel_crust_mantle; } + // kernel timing + gpu_event start,stop; + if (GPU_TIMING){ start_timing_gpu(&start,&stop); } + #ifdef USE_OPENCL if (run_opencl) { size_t global_work_size[2]; @@ -260,6 +271,9 @@ void FC_FUNC_ (update_displacement_cm_gpu, } #endif + // kernel timing + if (GPU_TIMING){ stop_timing_gpu(&start,&stop,"update_displacement_cm_gpu"); } + GPU_ERROR_CHECKING ("update_displacement_cm_gpu"); } @@ -328,6 +342,10 @@ void FC_FUNC_ (update_displacement_oc_gpu, accel = mp->d_b_accel_outer_core; } + // kernel timing + gpu_event start,stop; + if (GPU_TIMING){ start_timing_gpu(&start,&stop); } + #ifdef USE_OPENCL if (run_opencl) { size_t global_work_size[2]; @@ -378,6 +396,9 @@ void FC_FUNC_ (update_displacement_oc_gpu, } #endif + // kernel timing + if (GPU_TIMING){ stop_timing_gpu(&start,&stop,"update_displacement_oc_gpu"); } + GPU_ERROR_CHECKING ("update_displacement_oc_gpu"); } @@ -442,6 +463,9 @@ void FC_FUNC_ (multiply_accel_elastic_gpu, } #endif + // kernel timing + gpu_event start,stop; + if (GPU_TIMING){ start_timing_gpu(&start,&stop); } #ifdef USE_OPENCL if (run_opencl) { @@ -590,6 +614,9 @@ void FC_FUNC_ (multiply_accel_elastic_gpu, #endif } + // kernel timing + if (GPU_TIMING){ stop_timing_gpu(&start,&stop,"multiply_accel_elastic_gpu"); } + GPU_ERROR_CHECKING ("after multiply_accel_elastic_gpu"); } @@ -638,6 +665,10 @@ void FC_FUNC_ (update_veloc_elastic_gpu, accel = mp->d_b_accel_crust_mantle; } + // kernel timing + gpu_event start,stop; + if (GPU_TIMING){ start_timing_gpu(&start,&stop); } + #ifdef USE_OPENCL if (run_opencl) { size_t global_work_size[2]; @@ -797,6 +828,9 @@ void FC_FUNC_ (update_veloc_elastic_gpu, #endif #endif + // kernel timing + if (GPU_TIMING){ stop_timing_gpu(&start,&stop,"update_veloc_3_b"); } + GPU_ERROR_CHECKING ("after update_veloc_3_b"); } @@ -851,6 +885,10 @@ void FC_FUNC_ (multiply_accel_acoustic_gpu, } #endif + // kernel timing + gpu_event start,stop; + if (GPU_TIMING){ start_timing_gpu(&start,&stop); } + #ifdef USE_OPENCL if (run_opencl) { size_t global_work_size[2]; @@ -901,6 +939,9 @@ void FC_FUNC_ (multiply_accel_acoustic_gpu, } #endif + // kernel timing + if (GPU_TIMING){ stop_timing_gpu(&start,&stop,"multiply_accel_acoustic_gpu"); } + GPU_ERROR_CHECKING ("after multiply_accel_acoustic_gpu"); } @@ -949,6 +990,10 @@ void FC_FUNC_ (update_veloc_acoustic_gpu, accel = mp->d_b_accel_outer_core; } + // kernel timing + gpu_event start,stop; + if (GPU_TIMING){ start_timing_gpu(&start,&stop); } + #ifdef USE_OPENCL if (run_opencl) { size_t global_work_size[2]; @@ -1038,5 +1083,8 @@ void FC_FUNC_ (update_veloc_acoustic_gpu, #endif #endif + // kernel timing + if (GPU_TIMING){ stop_timing_gpu(&start,&stop,"update_veloc_acoustic_gpu"); } + GPU_ERROR_CHECKING ("after update_veloc_acoustic_gpu"); } diff --git a/src/meshfem3D/create_central_cube.f90 b/src/meshfem3D/create_central_cube.f90 index 6a7a258d3..63d9343c7 100644 --- a/src/meshfem3D/create_central_cube.f90 +++ b/src/meshfem3D/create_central_cube.f90 @@ -41,7 +41,7 @@ subroutine create_central_cube(ichunk,ispec_count,ipass, & CHUNK_AC,CHUNK_AC_ANTIPODE, & CHUNK_BC,CHUNK_BC_ANTIPODE - use shared_parameters, only: ratio_divide_central_cube,R_PLANET + use shared_parameters, only: ratio_divide_central_cube,R_PLANET,NPROCTOT use meshfem_par, only: & xstore,ystore,zstore @@ -58,7 +58,7 @@ subroutine create_central_cube(ichunk,ispec_count,ipass, & integer, intent(inout) :: ispec_count -! correct number of spectral elements in each block depending on chunk type + ! correct number of spectral elements in each block depending on chunk type integer, intent(in) :: nspec integer, intent(in) :: NEX_XI,NEX_PER_PROC_XI,NEX_PER_PROC_ETA integer, intent(in) :: iproc_xi,iproc_eta @@ -66,10 +66,10 @@ subroutine create_central_cube(ichunk,ispec_count,ipass, & double precision, intent(in) :: R_CENTRAL_CUBE -! MPI cut-planes parameters along xi and along eta + ! MPI cut-planes parameters along xi and along eta logical, dimension(2,nspec), intent(inout) :: iMPIcut_xi,iMPIcut_eta -! code for the four regions of the mesh + ! code for the four regions of the mesh integer, intent(in) :: iregion_code integer, intent(inout) :: idoubling(nspec) @@ -164,20 +164,20 @@ subroutine create_central_cube(ichunk,ispec_count,ipass, & ! new get_flag_boundaries ! xmin & xmax if (ix == 0) then - iMPIcut_xi(1,ispec_count) = .true. + if (NPROCTOT > 1) iMPIcut_xi(1,ispec_count) = .true. if (iproc_xi == 0) iboun(1,ispec_count)= .true. endif if (ix == 2*nx_central_cube-2) then - iMPIcut_xi(2,ispec_count) = .true. + if (NPROCTOT > 1) iMPIcut_xi(2,ispec_count) = .true. if (iproc_xi == NPROC_XI-1) iboun(2,ispec_count)= .true. endif ! ymin & ymax if (iy == 0) then - iMPIcut_eta(1,ispec_count) = .true. + if (NPROCTOT > 1) iMPIcut_eta(1,ispec_count) = .true. if (iproc_eta == 0) iboun(3,ispec_count)= .true. endif if (iy == 2*ny_central_cube-2) then - iMPIcut_eta(2,ispec_count) = .true. + if (NPROCTOT > 1) iMPIcut_eta(2,ispec_count) = .true. if (iproc_eta == NPROC_ETA-1) iboun(4,ispec_count)= .true. endif diff --git a/src/meshfem3D/create_doubling_elements.f90 b/src/meshfem3D/create_doubling_elements.f90 index 250f31d68..a8efb0139 100644 --- a/src/meshfem3D/create_doubling_elements.f90 +++ b/src/meshfem3D/create_doubling_elements.f90 @@ -51,7 +51,7 @@ subroutine create_doubling_elements(ilayer,ichunk,ispec_count,ipass, & IREGION_CRUST_MANTLE,IREGION_OUTER_CORE, & SAVE_BOUNDARY_MESH - use shared_parameters, only: ner_mesh_layers,ratio_sampling_array,doubling_index,r_bottom,r_top + use shared_parameters, only: ner_mesh_layers,ratio_sampling_array,doubling_index,r_bottom,r_top,NPROCTOT use meshfem_models_par, only: HONOR_1D_SPHERICAL_MOHO @@ -308,20 +308,20 @@ subroutine create_doubling_elements(ilayer,ichunk,ispec_count,ipass, & ! new get_flag_boundaries ! xmin & xmax if (ix_elem == 1) then - iMPIcut_xi(1,ispec_loc) = iboun_sb(ispec_superbrick,1) + if (NPROCTOT > 1) iMPIcut_xi(1,ispec_loc) = iboun_sb(ispec_superbrick,1) if (iproc_xi == 0) iboun(1,ispec_loc)= iboun_sb(ispec_superbrick,1) endif if (ix_elem == (NEX_PER_PROC_XI-step_mult*ratio_sampling_array(ilayer)+1)) then - iMPIcut_xi(2,ispec_loc) = iboun_sb(ispec_superbrick,2) + if (NPROCTOT > 1) iMPIcut_xi(2,ispec_loc) = iboun_sb(ispec_superbrick,2) if (iproc_xi == NPROC_XI-1) iboun(2,ispec_loc)= iboun_sb(ispec_superbrick,2) endif !! ymin & ymax if (iy_elem == 1) then - iMPIcut_eta(1,ispec_loc) = iboun_sb(ispec_superbrick,3) + if (NPROCTOT > 1) iMPIcut_eta(1,ispec_loc) = iboun_sb(ispec_superbrick,3) if (iproc_eta == 0) iboun(3,ispec_loc)= iboun_sb(ispec_superbrick,3) endif if (iy_elem == (NEX_PER_PROC_ETA-step_mult*ratio_sampling_array(ilayer)+1)) then - iMPIcut_eta(2,ispec_loc) = iboun_sb(ispec_superbrick,4) + if (NPROCTOT > 1) iMPIcut_eta(2,ispec_loc) = iboun_sb(ispec_superbrick,4) if (iproc_eta == NPROC_ETA-1) iboun(4,ispec_loc)= iboun_sb(ispec_superbrick,4) endif ! zmax only diff --git a/src/meshfem3D/create_regions_mesh.F90 b/src/meshfem3D/create_regions_mesh.F90 index 6fcda0876..59fb8235b 100644 --- a/src/meshfem3D/create_regions_mesh.F90 +++ b/src/meshfem3D/create_regions_mesh.F90 @@ -809,7 +809,6 @@ subroutine crm_allocate_arrays(ipass, & allocate(iMPIcut_xi(2,nspec), & iMPIcut_eta(2,nspec),stat=ier) if (ier /= 0) stop 'Error in allocate 15' - iMPIcut_xi(:,:) = .false.; iMPIcut_eta(:,:) = .false. ! MPI buffer indices @@ -1267,7 +1266,8 @@ subroutine crm_setup_mpi_buffers(npointot) xstore,ystore,zstore, & NSPEC1D_RADIAL_CORNER,NGLOB1D_RADIAL_CORNER, & NSPEC2D_XI_FACE,NSPEC2D_ETA_FACE, & - NGLOB2DMAX_XMIN_XMAX,NGLOB2DMAX_YMIN_YMAX + NGLOB2DMAX_XMIN_XMAX,NGLOB2DMAX_YMIN_YMAX, & + NPROCTOT use regions_mesh_par2, only: prname,iMPIcut_xi,iMPIcut_eta @@ -1290,7 +1290,7 @@ subroutine crm_setup_mpi_buffers(npointot) iboolright_xi(:) = 0 iboolright_eta(:) = 0 - if (nspec > 0) then + if (nspec > 0 .and. NPROCTOT > 1) then ! arrays mask_ibool(npointot) used to save memory ! allocate memory for arrays allocate(mask_ibool(npointot),stat=ier) diff --git a/src/meshfem3D/create_regular_elements.f90 b/src/meshfem3D/create_regular_elements.f90 index 1de56f7b8..15cfba6e6 100644 --- a/src/meshfem3D/create_regular_elements.f90 +++ b/src/meshfem3D/create_regular_elements.f90 @@ -49,7 +49,7 @@ subroutine create_regular_elements(ilayer,ichunk,ispec_count,ipass, & use constants, only: myrank,NDIM,CUSTOM_REAL,NGLLX,NGLLY,NGNOD,NGNOD_EIGHT_CORNERS,SUPPRESS_CRUSTAL_MESH, & SAVE_BOUNDARY_MESH,IREGION_CRUST_MANTLE,IMAIN - use shared_parameters, only: ner_mesh_layers,ratio_sampling_array,doubling_index,r_bottom,r_top + use shared_parameters, only: ner_mesh_layers,ratio_sampling_array,doubling_index,r_bottom,r_top,NPROCTOT use meshfem_models_par, only: HONOR_1D_SPHERICAL_MOHO,CASE_3D @@ -240,20 +240,20 @@ subroutine create_regular_elements(ilayer,ichunk,ispec_count,ipass, & ! new get_flag_boundaries ! xmin & xmax if (ix_elem == 1) then - iMPIcut_xi(1,ispec_loc) = .true. + if (NPROCTOT > 1) iMPIcut_xi(1,ispec_loc) = .true. if (iproc_xi == 0) iboun(1,ispec_loc) = .true. endif if (ix_elem == (NEX_PER_PROC_XI-ratio_sampling_array(ilayer)+1)) then - iMPIcut_xi(2,ispec_loc) = .true. + if (NPROCTOT > 1) iMPIcut_xi(2,ispec_loc) = .true. if (iproc_xi == NPROC_XI-1) iboun(2,ispec_loc) = .true. endif ! ymin & ymax if (iy_elem == 1) then - iMPIcut_eta(1,ispec_loc) = .true. + if (NPROCTOT > 1) iMPIcut_eta(1,ispec_loc) = .true. if (iproc_eta == 0) iboun(3,ispec_loc) = .true. endif if (iy_elem == (NEX_PER_PROC_ETA-ratio_sampling_array(ilayer)+1)) then - iMPIcut_eta(2,ispec_loc) = .true. + if (NPROCTOT > 1) iMPIcut_eta(2,ispec_loc) = .true. if (iproc_eta == NPROC_ETA-1) iboun(4,ispec_loc) = .true. endif ! zmin & zmax diff --git a/src/meshfem3D/fix_non_blocking_flags.f90 b/src/meshfem3D/fix_non_blocking_flags.f90 index c33e8a2f3..dfef9665a 100644 --- a/src/meshfem3D/fix_non_blocking_flags.f90 +++ b/src/meshfem3D/fix_non_blocking_flags.f90 @@ -37,27 +37,27 @@ subroutine fix_non_blocking_slices(is_on_a_slice_edge,iboolright_xi, & implicit none - integer :: nspec,nglob,NGLOB2DMAX_XMIN_XMAX,NGLOB2DMAX_YMIN_YMAX + integer, intent(in) :: nspec,nglob,NGLOB2DMAX_XMIN_XMAX,NGLOB2DMAX_YMIN_YMAX - integer, dimension(NB_SQUARE_EDGES_ONEDIR) :: npoin2D_xi,npoin2D_eta + integer, dimension(NB_SQUARE_EDGES_ONEDIR), intent(in) :: npoin2D_xi,npoin2D_eta - logical, dimension(nspec) :: is_on_a_slice_edge + logical, dimension(nspec), intent(inout) :: is_on_a_slice_edge - integer, dimension(NGLOB2DMAX_XMIN_XMAX) :: iboolleft_xi,iboolright_xi - integer, dimension(NGLOB2DMAX_YMIN_YMAX) :: iboolleft_eta,iboolright_eta + integer, dimension(NGLOB2DMAX_XMIN_XMAX), intent(in) :: iboolleft_xi,iboolright_xi + integer, dimension(NGLOB2DMAX_YMIN_YMAX), intent(in) :: iboolleft_eta,iboolright_eta - integer, dimension(NGLLX,NGLLY,NGLLZ,nspec) :: ibool + integer, dimension(NGLLX,NGLLY,NGLLZ,nspec), intent(in) :: ibool ! local parameters logical, dimension(:),allocatable :: mask_ibool integer :: ipoin,ispec,i,j,k,ier -! clean the mask + ! clean the mask allocate(mask_ibool(nglob),stat=ier) if (ier /= 0) stop 'Error allocating mask in fix_non_blocking_slices routine' mask_ibool(:) = .false. -! mark all the points that are in the MPI buffers to assemble inside each chunk + ! mark all the points that are in the MPI buffers to assemble inside each chunk do ipoin = 1,npoin2D_xi(1) mask_ibool(iboolleft_xi(ipoin)) = .true. enddo @@ -74,11 +74,11 @@ subroutine fix_non_blocking_slices(is_on_a_slice_edge,iboolright_xi, & mask_ibool(iboolright_eta(ipoin)) = .true. enddo -! now label all the elements that have at least one corner belonging -! to any of these buffers as elements that must contribute to the -! first step of the calculations (performed on the edges before starting -! the non blocking communications); there is no need to examine the inside -! of the elements, checking their eight corners is sufficient + ! now label all the elements that have at least one corner belonging + ! to any of these buffers as elements that must contribute to the + ! first step of the calculations (performed on the edges before starting + ! the non blocking communications); there is no need to examine the inside + ! of the elements, checking their eight corners is sufficient do ispec = 1,nspec do k = 1,NGLLZ,NGLLZ-1 do j = 1,NGLLY,NGLLY-1 @@ -114,18 +114,18 @@ subroutine fix_non_blocking_central_cube(is_on_a_slice_edge, & implicit none - integer,intent(in) :: nspec,nglob,nb_msgs_theor_in_cube,NSPEC2D_BOTTOM_INNER_CORE - integer,intent(in) :: ichunk,npoin2D_cube_from_slices,NPROC_XI + integer, intent(in) :: nspec,nglob,nb_msgs_theor_in_cube,NSPEC2D_BOTTOM_INNER_CORE + integer, intent(in) :: ichunk,npoin2D_cube_from_slices,NPROC_XI - logical, dimension(nspec),intent(inout) :: is_on_a_slice_edge + logical, dimension(nspec), intent(inout) :: is_on_a_slice_edge - integer, dimension(NGLLX,NGLLY,NGLLZ,nspec),intent(in) :: ibool + integer, dimension(NGLLX,NGLLY,NGLLZ,nspec), intent(in) :: ibool - integer, dimension(nb_msgs_theor_in_cube,npoin2D_cube_from_slices),intent(in) :: ibool_central_cube + integer, dimension(nb_msgs_theor_in_cube,npoin2D_cube_from_slices), intent(in) :: ibool_central_cube - integer, dimension(NSPEC2D_BOTTOM_INNER_CORE),intent(in) :: ibelm_bottom_inner_core + integer, dimension(NSPEC2D_BOTTOM_INNER_CORE), intent(in) :: ibelm_bottom_inner_core - integer, dimension(nspec),intent(in) :: idoubling_inner_core + integer, dimension(nspec), intent(in) :: idoubling_inner_core ! local parameters logical, dimension(:),allocatable :: mask_ibool diff --git a/src/meshfem3D/get_MPI_1D_buffers.f90 b/src/meshfem3D/get_MPI_1D_buffers.f90 index 94207b1e8..34186dbfb 100644 --- a/src/meshfem3D/get_MPI_1D_buffers.f90 +++ b/src/meshfem3D/get_MPI_1D_buffers.f90 @@ -243,9 +243,7 @@ subroutine get_MPI_1D_buffers(prname,nspec,iMPIcut_xi,iMPIcut_eta, & ! corner detection here if (iMPIcut_xi(1,ispec) .and. iMPIcut_eta(2,ispec)) then - ispeccount = ispeccount + 1 - ! loop on all the points ix = 1 iy = NGLLY @@ -310,14 +308,11 @@ subroutine get_MPI_1D_buffers(prname,nspec,iMPIcut_xi,iMPIcut_eta, & ! corner detection here if (iMPIcut_xi(2,ispec) .and. iMPIcut_eta(2,ispec)) then - ispeccount = ispeccount + 1 - ! loop on all the points ix = NGLLX iy = NGLLY do iz = 1,NGLLZ - ! select point, if not already selected if (.not. mask_ibool(ibool(ix,iy,iz,ispec))) then mask_ibool(ibool(ix,iy,iz,ispec)) = .true. diff --git a/src/meshfem3D/get_MPI_cutplanes_xi.f90 b/src/meshfem3D/get_MPI_cutplanes_xi.f90 index 4684f1760..bd3d93eac 100644 --- a/src/meshfem3D/get_MPI_cutplanes_xi.f90 +++ b/src/meshfem3D/get_MPI_cutplanes_xi.f90 @@ -125,24 +125,24 @@ subroutine get_MPI_cutplanes_xi(prname,nspec,iMPIcut_xi,ibool, & ! loop on all the points in that 2-D element, including edges ix = 1 do iy = 1,NGLLY - do iz = 1,NGLLZ - ! select point, if not already selected - if (.not. mask_ibool(ibool(ix,iy,iz,ispec))) then - mask_ibool(ibool(ix,iy,iz,ispec)) = .true. - npoin2D_xi = npoin2D_xi + 1 - - ! fills buffer arrays - iboolleft_xi(npoin2D_xi) = ibool(ix,iy,iz,ispec) - - npoin2D_xi_all(1) = npoin2D_xi_all(1) + 1 - - ! debug file output - if (DEBUG) then - write(IOUT,*) ibool(ix,iy,iz,ispec), xstore(ix,iy,iz,ispec), & - ystore(ix,iy,iz,ispec),zstore(ix,iy,iz,ispec) - endif + do iz = 1,NGLLZ + ! select point, if not already selected + if (.not. mask_ibool(ibool(ix,iy,iz,ispec))) then + mask_ibool(ibool(ix,iy,iz,ispec)) = .true. + npoin2D_xi = npoin2D_xi + 1 + + ! fills buffer arrays + iboolleft_xi(npoin2D_xi) = ibool(ix,iy,iz,ispec) + + npoin2D_xi_all(1) = npoin2D_xi_all(1) + 1 + + ! debug file output + if (DEBUG) then + write(IOUT,*) ibool(ix,iy,iz,ispec), xstore(ix,iy,iz,ispec), & + ystore(ix,iy,iz,ispec),zstore(ix,iy,iz,ispec) endif - enddo + endif + enddo enddo endif enddo diff --git a/src/meshfem3D/get_jacobian_boundaries.f90 b/src/meshfem3D/get_jacobian_boundaries.f90 index 2f61c4730..c85ba40b7 100644 --- a/src/meshfem3D/get_jacobian_boundaries.f90 +++ b/src/meshfem3D/get_jacobian_boundaries.f90 @@ -109,8 +109,8 @@ subroutine get_jacobian_boundaries(iboun,xstore,ystore,zstore, & if (iboun(1,ispec)) then - ispecb1=ispecb1+1 - ibelm_xmin(ispecb1)=ispec + ispecb1 = ispecb1+1 + ibelm_xmin(ispecb1) = ispec if (.not. USE_GLL) then ! specify the 9 nodes for the 2-D boundary element @@ -164,8 +164,8 @@ subroutine get_jacobian_boundaries(iboun,xstore,ystore,zstore, & if (iboun(2,ispec)) then - ispecb2=ispecb2+1 - ibelm_xmax(ispecb2)=ispec + ispecb2 = ispecb2+1 + ibelm_xmax(ispecb2) = ispec if (.not. USE_GLL) then ! specify the 9 nodes for the 2-D boundary element @@ -220,8 +220,8 @@ subroutine get_jacobian_boundaries(iboun,xstore,ystore,zstore, & if (iboun(3,ispec)) then - ispecb3=ispecb3+1 - ibelm_ymin(ispecb3)=ispec + ispecb3 = ispecb3+1 + ibelm_ymin(ispecb3) = ispec if (.not. USE_GLL) then ! specify the 9 nodes for the 2-D boundary element @@ -276,8 +276,8 @@ subroutine get_jacobian_boundaries(iboun,xstore,ystore,zstore, & if (iboun(4,ispec)) then - ispecb4=ispecb4+1 - ibelm_ymax(ispecb4)=ispec + ispecb4 = ispecb4+1 + ibelm_ymax(ispecb4) = ispec if (.not. USE_GLL) then ! specify the 9 nodes for the 2-D boundary element @@ -332,8 +332,8 @@ subroutine get_jacobian_boundaries(iboun,xstore,ystore,zstore, & if (iboun(5,ispec)) then - ispecb5=ispecb5+1 - ibelm_bottom(ispecb5)=ispec + ispecb5 = ispecb5+1 + ibelm_bottom(ispecb5) = ispec if (.not. USE_GLL) then xelm(1)=xstore(1,1,1,ispec) @@ -388,8 +388,8 @@ subroutine get_jacobian_boundaries(iboun,xstore,ystore,zstore, & if (iboun(6,ispec)) then - ispecb6=ispecb6+1 - ibelm_top(ispecb6)=ispec + ispecb6 = ispecb6+1 + ibelm_top(ispecb6) = ispec if (.not. USE_GLL) then xelm(1)=xstore(1,1,NGLLZ,ispec) diff --git a/src/meshfem3D/meshfem3D_models.F90 b/src/meshfem3D/meshfem3D_models.F90 index 242b4b02b..1b702d5ac 100644 --- a/src/meshfem3D/meshfem3D_models.F90 +++ b/src/meshfem3D/meshfem3D_models.F90 @@ -1548,7 +1548,7 @@ subroutine meshfem3D_models_getatten_val(idoubling,r_prem,theta,phi, & use model_prem_par, only: PREM_RMOHO use shared_parameters, only: ABSORB_USING_GLOBAL_SPONGE, & - SPONGE_LATITUDE_IN_DEGREES,SPONGE_LONGITUDE_IN_DEGREES,SPONGE_RADIUS_IN_DEGREES + SPONGE_LATITUDE_IN_DEGREES,SPONGE_LONGITUDE_IN_DEGREES,SPONGE_RADIUS_IN_DEGREES implicit none diff --git a/src/meshfem3D/model_EMC.f90 b/src/meshfem3D/model_EMC.f90 index 4e62e3df9..8ab7ef3f4 100644 --- a/src/meshfem3D/model_EMC.f90 +++ b/src/meshfem3D/model_EMC.f90 @@ -98,7 +98,7 @@ module model_emc_par ! or if partial EMC models are just not supported. ! ! by default, we will use Brocher scaling relations: - ! Brocher 2005, Empirical Relations between Elastic Wavespeeds and Density in the Earth’s Crust, BSSA + ! Brocher 2005, Empirical Relations between Elastic Wavespeeds and Density in the Earth's Crust, BSSA ! note that these scaling relations also would have limitations (range of vs, etc.) in their applicability, ! which we ignore for simplicity now. @@ -114,7 +114,7 @@ module model_emc_par ! (see also Qin et al. 2009, Reliability of mantle tomography models assessed by spectral element simulation, sec. 5.2) !double precision, parameter :: SCALE_RHO = 0.40d0 ! SCEC CVM-H version model relationship: https://strike.scec.org/scecpedia/CVM-H, - ! uses Brocher 2005 (Empirical Relations between Elastic Wavespeeds and Density in the Earth’s Crust, BSSA) for sediments + ! uses Brocher 2005 (Empirical Relations between Elastic Wavespeeds and Density in the Earth's Crust, BSSA) for sediments !double precision, parameter :: SCALE_RHO = 0.254d0 ! !! other factors to convert (perturbations in) shear speed to (perturbations in) Vp @@ -631,7 +631,7 @@ subroutine scale_Brocher_rho_from_vp() double precision :: vp,vp_p2,vp_p3,vp_p4,vp_p5,rho integer :: ix,iy,iz,nx,ny,nz - ! Brocher 2005, Empirical Relations between Elastic Wavespeeds and Density in the Earth’s Crust, BSSA + ! Brocher 2005, Empirical Relations between Elastic Wavespeeds and Density in the Earth's Crust, BSSA ! factors from eq. (1) double precision,parameter :: fac1 = 1.6612d0 double precision,parameter :: fac2 = -0.4721d0 @@ -692,7 +692,7 @@ subroutine scale_Brocher_vs_from_vp() double precision :: vp,vp_p2,vp_p3,vp_p4,vs integer :: ix,iy,iz,nx,ny,nz - ! Brocher 2005, Empirical Relations between Elastic Wavespeeds and Density in the Earth’s Crust, BSSA + ! Brocher 2005, Empirical Relations between Elastic Wavespeeds and Density in the Earth's Crust, BSSA ! factors from eq. (1) double precision,parameter :: fac1 = 0.7858d0 double precision,parameter :: fac2 = -1.2344d0 @@ -763,7 +763,7 @@ subroutine scale_Brocher_vp_from_vs() double precision :: vs,vs_p2,vs_p3,vs_p4,vp integer :: ix,iy,iz,nx,ny,nz - ! Brocher 2005, Empirical Relations between Elastic Wavespeeds and Density in the Earth’s Crust, BSSA + ! Brocher 2005, Empirical Relations between Elastic Wavespeeds and Density in the Earth's Crust, BSSA ! factors from eq. (1) double precision,parameter :: fac1 = 0.9409d0 double precision,parameter :: fac2 = 2.0947d0 diff --git a/src/meshfem3D/setup_inner_outer.f90 b/src/meshfem3D/setup_inner_outer.f90 index f8cf49a97..d52dd7528 100644 --- a/src/meshfem3D/setup_inner_outer.f90 +++ b/src/meshfem3D/setup_inner_outer.f90 @@ -30,7 +30,8 @@ subroutine setup_inner_outer(iregion_code) use meshfem_par, only: & myrank,OUTPUT_FILES,IMAIN, & - IREGION_CRUST_MANTLE,IREGION_OUTER_CORE,IREGION_INNER_CORE,MAX_STRING_LEN + IREGION_CRUST_MANTLE,IREGION_OUTER_CORE,IREGION_INNER_CORE,MAX_STRING_LEN, & + NPROCTOT use meshfem_par, only: ibool,is_on_a_slice_edge,xstore_glob,ystore_glob,zstore_glob @@ -38,9 +39,6 @@ subroutine setup_inner_outer(iregion_code) use MPI_outer_core_par use MPI_inner_core_par -!debug -! use shared_parameters, only: NPROCTOT - implicit none integer,intent(in) :: iregion_code @@ -59,7 +57,11 @@ subroutine setup_inner_outer(iregion_code) select case (iregion_code) case (IREGION_CRUST_MANTLE) ! crust_mantle - nspec_outer_crust_mantle = count( is_on_a_slice_edge ) + if (NPROCTOT > 1) then + nspec_outer_crust_mantle = count( is_on_a_slice_edge ) + else + nspec_outer_crust_mantle = 0 + endif nspec_inner_crust_mantle = NSPEC_CRUST_MANTLE - nspec_outer_crust_mantle num_phase_ispec_crust_mantle = max(nspec_inner_crust_mantle,nspec_outer_crust_mantle) @@ -71,7 +73,7 @@ subroutine setup_inner_outer(iregion_code) iinner = 0 iouter = 0 do ispec = 1,NSPEC_CRUST_MANTLE - if (is_on_a_slice_edge(ispec)) then + if (is_on_a_slice_edge(ispec) .and. NPROCTOT > 1) then ! outer element iouter = iouter + 1 phase_ispec_inner_crust_mantle(iouter,1) = ispec @@ -113,7 +115,11 @@ subroutine setup_inner_outer(iregion_code) case (IREGION_OUTER_CORE) ! outer_core - nspec_outer_outer_core = count( is_on_a_slice_edge ) + if (NPROCTOT > 1) then + nspec_outer_outer_core = count( is_on_a_slice_edge ) + else + nspec_outer_outer_core = 0 + endif nspec_inner_outer_core = NSPEC_OUTER_CORE - nspec_outer_outer_core num_phase_ispec_outer_core = max(nspec_inner_outer_core,nspec_outer_outer_core) @@ -125,7 +131,7 @@ subroutine setup_inner_outer(iregion_code) iinner = 0 iouter = 0 do ispec = 1,NSPEC_OUTER_CORE - if (is_on_a_slice_edge(ispec)) then + if (is_on_a_slice_edge(ispec) .and. NPROCTOT > 1) then ! outer element iouter = iouter + 1 phase_ispec_inner_outer_core(iouter,1) = ispec @@ -154,7 +160,11 @@ subroutine setup_inner_outer(iregion_code) case (IREGION_INNER_CORE) ! inner_core - nspec_outer_inner_core = count( is_on_a_slice_edge ) + if (NPROCTOT > 1) then + nspec_outer_inner_core = count( is_on_a_slice_edge ) + else + nspec_outer_inner_core = 0 + endif nspec_inner_inner_core = NSPEC_INNER_CORE - nspec_outer_inner_core num_phase_ispec_inner_core = max(nspec_inner_inner_core,nspec_outer_inner_core) @@ -166,7 +176,7 @@ subroutine setup_inner_outer(iregion_code) iinner = 0 iouter = 0 do ispec = 1,NSPEC_INNER_CORE - if (is_on_a_slice_edge(ispec)) then + if (is_on_a_slice_edge(ispec) .and. NPROCTOT > 1) then ! outer element iouter = iouter + 1 phase_ispec_inner_inner_core(iouter,1) = ispec diff --git a/src/meshfem3D/write_AVS_DX_global_faces_data.f90 b/src/meshfem3D/write_AVS_DX_global_faces_data.f90 index d32269db0..44a51cd85 100644 --- a/src/meshfem3D/write_AVS_DX_global_faces_data.f90 +++ b/src/meshfem3D/write_AVS_DX_global_faces_data.f90 @@ -90,364 +90,366 @@ subroutine write_AVS_DX_global_faces_data(prname,nspec,iMPIcut_xi,iMPIcut_eta, & ! processor identification character(len=MAX_STRING_LEN) :: prname -! writing points + ! writing points open(unit=IOUT,file=prname(1:len_trim(prname))//'AVS_DXpointsfaces.txt',status='unknown') -! erase the logical mask used to mark points already found + ! erase the logical mask used to mark points already found mask_ibool(:) = .false. nspecface = 0 -! mark global AVS or DX points + ! mark global AVS or DX points do ispec = 1,nspec -! only if on face - if (iMPIcut_xi(1,ispec) .or. iMPIcut_xi(2,ispec) .or. & - iMPIcut_eta(1,ispec) .or. iMPIcut_eta(2,ispec)) then - iglob1=ibool(1,1,1,ispec) - iglob2=ibool(NGLLX,1,1,ispec) - iglob3=ibool(NGLLX,NGLLY,1,ispec) - iglob4=ibool(1,NGLLY,1,ispec) - iglob5=ibool(1,1,NGLLZ,ispec) - iglob6=ibool(NGLLX,1,NGLLZ,ispec) - iglob7=ibool(NGLLX,NGLLY,NGLLZ,ispec) - iglob8=ibool(1,NGLLY,NGLLZ,ispec) - -! face xi = xi_min - if (iMPIcut_xi(1,ispec)) then - nspecface = nspecface + 1 - mask_ibool(iglob1) = .true. - mask_ibool(iglob4) = .true. - mask_ibool(iglob8) = .true. - mask_ibool(iglob5) = .true. - endif - -! face xi = xi_max - if (iMPIcut_xi(2,ispec)) then - nspecface = nspecface + 1 - mask_ibool(iglob2) = .true. - mask_ibool(iglob3) = .true. - mask_ibool(iglob7) = .true. - mask_ibool(iglob6) = .true. - endif - -! face eta = eta_min - if (iMPIcut_eta(1,ispec)) then - nspecface = nspecface + 1 - mask_ibool(iglob1) = .true. - mask_ibool(iglob2) = .true. - mask_ibool(iglob6) = .true. - mask_ibool(iglob5) = .true. - endif - -! face eta = eta_max - if (iMPIcut_eta(2,ispec)) then - nspecface = nspecface + 1 - mask_ibool(iglob4) = .true. - mask_ibool(iglob3) = .true. - mask_ibool(iglob7) = .true. - mask_ibool(iglob8) = .true. - endif - - endif + ! only if on face + if (iMPIcut_xi(1,ispec) .or. iMPIcut_xi(2,ispec) .or. & + iMPIcut_eta(1,ispec) .or. iMPIcut_eta(2,ispec)) then + iglob1=ibool(1,1,1,ispec) + iglob2=ibool(NGLLX,1,1,ispec) + iglob3=ibool(NGLLX,NGLLY,1,ispec) + iglob4=ibool(1,NGLLY,1,ispec) + iglob5=ibool(1,1,NGLLZ,ispec) + iglob6=ibool(NGLLX,1,NGLLZ,ispec) + iglob7=ibool(NGLLX,NGLLY,NGLLZ,ispec) + iglob8=ibool(1,NGLLY,NGLLZ,ispec) + + ! face xi = xi_min + if (iMPIcut_xi(1,ispec)) then + nspecface = nspecface + 1 + mask_ibool(iglob1) = .true. + mask_ibool(iglob4) = .true. + mask_ibool(iglob8) = .true. + mask_ibool(iglob5) = .true. + endif + + ! face xi = xi_max + if (iMPIcut_xi(2,ispec)) then + nspecface = nspecface + 1 + mask_ibool(iglob2) = .true. + mask_ibool(iglob3) = .true. + mask_ibool(iglob7) = .true. + mask_ibool(iglob6) = .true. + endif + + ! face eta = eta_min + if (iMPIcut_eta(1,ispec)) then + nspecface = nspecface + 1 + mask_ibool(iglob1) = .true. + mask_ibool(iglob2) = .true. + mask_ibool(iglob6) = .true. + mask_ibool(iglob5) = .true. + endif + + ! face eta = eta_max + if (iMPIcut_eta(2,ispec)) then + nspecface = nspecface + 1 + mask_ibool(iglob4) = .true. + mask_ibool(iglob3) = .true. + mask_ibool(iglob7) = .true. + mask_ibool(iglob8) = .true. + endif + + endif enddo -! count global number of AVS or DX points + ! count global number of AVS or DX points npoin = count(mask_ibool(:)) -! number of points in AVS or DX file + ! number of points in AVS or DX file write(IOUT,*) npoin -! erase the logical mask used to mark points already found + ! erase the logical mask used to mark points already found mask_ibool(:) = .false. -! output global AVS or DX points + ! output global AVS or DX points numpoin = 0 do ispec = 1,nspec -! only if on face - if (iMPIcut_xi(1,ispec) .or. iMPIcut_xi(2,ispec) .or. & - iMPIcut_eta(1,ispec) .or. iMPIcut_eta(2,ispec)) then - iglob1=ibool(1,1,1,ispec) - iglob2=ibool(NGLLX,1,1,ispec) - iglob3=ibool(NGLLX,NGLLY,1,ispec) - iglob4=ibool(1,NGLLY,1,ispec) - iglob5=ibool(1,1,NGLLZ,ispec) - iglob6=ibool(NGLLX,1,NGLLZ,ispec) - iglob7=ibool(NGLLX,NGLLY,NGLLZ,ispec) - iglob8=ibool(1,NGLLY,NGLLZ,ispec) - -! face xi = xi_min - if (iMPIcut_xi(1,ispec)) then - if (.not. mask_ibool(iglob1)) then - numpoin = numpoin + 1 - num_ibool_AVS_DX(iglob1) = numpoin - write(IOUT,*) numpoin,sngl(xstore(1,1,1,ispec)), & - sngl(ystore(1,1,1,ispec)),sngl(zstore(1,1,1,ispec)) - endif - if (.not. mask_ibool(iglob4)) then - numpoin = numpoin + 1 - num_ibool_AVS_DX(iglob4) = numpoin - write(IOUT,*) numpoin,sngl(xstore(1,NGLLY,1,ispec)), & - sngl(ystore(1,NGLLY,1,ispec)),sngl(zstore(1,NGLLY,1,ispec)) - endif - if (.not. mask_ibool(iglob8)) then - numpoin = numpoin + 1 - num_ibool_AVS_DX(iglob8) = numpoin - write(IOUT,*) numpoin,sngl(xstore(1,NGLLY,NGLLZ,ispec)), & - sngl(ystore(1,NGLLY,NGLLZ,ispec)),sngl(zstore(1,NGLLY,NGLLZ,ispec)) - endif - if (.not. mask_ibool(iglob5)) then - numpoin = numpoin + 1 - num_ibool_AVS_DX(iglob5) = numpoin - write(IOUT,*) numpoin,sngl(xstore(1,1,NGLLZ,ispec)), & - sngl(ystore(1,1,NGLLZ,ispec)),sngl(zstore(1,1,NGLLZ,ispec)) - endif - mask_ibool(iglob1) = .true. - mask_ibool(iglob4) = .true. - mask_ibool(iglob8) = .true. - mask_ibool(iglob5) = .true. - endif - -! face xi = xi_max - if (iMPIcut_xi(2,ispec)) then - if (.not. mask_ibool(iglob2)) then - numpoin = numpoin + 1 - num_ibool_AVS_DX(iglob2) = numpoin - write(IOUT,*) numpoin,sngl(xstore(NGLLX,1,1,ispec)), & - sngl(ystore(NGLLX,1,1,ispec)),sngl(zstore(NGLLX,1,1,ispec)) - endif - if (.not. mask_ibool(iglob3)) then - numpoin = numpoin + 1 - num_ibool_AVS_DX(iglob3) = numpoin - write(IOUT,*) numpoin,sngl(xstore(NGLLX,NGLLY,1,ispec)), & - sngl(ystore(NGLLX,NGLLY,1,ispec)),sngl(zstore(NGLLX,NGLLY,1,ispec)) - endif - if (.not. mask_ibool(iglob7)) then - numpoin = numpoin + 1 - num_ibool_AVS_DX(iglob7) = numpoin - write(IOUT,*) numpoin,sngl(xstore(NGLLX,NGLLY,NGLLZ,ispec)), & - sngl(ystore(NGLLX,NGLLY,NGLLZ,ispec)),sngl(zstore(NGLLX,NGLLY,NGLLZ,ispec)) - endif - if (.not. mask_ibool(iglob6)) then - numpoin = numpoin + 1 - num_ibool_AVS_DX(iglob6) = numpoin - write(IOUT,*) numpoin,sngl(xstore(NGLLX,1,NGLLZ,ispec)), & - sngl(ystore(NGLLX,1,NGLLZ,ispec)),sngl(zstore(NGLLX,1,NGLLZ,ispec)) - endif - mask_ibool(iglob2) = .true. - mask_ibool(iglob3) = .true. - mask_ibool(iglob7) = .true. - mask_ibool(iglob6) = .true. - endif - -! face eta = eta_min - if (iMPIcut_eta(1,ispec)) then - if (.not. mask_ibool(iglob1)) then - numpoin = numpoin + 1 - num_ibool_AVS_DX(iglob1) = numpoin - write(IOUT,*) numpoin,sngl(xstore(1,1,1,ispec)), & - sngl(ystore(1,1,1,ispec)),sngl(zstore(1,1,1,ispec)) - endif - if (.not. mask_ibool(iglob2)) then - numpoin = numpoin + 1 - num_ibool_AVS_DX(iglob2) = numpoin - write(IOUT,*) numpoin,sngl(xstore(NGLLX,1,1,ispec)), & - sngl(ystore(NGLLX,1,1,ispec)),sngl(zstore(NGLLX,1,1,ispec)) - endif - if (.not. mask_ibool(iglob6)) then - numpoin = numpoin + 1 - num_ibool_AVS_DX(iglob6) = numpoin - write(IOUT,*) numpoin,sngl(xstore(NGLLX,1,NGLLZ,ispec)), & - sngl(ystore(NGLLX,1,NGLLZ,ispec)),sngl(zstore(NGLLX,1,NGLLZ,ispec)) - endif - if (.not. mask_ibool(iglob5)) then - numpoin = numpoin + 1 - num_ibool_AVS_DX(iglob5) = numpoin - write(IOUT,*) numpoin,sngl(xstore(1,1,NGLLZ,ispec)), & - sngl(ystore(1,1,NGLLZ,ispec)),sngl(zstore(1,1,NGLLZ,ispec)) - endif - mask_ibool(iglob1) = .true. - mask_ibool(iglob2) = .true. - mask_ibool(iglob6) = .true. - mask_ibool(iglob5) = .true. - endif - -! face eta = eta_max - if (iMPIcut_eta(2,ispec)) then - if (.not. mask_ibool(iglob4)) then - numpoin = numpoin + 1 - num_ibool_AVS_DX(iglob4) = numpoin - write(IOUT,*) numpoin,sngl(xstore(1,NGLLY,1,ispec)), & - sngl(ystore(1,NGLLY,1,ispec)),sngl(zstore(1,NGLLY,1,ispec)) - endif - if (.not. mask_ibool(iglob3)) then - numpoin = numpoin + 1 - num_ibool_AVS_DX(iglob3) = numpoin - write(IOUT,*) numpoin,sngl(xstore(NGLLX,NGLLY,1,ispec)), & - sngl(ystore(NGLLX,NGLLY,1,ispec)),sngl(zstore(NGLLX,NGLLY,1,ispec)) - endif - if (.not. mask_ibool(iglob7)) then - numpoin = numpoin + 1 - num_ibool_AVS_DX(iglob7) = numpoin - write(IOUT,*) numpoin,sngl(xstore(NGLLX,NGLLY,NGLLZ,ispec)), & - sngl(ystore(NGLLX,NGLLY,NGLLZ,ispec)),sngl(zstore(NGLLX,NGLLY,NGLLZ,ispec)) - endif - if (.not. mask_ibool(iglob8)) then - numpoin = numpoin + 1 - num_ibool_AVS_DX(iglob8) = numpoin - write(IOUT,*) numpoin,sngl(xstore(1,NGLLY,NGLLZ,ispec)), & - sngl(ystore(1,NGLLY,NGLLZ,ispec)),sngl(zstore(1,NGLLY,NGLLZ,ispec)) - endif - mask_ibool(iglob4) = .true. - mask_ibool(iglob3) = .true. - mask_ibool(iglob7) = .true. - mask_ibool(iglob8) = .true. - endif + ! only if on face + if (iMPIcut_xi(1,ispec) .or. iMPIcut_xi(2,ispec) .or. & + iMPIcut_eta(1,ispec) .or. iMPIcut_eta(2,ispec)) then + iglob1=ibool(1,1,1,ispec) + iglob2=ibool(NGLLX,1,1,ispec) + iglob3=ibool(NGLLX,NGLLY,1,ispec) + iglob4=ibool(1,NGLLY,1,ispec) + iglob5=ibool(1,1,NGLLZ,ispec) + iglob6=ibool(NGLLX,1,NGLLZ,ispec) + iglob7=ibool(NGLLX,NGLLY,NGLLZ,ispec) + iglob8=ibool(1,NGLLY,NGLLZ,ispec) + + ! face xi = xi_min + if (iMPIcut_xi(1,ispec)) then + if (.not. mask_ibool(iglob1)) then + numpoin = numpoin + 1 + num_ibool_AVS_DX(iglob1) = numpoin + write(IOUT,*) numpoin,sngl(xstore(1,1,1,ispec)), & + sngl(ystore(1,1,1,ispec)),sngl(zstore(1,1,1,ispec)) + endif + if (.not. mask_ibool(iglob4)) then + numpoin = numpoin + 1 + num_ibool_AVS_DX(iglob4) = numpoin + write(IOUT,*) numpoin,sngl(xstore(1,NGLLY,1,ispec)), & + sngl(ystore(1,NGLLY,1,ispec)),sngl(zstore(1,NGLLY,1,ispec)) + endif + if (.not. mask_ibool(iglob8)) then + numpoin = numpoin + 1 + num_ibool_AVS_DX(iglob8) = numpoin + write(IOUT,*) numpoin,sngl(xstore(1,NGLLY,NGLLZ,ispec)), & + sngl(ystore(1,NGLLY,NGLLZ,ispec)),sngl(zstore(1,NGLLY,NGLLZ,ispec)) + endif + if (.not. mask_ibool(iglob5)) then + numpoin = numpoin + 1 + num_ibool_AVS_DX(iglob5) = numpoin + write(IOUT,*) numpoin,sngl(xstore(1,1,NGLLZ,ispec)), & + sngl(ystore(1,1,NGLLZ,ispec)),sngl(zstore(1,1,NGLLZ,ispec)) + endif + mask_ibool(iglob1) = .true. + mask_ibool(iglob4) = .true. + mask_ibool(iglob8) = .true. + mask_ibool(iglob5) = .true. + endif + + ! face xi = xi_max + if (iMPIcut_xi(2,ispec)) then + if (.not. mask_ibool(iglob2)) then + numpoin = numpoin + 1 + num_ibool_AVS_DX(iglob2) = numpoin + write(IOUT,*) numpoin,sngl(xstore(NGLLX,1,1,ispec)), & + sngl(ystore(NGLLX,1,1,ispec)),sngl(zstore(NGLLX,1,1,ispec)) + endif + if (.not. mask_ibool(iglob3)) then + numpoin = numpoin + 1 + num_ibool_AVS_DX(iglob3) = numpoin + write(IOUT,*) numpoin,sngl(xstore(NGLLX,NGLLY,1,ispec)), & + sngl(ystore(NGLLX,NGLLY,1,ispec)),sngl(zstore(NGLLX,NGLLY,1,ispec)) + endif + if (.not. mask_ibool(iglob7)) then + numpoin = numpoin + 1 + num_ibool_AVS_DX(iglob7) = numpoin + write(IOUT,*) numpoin,sngl(xstore(NGLLX,NGLLY,NGLLZ,ispec)), & + sngl(ystore(NGLLX,NGLLY,NGLLZ,ispec)),sngl(zstore(NGLLX,NGLLY,NGLLZ,ispec)) + endif + if (.not. mask_ibool(iglob6)) then + numpoin = numpoin + 1 + num_ibool_AVS_DX(iglob6) = numpoin + write(IOUT,*) numpoin,sngl(xstore(NGLLX,1,NGLLZ,ispec)), & + sngl(ystore(NGLLX,1,NGLLZ,ispec)),sngl(zstore(NGLLX,1,NGLLZ,ispec)) + endif + mask_ibool(iglob2) = .true. + mask_ibool(iglob3) = .true. + mask_ibool(iglob7) = .true. + mask_ibool(iglob6) = .true. + endif + + ! face eta = eta_min + if (iMPIcut_eta(1,ispec)) then + if (.not. mask_ibool(iglob1)) then + numpoin = numpoin + 1 + num_ibool_AVS_DX(iglob1) = numpoin + write(IOUT,*) numpoin,sngl(xstore(1,1,1,ispec)), & + sngl(ystore(1,1,1,ispec)),sngl(zstore(1,1,1,ispec)) + endif + if (.not. mask_ibool(iglob2)) then + numpoin = numpoin + 1 + num_ibool_AVS_DX(iglob2) = numpoin + write(IOUT,*) numpoin,sngl(xstore(NGLLX,1,1,ispec)), & + sngl(ystore(NGLLX,1,1,ispec)),sngl(zstore(NGLLX,1,1,ispec)) + endif + if (.not. mask_ibool(iglob6)) then + numpoin = numpoin + 1 + num_ibool_AVS_DX(iglob6) = numpoin + write(IOUT,*) numpoin,sngl(xstore(NGLLX,1,NGLLZ,ispec)), & + sngl(ystore(NGLLX,1,NGLLZ,ispec)),sngl(zstore(NGLLX,1,NGLLZ,ispec)) + endif + if (.not. mask_ibool(iglob5)) then + numpoin = numpoin + 1 + num_ibool_AVS_DX(iglob5) = numpoin + write(IOUT,*) numpoin,sngl(xstore(1,1,NGLLZ,ispec)), & + sngl(ystore(1,1,NGLLZ,ispec)),sngl(zstore(1,1,NGLLZ,ispec)) + endif + mask_ibool(iglob1) = .true. + mask_ibool(iglob2) = .true. + mask_ibool(iglob6) = .true. + mask_ibool(iglob5) = .true. + endif + + ! face eta = eta_max + if (iMPIcut_eta(2,ispec)) then + if (.not. mask_ibool(iglob4)) then + numpoin = numpoin + 1 + num_ibool_AVS_DX(iglob4) = numpoin + write(IOUT,*) numpoin,sngl(xstore(1,NGLLY,1,ispec)), & + sngl(ystore(1,NGLLY,1,ispec)),sngl(zstore(1,NGLLY,1,ispec)) + endif + if (.not. mask_ibool(iglob3)) then + numpoin = numpoin + 1 + num_ibool_AVS_DX(iglob3) = numpoin + write(IOUT,*) numpoin,sngl(xstore(NGLLX,NGLLY,1,ispec)), & + sngl(ystore(NGLLX,NGLLY,1,ispec)),sngl(zstore(NGLLX,NGLLY,1,ispec)) + endif + if (.not. mask_ibool(iglob7)) then + numpoin = numpoin + 1 + num_ibool_AVS_DX(iglob7) = numpoin + write(IOUT,*) numpoin,sngl(xstore(NGLLX,NGLLY,NGLLZ,ispec)), & + sngl(ystore(NGLLX,NGLLY,NGLLZ,ispec)),sngl(zstore(NGLLX,NGLLY,NGLLZ,ispec)) + endif + if (.not. mask_ibool(iglob8)) then + numpoin = numpoin + 1 + num_ibool_AVS_DX(iglob8) = numpoin + write(IOUT,*) numpoin,sngl(xstore(1,NGLLY,NGLLZ,ispec)), & + sngl(ystore(1,NGLLY,NGLLZ,ispec)),sngl(zstore(1,NGLLY,NGLLZ,ispec)) + endif + mask_ibool(iglob4) = .true. + mask_ibool(iglob3) = .true. + mask_ibool(iglob7) = .true. + mask_ibool(iglob8) = .true. + endif - endif + endif enddo -! check that number of global points output is okay + ! check that number of global points output is okay if (numpoin /= npoin) & call exit_MPI(myrank,'incorrect number of global points in AVS or DX file creation') close(IOUT) -! output global AVS or DX elements + ! output global AVS or DX elements -! writing elements + ! writing elements open(unit=IOUT,file=prname(1:len_trim(prname))//'AVS_DXelementsfaces.txt',status='unknown') - if (MODEL_3D_MANTLE_PERTUBATIONS) & + if (MODEL_3D_MANTLE_PERTUBATIONS) & open(unit=11,file=prname(1:len_trim(prname))//'AVS_DXelementsfaces_dvp_dvs.txt',status='unknown') -! number of elements in AVS or DX file + ! number of elements in AVS or DX file write(IOUT,*) nspecface ispecface = 0 do ispec = 1,nspec -! only if on face - if (iMPIcut_xi(1,ispec) .or. iMPIcut_xi(2,ispec) .or. & - iMPIcut_eta(1,ispec) .or. iMPIcut_eta(2,ispec)) then - iglob1=ibool(1,1,1,ispec) - iglob2=ibool(NGLLX,1,1,ispec) - iglob3=ibool(NGLLX,NGLLY,1,ispec) - iglob4=ibool(1,NGLLY,1,ispec) - iglob5=ibool(1,1,NGLLZ,ispec) - iglob6=ibool(NGLLX,1,NGLLZ,ispec) - iglob7=ibool(NGLLX,NGLLY,NGLLZ,ispec) - iglob8=ibool(1,NGLLY,NGLLZ,ispec) - -! include lateral variations if needed - - if (MODEL_3D_MANTLE_PERTUBATIONS) then - ! pick a point within the element and get its radius - r=dsqrt(xstore(2,2,2,ispec)**2+ystore(2,2,2,ispec)**2+zstore(2,2,2,ispec)**2) - - if (r > RCMB/R_PLANET .and. r < R_UNIT_SPHERE) then - ! average over the element - dvp = 0.0 - dvs = 0.0 - np = 0 - do k=2,NGLLZ-1 - do j=2,NGLLY-1 - do i=2,NGLLX-1 - np=np+1 - x=xstore(i,j,k,ispec) - y=ystore(i,j,k,ispec) - z=zstore(i,j,k,ispec) - r=dsqrt(x*x+y*y+z*z) - ! take out ellipticity - if (ELLIPTICITY) then - call xyz_2_rthetaphi_dble(x,y,z,r,theta,phi_dummy) - cost=dcos(theta) -! this is the Legendre polynomial of degree two, P2(cos(theta)), see the discussion above eq (14.4) in Dahlen and Tromp (1998) - p20=0.5d0*(3.0d0*cost*cost-1.0d0) -! get ellipticity using spline evaluation - call spline_evaluation(rspl,ellipicity_spline,ellipicity_spline2,nspl,r,ell) -! this is eq (14.4) in Dahlen and Tromp (1998) - factor=ONE-(TWO/3.0d0)*ell*p20 - r=r/factor - endif - - - ! gets reference model values: rho,vpv,vph,vsv,vsh and eta_aniso - call meshfem3D_models_get1D_val(iregion_code,idoubling(ispec), & - r,rho,vpv,vph,vsv,vsh,eta_aniso, & - Qkappa,Qmu,RICB,RCMB, & - RTOPDDOUBLEPRIME,R80,R120,R220,R400,R670,R771, & - RMOHO,RMIDDLE_CRUST) - - ! calculates isotropic values - vp = sqrt(((8.d0+4.d0*eta_aniso)*vph*vph + 3.d0*vpv*vpv & - + (8.d0 - 8.d0*eta_aniso)*vsv*vsv)/15.d0) - vs = sqrt(((1.d0-2.d0*eta_aniso)*vph*vph + vpv*vpv & - + 5.d0*vsh*vsh + (6.d0+4.d0*eta_aniso)*vsv*vsv)/15.d0) - - if (abs(rhostore(i,j,k,ispec)) < 1.e-20) then - print *,' attention: rhostore close to zero',rhostore(i,j,k,ispec),r,i,j,k,ispec - dvp = 0.0 - dvs = 0.0 - else if (abs(sngl(vp)) < 1.e-20) then - print *,' attention: vp close to zero',sngl(vp),r,i,j,k,ispec - dvp = 0.0 - else if (abs(sngl(vs)) < 1.e-20) then - print *,' attention: vs close to zero',sngl(vs),r,i,j,k,ispec - dvs = 0.0 - else - dvp = dvp + (sqrt((kappavstore(i,j,k,ispec)+4.*muvstore(i,j,k,ispec)/3.)/rhostore(i,j,k,ispec)) - sngl(vp))/sngl(vp) - dvs = dvs + (sqrt(muvstore(i,j,k,ispec)/rhostore(i,j,k,ispec)) - sngl(vs))/sngl(vs) - endif - - enddo - enddo - enddo - dvp = dvp / np - dvs = dvs / np - else - dvp = 0.0 - dvs = 0.0 + ! only if on face + if (iMPIcut_xi(1,ispec) .or. iMPIcut_xi(2,ispec) .or. & + iMPIcut_eta(1,ispec) .or. iMPIcut_eta(2,ispec)) then + iglob1=ibool(1,1,1,ispec) + iglob2=ibool(NGLLX,1,1,ispec) + iglob3=ibool(NGLLX,NGLLY,1,ispec) + iglob4=ibool(1,NGLLY,1,ispec) + iglob5=ibool(1,1,NGLLZ,ispec) + iglob6=ibool(NGLLX,1,NGLLZ,ispec) + iglob7=ibool(NGLLX,NGLLY,NGLLZ,ispec) + iglob8=ibool(1,NGLLY,NGLLZ,ispec) + + ! include lateral variations if needed + + if (MODEL_3D_MANTLE_PERTUBATIONS) then + ! pick a point within the element and get its radius + r=dsqrt(xstore(2,2,2,ispec)**2+ystore(2,2,2,ispec)**2+zstore(2,2,2,ispec)**2) + + if (r > RCMB/R_PLANET .and. r < R_UNIT_SPHERE) then + ! average over the element + dvp = 0.0 + dvs = 0.0 + np = 0 + do k=2,NGLLZ-1 + do j=2,NGLLY-1 + do i=2,NGLLX-1 + np=np+1 + x=xstore(i,j,k,ispec) + y=ystore(i,j,k,ispec) + z=zstore(i,j,k,ispec) + r=dsqrt(x*x+y*y+z*z) + ! take out ellipticity + if (ELLIPTICITY) then + call xyz_2_rthetaphi_dble(x,y,z,r,theta,phi_dummy) + cost=dcos(theta) + ! this is the Legendre polynomial of degree two, P2(cos(theta)), + ! see the discussion above eq (14.4) in Dahlen and Tromp (1998) + p20=0.5d0*(3.0d0*cost*cost-1.0d0) + ! get ellipticity using spline evaluation + call spline_evaluation(rspl,ellipicity_spline,ellipicity_spline2,nspl,r,ell) + ! this is eq (14.4) in Dahlen and Tromp (1998) + factor=ONE-(TWO/3.0d0)*ell*p20 + r=r/factor + endif + + ! gets reference model values: rho,vpv,vph,vsv,vsh and eta_aniso + call meshfem3D_models_get1D_val(iregion_code,idoubling(ispec), & + r,rho,vpv,vph,vsv,vsh,eta_aniso, & + Qkappa,Qmu,RICB,RCMB, & + RTOPDDOUBLEPRIME,R80,R120,R220,R400,R670,R771, & + RMOHO,RMIDDLE_CRUST) + + ! calculates isotropic values + vp = sqrt(((8.d0+4.d0*eta_aniso)*vph*vph + 3.d0*vpv*vpv & + + (8.d0 - 8.d0*eta_aniso)*vsv*vsv)/15.d0) + vs = sqrt(((1.d0-2.d0*eta_aniso)*vph*vph + vpv*vpv & + + 5.d0*vsh*vsh + (6.d0+4.d0*eta_aniso)*vsv*vsv)/15.d0) + + if (abs(rhostore(i,j,k,ispec)) < 1.e-20) then + print *,' attention: rhostore close to zero',rhostore(i,j,k,ispec),r,i,j,k,ispec + dvp = 0.0 + dvs = 0.0 + else if (abs(sngl(vp)) < 1.e-20) then + print *,' attention: vp close to zero',sngl(vp),r,i,j,k,ispec + dvp = 0.0 + else if (abs(sngl(vs)) < 1.e-20) then + print *,' attention: vs close to zero',sngl(vs),r,i,j,k,ispec + dvs = 0.0 + else + dvp = dvp & + + (sqrt((kappavstore(i,j,k,ispec)+4.*muvstore(i,j,k,ispec)/3.)/rhostore(i,j,k,ispec)) - sngl(vp))/sngl(vp) + dvs = dvs & + + (sqrt(muvstore(i,j,k,ispec)/rhostore(i,j,k,ispec)) - sngl(vs))/sngl(vs) + endif + + enddo + enddo + enddo + dvp = dvp / np + dvs = dvs / np + else + dvp = 0.0 + dvs = 0.0 + endif + endif + + ! face xi = xi_min + if (iMPIcut_xi(1,ispec)) then + ispecface = ispecface + 1 + write(IOUT,*) ispecface,idoubling(ispec),num_ibool_AVS_DX(iglob1), & + num_ibool_AVS_DX(iglob4),num_ibool_AVS_DX(iglob8), & + num_ibool_AVS_DX(iglob5) + if (MODEL_3D_MANTLE_PERTUBATIONS) write(11,*) ispecface,dvp,dvs + endif + + ! face xi = xi_max + if (iMPIcut_xi(2,ispec)) then + ispecface = ispecface + 1 + write(IOUT,*) ispecface,idoubling(ispec),num_ibool_AVS_DX(iglob2), & + num_ibool_AVS_DX(iglob3),num_ibool_AVS_DX(iglob7), & + num_ibool_AVS_DX(iglob6) + if (MODEL_3D_MANTLE_PERTUBATIONS) write(11,*) ispecface,dvp,dvs + endif + + ! face eta = eta_min + if (iMPIcut_eta(1,ispec)) then + ispecface = ispecface + 1 + write(IOUT,*) ispecface,idoubling(ispec),num_ibool_AVS_DX(iglob1), & + num_ibool_AVS_DX(iglob2),num_ibool_AVS_DX(iglob6), & + num_ibool_AVS_DX(iglob5) + if (MODEL_3D_MANTLE_PERTUBATIONS) write(11,*) ispecface,dvp,dvs + endif + + ! face eta = eta_max + if (iMPIcut_eta(2,ispec)) then + ispecface = ispecface + 1 + write(IOUT,*) ispecface,idoubling(ispec),num_ibool_AVS_DX(iglob4), & + num_ibool_AVS_DX(iglob3),num_ibool_AVS_DX(iglob7), & + num_ibool_AVS_DX(iglob8) + if (MODEL_3D_MANTLE_PERTUBATIONS) write(11,*) ispecface,dvp,dvs + endif + endif - endif - -! face xi = xi_min - if (iMPIcut_xi(1,ispec)) then - ispecface = ispecface + 1 - write(IOUT,*) ispecface,idoubling(ispec),num_ibool_AVS_DX(iglob1), & - num_ibool_AVS_DX(iglob4),num_ibool_AVS_DX(iglob8), & - num_ibool_AVS_DX(iglob5) - if (MODEL_3D_MANTLE_PERTUBATIONS) write(11,*) ispecface,dvp,dvs - endif - -! face xi = xi_max - if (iMPIcut_xi(2,ispec)) then - ispecface = ispecface + 1 - write(IOUT,*) ispecface,idoubling(ispec),num_ibool_AVS_DX(iglob2), & - num_ibool_AVS_DX(iglob3),num_ibool_AVS_DX(iglob7), & - num_ibool_AVS_DX(iglob6) - if (MODEL_3D_MANTLE_PERTUBATIONS) write(11,*) ispecface,dvp,dvs - endif - -! face eta = eta_min - if (iMPIcut_eta(1,ispec)) then - ispecface = ispecface + 1 - write(IOUT,*) ispecface,idoubling(ispec),num_ibool_AVS_DX(iglob1), & - num_ibool_AVS_DX(iglob2),num_ibool_AVS_DX(iglob6), & - num_ibool_AVS_DX(iglob5) - if (MODEL_3D_MANTLE_PERTUBATIONS) write(11,*) ispecface,dvp,dvs - endif - -! face eta = eta_max - if (iMPIcut_eta(2,ispec)) then - ispecface = ispecface + 1 - write(IOUT,*) ispecface,idoubling(ispec),num_ibool_AVS_DX(iglob4), & - num_ibool_AVS_DX(iglob3),num_ibool_AVS_DX(iglob7), & - num_ibool_AVS_DX(iglob8) - if (MODEL_3D_MANTLE_PERTUBATIONS) write(11,*) ispecface,dvp,dvs - endif - - endif enddo -! check that number of surface elements output is okay + ! check that number of surface elements output is okay if (ispecface /= nspecface) & call exit_MPI(myrank,'incorrect number of surface elements in AVS or DX file creation') diff --git a/src/meshfem3D/write_AVS_DX_surface_data.f90 b/src/meshfem3D/write_AVS_DX_surface_data.f90 index 27ba5dc14..ff9ddca76 100644 --- a/src/meshfem3D/write_AVS_DX_surface_data.f90 +++ b/src/meshfem3D/write_AVS_DX_surface_data.f90 @@ -64,11 +64,11 @@ subroutine write_AVS_DX_surface_data(prname,nspec,iboun, & real(kind=CUSTOM_REAL) :: muvstore(NGLLX,NGLLY,NGLLZ,nspec) real(kind=CUSTOM_REAL) :: rhostore(NGLLX,NGLLY,NGLLZ,nspec) -! logical mask used to output global points only once + ! logical mask used to output global points only once integer :: npointot logical :: mask_ibool(npointot) -! numbering of global AVS or DX points + ! numbering of global AVS or DX points integer :: num_ibool_AVS_DX(npointot) integer :: ispec @@ -76,212 +76,212 @@ subroutine write_AVS_DX_surface_data(prname,nspec,iboun, & integer, dimension(8) :: iglobval integer :: npoin,numpoin,nspecface,ispecface -! for ellipticity + ! for ellipticity integer :: nspl double precision :: rspl(NR_DENSITY),ellipicity_spline(NR_DENSITY),ellipicity_spline2(NR_DENSITY) -! processor identification + ! processor identification character(len=MAX_STRING_LEN) :: prname integer :: iregion_code -! writing points + ! writing points open(unit=IOUT,file=prname(1:len_trim(prname))//'AVS_DXpointssurface.txt',status='unknown') -! erase the logical mask used to mark points already found + ! erase the logical mask used to mark points already found mask_ibool(:) = .false. nspecface = 0 -! mark global AVS or DX points + ! mark global AVS or DX points do ispec = 1,nspec -! only if at the surface (top plane) - if (iboun(6,ispec)) then - - iglobval(5)=ibool(1,1,NGLLZ,ispec) - iglobval(6)=ibool(NGLLX,1,NGLLZ,ispec) - iglobval(7)=ibool(NGLLX,NGLLY,NGLLZ,ispec) - iglobval(8)=ibool(1,NGLLY,NGLLZ,ispec) - -! element is at the surface - nspecface = nspecface + 1 - mask_ibool(iglobval(5)) = .true. - mask_ibool(iglobval(6)) = .true. - mask_ibool(iglobval(7)) = .true. - mask_ibool(iglobval(8)) = .true. - - endif + ! only if at the surface (top plane) + if (iboun(6,ispec)) then + + iglobval(5)=ibool(1,1,NGLLZ,ispec) + iglobval(6)=ibool(NGLLX,1,NGLLZ,ispec) + iglobval(7)=ibool(NGLLX,NGLLY,NGLLZ,ispec) + iglobval(8)=ibool(1,NGLLY,NGLLZ,ispec) + + ! element is at the surface + nspecface = nspecface + 1 + mask_ibool(iglobval(5)) = .true. + mask_ibool(iglobval(6)) = .true. + mask_ibool(iglobval(7)) = .true. + mask_ibool(iglobval(8)) = .true. + + endif enddo -! count global number of AVS or DX points + ! count global number of AVS or DX points npoin = count(mask_ibool(:)) -! number of points in AVS or DX file + ! number of points in AVS or DX file write(IOUT,*) npoin -! erase the logical mask used to mark points already found + ! erase the logical mask used to mark points already found mask_ibool(:) = .false. -! output global AVS or DX points + ! output global AVS or DX points numpoin = 0 do ispec = 1,nspec -! only if at the surface - if (iboun(6,ispec)) then - - iglobval(5)=ibool(1,1,NGLLZ,ispec) - iglobval(6)=ibool(NGLLX,1,NGLLZ,ispec) - iglobval(7)=ibool(NGLLX,NGLLY,NGLLZ,ispec) - iglobval(8)=ibool(1,NGLLY,NGLLZ,ispec) - -! top face - if (iboun(6,ispec)) then - - if (.not. mask_ibool(iglobval(5))) then - numpoin = numpoin + 1 - num_ibool_AVS_DX(iglobval(5)) = numpoin - write(IOUT,*) numpoin,sngl(xstore(1,1,NGLLZ,ispec)), & - sngl(ystore(1,1,NGLLZ,ispec)),sngl(zstore(1,1,NGLLZ,ispec)) - endif + ! only if at the surface + if (iboun(6,ispec)) then + + iglobval(5)=ibool(1,1,NGLLZ,ispec) + iglobval(6)=ibool(NGLLX,1,NGLLZ,ispec) + iglobval(7)=ibool(NGLLX,NGLLY,NGLLZ,ispec) + iglobval(8)=ibool(1,NGLLY,NGLLZ,ispec) + + ! top face + if (iboun(6,ispec)) then + + if (.not. mask_ibool(iglobval(5))) then + numpoin = numpoin + 1 + num_ibool_AVS_DX(iglobval(5)) = numpoin + write(IOUT,*) numpoin,sngl(xstore(1,1,NGLLZ,ispec)), & + sngl(ystore(1,1,NGLLZ,ispec)),sngl(zstore(1,1,NGLLZ,ispec)) + endif - if (.not. mask_ibool(iglobval(6))) then - numpoin = numpoin + 1 - num_ibool_AVS_DX(iglobval(6)) = numpoin - write(IOUT,*) numpoin,sngl(xstore(NGLLX,1,NGLLZ,ispec)), & - sngl(ystore(NGLLX,1,NGLLZ,ispec)),sngl(zstore(NGLLX,1,NGLLZ,ispec)) - endif + if (.not. mask_ibool(iglobval(6))) then + numpoin = numpoin + 1 + num_ibool_AVS_DX(iglobval(6)) = numpoin + write(IOUT,*) numpoin,sngl(xstore(NGLLX,1,NGLLZ,ispec)), & + sngl(ystore(NGLLX,1,NGLLZ,ispec)),sngl(zstore(NGLLX,1,NGLLZ,ispec)) + endif - if (.not. mask_ibool(iglobval(7))) then - numpoin = numpoin + 1 - num_ibool_AVS_DX(iglobval(7)) = numpoin - write(IOUT,*) numpoin,sngl(xstore(NGLLX,NGLLY,NGLLZ,ispec)), & - sngl(ystore(NGLLX,NGLLY,NGLLZ,ispec)),sngl(zstore(NGLLX,NGLLY,NGLLZ,ispec)) - endif + if (.not. mask_ibool(iglobval(7))) then + numpoin = numpoin + 1 + num_ibool_AVS_DX(iglobval(7)) = numpoin + write(IOUT,*) numpoin,sngl(xstore(NGLLX,NGLLY,NGLLZ,ispec)), & + sngl(ystore(NGLLX,NGLLY,NGLLZ,ispec)),sngl(zstore(NGLLX,NGLLY,NGLLZ,ispec)) + endif - if (.not. mask_ibool(iglobval(8))) then - numpoin = numpoin + 1 - num_ibool_AVS_DX(iglobval(8)) = numpoin - write(IOUT,*) numpoin,sngl(xstore(1,NGLLY,NGLLZ,ispec)), & - sngl(ystore(1,NGLLY,NGLLZ,ispec)),sngl(zstore(1,NGLLY,NGLLZ,ispec)) - endif + if (.not. mask_ibool(iglobval(8))) then + numpoin = numpoin + 1 + num_ibool_AVS_DX(iglobval(8)) = numpoin + write(IOUT,*) numpoin,sngl(xstore(1,NGLLY,NGLLZ,ispec)), & + sngl(ystore(1,NGLLY,NGLLZ,ispec)),sngl(zstore(1,NGLLY,NGLLZ,ispec)) + endif - mask_ibool(iglobval(5)) = .true. - mask_ibool(iglobval(6)) = .true. - mask_ibool(iglobval(7)) = .true. - mask_ibool(iglobval(8)) = .true. + mask_ibool(iglobval(5)) = .true. + mask_ibool(iglobval(6)) = .true. + mask_ibool(iglobval(7)) = .true. + mask_ibool(iglobval(8)) = .true. - endif + endif - endif + endif enddo -! check that number of global points output is okay + ! check that number of global points output is okay if (numpoin /= npoin) & call exit_MPI(myrank,'incorrect number of global points in AVS or DX file creation') close(IOUT) -! output global AVS or DX elements + ! output global AVS or DX elements -! writing elements + ! writing elements open(unit=IOUT,file=prname(1:len_trim(prname))//'AVS_DXelementssurface.txt',status='unknown') if (MODEL_3D_MANTLE_PERTUBATIONS) & open(unit=11,file=prname(1:len_trim(prname))//'AVS_DXelementssurface_dvp_dvs.txt',status='unknown') -! number of elements in AVS or DX file + ! number of elements in AVS or DX file write(IOUT,*) nspecface ispecface = 0 do ispec = 1,nspec -! only if at the surface - if (iboun(6,ispec)) then - - iglobval(5)=ibool(1,1,NGLLZ,ispec) - iglobval(6)=ibool(NGLLX,1,NGLLZ,ispec) - iglobval(7)=ibool(NGLLX,NGLLY,NGLLZ,ispec) - iglobval(8)=ibool(1,NGLLY,NGLLZ,ispec) - - if (MODEL_3D_MANTLE_PERTUBATIONS) then - ! pick a point within the element and get its radius - r=dsqrt(xstore(2,2,2,ispec)**2+ystore(2,2,2,ispec)**2+zstore(2,2,2,ispec)**2) - - if (r > RCMB/R_PLANET .and. r < R_UNIT_SPHERE) then - ! average over the element - dvp = 0.0 - dvs = 0.0 - np = 0 - do k=2,NGLLZ-1 - do j=2,NGLLY-1 - do i=2,NGLLX-1 - np=np+1 - x=xstore(i,j,k,ispec) - y=ystore(i,j,k,ispec) - z=zstore(i,j,k,ispec) - r=dsqrt(x*x+y*y+z*z) - ! take out ellipticity - if (ELLIPTICITY) then - call xyz_2_rthetaphi_dble(x,y,z,r,theta,phi_dummy) - cost=dcos(theta) -! this is the Legendre polynomial of degree two, P2(cos(theta)), see the discussion above eq (14.4) in Dahlen and Tromp (1998) - p20=0.5d0*(3.0d0*cost*cost-1.0d0) -! get ellipticity using spline evaluation - call spline_evaluation(rspl,ellipicity_spline,ellipicity_spline2,nspl,r,ell) -! this is eq (14.4) in Dahlen and Tromp (1998) - factor=ONE-(TWO/3.0d0)*ell*p20 - r=r/factor - endif - - - ! gets reference model values: rho,vpv,vph,vsv,vsh and eta_aniso - call meshfem3D_models_get1D_val(iregion_code,idoubling(ispec), & - r,rho,vpv,vph,vsv,vsh,eta_aniso, & - Qkappa,Qmu,RICB,RCMB, & - RTOPDDOUBLEPRIME,R80,R120,R220,R400,R670,R771, & - RMOHO,RMIDDLE_CRUST) - - ! calculates isotropic values - vp = sqrt(((8.d0+4.d0*eta_aniso)*vph*vph + 3.d0*vpv*vpv & - + (8.d0 - 8.d0*eta_aniso)*vsv*vsv)/15.d0) - vs = sqrt(((1.d0-2.d0*eta_aniso)*vph*vph + vpv*vpv & - + 5.d0*vsh*vsh + (6.d0+4.d0*eta_aniso)*vsv*vsv)/15.d0) - - if (abs(rhostore(i,j,k,ispec)) < 1.e-20) then - print *,' attention: rhostore close to zero',rhostore(i,j,k,ispec),r,i,j,k,ispec - dvp = 0.0 - dvs = 0.0 - else if (abs(sngl(vp)) < 1.e-20) then - print *,' attention: vp close to zero',sngl(vp),r,i,j,k,ispec - dvp = 0.0 - else if (abs(sngl(vs)) < 1.e-20) then - print *,' attention: vs close to zero',sngl(vs),r,i,j,k,ispec - dvs = 0.0 - else - dvp = dvp + (sqrt((kappavstore(i,j,k,ispec)+4.*muvstore(i,j,k,ispec)/3.) & - /rhostore(i,j,k,ispec)) - sngl(vp))/sngl(vp) - dvs = dvs + (sqrt(muvstore(i,j,k,ispec)/rhostore(i,j,k,ispec)) - sngl(vs))/sngl(vs) - endif - - enddo - enddo + ! only if at the surface + if (iboun(6,ispec)) then + + iglobval(5)=ibool(1,1,NGLLZ,ispec) + iglobval(6)=ibool(NGLLX,1,NGLLZ,ispec) + iglobval(7)=ibool(NGLLX,NGLLY,NGLLZ,ispec) + iglobval(8)=ibool(1,NGLLY,NGLLZ,ispec) + + if (MODEL_3D_MANTLE_PERTUBATIONS) then + ! pick a point within the element and get its radius + r=dsqrt(xstore(2,2,2,ispec)**2+ystore(2,2,2,ispec)**2+zstore(2,2,2,ispec)**2) + + if (r > RCMB/R_PLANET .and. r < R_UNIT_SPHERE) then + ! average over the element + dvp = 0.0 + dvs = 0.0 + np = 0 + do k=2,NGLLZ-1 + do j=2,NGLLY-1 + do i=2,NGLLX-1 + np=np+1 + x=xstore(i,j,k,ispec) + y=ystore(i,j,k,ispec) + z=zstore(i,j,k,ispec) + r=dsqrt(x*x+y*y+z*z) + ! take out ellipticity + if (ELLIPTICITY) then + call xyz_2_rthetaphi_dble(x,y,z,r,theta,phi_dummy) + cost=dcos(theta) + ! this is the Legendre polynomial of degree two, P2(cos(theta)), + ! see the discussion above eq (14.4) in Dahlen and Tromp (1998) + p20=0.5d0*(3.0d0*cost*cost-1.0d0) + ! get ellipticity using spline evaluation + call spline_evaluation(rspl,ellipicity_spline,ellipicity_spline2,nspl,r,ell) + ! this is eq (14.4) in Dahlen and Tromp (1998) + factor=ONE-(TWO/3.0d0)*ell*p20 + r=r/factor + endif + + ! gets reference model values: rho,vpv,vph,vsv,vsh and eta_aniso + call meshfem3D_models_get1D_val(iregion_code,idoubling(ispec), & + r,rho,vpv,vph,vsv,vsh,eta_aniso, & + Qkappa,Qmu,RICB,RCMB, & + RTOPDDOUBLEPRIME,R80,R120,R220,R400,R670,R771, & + RMOHO,RMIDDLE_CRUST) + + ! calculates isotropic values + vp = sqrt(((8.d0+4.d0*eta_aniso)*vph*vph + 3.d0*vpv*vpv & + + (8.d0 - 8.d0*eta_aniso)*vsv*vsv)/15.d0) + vs = sqrt(((1.d0-2.d0*eta_aniso)*vph*vph + vpv*vpv & + + 5.d0*vsh*vsh + (6.d0+4.d0*eta_aniso)*vsv*vsv)/15.d0) + + if (abs(rhostore(i,j,k,ispec)) < 1.e-20) then + print *,' attention: rhostore close to zero',rhostore(i,j,k,ispec),r,i,j,k,ispec + dvp = 0.0 + dvs = 0.0 + else if (abs(sngl(vp)) < 1.e-20) then + print *,' attention: vp close to zero',sngl(vp),r,i,j,k,ispec + dvp = 0.0 + else if (abs(sngl(vs)) < 1.e-20) then + print *,' attention: vs close to zero',sngl(vs),r,i,j,k,ispec + dvs = 0.0 + else + dvp = dvp + (sqrt((kappavstore(i,j,k,ispec)+4.*muvstore(i,j,k,ispec)/3.) & + /rhostore(i,j,k,ispec)) - sngl(vp))/sngl(vp) + dvs = dvs + (sqrt(muvstore(i,j,k,ispec)/rhostore(i,j,k,ispec)) - sngl(vs))/sngl(vs) + endif + enddo - dvp = dvp / np - dvs = dvs / np - else - dvp = 0.0 - dvs = 0.0 - endif + enddo + enddo + dvp = dvp / np + dvs = dvs / np + else + dvp = 0.0 + dvs = 0.0 endif + endif - ! top face - ispecface = ispecface + 1 - write(IOUT,*) ispecface,idoubling(ispec),num_ibool_AVS_DX(iglobval(5)), & - num_ibool_AVS_DX(iglobval(6)),num_ibool_AVS_DX(iglobval(7)), & - num_ibool_AVS_DX(iglobval(8)) - if (MODEL_3D_MANTLE_PERTUBATIONS) write(11,*) ispecface,dvp,dvs + ! top face + ispecface = ispecface + 1 + write(IOUT,*) ispecface,idoubling(ispec),num_ibool_AVS_DX(iglobval(5)), & + num_ibool_AVS_DX(iglobval(6)),num_ibool_AVS_DX(iglobval(7)), & + num_ibool_AVS_DX(iglobval(8)) + if (MODEL_3D_MANTLE_PERTUBATIONS) write(11,*) ispecface,dvp,dvs - endif + endif enddo -! check that number of surface elements output is okay + ! check that number of surface elements output is okay if (ispecface /= nspecface) & call exit_MPI(myrank,'incorrect number of surface elements in AVS or DX file creation') diff --git a/src/shared/parallel.f90 b/src/shared/parallel.f90 index c781031a1..a6847d538 100644 --- a/src/shared/parallel.f90 +++ b/src/shared/parallel.f90 @@ -924,8 +924,18 @@ end subroutine max_all_all_cr !------------------------------------------------------------------------------------------------- ! -! subroutine max_all_dp(sendbuf, recvbuf) -! end subroutine max_all_dp + subroutine max_all_dp(sendbuf, recvbuf) + + use my_mpi + + implicit none + + double precision :: sendbuf, recvbuf + integer :: ier + + call MPI_REDUCE(sendbuf,recvbuf,1,MPI_DOUBLE_PRECISION,MPI_MAX,0,my_local_mpi_comm_world,ier) + + end subroutine max_all_dp ! !------------------------------------------------------------------------------------------------- diff --git a/src/shared/read_compute_parameters.f90 b/src/shared/read_compute_parameters.f90 index 3c1edeedc..b13acbaf0 100644 --- a/src/shared/read_compute_parameters.f90 +++ b/src/shared/read_compute_parameters.f90 @@ -219,8 +219,12 @@ subroutine rcp_set_compute_parameters() if (STEADY_STATE_KERNEL) then NSTEP_STEADY_STATE = nint(STEADY_STATE_LENGTH_IN_MINUTES * 60.d0 / DT) + ! checks length if (NSTEP_STEADY_STATE == 0) then + print *, '*****************************************************************' print *, 'Warning: STEADY_STATE_KERNEL disabled because STEADY_STATE_LENGTH_IN_MINUTES is zero' + print *, '*****************************************************************' + STEADY_STATE_KERNEL = .false. ! not used any further, but doesn't hurt to reset flag to .false. ... endif else NSTEP_STEADY_STATE = 0 @@ -600,15 +604,18 @@ subroutine rcp_check_parameters() stop 'Please set SAVE_ALL_SEISMOS_IN_ONE_FILE and USE_BINARY_FOR_LARGE_FILE to be .false. for noise simulation' endif -!! DK DK for gravity integrals + ! gravity integrals ! in the case of GRAVITY_INTEGRALS we should always use double precision if (GRAVITY_INTEGRALS .and. CUSTOM_REAL /= SIZE_DOUBLE) & - stop 'for GRAVITY_INTEGRALS use double precision i.e. configure the code with --enable-double-precision' + stop 'For GRAVITY_INTEGRALS use double precision i.e. configure the code with --enable-double-precision' ! adjoint simulations: seismogram output only works if each process writes out its local seismos if (WRITE_SEISMOGRAMS_BY_MAIN .and. SIMULATION_TYPE == 2) & stop 'For SIMULATION_TYPE == 2, please set WRITE_SEISMOGRAMS_BY_MAIN to .false.' + if (NTSTEP_BETWEEN_OUTPUT_SAMPLE < 1) & + stop 'Invalid NTSTEP_BETWEEN_OUTPUT_SAMPLE, must be >= 1' + !---------------------------------------------- ! ! status of implementation diff --git a/src/specfem3D/check_stability.f90 b/src/specfem3D/check_stability.f90 index eb849cee1..1c8ef1eda 100644 --- a/src/specfem3D/check_stability.f90 +++ b/src/specfem3D/check_stability.f90 @@ -192,7 +192,7 @@ subroutine check_stability() call check_norm_elastic_acoustic_from_device(b_Usolidnorm,b_Ufluidnorm,Mesh_pointer,3) endif -! this trick checks for NaN (Not a Number), which is not even equal to itself + ! this trick checks for NaN (Not a Number), which is not even equal to itself if (b_Usolidnorm > STABILITY_THRESHOLD .or. b_Usolidnorm < 0 .or. b_Usolidnorm /= b_Usolidnorm) & call exit_MPI(myrank,'backward simulation became unstable and blew up in the solid') if (b_Ufluidnorm > STABILITY_THRESHOLD .or. b_Ufluidnorm < 0 .or. b_Ufluidnorm /= b_Ufluidnorm) & @@ -290,11 +290,13 @@ subroutine check_stability() write(IMAIN,"(' Elapsed time in hh:mm:ss = ',i6,' h ',i2.2,' m ',i2.2,' s')") ihours,iminutes,iseconds write(IMAIN,*) 'Mean elapsed time per time step in seconds = ',tCPU/dble(it) -! do not check before MIN_TIME_STEPS_FOR_SLOW_NODES time steps -! because the time step estimate (which is an average) may then be unreliable - if (CHECK_FOR_SLOW_NODES .and. NUMBER_OF_SIMULTANEOUS_RUNS > 1 .and. it >= MIN_TIME_STEPS_FOR_SLOW_NODES .and. & + ! do not check before MIN_TIME_STEPS_FOR_SLOW_NODES time steps + ! because the time step estimate (which is an average) may then be unreliable + if (CHECK_FOR_SLOW_NODES) then + if (NUMBER_OF_SIMULTANEOUS_RUNS > 1 .and. it >= MIN_TIME_STEPS_FOR_SLOW_NODES .and. & tCPU/dble(it) > TOLERANCE_FACTOR_FOR_SLOW_NODES * REFERENCE_TIME_PER_TIME_STEP_ON_NORMAL_NODES) & I_am_running_on_a_slow_node = .true. + endif if (SHOW_SEPARATE_RUN_INFORMATION) then write(IMAIN,*) 'Time steps done for this run = ',it_run,' out of ',nstep_run @@ -496,7 +498,7 @@ subroutine check_stability_backward() call check_norm_elastic_acoustic_from_device(b_Usolidnorm,b_Ufluidnorm,Mesh_pointer,3) endif -! this trick checks for NaN (Not a Number), which is not even equal to itself + ! this trick checks for NaN (Not a Number), which is not even equal to itself if (b_Usolidnorm > STABILITY_THRESHOLD .or. b_Usolidnorm < 0 .or. b_Usolidnorm /= b_Usolidnorm) & call exit_MPI(myrank,'backward simulation became unstable and blew up in the solid') if (b_Ufluidnorm > STABILITY_THRESHOLD .or. b_Ufluidnorm < 0 .or. b_Ufluidnorm /= b_Ufluidnorm) & @@ -556,7 +558,7 @@ subroutine check_stability_backward() ! check stability of the code, exit if unstable ! negative values can occur with some compilers when the unstable value is greater ! than the greatest possible floating-point number of the machine -! this trick checks for NaN (Not a Number), which is not even equal to itself + ! this trick checks for NaN (Not a Number), which is not even equal to itself if (b_Usolidnorm_all > STABILITY_THRESHOLD .or. b_Usolidnorm_all < 0 .or. b_Usolidnorm_all /= b_Usolidnorm_all) & call exit_MPI(myrank,'backward simulation became unstable and blew up in the solid') if (b_Ufluidnorm_all > STABILITY_THRESHOLD .or. b_Ufluidnorm_all < 0 .or. b_Ufluidnorm_all /= b_Ufluidnorm_all) & @@ -710,19 +712,21 @@ subroutine write_timestamp_file(Usolidnorm_all,Ufluidnorm_all,b_Usolidnorm_all,b close(IOUT) -! if the run is slow and gives up, create a disk file to indicate it, so that users or batch scripts can know it. -! do not check before MIN_TIME_STEPS_FOR_SLOW_NODES time steps -! because the time step estimate (which is an average) may then be unreliable - if (CHECK_FOR_SLOW_NODES .and. NUMBER_OF_SIMULTANEOUS_RUNS > 1 .and. it >= MIN_TIME_STEPS_FOR_SLOW_NODES .and. & + ! if the run is slow and gives up, create a disk file to indicate it, so that users or batch scripts can know it. + ! do not check before MIN_TIME_STEPS_FOR_SLOW_NODES time steps + ! because the time step estimate (which is an average) may then be unreliable + if (CHECK_FOR_SLOW_NODES) then + if (NUMBER_OF_SIMULTANEOUS_RUNS > 1 .and. it >= MIN_TIME_STEPS_FOR_SLOW_NODES .and. & tCPU/dble(it) > TOLERANCE_FACTOR_FOR_SLOW_NODES * REFERENCE_TIME_PER_TIME_STEP_ON_NORMAL_NODES) then - I_am_running_on_a_slow_node = .true. -! rank is between 0 and the max rank - 1, for the earthquake here we start at 1, i.e. we use mygroup + 1 rather than mygroup - write(outputname,"('/slow_run_on_rank_',i6.6,'_giving_up_for_earthquake_run_',i6.6,'_at_time_step_',i6.6)") & + I_am_running_on_a_slow_node = .true. + ! rank is between 0 and the max rank - 1, for the earthquake here we start at 1, i.e. we use mygroup + 1 rather than mygroup + write(outputname,"('/slow_run_on_rank_',i6.6,'_giving_up_for_earthquake_run_',i6.6,'_at_time_step_',i6.6)") & myrank,mygroup + 1,it - open(unit=IOUT,file=trim(OUTPUT_FILES)//outputname,status='unknown',action='write') -! write outputname as the information message, since it contains all the information needed + open(unit=IOUT,file=trim(OUTPUT_FILES)//outputname,status='unknown',action='write') + ! write outputname as the information message, since it contains all the information needed write(IOUT,*) outputname - close(IOUT) + close(IOUT) + endif endif end subroutine write_timestamp_file diff --git a/src/specfem3D/compute_add_sources.f90 b/src/specfem3D/compute_add_sources.f90 index d4139675b..4f3dfb825 100644 --- a/src/specfem3D/compute_add_sources.f90 +++ b/src/specfem3D/compute_add_sources.f90 @@ -38,7 +38,7 @@ subroutine compute_add_sources() double precision :: stf real(kind=CUSTOM_REAL) :: stf_used ! for gpu - double precision, dimension(NSOURCES) :: stf_pre_compute + !double precision, dimension(NSOURCES) :: stf_pre_compute ! obsolete double precision, external :: get_stf_viscoelastic @@ -51,7 +51,8 @@ subroutine compute_add_sources() ! note: the LDDRK scheme updates displacement after the stiffness computations and ! after adding boundary/coupling/source terms. ! thus, at each time loop step it, displ(:) is still at (n) and not (n+1) like for the Newmark scheme - ! when entering this routine. we therefore at an additional -DT to have the corresponding timing for the source. + ! when entering this routine. we therefore at an additional -DT to have the corresponding timing + ! for the source. time_t = dble(it-1-1)*DT + dble(C_LDDRK(istage))*DT - t0 else time_t = dble(it-1)*DT - t0 @@ -103,20 +104,19 @@ subroutine compute_add_sources() else ! on GPU - ! prepares buffer with source time function values, to be copied onto GPU - do isource = 1,NSOURCES - ! sets current time for this source - timeval = time_t - tshift_src(isource) - - ! determines source time function value - stf = get_stf_viscoelastic(timeval,isource,it) - - ! stores current stf values - stf_pre_compute(isource) = stf - enddo + ! we avoid these copies as they synchronize streams and kernels by using local source time function arrays instead + !! prepares buffer with source time function values, to be copied onto GPU + !do isource = 1,NSOURCES + ! ! sets current time for this source + ! timeval = time_t - tshift_src(isource) + ! ! determines source time function value + ! stf = get_stf_viscoelastic(timeval,isource,it) + ! ! stores current stf values + ! stf_pre_compute(isource) = stf + !enddo ! adds sources: only implements SIMTYPE=1 and NOISE_TOM = 0 - call compute_add_sources_gpu(Mesh_pointer,NSOURCES,stf_pre_compute) + call compute_add_sources_gpu(Mesh_pointer,it,istage) endif end subroutine compute_add_sources @@ -360,7 +360,7 @@ subroutine compute_add_sources_backward() double precision :: stf real(kind=CUSTOM_REAL) :: stf_used ! for gpu - double precision, dimension(NSOURCES) :: stf_pre_compute + !double precision, dimension(NSOURCES) :: stf_pre_compute ! obsolete double precision, external :: get_stf_viscoelastic @@ -414,7 +414,8 @@ subroutine compute_add_sources_backward() ! note: the LDDRK scheme updates displacement after the stiffness computations and ! after adding boundary/coupling/source terms. ! thus, at each time loop step it, displ(:) is still at (n) and not (n+1) like for the Newmark scheme - ! when entering this routine. we therefore at an additional -DT to have the corresponding timing for the source. + ! when entering this routine. we therefore at an additional -DT to have the corresponding timing + ! for the source. if (UNDO_ATTENUATION) then ! stepping moves forward from snapshot position time_t = dble(NSTEP-it_tmp-1)*DT + dble(C_LDDRK(istage))*DT - t0 @@ -462,20 +463,19 @@ subroutine compute_add_sources_backward() else ! on GPU - ! prepares buffer with source time function values, to be copied onto GPU - do isource = 1,NSOURCES - ! sets current time for this source - timeval = time_t - tshift_src(isource) - - ! determines source time function value - stf = get_stf_viscoelastic(timeval,isource,it_tmp) - - ! stores current stf values - stf_pre_compute(isource) = stf - enddo + ! we avoid these copies as they synchronize streams and kernels by using local source time function arrays instead + !! prepares buffer with source time function values, to be copied onto GPU + !do isource = 1,NSOURCES + ! ! sets current time for this source + ! timeval = time_t - tshift_src(isource) + ! ! determines source time function value + ! stf = get_stf_viscoelastic(timeval,isource,it_tmp) + ! ! stores current stf values + ! stf_pre_compute(isource) = stf + !enddo ! adds sources: only implements SIMTYPE=3 (and NOISE_TOM = 0) - call compute_add_sources_backward_gpu(Mesh_pointer,NSOURCES,stf_pre_compute) + call compute_add_sources_backward_gpu(Mesh_pointer,it_tmp,istage) endif end subroutine compute_add_sources_backward diff --git a/src/specfem3D/compute_forces_acoustic_calling_routine.F90 b/src/specfem3D/compute_forces_acoustic_calling_routine.F90 index 60abb381b..33968d80c 100644 --- a/src/specfem3D/compute_forces_acoustic_calling_routine.F90 +++ b/src/specfem3D/compute_forces_acoustic_calling_routine.F90 @@ -53,7 +53,8 @@ subroutine compute_forces_acoustic() ! note: the LDDRK scheme updates displacement after the stiffness computations and ! after adding boundary/coupling/source terms. ! thus, at each time loop step it, displ(:) is still at (n) and not (n+1) like for the Newmark scheme - ! when entering this routine. we therefore at an additional -DT to have the corresponding timing for the source. + ! when entering this routine. we therefore at an additional -DT to have the corresponding timing + ! for the source. timeval = real((dble(it-1-1)*DT + dble(C_LDDRK(istage))*DT - t0)*scale_t_inv, kind=CUSTOM_REAL) else timeval = real((dble(it-1)*DT - t0)*scale_t_inv, kind=CUSTOM_REAL) @@ -265,7 +266,8 @@ subroutine compute_forces_acoustic_backward() ! note: the LDDRK scheme updates displacement after the stiffness computations and ! after adding boundary/coupling/source terms. ! thus, at each time loop step it, displ(:) is still at (n) and not (n+1) like for the Newmark scheme - ! when entering this routine. we therefore at an additional -DT to have the corresponding timing for the source. + ! when entering this routine. we therefore at an additional -DT to have the corresponding timing + ! for the source. if (UNDO_ATTENUATION) then ! stepping moves forward from snapshot position b_timeval = real((dble(NSTEP-it_tmp-1)*DT + dble(C_LDDRK(istage))*DT - t0)*scale_t_inv, kind=CUSTOM_REAL) diff --git a/src/specfem3D/compute_seismograms.F90 b/src/specfem3D/compute_seismograms.F90 index d7833b0bf..2a8adec7e 100644 --- a/src/specfem3D/compute_seismograms.F90 +++ b/src/specfem3D/compute_seismograms.F90 @@ -31,7 +31,7 @@ subroutine compute_seismograms(nglob,displ,seismo_current,seismograms) use constants_solver use specfem_par, only: & - NTSTEP_BETWEEN_OUTPUT_SEISMOS, & + nlength_seismogram, & nrec_local,nu_rec,ispec_selected_rec,number_receiver_global, & scale_displ,hxir_store,hetar_store,hgammar_store @@ -44,7 +44,7 @@ subroutine compute_seismograms(nglob,displ,seismo_current,seismograms) integer,intent(in) :: seismo_current - real(kind=CUSTOM_REAL), dimension(NDIM,nrec_local,NTSTEP_BETWEEN_OUTPUT_SEISMOS),intent(out) :: & + real(kind=CUSTOM_REAL), dimension(NDIM,nrec_local,nlength_seismogram),intent(out) :: & seismograms ! local parameters @@ -98,7 +98,6 @@ subroutine compute_seismograms_adjoint(displ_crust_mantle, & eps_trace_over_3_crust_mantle, & epsilondev_xx_crust_mantle,epsilondev_yy_crust_mantle,epsilondev_xy_crust_mantle, & epsilondev_xz_crust_mantle,epsilondev_yz_crust_mantle, & - it_adj_written, & moment_der,sloc_der,stshift_der,shdur_der, & seismograms) @@ -108,7 +107,9 @@ subroutine compute_seismograms_adjoint(displ_crust_mantle, & NSPEC_CRUST_MANTLE_STRAIN_ONLY,NSPEC_CRUST_MANTLE_STR_OR_ATT use specfem_par, only: & - NSTEP,NTSTEP_BETWEEN_OUTPUT_SEISMOS,UNDO_ATTENUATION, & + NSTEP,NTSTEP_BETWEEN_OUTPUT_SAMPLE, & + nlength_seismogram,seismo_current, & + UNDO_ATTENUATION, & nrec_local, & nu_source,Mxx,Myy,Mzz,Mxy,Mxz,Myz, & hxir_store,hpxir_store,hetar_store,hpetar_store,hgammar_store,hpgammar_store, & @@ -134,13 +135,11 @@ subroutine compute_seismograms_adjoint(displ_crust_mantle, & epsilondev_xx_crust_mantle,epsilondev_yy_crust_mantle,epsilondev_xy_crust_mantle, & epsilondev_xz_crust_mantle,epsilondev_yz_crust_mantle - integer,intent(in) :: it_adj_written - real(kind=CUSTOM_REAL), dimension(NDIM,NDIM,nrec_local),intent(inout) :: moment_der real(kind=CUSTOM_REAL), dimension(NDIM,nrec_local),intent(inout) :: sloc_der real(kind=CUSTOM_REAL), dimension(nrec_local),intent(inout) :: stshift_der, shdur_der - real(kind=CUSTOM_REAL), dimension(NDIM*NDIM,nrec_local,NTSTEP_BETWEEN_OUTPUT_SEISMOS),intent(out) :: & + real(kind=CUSTOM_REAL), dimension(NDIM*NDIM,nrec_local,nlength_seismogram),intent(out) :: & seismograms ! local parameters @@ -246,14 +245,14 @@ subroutine compute_seismograms_adjoint(displ_crust_mantle, & eps_loc_new(:,:) = matmul(matmul(nu_source(:,:,irec),eps_loc(:,:)), transpose(nu_source(:,:,irec))) ! distinguish between single and double precision for reals - seismograms(1,irec_local,it-it_adj_written) = real(eps_loc_new(1,1), kind=CUSTOM_REAL) - seismograms(2,irec_local,it-it_adj_written) = real(eps_loc_new(2,2), kind=CUSTOM_REAL) - seismograms(3,irec_local,it-it_adj_written) = real(eps_loc_new(3,3), kind=CUSTOM_REAL) - seismograms(4,irec_local,it-it_adj_written) = real(eps_loc_new(1,2), kind=CUSTOM_REAL) - seismograms(5,irec_local,it-it_adj_written) = real(eps_loc_new(1,3), kind=CUSTOM_REAL) - seismograms(6,irec_local,it-it_adj_written) = real(eps_loc_new(2,3), kind=CUSTOM_REAL) - - seismograms(7:9,irec_local,it-it_adj_written) = real(scale_displ*(nu_source(:,1,irec)*uxd + & + seismograms(1,irec_local,seismo_current) = real(eps_loc_new(1,1), kind=CUSTOM_REAL) + seismograms(2,irec_local,seismo_current) = real(eps_loc_new(2,2), kind=CUSTOM_REAL) + seismograms(3,irec_local,seismo_current) = real(eps_loc_new(3,3), kind=CUSTOM_REAL) + seismograms(4,irec_local,seismo_current) = real(eps_loc_new(1,2), kind=CUSTOM_REAL) + seismograms(5,irec_local,seismo_current) = real(eps_loc_new(1,3), kind=CUSTOM_REAL) + seismograms(6,irec_local,seismo_current) = real(eps_loc_new(2,3), kind=CUSTOM_REAL) + + seismograms(7:9,irec_local,seismo_current) = real(scale_displ*(nu_source(:,1,irec)*uxd + & nu_source(:,2,irec)*uyd + & nu_source(:,3,irec)*uzd), & kind=CUSTOM_REAL) @@ -281,12 +280,15 @@ subroutine compute_seismograms_adjoint(displ_crust_mantle, & gammax_crust_mantle(1,1,1,ispec),gammay_crust_mantle(1,1,1,ispec),gammaz_crust_mantle(1,1,1,ispec)) ! reverse time - timeval = dble(NSTEP-it)*DT-t0-tshift_src(irec) + timeval = dble(NSTEP-it) * DT - t0 - tshift_src(irec) ! gets source-time function value - stf = get_stf_viscoelastic(timeval,irec,it) + !#TODO: double-check if time at it or NSTEP-it+1 + ! since adjoint simulation uses reversed adjoint sources, source time seems to correspond to NSTEP-(it-1)==NSTEP-it+1 + ! therefore, when reading in an external STF, the index would be `NSTEP-it+1` rather than `it` + stf = get_stf_viscoelastic(timeval,irec,NSTEP-it+1) - stf_deltat = stf * deltat + stf_deltat = real(stf * deltat * NTSTEP_BETWEEN_OUTPUT_SAMPLE,kind=CUSTOM_REAL) ! moment derivatives moment_der(:,:,irec_local) = moment_der(:,:,irec_local) + eps_s(:,:) * stf_deltat diff --git a/src/specfem3D/get_attenuation.f90 b/src/specfem3D/get_attenuation.f90 index 771087ab5..4a61c72c4 100644 --- a/src/specfem3D/get_attenuation.f90 +++ b/src/specfem3D/get_attenuation.f90 @@ -383,8 +383,18 @@ subroutine get_attenuation_scale_factor(T_c_source, tau_mu, tau_sigma, Q_mu, sca !print *,'debug: factor scale mu0,mu = ',factor_scale_mu0,factor_scale_mu,'approximated Q = A/B',a_val/b_val !--- check that the correction factor is close to one - if (scale_factor < 0.8d0 .or. scale_factor > 1.2d0) then - print *,'Error: incorrect scale factor: ', scale_factor,'scale mu',factor_scale_mu,'scale mu0',factor_scale_mu0 + ! note: for very coarse global simulations, the reference frequency (usually 1Hz for global models) + ! and center frequency of the simulation can be far apart. + ! the scale_factor limiting range here is somewhat set arbitrarily, and enlarged to [0.5,1.5] + ! to allow for very coarse global simulation tests with sponge attenuation. + if (scale_factor < 0.5d0 .or. scale_factor > 1.5d0) then + print *,'Error: incorrect scale factor: ', scale_factor + print *,' scale factor: ', scale_factor,' should be between 0.5 to 1.5' + print *,' factor scale_mu = ',factor_scale_mu,' factor scale_mu0 = ',factor_scale_mu0 + print *,' Q value = ',Q_mu + print *,' central period = ',T_c_source * scale_t,' frequency = ',f_c_source / scale_t + print *,' model frequency = ',f_0_model / scale_t + print *,'Please check your reference frequency ATTENUATION_f0_REFERENCE in file constants.h' call exit_MPI(myrank,'incorrect correction factor in attenuation model') endif diff --git a/src/specfem3D/iterate_time.F90 b/src/specfem3D/iterate_time.F90 index 3d5e199ba..eaa94f442 100644 --- a/src/specfem3D/iterate_time.F90 +++ b/src/specfem3D/iterate_time.F90 @@ -90,6 +90,9 @@ subroutine iterate_time() close(IOUT) endif + ! synchronizes GPU kernels + if (GPU_MODE) call gpu_synchronize() + ! initialize variables for writing seismograms seismo_offset = it_begin-1 seismo_current = 0 diff --git a/src/specfem3D/iterate_time_undoatt.F90 b/src/specfem3D/iterate_time_undoatt.F90 index a81811fef..65a00f4f9 100644 --- a/src/specfem3D/iterate_time_undoatt.F90 +++ b/src/specfem3D/iterate_time_undoatt.F90 @@ -253,6 +253,9 @@ subroutine iterate_time_undoatt() close(IOUT) endif + ! synchronizes GPU kernels + if (GPU_MODE) call gpu_synchronize() + ! initializes variables for writing seismograms seismo_offset = it_begin-1 seismo_current = 0 @@ -477,7 +480,7 @@ subroutine iterate_time_undoatt() it_of_buffer = it_of_buffer + 1 if (GPU_MODE) then -#if defined(USE_CUDA) || defined(USE_OPENCL) +#if defined(USE_CUDA) || defined(USE_HIP) || defined(USE_OPENCL) if (it_of_buffer >= 2) then call unregister_host_array(b_displ_cm_store_buffer(:,:, it_of_buffer-1)) endif @@ -593,7 +596,7 @@ subroutine iterate_time_undoatt() ! adjoint simulations: kernels ! attention: for GPU_MODE and ANISOTROPIC_KL it is necessary to use resort_array (see lines 442-445) if (mod(it, ntstep_kl) == 0) then -#if defined(USE_CUDA) || defined(USE_OPENCL) +#if defined(USE_CUDA) || defined(USE_HIP) || defined(USE_OPENCL) if (GPU_MODE) then call unregister_host_array(b_displ_cm_store_buffer(:,:, it_of_buffer+1)) if (it_of_buffer > 0) then diff --git a/src/specfem3D/locate_receivers.f90 b/src/specfem3D/locate_receivers.f90 index e408d7d79..af8ba5197 100644 --- a/src/specfem3D/locate_receivers.f90 +++ b/src/specfem3D/locate_receivers.f90 @@ -746,11 +746,10 @@ subroutine read_receiver_locations() ! close receiver file close(IIN) -! BS BS begin -! In case that the same station and network name appear twice (or more times) in the STATIONS -! file, problems occur, as two (or more) seismograms are written (with mode -! "append") to a file with same name. The philosophy here is to accept multiple -! appearances and to just add a count to the station name in this case. + ! In case that the same station and network name appear twice (or more times) in the STATIONS + ! file, problems occur, as two (or more) seismograms are written (with mode + ! "append") to a file with same name. The philosophy here is to accept multiple + ! appearances and to just add a count to the station name in this case. allocate(station_duplet(nrec),stat=ier) if (ier /= 0 ) call exit_MPI(myrank,'Error allocating station_duplet array') @@ -771,7 +770,6 @@ subroutine read_receiver_locations() enddo enddo deallocate(station_duplet) -! BS BS end ! if receivers can not be buried, sets depth to zero if (.not. RECEIVERS_CAN_BE_BURIED ) stbur(:) = 0.d0 diff --git a/src/specfem3D/prepare_gpu.f90 b/src/specfem3D/prepare_gpu.f90 index d78bfca39..dca38b28b 100644 --- a/src/specfem3D/prepare_gpu.f90 +++ b/src/specfem3D/prepare_gpu.f90 @@ -41,6 +41,10 @@ subroutine prepare_GPU() ! local parameters real :: free_mb,used_mb,total_mb logical :: USE_3D_ATTENUATION_ARRAYS + ! local source arrays + real(kind=CUSTOM_REAL), dimension(:,:,:), allocatable :: stf_local,b_stf_local + real(kind=CUSTOM_REAL), dimension(:,:,:,:,:), allocatable :: sourcearrays_local + integer, dimension(:), allocatable :: ispec_selected_source_local ! checks if anything to do if (.not. GPU_MODE) return @@ -62,12 +66,17 @@ subroutine prepare_GPU() ! evaluates memory required call memory_eval_gpu() + ! creates local source arrays for gpu simulations + call prepare_local_gpu_source_arrays() + ! prepares general fields on GPU call prepare_constants_device(Mesh_pointer,myrank,NGLLX, & hprime_xx,hprimewgll_xx, & wgllwgll_xy,wgllwgll_xz,wgllwgll_yz, & - NSOURCES, nsources_local, sourcearrays, & - islice_selected_source,ispec_selected_source, & + NSOURCES, nsources_local, & + sourcearrays_local, stf_local, b_stf_local, & + ispec_selected_source_local, & + ispec_selected_source, & nrec, nrec_local, number_receiver_global, & islice_selected_rec,ispec_selected_rec, & NSPEC_CRUST_MANTLE,NGLOB_CRUST_MANTLE, & @@ -92,7 +101,10 @@ subroutine prepare_GPU() deltat, & GPU_ASYNC_COPY, & hxir_store,hetar_store,hgammar_store,nu_rec, & - SAVE_SEISMOGRAMS_STRAIN,CUSTOM_REAL) + nlength_seismogram, & + SAVE_SEISMOGRAMS_STRAIN,CUSTOM_REAL, & + USE_LDDRK, & + NSTEP,NSTAGE_TIME_SCHEME) call synchronize_all() if (SIMULATION_TYPE == 2 .or. SIMULATION_TYPE == 3) then @@ -416,6 +428,9 @@ subroutine prepare_GPU() call flush_IMAIN() endif + ! frees temporary arrays + deallocate(stf_local,b_stf_local,sourcearrays_local,ispec_selected_source_local) + ! synchronizes processes call synchronize_all() @@ -426,7 +441,7 @@ subroutine memory_eval_gpu() implicit none ! local parameters - double precision :: memory_size + double precision :: memory_size,memory_size_max integer,parameter :: NGLL2 = NGLLSQUARE integer,parameter :: NGLL3 = 125 integer,parameter :: NGLL3_PADDED = 128 @@ -445,14 +460,32 @@ subroutine memory_eval_gpu() ! sources if (SIMULATION_TYPE == 1 .or. SIMULATION_TYPE == 3) then + ! full NSOURCES not needed anymore... ! d_sourcearrays - memory_size = memory_size + NGLL3 * NSOURCES * NDIM * dble(CUSTOM_REAL) + !memory_size = memory_size + NGLL3 * NSOURCES * NDIM * dble(CUSTOM_REAL) + ! d_islice_selected_source,d_ispec_selected_source + !memory_size = memory_size + 2.d0 * NSOURCES * dble(SIZE_INTEGER) + ! local sources only + ! d_sourcearrays_local + memory_size = memory_size + NDIM * NGLL3 * nsources_local * dble(CUSTOM_REAL) + ! d_stf_local + memory_size = memory_size + nsources_local * NSTEP * NSTAGE_TIME_SCHEME * dble(CUSTOM_REAL) + if (SIMULATION_TYPE == 3 .and. USE_LDDRK .and. (.not. UNDO_ATTENUATION)) then + ! d_b_stf_local + memory_size = memory_size + nsources_local * NSTEP * NSTAGE_TIME_SCHEME * dble(CUSTOM_REAL) + endif + ! d_ispec_selected_source_local + memory_size = memory_size + nsources_local * dble(SIZE_INTEGER) + endif + if (SIMULATION_TYPE == 2) then + ! d_ispec_selected_source + memory_size = memory_size + NSOURCES * dble(SIZE_INTEGER) endif - ! d_islice_selected_source,d_ispec_selected_source - memory_size = memory_size + 2.d0 * NSOURCES * dble(SIZE_INTEGER) ! receivers - !d_number_receiver_global + ! d_seismograms + memory_size = memory_size + NDIM * nrec_local * nlength_seismogram * dble(CUSTOM_REAL) + ! d_number_receiver_global memory_size = memory_size + nrec_local * dble(SIZE_INTEGER) ! d_station_seismo_field memory_size = memory_size + NDIM * NGLL3 * nrec_local * dble(CUSTOM_REAL) @@ -462,7 +495,6 @@ subroutine memory_eval_gpu() endif ! d_ispec_selected_rec memory_size = memory_size + nrec * dble(SIZE_INTEGER) - ! d_adj_source_adjoint memory_size = memory_size + NDIM * nadj_rec_local * dble(CUSTOM_REAL) @@ -649,14 +681,181 @@ subroutine memory_eval_gpu() ! poor estimate for kernel simulations... if (SIMULATION_TYPE == 3) memory_size = 2.d0 * memory_size + ! main process gets maximum for user output + call max_all_dp(memory_size,memory_size_max) + ! user output if (myrank == 0) then write(IMAIN,*) - write(IMAIN,*) " minimum memory requested : ",sngl(memory_size / 1024.d0 / 1024.d0),"MB per process" + write(IMAIN,*) " minimum memory requested : ",sngl(memory_size_max / 1024.d0 / 1024.d0),"MB per process" write(IMAIN,*) call flush_IMAIN() endif end subroutine memory_eval_gpu + !------------------------------------------------------------------------------------------- + + subroutine prepare_local_gpu_source_arrays() + + implicit none + ! local parameters + integer :: isource_local,isource,it_tmp,istage,ier + real(kind=CUSTOM_REAL) :: stf_used + double precision :: timeval,time_t + double precision :: stf + double precision, external :: get_stf_viscoelastic + + ! allocates source arrays + if (nsources_local > 0) then + ! prepares source arrays for local sources + allocate(stf_local(nsources_local,NSTEP,NSTAGE_TIME_SCHEME), & + sourcearrays_local(NDIM,NGLLX,NGLLY,NGLLZ,nsources_local), & + ispec_selected_source_local(nsources_local), stat=ier) + else + ! dummy arrays + allocate(stf_local(1,1,1), & + sourcearrays_local(1,1,1,1,1), & + ispec_selected_source_local(1), stat=ier) + endif + if (ier /= 0) stop 'Error allocating local source arrays' + + stf_local(:,:,:) = 0.0_CUSTOM_REAL + sourcearrays_local(:,:,:,:,:) = 0.0_CUSTOM_REAL + ispec_selected_source_local(:) = 0 + + ! for backward simulations w/ LDDRK + if (nsources_local > 0 .and. SIMULATION_TYPE == 3 .and. USE_LDDRK .and. (.not. UNDO_ATTENUATION)) then + allocate(b_stf_local(nsources,NSTEP,NSTAGE_TIME_SCHEME),stat=ier) + else + ! dummy + allocate(b_stf_local(1,1,1),stat=ier) + endif + if (ier /= 0) stop 'Error allocating local source b_stf_local arrays' + b_stf_local(:,:,:) = 0.0_CUSTOM_REAL + + ! checks if anything to do + if (NOISE_TOMOGRAPHY /= 0) return + + ! loops over all sources to gather local ones + isource_local = 0 + do isource = 1,NSOURCES + if (myrank == islice_selected_source(isource)) then + ! counter + isource_local = isource_local + 1 + + ! checks + if (isource_local > nsources_local) call exit_MPI(myrank,'Error local source index exceeds nsources_local') + + ! gets local source element + ispec_selected_source_local(isource_local) = ispec_selected_source(isource) + sourcearrays_local(:,:,:,:,isource_local) = sourcearrays(:,:,:,:,isource) + + ! forward + do it_tmp = 1,NSTEP + ! time for simulation + do istage = 1, NSTAGE_TIME_SCHEME ! is equal to 1 if Newmark because only one stage then + ! sets current initial time + if (USE_LDDRK) then + ! LDDRK + ! note: the LDDRK scheme updates displacement after the stiffness computations and + ! after adding boundary/coupling/source terms. + ! thus, at each time loop step it, displ(:) is still at (n) and not (n+1) like for the Newmark scheme + ! when entering this routine. we therefore at an additional -DT to have the corresponding timing + ! for the source. + time_t = dble(it_tmp-1-1)*DT + dble(C_LDDRK(istage))*DT - t0 + else + time_t = dble(it_tmp-1)*DT - t0 + endif + + ! sets current time for this source + timeval = time_t - tshift_src(isource) + + ! source time function value (in range [-1,1] + stf = get_stf_viscoelastic(timeval,isource,it_tmp) + + ! distinguishes between single and double precision for reals + stf_used = real(stf,kind=CUSTOM_REAL) + + ! stores local source time function + ! we use an ordering (isource,it,istage) to make it easier to loop over many local sources + ! for a given time in the gpu kernel + stf_local(isource_local,it_tmp,istage) = stf_used + enddo ! istage + enddo + + ! backward + if (SIMULATION_TYPE == 3) then + ! since there is a different forward and backward timing for the LDDRK scheme, + ! we allocate and compute the source time function again for kernel runs in `b_stf_local(:,:,:)`. + ! + ! TODO in future: we might want to allocate only 1 source time function array `stf_local` and + ! make sure that the indexing is producing the same stf values as needed. + do it_tmp = 1,NSTEP + do istage = 1, NSTAGE_TIME_SCHEME ! is equal to 1 if Newmark because only one stage then + ! note on backward/reconstructed wavefields: + ! time for b_displ( it ) corresponds to (NSTEP - (it-1) - 1 )*DT - t0 ... + ! as we start with saved wavefields b_displ( 1 ) = displ( NSTEP ) which correspond + ! to a time (NSTEP - 1)*DT - t0 + ! (see sources for simulation_type 1 and seismograms) + ! + ! now, at the beginning of the time loop, the numerical Newmark time scheme updates + ! the wavefields, that is b_displ( it=1) would correspond to time (NSTEP -1 - 1)*DT - t0. + ! however, we read in the backward/reconstructed wavefields at the end of the Newmark time scheme + ! in the first (it=1) time loop. + ! this leads to the timing (NSTEP-(it-1)-1)*DT-t0-tshift_src for the source time function here + ! + ! sets current initial time + if (USE_LDDRK) then + ! LDDRK + ! note: the LDDRK scheme updates displacement after the stiffness computations and + ! after adding boundary/coupling/source terms. + ! thus, at each time loop step it, displ(:) is still at (n) and not (n+1) like for the Newmark scheme + ! when entering this routine. we therefore at an additional -DT to have the corresponding timing + ! for the source. + if (UNDO_ATTENUATION) then + ! stepping moves forward from snapshot position + time_t = dble(NSTEP-it_tmp-1)*DT + dble(C_LDDRK(istage))*DT - t0 + else + ! stepping backwards + time_t = dble(NSTEP-it_tmp-1)*DT - dble(C_LDDRK(istage))*DT - t0 + endif + else + time_t = dble(NSTEP-it_tmp)*DT - t0 + endif + + ! sets current time for this source + timeval = time_t - tshift_src(isource) + + ! determines source time function value + stf = get_stf_viscoelastic(timeval,isource,it_tmp) + + ! distinguishes between single and double precision for reals + stf_used = real(stf,kind=CUSTOM_REAL) + + ! stores local source time function + if (USE_LDDRK .and. (.not. UNDO_ATTENUATION)) then + ! only needed to store for LDDRK stepping backwards, as LDDRK uses non-symmetric step lengths (C_LDDRK(istage)) + ! for all other cases, we can use the forward stf_local array with corresponding indexing. + ! for example, Newmark stepping: + ! backward time_t = dble(NSTEP-it_tmp)*DT - t0 with it_tmp 1..NSTEP + ! forward time_t = dble(it_tmp-1)*DT - t0 with it_tmp 1..NSTEP + ! -> backward indexing: + ! it_tmp_backward == 1 -> it_tmp_forward = NSTEP + ! it_tmp_backward == 2 -> it_tmp_forward = NSTEP-1 + ! .. + ! it_tmp_backward == NSTEP -> it_tmp_forward = 1 + ! and b_stf_local == stf_local(istage, NSTEP-(it_tmp_backward-1), isource_local) + b_stf_local(isource_local,it_tmp,istage) = stf_used + endif + enddo ! istage + enddo + endif ! backward + + endif + enddo ! NSOURCES + + end subroutine prepare_local_gpu_source_arrays + + end subroutine prepare_GPU diff --git a/src/specfem3D/setup_sources_receivers.f90 b/src/specfem3D/setup_sources_receivers.f90 index e8e9c9cc4..d51128d09 100644 --- a/src/specfem3D/setup_sources_receivers.f90 +++ b/src/specfem3D/setup_sources_receivers.f90 @@ -707,7 +707,7 @@ subroutine setup_sources() endif ! sources - ! BS BS moved open statement and writing of first lines into sr.vtk before the + ! moved open statement and writing of first lines into sr.vtk before the ! call to locate_sources, where further write statements to that file follow if (myrank == 0) then ! write source and receiver VTK files for Paraview @@ -968,6 +968,39 @@ subroutine setup_timesteps() endif endif + ! make sure NSTEP is a multiple of NTSTEP_BETWEEN_OUTPUT_SAMPLE + ! if not, increase it a little bit, to the next multiple + if (mod(NSTEP,NTSTEP_BETWEEN_OUTPUT_SAMPLE) /= 0) then + if (NOISE_TOMOGRAPHY /= 0) then + if (myrank == 0) then + print *,'Noise simulation: Invalid number of NSTEP = ',NSTEP + print *,'Must be a multiple of NTSTEP_BETWEEN_OUTPUT_SAMPLE = ',NTSTEP_BETWEEN_OUTPUT_SAMPLE + endif + stop 'Error: NSTEP must be a multiple of NTSTEP_BETWEEN_OUTPUT_SAMPLE' + else + NSTEP = (NSTEP/NTSTEP_BETWEEN_OUTPUT_SAMPLE + 1)*NTSTEP_BETWEEN_OUTPUT_SAMPLE + ! user output + if (myrank == 0) then + print * + print *,'NSTEP is not a multiple of NTSTEP_BETWEEN_OUTPUT_SAMPLE' + print *,'thus increasing it automatically to the next multiple, which is ',NSTEP + print * + endif + endif + endif + + ! output seismograms at least once at the end of the simulation + NTSTEP_BETWEEN_OUTPUT_SEISMOS = min(NSTEP,NTSTEP_BETWEEN_OUTPUT_SEISMOS) + + ! make sure NSTEP_BETWEEN_OUTPUT_SEISMOS is a multiple of NTSTEP_BETWEEN_OUTPUT_SAMPLE + if (mod(NTSTEP_BETWEEN_OUTPUT_SEISMOS,NTSTEP_BETWEEN_OUTPUT_SAMPLE) /= 0) then + if (myrank == 0) then + print *,'Invalid number of NTSTEP_BETWEEN_OUTPUT_SEISMOS = ',NTSTEP_BETWEEN_OUTPUT_SEISMOS + print *,'Must be a multiple of NTSTEP_BETWEEN_OUTPUT_SAMPLE = ',NTSTEP_BETWEEN_OUTPUT_SAMPLE + endif + stop 'Error: NTSTEP_BETWEEN_OUTPUT_SEISMOS must be a multiple of NTSTEP_BETWEEN_OUTPUT_SAMPLE' + endif + ! time loop increments end it_end = NSTEP @@ -1207,12 +1240,26 @@ subroutine setup_receivers() endif ! seismograms + ! check if we need to save seismos + if (SIMULATION_TYPE == 3 .and. (.not. SAVE_SEISMOGRAMS_IN_ADJOINT_RUN)) then + do_save_seismograms = .false. + else + do_save_seismograms = .true. + endif + + ! sets local receivers to zero if no seismogram needs to be saved + if (.not. do_save_seismograms) nrec_local = 0 + + ! seismogram array length (to write out time portions of the full seismograms) + nlength_seismogram = NTSTEP_BETWEEN_OUTPUT_SEISMOS / NTSTEP_BETWEEN_OUTPUT_SAMPLE + ! gather from secondary processes on main tmp_rec_local_all(:) = 0 tmp_rec_local_all(0) = nrec_local if (NPROCTOT_VAL > 1) then call gather_all_singlei(nrec_local,tmp_rec_local_all,NPROCTOT_VAL) endif + ! user output if (myrank == 0) then ! determines maximum number of local receivers and corresponding rank @@ -1221,20 +1268,29 @@ subroutine setup_receivers() maxproc = maxloc(tmp_rec_local_all(:)) - 1 ! seismograms array size in MB if (SIMULATION_TYPE == 1 .or. SIMULATION_TYPE == 3) then - ! seismograms need seismograms(NDIM,nrec_local,NTSTEP_BETWEEN_OUTPUT_SEISMOS) - sizeval = dble(maxrec) * dble(NDIM * NTSTEP_BETWEEN_OUTPUT_SEISMOS * CUSTOM_REAL / 1024. / 1024. ) + ! seismograms need seismograms(NDIM,nrec_local,nlength_seismogram) + sizeval = dble(maxrec) * dble(NDIM * nlength_seismogram * CUSTOM_REAL / 1024. / 1024. ) else - ! adjoint seismograms need seismograms(NDIM*NDIM,nrec_local,NTSTEP_BETWEEN_OUTPUT_SEISMOS) - sizeval = dble(maxrec) * dble(NDIM * NDIM * NTSTEP_BETWEEN_OUTPUT_SEISMOS * CUSTOM_REAL / 1024. / 1024. ) + ! adjoint seismograms need seismograms(NDIM*NDIM,nrec_local,nlength_seismogram) + sizeval = dble(maxrec) * dble(NDIM * NDIM * nlength_seismogram * CUSTOM_REAL / 1024. / 1024. ) endif + ! outputs info write(IMAIN,*) 'seismograms:' - if (WRITE_SEISMOGRAMS_BY_MAIN) then - write(IMAIN,*) ' seismograms written by main process only' + if (do_save_seismograms) then + if (WRITE_SEISMOGRAMS_BY_MAIN) then + write(IMAIN,*) ' seismograms written by main process only' + else + write(IMAIN,*) ' seismograms written by all processes' + endif else - write(IMAIN,*) ' seismograms written by all processes' + write(IMAIN,*) ' seismograms will not be saved' endif + write(IMAIN,*) ' Total number of simulation steps (NSTEP) = ',NSTEP write(IMAIN,*) ' writing out seismograms at every NTSTEP_BETWEEN_OUTPUT_SEISMOS = ',NTSTEP_BETWEEN_OUTPUT_SEISMOS + write(IMAIN,*) ' number of subsampling steps for seismograms = ',NTSTEP_BETWEEN_OUTPUT_SAMPLE + write(IMAIN,*) ' Total number of samples for seismograms = ',NSTEP/NTSTEP_BETWEEN_OUTPUT_SAMPLE + write(IMAIN,*) write(IMAIN,*) ' maximum number of local receivers is ',maxrec,' in slice ',maxproc(1) write(IMAIN,*) ' size of maximum seismogram array = ', sngl(sizeval),'MB' write(IMAIN,*) ' = ', sngl(sizeval/1024.d0),'GB' @@ -1777,11 +1833,11 @@ subroutine setup_receivers_precompute_intp() ! allocates seismogram array if (SIMULATION_TYPE == 1 .or. SIMULATION_TYPE == 3) then - allocate(seismograms(NDIM,nrec_local,NTSTEP_BETWEEN_OUTPUT_SEISMOS),stat=ier) + allocate(seismograms(NDIM,nrec_local,nlength_seismogram),stat=ier) if (ier /= 0) stop 'Error while allocating seismograms' else ! adjoint seismograms - allocate(seismograms(NDIM*NDIM,nrec_local,NTSTEP_BETWEEN_OUTPUT_SEISMOS),stat=ier) + allocate(seismograms(NDIM*NDIM,nrec_local,nlength_seismogram),stat=ier) if (ier /= 0) stop 'Error while allocating adjoint seismograms' ! allocates Frechet derivatives array @@ -1796,15 +1852,15 @@ subroutine setup_receivers_precompute_intp() stshift_der(:) = 0._CUSTOM_REAL shdur_der(:) = 0._CUSTOM_REAL endif + ! initializes seismograms seismograms(:,:,:) = 0._CUSTOM_REAL - ! adjoint seismograms - it_adj_written = 0 + else ! dummy arrays ! allocates dummy array since we need it to pass as argument e.g. in write_seismograms() routine ! note: nrec_local is zero, Fortran 90/95 should allow zero-sized array allocation... - allocate(seismograms(NDIM,0,NTSTEP_BETWEEN_OUTPUT_SEISMOS),stat=ier) + allocate(seismograms(NDIM,0,nlength_seismogram),stat=ier) if (ier /= 0) stop 'Error while allocating zero seismograms' ! dummy allocation allocate(hxir_store(1,1), & @@ -1816,7 +1872,7 @@ subroutine setup_receivers_precompute_intp() ! strain seismograms if (SAVE_SEISMOGRAMS_STRAIN) then if (nrec_local > 0) then - allocate(seismograms_eps(6,nrec_local,NTSTEP_BETWEEN_OUTPUT_SEISMOS),stat=ier) + allocate(seismograms_eps(6,nrec_local,nlength_seismogram),stat=ier) if (ier /= 0) stop 'Error while allocating strain seismograms' seismograms_eps(:,:,:) = 0._CUSTOM_REAL else @@ -1928,7 +1984,7 @@ subroutine setup_receivers_precompute_intp() ! ASDF seismograms if (OUTPUT_SEISMOS_ASDF) then - if (.not. (SIMULATION_TYPE == 3 .and. (.not. SAVE_SEISMOGRAMS_IN_ADJOINT_RUN)) ) then + if (do_save_seismograms) then ! initializes the ASDF data structure by allocating arrays call init_asdf_data(nrec_local) call synchronize_all() diff --git a/src/specfem3D/specfem3D_par.F90 b/src/specfem3D/specfem3D_par.F90 index d52c25681..b1b6e73aa 100644 --- a/src/specfem3D/specfem3D_par.F90 +++ b/src/specfem3D/specfem3D_par.F90 @@ -352,16 +352,15 @@ module specfem_par !----------------------------------------------------------------- ! seismograms - integer :: it_begin,it_end real(kind=CUSTOM_REAL), dimension(:,:,:), allocatable :: seismograms + + integer :: nlength_seismogram integer :: seismo_offset, seismo_current + logical :: do_save_seismograms ! strain seismograms real(kind=CUSTOM_REAL), dimension(:,:,:), allocatable :: seismograms_eps - ! adjoint seismograms - integer :: it_adj_written - ! for SAC headers for seismograms integer :: yr_SAC,jda_SAC,mo_SAC,da_SAC,ho_SAC,mi_SAC double precision :: sec_SAC @@ -469,6 +468,7 @@ module specfem_par !----------------------------------------------------------------- integer :: it + integer :: it_begin,it_end ! non-dimensionalization double precision :: scale_t,scale_t_inv,scale_displ,scale_displ_inv,scale_veloc diff --git a/src/specfem3D/write_output_ASCII.f90 b/src/specfem3D/write_output_ASCII.f90 index b7299bc21..96622b2cf 100644 --- a/src/specfem3D/write_output_ASCII.f90 +++ b/src/specfem3D/write_output_ASCII.f90 @@ -37,29 +37,27 @@ subroutine write_output_ASCII(seismogram_tmp,iorientation,sisname,sisname_big_fi use specfem_par, only: & DT,t0,NSTEP, & - seismo_offset,seismo_current, NTSTEP_BETWEEN_OUTPUT_SAMPLE, & - NTSTEP_BETWEEN_OUTPUT_SEISMOS,OUTPUT_FILES,SIMULATION_TYPE, & + seismo_offset,seismo_current, & + NTSTEP_BETWEEN_OUTPUT_SAMPLE, & + nlength_seismogram, & + OUTPUT_FILES,SIMULATION_TYPE, & SAVE_ALL_SEISMOS_IN_ONE_FILE,USE_BINARY_FOR_LARGE_FILE, & myrank implicit none - real(kind=CUSTOM_REAL), dimension(5,NTSTEP_BETWEEN_OUTPUT_SEISMOS) :: seismogram_tmp + real(kind=CUSTOM_REAL), dimension(5,nlength_seismogram) :: seismogram_tmp integer :: iorientation character(len=MAX_STRING_LEN) :: sisname,sisname_big_file ! local parameters - integer :: it + integer :: it_current integer :: ier,isample - integer :: seismo_current_used - double precision :: value - double precision :: timeval + real(kind=CUSTOM_REAL) :: value,time_t + double precision :: time_t_db character(len=MAX_STRING_LEN) :: sisname_2 - ! actual seismogram length - seismo_current_used = ceiling(real(seismo_current) / NTSTEP_BETWEEN_OUTPUT_SAMPLE) - ! add .ascii extension to seismogram file name for ASCII seismograms write(sisname_2,"('/',a,'.ascii')") trim(sisname) @@ -85,28 +83,31 @@ subroutine write_output_ASCII(seismogram_tmp,iorientation,sisname,sisname_big_fi endif ! subtract half duration of the source to make sure travel time is correct - do isample = 1,seismo_current_used + do isample = 1,seismo_current ! seismogram value - value = dble(seismogram_tmp(iorientation,isample)) + value = seismogram_tmp(iorientation,isample) ! current time increment - it = seismo_offset + isample + it_current = seismo_offset + isample ! current time if (SIMULATION_TYPE == 3) then - timeval = dble((NSTEP-it)*NTSTEP_BETWEEN_OUTPUT_SAMPLE)*DT - t0 + time_t_db = dble(NSTEP - it_current * NTSTEP_BETWEEN_OUTPUT_SAMPLE) * DT - t0 else - timeval = dble((it-1)*NTSTEP_BETWEEN_OUTPUT_SAMPLE)*DT - t0 + time_t_db = dble( (it_current-1) * NTSTEP_BETWEEN_OUTPUT_SAMPLE) * DT - t0 endif + ! converts time to CUSTOM_REAL for output + time_t = real(time_t_db,kind=CUSTOM_REAL) + ! writes out to file if (SAVE_ALL_SEISMOS_IN_ONE_FILE .and. USE_BINARY_FOR_LARGE_FILE) then ! distinguish between single and double precision for reals - write(IOUT) real(timeval, kind=CUSTOM_REAL), real(value, kind=CUSTOM_REAL) + write(IOUT) time_t,value else ! distinguish between single and double precision for reals - write(IOUT,*) real(timeval, kind=CUSTOM_REAL), ' ', real(value, kind=CUSTOM_REAL) + write(IOUT,*) time_t,value endif enddo diff --git a/src/specfem3D/write_output_ASDF.f90 b/src/specfem3D/write_output_ASDF.f90 index fbe8bf419..7c784d7b3 100644 --- a/src/specfem3D/write_output_ASDF.f90 +++ b/src/specfem3D/write_output_ASDF.f90 @@ -29,8 +29,8 @@ !! \param nrec_local The number of receivers on the local processor subroutine init_asdf_data(nrec_local) - use constants, only: NDIM - use specfem_par, only: myrank,NSTEP,NTSTEP_BETWEEN_OUTPUT_SEISMOS + use constants, only: NDIM,myrank + use specfem_par, only: NSTEP,NTSTEP_BETWEEN_OUTPUT_SEISMOS use asdf_data, only: asdf_container @@ -94,10 +94,11 @@ subroutine store_asdf_data(seismogram_tmp, irec_local, irec, chn, iorientation) use constants, only: CUSTOM_REAL,NDIM,myrank use specfem_par, only: & + seismo_current, & + nlength_seismogram, & station_name,network_name, & - seismo_current, NTSTEP_BETWEEN_OUTPUT_SEISMOS, myrank, & stlat, stlon, stele, stbur, & - ROTATE_SEISMOGRAMS_RT,NTSTEP_BETWEEN_OUTPUT_SAMPLE + ROTATE_SEISMOGRAMS_RT use asdf_data, only: asdf_container @@ -106,17 +107,13 @@ subroutine store_asdf_data(seismogram_tmp, irec_local, irec, chn, iorientation) ! Parameters character(len=4),intent(in) :: chn integer,intent(in) :: irec_local, irec, iorientation - real(kind=CUSTOM_REAL),dimension(5,NTSTEP_BETWEEN_OUTPUT_SEISMOS),intent(in) :: seismogram_tmp + real(kind=CUSTOM_REAL),dimension(5,nlength_seismogram),intent(in) :: seismogram_tmp ! local Variables integer :: length_station_name, length_network_name integer :: ier, i, index_increment - integer :: seismo_current_used - - ! actual seismogram length - seismo_current_used = ceiling(real(seismo_current) / NTSTEP_BETWEEN_OUTPUT_SAMPLE) - ! BS BS: In case that components are rotated to radial and transverse, we need to + ! In case that components are rotated to radial and transverse, we need to ! change the way the trace index is incremented below. Otherwise, the code crashes ! in close_asdf_data() when trying to deallocate the records in asdf_container if (ROTATE_SEISMOGRAMS_RT) then @@ -140,14 +137,14 @@ subroutine store_asdf_data(seismogram_tmp, irec_local, irec, chn, iorientation) asdf_container%receiver_el(irec_local) = stele(irec) asdf_container%receiver_dpt(irec_local) = stbur(irec) - allocate(asdf_container%records(i)%record(seismo_current_used),stat=ier) + allocate(asdf_container%records(i)%record(seismo_current),stat=ier) if (ier /= 0) call exit_MPI (myrank, 'Allocating ASDF container failed.') ! initializes asdf_container%records(i)%record(:) = 0.0 ! seismogram as real data - asdf_container%records(i)%record(1:seismo_current_used) = real(seismogram_tmp(iorientation, 1:seismo_current_used),kind=4) + asdf_container%records(i)%record(1:seismo_current) = real(seismogram_tmp(iorientation, 1:seismo_current),kind=4) end subroutine store_asdf_data @@ -210,7 +207,6 @@ subroutine write_asdf() integer :: num_stations integer :: stationxml_length integer :: nsamples ! constant, as in SPECFEM - integer :: seismo_current_used double precision :: sampling_rate double precision :: startTime integer(kind=8) :: start_time @@ -276,13 +272,10 @@ subroutine write_asdf() call world_duplicate(comm) call world_size(mysize) - ! actual seismogram length - seismo_current_used = ceiling(real(seismo_current) / NTSTEP_BETWEEN_OUTPUT_SAMPLE) - num_stations = asdf_container%nrec_local sampling_rate = 1.0/(DT*NTSTEP_BETWEEN_OUTPUT_SAMPLE) - ! BS BS: The total number of samples to be written to the ASDF file should be NSTEP. + ! note: The total number of samples to be written to the ASDF file should be NSTEP. ! seismo_current is equivalent to NTSTEP_BETWEEN_OUTPUT_SEISMOS (except when it==it_end), as only in this case ! the routine is called from write_seismograms() !nsamples = seismo_current * (NSTEP / NTSTEP_BETWEEN_OUTPUT_SEISMOS) @@ -440,13 +433,13 @@ subroutine write_asdf() enddo call all_gather_all_ch(networks_names, & - num_stations * MAX_LENGTH_NETWORK_NAME, & - network_names_gather, & - rcounts, & - displs, & - max_num_stations_gather, & - MAX_LENGTH_NETWORK_NAME, & - mysize) + num_stations * MAX_LENGTH_NETWORK_NAME, & + network_names_gather, & + rcounts, & + displs, & + max_num_stations_gather, & + MAX_LENGTH_NETWORK_NAME, & + mysize) do i = 1, mysize displs(i) = (i-1) * max_num_stations_gather * NDIM @@ -454,13 +447,13 @@ subroutine write_asdf() enddo call all_gather_all_ch(component_names, & - num_stations*NDIM*NDIM, & - component_names_gather, & - rcounts*NDIM, & - displs*NDIM, & - max_num_stations_gather*NDIM, & - NDIM, & - mysize) + num_stations*NDIM*NDIM, & + component_names_gather, & + rcounts*NDIM, & + displs*NDIM, & + max_num_stations_gather*NDIM, & + NDIM, & + mysize) ! Now gather all the coordiante information for these stations do i = 1, mysize @@ -503,7 +496,7 @@ subroutine write_asdf() deallocate(displs) deallocate(rcounts) - allocate(one_seismogram(NDIM,seismo_current_used),stat=ier) + allocate(one_seismogram(NDIM,seismo_current),stat=ier) if (ier /= 0) call exit_MPI(myrank,'error allocating one_seismogram array') one_seismogram(:,:) = 0.0 @@ -515,7 +508,7 @@ subroutine write_asdf() if (seismo_offset == 0) then if (myrank == 0) then ! user output - write(IMAIN,*) 'creating ASDF file: ',trim(OUTPUT_FILES) // "synthetic.h5" + write(IMAIN,*) ' seismogram ASDF file: ',trim(OUTPUT_FILES) // "synthetic.h5" call flush_IMAIN() ! initialize HDF5 @@ -669,7 +662,7 @@ subroutine write_asdf() if (myrank == 0) then do i = 1, NDIM !write(*,*) j, l, l+i, size(asdf_container%records) - one_seismogram(i,:) = asdf_container%records(l+i)%record(1:seismo_current_used) + one_seismogram(i,:) = asdf_container%records(l+i)%record(1:seismo_current) enddo endif @@ -678,14 +671,14 @@ subroutine write_asdf() if (myrank == current_proc) then !one_seismogram(:,:) = seismograms(:,j,:) do i = 1, NDIM - one_seismogram(i,:) = asdf_container%records(l+i)%record(1:seismo_current_used) + one_seismogram(i,:) = asdf_container%records(l+i)%record(1:seismo_current) enddo ! send (real) data - call send_r(one_seismogram,NDIM*seismo_current_used,receiver,itag) + call send_r(one_seismogram,NDIM*seismo_current,receiver,itag) else if (myrank == 0) then ! receive (real) data - call recv_r(one_seismogram,NDIM*seismo_current_used,sender,itag) + call recv_r(one_seismogram,NDIM*seismo_current,sender,itag) endif endif @@ -714,7 +707,7 @@ subroutine write_asdf() ! writes (float) data call ASDF_write_partial_waveform_f(data_ids(i), & - one_seismogram(i,1:seismo_current_used), seismo_offset, seismo_current_used, ier) + one_seismogram(i,1:seismo_current), seismo_offset, seismo_current, ier) if (ier /= 0) call exit_MPI(myrank,'Error ASDF write partial waveform failed') call ASDF_close_dataset_f(data_ids(i), ier) @@ -793,11 +786,11 @@ subroutine write_asdf() data_ids(i)) if (myrank == current_proc) then - one_seismogram(i,:) = asdf_container%records(l+i)%record(1:seismo_current_used) + one_seismogram(i,:) = asdf_container%records(l+i)%record(1:seismo_current) ! writes (float) data call ASDF_write_partial_waveform_f(data_ids(i), & - one_seismogram(i,1:seismo_current_used), seismo_offset, seismo_current_used, ier) + one_seismogram(i,1:seismo_current), seismo_offset, seismo_current, ier) if (ier /= 0) call exit_MPI(myrank,'Error ASDF parallel write partial waveform failed') endif diff --git a/src/specfem3D/write_output_SAC.f90 b/src/specfem3D/write_output_SAC.f90 index 09c86e10c..e863774f8 100644 --- a/src/specfem3D/write_output_SAC.f90 +++ b/src/specfem3D/write_output_SAC.f90 @@ -39,8 +39,9 @@ subroutine write_output_SAC(seismogram_tmp,irec,iorientation,sisname,chn,phi) DT,t0, & seismo_offset,seismo_current,it_end, & OUTPUT_SEISMOS_SAC_ALPHANUM,OUTPUT_SEISMOS_SAC_BINARY, & - NTSTEP_BETWEEN_OUTPUT_SEISMOS, & - MODEL,OUTPUT_FILES,NTSTEP_BETWEEN_OUTPUT_SAMPLE + MODEL,OUTPUT_FILES, & + NTSTEP_BETWEEN_OUTPUT_SAMPLE, & + nlength_seismogram use specfem_par, only: & yr => yr_SAC,jda => jda_SAC,ho => ho_SAC,mi => mi_SAC,sec => sec_SAC, & @@ -51,7 +52,7 @@ subroutine write_output_SAC(seismogram_tmp,irec,iorientation,sisname,chn,phi) implicit none - real(kind=CUSTOM_REAL), dimension(5,NTSTEP_BETWEEN_OUTPUT_SEISMOS) :: seismogram_tmp + real(kind=CUSTOM_REAL), dimension(5,nlength_seismogram) :: seismogram_tmp integer :: irec integer :: iorientation @@ -63,7 +64,7 @@ subroutine write_output_SAC(seismogram_tmp,irec,iorientation,sisname,chn,phi) ! local parameters double precision :: btime - real, dimension(NTSTEP_BETWEEN_OUTPUT_SEISMOS) :: tmp + real, dimension(nlength_seismogram) :: tmp integer :: time_sec,isample character(len=MAX_STRING_LEN) :: sisname_2 @@ -116,7 +117,6 @@ subroutine write_output_SAC(seismogram_tmp,irec,iorientation,sisname,chn,phi) logical, external :: is_leap_year integer :: imodulo_5 - integer :: seismo_current_used !---------------------------------------------------------------- @@ -219,9 +219,6 @@ subroutine write_output_SAC(seismogram_tmp,irec,iorientation,sisname,chn,phi) !USER2 = sngl(depth) !USER3 = sngl(cmt_hdur) !half duration from CMT if not changed to t0 = 0.d0 (point source) - ! actual seismogram length - seismo_current_used = ceiling(real(seismo_current) / NTSTEP_BETWEEN_OUTPUT_SAMPLE) - ! to avoid compiler warnings value1 = elat value1 = elon @@ -451,10 +448,10 @@ subroutine write_output_SAC(seismogram_tmp,irec,iorientation,sisname,chn,phi) ! now write data - with five values per row: ! --------------- - imodulo_5 = mod(seismo_current_used,5) + imodulo_5 = mod(seismo_current,5) if (imodulo_5 == 0) then ! five values per row - do isample = 1+5,seismo_current_used+1,5 + do isample = 1+5,seismo_current+1,5 value1 = dble(seismogram_tmp(iorientation,isample-5)) value2 = dble(seismogram_tmp(iorientation,isample-4)) value3 = dble(seismogram_tmp(iorientation,isample-3)) @@ -464,7 +461,7 @@ subroutine write_output_SAC(seismogram_tmp,irec,iorientation,sisname,chn,phi) enddo else ! five values per row as long as possible - do isample = 1+5,(seismo_current_used-imodulo_5)+1,5 + do isample = 1+5,(seismo_current-imodulo_5)+1,5 value1 = dble(seismogram_tmp(iorientation,isample-5)) value2 = dble(seismogram_tmp(iorientation,isample-4)) value3 = dble(seismogram_tmp(iorientation,isample-3)) @@ -473,10 +470,10 @@ subroutine write_output_SAC(seismogram_tmp,irec,iorientation,sisname,chn,phi) write(IOUT_SAC,510) sngl(value1),sngl(value2),sngl(value3),sngl(value4),sngl(value5) enddo ! loads remaining values - if (imodulo_5 >= 1) value1 = dble(seismogram_tmp(iorientation,seismo_current_used-imodulo_5+1)) - if (imodulo_5 >= 2) value2 = dble(seismogram_tmp(iorientation,seismo_current_used-imodulo_5+2)) - if (imodulo_5 >= 3) value3 = dble(seismogram_tmp(iorientation,seismo_current_used-imodulo_5+3)) - if (imodulo_5 >= 4) value4 = dble(seismogram_tmp(iorientation,seismo_current_used-imodulo_5+4)) + if (imodulo_5 >= 1) value1 = dble(seismogram_tmp(iorientation,seismo_current-imodulo_5+1)) + if (imodulo_5 >= 2) value2 = dble(seismogram_tmp(iorientation,seismo_current-imodulo_5+2)) + if (imodulo_5 >= 3) value3 = dble(seismogram_tmp(iorientation,seismo_current-imodulo_5+3)) + if (imodulo_5 >= 4) value4 = dble(seismogram_tmp(iorientation,seismo_current-imodulo_5+4)) ! writes out last data line select case(imodulo_5) case (1) @@ -626,7 +623,6 @@ subroutine write_output_SAC(seismogram_tmp,irec,iorientation,sisname,chn,phi) call write_integer(LCALDA) !(109) call write_integer(int(UNUSED)) !(110) - ! write character header variables 111:302 call write_character(KSTNM,8) !(111:118) call write_character(KEVNM,16) !(119:134) @@ -655,10 +651,10 @@ subroutine write_output_SAC(seismogram_tmp,irec,iorientation,sisname,chn,phi) endif ! now write SAC time series to file - ! BS BS write whole time series at once (hope to increase I/O performance - ! compared to using a loop on it) - tmp(1:seismo_current_used) = real(seismogram_tmp(iorientation,1:seismo_current_used)) - call write_n_real(tmp(1:seismo_current_used),seismo_current_used) + ! write whole time series at once (hope to increase I/O performance compared to using a loop on it) + tmp(1:seismo_current) = real(seismogram_tmp(iorientation,1:seismo_current)) + + call write_n_real(tmp(1:seismo_current),seismo_current) call close_file() diff --git a/src/specfem3D/write_seismograms.f90 b/src/specfem3D/write_seismograms.f90 index 4f7e7d52e..f42ecbbc5 100644 --- a/src/specfem3D/write_seismograms.f90 +++ b/src/specfem3D/write_seismograms.f90 @@ -33,10 +33,16 @@ subroutine write_seismograms() use specfem_par, only: myrank,Mesh_pointer,GPU_MODE,GPU_ASYNC_COPY,SIMULATION_TYPE, & nrec_local,number_receiver_global,ispec_selected_rec,ispec_selected_source, & - it,it_begin,it_end,seismo_current,seismo_offset, seismograms,NTSTEP_BETWEEN_OUTPUT_SEISMOS, & + it,it_end, & + seismo_current,seismo_offset, & + seismograms, & + nlength_seismogram, & + NTSTEP_BETWEEN_OUTPUT_SEISMOS,NTSTEP_BETWEEN_OUTPUT_SAMPLE, & + do_save_seismograms, & WRITE_SEISMOGRAMS_BY_MAIN,OUTPUT_SEISMOS_ASDF, & SAVE_SEISMOGRAMS_IN_ADJOINT_RUN,SAVE_SEISMOGRAMS_STRAIN, & - it_adj_written,moment_der,sloc_der,shdur_der,stshift_der,scale_displ,NSTEP + moment_der,sloc_der,shdur_der,stshift_der, & + scale_displ use specfem_par_crustmantle, only: displ_crust_mantle,b_displ_crust_mantle, & eps_trace_over_3_crust_mantle,epsilondev_xx_crust_mantle,epsilondev_xy_crust_mantle,epsilondev_xz_crust_mantle, & @@ -50,63 +56,68 @@ subroutine write_seismograms() double precision, external :: wtime double precision :: write_time_begin,write_time - ! update position in seismograms - seismo_current = seismo_current + 1 - - ! compute & store the seismograms only if there is at least one receiver located in this slice - if (nrec_local > 0) then - - ! gets resulting array values onto CPU - if (GPU_MODE) then - ! for forward and kernel simulations, seismograms are computed by the GPU, thus no need to transfer the wavefield - ! gets field values from GPU - if (SIMULATION_TYPE == 2 .or. SAVE_SEISMOGRAMS_STRAIN) then - ! this transfers fields only in elements with stations for efficiency - call write_seismograms_transfer_gpu(Mesh_pointer, & - displ_crust_mantle,b_displ_crust_mantle, & - eps_trace_over_3_crust_mantle, & - epsilondev_xx_crust_mantle,epsilondev_yy_crust_mantle,epsilondev_xy_crust_mantle, & - epsilondev_xz_crust_mantle,epsilondev_yz_crust_mantle, & - number_receiver_global, & - ispec_selected_rec,ispec_selected_source, & - ibool_crust_mantle) - ! synchronizes field values from GPU - if (GPU_ASYNC_COPY) then - call transfer_seismo_from_device_async(Mesh_pointer, & - displ_crust_mantle,b_displ_crust_mantle, & - number_receiver_global,ispec_selected_rec,ispec_selected_source, & - ibool_crust_mantle) + ! checks if anything to do + if (.not. do_save_seismograms) return + + ! checks subsampling recurrence + if (mod(it-1,NTSTEP_BETWEEN_OUTPUT_SAMPLE) == 0) then + + ! update position in seismograms + seismo_current = seismo_current + 1 + + ! check for edge effects + if (seismo_current < 1 .or. seismo_current > nlength_seismogram) & + call exit_mpi(myrank,'Error: seismo_current out of bounds in recording of seismograms') + + ! compute & store the seismograms only if there is at least one receiver located in this slice + if (nrec_local > 0) then + ! gets resulting array values onto CPU + if (GPU_MODE) then + ! for forward and kernel simulations, seismograms are computed by the GPU, thus no need to transfer the wavefield + ! gets field values from GPU + if (SIMULATION_TYPE == 2 .or. SAVE_SEISMOGRAMS_STRAIN) then + ! this transfers fields only in elements with stations for efficiency + call write_seismograms_transfer_gpu(Mesh_pointer, & + displ_crust_mantle,b_displ_crust_mantle, & + eps_trace_over_3_crust_mantle, & + epsilondev_xx_crust_mantle,epsilondev_yy_crust_mantle,epsilondev_xy_crust_mantle, & + epsilondev_xz_crust_mantle,epsilondev_yz_crust_mantle, & + number_receiver_global, & + ispec_selected_rec,ispec_selected_source, & + ibool_crust_mantle) + ! synchronizes field values from GPU + if (GPU_ASYNC_COPY) then + call transfer_seismo_from_device_async(Mesh_pointer, & + displ_crust_mantle,b_displ_crust_mantle, & + number_receiver_global,ispec_selected_rec,ispec_selected_source, & + ibool_crust_mantle) + endif endif - endif - endif ! GPU_MODE + endif ! GPU_MODE - ! computes traces at interpolated receiver locations - select case (SIMULATION_TYPE) - case (1) - ! forward run - if (.not. GPU_MODE) then - ! on CPU - call compute_seismograms(NGLOB_CRUST_MANTLE,displ_crust_mantle,seismo_current,seismograms) - else - ! on GPU - call compute_seismograms_gpu(Mesh_pointer,seismograms,seismo_current,it,it_end, & - scale_displ,NTSTEP_BETWEEN_OUTPUT_SEISMOS,NSTEP) - - endif - case (2) - ! adjoint run - call compute_seismograms_adjoint(displ_crust_mantle, & - eps_trace_over_3_crust_mantle, & - epsilondev_xx_crust_mantle,epsilondev_yy_crust_mantle,epsilondev_xy_crust_mantle, & - epsilondev_xz_crust_mantle,epsilondev_yz_crust_mantle, & - it_adj_written, & - moment_der,sloc_der,stshift_der,shdur_der, & - seismograms) - case (3) - ! kernel run - if (.not. GPU_MODE) then - ! on CPU - if (.not. ( SIMULATION_TYPE == 3 .and. (.not. SAVE_SEISMOGRAMS_IN_ADJOINT_RUN)) ) then + ! computes traces at interpolated receiver locations + select case (SIMULATION_TYPE) + case (1) + ! forward run + if (.not. GPU_MODE) then + ! on CPU + call compute_seismograms(NGLOB_CRUST_MANTLE,displ_crust_mantle,seismo_current,seismograms) + else + ! on GPU + call compute_seismograms_gpu(Mesh_pointer,seismograms,seismo_current,it,it_end,scale_displ,nlength_seismogram) + endif + case (2) + ! adjoint run + call compute_seismograms_adjoint(displ_crust_mantle, & + eps_trace_over_3_crust_mantle, & + epsilondev_xx_crust_mantle,epsilondev_yy_crust_mantle,epsilondev_xy_crust_mantle, & + epsilondev_xz_crust_mantle,epsilondev_yz_crust_mantle, & + moment_der,sloc_der,stshift_der,shdur_der, & + seismograms) + case (3) + ! kernel run + if (.not. GPU_MODE) then + ! on CPU if (OUTPUT_ADJOINT_WAVEFIELD_SEISMOGRAMS) then ! uncomment to output adjoint wavefield instead for seismogram output call compute_seismograms(NGLOB_CRUST_MANTLE_ADJOINT,displ_crust_mantle,seismo_current,seismograms) @@ -114,32 +125,43 @@ subroutine write_seismograms() ! default, backward reconstructed wavefield seismos call compute_seismograms(NGLOB_CRUST_MANTLE_ADJOINT,b_displ_crust_mantle,seismo_current,seismograms) endif + else + ! on GPU + call compute_seismograms_gpu(Mesh_pointer,seismograms,seismo_current,it,it_end,scale_displ,nlength_seismogram) endif - else - ! on GPU - if (.not. ( SIMULATION_TYPE == 3 .and. (.not. SAVE_SEISMOGRAMS_IN_ADJOINT_RUN)) ) then - call compute_seismograms_gpu(Mesh_pointer,seismograms,seismo_current,it,it_end, & - scale_displ,NTSTEP_BETWEEN_OUTPUT_SEISMOS,NSTEP) - endif - endif - end select - - ! strain seismograms - if (SAVE_SEISMOGRAMS_STRAIN) then - select case (SIMULATION_TYPE) - case (1) - ! forward run - call compute_seismograms_strain(NGLOB_CRUST_MANTLE,displ_crust_mantle) - case (3) - ! kernel run - call compute_seismograms_strain(NGLOB_CRUST_MANTLE,b_displ_crust_mantle) end select - endif - endif ! nrec_local + ! strain seismograms + if (SAVE_SEISMOGRAMS_STRAIN) then + select case (SIMULATION_TYPE) + case (1) + ! forward run + call compute_seismograms_strain(NGLOB_CRUST_MANTLE,displ_crust_mantle) + case (3) + ! kernel run + call compute_seismograms_strain(NGLOB_CRUST_MANTLE,b_displ_crust_mantle) + end select + endif + + endif ! nrec_local + endif ! write the current or final seismograms - if (seismo_current == NTSTEP_BETWEEN_OUTPUT_SEISMOS .or. it == it_end) then + ! + ! for example: + ! NTSTEP_BETWEEN_OUTPUT_SEISMOS = 10 , NTSTEP_BETWEEN_OUTPUT_SAMPLE = 5 + ! -> 10 / 5 == 2 steps (nlength_seismogram) + ! we will store samples at it==1,it==6 and output at it==10 + ! NTSTEP_BETWEEN_OUTPUT_SEISMOS = 10 , NTSTEP_BETWEEN_OUTPUT_SAMPLE = 1 + ! -> 10 / 1 == 10 steps + ! we will store samples at it==1,2,3,..,10 and output at it==10 + ! + ! that is for NTSTEP_BETWEEN_OUTPUT_SEISMOS = 10 , NTSTEP_BETWEEN_OUTPUT_SAMPLE = 5, + ! the seismograms have samples for the wavefield at mod((it-1),NTSTEP_BETWEEN_OUTPUT_SAMPLE) == 0, + ! which is at it==1,it==6. for writing the seismograms however, + ! we would write out at mod(it,NTSTEP_BETWEEN_OUTPUT_SEISMOS) == 0, which is at it==10 + ! + if (mod(it,NTSTEP_BETWEEN_OUTPUT_SEISMOS) == 0 .or. it == it_end) then ! timing write_time_begin = wtime() @@ -158,7 +180,6 @@ subroutine write_seismograms() case (2) ! adjoint wavefield call write_adj_seismograms() - it_adj_written = it end select endif @@ -174,13 +195,11 @@ subroutine write_seismograms() ! timing write_time = wtime() - write_time_begin ! output - write(IMAIN,*) - write(IMAIN,*) 'Total number of time steps written: ', it-it_begin+1 - write(IMAIN,*) + write(IMAIN,*) 'Total number of time steps written: ', seismo_offset if (WRITE_SEISMOGRAMS_BY_MAIN) then - write(IMAIN,*) 'Writing the seismograms by main proc alone took ',write_time,' seconds' + write(IMAIN,*) 'Writing the seismograms by main proc alone took ',sngl(write_time),' seconds' else - write(IMAIN,*) 'Writing the seismograms in parallel took ',write_time,' seconds' + write(IMAIN,*) 'Writing the seismograms in parallel took ',sngl(write_time),' seconds' endif write(IMAIN,*) call flush_IMAIN() @@ -199,20 +218,21 @@ subroutine write_seismograms_to_file() use constants_solver, only: MAX_STRING_LEN,CUSTOM_REAL,NDIM,IMAIN,IOUT,itag use specfem_par, only: & - NPROCTOT_VAL,myrank,nrec,nrec_local, & - number_receiver_global,seismograms, & - islice_num_rec_local, & - seismo_offset,seismo_current, & - station_name,network_name, & - OUTPUT_SEISMOS_ASCII_TEXT, & - OUTPUT_SEISMOS_SAC_ALPHANUM,OUTPUT_SEISMOS_SAC_BINARY, & - OUTPUT_SEISMOS_ASDF, & - OUTPUT_SEISMOS_3D_ARRAY, & - NTSTEP_BETWEEN_OUTPUT_SEISMOS, & - SAVE_ALL_SEISMOS_IN_ONE_FILE,USE_BINARY_FOR_LARGE_FILE, & - OUTPUT_FILES, & - WRITE_SEISMOGRAMS_BY_MAIN, & - DT,NSTEP,NTSTEP_BETWEEN_OUTPUT_SAMPLE + NPROCTOT_VAL,myrank,nrec,nrec_local, & + number_receiver_global, & + seismograms, & + nlength_seismogram, & + seismo_offset,seismo_current, & + islice_num_rec_local, & + station_name,network_name, & + OUTPUT_SEISMOS_ASCII_TEXT, & + OUTPUT_SEISMOS_SAC_ALPHANUM,OUTPUT_SEISMOS_SAC_BINARY, & + OUTPUT_SEISMOS_ASDF, & + OUTPUT_SEISMOS_3D_ARRAY, & + SAVE_ALL_SEISMOS_IN_ONE_FILE,USE_BINARY_FOR_LARGE_FILE, & + OUTPUT_FILES, & + WRITE_SEISMOGRAMS_BY_MAIN, & + DT,NSTEP,NTSTEP_BETWEEN_OUTPUT_SAMPLE implicit none @@ -225,7 +245,7 @@ subroutine write_seismograms_to_file() character(len=MAX_STRING_LEN) :: sisname,staname ! allocates single station seismogram - allocate(one_seismogram(NDIM,NTSTEP_BETWEEN_OUTPUT_SEISMOS),stat=ier) + allocate(one_seismogram(NDIM,nlength_seismogram),stat=ier) if (ier /= 0) call exit_mpi(myrank,'Error while allocating one temporary seismogram') one_seismogram(:,:) = 0.0_CUSTOM_REAL @@ -237,7 +257,7 @@ subroutine write_seismograms_to_file() ! get global number of that receiver irec = number_receiver_global(irec_local) - one_seismogram(:,:) = seismograms(:,irec_local,::NTSTEP_BETWEEN_OUTPUT_SAMPLE) + one_seismogram(:,:) = seismograms(:,irec_local,:) ! write this seismogram ! note: ASDF data structure is given in module @@ -342,7 +362,7 @@ subroutine write_seismograms_to_file() ! get global number of that receiver irec = number_receiver_global(irec_local) - one_seismogram(:,:) = seismograms(:,irec_local,::NTSTEP_BETWEEN_OUTPUT_SAMPLE) + one_seismogram(:,:) = seismograms(:,irec_local,:) ! write this seismogram ! note: ASDF data structure is given in module @@ -391,40 +411,44 @@ subroutine write_seismograms_to_file() ! loop on all the slices do iproc = 0,NPROCTOT_VAL-1 - ! communicates only with processes that contain local receivers (to minimize MPI chatter) - if (islice_num_rec_local(iproc) == 0 ) cycle - - ! receive except from proc 0, which is me and therefore I already have this value - sender = iproc - if (iproc == 0) then - ! main is current slice - nrec_local_received = nrec_local - else - ! receives info from secondary processes - call recv_singlei(nrec_local_received,sender,itag) - if (nrec_local_received <= 0) call exit_MPI(myrank,'Error while receiving local number of receivers') - endif - if (nrec_local_received > 0) then - do irec_local = 1,nrec_local_received - ! receive except from proc 0, which is myself and therefore I already have these values - if (iproc == 0) then - ! get global number of that receiver - irec = number_receiver_global(irec_local) - one_seismogram(:,:) = seismograms(:,irec_local,::NTSTEP_BETWEEN_OUTPUT_SAMPLE) - else - ! receives info from secondary processes - call recv_singlei(irec,sender,itag) - if (irec < 1 .or. irec > nrec) call exit_MPI(myrank,'Error while receiving global receiver number') - call recv_cr(one_seismogram,NDIM*seismo_current,sender,itag) - endif - - ! write this seismogram - call write_one_seismogram(one_seismogram,irec,irec_local,.false.) - - ! counts seismos written - total_seismos = total_seismos + 1 - enddo - endif + ! communicates only with processes that contain local receivers (to minimize MPI chatter) + if (islice_num_rec_local(iproc) == 0 ) cycle + + ! receive except from proc 0, which is me and therefore I already have this value + sender = iproc + if (iproc == 0) then + ! main is current slice + nrec_local_received = nrec_local + else + ! receives info from secondary processes + call recv_singlei(nrec_local_received,sender,itag) + if (nrec_local_received <= 0) call exit_MPI(myrank,'Error while receiving local number of receivers') + endif + + if (nrec_local_received > 0) then + do irec_local = 1,nrec_local_received + ! init trace + one_seismogram(:,:) = 0._CUSTOM_REAL + + ! receive except from proc 0, which is myself and therefore I already have these values + if (iproc == 0) then + ! get global number of that receiver + irec = number_receiver_global(irec_local) + one_seismogram(:,:) = seismograms(:,irec_local,:) + else + ! receives info from secondary processes + call recv_singlei(irec,sender,itag) + if (irec < 1 .or. irec > nrec) call exit_MPI(myrank,'Error while receiving global receiver number') + call recv_cr(one_seismogram,NDIM*seismo_current,sender,itag) + endif + + ! write this seismogram + call write_one_seismogram(one_seismogram,irec,irec_local,.false.) + + ! counts seismos written + total_seismos = total_seismos + 1 + enddo + endif enddo else @@ -438,7 +462,7 @@ subroutine write_seismograms_to_file() irec = number_receiver_global(irec_local) call send_singlei(irec,receiver,itag) - one_seismogram(:,:) = seismograms(:,irec_local,::NTSTEP_BETWEEN_OUTPUT_SAMPLE) + one_seismogram(:,:) = seismograms(:,irec_local,:) call send_cr(one_seismogram,NDIM*seismo_current,receiver,itag) enddo endif @@ -447,10 +471,10 @@ subroutine write_seismograms_to_file() ! only main process if (myrank == 0) then ! output info - write(IMAIN,*) write(IMAIN,*) 'Total number of receivers saved is ',total_seismos,' out of ',nrec - write(IMAIN,*) + call flush_IMAIN() + ! checks if (total_seismos /= nrec) call exit_MPI(myrank,'incorrect total number of receivers saved') ! create one large file instead of one small file per station to avoid file system overload @@ -475,25 +499,25 @@ subroutine write_one_seismogram(one_seismogram,irec,irec_local,is_for_asdf) MAX_LENGTH_STATION_NAME,MAX_LENGTH_NETWORK_NAME use specfem_par, only: & - myrank, & - station_name,network_name,stlat,stlon, & - DT, & - seismo_current, & - OUTPUT_SEISMOS_ASCII_TEXT,OUTPUT_SEISMOS_SAC_ALPHANUM,OUTPUT_SEISMOS_ASDF, & - OUTPUT_SEISMOS_SAC_BINARY,ROTATE_SEISMOGRAMS_RT,NTSTEP_BETWEEN_OUTPUT_SEISMOS, & - NTSTEP_BETWEEN_OUTPUT_SAMPLE + myrank, & + station_name,network_name,stlat,stlon, & + DT, & + seismo_current, & + OUTPUT_SEISMOS_ASCII_TEXT,OUTPUT_SEISMOS_SAC_ALPHANUM,OUTPUT_SEISMOS_ASDF, & + OUTPUT_SEISMOS_SAC_BINARY,ROTATE_SEISMOGRAMS_RT, & + nlength_seismogram use specfem_par, only: & - cmt_lat => cmt_lat_SAC,cmt_lon => cmt_lon_SAC + cmt_lat => cmt_lat_SAC,cmt_lon => cmt_lon_SAC implicit none integer,intent(in) :: irec,irec_local logical,intent(in) :: is_for_asdf - real(kind=CUSTOM_REAL), dimension(NDIM,NTSTEP_BETWEEN_OUTPUT_SEISMOS),intent(in) :: one_seismogram + real(kind=CUSTOM_REAL), dimension(NDIM,nlength_seismogram),intent(in) :: one_seismogram ! local parameters - real(kind=CUSTOM_REAL), dimension(5,NTSTEP_BETWEEN_OUTPUT_SEISMOS) :: seismogram_tmp + real(kind=CUSTOM_REAL), dimension(5,nlength_seismogram) :: seismogram_tmp integer :: iorientation,length_station_name,length_network_name character(len=4) :: chn @@ -507,11 +531,9 @@ subroutine write_one_seismogram(one_seismogram,irec,irec_local,is_for_asdf) double precision :: phi real(kind=CUSTOM_REAL) :: cphi,sphi integer :: isample - integer :: seismo_current_used ! initializes seismogram_tmp(:,:) = 0.0_CUSTOM_REAL - seismo_current_used = ceiling(real(seismo_current) / NTSTEP_BETWEEN_OUTPUT_SAMPLE) ! get band code call band_instrument_code(DT,bic) @@ -524,7 +546,7 @@ subroutine write_one_seismogram(one_seismogram,irec,irec_local,is_for_asdf) ior_end = 3 ! ending with Z => NEZ endif - do iorientation = ior_start,ior_end ! BS BS changed according to ROTATE_SEISMOGRAMS_RT + do iorientation = ior_start,ior_end ! changes according to ROTATE_SEISMOGRAMS_RT select case (iorientation) case (1) @@ -546,9 +568,8 @@ subroutine write_one_seismogram(one_seismogram,irec,irec_local,is_for_asdf) call exit_MPI(myrank,'incorrect channel value') end select - if (iorientation == 4 .or. iorientation == 5) then ! LMU BS BS - - ! BS BS calculate backazimuth needed to rotate East and North + if (iorientation == 4 .or. iorientation == 5) then + ! calculate backazimuth needed to rotate East and North ! components to Radial and Transverse components ! (back-azimuth returned in degrees between [0,360]) call get_backazimuth(cmt_lat,cmt_lon,stlat(irec),stlon(irec),backaz) @@ -571,25 +592,24 @@ subroutine write_one_seismogram(one_seismogram,irec,irec_local,is_for_asdf) cphi = cos(phi*DEGREES_TO_RADIANS) sphi = sin(phi*DEGREES_TO_RADIANS) - ! BS BS do the rotation of the components and put result in + ! do the rotation of the components and put result in ! new variable seismogram_tmp if (iorientation == 4) then ! radial component - do isample = 1,seismo_current_used - seismogram_tmp(iorientation,isample) = & + do isample = 1,seismo_current + seismogram_tmp(iorientation,isample) = & cphi * one_seismogram(1,isample) + sphi * one_seismogram(2,isample) - enddo + enddo else if (iorientation == 5) then ! transverse component - do isample = 1,seismo_current_used - seismogram_tmp(iorientation,isample) = & + do isample = 1,seismo_current + seismogram_tmp(iorientation,isample) = & -1*sphi * one_seismogram(1,isample) + cphi * one_seismogram(2,isample) - enddo + enddo endif - - else ! keep NEZ components - do isample = 1,seismo_current_used + else + ! keep NEZ components + do isample = 1,seismo_current seismogram_tmp(iorientation,isample) = one_seismogram(iorientation,isample) enddo - endif ! create the name of the seismogram file for each slice @@ -644,17 +664,17 @@ subroutine write_adj_seismograms() use constants, only: MAX_STRING_LEN,CUSTOM_REAL,IOUT - use specfem_par, only: NSTEP,NTSTEP_BETWEEN_OUTPUT_SEISMOS,NTSTEP_BETWEEN_OUTPUT_SAMPLE, & + use specfem_par, only: NTSTEP_BETWEEN_OUTPUT_SEISMOS,NTSTEP_BETWEEN_OUTPUT_SAMPLE, & DT,t0,OUTPUT_FILES, & seismograms,number_receiver_global,nrec_local, & - it,it_adj_written, & + it,seismo_current,seismo_offset, & myrank,WRITE_SEISMOGRAMS_BY_MAIN implicit none ! local parameters integer :: irec,irec_local,ier - integer :: iorientation,isample + integer :: iorientation,isample,it_tmp real(kind=CUSTOM_REAL) :: time_t character(len=4) :: chn @@ -719,13 +739,17 @@ subroutine write_adj_seismograms() if (ier /= 0) call exit_mpi(myrank,'Error opening file: '//trim(OUTPUT_FILES)//trim(sisname)) ! make sure we never write more than the maximum number of time steps - do isample = it_adj_written+1,min(it,NSTEP) - ! time + do isample = 1,seismo_current + ! current time + ! current time increment + it_tmp = seismo_offset + isample + ! subtract onset time to make sure travel time is correct ! distinguish between single and double precision for reals - time_t = real(dble(isample-1)*DT*NTSTEP_BETWEEN_OUTPUT_SAMPLE - t0,kind=CUSTOM_REAL) + time_t = real(dble((it_tmp-1) * NTSTEP_BETWEEN_OUTPUT_SAMPLE) * DT - t0,kind=CUSTOM_REAL) + ! output - write(IOUT,*) time_t,seismograms(iorientation,irec_local,isample-it_adj_written) + write(IOUT,*) time_t,seismograms(iorientation,irec_local,isample) enddo close(IOUT)