diff --git a/.travis.yml b/.travis.yml
index 8623d5a68..878ef72b6 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -209,7 +209,7 @@ after_success:
   - |
     if [ "$TESTCOV" == "1" ]; then
       gcov --version
-      echo `pwd`
+      echo
       ls -al obj/
     fi
 
diff --git a/.travis/run_install.sh b/.travis/run_install.sh
index 0dc2547da..ca59cd5ad 100644
--- a/.travis/run_install.sh
+++ b/.travis/run_install.sh
@@ -185,6 +185,7 @@ echo "export OMPI_MCA_btl_vader_single_copy_mechanism=none" >> $HOME/.tmprc
 echo "export OMPI_MCA_btl=^openib" >> $HOME/.tmprc
 
 echo ""
-echo "exports:"
-export
+# commented this out due to travis reporting security issues in these outputs
+#echo "exports:"
+#export
 echo ""
diff --git a/.travis/run_tests.sh b/.travis/run_tests.sh
index 8984ed40b..9f579072c 100644
--- a/.travis/run_tests.sh
+++ b/.travis/run_tests.sh
@@ -37,7 +37,9 @@ esac
 
 # info
 #echo $TRAVIS_BUILD_DIR
-echo $WORKDIR
+# commented this out due to travis reporting security issues in these outputs
+#echo $WORKDIR
+
 echo `date`
 echo
 echo "**********************************************************"
@@ -137,7 +139,7 @@ else
 
   # simulation done
   echo
-  echo "simulation done: `pwd`"
+  echo "simulation done: $dir"
   echo `date`
   echo
 
@@ -152,7 +154,7 @@ if [[ $? -ne 0 ]]; then exit 1; fi
 
 # simulation done
 echo
-echo "test done: `pwd`"
+echo "test done: $dir"
 echo `date`
 echo
 
diff --git a/Makefile.in b/Makefile.in
index a8432819b..8eec80b8c 100644
--- a/Makefile.in
+++ b/Makefile.in
@@ -234,11 +234,11 @@ GENCODE_AMD_MI250 = --amdgpu-target=gfx90a
 @COND_HIP_TRUE@@COND_HIP_PLATFORM_NVIDIA_TRUE@HIP_CFLAG_ENDING = -x cu    # CUDA compilation or src/gpu/*.c files
 
 # specific targets
-@COND_HIP_TRUE@@COND_HIP_MI8_TRUE@GENCODE_HIP = $(GENCODE_AMD_MI8)      # --with-hip=MI8 ..
-@COND_HIP_TRUE@@COND_HIP_MI25_TRUE@GENCODE_HIP = $(GENCODE_AMD_MI25)    # --with-hip=MI25 ..
-@COND_HIP_TRUE@@COND_HIP_MI50_TRUE@GENCODE_HIP = $(GENCODE_AMD_MI50)    # --with-hip=MI50 ..
-@COND_HIP_TRUE@@COND_HIP_MI100_TRUE@GENCODE_HIP = $(GENCODE_AMD_MI100)  # --with-hip=MI100 ..
-@COND_HIP_TRUE@@COND_HIP_MI250_TRUE@GENCODE_HIP = $(GENCODE_AMD_MI250)  # --with-hip=MI250 ..
+@COND_HIP_TRUE@@COND_HIP_MI8_TRUE@GENCODE_HIP = $(GENCODE_AMD_MI8) $(FC_DEFINE)GPU_DEVICE_MI8         # --with-hip=MI8 ..
+@COND_HIP_TRUE@@COND_HIP_MI25_TRUE@GENCODE_HIP = $(GENCODE_AMD_MI25) $(FC_DEFINE)GPU_DEVICE_MI25      # --with-hip=MI25 ..
+@COND_HIP_TRUE@@COND_HIP_MI50_TRUE@GENCODE_HIP = $(GENCODE_AMD_MI50) $(FC_DEFINE)GPU_DEVICE_MI50      # --with-hip=MI50 ..
+@COND_HIP_TRUE@@COND_HIP_MI100_TRUE@GENCODE_HIP = $(GENCODE_AMD_MI100) $(FC_DEFINE)GPU_DEVICE_MI100   # --with-hip=MI100 ..
+@COND_HIP_TRUE@@COND_HIP_MI250_TRUE@GENCODE_HIP = $(GENCODE_AMD_MI250) $(FC_DEFINE)GPU_DEVICE_MI250   # --with-hip=MI250 ..
 
 @COND_HIP_TRUE@@COND_HIP_CUDA5_TRUE@GENCODE_HIP = $(GENCODE_35)         # --with-hip=cuda5 ..
 @COND_HIP_TRUE@@COND_HIP_CUDA6_TRUE@GENCODE_HIP = $(GENCODE_37)         # --with-hip=cuda6 ..
diff --git a/src/gpu/boast/compute_add_sources_kernel.rb b/src/gpu/boast/compute_add_sources_kernel.rb
index 9eb82ece9..150d4bcc3 100644
--- a/src/gpu/boast/compute_add_sources_kernel.rb
+++ b/src/gpu/boast/compute_add_sources_kernel.rb
@@ -3,18 +3,20 @@ def BOAST::compute_add_sources_kernel( ref = true, n_dim = 3, n_gllx = 5 )
     push_env( :array_start => 0 )
     kernel = CKernel::new
     function_name = "compute_add_sources_kernel"
-    accel =                  Real("accel",                            :dir => :inout,:dim => [ Dim() ] )
-    ibool =                  Int("ibool",                             :dir => :in,   :dim => [ Dim() ] )
-    sourcearrays =           Real("sourcearrays",                     :dir => :in,   :dim => [ Dim() ] )
-    stf_pre_compute =        Real("stf_pre_compute",      :size => 8, :dir => :in,   :dim => [ Dim() ] )
-    myrank =                 Int("myrank",                            :dir => :in)
-    islice_selected_source = Int("islice_selected_source",            :dir => :in,   :dim => [ Dim() ] )
-    ispec_selected_source =  Int("ispec_selected_source",             :dir => :in,   :dim => [ Dim() ] )
-    nsources =               Int("NSOURCES",                          :dir => :in)
+    accel =                       Real("accel",                            :dir => :inout,:dim => [ Dim() ] )
+    ibool =                       Int("ibool",                             :dir => :in,   :dim => [ Dim() ] )
+    sourcearrays_local =          Real("sourcearrays_local",               :dir => :in,   :dim => [ Dim() ] )
+    stf_local =                   Real("stf_local",                        :dir => :in,   :dim => [ Dim() ] )
+    ispec_selected_source_local = Int("ispec_selected_source_local",       :dir => :in,   :dim => [ Dim() ] )
+    nsources_local =              Int("nsources_local",                    :dir => :in)
+    nstep =                       Int("NSTEP",                             :dir => :in)
+    it =                          Int("it",                                :dir => :in)
+    istage =                      Int("istage",                            :dir => :in)
 
     ndim =                   Int("NDIM",                  :const => n_dim)
     ngllx =                  Int("NGLLX",                 :const => n_gllx)
-    p = Procedure(function_name, [accel,ibool,sourcearrays,stf_pre_compute,myrank,islice_selected_source,ispec_selected_source,nsources])
+
+    p = Procedure(function_name, [accel,ibool,sourcearrays_local,stf_local,ispec_selected_source_local,nsources_local,nstep,it,istage])
     if (get_lang == CUDA and ref) then
       get_output.print File::read("references/#{function_name}.cu")
     elsif(get_lang == CUDA or get_lang == CL or get_lang == HIP) then
@@ -39,14 +41,12 @@ def BOAST::compute_add_sources_kernel( ref = true, n_dim = 3, n_gllx = 5 )
       print k === get_local_id(2)
       print isource === get_group_id(0) + get_num_groups(0)*get_group_id(1)
 
-      print If(isource < nsources) {
-        print If(myrank == islice_selected_source[isource]) {
-          print ispec === ispec_selected_source[isource] - 1
-          print stf === stf_pre_compute[isource]
-          print iglob === ibool[INDEX4(ngllx,ngllx,ngllx,i,j,k,ispec)] - 1
-          (0..2).each { |indx|
-            print atomicAdd(accel+iglob*3+indx, sourcearrays[INDEX5(ndim,ngllx,ngllx,ngllx,indx,i,j,k,isource)]*stf)
-          }
+      print If(isource < nsources_local) {
+        print ispec === ispec_selected_source_local[isource] - 1
+        print stf === stf_local[INDEX3(nsources_local,nstep,isource,it,istage)]
+        print iglob === ibool[INDEX4(ngllx,ngllx,ngllx,i,j,k,ispec)] - 1
+        (0..2).each { |indx|
+          print atomicAdd(accel+iglob*3+indx, sourcearrays_local[INDEX5(ndim,ngllx,ngllx,ngllx,indx,i,j,k,isource)]*stf)
         }
       }
       close p
@@ -57,4 +57,66 @@ def BOAST::compute_add_sources_kernel( ref = true, n_dim = 3, n_gllx = 5 )
     kernel.procedure = p
     return kernel
   end
+
+# obsolete kernel, left here for reference...
+#
+#  def BOAST::compute_add_sources_kernel( ref = true, n_dim = 3, n_gllx = 5 )
+#    push_env( :array_start => 0 )
+#    kernel = CKernel::new
+#    function_name = "compute_add_sources_kernel"
+#    accel =                  Real("accel",                            :dir => :inout,:dim => [ Dim() ] )
+#    ibool =                  Int("ibool",                             :dir => :in,   :dim => [ Dim() ] )
+#    sourcearrays =           Real("sourcearrays",                     :dir => :in,   :dim => [ Dim() ] )
+#    stf_pre_compute =        Real("stf_pre_compute",      :size => 8, :dir => :in,   :dim => [ Dim() ] )
+#    myrank =                 Int("myrank",                            :dir => :in)
+#    islice_selected_source = Int("islice_selected_source",            :dir => :in,   :dim => [ Dim() ] )
+#    ispec_selected_source =  Int("ispec_selected_source",             :dir => :in,   :dim => [ Dim() ] )
+#    nsources =               Int("NSOURCES",                          :dir => :in)
+#
+#    ndim =                   Int("NDIM",                  :const => n_dim)
+#    ngllx =                  Int("NGLLX",                 :const => n_gllx)
+#    p = Procedure(function_name, [accel,ibool,sourcearrays,stf_pre_compute,myrank,islice_selected_source,ispec_selected_source,nsources])
+#    if (get_lang == CUDA and ref) then
+#      get_output.print File::read("references/#{function_name}.cu")
+#    elsif(get_lang == CUDA or get_lang == CL or get_lang == HIP) then
+#      make_specfem3d_header( :ndim => n_dim, :ngllx => n_gllx, :double => true )
+#      open p
+#      ispec =   Int( "ispec")
+#      iglob =   Int( "iglob")
+#      stf =     Real("stf")
+#      isource = Int( "isource")
+#      i =       Int( "i")
+#      j =       Int( "j")
+#      k =       Int( "k")
+#      decl ispec
+#      decl iglob
+#      decl stf
+#      decl isource
+#      decl i
+#      decl j
+#      decl k
+#      print i === get_local_id(0)
+#      print j === get_local_id(1)
+#      print k === get_local_id(2)
+#      print isource === get_group_id(0) + get_num_groups(0)*get_group_id(1)
+#
+#      print If(isource < nsources) {
+#        print If(myrank == islice_selected_source[isource]) {
+#          print ispec === ispec_selected_source[isource] - 1
+#          print stf === stf_pre_compute[isource]
+#          print iglob === ibool[INDEX4(ngllx,ngllx,ngllx,i,j,k,ispec)] - 1
+#          (0..2).each { |indx|
+#            print atomicAdd(accel+iglob*3+indx, sourcearrays[INDEX5(ndim,ngllx,ngllx,ngllx,indx,i,j,k,isource)]*stf)
+#          }
+#        }
+#      }
+#      close p
+#    else
+#      raise "Unsupported language!"
+#    end
+#    pop_env(:array_start)
+#    kernel.procedure = p
+#    return kernel
+#  end
+
 end
diff --git a/src/gpu/boast/compute_seismograms_kernel.rb b/src/gpu/boast/compute_seismograms_kernel.rb
index 1259dff3b..a03463d66 100644
--- a/src/gpu/boast/compute_seismograms_kernel.rb
+++ b/src/gpu/boast/compute_seismograms_kernel.rb
@@ -1,5 +1,5 @@
 module BOAST
-  def BOAST::compute_seismograms_kernel(n_gllx = 5, n_gll2 = 25, n_gll3 = 125, n_gll3_padded = 128)
+  def BOAST::compute_seismograms_kernel(ref = false, n_dim = 3, n_gllx = 5, n_gll2 = 25, n_gll3 = 125, n_gll3_padded = 128)
     push_env( :array_start => 0 )
     kernel = CKernel::new
     v = []
@@ -15,7 +15,9 @@ def BOAST::compute_seismograms_kernel(n_gllx = 5, n_gll2 = 25, n_gll3 = 125, n_g
     v.push ispec_selected_rec     = Int("ispec_selected_rec",     :dir => :in,   :dim => [ Dim() ])
     v.push number_receiver_global = Int("number_receiver_global", :dir => :in,   :dim => [ Dim() ])
     v.push scale_displ            = Real("scale_displ",           :dir => :in)
+    v.push seismo_current         = Int("seismo_current",         :dir => :in)
 
+    ndim         = Int("NDIM",         :const => n_dim)
     ngllx        = Int("NGLLX",        :const => n_gllx)
     ngll2        = Int("NGLL2",        :const => n_gll2)
     ngll3        = Int("NGLL3",        :const => n_gll3)
@@ -23,7 +25,7 @@ def BOAST::compute_seismograms_kernel(n_gllx = 5, n_gll2 = 25, n_gll3 = 125, n_g
 
     p = Procedure(function_name, v )
     if(get_lang == CL or get_lang == CUDA or get_lang == HIP) then
-      make_specfem3d_header( :ngllx => n_gllx, :ngll2 => n_gll2, :ngll3 => n_gll3, :ngll3_padded => n_gll3_padded )
+      make_specfem3d_header( :ndim => n_dim, :ngllx => n_gllx, :ngll2 => n_gll2, :ngll3 => n_gll3, :ngll3_padded => n_gll3_padded )
       open p
       ispec =      Int("ispec")
       iglob =      Int("iglob")
@@ -36,6 +38,7 @@ def BOAST::compute_seismograms_kernel(n_gllx = 5, n_gll2 = 25, n_gll3 = 125, n_g
       k =          Int("k")
       l =          Int("l")
       s =          Int("s")
+      idx =        Int("idx")
       decl ispec
       decl iglob
       decl irec_local
@@ -47,6 +50,7 @@ def BOAST::compute_seismograms_kernel(n_gllx = 5, n_gll2 = 25, n_gll3 = 125, n_g
       decl k
       decl l
       decl s
+      decl idx
 
       decl sh_dxd = Real("sh_dxd",     :local => true, :dim => [Dim(ngll3_padded)] )
       decl sh_dyd = Real("sh_dyd",     :local => true, :dim => [Dim(ngll3_padded)] )
@@ -96,18 +100,21 @@ def BOAST::compute_seismograms_kernel(n_gllx = 5, n_gll2 = 25, n_gll3 = 125, n_g
         }
         comment()
 
+        print idx === INDEX3(ndim,nrec_local,0,irec_local,seismo_current)
+        comment()
+
         print If (tx == 0) {
-          print seismograms[irec_local*3 + 0] === scale_displ * (
+          print seismograms[idx + 0] === scale_displ * (
             nu[(irec_local*3)*3 + 0]*sh_dxd[0] + nu[(irec_local*3 + 1)*3 + 0]*sh_dyd[0] + nu[(irec_local*3 + 2)*3 + 0]*sh_dzd[0]
             )
        }
         print If (tx == 1) {
-          print seismograms[irec_local*3 + 1] === scale_displ * (
+          print seismograms[idx + 1] === scale_displ * (
             nu[(irec_local*3)*3 + 1]*sh_dxd[0] + nu[(irec_local*3 + 1)*3 + 1]*sh_dyd[0] + nu[(irec_local*3 + 2)*3 + 1]*sh_dzd[0]
             )
        }
         print If (tx == 2) {
-          print seismograms[irec_local*3 + 2] === scale_displ * (
+          print seismograms[idx + 2] === scale_displ * (
             nu[(irec_local*3)*3 + 2]*sh_dxd[0] + nu[(irec_local*3 + 1)*3 + 2]*sh_dyd[0] + nu[(irec_local*3 + 2)*3 + 2]*sh_dzd[0]
             )
        }
diff --git a/src/gpu/boast/inner_core_impl_kernel_forward.rb b/src/gpu/boast/inner_core_impl_kernel_forward.rb
index 8621a6223..4e349b7cc 100644
--- a/src/gpu/boast/inner_core_impl_kernel_forward.rb
+++ b/src/gpu/boast/inner_core_impl_kernel_forward.rb
@@ -1032,7 +1032,7 @@ def BOAST::impl_kernel(type, forward, ref = true, elem_per_thread = 1, mesh_colo
         comment()
 
         # compilation pragma info
-        if (type == :crust_mantle and get_lang == CUDA and forward) then
+        if (type == :crust_mantle and (get_lang == CUDA or get_lang == HIP) and forward) then
           get_output.puts "#ifdef #{manually_unrolled_loops}"
           get_output.puts "#pragma message (\"\\n\\nCompiling with: #{manually_unrolled_loops} enabled\\n\")"
           get_output.puts "#endif  // #{manually_unrolled_loops}"
diff --git a/src/gpu/compute_add_sources_elastic_gpu.c b/src/gpu/compute_add_sources_elastic_gpu.c
index 51a7795a0..3f8ba0f1a 100644
--- a/src/gpu/compute_add_sources_elastic_gpu.c
+++ b/src/gpu/compute_add_sources_elastic_gpu.c
@@ -32,8 +32,8 @@
 extern EXTERN_LANG
 void FC_FUNC_ (compute_add_sources_gpu,
                COMPUTE_ADD_SOURCES_GPU) (long *Mesh_pointer_f,
-                                         int *NSOURCESf,
-                                         double *h_stf_pre_compute) {
+                                         int *it_f,
+                                         int *istage_f) {
 
   TRACE ("compute_add_sources_gpu");
 
@@ -43,13 +43,22 @@ void FC_FUNC_ (compute_add_sources_gpu,
   // checks if anything to do
   if (mp->nsources_local == 0) return;
 
-  int NSOURCES = *NSOURCESf;
+  int it = *it_f - 1;   // -1 for Fortran -> C indexing differences
+  int istage = *istage_f - 1;
 
   int num_blocks_x, num_blocks_y;
-  get_blocks_xy (NSOURCES, &num_blocks_x, &num_blocks_y);
-
+  get_blocks_xy (mp->nsources_local, &num_blocks_x, &num_blocks_y);
+
+  // note: we try to avoid these copies `gpuCopy_todevice**` as they synchronize all streams and kernel executions.
+  //       this compute_add_sources_kernel is executing faster for a small number of sources than the next kernel launches,
+  //       thus we can not overlap and compute in an asynchronous way, but need to wait idle for kernel triggering.
+  //
+  //       to avoid this, we now allocate all local source time function arrays on the GPU in the preparation routine.
+  //       this will need more GPU memory and might need to be re-evaluated if the number of sources and time steps become very large.
+  //
   // copies source time function buffer values to GPU
-  gpuCopy_todevice_double (&mp->d_stf_pre_compute, h_stf_pre_compute, NSOURCES);
+  //gpuCopy_todevice_double (&mp->d_stf_pre_compute, h_stf_pre_compute, NSOURCES);
+
 
 #ifdef USE_OPENCL
   if (run_opencl) {
@@ -60,12 +69,13 @@ void FC_FUNC_ (compute_add_sources_gpu,
     // adds source contributions
     clCheck (clSetKernelArg (mocl.kernels.compute_add_sources_kernel, idx++, sizeof (cl_mem), (void *) &mp->d_accel_crust_mantle.ocl));
     clCheck (clSetKernelArg (mocl.kernels.compute_add_sources_kernel, idx++, sizeof (cl_mem), (void *) &mp->d_ibool_crust_mantle.ocl));
-    clCheck (clSetKernelArg (mocl.kernels.compute_add_sources_kernel, idx++, sizeof (cl_mem), (void *) &mp->d_sourcearrays.ocl));
-    clCheck (clSetKernelArg (mocl.kernels.compute_add_sources_kernel, idx++, sizeof (cl_mem), (void *) &mp->d_stf_pre_compute.ocl));
-    clCheck (clSetKernelArg (mocl.kernels.compute_add_sources_kernel, idx++, sizeof (int), (void *) &mp->myrank));
-    clCheck (clSetKernelArg (mocl.kernels.compute_add_sources_kernel, idx++, sizeof (cl_mem), (void *) &mp->d_islice_selected_source.ocl));
-    clCheck (clSetKernelArg (mocl.kernels.compute_add_sources_kernel, idx++, sizeof (cl_mem), (void *) &mp->d_ispec_selected_source.ocl));
-    clCheck (clSetKernelArg (mocl.kernels.compute_add_sources_kernel, idx++, sizeof (int), (void *) &NSOURCES));
+    clCheck (clSetKernelArg (mocl.kernels.compute_add_sources_kernel, idx++, sizeof (cl_mem), (void *) &mp->d_sourcearrays_local.ocl));
+    clCheck (clSetKernelArg (mocl.kernels.compute_add_sources_kernel, idx++, sizeof (cl_mem), (void *) &mp->d_stf_local.ocl));
+    clCheck (clSetKernelArg (mocl.kernels.compute_add_sources_kernel, idx++, sizeof (cl_mem), (void *) &mp->d_ispec_selected_source_local.ocl));
+    clCheck (clSetKernelArg (mocl.kernels.compute_add_sources_kernel, idx++, sizeof (int), (void *) &mp->nsources_local));
+    clCheck (clSetKernelArg (mocl.kernels.compute_add_sources_kernel, idx++, sizeof (int), (void *) &mp->NSTEP));
+    clCheck (clSetKernelArg (mocl.kernels.compute_add_sources_kernel, idx++, sizeof (int), (void *) &it));
+    clCheck (clSetKernelArg (mocl.kernels.compute_add_sources_kernel, idx++, sizeof (int), (void *) &istage));
 
     local_work_size[0] = NGLLX;
     local_work_size[1] = NGLLX;
@@ -85,12 +95,13 @@ void FC_FUNC_ (compute_add_sources_gpu,
     // adds source contributions
     compute_add_sources_kernel<<<grid,threads,0,mp->compute_stream>>>(mp->d_accel_crust_mantle.cuda,
                                                                       mp->d_ibool_crust_mantle.cuda,
-                                                                      mp->d_sourcearrays.cuda,
-                                                                      mp->d_stf_pre_compute.cuda,
-                                                                      mp->myrank,
-                                                                      mp->d_islice_selected_source.cuda,
-                                                                      mp->d_ispec_selected_source.cuda,
-                                                                      NSOURCES);
+                                                                      mp->d_sourcearrays_local.cuda,
+                                                                      mp->d_stf_local.cuda,
+                                                                      mp->d_ispec_selected_source_local.cuda,
+                                                                      mp->nsources_local,
+                                                                      mp->NSTEP,
+                                                                      it,
+                                                                      istage);
   }
 #endif
 #ifdef USE_HIP
@@ -101,12 +112,13 @@ void FC_FUNC_ (compute_add_sources_gpu,
     hipLaunchKernelGGL(HIP_KERNEL_NAME(compute_add_sources_kernel), grid, threads, 0, mp->compute_stream,
                                                                     mp->d_accel_crust_mantle.hip,
                                                                     mp->d_ibool_crust_mantle.hip,
-                                                                    mp->d_sourcearrays.hip,
-                                                                    mp->d_stf_pre_compute.hip,
-                                                                    mp->myrank,
-                                                                    mp->d_islice_selected_source.hip,
-                                                                    mp->d_ispec_selected_source.hip,
-                                                                    NSOURCES);
+                                                                    mp->d_sourcearrays_local.hip,
+                                                                    mp->d_stf_local.hip,
+                                                                    mp->d_ispec_selected_source_local.hip,
+                                                                    mp->nsources_local,
+                                                                    mp->NSTEP,
+                                                                    it,
+                                                                    istage);
   }
 #endif
 
@@ -120,8 +132,8 @@ void FC_FUNC_ (compute_add_sources_gpu,
 extern EXTERN_LANG
 void FC_FUNC_ (compute_add_sources_backward_gpu,
                COMPUTE_ADD_SOURCES_BACKWARD_GPU) (long *Mesh_pointer_f,
-                                                  int *NSOURCESf,
-                                                  double *h_stf_pre_compute) {
+                                                  int *it_tmp_f,
+                                                  int *istage_f) {
   TRACE ("compute_add_sources_backward_gpu");
   // debug
   DEBUG_BACKWARD_SOURCES ();
@@ -132,13 +144,44 @@ void FC_FUNC_ (compute_add_sources_backward_gpu,
   // checks if anything to do
   if (mp->nsources_local == 0) return;
 
-  int NSOURCES = *NSOURCESf;
+  // indexing: for default backward, we can use the forward stf_local array with corresponding indexing.
+  // for example, Newmark stepping:
+  //    backward time_t = dble(NSTEP-it_tmp)*DT - t0 with it_tmp 1..NSTEP
+  //    forward  time_t = dble(it_tmp-1)*DT - t0  with it_tmp 1..NSTEP
+  //    -> backward indexing:
+  //         it_tmp_backward == 1      -> it_tmp_forward = NSTEP
+  //         it_tmp_backward == 2      -> it_tmp_forward = NSTEP-1
+  //         ..
+  //         it_tmp_backward == NSTEP  -> it_tmp_forward = 1
+  //         and b_stf_local == stf_local(istage, NSTEP-(it_tmp_backward-1), isource_local)
+  // only exception is LDDRK and non-undo_attenuation stepping
+  gpu_realw_mem stf_local;
+  int it_forward;
+
+  stf_local = mp->d_stf_local;
+  it_forward = mp->NSTEP - (*it_tmp_f - 1);
+
+  // LDDRK case for non-undo-attenuation stepping
+  if (mp->use_b_stf){
+   stf_local = mp->d_b_stf_local;
+   it_forward = *it_tmp_f;  // no need to reverse order, b_stf_local is stored in same it_tmp 1..NSTEP order for backward stepping
+  }
 
-  int num_blocks_x, num_blocks_y;
-  get_blocks_xy (NSOURCES, &num_blocks_x, &num_blocks_y);
+  int it = it_forward - 1;   // -1 for Fortran -> C indexing differences
+  int istage = *istage_f - 1;
 
+  int num_blocks_x, num_blocks_y;
+  get_blocks_xy (mp->nsources_local, &num_blocks_x, &num_blocks_y);
+
+  // note: we try to avoid these copies `gpuCopy_todevice**` as they synchronize all streams and kernel executions.
+  //       this compute_add_sources_kernel is executing faster for a small number of sources than the next kernel launches,
+  //       thus we can not overlap and compute in an asynchronous way, but need to wait idle for kernel triggering.
+  //
+  //       to avoid this, we now allocate all local source time function arrays on the GPU in the preparation routine.
+  //       this will need more GPU memory and might need to be re-evaluated if the number of sources and time steps become very large.
+  //
   // copies source time function buffer values to GPU
-  gpuCopy_todevice_double (&mp->d_stf_pre_compute, h_stf_pre_compute, NSOURCES);
+  //gpuCopy_todevice_double (&mp->d_stf_pre_compute, h_stf_pre_compute, NSOURCES);
 
 #ifdef USE_OPENCL
   if (run_opencl) {
@@ -149,12 +192,13 @@ void FC_FUNC_ (compute_add_sources_backward_gpu,
     // adds source contributions
     clCheck (clSetKernelArg (mocl.kernels.compute_add_sources_kernel, idx++, sizeof (cl_mem), (void *) &mp->d_b_accel_crust_mantle.ocl));
     clCheck (clSetKernelArg (mocl.kernels.compute_add_sources_kernel, idx++, sizeof (cl_mem), (void *) &mp->d_ibool_crust_mantle.ocl));
-    clCheck (clSetKernelArg (mocl.kernels.compute_add_sources_kernel, idx++, sizeof (cl_mem), (void *) &mp->d_sourcearrays.ocl));
-    clCheck (clSetKernelArg (mocl.kernels.compute_add_sources_kernel, idx++, sizeof (cl_mem), (void *) &mp->d_stf_pre_compute.ocl));
-    clCheck (clSetKernelArg (mocl.kernels.compute_add_sources_kernel, idx++, sizeof (int), (void *) &mp->myrank));
-    clCheck (clSetKernelArg (mocl.kernels.compute_add_sources_kernel, idx++, sizeof (cl_mem), (void *) &mp->d_islice_selected_source.ocl));
-    clCheck (clSetKernelArg (mocl.kernels.compute_add_sources_kernel, idx++, sizeof (cl_mem), (void *) &mp->d_ispec_selected_source.ocl));
-    clCheck (clSetKernelArg (mocl.kernels.compute_add_sources_kernel, idx++, sizeof (int), (void *) &NSOURCES));
+    clCheck (clSetKernelArg (mocl.kernels.compute_add_sources_kernel, idx++, sizeof (cl_mem), (void *) &mp->d_sourcearrays_local.ocl));
+    clCheck (clSetKernelArg (mocl.kernels.compute_add_sources_kernel, idx++, sizeof (cl_mem), (void *) &stf_local.ocl));
+    clCheck (clSetKernelArg (mocl.kernels.compute_add_sources_kernel, idx++, sizeof (cl_mem), (void *) &mp->d_ispec_selected_source_local.ocl));
+    clCheck (clSetKernelArg (mocl.kernels.compute_add_sources_kernel, idx++, sizeof (int), (void *) &mp->nsources_local));
+    clCheck (clSetKernelArg (mocl.kernels.compute_add_sources_kernel, idx++, sizeof (int), (void *) &mp->NSTEP));
+    clCheck (clSetKernelArg (mocl.kernels.compute_add_sources_kernel, idx++, sizeof (int), (void *) &it));
+    clCheck (clSetKernelArg (mocl.kernels.compute_add_sources_kernel, idx++, sizeof (int), (void *) &istage));
 
     local_work_size[0] = NGLLX;
     local_work_size[1] = NGLLX;
@@ -174,12 +218,13 @@ void FC_FUNC_ (compute_add_sources_backward_gpu,
 
     compute_add_sources_kernel<<<grid,threads,0,mp->compute_stream>>>(mp->d_b_accel_crust_mantle.cuda,
                                                                       mp->d_ibool_crust_mantle.cuda,
-                                                                      mp->d_sourcearrays.cuda,
-                                                                      mp->d_stf_pre_compute.cuda,
-                                                                      mp->myrank,
-                                                                      mp->d_islice_selected_source.cuda,
-                                                                      mp->d_ispec_selected_source.cuda,
-                                                                      NSOURCES);
+                                                                      mp->d_sourcearrays_local.cuda,
+                                                                      stf_local.cuda,
+                                                                      mp->d_ispec_selected_source_local.cuda,
+                                                                      mp->nsources_local,
+                                                                      mp->NSTEP,
+                                                                      it,
+                                                                      istage);
   }
 #endif
 #ifdef USE_HIP
@@ -190,12 +235,13 @@ void FC_FUNC_ (compute_add_sources_backward_gpu,
     hipLaunchKernelGGL(HIP_KERNEL_NAME(compute_add_sources_kernel), grid, threads, 0, mp->compute_stream,
                                                                     mp->d_b_accel_crust_mantle.hip,
                                                                     mp->d_ibool_crust_mantle.hip,
-                                                                    mp->d_sourcearrays.hip,
-                                                                    mp->d_stf_pre_compute.hip,
-                                                                    mp->myrank,
-                                                                    mp->d_islice_selected_source.hip,
-                                                                    mp->d_ispec_selected_source.hip,
-                                                                    NSOURCES);
+                                                                    mp->d_sourcearrays_local.hip,
+                                                                    stf_local.hip,
+                                                                    mp->d_ispec_selected_source_local.hip,
+                                                                    mp->nsources_local,
+                                                                    mp->NSTEP,
+                                                                    it,
+                                                                    istage);
   }
 #endif
 
diff --git a/src/gpu/compute_forces_crust_mantle_gpu.c b/src/gpu/compute_forces_crust_mantle_gpu.c
index 70f124da5..67fe3b314 100644
--- a/src/gpu/compute_forces_crust_mantle_gpu.c
+++ b/src/gpu/compute_forces_crust_mantle_gpu.c
@@ -203,6 +203,10 @@ void crust_mantle (int nb_blocks_to_compute, Mesh *mp,
     tau_sigmainvval = mp->d_tau_sigmainvval; // only d_tau_sigmainvval
   }
 
+  // kernel timing
+  gpu_event start,stop;
+  if (GPU_TIMING){ start_timing_gpu(&start,&stop); }
+
 #ifdef USE_OPENCL
   if (run_opencl) {
     size_t global_work_size[2];
@@ -794,6 +798,21 @@ void crust_mantle (int nb_blocks_to_compute, Mesh *mp,
   }
 #endif
 
+  // kernel timing
+  if (GPU_TIMING){
+    stop_timing_gpu(&start,&stop,"crust_mantle");
+
+    //realw time;
+    //stop_timing_gpu_t(&start,&stop,"crust_mantle kernel",&time);
+    // time in seconds
+    //time = time / 1000.;
+    // performance
+    // see with: nvprof --metrics flops_sp ./xspecfem3D -> using 883146240 FLOPS (Single) floating-point operations
+    // hand-counts: 89344 * number-of-blocks
+    //realw flops = 89344 * nb_blocks_to_compute;
+    //printf("  performance: %f GFlops/s\n", flops/time *(1./1000./1000./1000.));
+  }
+
   GPU_ERROR_CHECKING ("crust_mantle");
 }
 
diff --git a/src/gpu/compute_forces_inner_core_gpu.c b/src/gpu/compute_forces_inner_core_gpu.c
index baf48fc88..968f3b6d5 100644
--- a/src/gpu/compute_forces_inner_core_gpu.c
+++ b/src/gpu/compute_forces_inner_core_gpu.c
@@ -176,6 +176,10 @@ void inner_core (int nb_blocks_to_compute, Mesh *mp,
     tau_sigmainvval = mp->d_tau_sigmainvval; // only d_tau_sigmainvval
   }
 
+  // kernel timing
+  gpu_event start,stop;
+  if (GPU_TIMING){ start_timing_gpu(&start,&stop); }
+
 #ifdef USE_OPENCL
   if (run_opencl) {
     size_t global_work_size[2];
@@ -638,6 +642,9 @@ void inner_core (int nb_blocks_to_compute, Mesh *mp,
   }
 #endif
 
+  // kernel timing
+  if (GPU_TIMING){ stop_timing_gpu(&start,&stop,"inner_core"); }
+
   GPU_ERROR_CHECKING ("inner_core");
 }
 
diff --git a/src/gpu/compute_forces_outer_core_gpu.c b/src/gpu/compute_forces_outer_core_gpu.c
index 92e48b713..15cb68459 100644
--- a/src/gpu/compute_forces_outer_core_gpu.c
+++ b/src/gpu/compute_forces_outer_core_gpu.c
@@ -109,6 +109,10 @@ void outer_core (int nb_blocks_to_compute, Mesh *mp,
     B_array_rotation_lddrk = d_b_B_array_rotation_lddrk;
   }
 
+  // kernel timing
+  gpu_event start,stop;
+  if (GPU_TIMING){ start_timing_gpu(&start,&stop); }
+
 #ifdef USE_OPENCL
   if (run_opencl) {
     size_t global_work_size[2];
@@ -282,6 +286,9 @@ void outer_core (int nb_blocks_to_compute, Mesh *mp,
   }
 #endif
 
+  // kernel timing
+  if (GPU_TIMING){ stop_timing_gpu(&start,&stop,"outer_core"); }
+
   GPU_ERROR_CHECKING ("kernel outer_core");
 }
 
diff --git a/src/gpu/compute_seismograms_gpu.c b/src/gpu/compute_seismograms_gpu.c
index e26fb608b..1eb4de1b5 100644
--- a/src/gpu/compute_seismograms_gpu.c
+++ b/src/gpu/compute_seismograms_gpu.c
@@ -31,34 +31,36 @@
 
 extern EXTERN_LANG
 void FC_FUNC_ (compute_seismograms_gpu,
-               COMPUTE_SEISMOGRAMS_GPU) (long *Mesh_pointer_f,
+               COMPUTE_SEISMOGRAMS_GPU) (long *Mesh_pointer,
                                          realw* seismograms,
-                                         int* seismo_currentf,
-                                         int* itf,
-                                         int* it_endf,
-                                         double* scale_displf,
-                                         int* NTSTEP_BETWEEN_OUTPUT_SEISMOSf,
-                                         int* NSTEPf) {
+                                         int* seismo_current_f,
+                                         int* it_f,
+                                         int* it_end_f,
+                                         double* scale_displ_f,
+                                         int* nlength_seismogram_f) {
   TRACE ("compute_seismograms_gpu");
 
   //get Mesh from Fortran integer wrapper
-  Mesh *mp = (Mesh *) *Mesh_pointer_f;
+  Mesh *mp = (Mesh *) *Mesh_pointer;
 
   // checks if anything to do
   if (mp->nrec_local == 0) return;
 
-  int seismo_current = *seismo_currentf - 1 ;
-  int NTSTEP_BETWEEN_OUTPUT_SEISMOS = *NTSTEP_BETWEEN_OUTPUT_SEISMOSf;
-  int NSTEP = *NSTEPf;
-  int it = *itf;
-  int it_end = *it_endf;
-  realw scale_displ = (realw)(*scale_displf);
+  int seismo_current = *seismo_current_f - 1 ;  // -1 for Fortran -> C indexing
+
+  int nlength_seismogram = *nlength_seismogram_f;
+  int it = *it_f;
+  int it_end = *it_end_f;
+
+  realw scale_displ = (realw)(*scale_displ_f);
 
   int blocksize = NGLL3_PADDED;
   int num_blocks_x, num_blocks_y;
   get_blocks_xy (mp->nrec_local, &num_blocks_x, &num_blocks_y);
 
-  int size_buffer = 3 * mp->nrec_local * sizeof(realw);
+  // seismogram array size
+  int size = mp->nrec_local * nlength_seismogram;
+  int size_buffer = NDIM * size * sizeof(realw);
 
   // determines wavefield depending on simulation type
   gpu_realw_mem displ;
@@ -70,6 +72,15 @@ void FC_FUNC_ (compute_seismograms_gpu,
     displ = mp->d_displ_crust_mantle;
   }
 
+  // note: due to subsampling, the last time step it == it_end might not be reached,
+  //       but computing seismogram entries might end before.
+  //       thus, both checks
+  //         it%NTSTEP_BETWEEN_OUTPUT_SEISMOS == 0 || it == it_end
+  //       might not be reached. instead we test if the seismogram array is full by
+  //         seismo_current == nlength_seismogram - 1
+  //       and copy it back whenever.
+  //printf("debug: gpu seismo: seismo current/lenght %i/%i - it/it_end %i/%i\n",seismo_current,nlength_seismogram,it,it_end);
+
 #ifdef USE_OPENCL
   if (run_opencl) {
     size_t global_work_size[2];
@@ -89,6 +100,7 @@ void FC_FUNC_ (compute_seismograms_gpu,
     clCheck (clSetKernelArg (mocl.kernels.compute_seismograms_kernel, idx++, sizeof (cl_mem), (void *) &mp->d_ispec_selected_rec.ocl));
     clCheck (clSetKernelArg (mocl.kernels.compute_seismograms_kernel, idx++, sizeof (cl_mem), (void *) &mp->d_number_receiver_global.ocl));
     clCheck (clSetKernelArg (mocl.kernels.compute_seismograms_kernel, idx++, sizeof (realw),  (void *) &scale_displ));
+    clCheck (clSetKernelArg (mocl.kernels.compute_seismograms_kernel, idx++, sizeof (int),    (void *) &seismo_current));
 
     local_work_size[0] = blocksize;
     local_work_size[1] = 1;
@@ -98,21 +110,29 @@ void FC_FUNC_ (compute_seismograms_gpu,
     clCheck (clEnqueueNDRangeKernel (mocl.command_queue, mocl.kernels.compute_seismograms_kernel, 2, NULL,
                                      global_work_size, local_work_size, 0, NULL, &kernel_evt));
 
-    // copies buffer to CPU
-    if (GPU_ASYNC_COPY &&  ((seismo_current+1) != NTSTEP_BETWEEN_OUTPUT_SEISMOS) && (it != NSTEP) && (it != it_end) ) {
-      // waits until kernel is finished before starting async memcpy
-      clCheck (clFinish (mocl.command_queue));
-
-      if (mp->has_last_copy_evt) {
-        clCheck (clReleaseEvent (mp->last_copy_evt));
-      }
-      clCheck (clEnqueueReadBuffer (mocl.copy_queue, mp->d_seismograms.ocl, CL_FALSE, 0,
-                                    size_buffer, seismograms + 3*mp->nrec_local*seismo_current, 1, &kernel_evt, &mp->last_copy_evt));
-      mp->has_last_copy_evt = 1;
-    } else {
-      // blocking copy
-      clCheck (clEnqueueReadBuffer (mocl.command_queue, mp->d_seismograms.ocl, CL_TRUE, 0,
-                                    size_buffer , seismograms + 3*mp->nrec_local*seismo_current, 0, NULL, NULL));
+    // copies array to CPU host
+    if (seismo_current == nlength_seismogram - 1 || it == it_end){
+      // intermediate copies could be made asynchronous,
+      // only the last copy would be a synchronous copy to make sure we have all previous copies done.
+      //
+      // however, note that asynchronous copies in the copy stream would also require pinned host memory.
+      // This is not use here yet for seismograms, thus the copy becomes blocking by default as well.
+      // we'll leave the if-case here as a todo for future...
+      //if (GPU_ASYNC_COPY && (it != it_end)) {
+      //  // waits until kernel is finished before starting async memcpy
+      //  clCheck (clFinish (mocl.command_queue));
+      //
+      //  if (mp->has_last_copy_evt) {
+      //    clCheck (clReleaseEvent (mp->last_copy_evt));
+      //  }
+      //  clCheck (clEnqueueReadBuffer (mocl.copy_queue, mp->d_seismograms.ocl, CL_FALSE, 0,
+      //                                size_buffer, seismograms, 1, &kernel_evt, &mp->last_copy_evt));
+      //  mp->has_last_copy_evt = 1;
+      //} else {
+        // blocking copy
+        clCheck (clEnqueueReadBuffer (mocl.command_queue, mp->d_seismograms.ocl, CL_TRUE, 0,
+                                      size_buffer , seismograms, 0, NULL, NULL));
+      //}
     }
 
     clReleaseEvent (kernel_evt);
@@ -132,17 +152,25 @@ void FC_FUNC_ (compute_seismograms_gpu,
                                                                       mp->d_nu.cuda,
                                                                       mp->d_ispec_selected_rec.cuda,
                                                                       mp->d_number_receiver_global.cuda,
-                                                                      scale_displ);
-    // copies buffer to CPU
-    if (GPU_ASYNC_COPY &&  ((seismo_current+1) != NTSTEP_BETWEEN_OUTPUT_SEISMOS) && (it != NSTEP) && (it != it_end) ) {
-      // waits until kernel is finished before starting async memcpy
-      cudaStreamSynchronize(mp->compute_stream);
-      // copies buffer to CPU
-      cudaMemcpyAsync(seismograms + 3*mp->nrec_local*seismo_current,mp->d_seismograms.cuda,size_buffer,cudaMemcpyDeviceToHost,mp->copy_stream);
-    } else {
-      // synchronous copy
-      print_CUDA_error_if_any(cudaMemcpy(seismograms + 3*mp->nrec_local*seismo_current,mp->d_seismograms.cuda,size_buffer,
-                                         cudaMemcpyDeviceToHost),98000);
+                                                                      scale_displ,
+                                                                      seismo_current);
+    // copies array to CPU host
+    if (seismo_current == nlength_seismogram - 1 || it == it_end){
+      // intermediate copies could be made asynchronous,
+      // only the last copy would be a synchronous copy to make sure we have all previous copies done.
+      //
+      // however, note that asynchronous copies in the copy stream would also require pinned host memory.
+      // This is not use here yet for seismograms, thus the copy becomes blocking by default as well.
+      // we'll leave the if-case here as a todo for future...
+      //if (GPU_ASYNC_COPY &&  (it != it_end)) {
+      //  // waits until kernel is finished before starting async memcpy
+      //  cudaStreamSynchronize(mp->compute_stream);
+      //  // copies buffer to CPU
+      //  cudaMemcpyAsync(seismograms,mp->d_seismograms.cuda,size_buffer,cudaMemcpyDeviceToHost,mp->copy_stream);
+      //} else {
+        // synchronous copy
+        print_CUDA_error_if_any(cudaMemcpy(seismograms,mp->d_seismograms.cuda,size_buffer,cudaMemcpyDeviceToHost),98000);
+      //}
     }
   }
 #endif
@@ -161,17 +189,26 @@ void FC_FUNC_ (compute_seismograms_gpu,
                                                                     mp->d_nu.hip,
                                                                     mp->d_ispec_selected_rec.hip,
                                                                     mp->d_number_receiver_global.hip,
-                                                                    scale_displ);
-    // copies buffer to CPU
-    if (GPU_ASYNC_COPY &&  ((seismo_current+1) != NTSTEP_BETWEEN_OUTPUT_SEISMOS) && (it != NSTEP) && (it != it_end) ) {
-      // waits until kernel is finished before starting async memcpy
-      hipStreamSynchronize(mp->compute_stream);
-      // copies buffer to CPU
-      hipMemcpyAsync(seismograms + 3*mp->nrec_local*seismo_current,mp->d_seismograms.hip,size_buffer,hipMemcpyDeviceToHost,mp->copy_stream);
-    } else {
-      // synchronous copy
-      print_HIP_error_if_any(hipMemcpy(seismograms + 3*mp->nrec_local*seismo_current,mp->d_seismograms.hip,size_buffer,
-                                       hipMemcpyDeviceToHost),98000);
+                                                                    scale_displ,
+                                                                    seismo_current);
+
+    // copies array to CPU host
+    if (seismo_current == nlength_seismogram - 1 || it == it_end){
+      // intermediate copies could be made asynchronous,
+      // only the last copy would be a synchronous copy to make sure we have all previous copies done.
+      //
+      // however, note that asynchronous copies in the copy stream would also require pinned host memory.
+      // This is not use here yet for seismograms, thus the copy becomes blocking by default as well.
+      // we'll leave the if-case here as a todo for future...
+      //if (GPU_ASYNC_COPY &&  (it != it_end)) {
+      //  // waits until kernel is finished before starting async memcpy
+      //  hipStreamSynchronize(mp->compute_stream);
+      //  // copies buffer to CPU
+      //  hipMemcpyAsync(seismograms,mp->d_seismograms.hip,size_buffer,hipMemcpyDeviceToHost,mp->copy_stream);
+      //} else {
+        // synchronous copy
+        print_HIP_error_if_any(hipMemcpy(seismograms,mp->d_seismograms.hip,size_buffer,hipMemcpyDeviceToHost),98000);
+      //}
     }
   }
 #endif
diff --git a/src/gpu/gpu_buffer_list.c b/src/gpu/gpu_buffer_list.c
index 462354069..f51b16676 100644
--- a/src/gpu/gpu_buffer_list.c
+++ b/src/gpu/gpu_buffer_list.c
@@ -311,11 +311,19 @@ GPU_REALW_BUFFER (d_b_B_array_rotation);
 // ------------------------------------------------------------------   //
 // sources
 // ------------------------------------------------------------------   //
-GPU_REALW_BUFFER (d_sourcearrays);
-GPU_DOUBLE_BUFFER (d_stf_pre_compute);
-GPU_INT_BUFFER (d_islice_selected_source);
+// full NSOURCES arrays not needed anymore...
+//GPU_REALW_BUFFER (d_sourcearrays);
+//GPU_DOUBLE_BUFFER (d_stf_pre_compute);
+//GPU_INT_BUFFER (d_islice_selected_source);
+// for pure adjoint simulation cases
 GPU_INT_BUFFER (d_ispec_selected_source);
 
+// local arrays
+GPU_INT_BUFFER (d_ispec_selected_source_local);
+GPU_REALW_BUFFER (d_sourcearrays_local);
+GPU_REALW_BUFFER (d_stf_local);
+GPU_REALW_BUFFER (d_b_stf_local);
+
 // ------------------------------------------------------------------   //
 // receivers
 // ------------------------------------------------------------------   //
diff --git a/src/gpu/helper_functions_gpu.c b/src/gpu/helper_functions_gpu.c
index b6023d3af..1193db2cc 100644
--- a/src/gpu/helper_functions_gpu.c
+++ b/src/gpu/helper_functions_gpu.c
@@ -491,7 +491,7 @@ void gpuCopy_from_device_realw (gpu_realw_mem *d_array_addr_ptr, realw *h_array,
 // copy with offset
 void gpuCopy_from_device_realw_offset (gpu_realw_mem *d_array_addr_ptr, realw *h_array, size_t size, size_t offset) {
 
-  TRACE ("gpuCopy_from_device_realw");
+  TRACE ("gpuCopy_from_device_realw_offset");
 
   // checks if anything to do
   if (size == 0){ return; }
@@ -515,7 +515,7 @@ void gpuCopy_from_device_realw_offset (gpu_realw_mem *d_array_addr_ptr, realw *h
   }
 #endif
 
-  GPU_ERROR_CHECKING ("gpuCopy_from_device_realw");
+  GPU_ERROR_CHECKING ("gpuCopy_from_device_realw_offset");
 }
 
 /* ----------------------------------------------------------------------------------------------- */
@@ -1154,7 +1154,7 @@ void start_timing_gpu(gpu_event* start,gpu_event* stop) {
 #endif
 }
 
-void stop_timing_gpu(gpu_event* start,gpu_event* stop, char* info_str) {
+void stop_timing_gpu(gpu_event* start,gpu_event* stop, const char* info_str) {
   realw time = 0;
   // stops events
 #ifdef USE_OPENCL
@@ -1177,7 +1177,40 @@ void stop_timing_gpu(gpu_event* start,gpu_event* stop, char* info_str) {
   hipEventDestroy( *stop );
 #endif
   // user output
-  printf("%s: Execution Time = %f ms\n",info_str,time);
+  printf("GPU_timing %s: Execution Time = %f ms\n",info_str,time);
+}
+
+// note: C doesn't allow for function overloading.
+//       thus, naming this routine slightly different, with `_t` added, as it returns back time t.
+
+void stop_timing_gpu_t(gpu_event* start,gpu_event* stop, const char* info_str, realw* t){
+  realw time = 0;
+  // stops events
+#ifdef USE_OPENCL
+// not fully implemented yet...
+  clReleaseEvent(*start);
+  clReleaseEvent(*stop);
+#endif
+#ifdef USE_CUDA
+  // stops events
+  cudaEventRecord( *stop, 0);
+  cudaEventSynchronize( *stop );
+  cudaEventElapsedTime( &time, *start, *stop );
+  cudaEventDestroy( *start );
+  cudaEventDestroy( *stop );
+#endif
+#ifdef USE_HIP
+  // stops events
+  hipEventRecord( *stop, 0);
+  hipEventSynchronize( *stop );
+  hipEventElapsedTime( &time, *start, *stop );
+  hipEventDestroy( *start );
+  hipEventDestroy( *stop );
+#endif
+  // user output
+  printf("GPU_timing %s: Execution Time = %f ms\n",info_str,time);
+  // returns time
+  *t = time;
 }
 
 /* ----------------------------------------------------------------------------------------------- */
@@ -1331,6 +1364,22 @@ void FC_FUNC_ (pause_for_debug,
   pause_for_debugger (1);
 }
 
+/*----------------------------------------------------------------------------------------------- */
+
+// external function wrapper to synchronize GPU from Fortran routine
+
+extern EXTERN_LANG
+void FC_FUNC_ (gpu_synchronize,
+               GPU_SYNCHRONIZE) () {
+  TRACE ("gpu_synchronize");
+
+  // synchronize device kernels
+  gpuSynchronize();
+
+  // checks for previous errors
+  exit_on_gpu_error("gpuSynchronize");
+}
+
 /* ----------------------------------------------------------------------------------------------- */
 // MPI synchronization
 /* ----------------------------------------------------------------------------------------------- */
diff --git a/src/gpu/kernels.gen/compute_add_sources_kernel.cpp b/src/gpu/kernels.gen/compute_add_sources_kernel.cpp
index 6a52209c6..759b84fda 100644
--- a/src/gpu/kernels.gen/compute_add_sources_kernel.cpp
+++ b/src/gpu/kernels.gen/compute_add_sources_kernel.cpp
@@ -81,7 +81,7 @@
 #define BLOCKSIZE_TRANSFER 256
 #endif
 
-__global__ void compute_add_sources_kernel(float * accel, const int * ibool, const float * sourcearrays, const double * stf_pre_compute, const int myrank, const int * islice_selected_source, const int * ispec_selected_source, const int NSOURCES){
+__global__ void compute_add_sources_kernel(float * accel, const int * ibool, const float * sourcearrays_local, const float * stf_local, const int * ispec_selected_source_local, const int nsources_local, const int NSTEP, const int it, const int istage){
   int ispec;
   int iglob;
   float stf;
@@ -93,14 +93,12 @@ __global__ void compute_add_sources_kernel(float * accel, const int * ibool, con
   j = threadIdx.y;
   k = threadIdx.z;
   isource = blockIdx.x + (gridDim.x) * (blockIdx.y);
-  if (isource < NSOURCES) {
-    if (myrank == islice_selected_source[isource]) {
-      ispec = ispec_selected_source[isource] - (1);
-      stf = stf_pre_compute[isource];
-      iglob = ibool[INDEX4(NGLLX, NGLLX, NGLLX, i, j, k, ispec)] - (1);
-      atomicAdd(accel + (iglob) * (3) + 0, (sourcearrays[INDEX5(NDIM, NGLLX, NGLLX, NGLLX, 0, i, j, k, isource)]) * (stf));
-      atomicAdd(accel + (iglob) * (3) + 1, (sourcearrays[INDEX5(NDIM, NGLLX, NGLLX, NGLLX, 1, i, j, k, isource)]) * (stf));
-      atomicAdd(accel + (iglob) * (3) + 2, (sourcearrays[INDEX5(NDIM, NGLLX, NGLLX, NGLLX, 2, i, j, k, isource)]) * (stf));
-    }
+  if (isource < nsources_local) {
+    ispec = ispec_selected_source_local[isource] - (1);
+    stf = stf_local[INDEX3(nsources_local, NSTEP, isource, it, istage)];
+    iglob = ibool[INDEX4(NGLLX, NGLLX, NGLLX, i, j, k, ispec)] - (1);
+    atomicAdd(accel + (iglob) * (3) + 0, (sourcearrays_local[INDEX5(NDIM, NGLLX, NGLLX, NGLLX, 0, i, j, k, isource)]) * (stf));
+    atomicAdd(accel + (iglob) * (3) + 1, (sourcearrays_local[INDEX5(NDIM, NGLLX, NGLLX, NGLLX, 1, i, j, k, isource)]) * (stf));
+    atomicAdd(accel + (iglob) * (3) + 2, (sourcearrays_local[INDEX5(NDIM, NGLLX, NGLLX, NGLLX, 2, i, j, k, isource)]) * (stf));
   }
 }
diff --git a/src/gpu/kernels.gen/compute_add_sources_kernel.cu b/src/gpu/kernels.gen/compute_add_sources_kernel.cu
index 6a52209c6..759b84fda 100644
--- a/src/gpu/kernels.gen/compute_add_sources_kernel.cu
+++ b/src/gpu/kernels.gen/compute_add_sources_kernel.cu
@@ -81,7 +81,7 @@
 #define BLOCKSIZE_TRANSFER 256
 #endif
 
-__global__ void compute_add_sources_kernel(float * accel, const int * ibool, const float * sourcearrays, const double * stf_pre_compute, const int myrank, const int * islice_selected_source, const int * ispec_selected_source, const int NSOURCES){
+__global__ void compute_add_sources_kernel(float * accel, const int * ibool, const float * sourcearrays_local, const float * stf_local, const int * ispec_selected_source_local, const int nsources_local, const int NSTEP, const int it, const int istage){
   int ispec;
   int iglob;
   float stf;
@@ -93,14 +93,12 @@ __global__ void compute_add_sources_kernel(float * accel, const int * ibool, con
   j = threadIdx.y;
   k = threadIdx.z;
   isource = blockIdx.x + (gridDim.x) * (blockIdx.y);
-  if (isource < NSOURCES) {
-    if (myrank == islice_selected_source[isource]) {
-      ispec = ispec_selected_source[isource] - (1);
-      stf = stf_pre_compute[isource];
-      iglob = ibool[INDEX4(NGLLX, NGLLX, NGLLX, i, j, k, ispec)] - (1);
-      atomicAdd(accel + (iglob) * (3) + 0, (sourcearrays[INDEX5(NDIM, NGLLX, NGLLX, NGLLX, 0, i, j, k, isource)]) * (stf));
-      atomicAdd(accel + (iglob) * (3) + 1, (sourcearrays[INDEX5(NDIM, NGLLX, NGLLX, NGLLX, 1, i, j, k, isource)]) * (stf));
-      atomicAdd(accel + (iglob) * (3) + 2, (sourcearrays[INDEX5(NDIM, NGLLX, NGLLX, NGLLX, 2, i, j, k, isource)]) * (stf));
-    }
+  if (isource < nsources_local) {
+    ispec = ispec_selected_source_local[isource] - (1);
+    stf = stf_local[INDEX3(nsources_local, NSTEP, isource, it, istage)];
+    iglob = ibool[INDEX4(NGLLX, NGLLX, NGLLX, i, j, k, ispec)] - (1);
+    atomicAdd(accel + (iglob) * (3) + 0, (sourcearrays_local[INDEX5(NDIM, NGLLX, NGLLX, NGLLX, 0, i, j, k, isource)]) * (stf));
+    atomicAdd(accel + (iglob) * (3) + 1, (sourcearrays_local[INDEX5(NDIM, NGLLX, NGLLX, NGLLX, 1, i, j, k, isource)]) * (stf));
+    atomicAdd(accel + (iglob) * (3) + 2, (sourcearrays_local[INDEX5(NDIM, NGLLX, NGLLX, NGLLX, 2, i, j, k, isource)]) * (stf));
   }
 }
diff --git a/src/gpu/kernels.gen/compute_add_sources_kernel_cl.c b/src/gpu/kernels.gen/compute_add_sources_kernel_cl.c
index e56969838..93bd4c2df 100644
--- a/src/gpu/kernels.gen/compute_add_sources_kernel_cl.c
+++ b/src/gpu/kernels.gen/compute_add_sources_kernel_cl.c
@@ -93,7 +93,7 @@ inline void atomicAdd(volatile __global float *source, const float val) {\n\
 #define BLOCKSIZE_TRANSFER 256\n\
 #endif\n\
 \n\
-__kernel void compute_add_sources_kernel(__global float * accel, const __global int * ibool, const __global float * sourcearrays, const __global double * stf_pre_compute, const int myrank, const __global int * islice_selected_source, const __global int * ispec_selected_source, const int NSOURCES){\n\
+__kernel void compute_add_sources_kernel(__global float * accel, const __global int * ibool, const __global float * sourcearrays_local, const __global float * stf_local, const __global int * ispec_selected_source_local, const int nsources_local, const int NSTEP, const int it, const int istage){\n\
   int ispec;\n\
   int iglob;\n\
   float stf;\n\
@@ -105,15 +105,13 @@ __kernel void compute_add_sources_kernel(__global float * accel, const __global
   j = get_local_id(1);\n\
   k = get_local_id(2);\n\
   isource = get_group_id(0) + (get_num_groups(0)) * (get_group_id(1));\n\
-  if (isource < NSOURCES) {\n\
-    if (myrank == islice_selected_source[isource]) {\n\
-      ispec = ispec_selected_source[isource] - (1);\n\
-      stf = stf_pre_compute[isource];\n\
-      iglob = ibool[INDEX4(NGLLX, NGLLX, NGLLX, i, j, k, ispec)] - (1);\n\
-      atomicAdd(accel + (iglob) * (3) + 0, (sourcearrays[INDEX5(NDIM, NGLLX, NGLLX, NGLLX, 0, i, j, k, isource)]) * (stf));\n\
-      atomicAdd(accel + (iglob) * (3) + 1, (sourcearrays[INDEX5(NDIM, NGLLX, NGLLX, NGLLX, 1, i, j, k, isource)]) * (stf));\n\
-      atomicAdd(accel + (iglob) * (3) + 2, (sourcearrays[INDEX5(NDIM, NGLLX, NGLLX, NGLLX, 2, i, j, k, isource)]) * (stf));\n\
-    }\n\
+  if (isource < nsources_local) {\n\
+    ispec = ispec_selected_source_local[isource] - (1);\n\
+    stf = stf_local[INDEX3(nsources_local, NSTEP, isource, it, istage)];\n\
+    iglob = ibool[INDEX4(NGLLX, NGLLX, NGLLX, i, j, k, ispec)] - (1);\n\
+    atomicAdd(accel + (iglob) * (3) + 0, (sourcearrays_local[INDEX5(NDIM, NGLLX, NGLLX, NGLLX, 0, i, j, k, isource)]) * (stf));\n\
+    atomicAdd(accel + (iglob) * (3) + 1, (sourcearrays_local[INDEX5(NDIM, NGLLX, NGLLX, NGLLX, 1, i, j, k, isource)]) * (stf));\n\
+    atomicAdd(accel + (iglob) * (3) + 2, (sourcearrays_local[INDEX5(NDIM, NGLLX, NGLLX, NGLLX, 2, i, j, k, isource)]) * (stf));\n\
   }\n\
 }\n\
 ";
diff --git a/src/gpu/kernels.gen/compute_seismograms_kernel.cpp b/src/gpu/kernels.gen/compute_seismograms_kernel.cpp
index e2f3556ee..e691fb6ba 100644
--- a/src/gpu/kernels.gen/compute_seismograms_kernel.cpp
+++ b/src/gpu/kernels.gen/compute_seismograms_kernel.cpp
@@ -48,7 +48,7 @@
 #define NDIM 3
 #endif
 #ifndef NGLLX
-#define NGLLX false
+#define NGLLX 5
 #endif
 #ifndef NGLL2
 #define NGLL2 25
@@ -81,7 +81,7 @@
 #define BLOCKSIZE_TRANSFER 256
 #endif
 
-__global__ void compute_seismograms_kernel(const int nrec_local, const float * displ, const int * d_ibool, const float * hxir, const float * hetar, const float * hgammar, float * seismograms, const float * nu, const int * ispec_selected_rec, const int * number_receiver_global, const float scale_displ){
+__global__ void compute_seismograms_kernel(const int nrec_local, const float * displ, const int * d_ibool, const float * hxir, const float * hetar, const float * hgammar, float * seismograms, const float * nu, const int * ispec_selected_rec, const int * number_receiver_global, const float scale_displ, const int seismo_current){
   int ispec;
   int iglob;
   int irec_local;
@@ -93,6 +93,7 @@ __global__ void compute_seismograms_kernel(const int nrec_local, const float * d
   int k;
   int l;
   int s;
+  int idx;
   __shared__ float sh_dxd[(NGLL3_PADDED)];
   __shared__ float sh_dyd[(NGLL3_PADDED)];
   __shared__ float sh_dzd[(NGLL3_PADDED)];
@@ -178,14 +179,16 @@ __global__ void compute_seismograms_kernel(const int nrec_local, const float * d
     __syncthreads();
     l = (l) * (2);
 
+    idx = INDEX3(NDIM, nrec_local, 0, irec_local, seismo_current);
+
     if (tx == 0) {
-      seismograms[(irec_local) * (3) + 0] = (scale_displ) * ((nu[((irec_local) * (3)) * (3) + 0]) * (sh_dxd[0]) + (nu[((irec_local) * (3) + 1) * (3) + 0]) * (sh_dyd[0]) + (nu[((irec_local) * (3) + 2) * (3) + 0]) * (sh_dzd[0]));
+      seismograms[idx + 0] = (scale_displ) * ((nu[((irec_local) * (3)) * (3) + 0]) * (sh_dxd[0]) + (nu[((irec_local) * (3) + 1) * (3) + 0]) * (sh_dyd[0]) + (nu[((irec_local) * (3) + 2) * (3) + 0]) * (sh_dzd[0]));
     }
     if (tx == 1) {
-      seismograms[(irec_local) * (3) + 1] = (scale_displ) * ((nu[((irec_local) * (3)) * (3) + 1]) * (sh_dxd[0]) + (nu[((irec_local) * (3) + 1) * (3) + 1]) * (sh_dyd[0]) + (nu[((irec_local) * (3) + 2) * (3) + 1]) * (sh_dzd[0]));
+      seismograms[idx + 1] = (scale_displ) * ((nu[((irec_local) * (3)) * (3) + 1]) * (sh_dxd[0]) + (nu[((irec_local) * (3) + 1) * (3) + 1]) * (sh_dyd[0]) + (nu[((irec_local) * (3) + 2) * (3) + 1]) * (sh_dzd[0]));
     }
     if (tx == 2) {
-      seismograms[(irec_local) * (3) + 2] = (scale_displ) * ((nu[((irec_local) * (3)) * (3) + 2]) * (sh_dxd[0]) + (nu[((irec_local) * (3) + 1) * (3) + 2]) * (sh_dyd[0]) + (nu[((irec_local) * (3) + 2) * (3) + 2]) * (sh_dzd[0]));
+      seismograms[idx + 2] = (scale_displ) * ((nu[((irec_local) * (3)) * (3) + 2]) * (sh_dxd[0]) + (nu[((irec_local) * (3) + 1) * (3) + 2]) * (sh_dyd[0]) + (nu[((irec_local) * (3) + 2) * (3) + 2]) * (sh_dzd[0]));
     }
   }
 }
diff --git a/src/gpu/kernels.gen/compute_seismograms_kernel.cu b/src/gpu/kernels.gen/compute_seismograms_kernel.cu
index e2f3556ee..e691fb6ba 100644
--- a/src/gpu/kernels.gen/compute_seismograms_kernel.cu
+++ b/src/gpu/kernels.gen/compute_seismograms_kernel.cu
@@ -48,7 +48,7 @@
 #define NDIM 3
 #endif
 #ifndef NGLLX
-#define NGLLX false
+#define NGLLX 5
 #endif
 #ifndef NGLL2
 #define NGLL2 25
@@ -81,7 +81,7 @@
 #define BLOCKSIZE_TRANSFER 256
 #endif
 
-__global__ void compute_seismograms_kernel(const int nrec_local, const float * displ, const int * d_ibool, const float * hxir, const float * hetar, const float * hgammar, float * seismograms, const float * nu, const int * ispec_selected_rec, const int * number_receiver_global, const float scale_displ){
+__global__ void compute_seismograms_kernel(const int nrec_local, const float * displ, const int * d_ibool, const float * hxir, const float * hetar, const float * hgammar, float * seismograms, const float * nu, const int * ispec_selected_rec, const int * number_receiver_global, const float scale_displ, const int seismo_current){
   int ispec;
   int iglob;
   int irec_local;
@@ -93,6 +93,7 @@ __global__ void compute_seismograms_kernel(const int nrec_local, const float * d
   int k;
   int l;
   int s;
+  int idx;
   __shared__ float sh_dxd[(NGLL3_PADDED)];
   __shared__ float sh_dyd[(NGLL3_PADDED)];
   __shared__ float sh_dzd[(NGLL3_PADDED)];
@@ -178,14 +179,16 @@ __global__ void compute_seismograms_kernel(const int nrec_local, const float * d
     __syncthreads();
     l = (l) * (2);
 
+    idx = INDEX3(NDIM, nrec_local, 0, irec_local, seismo_current);
+
     if (tx == 0) {
-      seismograms[(irec_local) * (3) + 0] = (scale_displ) * ((nu[((irec_local) * (3)) * (3) + 0]) * (sh_dxd[0]) + (nu[((irec_local) * (3) + 1) * (3) + 0]) * (sh_dyd[0]) + (nu[((irec_local) * (3) + 2) * (3) + 0]) * (sh_dzd[0]));
+      seismograms[idx + 0] = (scale_displ) * ((nu[((irec_local) * (3)) * (3) + 0]) * (sh_dxd[0]) + (nu[((irec_local) * (3) + 1) * (3) + 0]) * (sh_dyd[0]) + (nu[((irec_local) * (3) + 2) * (3) + 0]) * (sh_dzd[0]));
     }
     if (tx == 1) {
-      seismograms[(irec_local) * (3) + 1] = (scale_displ) * ((nu[((irec_local) * (3)) * (3) + 1]) * (sh_dxd[0]) + (nu[((irec_local) * (3) + 1) * (3) + 1]) * (sh_dyd[0]) + (nu[((irec_local) * (3) + 2) * (3) + 1]) * (sh_dzd[0]));
+      seismograms[idx + 1] = (scale_displ) * ((nu[((irec_local) * (3)) * (3) + 1]) * (sh_dxd[0]) + (nu[((irec_local) * (3) + 1) * (3) + 1]) * (sh_dyd[0]) + (nu[((irec_local) * (3) + 2) * (3) + 1]) * (sh_dzd[0]));
     }
     if (tx == 2) {
-      seismograms[(irec_local) * (3) + 2] = (scale_displ) * ((nu[((irec_local) * (3)) * (3) + 2]) * (sh_dxd[0]) + (nu[((irec_local) * (3) + 1) * (3) + 2]) * (sh_dyd[0]) + (nu[((irec_local) * (3) + 2) * (3) + 2]) * (sh_dzd[0]));
+      seismograms[idx + 2] = (scale_displ) * ((nu[((irec_local) * (3)) * (3) + 2]) * (sh_dxd[0]) + (nu[((irec_local) * (3) + 1) * (3) + 2]) * (sh_dyd[0]) + (nu[((irec_local) * (3) + 2) * (3) + 2]) * (sh_dzd[0]));
     }
   }
 }
diff --git a/src/gpu/kernels.gen/compute_seismograms_kernel_cl.c b/src/gpu/kernels.gen/compute_seismograms_kernel_cl.c
index 24aa921fc..7423577c1 100644
--- a/src/gpu/kernels.gen/compute_seismograms_kernel_cl.c
+++ b/src/gpu/kernels.gen/compute_seismograms_kernel_cl.c
@@ -92,7 +92,7 @@ inline void atomicAdd(volatile __global float *source, const float val) {\n\
 #define BLOCKSIZE_TRANSFER 256\n\
 #endif\n\
 \n\
-__kernel void compute_seismograms_kernel(const int nrec_local, const __global float * displ, const __global int * d_ibool, const __global float * hxir, const __global float * hetar, const __global float * hgammar, __global float * seismograms, const __global float * nu, const __global int * ispec_selected_rec, const __global int * number_receiver_global, const float scale_displ){\n\
+__kernel void compute_seismograms_kernel(const int nrec_local, const __global float * displ, const __global int * d_ibool, const __global float * hxir, const __global float * hetar, const __global float * hgammar, __global float * seismograms, const __global float * nu, const __global int * ispec_selected_rec, const __global int * number_receiver_global, const float scale_displ, const int seismo_current){\n\
   int ispec;\n\
   int iglob;\n\
   int irec_local;\n\
@@ -104,6 +104,7 @@ __kernel void compute_seismograms_kernel(const int nrec_local, const __global fl
   int k;\n\
   int l;\n\
   int s;\n\
+  int idx;\n\
   __local float sh_dxd[(NGLL3_PADDED)];\n\
   __local float sh_dyd[(NGLL3_PADDED)];\n\
   __local float sh_dzd[(NGLL3_PADDED)];\n\
@@ -188,15 +189,17 @@ __kernel void compute_seismograms_kernel(const int nrec_local, const __global fl
     }\n\
     barrier(CLK_LOCAL_MEM_FENCE);\n\
     l = (l) * (2);\n\
+\n\
+    idx = INDEX3(NDIM, nrec_local, 0, irec_local, seismo_current);\n\
 \n\
     if (tx == 0) {\n\
-      seismograms[(irec_local) * (3) + 0] = (scale_displ) * ((nu[((irec_local) * (3)) * (3) + 0]) * (sh_dxd[0]) + (nu[((irec_local) * (3) + 1) * (3) + 0]) * (sh_dyd[0]) + (nu[((irec_local) * (3) + 2) * (3) + 0]) * (sh_dzd[0]));\n\
+      seismograms[idx + 0] = (scale_displ) * ((nu[((irec_local) * (3)) * (3) + 0]) * (sh_dxd[0]) + (nu[((irec_local) * (3) + 1) * (3) + 0]) * (sh_dyd[0]) + (nu[((irec_local) * (3) + 2) * (3) + 0]) * (sh_dzd[0]));\n\
     }\n\
     if (tx == 1) {\n\
-      seismograms[(irec_local) * (3) + 1] = (scale_displ) * ((nu[((irec_local) * (3)) * (3) + 1]) * (sh_dxd[0]) + (nu[((irec_local) * (3) + 1) * (3) + 1]) * (sh_dyd[0]) + (nu[((irec_local) * (3) + 2) * (3) + 1]) * (sh_dzd[0]));\n\
+      seismograms[idx + 1] = (scale_displ) * ((nu[((irec_local) * (3)) * (3) + 1]) * (sh_dxd[0]) + (nu[((irec_local) * (3) + 1) * (3) + 1]) * (sh_dyd[0]) + (nu[((irec_local) * (3) + 2) * (3) + 1]) * (sh_dzd[0]));\n\
     }\n\
     if (tx == 2) {\n\
-      seismograms[(irec_local) * (3) + 2] = (scale_displ) * ((nu[((irec_local) * (3)) * (3) + 2]) * (sh_dxd[0]) + (nu[((irec_local) * (3) + 1) * (3) + 2]) * (sh_dyd[0]) + (nu[((irec_local) * (3) + 2) * (3) + 2]) * (sh_dzd[0]));\n\
+      seismograms[idx + 2] = (scale_displ) * ((nu[((irec_local) * (3)) * (3) + 2]) * (sh_dxd[0]) + (nu[((irec_local) * (3) + 1) * (3) + 2]) * (sh_dyd[0]) + (nu[((irec_local) * (3) + 2) * (3) + 2]) * (sh_dzd[0]));\n\
     }\n\
   }\n\
 }\n\
diff --git a/src/gpu/kernels.gen/crust_mantle_impl_kernel_forward.cpp b/src/gpu/kernels.gen/crust_mantle_impl_kernel_forward.cpp
index 14c66d904..ed1f8689d 100644
--- a/src/gpu/kernels.gen/crust_mantle_impl_kernel_forward.cpp
+++ b/src/gpu/kernels.gen/crust_mantle_impl_kernel_forward.cpp
@@ -82,6 +82,10 @@
 #endif
 
 
+#ifdef MANUALLY_UNROLLED_LOOPS
+#pragma message ("\n\nCompiling with: MANUALLY_UNROLLED_LOOPS enabled\n")
+#endif  // MANUALLY_UNROLLED_LOOPS
+
 
 static __device__ void compute_element_cm_att_stress(const int tx, const int working_element, const float * R_xx, const float * R_yy, const float * R_xy, const float * R_xz, const float * R_yz, float * sigma_xx, float * sigma_yy, float * sigma_zz, float * sigma_xy, float * sigma_xz, float * sigma_yz){
   int offset;
diff --git a/src/gpu/kernels.gen/kernel_proto.cu.h b/src/gpu/kernels.gen/kernel_proto.cu.h
index 6cf3a6263..d482c0f40 100644
--- a/src/gpu/kernels.gen/kernel_proto.cu.h
+++ b/src/gpu/kernels.gen/kernel_proto.cu.h
@@ -2,7 +2,7 @@ __global__ void assemble_boundary_accel_on_device(float * d_accel, const float *
 __global__ void assemble_boundary_potential_on_device(float * d_potential_dot_dot_acoustic, const float * d_send_potential_dot_dot_buffer, const int num_interfaces, const int max_nibool_interfaces, const int * d_nibool_interfaces, const int * d_ibool_interfaces);
 __global__ void compute_acoustic_kernel(const int * ibool, const float * rhostore, const float * kappastore, const float * hprime_xx, const float * d_xix, const float * d_xiy, const float * d_xiz, const float * d_etax, const float * d_etay, const float * d_etaz, const float * d_gammax, const float * d_gammay, const float * d_gammaz, const float * potential_dot_dot_acoustic, const float * b_potential_acoustic, const float * b_potential_dot_dot_acoustic, float * rho_ac_kl, float * kappa_ac_kl, const float deltat, const int NSPEC);
 __global__ void compute_add_sources_adjoint_kernel(float * accel, const float * source_adjoint, const float * xir, const float * etar, const float * gammar, const int * ibool, const int * ispec_selected_rec, const int * number_adjsources_global, const int nadj_rec_local);
-__global__ void compute_add_sources_kernel(float * accel, const int * ibool, const float * sourcearrays, const double * stf_pre_compute, const int myrank, const int * islice_selected_source, const int * ispec_selected_source, const int NSOURCES);
+__global__ void compute_add_sources_kernel(float * accel, const int * ibool, const float * sourcearrays_local, const float * stf_local, const int * ispec_selected_source_local, const int nsources_local, const int NSTEP, const int it, const int istage);
 __global__ void compute_ani_kernel(const float * epsilondev_xx, const float * epsilondev_yy, const float * epsilondev_xy, const float * epsilondev_xz, const float * epsilondev_yz, const float * epsilon_trace_over_3, const float * b_epsilondev_xx, const float * b_epsilondev_yy, const float * b_epsilondev_xy, const float * b_epsilondev_xz, const float * b_epsilondev_yz, const float * b_epsilon_trace_over_3, float * cijkl_kl, const int NSPEC, const float deltat);
 __global__ void compute_ani_undoatt_kernel(const float * epsilondev_xx, const float * epsilondev_yy, const float * epsilondev_xy, const float * epsilondev_xz, const float * epsilondev_yz, const float * epsilon_trace_over_3, float * cijkl_kl, const int NSPEC, const float deltat, const int * d_ibool, const float * d_b_displ, const float * d_xix, const float * d_xiy, const float * d_xiz, const float * d_etax, const float * d_etay, const float * d_etaz, const float * d_gammax, const float * d_gammay, const float * d_gammaz, const float * d_hprime_xx);
 __global__ void compute_coupling_CMB_fluid_kernel(const float * displ_crust_mantle, float * accel_crust_mantle, const float * accel_outer_core, const int * ibool_crust_mantle, const int * ibelm_bottom_crust_mantle, const float * normal_top_outer_core, const float * jacobian2D_top_outer_core, const float * wgllwgll_xy, const int * ibool_outer_core, const int * ibelm_top_outer_core, const float RHO_TOP_OC, const float minus_g_cmb, const int GRAVITY, const int NSPEC2D_BOTTOM_CM);
@@ -15,7 +15,7 @@ __global__ void compute_iso_kernel(const float * epsilondev_xx, const float * ep
 __global__ void compute_iso_undoatt_kernel(const float * epsilondev_xx, const float * epsilondev_yy, const float * epsilondev_xy, const float * epsilondev_xz, const float * epsilondev_yz, const float * epsilon_trace_over_3, float * mu_kl, float * kappa_kl, const int NSPEC, const float deltat, const int * d_ibool, const float * d_b_displ, const float * d_xix, const float * d_xiy, const float * d_xiz, const float * d_etax, const float * d_etay, const float * d_etaz, const float * d_gammax, const float * d_gammay, const float * d_gammaz, const float * d_hprime_xx);
 __global__ void compute_kappa_mu_hess_kernel(const int * d_ibool, const float * d_veloc, const float * d_b_veloc, const float * d_xix, const float * d_xiy, const float * d_xiz, const float * d_etax, const float * d_etay, const float * d_etaz, const float * d_gammax, const float * d_gammay, const float * d_gammaz, const float * d_hprime_xx, const float deltat, float * hess_rho_kl, float * hess_kappa_kl, float * hess_mu_kl, const int NSPEC, const int USE_SOURCE_RECEIVER_HESSIAN);
 __global__ void compute_rho_kernel(const int * ibool, const float * accel, const float * b_displ, float * rho_kl, const int NSPEC, const float deltat);
-__global__ void compute_seismograms_kernel(const int nrec_local, const float * displ, const int * d_ibool, const float * hxir, const float * hetar, const float * hgammar, float * seismograms, const float * nu, const int * ispec_selected_rec, const int * number_receiver_global, const float scale_displ);
+__global__ void compute_seismograms_kernel(const int nrec_local, const float * displ, const int * d_ibool, const float * hxir, const float * hetar, const float * hgammar, float * seismograms, const float * nu, const int * ispec_selected_rec, const int * number_receiver_global, const float scale_displ, const int seismo_current);
 __global__ void compute_stacey_acoustic_backward_kernel(float * b_potential_dot_dot_acoustic, const float * b_absorb_potential, const int num_abs_boundary_faces, const int * abs_boundary_ispec, const int * abs_boundary_npoin, const int * abs_boundary_ijk, const int * ibool);
 __global__ void compute_stacey_acoustic_kernel(const float * potential_dot_acoustic, float * potential_dot_dot_acoustic, const int num_abs_boundary_faces, const int * abs_boundary_ispec, const int * abs_boundary_npoin, const int * abs_boundary_ijk, const float * abs_boundary_jacobian2Dw, const int * ibool, const float * vpstore, const int SAVE_STACEY, float * b_absorb_potential);
 __global__ void compute_stacey_elastic_backward_kernel(float * b_accel, const float * b_absorb_field, const int num_abs_boundary_faces, const int * abs_boundary_ispec, const int * abs_boundary_npoin, const int * abs_boundary_ijk, const int * ibool);
diff --git a/src/gpu/mesh_constants_cuda.h b/src/gpu/mesh_constants_cuda.h
index cf6221bb3..6525c2dca 100644
--- a/src/gpu/mesh_constants_cuda.h
+++ b/src/gpu/mesh_constants_cuda.h
@@ -38,6 +38,8 @@
 // leads up to ~10% performance increase in OpenCL and ~1% in Cuda
 #define MANUALLY_UNROLLED_LOOPS   // uncomment to use loops
 
+/*----------------------------------------------------------------------------------------------- */
+
 // definitions
 typedef cudaEvent_t gpu_event;
 typedef cudaStream_t gpu_stream;
diff --git a/src/gpu/mesh_constants_gpu.h b/src/gpu/mesh_constants_gpu.h
index af126018d..9a62b0d2b 100644
--- a/src/gpu/mesh_constants_gpu.h
+++ b/src/gpu/mesh_constants_gpu.h
@@ -144,6 +144,9 @@ typedef double realw;
 // debug: outputs maximum and preferred work group size (for OpenCL kernels)
 #define DEBUG_KERNEL_WORK_GROUP_SIZE 0
 
+// performance timers
+#define GPU_TIMING 0
+
 // error checking after cuda function calls
 // (note: this synchronizes many calls, thus e.g. no asynchronous memcpy possible)
 #define ENABLE_VERY_SLOW_ERROR_CHECKING 0
@@ -939,10 +942,24 @@ typedef struct mesh_ {
   // sources
   // ------------------------------------------------------------------   //
   int nsources_local;
-  gpu_realw_mem d_sourcearrays;
-  gpu_double_mem d_stf_pre_compute;
-  gpu_int_mem d_islice_selected_source;
-  gpu_int_mem d_ispec_selected_source;
+  int NSTEP;
+  int NSTAGES;
+  gpu_int_mem d_ispec_selected_source_local;
+  gpu_realw_mem d_sourcearrays_local;
+  gpu_realw_mem d_stf_local;
+
+  // for LDDRK and non-undo-attenuation cases
+  int use_b_stf;
+  gpu_realw_mem d_b_stf_local;
+
+  // full NSOURCES arrays not needed anymore...
+  // we will move to store only local source arrays with full source time function traces
+  // to avoid memory copies before the compute_add_sources() kernel, as those memcopies synchronize streams and will slow down performance...
+  //gpu_realw_mem d_sourcearrays;
+  //gpu_double_mem d_stf_pre_compute;
+  //gpu_int_mem d_islice_selected_source;
+
+  gpu_int_mem d_ispec_selected_source; // needed only for pure adjoint simulation cases
 
   // ------------------------------------------------------------------   //
   // receivers
@@ -1285,6 +1302,11 @@ void exit_on_error (const char *info);
 void synchronize_mpi ();
 double get_time_val ();
 
+// event timing
+void start_timing_gpu(gpu_event* start, gpu_event* stop);
+void stop_timing_gpu(gpu_event* start, gpu_event* stop, const char* info_str);
+void stop_timing_gpu_t(gpu_event* start, gpu_event* stop, const char* info_str, realw* t);
+
 // defined in check_fields_gpu.c
 void get_free_memory (double *free_db, double *used_db, double *total_db);
 realw get_device_array_maximum_value (gpu_realw_mem d_array, int size);
diff --git a/src/gpu/mesh_constants_hip.h b/src/gpu/mesh_constants_hip.h
index 8cc94a489..f9409bbe2 100644
--- a/src/gpu/mesh_constants_hip.h
+++ b/src/gpu/mesh_constants_hip.h
@@ -35,14 +35,23 @@
 #ifdef USE_HIP
 
 // (optional) unrolling loops
-// not tested yet for hip
-#undef MANUALLY_UNROLLED_LOOPS    // default to none
+// leads up to ~3% performance increase with HIP
+#define MANUALLY_UNROLLED_LOOPS   // uncomment to use loops
 
 // AMD MI100
 #ifdef GPU_DEVICE_MI100
 #undef USE_LAUNCH_BOUNDS
 #endif
 
+// AMD MI250X
+#ifdef GPU_DEVICE_MI250
+#undef USE_LAUNCH_BOUNDS
+//#define USE_LAUNCH_BOUNDS     // will slow down kernels...
+//#define LAUNCH_MIN_BLOCKS 7
+#endif
+
+/*----------------------------------------------------------------------------------------------- */
+
 // definitions
 typedef hipEvent_t gpu_event;
 typedef hipStream_t gpu_stream;
diff --git a/src/gpu/prepare_mesh_constants_gpu.c b/src/gpu/prepare_mesh_constants_gpu.c
index ab1e82743..a902ad7f4 100644
--- a/src/gpu/prepare_mesh_constants_gpu.c
+++ b/src/gpu/prepare_mesh_constants_gpu.c
@@ -82,9 +82,12 @@ void FC_FUNC_ (prepare_constants_device,
                                           int *h_NGLLX,
                                           realw *h_hprime_xx, realw *h_hprimewgll_xx,
                                           realw *h_wgllwgll_xy, realw *h_wgllwgll_xz, realw *h_wgllwgll_yz,
-                                          int *NSOURCES, int *nsources_local,
-                                          realw *h_sourcearrays,
-                                          int *h_islice_selected_source, int *h_ispec_selected_source,
+                                          int *NSOURCES,
+                                          int *nsources_local,
+                                          realw *h_sourcearrays_local,
+                                          realw *h_stf_local, realw *h_b_stf_local,
+                                          int *h_ispec_selected_source_local,
+                                          int *h_ispec_selected_source,
                                           int *nrec, int *nrec_local,
                                           int *h_number_receiver_global,
                                           int *h_islice_selected_rec, int *h_ispec_selected_rec,
@@ -110,11 +113,18 @@ void FC_FUNC_ (prepare_constants_device,
                                           realw *deltat_f,
                                           int *GPU_ASYNC_COPY_f,
                                           double * h_hxir_store,double * h_hetar_store,double * h_hgammar_store,double * h_nu,
+                                          int* nlength_seismogram,
                                           int *SAVE_SEISMOGRAMS_STRAIN_f,
-                                          int *CUSTOM_REAL_f) {
+                                          int *CUSTOM_REAL_f,
+                                          int *USE_LDDRK_f,
+                                          int *NSTEP_f, int *NSTAGES_f) {
 
   TRACE ("prepare_constants_device");
 
+  int size,size_padded;
+  int num_blocks_x,num_blocks_y;
+  int size_block_norm,size_block_norm_strain;
+
   // allocates mesh parameter structure
   Mesh *mp = (Mesh *) malloc (sizeof (Mesh));
 
@@ -305,7 +315,7 @@ void FC_FUNC_ (prepare_constants_device,
   mp->NSPEC_INNER_CORE_STRAIN_ONLY = *NSPEC_INNER_CORE_STRAIN_ONLY;
 
   // simulation flags initialization
-  mp->use_lddrk = 0;
+  mp->use_lddrk = *USE_LDDRK_f;
   mp->save_forward = *SAVE_FORWARD_f;
   mp->absorbing_conditions = *ABSORBING_CONDITIONS_f;
   mp->oceans = *OCEANS_f;
@@ -353,16 +363,41 @@ void FC_FUNC_ (prepare_constants_device,
   }
 
   // sources
-  mp->nsources_local = *nsources_local;
+  mp->nsources_local = 0;
   if (mp->simulation_type == 1 || mp->simulation_type == 3) {
-    // not needed in case of pure adjoint simulations (SIMULATION_TYPE == 2)
-    gpuCreateCopy_todevice_realw (&mp->d_sourcearrays, h_sourcearrays, (*NSOURCES) * NDIM * NGLL3);
+    // only add CMT source for non-noise simulations and forward/kernel simulations
+    if (mp->noise_tomography == 0) {
+      mp->nsources_local = *nsources_local;
+    }
+  }
+  mp->NSTEP = *NSTEP_f;
+  mp->NSTAGES = *NSTAGES_f;
+
+  // source arrays
+  // not needed in case of pure adjoint simulations (SIMULATION_TYPE == 2)
+  // full NSOURCES arrays not needed anymore...
+  //gpuCreateCopy_todevice_realw (&mp->d_sourcearrays, h_sourcearrays, (*NSOURCES) * NDIM * NGLL3);
+  //gpuMalloc_double (&mp->d_stf_pre_compute, *NSOURCES);
+  //gpuCreateCopy_todevice_int (&mp->d_islice_selected_source, h_islice_selected_source, *NSOURCES);
 
+  // only needed for pure adjoint simulation cases
+  if (mp->simulation_type == 2){
+    gpuCreateCopy_todevice_int (&mp->d_ispec_selected_source, h_ispec_selected_source, *NSOURCES);
+  }
+
+  // local sources only...
+  mp->use_b_stf = 0;
+  if (mp->nsources_local > 0){
     // allocates buffer on GPU for source time function values
-    gpuMalloc_double (&mp->d_stf_pre_compute, *NSOURCES);
+    gpuCreateCopy_todevice_realw (&mp->d_sourcearrays_local, h_sourcearrays_local, mp->nsources_local * NDIM * NGLL3);
+    gpuCreateCopy_todevice_realw (&mp->d_stf_local, h_stf_local, mp->nsources_local * mp->NSTEP * mp->NSTAGES);
+    gpuCreateCopy_todevice_int (&mp->d_ispec_selected_source_local, h_ispec_selected_source_local, mp->nsources_local);
+    // additional array for LDDRK and backward stepping
+    if (mp->simulation_type == 3 && mp->use_lddrk && (! mp->undo_attenuation)){
+      mp->use_b_stf = 1;
+      gpuCreateCopy_todevice_realw (&mp->d_b_stf_local, h_b_stf_local, mp->nsources_local * mp->NSTEP * mp->NSTAGES);
+    }
   }
-  gpuCreateCopy_todevice_int (&mp->d_islice_selected_source, h_islice_selected_source, *NSOURCES);
-  gpuCreateCopy_todevice_int (&mp->d_ispec_selected_source, h_ispec_selected_source, *NSOURCES);
 
   // receiver stations
   // note that:   size (number_receiver_global) = nrec_local
@@ -371,13 +406,14 @@ void FC_FUNC_ (prepare_constants_device,
   mp->nrec_local = *nrec_local;
   if (mp->nrec_local > 0) {
     gpuCreateCopy_todevice_int (&mp->d_number_receiver_global, h_number_receiver_global, mp->nrec_local);
+
     // for seismograms
     if (mp->simulation_type == 1 || mp->simulation_type == 3 ) {
       // forward/kernel simulations
       realw * xir    = (realw *)malloc(NGLLX * mp->nrec_local*sizeof(realw));
       realw * etar   = (realw *)malloc(NGLLX * mp->nrec_local*sizeof(realw));
       realw * gammar = (realw *)malloc(NGLLX * mp->nrec_local*sizeof(realw));
-      // converts to double to realw arrays, assumes NGLLX == NGLLY == NGLLZ
+      // converts from double to realw arrays, assumes NGLLX == NGLLY == NGLLZ
       for (int i=0;i<NGLLX * mp->nrec_local;i++){
         xir[i]    = (realw)h_hxir_store[i];
         etar[i]   = (realw)h_hetar_store[i];
@@ -391,7 +427,10 @@ void FC_FUNC_ (prepare_constants_device,
       free(gammar);
 
       // local seismograms
-      gpuMalloc_realw (&mp->d_seismograms, NDIM * mp->nrec_local);
+      // full seismograms length (considering sub-sampling)
+      size = (*nlength_seismogram) * mp->nrec_local;
+      gpuMalloc_realw (&mp->d_seismograms, NDIM * size);
+      gpuMemset_realw (&mp->d_seismograms, NDIM * size, 0);
 
       // orientation
       realw* nu;
@@ -465,12 +504,7 @@ void FC_FUNC_ (prepare_constants_device,
   // buffer for norm checking at every timestamp
   int blocksize = BLOCKSIZE_TRANSFER;
 
-  int size,size_padded;
-  int num_blocks_x,num_blocks_y;
-
   // buffer for crust_mantle arrays has maximum size
-  int size_block_norm,size_block_norm_strain;
-
   // norm arrays
   size = mp->NGLOB_CRUST_MANTLE;
 
@@ -626,6 +660,7 @@ void FC_FUNC_ (prepare_constants_adjoint_device,
       if (mp->h_stf_array_adjoint == NULL) exit_on_error ("h_stf_array_adjoint not allocated\n");
     }
     gpuMalloc_realw (&mp->d_stf_array_adjoint, mp->nadj_rec_local * NDIM );
+    gpuMemset_realw (&mp->d_stf_array_adjoint, mp->nadj_rec_local * NDIM, 0);
   }
 
   // synchronizes gpu calls
@@ -1042,6 +1077,7 @@ void FC_FUNC_ (prepare_fields_absorb_device,
     // boundary buffer
     if (mp->save_stacey) {
       gpuMalloc_realw (&mp->d_absorb_buffer_crust_mantle, NDIM * NGLLSQUARE * num_abs_boundary_faces);
+      gpuMemset_realw (&mp->d_absorb_buffer_crust_mantle, NDIM * NGLLSQUARE * num_abs_boundary_faces, 0);
     }
   }
 
@@ -1068,6 +1104,7 @@ void FC_FUNC_ (prepare_fields_absorb_device,
     // boundary buffer
     if (mp->save_stacey) {
       gpuMalloc_realw (&mp->d_absorb_buffer_outer_core, NGLLSQUARE * num_abs_boundary_faces);
+      gpuMemset_realw (&mp->d_absorb_buffer_outer_core, NGLLSQUARE * num_abs_boundary_faces, 0);
     }
   }
 
@@ -1344,6 +1381,7 @@ void FC_FUNC_ (prepare_fields_noise_device,
 
     // alloc storage for the surface buffer to be copied
     gpuMalloc_realw (&mp->d_noise_surface_movie, NDIM * NGLL2 * mp->nspec2D_top_crust_mantle);
+    gpuMemset_realw (&mp->d_noise_surface_movie, NDIM * NGLL2 * mp->nspec2D_top_crust_mantle, 0);
 
   } else {
     // for global mesh: each crust/mantle slice should have at top a free surface
@@ -1352,7 +1390,9 @@ void FC_FUNC_ (prepare_fields_noise_device,
 
   // prepares noise source array
   if (mp->noise_tomography == 1) {
-    gpuCreateCopy_todevice_realw (&mp->d_noise_sourcearray, noise_sourcearray, NDIM*NGLL3 * (*NSTEP));
+    // checks with setup NSTEP
+    if (mp->NSTEP != *NSTEP){ exit_on_error("Error invalid NSTEP setup for prepare_fields_noise_device() routine"); }
+    gpuCreateCopy_todevice_realw (&mp->d_noise_sourcearray, noise_sourcearray, NDIM * NGLL3 * mp->NSTEP);
   }
 
   // prepares noise directions
@@ -1435,8 +1475,8 @@ void FC_FUNC_ (prepare_lddrk_device,
   Mesh *mp = (Mesh *) *Mesh_pointer_f;
   size_t size;
 
-  // sets flag
-  mp->use_lddrk = 1;
+  // checks flag
+  if (! mp->use_lddrk){ exit_on_error("Flag use_lddrk is not set properly for routine prepare_lddrk_device()"); }
 
   // note: we don't support yet reading initial wavefields for re-starting simulations with LDDRK.
   //       this would require to store and copy also the **_lddrk wavefields to the restart files which is not done yet.
@@ -2712,12 +2752,14 @@ void FC_FUNC_ (prepare_cleanup_device,
   //------------------------------------------
   // sources
   //------------------------------------------
-  if (mp->simulation_type == 1 || mp->simulation_type == 3) {
-    gpuFree (&mp->d_sourcearrays);
-    gpuFree (&mp->d_stf_pre_compute);
+  if (mp->nsources_local > 0){
+    gpuFree (&mp->d_sourcearrays_local);
+    gpuFree (&mp->d_stf_local);
+    gpuFree (&mp->d_ispec_selected_source_local);
+  }
+  if (mp->simulation_type == 2){
+    gpuFree (&mp->d_ispec_selected_source);
   }
-  gpuFree (&mp->d_islice_selected_source);
-  gpuFree (&mp->d_ispec_selected_source);
 
   //------------------------------------------
   // receivers
diff --git a/src/gpu/specfem3D_gpu_method_stubs.c b/src/gpu/specfem3D_gpu_method_stubs.c
index c8aad67e6..592c9a0ab 100644
--- a/src/gpu/specfem3D_gpu_method_stubs.c
+++ b/src/gpu/specfem3D_gpu_method_stubs.c
@@ -110,13 +110,13 @@ void FC_FUNC_ (check_norm_strain_from_device,
 
 void FC_FUNC_ (compute_add_sources_gpu,
                COMPUTE_ADD_SOURCES_GPU) (long *Mesh_pointer_f,
-                                         int *NSOURCESf,
-                                         double *h_stf_pre_compute) {}
+                                         int *it_f,
+                                         int *istage_f) {}
 
 void FC_FUNC_ (compute_add_sources_backward_gpu,
                COMPUTE_ADD_SOURCES_BACKWARD_GPU) (long *Mesh_pointer_f,
-                                                  int *NSOURCESf,
-                                                  double *h_stf_pre_compute) {}
+                                                  int *it_tmp_f,
+                                                  int *istage_f) {}
 
 void FC_FUNC_ (compute_add_sources_adjoint_gpu,
                COMPUTE_ADD_SOURCES_ADJOINT_GPU) (long *Mesh_pointer_f) {}
@@ -224,14 +224,13 @@ void FC_FUNC_ (resort_array,
 //
 
 void FC_FUNC_ (compute_seismograms_gpu,
-               COMPUTE_SEISMOGRAMS_GPU) (long *Mesh_pointer_f,
+               COMPUTE_SEISMOGRAMS_GPU) (long *Mesh_pointer,
                                          realw* seismograms,
-                                         int* seismo_currentf,
-                                         int* itf,
-                                         int* it_endf,
-                                         double* scale_displf,
-                                         int* NTSTEP_BETWEEN_OUTPUT_SEISMOSf,
-                                         int* NSTEPf) {}
+                                         int* seismo_current_f,
+                                         int* it_f,
+                                         int* it_end_f,
+                                         double* scale_displ_f,
+                                         int* nlength_seismogram_f) {}
 
 
 //
@@ -286,6 +285,9 @@ void FC_FUNC_ (compute_strain_gpu,
 void FC_FUNC_ (pause_for_debug,
                PAUSE_FOR_DEBUG) () {}
 
+void FC_FUNC_ (gpu_synchronize,
+               GPU_SYNCHRONIZE) () {}
+
 void FC_FUNC_ (allocate_gpu_buffer,
                ALLOCATE_GPU_BUFFER) (realw** buffer_f, int* total_size) {}
 
@@ -334,9 +336,12 @@ void FC_FUNC_ (prepare_constants_device,
                                           int *h_NGLLX,
                                           realw *h_hprime_xx, realw *h_hprimewgll_xx,
                                           realw *h_wgllwgll_xy, realw *h_wgllwgll_xz, realw *h_wgllwgll_yz,
-                                          int *NSOURCES, int *nsources_local,
-                                          realw *h_sourcearrays,
-                                          int *h_islice_selected_source, int *h_ispec_selected_source,
+                                          int *NSOURCES,
+                                          int *nsources_local,
+                                          realw *h_sourcearrays_local,
+                                          realw *h_stf_local, realw *h_b_stf_local,
+                                          int *h_ispec_selected_source_local,
+                                          int *h_ispec_selected_source,
                                           int *nrec, int *nrec_local,
                                           int *h_number_receiver_global,
                                           int *h_islice_selected_rec, int *h_ispec_selected_rec,
@@ -362,8 +367,11 @@ void FC_FUNC_ (prepare_constants_device,
                                           realw *deltat_f,
                                           int *GPU_ASYNC_COPY_f,
                                           double * h_hxir_store,double * h_hetar_store,double * h_hgammar_store,double * h_nu,
+                                          int* nlength_seismogram,
                                           int *SAVE_SEISMOGRAMS_STRAIN_f,
-                                          int *CUSTOM_REAL_f) {}
+                                          int *CUSTOM_REAL_f,
+                                          int *USE_LDDRK_f,
+                                          int *NSTEP_f, int *NSTAGES_f) {}
 
 void FC_FUNC_ (prepare_constants_adjoint_device,
                PREPARE_CONSTANTS_ADJOINT_DEVICE) (long *Mesh_pointer_f,
diff --git a/src/gpu/update_displacement_LDDRK_gpu.c b/src/gpu/update_displacement_LDDRK_gpu.c
index 59347e3ce..b157050f4 100644
--- a/src/gpu/update_displacement_LDDRK_gpu.c
+++ b/src/gpu/update_displacement_LDDRK_gpu.c
@@ -83,6 +83,10 @@ void FC_FUNC_ (update_displ_lddrk_gpu,
     accel_ic = mp->d_b_accel_inner_core;
   }
 
+  // kernel timing
+  gpu_event start,stop;
+  if (GPU_TIMING){ start_timing_gpu(&start,&stop); }
+
   gpuMemset_realw (&accel_cm, size_cm, 0);
   gpuMemset_realw (&accel_oc, size_oc, 0);
   gpuMemset_realw (&accel_ic, size_ic, 0);
@@ -136,6 +140,9 @@ void FC_FUNC_ (update_displ_lddrk_gpu,
 #endif
   */
 
+  // kernel timing
+  if (GPU_TIMING){ stop_timing_gpu(&start,&stop,"update_displ_lddrk_gpu"); }
+
   GPU_ERROR_CHECKING ("update_displ_lddrk_gpu");
 }
 
@@ -196,6 +203,10 @@ void FC_FUNC_ (update_elastic_lddrk_gpu,
     deltat = mp->b_deltat;
   }
 
+  // kernel timing
+  gpu_event start,stop;
+  if (GPU_TIMING){ start_timing_gpu(&start,&stop); }
+
 #ifdef USE_OPENCL
   if (run_opencl) {
     size_t global_work_size[2];
@@ -383,6 +394,9 @@ void FC_FUNC_ (update_elastic_lddrk_gpu,
   }
 #endif
 
+  // kernel timing
+  if (GPU_TIMING){ stop_timing_gpu(&start,&stop,"update_elastic_lddrk_gpu"); }
+
   GPU_ERROR_CHECKING ("after update_elastic_lddrk_gpu");
 }
 
@@ -439,6 +453,10 @@ void FC_FUNC_ (update_acoustic_lddrk_gpu,
     deltat = mp->b_deltat;
   }
 
+  // kernel timing
+  gpu_event start,stop;
+  if (GPU_TIMING){ start_timing_gpu(&start,&stop); }
+
 #ifdef USE_OPENCL
   if (run_opencl) {
     size_t global_work_size[2];
@@ -534,5 +552,8 @@ void FC_FUNC_ (update_acoustic_lddrk_gpu,
   }
 #endif
 
+  // kernel timing
+  if (GPU_TIMING){ stop_timing_gpu(&start,&stop,"update_acoustic_lddrk_gpu"); }
+
   GPU_ERROR_CHECKING ("after update_acoustic_lddrk_gpu");
 }
diff --git a/src/gpu/update_displacement_gpu.c b/src/gpu/update_displacement_gpu.c
index 5033a3114..908b7e13b 100644
--- a/src/gpu/update_displacement_gpu.c
+++ b/src/gpu/update_displacement_gpu.c
@@ -92,6 +92,10 @@ void FC_FUNC_ (update_displacement_ic_gpu,
     accel = mp->d_b_accel_inner_core;
   }
 
+  // kernel timing
+  gpu_event start,stop;
+  if (GPU_TIMING){ start_timing_gpu(&start,&stop); }
+
 #ifdef USE_OPENCL
   if (run_opencl) {
     size_t global_work_size[2];
@@ -142,6 +146,9 @@ void FC_FUNC_ (update_displacement_ic_gpu,
   }
 #endif
 
+  // kernel timing
+  if (GPU_TIMING){ stop_timing_gpu(&start,&stop,"update_displacement_ic_gpu"); }
+
   GPU_ERROR_CHECKING ("update_displacement_ic_gpu");
 }
 
@@ -210,6 +217,10 @@ void FC_FUNC_ (update_displacement_cm_gpu,
     accel = mp->d_b_accel_crust_mantle;
   }
 
+  // kernel timing
+  gpu_event start,stop;
+  if (GPU_TIMING){ start_timing_gpu(&start,&stop); }
+
 #ifdef USE_OPENCL
   if (run_opencl) {
     size_t global_work_size[2];
@@ -260,6 +271,9 @@ void FC_FUNC_ (update_displacement_cm_gpu,
   }
 #endif
 
+  // kernel timing
+  if (GPU_TIMING){ stop_timing_gpu(&start,&stop,"update_displacement_cm_gpu"); }
+
   GPU_ERROR_CHECKING ("update_displacement_cm_gpu");
 }
 
@@ -328,6 +342,10 @@ void FC_FUNC_ (update_displacement_oc_gpu,
     accel = mp->d_b_accel_outer_core;
   }
 
+  // kernel timing
+  gpu_event start,stop;
+  if (GPU_TIMING){ start_timing_gpu(&start,&stop); }
+
 #ifdef USE_OPENCL
   if (run_opencl) {
     size_t global_work_size[2];
@@ -378,6 +396,9 @@ void FC_FUNC_ (update_displacement_oc_gpu,
   }
 #endif
 
+  // kernel timing
+  if (GPU_TIMING){ stop_timing_gpu(&start,&stop,"update_displacement_oc_gpu"); }
+
   GPU_ERROR_CHECKING ("update_displacement_oc_gpu");
 }
 
@@ -442,6 +463,9 @@ void FC_FUNC_ (multiply_accel_elastic_gpu,
   }
 #endif
 
+  // kernel timing
+  gpu_event start,stop;
+  if (GPU_TIMING){ start_timing_gpu(&start,&stop); }
 
 #ifdef USE_OPENCL
   if (run_opencl) {
@@ -590,6 +614,9 @@ void FC_FUNC_ (multiply_accel_elastic_gpu,
 #endif
   }
 
+  // kernel timing
+  if (GPU_TIMING){ stop_timing_gpu(&start,&stop,"multiply_accel_elastic_gpu"); }
+
   GPU_ERROR_CHECKING ("after multiply_accel_elastic_gpu");
 }
 
@@ -638,6 +665,10 @@ void FC_FUNC_ (update_veloc_elastic_gpu,
     accel = mp->d_b_accel_crust_mantle;
   }
 
+  // kernel timing
+  gpu_event start,stop;
+  if (GPU_TIMING){ start_timing_gpu(&start,&stop); }
+
 #ifdef USE_OPENCL
   if (run_opencl) {
     size_t global_work_size[2];
@@ -797,6 +828,9 @@ void FC_FUNC_ (update_veloc_elastic_gpu,
 #endif
 #endif
 
+  // kernel timing
+  if (GPU_TIMING){ stop_timing_gpu(&start,&stop,"update_veloc_3_b"); }
+
   GPU_ERROR_CHECKING ("after update_veloc_3_b");
 }
 
@@ -851,6 +885,10 @@ void FC_FUNC_ (multiply_accel_acoustic_gpu,
   }
 #endif
 
+  // kernel timing
+  gpu_event start,stop;
+  if (GPU_TIMING){ start_timing_gpu(&start,&stop); }
+
 #ifdef USE_OPENCL
   if (run_opencl) {
     size_t global_work_size[2];
@@ -901,6 +939,9 @@ void FC_FUNC_ (multiply_accel_acoustic_gpu,
   }
 #endif
 
+  // kernel timing
+  if (GPU_TIMING){ stop_timing_gpu(&start,&stop,"multiply_accel_acoustic_gpu"); }
+
   GPU_ERROR_CHECKING ("after multiply_accel_acoustic_gpu");
 }
 
@@ -949,6 +990,10 @@ void FC_FUNC_ (update_veloc_acoustic_gpu,
     accel = mp->d_b_accel_outer_core;
   }
 
+  // kernel timing
+  gpu_event start,stop;
+  if (GPU_TIMING){ start_timing_gpu(&start,&stop); }
+
 #ifdef USE_OPENCL
   if (run_opencl) {
     size_t global_work_size[2];
@@ -1038,5 +1083,8 @@ void FC_FUNC_ (update_veloc_acoustic_gpu,
 #endif
 #endif
 
+  // kernel timing
+  if (GPU_TIMING){ stop_timing_gpu(&start,&stop,"update_veloc_acoustic_gpu"); }
+
   GPU_ERROR_CHECKING ("after update_veloc_acoustic_gpu");
 }
diff --git a/src/meshfem3D/create_central_cube.f90 b/src/meshfem3D/create_central_cube.f90
index 6a7a258d3..63d9343c7 100644
--- a/src/meshfem3D/create_central_cube.f90
+++ b/src/meshfem3D/create_central_cube.f90
@@ -41,7 +41,7 @@ subroutine create_central_cube(ichunk,ispec_count,ipass, &
     CHUNK_AC,CHUNK_AC_ANTIPODE, &
     CHUNK_BC,CHUNK_BC_ANTIPODE
 
-  use shared_parameters, only: ratio_divide_central_cube,R_PLANET
+  use shared_parameters, only: ratio_divide_central_cube,R_PLANET,NPROCTOT
 
   use meshfem_par, only: &
     xstore,ystore,zstore
@@ -58,7 +58,7 @@ subroutine create_central_cube(ichunk,ispec_count,ipass, &
 
   integer, intent(inout) :: ispec_count
 
-! correct number of spectral elements in each block depending on chunk type
+  ! correct number of spectral elements in each block depending on chunk type
   integer, intent(in) :: nspec
   integer, intent(in) :: NEX_XI,NEX_PER_PROC_XI,NEX_PER_PROC_ETA
   integer, intent(in) :: iproc_xi,iproc_eta
@@ -66,10 +66,10 @@ subroutine create_central_cube(ichunk,ispec_count,ipass, &
 
   double precision, intent(in) :: R_CENTRAL_CUBE
 
-! MPI cut-planes parameters along xi and along eta
+  ! MPI cut-planes parameters along xi and along eta
   logical, dimension(2,nspec), intent(inout) :: iMPIcut_xi,iMPIcut_eta
 
-! code for the four regions of the mesh
+  ! code for the four regions of the mesh
   integer, intent(in) :: iregion_code
 
   integer, intent(inout) :: idoubling(nspec)
@@ -164,20 +164,20 @@ subroutine create_central_cube(ichunk,ispec_count,ipass, &
         ! new get_flag_boundaries
         ! xmin & xmax
         if (ix == 0) then
-          iMPIcut_xi(1,ispec_count) = .true.
+          if (NPROCTOT > 1) iMPIcut_xi(1,ispec_count) = .true.
           if (iproc_xi == 0) iboun(1,ispec_count)= .true.
         endif
         if (ix == 2*nx_central_cube-2) then
-          iMPIcut_xi(2,ispec_count) = .true.
+          if (NPROCTOT > 1) iMPIcut_xi(2,ispec_count) = .true.
           if (iproc_xi == NPROC_XI-1) iboun(2,ispec_count)= .true.
         endif
         ! ymin & ymax
         if (iy == 0) then
-          iMPIcut_eta(1,ispec_count) = .true.
+          if (NPROCTOT > 1) iMPIcut_eta(1,ispec_count) = .true.
           if (iproc_eta == 0) iboun(3,ispec_count)= .true.
         endif
         if (iy == 2*ny_central_cube-2) then
-          iMPIcut_eta(2,ispec_count) = .true.
+          if (NPROCTOT > 1) iMPIcut_eta(2,ispec_count) = .true.
           if (iproc_eta == NPROC_ETA-1) iboun(4,ispec_count)= .true.
         endif
 
diff --git a/src/meshfem3D/create_doubling_elements.f90 b/src/meshfem3D/create_doubling_elements.f90
index 250f31d68..a8efb0139 100644
--- a/src/meshfem3D/create_doubling_elements.f90
+++ b/src/meshfem3D/create_doubling_elements.f90
@@ -51,7 +51,7 @@ subroutine create_doubling_elements(ilayer,ichunk,ispec_count,ipass, &
     IREGION_CRUST_MANTLE,IREGION_OUTER_CORE, &
     SAVE_BOUNDARY_MESH
 
-  use shared_parameters, only: ner_mesh_layers,ratio_sampling_array,doubling_index,r_bottom,r_top
+  use shared_parameters, only: ner_mesh_layers,ratio_sampling_array,doubling_index,r_bottom,r_top,NPROCTOT
 
   use meshfem_models_par, only: HONOR_1D_SPHERICAL_MOHO
 
@@ -308,20 +308,20 @@ subroutine create_doubling_elements(ilayer,ichunk,ispec_count,ipass, &
         ! new get_flag_boundaries
         ! xmin & xmax
         if (ix_elem == 1) then
-            iMPIcut_xi(1,ispec_loc) = iboun_sb(ispec_superbrick,1)
+            if (NPROCTOT > 1) iMPIcut_xi(1,ispec_loc) = iboun_sb(ispec_superbrick,1)
             if (iproc_xi == 0) iboun(1,ispec_loc)= iboun_sb(ispec_superbrick,1)
         endif
         if (ix_elem == (NEX_PER_PROC_XI-step_mult*ratio_sampling_array(ilayer)+1)) then
-            iMPIcut_xi(2,ispec_loc) = iboun_sb(ispec_superbrick,2)
+            if (NPROCTOT > 1) iMPIcut_xi(2,ispec_loc) = iboun_sb(ispec_superbrick,2)
             if (iproc_xi == NPROC_XI-1) iboun(2,ispec_loc)= iboun_sb(ispec_superbrick,2)
         endif
         !! ymin & ymax
         if (iy_elem == 1) then
-            iMPIcut_eta(1,ispec_loc) = iboun_sb(ispec_superbrick,3)
+            if (NPROCTOT > 1) iMPIcut_eta(1,ispec_loc) = iboun_sb(ispec_superbrick,3)
             if (iproc_eta == 0) iboun(3,ispec_loc)= iboun_sb(ispec_superbrick,3)
         endif
         if (iy_elem == (NEX_PER_PROC_ETA-step_mult*ratio_sampling_array(ilayer)+1)) then
-            iMPIcut_eta(2,ispec_loc) = iboun_sb(ispec_superbrick,4)
+            if (NPROCTOT > 1) iMPIcut_eta(2,ispec_loc) = iboun_sb(ispec_superbrick,4)
             if (iproc_eta == NPROC_ETA-1) iboun(4,ispec_loc)= iboun_sb(ispec_superbrick,4)
         endif
         ! zmax only
diff --git a/src/meshfem3D/create_regions_mesh.F90 b/src/meshfem3D/create_regions_mesh.F90
index 6fcda0876..59fb8235b 100644
--- a/src/meshfem3D/create_regions_mesh.F90
+++ b/src/meshfem3D/create_regions_mesh.F90
@@ -809,7 +809,6 @@ subroutine crm_allocate_arrays(ipass, &
   allocate(iMPIcut_xi(2,nspec), &
            iMPIcut_eta(2,nspec),stat=ier)
   if (ier /= 0) stop 'Error in allocate 15'
-
   iMPIcut_xi(:,:) = .false.; iMPIcut_eta(:,:) = .false.
 
   ! MPI buffer indices
@@ -1267,7 +1266,8 @@ subroutine crm_setup_mpi_buffers(npointot)
     xstore,ystore,zstore, &
     NSPEC1D_RADIAL_CORNER,NGLOB1D_RADIAL_CORNER, &
     NSPEC2D_XI_FACE,NSPEC2D_ETA_FACE, &
-    NGLOB2DMAX_XMIN_XMAX,NGLOB2DMAX_YMIN_YMAX
+    NGLOB2DMAX_XMIN_XMAX,NGLOB2DMAX_YMIN_YMAX, &
+    NPROCTOT
 
   use regions_mesh_par2, only: prname,iMPIcut_xi,iMPIcut_eta
 
@@ -1290,7 +1290,7 @@ subroutine crm_setup_mpi_buffers(npointot)
   iboolright_xi(:) = 0
   iboolright_eta(:) = 0
 
-  if (nspec > 0) then
+  if (nspec > 0 .and. NPROCTOT > 1) then
     ! arrays mask_ibool(npointot) used to save memory
     ! allocate memory for arrays
     allocate(mask_ibool(npointot),stat=ier)
diff --git a/src/meshfem3D/create_regular_elements.f90 b/src/meshfem3D/create_regular_elements.f90
index 1de56f7b8..15cfba6e6 100644
--- a/src/meshfem3D/create_regular_elements.f90
+++ b/src/meshfem3D/create_regular_elements.f90
@@ -49,7 +49,7 @@ subroutine create_regular_elements(ilayer,ichunk,ispec_count,ipass, &
   use constants, only: myrank,NDIM,CUSTOM_REAL,NGLLX,NGLLY,NGNOD,NGNOD_EIGHT_CORNERS,SUPPRESS_CRUSTAL_MESH, &
     SAVE_BOUNDARY_MESH,IREGION_CRUST_MANTLE,IMAIN
 
-  use shared_parameters, only: ner_mesh_layers,ratio_sampling_array,doubling_index,r_bottom,r_top
+  use shared_parameters, only: ner_mesh_layers,ratio_sampling_array,doubling_index,r_bottom,r_top,NPROCTOT
 
   use meshfem_models_par, only: HONOR_1D_SPHERICAL_MOHO,CASE_3D
 
@@ -240,20 +240,20 @@ subroutine create_regular_elements(ilayer,ichunk,ispec_count,ipass, &
         ! new get_flag_boundaries
         ! xmin & xmax
         if (ix_elem == 1) then
-          iMPIcut_xi(1,ispec_loc) = .true.
+          if (NPROCTOT > 1) iMPIcut_xi(1,ispec_loc) = .true.
           if (iproc_xi == 0) iboun(1,ispec_loc) = .true.
         endif
         if (ix_elem == (NEX_PER_PROC_XI-ratio_sampling_array(ilayer)+1)) then
-          iMPIcut_xi(2,ispec_loc) = .true.
+          if (NPROCTOT > 1) iMPIcut_xi(2,ispec_loc) = .true.
           if (iproc_xi == NPROC_XI-1) iboun(2,ispec_loc) = .true.
         endif
         ! ymin & ymax
         if (iy_elem == 1) then
-          iMPIcut_eta(1,ispec_loc) = .true.
+          if (NPROCTOT > 1) iMPIcut_eta(1,ispec_loc) = .true.
           if (iproc_eta == 0) iboun(3,ispec_loc) = .true.
         endif
         if (iy_elem == (NEX_PER_PROC_ETA-ratio_sampling_array(ilayer)+1)) then
-          iMPIcut_eta(2,ispec_loc) = .true.
+          if (NPROCTOT > 1) iMPIcut_eta(2,ispec_loc) = .true.
           if (iproc_eta == NPROC_ETA-1) iboun(4,ispec_loc) = .true.
         endif
         ! zmin & zmax
diff --git a/src/meshfem3D/fix_non_blocking_flags.f90 b/src/meshfem3D/fix_non_blocking_flags.f90
index c33e8a2f3..dfef9665a 100644
--- a/src/meshfem3D/fix_non_blocking_flags.f90
+++ b/src/meshfem3D/fix_non_blocking_flags.f90
@@ -37,27 +37,27 @@ subroutine fix_non_blocking_slices(is_on_a_slice_edge,iboolright_xi, &
 
   implicit none
 
-  integer :: nspec,nglob,NGLOB2DMAX_XMIN_XMAX,NGLOB2DMAX_YMIN_YMAX
+  integer, intent(in) :: nspec,nglob,NGLOB2DMAX_XMIN_XMAX,NGLOB2DMAX_YMIN_YMAX
 
-  integer, dimension(NB_SQUARE_EDGES_ONEDIR) :: npoin2D_xi,npoin2D_eta
+  integer, dimension(NB_SQUARE_EDGES_ONEDIR), intent(in) :: npoin2D_xi,npoin2D_eta
 
-  logical, dimension(nspec) :: is_on_a_slice_edge
+  logical, dimension(nspec), intent(inout) :: is_on_a_slice_edge
 
-  integer, dimension(NGLOB2DMAX_XMIN_XMAX) :: iboolleft_xi,iboolright_xi
-  integer, dimension(NGLOB2DMAX_YMIN_YMAX) :: iboolleft_eta,iboolright_eta
+  integer, dimension(NGLOB2DMAX_XMIN_XMAX), intent(in) :: iboolleft_xi,iboolright_xi
+  integer, dimension(NGLOB2DMAX_YMIN_YMAX), intent(in) :: iboolleft_eta,iboolright_eta
 
-  integer, dimension(NGLLX,NGLLY,NGLLZ,nspec) :: ibool
+  integer, dimension(NGLLX,NGLLY,NGLLZ,nspec), intent(in) :: ibool
 
   ! local parameters
   logical, dimension(:),allocatable :: mask_ibool
   integer :: ipoin,ispec,i,j,k,ier
 
-! clean the mask
+  ! clean the mask
   allocate(mask_ibool(nglob),stat=ier)
   if (ier /= 0) stop 'Error allocating mask in fix_non_blocking_slices routine'
   mask_ibool(:) = .false.
 
-! mark all the points that are in the MPI buffers to assemble inside each chunk
+  ! mark all the points that are in the MPI buffers to assemble inside each chunk
   do ipoin = 1,npoin2D_xi(1)
     mask_ibool(iboolleft_xi(ipoin)) = .true.
   enddo
@@ -74,11 +74,11 @@ subroutine fix_non_blocking_slices(is_on_a_slice_edge,iboolright_xi, &
     mask_ibool(iboolright_eta(ipoin)) = .true.
   enddo
 
-! now label all the elements that have at least one corner belonging
-! to any of these buffers as elements that must contribute to the
-! first step of the calculations (performed on the edges before starting
-! the non blocking communications); there is no need to examine the inside
-! of the elements, checking their eight corners is sufficient
+  ! now label all the elements that have at least one corner belonging
+  ! to any of these buffers as elements that must contribute to the
+  ! first step of the calculations (performed on the edges before starting
+  ! the non blocking communications); there is no need to examine the inside
+  ! of the elements, checking their eight corners is sufficient
   do ispec = 1,nspec
     do k = 1,NGLLZ,NGLLZ-1
       do j = 1,NGLLY,NGLLY-1
@@ -114,18 +114,18 @@ subroutine fix_non_blocking_central_cube(is_on_a_slice_edge, &
 
   implicit none
 
-  integer,intent(in) :: nspec,nglob,nb_msgs_theor_in_cube,NSPEC2D_BOTTOM_INNER_CORE
-  integer,intent(in) :: ichunk,npoin2D_cube_from_slices,NPROC_XI
+  integer, intent(in) :: nspec,nglob,nb_msgs_theor_in_cube,NSPEC2D_BOTTOM_INNER_CORE
+  integer, intent(in) :: ichunk,npoin2D_cube_from_slices,NPROC_XI
 
-  logical, dimension(nspec),intent(inout) :: is_on_a_slice_edge
+  logical, dimension(nspec), intent(inout) :: is_on_a_slice_edge
 
-  integer, dimension(NGLLX,NGLLY,NGLLZ,nspec),intent(in) :: ibool
+  integer, dimension(NGLLX,NGLLY,NGLLZ,nspec), intent(in) :: ibool
 
-  integer, dimension(nb_msgs_theor_in_cube,npoin2D_cube_from_slices),intent(in) :: ibool_central_cube
+  integer, dimension(nb_msgs_theor_in_cube,npoin2D_cube_from_slices), intent(in) :: ibool_central_cube
 
-  integer, dimension(NSPEC2D_BOTTOM_INNER_CORE),intent(in) :: ibelm_bottom_inner_core
+  integer, dimension(NSPEC2D_BOTTOM_INNER_CORE), intent(in) :: ibelm_bottom_inner_core
 
-  integer, dimension(nspec),intent(in) :: idoubling_inner_core
+  integer, dimension(nspec), intent(in) :: idoubling_inner_core
 
   ! local parameters
   logical, dimension(:),allocatable :: mask_ibool
diff --git a/src/meshfem3D/get_MPI_1D_buffers.f90 b/src/meshfem3D/get_MPI_1D_buffers.f90
index 94207b1e8..34186dbfb 100644
--- a/src/meshfem3D/get_MPI_1D_buffers.f90
+++ b/src/meshfem3D/get_MPI_1D_buffers.f90
@@ -243,9 +243,7 @@ subroutine get_MPI_1D_buffers(prname,nspec,iMPIcut_xi,iMPIcut_eta, &
 
     ! corner detection here
     if (iMPIcut_xi(1,ispec) .and. iMPIcut_eta(2,ispec)) then
-
       ispeccount = ispeccount + 1
-
       ! loop on all the points
       ix = 1
       iy = NGLLY
@@ -310,14 +308,11 @@ subroutine get_MPI_1D_buffers(prname,nspec,iMPIcut_xi,iMPIcut_eta, &
 
     ! corner detection here
     if (iMPIcut_xi(2,ispec) .and. iMPIcut_eta(2,ispec)) then
-
       ispeccount = ispeccount + 1
-
       ! loop on all the points
       ix = NGLLX
       iy = NGLLY
       do iz = 1,NGLLZ
-
         ! select point, if not already selected
         if (.not. mask_ibool(ibool(ix,iy,iz,ispec))) then
           mask_ibool(ibool(ix,iy,iz,ispec)) = .true.
diff --git a/src/meshfem3D/get_MPI_cutplanes_xi.f90 b/src/meshfem3D/get_MPI_cutplanes_xi.f90
index 4684f1760..bd3d93eac 100644
--- a/src/meshfem3D/get_MPI_cutplanes_xi.f90
+++ b/src/meshfem3D/get_MPI_cutplanes_xi.f90
@@ -125,24 +125,24 @@ subroutine get_MPI_cutplanes_xi(prname,nspec,iMPIcut_xi,ibool, &
       ! loop on all the points in that 2-D element, including edges
       ix = 1
       do iy = 1,NGLLY
-          do iz = 1,NGLLZ
-            ! select point, if not already selected
-            if (.not. mask_ibool(ibool(ix,iy,iz,ispec))) then
-              mask_ibool(ibool(ix,iy,iz,ispec)) = .true.
-              npoin2D_xi = npoin2D_xi + 1
-
-              ! fills buffer arrays
-              iboolleft_xi(npoin2D_xi) = ibool(ix,iy,iz,ispec)
-
-              npoin2D_xi_all(1) = npoin2D_xi_all(1) + 1
-
-              ! debug file output
-              if (DEBUG) then
-                write(IOUT,*) ibool(ix,iy,iz,ispec), xstore(ix,iy,iz,ispec), &
-                              ystore(ix,iy,iz,ispec),zstore(ix,iy,iz,ispec)
-              endif
+        do iz = 1,NGLLZ
+          ! select point, if not already selected
+          if (.not. mask_ibool(ibool(ix,iy,iz,ispec))) then
+            mask_ibool(ibool(ix,iy,iz,ispec)) = .true.
+            npoin2D_xi = npoin2D_xi + 1
+
+            ! fills buffer arrays
+            iboolleft_xi(npoin2D_xi) = ibool(ix,iy,iz,ispec)
+
+            npoin2D_xi_all(1) = npoin2D_xi_all(1) + 1
+
+            ! debug file output
+            if (DEBUG) then
+              write(IOUT,*) ibool(ix,iy,iz,ispec), xstore(ix,iy,iz,ispec), &
+                            ystore(ix,iy,iz,ispec),zstore(ix,iy,iz,ispec)
             endif
-          enddo
+          endif
+        enddo
       enddo
     endif
   enddo
diff --git a/src/meshfem3D/get_jacobian_boundaries.f90 b/src/meshfem3D/get_jacobian_boundaries.f90
index 2f61c4730..c85ba40b7 100644
--- a/src/meshfem3D/get_jacobian_boundaries.f90
+++ b/src/meshfem3D/get_jacobian_boundaries.f90
@@ -109,8 +109,8 @@ subroutine get_jacobian_boundaries(iboun,xstore,ystore,zstore, &
 
     if (iboun(1,ispec)) then
 
-      ispecb1=ispecb1+1
-      ibelm_xmin(ispecb1)=ispec
+      ispecb1 = ispecb1+1
+      ibelm_xmin(ispecb1) = ispec
 
       if (.not. USE_GLL) then
           !   specify the 9 nodes for the 2-D boundary element
@@ -164,8 +164,8 @@ subroutine get_jacobian_boundaries(iboun,xstore,ystore,zstore, &
 
     if (iboun(2,ispec)) then
 
-      ispecb2=ispecb2+1
-      ibelm_xmax(ispecb2)=ispec
+      ispecb2 = ispecb2+1
+      ibelm_xmax(ispecb2) = ispec
 
       if (.not. USE_GLL) then
           !   specify the 9 nodes for the 2-D boundary element
@@ -220,8 +220,8 @@ subroutine get_jacobian_boundaries(iboun,xstore,ystore,zstore, &
 
     if (iboun(3,ispec)) then
 
-      ispecb3=ispecb3+1
-      ibelm_ymin(ispecb3)=ispec
+      ispecb3 = ispecb3+1
+      ibelm_ymin(ispecb3) = ispec
 
       if (.not. USE_GLL) then
           !   specify the 9 nodes for the 2-D boundary element
@@ -276,8 +276,8 @@ subroutine get_jacobian_boundaries(iboun,xstore,ystore,zstore, &
 
     if (iboun(4,ispec)) then
 
-      ispecb4=ispecb4+1
-      ibelm_ymax(ispecb4)=ispec
+      ispecb4 = ispecb4+1
+      ibelm_ymax(ispecb4) = ispec
 
       if (.not. USE_GLL) then
           !   specify the 9 nodes for the 2-D boundary element
@@ -332,8 +332,8 @@ subroutine get_jacobian_boundaries(iboun,xstore,ystore,zstore, &
 
     if (iboun(5,ispec)) then
 
-      ispecb5=ispecb5+1
-      ibelm_bottom(ispecb5)=ispec
+      ispecb5 = ispecb5+1
+      ibelm_bottom(ispecb5) = ispec
 
       if (.not. USE_GLL) then
           xelm(1)=xstore(1,1,1,ispec)
@@ -388,8 +388,8 @@ subroutine get_jacobian_boundaries(iboun,xstore,ystore,zstore, &
 
     if (iboun(6,ispec)) then
 
-      ispecb6=ispecb6+1
-      ibelm_top(ispecb6)=ispec
+      ispecb6 = ispecb6+1
+      ibelm_top(ispecb6) = ispec
 
       if (.not. USE_GLL) then
           xelm(1)=xstore(1,1,NGLLZ,ispec)
diff --git a/src/meshfem3D/meshfem3D_models.F90 b/src/meshfem3D/meshfem3D_models.F90
index 242b4b02b..1b702d5ac 100644
--- a/src/meshfem3D/meshfem3D_models.F90
+++ b/src/meshfem3D/meshfem3D_models.F90
@@ -1548,7 +1548,7 @@ subroutine meshfem3D_models_getatten_val(idoubling,r_prem,theta,phi, &
   use model_prem_par, only: PREM_RMOHO
 
   use shared_parameters, only: ABSORB_USING_GLOBAL_SPONGE, &
-          SPONGE_LATITUDE_IN_DEGREES,SPONGE_LONGITUDE_IN_DEGREES,SPONGE_RADIUS_IN_DEGREES
+    SPONGE_LATITUDE_IN_DEGREES,SPONGE_LONGITUDE_IN_DEGREES,SPONGE_RADIUS_IN_DEGREES
 
   implicit none
 
diff --git a/src/meshfem3D/model_EMC.f90 b/src/meshfem3D/model_EMC.f90
index 4e62e3df9..8ab7ef3f4 100644
--- a/src/meshfem3D/model_EMC.f90
+++ b/src/meshfem3D/model_EMC.f90
@@ -98,7 +98,7 @@ module model_emc_par
   !       or if partial EMC models are just not supported.
   !
   !       by default, we will use Brocher scaling relations:
-  !       Brocher 2005, Empirical Relations between Elastic Wavespeeds and Density in the Earth’s Crust, BSSA
+  !       Brocher 2005, Empirical Relations between Elastic Wavespeeds and Density in the Earth's Crust, BSSA
 
   !       note that these scaling relations also would have limitations (range of vs, etc.) in their applicability,
   !       which we ignore for simplicity now.
@@ -114,7 +114,7 @@ module model_emc_par
   ! (see also Qin et al. 2009, Reliability of mantle tomography models assessed by spectral element simulation, sec. 5.2)
   !double precision, parameter :: SCALE_RHO = 0.40d0
   ! SCEC CVM-H version model relationship: https://strike.scec.org/scecpedia/CVM-H,
-  ! uses Brocher 2005 (Empirical Relations between Elastic Wavespeeds and Density in the Earth’s Crust, BSSA) for sediments
+  ! uses Brocher 2005 (Empirical Relations between Elastic Wavespeeds and Density in the Earth's Crust, BSSA) for sediments
   !double precision, parameter :: SCALE_RHO = 0.254d0
   !
   !! other factors to convert (perturbations in) shear speed to (perturbations in) Vp
@@ -631,7 +631,7 @@ subroutine scale_Brocher_rho_from_vp()
   double precision :: vp,vp_p2,vp_p3,vp_p4,vp_p5,rho
   integer :: ix,iy,iz,nx,ny,nz
 
-  ! Brocher 2005, Empirical Relations between Elastic Wavespeeds and Density in the Earth’s Crust, BSSA
+  ! Brocher 2005, Empirical Relations between Elastic Wavespeeds and Density in the Earth's Crust, BSSA
   ! factors from eq. (1)
   double precision,parameter :: fac1 = 1.6612d0
   double precision,parameter :: fac2 = -0.4721d0
@@ -692,7 +692,7 @@ subroutine scale_Brocher_vs_from_vp()
   double precision :: vp,vp_p2,vp_p3,vp_p4,vs
   integer :: ix,iy,iz,nx,ny,nz
 
-  ! Brocher 2005, Empirical Relations between Elastic Wavespeeds and Density in the Earth’s Crust, BSSA
+  ! Brocher 2005, Empirical Relations between Elastic Wavespeeds and Density in the Earth's Crust, BSSA
   ! factors from eq. (1)
   double precision,parameter :: fac1 = 0.7858d0
   double precision,parameter :: fac2 = -1.2344d0
@@ -763,7 +763,7 @@ subroutine scale_Brocher_vp_from_vs()
   double precision :: vs,vs_p2,vs_p3,vs_p4,vp
   integer :: ix,iy,iz,nx,ny,nz
 
-  ! Brocher 2005, Empirical Relations between Elastic Wavespeeds and Density in the Earth’s Crust, BSSA
+  ! Brocher 2005, Empirical Relations between Elastic Wavespeeds and Density in the Earth's Crust, BSSA
   ! factors from eq. (1)
   double precision,parameter :: fac1 = 0.9409d0
   double precision,parameter :: fac2 = 2.0947d0
diff --git a/src/meshfem3D/setup_inner_outer.f90 b/src/meshfem3D/setup_inner_outer.f90
index f8cf49a97..d52dd7528 100644
--- a/src/meshfem3D/setup_inner_outer.f90
+++ b/src/meshfem3D/setup_inner_outer.f90
@@ -30,7 +30,8 @@ subroutine setup_inner_outer(iregion_code)
 
   use meshfem_par, only: &
     myrank,OUTPUT_FILES,IMAIN, &
-    IREGION_CRUST_MANTLE,IREGION_OUTER_CORE,IREGION_INNER_CORE,MAX_STRING_LEN
+    IREGION_CRUST_MANTLE,IREGION_OUTER_CORE,IREGION_INNER_CORE,MAX_STRING_LEN, &
+    NPROCTOT
 
   use meshfem_par, only: ibool,is_on_a_slice_edge,xstore_glob,ystore_glob,zstore_glob
 
@@ -38,9 +39,6 @@ subroutine setup_inner_outer(iregion_code)
   use MPI_outer_core_par
   use MPI_inner_core_par
 
-!debug
-!  use shared_parameters, only: NPROCTOT
-
   implicit none
 
   integer,intent(in) :: iregion_code
@@ -59,7 +57,11 @@ subroutine setup_inner_outer(iregion_code)
   select case (iregion_code)
   case (IREGION_CRUST_MANTLE)
     ! crust_mantle
-    nspec_outer_crust_mantle = count( is_on_a_slice_edge )
+    if (NPROCTOT > 1) then
+      nspec_outer_crust_mantle = count( is_on_a_slice_edge )
+    else
+      nspec_outer_crust_mantle = 0
+    endif
     nspec_inner_crust_mantle = NSPEC_CRUST_MANTLE - nspec_outer_crust_mantle
 
     num_phase_ispec_crust_mantle = max(nspec_inner_crust_mantle,nspec_outer_crust_mantle)
@@ -71,7 +73,7 @@ subroutine setup_inner_outer(iregion_code)
     iinner = 0
     iouter = 0
     do ispec = 1,NSPEC_CRUST_MANTLE
-      if (is_on_a_slice_edge(ispec)) then
+      if (is_on_a_slice_edge(ispec) .and. NPROCTOT > 1) then
         ! outer element
         iouter = iouter + 1
         phase_ispec_inner_crust_mantle(iouter,1) = ispec
@@ -113,7 +115,11 @@ subroutine setup_inner_outer(iregion_code)
 
   case (IREGION_OUTER_CORE)
     ! outer_core
-    nspec_outer_outer_core = count( is_on_a_slice_edge )
+    if (NPROCTOT > 1) then
+      nspec_outer_outer_core = count( is_on_a_slice_edge )
+    else
+      nspec_outer_outer_core = 0
+    endif
     nspec_inner_outer_core = NSPEC_OUTER_CORE - nspec_outer_outer_core
 
     num_phase_ispec_outer_core = max(nspec_inner_outer_core,nspec_outer_outer_core)
@@ -125,7 +131,7 @@ subroutine setup_inner_outer(iregion_code)
     iinner = 0
     iouter = 0
     do ispec = 1,NSPEC_OUTER_CORE
-      if (is_on_a_slice_edge(ispec)) then
+      if (is_on_a_slice_edge(ispec) .and. NPROCTOT > 1) then
         ! outer element
         iouter = iouter + 1
         phase_ispec_inner_outer_core(iouter,1) = ispec
@@ -154,7 +160,11 @@ subroutine setup_inner_outer(iregion_code)
 
   case (IREGION_INNER_CORE)
     ! inner_core
-    nspec_outer_inner_core = count( is_on_a_slice_edge )
+    if (NPROCTOT > 1) then
+      nspec_outer_inner_core = count( is_on_a_slice_edge )
+    else
+      nspec_outer_inner_core = 0
+    endif
     nspec_inner_inner_core = NSPEC_INNER_CORE - nspec_outer_inner_core
 
     num_phase_ispec_inner_core = max(nspec_inner_inner_core,nspec_outer_inner_core)
@@ -166,7 +176,7 @@ subroutine setup_inner_outer(iregion_code)
     iinner = 0
     iouter = 0
     do ispec = 1,NSPEC_INNER_CORE
-      if (is_on_a_slice_edge(ispec)) then
+      if (is_on_a_slice_edge(ispec) .and. NPROCTOT > 1) then
         ! outer element
         iouter = iouter + 1
         phase_ispec_inner_inner_core(iouter,1) = ispec
diff --git a/src/meshfem3D/write_AVS_DX_global_faces_data.f90 b/src/meshfem3D/write_AVS_DX_global_faces_data.f90
index d32269db0..44a51cd85 100644
--- a/src/meshfem3D/write_AVS_DX_global_faces_data.f90
+++ b/src/meshfem3D/write_AVS_DX_global_faces_data.f90
@@ -90,364 +90,366 @@ subroutine write_AVS_DX_global_faces_data(prname,nspec,iMPIcut_xi,iMPIcut_eta, &
   ! processor identification
   character(len=MAX_STRING_LEN) :: prname
 
-! writing points
+  ! writing points
   open(unit=IOUT,file=prname(1:len_trim(prname))//'AVS_DXpointsfaces.txt',status='unknown')
 
-! erase the logical mask used to mark points already found
+  ! erase the logical mask used to mark points already found
   mask_ibool(:) = .false.
 
   nspecface = 0
 
-! mark global AVS or DX points
+  ! mark global AVS or DX points
   do ispec = 1,nspec
-! only if on face
-  if (iMPIcut_xi(1,ispec) .or. iMPIcut_xi(2,ispec) .or. &
-              iMPIcut_eta(1,ispec) .or. iMPIcut_eta(2,ispec)) then
-    iglob1=ibool(1,1,1,ispec)
-    iglob2=ibool(NGLLX,1,1,ispec)
-    iglob3=ibool(NGLLX,NGLLY,1,ispec)
-    iglob4=ibool(1,NGLLY,1,ispec)
-    iglob5=ibool(1,1,NGLLZ,ispec)
-    iglob6=ibool(NGLLX,1,NGLLZ,ispec)
-    iglob7=ibool(NGLLX,NGLLY,NGLLZ,ispec)
-    iglob8=ibool(1,NGLLY,NGLLZ,ispec)
-
-! face xi = xi_min
-  if (iMPIcut_xi(1,ispec)) then
-    nspecface = nspecface + 1
-    mask_ibool(iglob1) = .true.
-    mask_ibool(iglob4) = .true.
-    mask_ibool(iglob8) = .true.
-    mask_ibool(iglob5) = .true.
-  endif
-
-! face xi = xi_max
-  if (iMPIcut_xi(2,ispec)) then
-    nspecface = nspecface + 1
-    mask_ibool(iglob2) = .true.
-    mask_ibool(iglob3) = .true.
-    mask_ibool(iglob7) = .true.
-    mask_ibool(iglob6) = .true.
-  endif
-
-! face eta = eta_min
-  if (iMPIcut_eta(1,ispec)) then
-    nspecface = nspecface + 1
-    mask_ibool(iglob1) = .true.
-    mask_ibool(iglob2) = .true.
-    mask_ibool(iglob6) = .true.
-    mask_ibool(iglob5) = .true.
-  endif
-
-! face eta = eta_max
-  if (iMPIcut_eta(2,ispec)) then
-    nspecface = nspecface + 1
-    mask_ibool(iglob4) = .true.
-    mask_ibool(iglob3) = .true.
-    mask_ibool(iglob7) = .true.
-    mask_ibool(iglob8) = .true.
-  endif
-
-  endif
+    ! only if on face
+    if (iMPIcut_xi(1,ispec) .or. iMPIcut_xi(2,ispec) .or. &
+                iMPIcut_eta(1,ispec) .or. iMPIcut_eta(2,ispec)) then
+      iglob1=ibool(1,1,1,ispec)
+      iglob2=ibool(NGLLX,1,1,ispec)
+      iglob3=ibool(NGLLX,NGLLY,1,ispec)
+      iglob4=ibool(1,NGLLY,1,ispec)
+      iglob5=ibool(1,1,NGLLZ,ispec)
+      iglob6=ibool(NGLLX,1,NGLLZ,ispec)
+      iglob7=ibool(NGLLX,NGLLY,NGLLZ,ispec)
+      iglob8=ibool(1,NGLLY,NGLLZ,ispec)
+
+      ! face xi = xi_min
+      if (iMPIcut_xi(1,ispec)) then
+        nspecface = nspecface + 1
+        mask_ibool(iglob1) = .true.
+        mask_ibool(iglob4) = .true.
+        mask_ibool(iglob8) = .true.
+        mask_ibool(iglob5) = .true.
+      endif
+
+      ! face xi = xi_max
+      if (iMPIcut_xi(2,ispec)) then
+        nspecface = nspecface + 1
+        mask_ibool(iglob2) = .true.
+        mask_ibool(iglob3) = .true.
+        mask_ibool(iglob7) = .true.
+        mask_ibool(iglob6) = .true.
+      endif
+
+      ! face eta = eta_min
+      if (iMPIcut_eta(1,ispec)) then
+        nspecface = nspecface + 1
+        mask_ibool(iglob1) = .true.
+        mask_ibool(iglob2) = .true.
+        mask_ibool(iglob6) = .true.
+        mask_ibool(iglob5) = .true.
+      endif
+
+      ! face eta = eta_max
+      if (iMPIcut_eta(2,ispec)) then
+        nspecface = nspecface + 1
+        mask_ibool(iglob4) = .true.
+        mask_ibool(iglob3) = .true.
+        mask_ibool(iglob7) = .true.
+        mask_ibool(iglob8) = .true.
+      endif
+
+    endif
   enddo
 
-! count global number of AVS or DX points
+  ! count global number of AVS or DX points
   npoin = count(mask_ibool(:))
 
-! number of points in AVS or DX file
+  ! number of points in AVS or DX file
   write(IOUT,*) npoin
 
-! erase the logical mask used to mark points already found
+  ! erase the logical mask used to mark points already found
   mask_ibool(:) = .false.
 
-! output global AVS or DX points
+  ! output global AVS or DX points
   numpoin = 0
   do ispec = 1,nspec
-! only if on face
-  if (iMPIcut_xi(1,ispec) .or. iMPIcut_xi(2,ispec) .or. &
-              iMPIcut_eta(1,ispec) .or. iMPIcut_eta(2,ispec)) then
-    iglob1=ibool(1,1,1,ispec)
-    iglob2=ibool(NGLLX,1,1,ispec)
-    iglob3=ibool(NGLLX,NGLLY,1,ispec)
-    iglob4=ibool(1,NGLLY,1,ispec)
-    iglob5=ibool(1,1,NGLLZ,ispec)
-    iglob6=ibool(NGLLX,1,NGLLZ,ispec)
-    iglob7=ibool(NGLLX,NGLLY,NGLLZ,ispec)
-    iglob8=ibool(1,NGLLY,NGLLZ,ispec)
-
-! face xi = xi_min
-  if (iMPIcut_xi(1,ispec)) then
-    if (.not. mask_ibool(iglob1)) then
-      numpoin = numpoin + 1
-      num_ibool_AVS_DX(iglob1) = numpoin
-      write(IOUT,*) numpoin,sngl(xstore(1,1,1,ispec)), &
-                    sngl(ystore(1,1,1,ispec)),sngl(zstore(1,1,1,ispec))
-    endif
-    if (.not. mask_ibool(iglob4)) then
-      numpoin = numpoin + 1
-      num_ibool_AVS_DX(iglob4) = numpoin
-      write(IOUT,*) numpoin,sngl(xstore(1,NGLLY,1,ispec)), &
-                    sngl(ystore(1,NGLLY,1,ispec)),sngl(zstore(1,NGLLY,1,ispec))
-    endif
-    if (.not. mask_ibool(iglob8)) then
-      numpoin = numpoin + 1
-      num_ibool_AVS_DX(iglob8) = numpoin
-      write(IOUT,*) numpoin,sngl(xstore(1,NGLLY,NGLLZ,ispec)), &
-                    sngl(ystore(1,NGLLY,NGLLZ,ispec)),sngl(zstore(1,NGLLY,NGLLZ,ispec))
-    endif
-    if (.not. mask_ibool(iglob5)) then
-      numpoin = numpoin + 1
-      num_ibool_AVS_DX(iglob5) = numpoin
-      write(IOUT,*) numpoin,sngl(xstore(1,1,NGLLZ,ispec)), &
-                    sngl(ystore(1,1,NGLLZ,ispec)),sngl(zstore(1,1,NGLLZ,ispec))
-    endif
-    mask_ibool(iglob1) = .true.
-    mask_ibool(iglob4) = .true.
-    mask_ibool(iglob8) = .true.
-    mask_ibool(iglob5) = .true.
-  endif
-
-! face xi = xi_max
-  if (iMPIcut_xi(2,ispec)) then
-    if (.not. mask_ibool(iglob2)) then
-      numpoin = numpoin + 1
-      num_ibool_AVS_DX(iglob2) = numpoin
-      write(IOUT,*) numpoin,sngl(xstore(NGLLX,1,1,ispec)), &
-                    sngl(ystore(NGLLX,1,1,ispec)),sngl(zstore(NGLLX,1,1,ispec))
-    endif
-    if (.not. mask_ibool(iglob3)) then
-      numpoin = numpoin + 1
-      num_ibool_AVS_DX(iglob3) = numpoin
-      write(IOUT,*) numpoin,sngl(xstore(NGLLX,NGLLY,1,ispec)), &
-                    sngl(ystore(NGLLX,NGLLY,1,ispec)),sngl(zstore(NGLLX,NGLLY,1,ispec))
-    endif
-    if (.not. mask_ibool(iglob7)) then
-      numpoin = numpoin + 1
-      num_ibool_AVS_DX(iglob7) = numpoin
-      write(IOUT,*) numpoin,sngl(xstore(NGLLX,NGLLY,NGLLZ,ispec)), &
-                    sngl(ystore(NGLLX,NGLLY,NGLLZ,ispec)),sngl(zstore(NGLLX,NGLLY,NGLLZ,ispec))
-    endif
-    if (.not. mask_ibool(iglob6)) then
-      numpoin = numpoin + 1
-      num_ibool_AVS_DX(iglob6) = numpoin
-      write(IOUT,*) numpoin,sngl(xstore(NGLLX,1,NGLLZ,ispec)), &
-                    sngl(ystore(NGLLX,1,NGLLZ,ispec)),sngl(zstore(NGLLX,1,NGLLZ,ispec))
-    endif
-    mask_ibool(iglob2) = .true.
-    mask_ibool(iglob3) = .true.
-    mask_ibool(iglob7) = .true.
-    mask_ibool(iglob6) = .true.
-  endif
-
-! face eta = eta_min
-  if (iMPIcut_eta(1,ispec)) then
-    if (.not. mask_ibool(iglob1)) then
-      numpoin = numpoin + 1
-      num_ibool_AVS_DX(iglob1) = numpoin
-      write(IOUT,*) numpoin,sngl(xstore(1,1,1,ispec)), &
-                    sngl(ystore(1,1,1,ispec)),sngl(zstore(1,1,1,ispec))
-    endif
-    if (.not. mask_ibool(iglob2)) then
-      numpoin = numpoin + 1
-      num_ibool_AVS_DX(iglob2) = numpoin
-      write(IOUT,*) numpoin,sngl(xstore(NGLLX,1,1,ispec)), &
-                    sngl(ystore(NGLLX,1,1,ispec)),sngl(zstore(NGLLX,1,1,ispec))
-    endif
-    if (.not. mask_ibool(iglob6)) then
-      numpoin = numpoin + 1
-      num_ibool_AVS_DX(iglob6) = numpoin
-      write(IOUT,*) numpoin,sngl(xstore(NGLLX,1,NGLLZ,ispec)), &
-                    sngl(ystore(NGLLX,1,NGLLZ,ispec)),sngl(zstore(NGLLX,1,NGLLZ,ispec))
-    endif
-    if (.not. mask_ibool(iglob5)) then
-      numpoin = numpoin + 1
-      num_ibool_AVS_DX(iglob5) = numpoin
-      write(IOUT,*) numpoin,sngl(xstore(1,1,NGLLZ,ispec)), &
-                    sngl(ystore(1,1,NGLLZ,ispec)),sngl(zstore(1,1,NGLLZ,ispec))
-    endif
-    mask_ibool(iglob1) = .true.
-    mask_ibool(iglob2) = .true.
-    mask_ibool(iglob6) = .true.
-    mask_ibool(iglob5) = .true.
-  endif
-
-! face eta = eta_max
-  if (iMPIcut_eta(2,ispec)) then
-    if (.not. mask_ibool(iglob4)) then
-      numpoin = numpoin + 1
-      num_ibool_AVS_DX(iglob4) = numpoin
-      write(IOUT,*) numpoin,sngl(xstore(1,NGLLY,1,ispec)), &
-                    sngl(ystore(1,NGLLY,1,ispec)),sngl(zstore(1,NGLLY,1,ispec))
-    endif
-    if (.not. mask_ibool(iglob3)) then
-      numpoin = numpoin + 1
-      num_ibool_AVS_DX(iglob3) = numpoin
-      write(IOUT,*) numpoin,sngl(xstore(NGLLX,NGLLY,1,ispec)), &
-                    sngl(ystore(NGLLX,NGLLY,1,ispec)),sngl(zstore(NGLLX,NGLLY,1,ispec))
-    endif
-    if (.not. mask_ibool(iglob7)) then
-      numpoin = numpoin + 1
-      num_ibool_AVS_DX(iglob7) = numpoin
-      write(IOUT,*) numpoin,sngl(xstore(NGLLX,NGLLY,NGLLZ,ispec)), &
-                    sngl(ystore(NGLLX,NGLLY,NGLLZ,ispec)),sngl(zstore(NGLLX,NGLLY,NGLLZ,ispec))
-    endif
-    if (.not. mask_ibool(iglob8)) then
-      numpoin = numpoin + 1
-      num_ibool_AVS_DX(iglob8) = numpoin
-      write(IOUT,*) numpoin,sngl(xstore(1,NGLLY,NGLLZ,ispec)), &
-                    sngl(ystore(1,NGLLY,NGLLZ,ispec)),sngl(zstore(1,NGLLY,NGLLZ,ispec))
-    endif
-    mask_ibool(iglob4) = .true.
-    mask_ibool(iglob3) = .true.
-    mask_ibool(iglob7) = .true.
-    mask_ibool(iglob8) = .true.
-  endif
+    ! only if on face
+    if (iMPIcut_xi(1,ispec) .or. iMPIcut_xi(2,ispec) .or. &
+                iMPIcut_eta(1,ispec) .or. iMPIcut_eta(2,ispec)) then
+      iglob1=ibool(1,1,1,ispec)
+      iglob2=ibool(NGLLX,1,1,ispec)
+      iglob3=ibool(NGLLX,NGLLY,1,ispec)
+      iglob4=ibool(1,NGLLY,1,ispec)
+      iglob5=ibool(1,1,NGLLZ,ispec)
+      iglob6=ibool(NGLLX,1,NGLLZ,ispec)
+      iglob7=ibool(NGLLX,NGLLY,NGLLZ,ispec)
+      iglob8=ibool(1,NGLLY,NGLLZ,ispec)
+
+      ! face xi = xi_min
+      if (iMPIcut_xi(1,ispec)) then
+        if (.not. mask_ibool(iglob1)) then
+          numpoin = numpoin + 1
+          num_ibool_AVS_DX(iglob1) = numpoin
+          write(IOUT,*) numpoin,sngl(xstore(1,1,1,ispec)), &
+                        sngl(ystore(1,1,1,ispec)),sngl(zstore(1,1,1,ispec))
+        endif
+        if (.not. mask_ibool(iglob4)) then
+          numpoin = numpoin + 1
+          num_ibool_AVS_DX(iglob4) = numpoin
+          write(IOUT,*) numpoin,sngl(xstore(1,NGLLY,1,ispec)), &
+                        sngl(ystore(1,NGLLY,1,ispec)),sngl(zstore(1,NGLLY,1,ispec))
+        endif
+        if (.not. mask_ibool(iglob8)) then
+          numpoin = numpoin + 1
+          num_ibool_AVS_DX(iglob8) = numpoin
+          write(IOUT,*) numpoin,sngl(xstore(1,NGLLY,NGLLZ,ispec)), &
+                        sngl(ystore(1,NGLLY,NGLLZ,ispec)),sngl(zstore(1,NGLLY,NGLLZ,ispec))
+        endif
+        if (.not. mask_ibool(iglob5)) then
+          numpoin = numpoin + 1
+          num_ibool_AVS_DX(iglob5) = numpoin
+          write(IOUT,*) numpoin,sngl(xstore(1,1,NGLLZ,ispec)), &
+                        sngl(ystore(1,1,NGLLZ,ispec)),sngl(zstore(1,1,NGLLZ,ispec))
+        endif
+        mask_ibool(iglob1) = .true.
+        mask_ibool(iglob4) = .true.
+        mask_ibool(iglob8) = .true.
+        mask_ibool(iglob5) = .true.
+      endif
+
+      ! face xi = xi_max
+      if (iMPIcut_xi(2,ispec)) then
+        if (.not. mask_ibool(iglob2)) then
+          numpoin = numpoin + 1
+          num_ibool_AVS_DX(iglob2) = numpoin
+          write(IOUT,*) numpoin,sngl(xstore(NGLLX,1,1,ispec)), &
+                        sngl(ystore(NGLLX,1,1,ispec)),sngl(zstore(NGLLX,1,1,ispec))
+        endif
+        if (.not. mask_ibool(iglob3)) then
+          numpoin = numpoin + 1
+          num_ibool_AVS_DX(iglob3) = numpoin
+          write(IOUT,*) numpoin,sngl(xstore(NGLLX,NGLLY,1,ispec)), &
+                        sngl(ystore(NGLLX,NGLLY,1,ispec)),sngl(zstore(NGLLX,NGLLY,1,ispec))
+        endif
+        if (.not. mask_ibool(iglob7)) then
+          numpoin = numpoin + 1
+          num_ibool_AVS_DX(iglob7) = numpoin
+          write(IOUT,*) numpoin,sngl(xstore(NGLLX,NGLLY,NGLLZ,ispec)), &
+                        sngl(ystore(NGLLX,NGLLY,NGLLZ,ispec)),sngl(zstore(NGLLX,NGLLY,NGLLZ,ispec))
+        endif
+        if (.not. mask_ibool(iglob6)) then
+          numpoin = numpoin + 1
+          num_ibool_AVS_DX(iglob6) = numpoin
+          write(IOUT,*) numpoin,sngl(xstore(NGLLX,1,NGLLZ,ispec)), &
+                        sngl(ystore(NGLLX,1,NGLLZ,ispec)),sngl(zstore(NGLLX,1,NGLLZ,ispec))
+        endif
+        mask_ibool(iglob2) = .true.
+        mask_ibool(iglob3) = .true.
+        mask_ibool(iglob7) = .true.
+        mask_ibool(iglob6) = .true.
+      endif
+
+      ! face eta = eta_min
+      if (iMPIcut_eta(1,ispec)) then
+        if (.not. mask_ibool(iglob1)) then
+          numpoin = numpoin + 1
+          num_ibool_AVS_DX(iglob1) = numpoin
+          write(IOUT,*) numpoin,sngl(xstore(1,1,1,ispec)), &
+                        sngl(ystore(1,1,1,ispec)),sngl(zstore(1,1,1,ispec))
+        endif
+        if (.not. mask_ibool(iglob2)) then
+          numpoin = numpoin + 1
+          num_ibool_AVS_DX(iglob2) = numpoin
+          write(IOUT,*) numpoin,sngl(xstore(NGLLX,1,1,ispec)), &
+                        sngl(ystore(NGLLX,1,1,ispec)),sngl(zstore(NGLLX,1,1,ispec))
+        endif
+        if (.not. mask_ibool(iglob6)) then
+          numpoin = numpoin + 1
+          num_ibool_AVS_DX(iglob6) = numpoin
+          write(IOUT,*) numpoin,sngl(xstore(NGLLX,1,NGLLZ,ispec)), &
+                        sngl(ystore(NGLLX,1,NGLLZ,ispec)),sngl(zstore(NGLLX,1,NGLLZ,ispec))
+        endif
+        if (.not. mask_ibool(iglob5)) then
+          numpoin = numpoin + 1
+          num_ibool_AVS_DX(iglob5) = numpoin
+          write(IOUT,*) numpoin,sngl(xstore(1,1,NGLLZ,ispec)), &
+                        sngl(ystore(1,1,NGLLZ,ispec)),sngl(zstore(1,1,NGLLZ,ispec))
+        endif
+        mask_ibool(iglob1) = .true.
+        mask_ibool(iglob2) = .true.
+        mask_ibool(iglob6) = .true.
+        mask_ibool(iglob5) = .true.
+      endif
+
+      ! face eta = eta_max
+      if (iMPIcut_eta(2,ispec)) then
+        if (.not. mask_ibool(iglob4)) then
+          numpoin = numpoin + 1
+          num_ibool_AVS_DX(iglob4) = numpoin
+          write(IOUT,*) numpoin,sngl(xstore(1,NGLLY,1,ispec)), &
+                        sngl(ystore(1,NGLLY,1,ispec)),sngl(zstore(1,NGLLY,1,ispec))
+        endif
+        if (.not. mask_ibool(iglob3)) then
+          numpoin = numpoin + 1
+          num_ibool_AVS_DX(iglob3) = numpoin
+          write(IOUT,*) numpoin,sngl(xstore(NGLLX,NGLLY,1,ispec)), &
+                        sngl(ystore(NGLLX,NGLLY,1,ispec)),sngl(zstore(NGLLX,NGLLY,1,ispec))
+        endif
+        if (.not. mask_ibool(iglob7)) then
+          numpoin = numpoin + 1
+          num_ibool_AVS_DX(iglob7) = numpoin
+          write(IOUT,*) numpoin,sngl(xstore(NGLLX,NGLLY,NGLLZ,ispec)), &
+                        sngl(ystore(NGLLX,NGLLY,NGLLZ,ispec)),sngl(zstore(NGLLX,NGLLY,NGLLZ,ispec))
+        endif
+        if (.not. mask_ibool(iglob8)) then
+          numpoin = numpoin + 1
+          num_ibool_AVS_DX(iglob8) = numpoin
+          write(IOUT,*) numpoin,sngl(xstore(1,NGLLY,NGLLZ,ispec)), &
+                        sngl(ystore(1,NGLLY,NGLLZ,ispec)),sngl(zstore(1,NGLLY,NGLLZ,ispec))
+        endif
+        mask_ibool(iglob4) = .true.
+        mask_ibool(iglob3) = .true.
+        mask_ibool(iglob7) = .true.
+        mask_ibool(iglob8) = .true.
+      endif
 
-  endif
+    endif
   enddo
 
-! check that number of global points output is okay
+  ! check that number of global points output is okay
   if (numpoin /= npoin) &
     call exit_MPI(myrank,'incorrect number of global points in AVS or DX file creation')
 
   close(IOUT)
 
-! output global AVS or DX elements
+  ! output global AVS or DX elements
 
-! writing elements
+  ! writing elements
   open(unit=IOUT,file=prname(1:len_trim(prname))//'AVS_DXelementsfaces.txt',status='unknown')
- if (MODEL_3D_MANTLE_PERTUBATIONS) &
+  if (MODEL_3D_MANTLE_PERTUBATIONS) &
     open(unit=11,file=prname(1:len_trim(prname))//'AVS_DXelementsfaces_dvp_dvs.txt',status='unknown')
 
-! number of elements in AVS or DX file
+  ! number of elements in AVS or DX file
   write(IOUT,*) nspecface
 
   ispecface = 0
   do ispec = 1,nspec
-! only if on face
-  if (iMPIcut_xi(1,ispec) .or. iMPIcut_xi(2,ispec) .or. &
-              iMPIcut_eta(1,ispec) .or. iMPIcut_eta(2,ispec)) then
-    iglob1=ibool(1,1,1,ispec)
-    iglob2=ibool(NGLLX,1,1,ispec)
-    iglob3=ibool(NGLLX,NGLLY,1,ispec)
-    iglob4=ibool(1,NGLLY,1,ispec)
-    iglob5=ibool(1,1,NGLLZ,ispec)
-    iglob6=ibool(NGLLX,1,NGLLZ,ispec)
-    iglob7=ibool(NGLLX,NGLLY,NGLLZ,ispec)
-    iglob8=ibool(1,NGLLY,NGLLZ,ispec)
-
-! include lateral variations if needed
-
-   if (MODEL_3D_MANTLE_PERTUBATIONS) then
- !   pick a point within the element and get its radius
-     r=dsqrt(xstore(2,2,2,ispec)**2+ystore(2,2,2,ispec)**2+zstore(2,2,2,ispec)**2)
-
-     if (r > RCMB/R_PLANET .and. r < R_UNIT_SPHERE) then
- !     average over the element
-       dvp = 0.0
-       dvs = 0.0
-       np  = 0
-       do k=2,NGLLZ-1
-         do j=2,NGLLY-1
-           do i=2,NGLLX-1
-             np=np+1
-             x=xstore(i,j,k,ispec)
-             y=ystore(i,j,k,ispec)
-             z=zstore(i,j,k,ispec)
-             r=dsqrt(x*x+y*y+z*z)
-             ! take out ellipticity
-             if (ELLIPTICITY) then
-               call xyz_2_rthetaphi_dble(x,y,z,r,theta,phi_dummy)
-               cost=dcos(theta)
-! this is the Legendre polynomial of degree two, P2(cos(theta)), see the discussion above eq (14.4) in Dahlen and Tromp (1998)
-               p20=0.5d0*(3.0d0*cost*cost-1.0d0)
-! get ellipticity using spline evaluation
-               call spline_evaluation(rspl,ellipicity_spline,ellipicity_spline2,nspl,r,ell)
-! this is eq (14.4) in Dahlen and Tromp (1998)
-               factor=ONE-(TWO/3.0d0)*ell*p20
-               r=r/factor
-             endif
-
-
-             ! gets reference model values: rho,vpv,vph,vsv,vsh and eta_aniso
-             call meshfem3D_models_get1D_val(iregion_code,idoubling(ispec), &
-                                             r,rho,vpv,vph,vsv,vsh,eta_aniso, &
-                                             Qkappa,Qmu,RICB,RCMB, &
-                                             RTOPDDOUBLEPRIME,R80,R120,R220,R400,R670,R771, &
-                                             RMOHO,RMIDDLE_CRUST)
-
-             ! calculates isotropic values
-             vp = sqrt(((8.d0+4.d0*eta_aniso)*vph*vph + 3.d0*vpv*vpv &
-                     + (8.d0 - 8.d0*eta_aniso)*vsv*vsv)/15.d0)
-             vs = sqrt(((1.d0-2.d0*eta_aniso)*vph*vph + vpv*vpv &
-                     + 5.d0*vsh*vsh + (6.d0+4.d0*eta_aniso)*vsv*vsv)/15.d0)
-
-             if (abs(rhostore(i,j,k,ispec)) < 1.e-20) then
-               print *,' attention: rhostore close to zero',rhostore(i,j,k,ispec),r,i,j,k,ispec
-               dvp = 0.0
-               dvs = 0.0
-             else if (abs(sngl(vp)) < 1.e-20) then
-               print *,' attention: vp close to zero',sngl(vp),r,i,j,k,ispec
-               dvp = 0.0
-             else if (abs(sngl(vs)) < 1.e-20) then
-               print *,' attention: vs close to zero',sngl(vs),r,i,j,k,ispec
-               dvs = 0.0
-             else
-               dvp = dvp + (sqrt((kappavstore(i,j,k,ispec)+4.*muvstore(i,j,k,ispec)/3.)/rhostore(i,j,k,ispec)) - sngl(vp))/sngl(vp)
-               dvs = dvs + (sqrt(muvstore(i,j,k,ispec)/rhostore(i,j,k,ispec)) - sngl(vs))/sngl(vs)
-             endif
-
-           enddo
-         enddo
-       enddo
-       dvp = dvp / np
-       dvs = dvs / np
-    else
-       dvp = 0.0
-       dvs = 0.0
+    ! only if on face
+    if (iMPIcut_xi(1,ispec) .or. iMPIcut_xi(2,ispec) .or. &
+                iMPIcut_eta(1,ispec) .or. iMPIcut_eta(2,ispec)) then
+      iglob1=ibool(1,1,1,ispec)
+      iglob2=ibool(NGLLX,1,1,ispec)
+      iglob3=ibool(NGLLX,NGLLY,1,ispec)
+      iglob4=ibool(1,NGLLY,1,ispec)
+      iglob5=ibool(1,1,NGLLZ,ispec)
+      iglob6=ibool(NGLLX,1,NGLLZ,ispec)
+      iglob7=ibool(NGLLX,NGLLY,NGLLZ,ispec)
+      iglob8=ibool(1,NGLLY,NGLLZ,ispec)
+
+      ! include lateral variations if needed
+
+      if (MODEL_3D_MANTLE_PERTUBATIONS) then
+        ! pick a point within the element and get its radius
+        r=dsqrt(xstore(2,2,2,ispec)**2+ystore(2,2,2,ispec)**2+zstore(2,2,2,ispec)**2)
+
+        if (r > RCMB/R_PLANET .and. r < R_UNIT_SPHERE) then
+          ! average over the element
+          dvp = 0.0
+          dvs = 0.0
+          np  = 0
+          do k=2,NGLLZ-1
+            do j=2,NGLLY-1
+              do i=2,NGLLX-1
+                np=np+1
+                x=xstore(i,j,k,ispec)
+                y=ystore(i,j,k,ispec)
+                z=zstore(i,j,k,ispec)
+                r=dsqrt(x*x+y*y+z*z)
+                ! take out ellipticity
+                if (ELLIPTICITY) then
+                  call xyz_2_rthetaphi_dble(x,y,z,r,theta,phi_dummy)
+                  cost=dcos(theta)
+                  ! this is the Legendre polynomial of degree two, P2(cos(theta)),
+                  ! see the discussion above eq (14.4) in Dahlen and Tromp (1998)
+                  p20=0.5d0*(3.0d0*cost*cost-1.0d0)
+                  ! get ellipticity using spline evaluation
+                  call spline_evaluation(rspl,ellipicity_spline,ellipicity_spline2,nspl,r,ell)
+                  ! this is eq (14.4) in Dahlen and Tromp (1998)
+                  factor=ONE-(TWO/3.0d0)*ell*p20
+                  r=r/factor
+                endif
+
+                ! gets reference model values: rho,vpv,vph,vsv,vsh and eta_aniso
+                call meshfem3D_models_get1D_val(iregion_code,idoubling(ispec), &
+                                                r,rho,vpv,vph,vsv,vsh,eta_aniso, &
+                                                Qkappa,Qmu,RICB,RCMB, &
+                                                RTOPDDOUBLEPRIME,R80,R120,R220,R400,R670,R771, &
+                                                RMOHO,RMIDDLE_CRUST)
+
+                ! calculates isotropic values
+                vp = sqrt(((8.d0+4.d0*eta_aniso)*vph*vph + 3.d0*vpv*vpv &
+                       + (8.d0 - 8.d0*eta_aniso)*vsv*vsv)/15.d0)
+                vs = sqrt(((1.d0-2.d0*eta_aniso)*vph*vph + vpv*vpv &
+                       + 5.d0*vsh*vsh + (6.d0+4.d0*eta_aniso)*vsv*vsv)/15.d0)
+
+                if (abs(rhostore(i,j,k,ispec)) < 1.e-20) then
+                  print *,' attention: rhostore close to zero',rhostore(i,j,k,ispec),r,i,j,k,ispec
+                  dvp = 0.0
+                  dvs = 0.0
+                else if (abs(sngl(vp)) < 1.e-20) then
+                  print *,' attention: vp close to zero',sngl(vp),r,i,j,k,ispec
+                  dvp = 0.0
+                else if (abs(sngl(vs)) < 1.e-20) then
+                  print *,' attention: vs close to zero',sngl(vs),r,i,j,k,ispec
+                  dvs = 0.0
+                else
+                  dvp = dvp &
+                      + (sqrt((kappavstore(i,j,k,ispec)+4.*muvstore(i,j,k,ispec)/3.)/rhostore(i,j,k,ispec)) - sngl(vp))/sngl(vp)
+                  dvs = dvs &
+                      + (sqrt(muvstore(i,j,k,ispec)/rhostore(i,j,k,ispec)) - sngl(vs))/sngl(vs)
+                endif
+
+              enddo
+            enddo
+          enddo
+          dvp = dvp / np
+          dvs = dvs / np
+        else
+          dvp = 0.0
+          dvs = 0.0
+        endif
+      endif
+
+      ! face xi = xi_min
+      if (iMPIcut_xi(1,ispec)) then
+        ispecface = ispecface + 1
+        write(IOUT,*) ispecface,idoubling(ispec),num_ibool_AVS_DX(iglob1), &
+                      num_ibool_AVS_DX(iglob4),num_ibool_AVS_DX(iglob8), &
+                      num_ibool_AVS_DX(iglob5)
+        if (MODEL_3D_MANTLE_PERTUBATIONS) write(11,*) ispecface,dvp,dvs
+      endif
+
+      ! face xi = xi_max
+      if (iMPIcut_xi(2,ispec)) then
+        ispecface = ispecface + 1
+        write(IOUT,*) ispecface,idoubling(ispec),num_ibool_AVS_DX(iglob2), &
+                      num_ibool_AVS_DX(iglob3),num_ibool_AVS_DX(iglob7), &
+                      num_ibool_AVS_DX(iglob6)
+        if (MODEL_3D_MANTLE_PERTUBATIONS) write(11,*) ispecface,dvp,dvs
+      endif
+
+      ! face eta = eta_min
+      if (iMPIcut_eta(1,ispec)) then
+        ispecface = ispecface + 1
+        write(IOUT,*) ispecface,idoubling(ispec),num_ibool_AVS_DX(iglob1), &
+                      num_ibool_AVS_DX(iglob2),num_ibool_AVS_DX(iglob6), &
+                      num_ibool_AVS_DX(iglob5)
+        if (MODEL_3D_MANTLE_PERTUBATIONS) write(11,*) ispecface,dvp,dvs
+      endif
+
+      ! face eta = eta_max
+      if (iMPIcut_eta(2,ispec)) then
+        ispecface = ispecface + 1
+        write(IOUT,*) ispecface,idoubling(ispec),num_ibool_AVS_DX(iglob4), &
+                      num_ibool_AVS_DX(iglob3),num_ibool_AVS_DX(iglob7), &
+                      num_ibool_AVS_DX(iglob8)
+        if (MODEL_3D_MANTLE_PERTUBATIONS) write(11,*) ispecface,dvp,dvs
+      endif
+
     endif
- endif
-
-! face xi = xi_min
-  if (iMPIcut_xi(1,ispec)) then
-    ispecface = ispecface + 1
-    write(IOUT,*) ispecface,idoubling(ispec),num_ibool_AVS_DX(iglob1), &
-                  num_ibool_AVS_DX(iglob4),num_ibool_AVS_DX(iglob8), &
-                  num_ibool_AVS_DX(iglob5)
-    if (MODEL_3D_MANTLE_PERTUBATIONS) write(11,*) ispecface,dvp,dvs
-  endif
-
-! face xi = xi_max
-  if (iMPIcut_xi(2,ispec)) then
-    ispecface = ispecface + 1
-    write(IOUT,*) ispecface,idoubling(ispec),num_ibool_AVS_DX(iglob2), &
-                  num_ibool_AVS_DX(iglob3),num_ibool_AVS_DX(iglob7), &
-                  num_ibool_AVS_DX(iglob6)
-    if (MODEL_3D_MANTLE_PERTUBATIONS) write(11,*) ispecface,dvp,dvs
-  endif
-
-! face eta = eta_min
-  if (iMPIcut_eta(1,ispec)) then
-    ispecface = ispecface + 1
-    write(IOUT,*) ispecface,idoubling(ispec),num_ibool_AVS_DX(iglob1), &
-                  num_ibool_AVS_DX(iglob2),num_ibool_AVS_DX(iglob6), &
-                  num_ibool_AVS_DX(iglob5)
-    if (MODEL_3D_MANTLE_PERTUBATIONS) write(11,*) ispecface,dvp,dvs
-  endif
-
-! face eta = eta_max
-  if (iMPIcut_eta(2,ispec)) then
-    ispecface = ispecface + 1
-    write(IOUT,*) ispecface,idoubling(ispec),num_ibool_AVS_DX(iglob4), &
-                  num_ibool_AVS_DX(iglob3),num_ibool_AVS_DX(iglob7), &
-                  num_ibool_AVS_DX(iglob8)
-    if (MODEL_3D_MANTLE_PERTUBATIONS) write(11,*) ispecface,dvp,dvs
-  endif
-
-  endif
   enddo
 
-! check that number of surface elements output is okay
+  ! check that number of surface elements output is okay
   if (ispecface /= nspecface) &
     call exit_MPI(myrank,'incorrect number of surface elements in AVS or DX file creation')
 
diff --git a/src/meshfem3D/write_AVS_DX_surface_data.f90 b/src/meshfem3D/write_AVS_DX_surface_data.f90
index 27ba5dc14..ff9ddca76 100644
--- a/src/meshfem3D/write_AVS_DX_surface_data.f90
+++ b/src/meshfem3D/write_AVS_DX_surface_data.f90
@@ -64,11 +64,11 @@ subroutine write_AVS_DX_surface_data(prname,nspec,iboun, &
   real(kind=CUSTOM_REAL) :: muvstore(NGLLX,NGLLY,NGLLZ,nspec)
   real(kind=CUSTOM_REAL) :: rhostore(NGLLX,NGLLY,NGLLZ,nspec)
 
-! logical mask used to output global points only once
+  ! logical mask used to output global points only once
   integer :: npointot
   logical :: mask_ibool(npointot)
 
-! numbering of global AVS or DX points
+  ! numbering of global AVS or DX points
   integer :: num_ibool_AVS_DX(npointot)
 
   integer :: ispec
@@ -76,212 +76,212 @@ subroutine write_AVS_DX_surface_data(prname,nspec,iboun, &
   integer, dimension(8) :: iglobval
   integer :: npoin,numpoin,nspecface,ispecface
 
-! for ellipticity
+  ! for ellipticity
   integer :: nspl
   double precision :: rspl(NR_DENSITY),ellipicity_spline(NR_DENSITY),ellipicity_spline2(NR_DENSITY)
 
-! processor identification
+  ! processor identification
   character(len=MAX_STRING_LEN) :: prname
 
   integer :: iregion_code
 
-! writing points
+  ! writing points
   open(unit=IOUT,file=prname(1:len_trim(prname))//'AVS_DXpointssurface.txt',status='unknown')
 
-! erase the logical mask used to mark points already found
+  ! erase the logical mask used to mark points already found
   mask_ibool(:) = .false.
 
   nspecface = 0
 
-! mark global AVS or DX points
+  ! mark global AVS or DX points
   do ispec = 1,nspec
-! only if at the surface (top plane)
-  if (iboun(6,ispec)) then
-
-    iglobval(5)=ibool(1,1,NGLLZ,ispec)
-    iglobval(6)=ibool(NGLLX,1,NGLLZ,ispec)
-    iglobval(7)=ibool(NGLLX,NGLLY,NGLLZ,ispec)
-    iglobval(8)=ibool(1,NGLLY,NGLLZ,ispec)
-
-! element is at the surface
-    nspecface = nspecface + 1
-    mask_ibool(iglobval(5)) = .true.
-    mask_ibool(iglobval(6)) = .true.
-    mask_ibool(iglobval(7)) = .true.
-    mask_ibool(iglobval(8)) = .true.
-
-  endif
+    ! only if at the surface (top plane)
+    if (iboun(6,ispec)) then
+
+      iglobval(5)=ibool(1,1,NGLLZ,ispec)
+      iglobval(6)=ibool(NGLLX,1,NGLLZ,ispec)
+      iglobval(7)=ibool(NGLLX,NGLLY,NGLLZ,ispec)
+      iglobval(8)=ibool(1,NGLLY,NGLLZ,ispec)
+
+      ! element is at the surface
+      nspecface = nspecface + 1
+      mask_ibool(iglobval(5)) = .true.
+      mask_ibool(iglobval(6)) = .true.
+      mask_ibool(iglobval(7)) = .true.
+      mask_ibool(iglobval(8)) = .true.
+
+    endif
   enddo
 
-! count global number of AVS or DX points
+  ! count global number of AVS or DX points
   npoin = count(mask_ibool(:))
 
-! number of points in AVS or DX file
+  ! number of points in AVS or DX file
   write(IOUT,*) npoin
 
-! erase the logical mask used to mark points already found
+  ! erase the logical mask used to mark points already found
   mask_ibool(:) = .false.
 
-! output global AVS or DX points
+  ! output global AVS or DX points
   numpoin = 0
   do ispec = 1,nspec
-! only if at the surface
-  if (iboun(6,ispec)) then
-
-    iglobval(5)=ibool(1,1,NGLLZ,ispec)
-    iglobval(6)=ibool(NGLLX,1,NGLLZ,ispec)
-    iglobval(7)=ibool(NGLLX,NGLLY,NGLLZ,ispec)
-    iglobval(8)=ibool(1,NGLLY,NGLLZ,ispec)
-
-! top face
-  if (iboun(6,ispec)) then
-
-    if (.not. mask_ibool(iglobval(5))) then
-      numpoin = numpoin + 1
-      num_ibool_AVS_DX(iglobval(5)) = numpoin
-      write(IOUT,*) numpoin,sngl(xstore(1,1,NGLLZ,ispec)), &
-                    sngl(ystore(1,1,NGLLZ,ispec)),sngl(zstore(1,1,NGLLZ,ispec))
-    endif
+    ! only if at the surface
+    if (iboun(6,ispec)) then
+
+      iglobval(5)=ibool(1,1,NGLLZ,ispec)
+      iglobval(6)=ibool(NGLLX,1,NGLLZ,ispec)
+      iglobval(7)=ibool(NGLLX,NGLLY,NGLLZ,ispec)
+      iglobval(8)=ibool(1,NGLLY,NGLLZ,ispec)
+
+      ! top face
+      if (iboun(6,ispec)) then
+
+        if (.not. mask_ibool(iglobval(5))) then
+          numpoin = numpoin + 1
+          num_ibool_AVS_DX(iglobval(5)) = numpoin
+          write(IOUT,*) numpoin,sngl(xstore(1,1,NGLLZ,ispec)), &
+                        sngl(ystore(1,1,NGLLZ,ispec)),sngl(zstore(1,1,NGLLZ,ispec))
+        endif
 
-    if (.not. mask_ibool(iglobval(6))) then
-      numpoin = numpoin + 1
-      num_ibool_AVS_DX(iglobval(6)) = numpoin
-      write(IOUT,*) numpoin,sngl(xstore(NGLLX,1,NGLLZ,ispec)), &
-                    sngl(ystore(NGLLX,1,NGLLZ,ispec)),sngl(zstore(NGLLX,1,NGLLZ,ispec))
-    endif
+        if (.not. mask_ibool(iglobval(6))) then
+          numpoin = numpoin + 1
+          num_ibool_AVS_DX(iglobval(6)) = numpoin
+          write(IOUT,*) numpoin,sngl(xstore(NGLLX,1,NGLLZ,ispec)), &
+                        sngl(ystore(NGLLX,1,NGLLZ,ispec)),sngl(zstore(NGLLX,1,NGLLZ,ispec))
+        endif
 
-    if (.not. mask_ibool(iglobval(7))) then
-      numpoin = numpoin + 1
-      num_ibool_AVS_DX(iglobval(7)) = numpoin
-      write(IOUT,*) numpoin,sngl(xstore(NGLLX,NGLLY,NGLLZ,ispec)), &
-                    sngl(ystore(NGLLX,NGLLY,NGLLZ,ispec)),sngl(zstore(NGLLX,NGLLY,NGLLZ,ispec))
-    endif
+        if (.not. mask_ibool(iglobval(7))) then
+          numpoin = numpoin + 1
+          num_ibool_AVS_DX(iglobval(7)) = numpoin
+          write(IOUT,*) numpoin,sngl(xstore(NGLLX,NGLLY,NGLLZ,ispec)), &
+                        sngl(ystore(NGLLX,NGLLY,NGLLZ,ispec)),sngl(zstore(NGLLX,NGLLY,NGLLZ,ispec))
+        endif
 
-    if (.not. mask_ibool(iglobval(8))) then
-      numpoin = numpoin + 1
-      num_ibool_AVS_DX(iglobval(8)) = numpoin
-      write(IOUT,*) numpoin,sngl(xstore(1,NGLLY,NGLLZ,ispec)), &
-                    sngl(ystore(1,NGLLY,NGLLZ,ispec)),sngl(zstore(1,NGLLY,NGLLZ,ispec))
-    endif
+        if (.not. mask_ibool(iglobval(8))) then
+          numpoin = numpoin + 1
+          num_ibool_AVS_DX(iglobval(8)) = numpoin
+          write(IOUT,*) numpoin,sngl(xstore(1,NGLLY,NGLLZ,ispec)), &
+                        sngl(ystore(1,NGLLY,NGLLZ,ispec)),sngl(zstore(1,NGLLY,NGLLZ,ispec))
+        endif
 
-    mask_ibool(iglobval(5)) = .true.
-    mask_ibool(iglobval(6)) = .true.
-    mask_ibool(iglobval(7)) = .true.
-    mask_ibool(iglobval(8)) = .true.
+        mask_ibool(iglobval(5)) = .true.
+        mask_ibool(iglobval(6)) = .true.
+        mask_ibool(iglobval(7)) = .true.
+        mask_ibool(iglobval(8)) = .true.
 
-  endif
+      endif
 
-  endif
+    endif
   enddo
 
-! check that number of global points output is okay
+  ! check that number of global points output is okay
   if (numpoin /= npoin) &
     call exit_MPI(myrank,'incorrect number of global points in AVS or DX file creation')
 
   close(IOUT)
 
-! output global AVS or DX elements
+  ! output global AVS or DX elements
 
-! writing elements
+  ! writing elements
   open(unit=IOUT,file=prname(1:len_trim(prname))//'AVS_DXelementssurface.txt',status='unknown')
   if (MODEL_3D_MANTLE_PERTUBATIONS) &
        open(unit=11,file=prname(1:len_trim(prname))//'AVS_DXelementssurface_dvp_dvs.txt',status='unknown')
 
-! number of elements in AVS or DX file
+  ! number of elements in AVS or DX file
   write(IOUT,*) nspecface
 
   ispecface = 0
   do ispec = 1,nspec
-! only if at the surface
-     if (iboun(6,ispec)) then
-
-        iglobval(5)=ibool(1,1,NGLLZ,ispec)
-        iglobval(6)=ibool(NGLLX,1,NGLLZ,ispec)
-        iglobval(7)=ibool(NGLLX,NGLLY,NGLLZ,ispec)
-        iglobval(8)=ibool(1,NGLLY,NGLLZ,ispec)
-
-                if (MODEL_3D_MANTLE_PERTUBATIONS) then
-           !   pick a point within the element and get its radius
-           r=dsqrt(xstore(2,2,2,ispec)**2+ystore(2,2,2,ispec)**2+zstore(2,2,2,ispec)**2)
-
-           if (r > RCMB/R_PLANET .and. r < R_UNIT_SPHERE) then
-              !     average over the element
-              dvp = 0.0
-              dvs = 0.0
-              np  = 0
-              do k=2,NGLLZ-1
-                 do j=2,NGLLY-1
-                    do i=2,NGLLX-1
-                       np=np+1
-                       x=xstore(i,j,k,ispec)
-                       y=ystore(i,j,k,ispec)
-                       z=zstore(i,j,k,ispec)
-                       r=dsqrt(x*x+y*y+z*z)
-                       ! take out ellipticity
-                       if (ELLIPTICITY) then
-                          call xyz_2_rthetaphi_dble(x,y,z,r,theta,phi_dummy)
-                          cost=dcos(theta)
-! this is the Legendre polynomial of degree two, P2(cos(theta)), see the discussion above eq (14.4) in Dahlen and Tromp (1998)
-                          p20=0.5d0*(3.0d0*cost*cost-1.0d0)
-! get ellipticity using spline evaluation
-                          call spline_evaluation(rspl,ellipicity_spline,ellipicity_spline2,nspl,r,ell)
-! this is eq (14.4) in Dahlen and Tromp (1998)
-                          factor=ONE-(TWO/3.0d0)*ell*p20
-                          r=r/factor
-                       endif
-
-
-                       ! gets reference model values: rho,vpv,vph,vsv,vsh and eta_aniso
-                       call meshfem3D_models_get1D_val(iregion_code,idoubling(ispec), &
-                                                       r,rho,vpv,vph,vsv,vsh,eta_aniso, &
-                                                       Qkappa,Qmu,RICB,RCMB, &
-                                                       RTOPDDOUBLEPRIME,R80,R120,R220,R400,R670,R771, &
-                                                       RMOHO,RMIDDLE_CRUST)
-
-                       ! calculates isotropic values
-                       vp = sqrt(((8.d0+4.d0*eta_aniso)*vph*vph + 3.d0*vpv*vpv &
-                            + (8.d0 - 8.d0*eta_aniso)*vsv*vsv)/15.d0)
-                       vs = sqrt(((1.d0-2.d0*eta_aniso)*vph*vph + vpv*vpv &
-                            + 5.d0*vsh*vsh + (6.d0+4.d0*eta_aniso)*vsv*vsv)/15.d0)
-
-                       if (abs(rhostore(i,j,k,ispec)) < 1.e-20) then
-                          print *,' attention: rhostore close to zero',rhostore(i,j,k,ispec),r,i,j,k,ispec
-                          dvp = 0.0
-                          dvs = 0.0
-                       else if (abs(sngl(vp)) < 1.e-20) then
-                          print *,' attention: vp close to zero',sngl(vp),r,i,j,k,ispec
-                          dvp = 0.0
-                       else if (abs(sngl(vs)) < 1.e-20) then
-                          print *,' attention: vs close to zero',sngl(vs),r,i,j,k,ispec
-                          dvs = 0.0
-                       else
-                          dvp = dvp + (sqrt((kappavstore(i,j,k,ispec)+4.*muvstore(i,j,k,ispec)/3.) &
-                               /rhostore(i,j,k,ispec)) - sngl(vp))/sngl(vp)
-                          dvs = dvs + (sqrt(muvstore(i,j,k,ispec)/rhostore(i,j,k,ispec)) - sngl(vs))/sngl(vs)
-                       endif
-
-                    enddo
-                 enddo
+    ! only if at the surface
+    if (iboun(6,ispec)) then
+
+      iglobval(5)=ibool(1,1,NGLLZ,ispec)
+      iglobval(6)=ibool(NGLLX,1,NGLLZ,ispec)
+      iglobval(7)=ibool(NGLLX,NGLLY,NGLLZ,ispec)
+      iglobval(8)=ibool(1,NGLLY,NGLLZ,ispec)
+
+      if (MODEL_3D_MANTLE_PERTUBATIONS) then
+        !   pick a point within the element and get its radius
+        r=dsqrt(xstore(2,2,2,ispec)**2+ystore(2,2,2,ispec)**2+zstore(2,2,2,ispec)**2)
+
+        if (r > RCMB/R_PLANET .and. r < R_UNIT_SPHERE) then
+          !     average over the element
+          dvp = 0.0
+          dvs = 0.0
+          np  = 0
+          do k=2,NGLLZ-1
+            do j=2,NGLLY-1
+              do i=2,NGLLX-1
+                np=np+1
+                x=xstore(i,j,k,ispec)
+                y=ystore(i,j,k,ispec)
+                z=zstore(i,j,k,ispec)
+                r=dsqrt(x*x+y*y+z*z)
+                ! take out ellipticity
+                if (ELLIPTICITY) then
+                  call xyz_2_rthetaphi_dble(x,y,z,r,theta,phi_dummy)
+                  cost=dcos(theta)
+                  ! this is the Legendre polynomial of degree two, P2(cos(theta)),
+                  ! see the discussion above eq (14.4) in Dahlen and Tromp (1998)
+                  p20=0.5d0*(3.0d0*cost*cost-1.0d0)
+                  ! get ellipticity using spline evaluation
+                  call spline_evaluation(rspl,ellipicity_spline,ellipicity_spline2,nspl,r,ell)
+                  ! this is eq (14.4) in Dahlen and Tromp (1998)
+                  factor=ONE-(TWO/3.0d0)*ell*p20
+                  r=r/factor
+                endif
+
+                ! gets reference model values: rho,vpv,vph,vsv,vsh and eta_aniso
+                call meshfem3D_models_get1D_val(iregion_code,idoubling(ispec), &
+                                                r,rho,vpv,vph,vsv,vsh,eta_aniso, &
+                                                Qkappa,Qmu,RICB,RCMB, &
+                                                RTOPDDOUBLEPRIME,R80,R120,R220,R400,R670,R771, &
+                                                RMOHO,RMIDDLE_CRUST)
+
+                ! calculates isotropic values
+                vp = sqrt(((8.d0+4.d0*eta_aniso)*vph*vph + 3.d0*vpv*vpv &
+                    + (8.d0 - 8.d0*eta_aniso)*vsv*vsv)/15.d0)
+                vs = sqrt(((1.d0-2.d0*eta_aniso)*vph*vph + vpv*vpv &
+                    + 5.d0*vsh*vsh + (6.d0+4.d0*eta_aniso)*vsv*vsv)/15.d0)
+
+                if (abs(rhostore(i,j,k,ispec)) < 1.e-20) then
+                  print *,' attention: rhostore close to zero',rhostore(i,j,k,ispec),r,i,j,k,ispec
+                  dvp = 0.0
+                  dvs = 0.0
+                else if (abs(sngl(vp)) < 1.e-20) then
+                  print *,' attention: vp close to zero',sngl(vp),r,i,j,k,ispec
+                  dvp = 0.0
+                else if (abs(sngl(vs)) < 1.e-20) then
+                  print *,' attention: vs close to zero',sngl(vs),r,i,j,k,ispec
+                  dvs = 0.0
+                else
+                  dvp = dvp + (sqrt((kappavstore(i,j,k,ispec)+4.*muvstore(i,j,k,ispec)/3.) &
+                       /rhostore(i,j,k,ispec)) - sngl(vp))/sngl(vp)
+                  dvs = dvs + (sqrt(muvstore(i,j,k,ispec)/rhostore(i,j,k,ispec)) - sngl(vs))/sngl(vs)
+                endif
+
               enddo
-              dvp = dvp / np
-              dvs = dvs / np
-           else
-              dvp = 0.0
-              dvs = 0.0
-           endif
+            enddo
+          enddo
+          dvp = dvp / np
+          dvs = dvs / np
+        else
+          dvp = 0.0
+          dvs = 0.0
         endif
+      endif
 
-        ! top face
-        ispecface = ispecface + 1
-        write(IOUT,*) ispecface,idoubling(ispec),num_ibool_AVS_DX(iglobval(5)), &
-                      num_ibool_AVS_DX(iglobval(6)),num_ibool_AVS_DX(iglobval(7)), &
-                      num_ibool_AVS_DX(iglobval(8))
-        if (MODEL_3D_MANTLE_PERTUBATIONS) write(11,*) ispecface,dvp,dvs
+      ! top face
+      ispecface = ispecface + 1
+      write(IOUT,*) ispecface,idoubling(ispec),num_ibool_AVS_DX(iglobval(5)), &
+                    num_ibool_AVS_DX(iglobval(6)),num_ibool_AVS_DX(iglobval(7)), &
+                    num_ibool_AVS_DX(iglobval(8))
+      if (MODEL_3D_MANTLE_PERTUBATIONS) write(11,*) ispecface,dvp,dvs
 
-     endif
+    endif
   enddo
 
-! check that number of surface elements output is okay
+  ! check that number of surface elements output is okay
   if (ispecface /= nspecface) &
     call exit_MPI(myrank,'incorrect number of surface elements in AVS or DX file creation')
 
diff --git a/src/shared/parallel.f90 b/src/shared/parallel.f90
index c781031a1..a6847d538 100644
--- a/src/shared/parallel.f90
+++ b/src/shared/parallel.f90
@@ -924,8 +924,18 @@ end subroutine max_all_all_cr
 !-------------------------------------------------------------------------------------------------
 !
 
-!  subroutine max_all_dp(sendbuf, recvbuf)
-!  end subroutine max_all_dp
+  subroutine max_all_dp(sendbuf, recvbuf)
+
+  use my_mpi
+
+  implicit none
+
+  double precision :: sendbuf, recvbuf
+  integer :: ier
+
+  call MPI_REDUCE(sendbuf,recvbuf,1,MPI_DOUBLE_PRECISION,MPI_MAX,0,my_local_mpi_comm_world,ier)
+
+  end subroutine max_all_dp
 
 !
 !-------------------------------------------------------------------------------------------------
diff --git a/src/shared/read_compute_parameters.f90 b/src/shared/read_compute_parameters.f90
index 3c1edeedc..b13acbaf0 100644
--- a/src/shared/read_compute_parameters.f90
+++ b/src/shared/read_compute_parameters.f90
@@ -219,8 +219,12 @@ subroutine rcp_set_compute_parameters()
   if (STEADY_STATE_KERNEL) then
     NSTEP_STEADY_STATE = nint(STEADY_STATE_LENGTH_IN_MINUTES * 60.d0 / DT)
 
+    ! checks length
     if (NSTEP_STEADY_STATE == 0) then
+      print *, '*****************************************************************'
       print *, 'Warning: STEADY_STATE_KERNEL disabled because STEADY_STATE_LENGTH_IN_MINUTES is zero'
+      print *, '*****************************************************************'
+      STEADY_STATE_KERNEL = .false.    ! not used any further, but doesn't hurt to reset flag to .false. ...
     endif
   else
     NSTEP_STEADY_STATE = 0
@@ -600,15 +604,18 @@ subroutine rcp_check_parameters()
       stop 'Please set SAVE_ALL_SEISMOS_IN_ONE_FILE and USE_BINARY_FOR_LARGE_FILE to be .false. for noise simulation'
   endif
 
-!! DK DK for gravity integrals
+  ! gravity integrals
   ! in the case of GRAVITY_INTEGRALS we should always use double precision
   if (GRAVITY_INTEGRALS .and. CUSTOM_REAL /= SIZE_DOUBLE) &
-    stop 'for GRAVITY_INTEGRALS use double precision i.e. configure the code with --enable-double-precision'
+    stop 'For GRAVITY_INTEGRALS use double precision i.e. configure the code with --enable-double-precision'
 
   ! adjoint simulations: seismogram output only works if each process writes out its local seismos
   if (WRITE_SEISMOGRAMS_BY_MAIN .and. SIMULATION_TYPE == 2) &
     stop 'For SIMULATION_TYPE == 2, please set WRITE_SEISMOGRAMS_BY_MAIN to .false.'
 
+  if (NTSTEP_BETWEEN_OUTPUT_SAMPLE < 1) &
+    stop 'Invalid NTSTEP_BETWEEN_OUTPUT_SAMPLE, must be >= 1'
+
 !----------------------------------------------
 !
 ! status of implementation
diff --git a/src/specfem3D/check_stability.f90 b/src/specfem3D/check_stability.f90
index eb849cee1..1c8ef1eda 100644
--- a/src/specfem3D/check_stability.f90
+++ b/src/specfem3D/check_stability.f90
@@ -192,7 +192,7 @@ subroutine check_stability()
       call check_norm_elastic_acoustic_from_device(b_Usolidnorm,b_Ufluidnorm,Mesh_pointer,3)
     endif
 
-! this trick checks for NaN (Not a Number), which is not even equal to itself
+    ! this trick checks for NaN (Not a Number), which is not even equal to itself
     if (b_Usolidnorm > STABILITY_THRESHOLD .or. b_Usolidnorm < 0 .or. b_Usolidnorm /= b_Usolidnorm) &
       call exit_MPI(myrank,'backward simulation became unstable and blew up  in the solid')
     if (b_Ufluidnorm > STABILITY_THRESHOLD .or. b_Ufluidnorm < 0 .or. b_Ufluidnorm /= b_Ufluidnorm) &
@@ -290,11 +290,13 @@ subroutine check_stability()
     write(IMAIN,"(' Elapsed time in hh:mm:ss = ',i6,' h ',i2.2,' m ',i2.2,' s')") ihours,iminutes,iseconds
     write(IMAIN,*) 'Mean elapsed time per time step in seconds = ',tCPU/dble(it)
 
-! do not check before MIN_TIME_STEPS_FOR_SLOW_NODES time steps
-! because the time step estimate (which is an average) may then be unreliable
-    if (CHECK_FOR_SLOW_NODES .and. NUMBER_OF_SIMULTANEOUS_RUNS > 1 .and. it >= MIN_TIME_STEPS_FOR_SLOW_NODES .and. &
+    ! do not check before MIN_TIME_STEPS_FOR_SLOW_NODES time steps
+    ! because the time step estimate (which is an average) may then be unreliable
+    if (CHECK_FOR_SLOW_NODES) then
+      if (NUMBER_OF_SIMULTANEOUS_RUNS > 1 .and. it >= MIN_TIME_STEPS_FOR_SLOW_NODES .and. &
           tCPU/dble(it) > TOLERANCE_FACTOR_FOR_SLOW_NODES * REFERENCE_TIME_PER_TIME_STEP_ON_NORMAL_NODES) &
         I_am_running_on_a_slow_node = .true.
+    endif
 
     if (SHOW_SEPARATE_RUN_INFORMATION) then
       write(IMAIN,*) 'Time steps done for this run = ',it_run,' out of ',nstep_run
@@ -496,7 +498,7 @@ subroutine check_stability_backward()
     call check_norm_elastic_acoustic_from_device(b_Usolidnorm,b_Ufluidnorm,Mesh_pointer,3)
   endif
 
-! this trick checks for NaN (Not a Number), which is not even equal to itself
+  ! this trick checks for NaN (Not a Number), which is not even equal to itself
   if (b_Usolidnorm > STABILITY_THRESHOLD .or. b_Usolidnorm < 0 .or. b_Usolidnorm /= b_Usolidnorm) &
     call exit_MPI(myrank,'backward simulation became unstable and blew up  in the solid')
   if (b_Ufluidnorm > STABILITY_THRESHOLD .or. b_Ufluidnorm < 0 .or. b_Ufluidnorm /= b_Ufluidnorm) &
@@ -556,7 +558,7 @@ subroutine check_stability_backward()
     ! check stability of the code, exit if unstable
     ! negative values can occur with some compilers when the unstable value is greater
     ! than the greatest possible floating-point number of the machine
-! this trick checks for NaN (Not a Number), which is not even equal to itself
+    ! this trick checks for NaN (Not a Number), which is not even equal to itself
     if (b_Usolidnorm_all > STABILITY_THRESHOLD .or. b_Usolidnorm_all < 0 .or. b_Usolidnorm_all /= b_Usolidnorm_all) &
       call exit_MPI(myrank,'backward simulation became unstable and blew up in the solid')
     if (b_Ufluidnorm_all > STABILITY_THRESHOLD .or. b_Ufluidnorm_all < 0 .or. b_Ufluidnorm_all /= b_Ufluidnorm_all) &
@@ -710,19 +712,21 @@ subroutine write_timestamp_file(Usolidnorm_all,Ufluidnorm_all,b_Usolidnorm_all,b
 
   close(IOUT)
 
-! if the run is slow and gives up, create a disk file to indicate it, so that users or batch scripts can know it.
-! do not check before MIN_TIME_STEPS_FOR_SLOW_NODES time steps
-! because the time step estimate (which is an average) may then be unreliable
-  if (CHECK_FOR_SLOW_NODES .and. NUMBER_OF_SIMULTANEOUS_RUNS > 1 .and. it >= MIN_TIME_STEPS_FOR_SLOW_NODES .and. &
+  ! if the run is slow and gives up, create a disk file to indicate it, so that users or batch scripts can know it.
+  ! do not check before MIN_TIME_STEPS_FOR_SLOW_NODES time steps
+  ! because the time step estimate (which is an average) may then be unreliable
+  if (CHECK_FOR_SLOW_NODES) then
+    if (NUMBER_OF_SIMULTANEOUS_RUNS > 1 .and. it >= MIN_TIME_STEPS_FOR_SLOW_NODES .and. &
         tCPU/dble(it) > TOLERANCE_FACTOR_FOR_SLOW_NODES * REFERENCE_TIME_PER_TIME_STEP_ON_NORMAL_NODES) then
-    I_am_running_on_a_slow_node = .true.
-! rank is between 0 and the max rank - 1, for the earthquake here we start at 1, i.e. we use mygroup + 1 rather than mygroup
-    write(outputname,"('/slow_run_on_rank_',i6.6,'_giving_up_for_earthquake_run_',i6.6,'_at_time_step_',i6.6)") &
+      I_am_running_on_a_slow_node = .true.
+      ! rank is between 0 and the max rank - 1, for the earthquake here we start at 1, i.e. we use mygroup + 1 rather than mygroup
+      write(outputname,"('/slow_run_on_rank_',i6.6,'_giving_up_for_earthquake_run_',i6.6,'_at_time_step_',i6.6)") &
              myrank,mygroup + 1,it
-    open(unit=IOUT,file=trim(OUTPUT_FILES)//outputname,status='unknown',action='write')
-! write outputname as the information message, since it contains all the information needed
+      open(unit=IOUT,file=trim(OUTPUT_FILES)//outputname,status='unknown',action='write')
+      ! write outputname as the information message, since it contains all the information needed
       write(IOUT,*) outputname
-    close(IOUT)
+      close(IOUT)
+    endif
   endif
 
   end subroutine write_timestamp_file
diff --git a/src/specfem3D/compute_add_sources.f90 b/src/specfem3D/compute_add_sources.f90
index d4139675b..4f3dfb825 100644
--- a/src/specfem3D/compute_add_sources.f90
+++ b/src/specfem3D/compute_add_sources.f90
@@ -38,7 +38,7 @@ subroutine compute_add_sources()
   double precision :: stf
   real(kind=CUSTOM_REAL) :: stf_used
   ! for gpu
-  double precision, dimension(NSOURCES) :: stf_pre_compute
+  !double precision, dimension(NSOURCES) :: stf_pre_compute ! obsolete
 
   double precision, external :: get_stf_viscoelastic
 
@@ -51,7 +51,8 @@ subroutine compute_add_sources()
     ! note: the LDDRK scheme updates displacement after the stiffness computations and
     !       after adding boundary/coupling/source terms.
     !       thus, at each time loop step it, displ(:) is still at (n) and not (n+1) like for the Newmark scheme
-    !       when entering this routine. we therefore at an additional -DT to have the corresponding timing for the source.
+    !       when entering this routine. we therefore at an additional -DT to have the corresponding timing
+    !       for the source.
     time_t = dble(it-1-1)*DT + dble(C_LDDRK(istage))*DT - t0
   else
     time_t = dble(it-1)*DT - t0
@@ -103,20 +104,19 @@ subroutine compute_add_sources()
 
   else
     ! on GPU
-    ! prepares buffer with source time function values, to be copied onto GPU
-    do isource = 1,NSOURCES
-      ! sets current time for this source
-      timeval = time_t - tshift_src(isource)
-
-      ! determines source time function value
-      stf = get_stf_viscoelastic(timeval,isource,it)
-
-      ! stores current stf values
-      stf_pre_compute(isource) = stf
-    enddo
+    ! we avoid these copies as they synchronize streams and kernels by using local source time function arrays instead
+    !! prepares buffer with source time function values, to be copied onto GPU
+    !do isource = 1,NSOURCES
+    !  ! sets current time for this source
+    !  timeval = time_t - tshift_src(isource)
+    !  ! determines source time function value
+    !  stf = get_stf_viscoelastic(timeval,isource,it)
+    !  ! stores current stf values
+    !  stf_pre_compute(isource) = stf
+    !enddo
 
     ! adds sources: only implements SIMTYPE=1 and NOISE_TOM = 0
-    call compute_add_sources_gpu(Mesh_pointer,NSOURCES,stf_pre_compute)
+    call compute_add_sources_gpu(Mesh_pointer,it,istage)
   endif
 
   end subroutine compute_add_sources
@@ -360,7 +360,7 @@ subroutine compute_add_sources_backward()
   double precision :: stf
   real(kind=CUSTOM_REAL) :: stf_used
   ! for gpu
-  double precision, dimension(NSOURCES) :: stf_pre_compute
+  !double precision, dimension(NSOURCES) :: stf_pre_compute ! obsolete
 
   double precision, external :: get_stf_viscoelastic
 
@@ -414,7 +414,8 @@ subroutine compute_add_sources_backward()
     ! note: the LDDRK scheme updates displacement after the stiffness computations and
     !       after adding boundary/coupling/source terms.
     !       thus, at each time loop step it, displ(:) is still at (n) and not (n+1) like for the Newmark scheme
-    !       when entering this routine. we therefore at an additional -DT to have the corresponding timing for the source.
+    !       when entering this routine. we therefore at an additional -DT to have the corresponding timing
+    !       for the source.
     if (UNDO_ATTENUATION) then
       ! stepping moves forward from snapshot position
       time_t = dble(NSTEP-it_tmp-1)*DT + dble(C_LDDRK(istage))*DT - t0
@@ -462,20 +463,19 @@ subroutine compute_add_sources_backward()
 
   else
     ! on GPU
-    ! prepares buffer with source time function values, to be copied onto GPU
-    do isource = 1,NSOURCES
-      ! sets current time for this source
-      timeval = time_t - tshift_src(isource)
-
-      ! determines source time function value
-      stf = get_stf_viscoelastic(timeval,isource,it_tmp)
-
-      ! stores current stf values
-      stf_pre_compute(isource) = stf
-    enddo
+    ! we avoid these copies as they synchronize streams and kernels by using local source time function arrays instead
+    !! prepares buffer with source time function values, to be copied onto GPU
+    !do isource = 1,NSOURCES
+    !  ! sets current time for this source
+    !  timeval = time_t - tshift_src(isource)
+    !  ! determines source time function value
+    !  stf = get_stf_viscoelastic(timeval,isource,it_tmp)
+    !  ! stores current stf values
+    !  stf_pre_compute(isource) = stf
+    !enddo
 
     ! adds sources: only implements SIMTYPE=3 (and NOISE_TOM = 0)
-    call compute_add_sources_backward_gpu(Mesh_pointer,NSOURCES,stf_pre_compute)
+    call compute_add_sources_backward_gpu(Mesh_pointer,it_tmp,istage)
   endif
 
   end subroutine compute_add_sources_backward
diff --git a/src/specfem3D/compute_forces_acoustic_calling_routine.F90 b/src/specfem3D/compute_forces_acoustic_calling_routine.F90
index 60abb381b..33968d80c 100644
--- a/src/specfem3D/compute_forces_acoustic_calling_routine.F90
+++ b/src/specfem3D/compute_forces_acoustic_calling_routine.F90
@@ -53,7 +53,8 @@ subroutine compute_forces_acoustic()
     ! note: the LDDRK scheme updates displacement after the stiffness computations and
     !       after adding boundary/coupling/source terms.
     !       thus, at each time loop step it, displ(:) is still at (n) and not (n+1) like for the Newmark scheme
-    !       when entering this routine. we therefore at an additional -DT to have the corresponding timing for the source.
+    !       when entering this routine. we therefore at an additional -DT to have the corresponding timing
+    !       for the source.
     timeval = real((dble(it-1-1)*DT + dble(C_LDDRK(istage))*DT - t0)*scale_t_inv, kind=CUSTOM_REAL)
   else
     timeval = real((dble(it-1)*DT - t0)*scale_t_inv, kind=CUSTOM_REAL)
@@ -265,7 +266,8 @@ subroutine compute_forces_acoustic_backward()
     ! note: the LDDRK scheme updates displacement after the stiffness computations and
     !       after adding boundary/coupling/source terms.
     !       thus, at each time loop step it, displ(:) is still at (n) and not (n+1) like for the Newmark scheme
-    !       when entering this routine. we therefore at an additional -DT to have the corresponding timing for the source.
+    !       when entering this routine. we therefore at an additional -DT to have the corresponding timing
+    !       for the source.
     if (UNDO_ATTENUATION) then
       ! stepping moves forward from snapshot position
       b_timeval = real((dble(NSTEP-it_tmp-1)*DT + dble(C_LDDRK(istage))*DT - t0)*scale_t_inv, kind=CUSTOM_REAL)
diff --git a/src/specfem3D/compute_seismograms.F90 b/src/specfem3D/compute_seismograms.F90
index d7833b0bf..2a8adec7e 100644
--- a/src/specfem3D/compute_seismograms.F90
+++ b/src/specfem3D/compute_seismograms.F90
@@ -31,7 +31,7 @@ subroutine compute_seismograms(nglob,displ,seismo_current,seismograms)
   use constants_solver
 
   use specfem_par, only: &
-    NTSTEP_BETWEEN_OUTPUT_SEISMOS, &
+    nlength_seismogram, &
     nrec_local,nu_rec,ispec_selected_rec,number_receiver_global, &
     scale_displ,hxir_store,hetar_store,hgammar_store
 
@@ -44,7 +44,7 @@ subroutine compute_seismograms(nglob,displ,seismo_current,seismograms)
 
   integer,intent(in) :: seismo_current
 
-  real(kind=CUSTOM_REAL), dimension(NDIM,nrec_local,NTSTEP_BETWEEN_OUTPUT_SEISMOS),intent(out) :: &
+  real(kind=CUSTOM_REAL), dimension(NDIM,nrec_local,nlength_seismogram),intent(out) :: &
     seismograms
 
   ! local parameters
@@ -98,7 +98,6 @@ subroutine compute_seismograms_adjoint(displ_crust_mantle, &
                                          eps_trace_over_3_crust_mantle, &
                                          epsilondev_xx_crust_mantle,epsilondev_yy_crust_mantle,epsilondev_xy_crust_mantle, &
                                          epsilondev_xz_crust_mantle,epsilondev_yz_crust_mantle, &
-                                         it_adj_written, &
                                          moment_der,sloc_der,stshift_der,shdur_der, &
                                          seismograms)
 
@@ -108,7 +107,9 @@ subroutine compute_seismograms_adjoint(displ_crust_mantle, &
     NSPEC_CRUST_MANTLE_STRAIN_ONLY,NSPEC_CRUST_MANTLE_STR_OR_ATT
 
   use specfem_par, only: &
-    NSTEP,NTSTEP_BETWEEN_OUTPUT_SEISMOS,UNDO_ATTENUATION, &
+    NSTEP,NTSTEP_BETWEEN_OUTPUT_SAMPLE, &
+    nlength_seismogram,seismo_current, &
+    UNDO_ATTENUATION, &
     nrec_local, &
     nu_source,Mxx,Myy,Mzz,Mxy,Mxz,Myz, &
     hxir_store,hpxir_store,hetar_store,hpetar_store,hgammar_store,hpgammar_store, &
@@ -134,13 +135,11 @@ subroutine compute_seismograms_adjoint(displ_crust_mantle, &
     epsilondev_xx_crust_mantle,epsilondev_yy_crust_mantle,epsilondev_xy_crust_mantle, &
     epsilondev_xz_crust_mantle,epsilondev_yz_crust_mantle
 
-  integer,intent(in) :: it_adj_written
-
   real(kind=CUSTOM_REAL), dimension(NDIM,NDIM,nrec_local),intent(inout) :: moment_der
   real(kind=CUSTOM_REAL), dimension(NDIM,nrec_local),intent(inout) :: sloc_der
   real(kind=CUSTOM_REAL), dimension(nrec_local),intent(inout) :: stshift_der, shdur_der
 
-  real(kind=CUSTOM_REAL), dimension(NDIM*NDIM,nrec_local,NTSTEP_BETWEEN_OUTPUT_SEISMOS),intent(out) :: &
+  real(kind=CUSTOM_REAL), dimension(NDIM*NDIM,nrec_local,nlength_seismogram),intent(out) :: &
     seismograms
 
   ! local parameters
@@ -246,14 +245,14 @@ subroutine compute_seismograms_adjoint(displ_crust_mantle, &
     eps_loc_new(:,:) = matmul(matmul(nu_source(:,:,irec),eps_loc(:,:)), transpose(nu_source(:,:,irec)))
 
     ! distinguish between single and double precision for reals
-    seismograms(1,irec_local,it-it_adj_written) = real(eps_loc_new(1,1), kind=CUSTOM_REAL)
-    seismograms(2,irec_local,it-it_adj_written) = real(eps_loc_new(2,2), kind=CUSTOM_REAL)
-    seismograms(3,irec_local,it-it_adj_written) = real(eps_loc_new(3,3), kind=CUSTOM_REAL)
-    seismograms(4,irec_local,it-it_adj_written) = real(eps_loc_new(1,2), kind=CUSTOM_REAL)
-    seismograms(5,irec_local,it-it_adj_written) = real(eps_loc_new(1,3), kind=CUSTOM_REAL)
-    seismograms(6,irec_local,it-it_adj_written) = real(eps_loc_new(2,3), kind=CUSTOM_REAL)
-
-    seismograms(7:9,irec_local,it-it_adj_written) = real(scale_displ*(nu_source(:,1,irec)*uxd + &
+    seismograms(1,irec_local,seismo_current) = real(eps_loc_new(1,1), kind=CUSTOM_REAL)
+    seismograms(2,irec_local,seismo_current) = real(eps_loc_new(2,2), kind=CUSTOM_REAL)
+    seismograms(3,irec_local,seismo_current) = real(eps_loc_new(3,3), kind=CUSTOM_REAL)
+    seismograms(4,irec_local,seismo_current) = real(eps_loc_new(1,2), kind=CUSTOM_REAL)
+    seismograms(5,irec_local,seismo_current) = real(eps_loc_new(1,3), kind=CUSTOM_REAL)
+    seismograms(6,irec_local,seismo_current) = real(eps_loc_new(2,3), kind=CUSTOM_REAL)
+
+    seismograms(7:9,irec_local,seismo_current) = real(scale_displ*(nu_source(:,1,irec)*uxd + &
                                                                       nu_source(:,2,irec)*uyd + &
                                                                       nu_source(:,3,irec)*uzd), &
                                                                       kind=CUSTOM_REAL)
@@ -281,12 +280,15 @@ subroutine compute_seismograms_adjoint(displ_crust_mantle, &
                             gammax_crust_mantle(1,1,1,ispec),gammay_crust_mantle(1,1,1,ispec),gammaz_crust_mantle(1,1,1,ispec))
 
     ! reverse time
-    timeval = dble(NSTEP-it)*DT-t0-tshift_src(irec)
+    timeval = dble(NSTEP-it) * DT - t0 - tshift_src(irec)
 
     ! gets source-time function value
-    stf = get_stf_viscoelastic(timeval,irec,it)
+    !#TODO: double-check if time at it or NSTEP-it+1
+    !       since adjoint simulation uses reversed adjoint sources, source time seems to correspond to NSTEP-(it-1)==NSTEP-it+1
+    !       therefore, when reading in an external STF, the index would be `NSTEP-it+1` rather than `it`
+    stf = get_stf_viscoelastic(timeval,irec,NSTEP-it+1)
 
-    stf_deltat = stf * deltat
+    stf_deltat = real(stf * deltat * NTSTEP_BETWEEN_OUTPUT_SAMPLE,kind=CUSTOM_REAL)
 
     ! moment derivatives
     moment_der(:,:,irec_local) = moment_der(:,:,irec_local) + eps_s(:,:) * stf_deltat
diff --git a/src/specfem3D/get_attenuation.f90 b/src/specfem3D/get_attenuation.f90
index 771087ab5..4a61c72c4 100644
--- a/src/specfem3D/get_attenuation.f90
+++ b/src/specfem3D/get_attenuation.f90
@@ -383,8 +383,18 @@ subroutine get_attenuation_scale_factor(T_c_source, tau_mu, tau_sigma, Q_mu, sca
   !print *,'debug: factor scale mu0,mu = ',factor_scale_mu0,factor_scale_mu,'approximated Q = A/B',a_val/b_val
 
   !--- check that the correction factor is close to one
-  if (scale_factor < 0.8d0 .or. scale_factor > 1.2d0) then
-    print *,'Error: incorrect scale factor: ', scale_factor,'scale mu',factor_scale_mu,'scale mu0',factor_scale_mu0
+  ! note: for very coarse global simulations, the reference frequency (usually 1Hz for global models)
+  !       and center frequency of the simulation can be far apart.
+  !       the scale_factor limiting range here is somewhat set arbitrarily, and enlarged to [0.5,1.5]
+  !       to allow for very coarse global simulation tests with sponge attenuation.
+  if (scale_factor < 0.5d0 .or. scale_factor > 1.5d0) then
+    print *,'Error: incorrect scale factor: ', scale_factor
+    print *,'  scale factor: ', scale_factor,' should be between 0.5 to 1.5'
+    print *,'  factor scale_mu = ',factor_scale_mu,' factor scale_mu0 = ',factor_scale_mu0
+    print *,'  Q value = ',Q_mu
+    print *,'  central period = ',T_c_source * scale_t,' frequency = ',f_c_source / scale_t
+    print *,'  model frequency = ',f_0_model / scale_t
+    print *,'Please check your reference frequency ATTENUATION_f0_REFERENCE in file constants.h'
     call exit_MPI(myrank,'incorrect correction factor in attenuation model')
   endif
 
diff --git a/src/specfem3D/iterate_time.F90 b/src/specfem3D/iterate_time.F90
index 3d5e199ba..eaa94f442 100644
--- a/src/specfem3D/iterate_time.F90
+++ b/src/specfem3D/iterate_time.F90
@@ -90,6 +90,9 @@ subroutine iterate_time()
     close(IOUT)
   endif
 
+  ! synchronizes GPU kernels
+  if (GPU_MODE) call gpu_synchronize()
+
   ! initialize variables for writing seismograms
   seismo_offset = it_begin-1
   seismo_current = 0
diff --git a/src/specfem3D/iterate_time_undoatt.F90 b/src/specfem3D/iterate_time_undoatt.F90
index a81811fef..65a00f4f9 100644
--- a/src/specfem3D/iterate_time_undoatt.F90
+++ b/src/specfem3D/iterate_time_undoatt.F90
@@ -253,6 +253,9 @@ subroutine iterate_time_undoatt()
     close(IOUT)
   endif
 
+  ! synchronizes GPU kernels
+  if (GPU_MODE) call gpu_synchronize()
+
   ! initializes variables for writing seismograms
   seismo_offset = it_begin-1
   seismo_current = 0
@@ -477,7 +480,7 @@ subroutine iterate_time_undoatt()
           it_of_buffer = it_of_buffer + 1
 
           if (GPU_MODE) then
-#if defined(USE_CUDA) || defined(USE_OPENCL)
+#if defined(USE_CUDA) || defined(USE_HIP) || defined(USE_OPENCL)
             if (it_of_buffer >= 2) then
               call unregister_host_array(b_displ_cm_store_buffer(:,:, it_of_buffer-1))
             endif
@@ -593,7 +596,7 @@ subroutine iterate_time_undoatt()
         ! adjoint simulations: kernels
         ! attention: for GPU_MODE and ANISOTROPIC_KL it is necessary to use resort_array (see lines 442-445)
         if (mod(it, ntstep_kl) == 0) then
-#if defined(USE_CUDA) || defined(USE_OPENCL)
+#if defined(USE_CUDA) || defined(USE_HIP) || defined(USE_OPENCL)
           if (GPU_MODE) then
             call unregister_host_array(b_displ_cm_store_buffer(:,:, it_of_buffer+1))
             if (it_of_buffer > 0) then
diff --git a/src/specfem3D/locate_receivers.f90 b/src/specfem3D/locate_receivers.f90
index e408d7d79..af8ba5197 100644
--- a/src/specfem3D/locate_receivers.f90
+++ b/src/specfem3D/locate_receivers.f90
@@ -746,11 +746,10 @@ subroutine read_receiver_locations()
     ! close receiver file
     close(IIN)
 
-! BS BS begin
-! In case that the same station and network name appear twice (or more times) in the STATIONS
-! file, problems occur, as two (or more) seismograms are written (with mode
-! "append") to a file with same name. The philosophy here is to accept multiple
-! appearances and to just add a count to the station name in this case.
+    ! In case that the same station and network name appear twice (or more times) in the STATIONS
+    ! file, problems occur, as two (or more) seismograms are written (with mode
+    ! "append") to a file with same name. The philosophy here is to accept multiple
+    ! appearances and to just add a count to the station name in this case.
     allocate(station_duplet(nrec),stat=ier)
     if (ier /= 0 ) call exit_MPI(myrank,'Error allocating station_duplet array')
 
@@ -771,7 +770,6 @@ subroutine read_receiver_locations()
       enddo
     enddo
     deallocate(station_duplet)
-! BS BS end
 
     ! if receivers can not be buried, sets depth to zero
     if (.not. RECEIVERS_CAN_BE_BURIED ) stbur(:) = 0.d0
diff --git a/src/specfem3D/prepare_gpu.f90 b/src/specfem3D/prepare_gpu.f90
index d78bfca39..dca38b28b 100644
--- a/src/specfem3D/prepare_gpu.f90
+++ b/src/specfem3D/prepare_gpu.f90
@@ -41,6 +41,10 @@ subroutine prepare_GPU()
   ! local parameters
   real :: free_mb,used_mb,total_mb
   logical :: USE_3D_ATTENUATION_ARRAYS
+  ! local source arrays
+  real(kind=CUSTOM_REAL), dimension(:,:,:), allocatable :: stf_local,b_stf_local
+  real(kind=CUSTOM_REAL), dimension(:,:,:,:,:), allocatable :: sourcearrays_local
+  integer, dimension(:), allocatable :: ispec_selected_source_local
 
   ! checks if anything to do
   if (.not. GPU_MODE) return
@@ -62,12 +66,17 @@ subroutine prepare_GPU()
   ! evaluates memory required
   call memory_eval_gpu()
 
+  ! creates local source arrays for gpu simulations
+  call prepare_local_gpu_source_arrays()
+
   ! prepares general fields on GPU
   call prepare_constants_device(Mesh_pointer,myrank,NGLLX, &
                                 hprime_xx,hprimewgll_xx, &
                                 wgllwgll_xy,wgllwgll_xz,wgllwgll_yz, &
-                                NSOURCES, nsources_local, sourcearrays, &
-                                islice_selected_source,ispec_selected_source, &
+                                NSOURCES, nsources_local, &
+                                sourcearrays_local, stf_local, b_stf_local, &
+                                ispec_selected_source_local, &
+                                ispec_selected_source, &
                                 nrec, nrec_local, number_receiver_global, &
                                 islice_selected_rec,ispec_selected_rec, &
                                 NSPEC_CRUST_MANTLE,NGLOB_CRUST_MANTLE, &
@@ -92,7 +101,10 @@ subroutine prepare_GPU()
                                 deltat, &
                                 GPU_ASYNC_COPY, &
                                 hxir_store,hetar_store,hgammar_store,nu_rec, &
-                                SAVE_SEISMOGRAMS_STRAIN,CUSTOM_REAL)
+                                nlength_seismogram, &
+                                SAVE_SEISMOGRAMS_STRAIN,CUSTOM_REAL, &
+                                USE_LDDRK, &
+                                NSTEP,NSTAGE_TIME_SCHEME)
   call synchronize_all()
 
   if (SIMULATION_TYPE == 2 .or. SIMULATION_TYPE == 3) then
@@ -416,6 +428,9 @@ subroutine prepare_GPU()
     call flush_IMAIN()
   endif
 
+  ! frees temporary arrays
+  deallocate(stf_local,b_stf_local,sourcearrays_local,ispec_selected_source_local)
+
   ! synchronizes processes
   call synchronize_all()
 
@@ -426,7 +441,7 @@ subroutine memory_eval_gpu()
     implicit none
 
     ! local parameters
-    double precision :: memory_size
+    double precision :: memory_size,memory_size_max
     integer,parameter :: NGLL2 = NGLLSQUARE
     integer,parameter :: NGLL3 = 125
     integer,parameter :: NGLL3_PADDED = 128
@@ -445,14 +460,32 @@ subroutine memory_eval_gpu()
 
     ! sources
     if (SIMULATION_TYPE == 1 .or. SIMULATION_TYPE == 3) then
+      ! full NSOURCES not needed anymore...
       ! d_sourcearrays
-      memory_size = memory_size + NGLL3 * NSOURCES * NDIM * dble(CUSTOM_REAL)
+      !memory_size = memory_size + NGLL3 * NSOURCES * NDIM * dble(CUSTOM_REAL)
+      ! d_islice_selected_source,d_ispec_selected_source
+      !memory_size = memory_size + 2.d0 * NSOURCES * dble(SIZE_INTEGER)
+      ! local sources only
+      ! d_sourcearrays_local
+      memory_size = memory_size + NDIM * NGLL3 * nsources_local * dble(CUSTOM_REAL)
+      ! d_stf_local
+      memory_size = memory_size + nsources_local * NSTEP * NSTAGE_TIME_SCHEME *  dble(CUSTOM_REAL)
+      if (SIMULATION_TYPE == 3 .and. USE_LDDRK .and. (.not. UNDO_ATTENUATION)) then
+        ! d_b_stf_local
+        memory_size = memory_size + nsources_local * NSTEP * NSTAGE_TIME_SCHEME *  dble(CUSTOM_REAL)
+      endif
+      ! d_ispec_selected_source_local
+      memory_size = memory_size + nsources_local * dble(SIZE_INTEGER)
+    endif
+    if (SIMULATION_TYPE == 2) then
+      ! d_ispec_selected_source
+      memory_size = memory_size + NSOURCES * dble(SIZE_INTEGER)
     endif
-    ! d_islice_selected_source,d_ispec_selected_source
-    memory_size = memory_size + 2.d0 * NSOURCES * dble(SIZE_INTEGER)
 
     ! receivers
-    !d_number_receiver_global
+    ! d_seismograms
+    memory_size = memory_size + NDIM * nrec_local * nlength_seismogram * dble(CUSTOM_REAL)
+    ! d_number_receiver_global
     memory_size = memory_size + nrec_local * dble(SIZE_INTEGER)
     ! d_station_seismo_field
     memory_size = memory_size + NDIM * NGLL3 * nrec_local * dble(CUSTOM_REAL)
@@ -462,7 +495,6 @@ subroutine memory_eval_gpu()
     endif
     ! d_ispec_selected_rec
     memory_size = memory_size + nrec * dble(SIZE_INTEGER)
-
     ! d_adj_source_adjoint
     memory_size = memory_size + NDIM *  nadj_rec_local * dble(CUSTOM_REAL)
 
@@ -649,14 +681,181 @@ subroutine memory_eval_gpu()
     ! poor estimate for kernel simulations...
     if (SIMULATION_TYPE == 3) memory_size = 2.d0 * memory_size
 
+    ! main process gets maximum for user output
+    call max_all_dp(memory_size,memory_size_max)
+
     ! user output
     if (myrank == 0) then
       write(IMAIN,*)
-      write(IMAIN,*) "  minimum memory requested     : ",sngl(memory_size / 1024.d0 / 1024.d0),"MB per process"
+      write(IMAIN,*) "  minimum memory requested     : ",sngl(memory_size_max / 1024.d0 / 1024.d0),"MB per process"
       write(IMAIN,*)
       call flush_IMAIN()
     endif
 
     end subroutine memory_eval_gpu
 
+    !-------------------------------------------------------------------------------------------
+
+    subroutine prepare_local_gpu_source_arrays()
+
+    implicit none
+    ! local parameters
+    integer :: isource_local,isource,it_tmp,istage,ier
+    real(kind=CUSTOM_REAL) :: stf_used
+    double precision :: timeval,time_t
+    double precision :: stf
+    double precision, external :: get_stf_viscoelastic
+
+    ! allocates source arrays
+    if (nsources_local > 0) then
+      ! prepares source arrays for local sources
+      allocate(stf_local(nsources_local,NSTEP,NSTAGE_TIME_SCHEME), &
+               sourcearrays_local(NDIM,NGLLX,NGLLY,NGLLZ,nsources_local), &
+               ispec_selected_source_local(nsources_local), stat=ier)
+    else
+      ! dummy arrays
+      allocate(stf_local(1,1,1), &
+               sourcearrays_local(1,1,1,1,1), &
+               ispec_selected_source_local(1), stat=ier)
+    endif
+    if (ier /= 0) stop 'Error allocating local source arrays'
+
+    stf_local(:,:,:) = 0.0_CUSTOM_REAL
+    sourcearrays_local(:,:,:,:,:) = 0.0_CUSTOM_REAL
+    ispec_selected_source_local(:) = 0
+
+    ! for backward simulations w/ LDDRK
+    if (nsources_local > 0 .and. SIMULATION_TYPE == 3 .and. USE_LDDRK .and. (.not. UNDO_ATTENUATION)) then
+      allocate(b_stf_local(nsources,NSTEP,NSTAGE_TIME_SCHEME),stat=ier)
+    else
+      ! dummy
+      allocate(b_stf_local(1,1,1),stat=ier)
+    endif
+    if (ier /= 0) stop 'Error allocating local source b_stf_local arrays'
+    b_stf_local(:,:,:) = 0.0_CUSTOM_REAL
+
+    ! checks if anything to do
+    if (NOISE_TOMOGRAPHY /= 0) return
+
+    ! loops over all sources to gather local ones
+    isource_local = 0
+    do isource = 1,NSOURCES
+      if (myrank == islice_selected_source(isource)) then
+        ! counter
+        isource_local = isource_local + 1
+
+        ! checks
+        if (isource_local > nsources_local) call exit_MPI(myrank,'Error local source index exceeds nsources_local')
+
+        ! gets local source element
+        ispec_selected_source_local(isource_local) = ispec_selected_source(isource)
+        sourcearrays_local(:,:,:,:,isource_local) = sourcearrays(:,:,:,:,isource)
+
+        ! forward
+        do it_tmp = 1,NSTEP
+          ! time for simulation
+          do istage = 1, NSTAGE_TIME_SCHEME ! is equal to 1 if Newmark because only one stage then
+            ! sets current initial time
+            if (USE_LDDRK) then
+              ! LDDRK
+              ! note: the LDDRK scheme updates displacement after the stiffness computations and
+              !       after adding boundary/coupling/source terms.
+              !       thus, at each time loop step it, displ(:) is still at (n) and not (n+1) like for the Newmark scheme
+              !       when entering this routine. we therefore at an additional -DT to have the corresponding timing
+              !       for the source.
+              time_t = dble(it_tmp-1-1)*DT + dble(C_LDDRK(istage))*DT - t0
+            else
+              time_t = dble(it_tmp-1)*DT - t0
+            endif
+
+            ! sets current time for this source
+            timeval = time_t - tshift_src(isource)
+
+            ! source time function value (in range [-1,1]
+            stf = get_stf_viscoelastic(timeval,isource,it_tmp)
+
+            ! distinguishes between single and double precision for reals
+            stf_used = real(stf,kind=CUSTOM_REAL)
+
+            ! stores local source time function
+            ! we use an ordering (isource,it,istage) to make it easier to loop over many local sources
+            ! for a given time in the gpu kernel
+            stf_local(isource_local,it_tmp,istage) = stf_used
+          enddo ! istage
+        enddo
+
+        ! backward
+        if (SIMULATION_TYPE == 3) then
+          ! since there is a different forward and backward timing for the LDDRK scheme,
+          ! we allocate and compute the source time function again for kernel runs in `b_stf_local(:,:,:)`.
+          !
+          ! TODO in future: we might want to allocate only 1 source time function array `stf_local` and
+          !                 make sure that the indexing is producing the same stf values as needed.
+          do it_tmp = 1,NSTEP
+            do istage = 1, NSTAGE_TIME_SCHEME ! is equal to 1 if Newmark because only one stage then
+              ! note on backward/reconstructed wavefields:
+              !       time for b_displ( it ) corresponds to (NSTEP - (it-1) - 1 )*DT - t0  ...
+              !       as we start with saved wavefields b_displ( 1 ) = displ( NSTEP ) which correspond
+              !       to a time (NSTEP - 1)*DT - t0
+              !       (see sources for simulation_type 1 and seismograms)
+              !
+              !       now, at the beginning of the time loop, the numerical Newmark time scheme updates
+              !       the wavefields, that is b_displ( it=1) would correspond to time (NSTEP -1 - 1)*DT - t0.
+              !       however, we read in the backward/reconstructed wavefields at the end of the Newmark time scheme
+              !       in the first (it=1) time loop.
+              !       this leads to the timing (NSTEP-(it-1)-1)*DT-t0-tshift_src for the source time function here
+              !
+              ! sets current initial time
+              if (USE_LDDRK) then
+                ! LDDRK
+                ! note: the LDDRK scheme updates displacement after the stiffness computations and
+                !       after adding boundary/coupling/source terms.
+                !       thus, at each time loop step it, displ(:) is still at (n) and not (n+1) like for the Newmark scheme
+                !       when entering this routine. we therefore at an additional -DT to have the corresponding timing
+                !       for the source.
+                if (UNDO_ATTENUATION) then
+                  ! stepping moves forward from snapshot position
+                  time_t = dble(NSTEP-it_tmp-1)*DT + dble(C_LDDRK(istage))*DT - t0
+                else
+                  ! stepping backwards
+                  time_t = dble(NSTEP-it_tmp-1)*DT - dble(C_LDDRK(istage))*DT - t0
+                endif
+              else
+                time_t = dble(NSTEP-it_tmp)*DT - t0
+              endif
+
+              ! sets current time for this source
+              timeval = time_t - tshift_src(isource)
+
+              ! determines source time function value
+              stf = get_stf_viscoelastic(timeval,isource,it_tmp)
+
+              ! distinguishes between single and double precision for reals
+              stf_used = real(stf,kind=CUSTOM_REAL)
+
+              ! stores local source time function
+              if (USE_LDDRK .and. (.not. UNDO_ATTENUATION)) then
+                ! only needed to store for LDDRK stepping backwards, as LDDRK uses non-symmetric step lengths (C_LDDRK(istage))
+                ! for all other cases, we can use the forward stf_local array with corresponding indexing.
+                ! for example, Newmark stepping:
+                !    backward time_t = dble(NSTEP-it_tmp)*DT - t0 with it_tmp 1..NSTEP
+                !    forward  time_t = dble(it_tmp-1)*DT - t0  with it_tmp 1..NSTEP
+                !    -> backward indexing:
+                !         it_tmp_backward == 1      -> it_tmp_forward = NSTEP
+                !         it_tmp_backward == 2      -> it_tmp_forward = NSTEP-1
+                !         ..
+                !         it_tmp_backward == NSTEP  -> it_tmp_forward = 1
+                !         and b_stf_local == stf_local(istage, NSTEP-(it_tmp_backward-1), isource_local)
+                b_stf_local(isource_local,it_tmp,istage) = stf_used
+              endif
+            enddo ! istage
+          enddo
+        endif   ! backward
+
+      endif
+    enddo  ! NSOURCES
+
+    end subroutine prepare_local_gpu_source_arrays
+
+
   end subroutine prepare_GPU
diff --git a/src/specfem3D/setup_sources_receivers.f90 b/src/specfem3D/setup_sources_receivers.f90
index e8e9c9cc4..d51128d09 100644
--- a/src/specfem3D/setup_sources_receivers.f90
+++ b/src/specfem3D/setup_sources_receivers.f90
@@ -707,7 +707,7 @@ subroutine setup_sources()
   endif
 
   ! sources
-  ! BS BS moved open statement and writing of first lines into sr.vtk before the
+  ! moved open statement and writing of first lines into sr.vtk before the
   ! call to locate_sources, where further write statements to that file follow
   if (myrank == 0) then
   ! write source and receiver VTK files for Paraview
@@ -968,6 +968,39 @@ subroutine setup_timesteps()
     endif
   endif
 
+  ! make sure NSTEP is a multiple of NTSTEP_BETWEEN_OUTPUT_SAMPLE
+  ! if not, increase it a little bit, to the next multiple
+  if (mod(NSTEP,NTSTEP_BETWEEN_OUTPUT_SAMPLE) /= 0) then
+    if (NOISE_TOMOGRAPHY /= 0) then
+      if (myrank == 0) then
+        print *,'Noise simulation: Invalid number of NSTEP          = ',NSTEP
+        print *,'Must be a multiple of NTSTEP_BETWEEN_OUTPUT_SAMPLE = ',NTSTEP_BETWEEN_OUTPUT_SAMPLE
+      endif
+      stop 'Error: NSTEP must be a multiple of NTSTEP_BETWEEN_OUTPUT_SAMPLE'
+    else
+      NSTEP = (NSTEP/NTSTEP_BETWEEN_OUTPUT_SAMPLE + 1)*NTSTEP_BETWEEN_OUTPUT_SAMPLE
+      ! user output
+      if (myrank == 0) then
+        print *
+        print *,'NSTEP is not a multiple of NTSTEP_BETWEEN_OUTPUT_SAMPLE'
+        print *,'thus increasing it automatically to the next multiple, which is ',NSTEP
+        print *
+      endif
+    endif
+  endif
+
+  ! output seismograms at least once at the end of the simulation
+  NTSTEP_BETWEEN_OUTPUT_SEISMOS = min(NSTEP,NTSTEP_BETWEEN_OUTPUT_SEISMOS)
+
+  ! make sure NSTEP_BETWEEN_OUTPUT_SEISMOS is a multiple of NTSTEP_BETWEEN_OUTPUT_SAMPLE
+  if (mod(NTSTEP_BETWEEN_OUTPUT_SEISMOS,NTSTEP_BETWEEN_OUTPUT_SAMPLE) /= 0) then
+    if (myrank == 0) then
+      print *,'Invalid number of NTSTEP_BETWEEN_OUTPUT_SEISMOS    = ',NTSTEP_BETWEEN_OUTPUT_SEISMOS
+      print *,'Must be a multiple of NTSTEP_BETWEEN_OUTPUT_SAMPLE = ',NTSTEP_BETWEEN_OUTPUT_SAMPLE
+    endif
+    stop 'Error: NTSTEP_BETWEEN_OUTPUT_SEISMOS must be a multiple of NTSTEP_BETWEEN_OUTPUT_SAMPLE'
+  endif
+
   ! time loop increments end
   it_end = NSTEP
 
@@ -1207,12 +1240,26 @@ subroutine setup_receivers()
   endif
 
   ! seismograms
+  ! check if we need to save seismos
+  if (SIMULATION_TYPE == 3 .and. (.not. SAVE_SEISMOGRAMS_IN_ADJOINT_RUN)) then
+    do_save_seismograms = .false.
+  else
+    do_save_seismograms = .true.
+  endif
+
+  ! sets local receivers to zero if no seismogram needs to be saved
+  if (.not. do_save_seismograms) nrec_local = 0
+
+  ! seismogram array length (to write out time portions of the full seismograms)
+  nlength_seismogram = NTSTEP_BETWEEN_OUTPUT_SEISMOS / NTSTEP_BETWEEN_OUTPUT_SAMPLE
+
   ! gather from secondary processes on main
   tmp_rec_local_all(:) = 0
   tmp_rec_local_all(0) = nrec_local
   if (NPROCTOT_VAL > 1) then
     call gather_all_singlei(nrec_local,tmp_rec_local_all,NPROCTOT_VAL)
   endif
+
   ! user output
   if (myrank == 0) then
     ! determines maximum number of local receivers and corresponding rank
@@ -1221,20 +1268,29 @@ subroutine setup_receivers()
     maxproc = maxloc(tmp_rec_local_all(:)) - 1
     ! seismograms array size in MB
     if (SIMULATION_TYPE == 1 .or. SIMULATION_TYPE == 3) then
-      ! seismograms need seismograms(NDIM,nrec_local,NTSTEP_BETWEEN_OUTPUT_SEISMOS)
-      sizeval = dble(maxrec) * dble(NDIM * NTSTEP_BETWEEN_OUTPUT_SEISMOS * CUSTOM_REAL / 1024. / 1024. )
+      ! seismograms need seismograms(NDIM,nrec_local,nlength_seismogram)
+      sizeval = dble(maxrec) * dble(NDIM * nlength_seismogram * CUSTOM_REAL / 1024. / 1024. )
     else
-      ! adjoint seismograms need seismograms(NDIM*NDIM,nrec_local,NTSTEP_BETWEEN_OUTPUT_SEISMOS)
-      sizeval = dble(maxrec) * dble(NDIM * NDIM * NTSTEP_BETWEEN_OUTPUT_SEISMOS * CUSTOM_REAL / 1024. / 1024. )
+      ! adjoint seismograms need seismograms(NDIM*NDIM,nrec_local,nlength_seismogram)
+      sizeval = dble(maxrec) * dble(NDIM * NDIM * nlength_seismogram * CUSTOM_REAL / 1024. / 1024. )
     endif
+
     ! outputs info
     write(IMAIN,*) 'seismograms:'
-    if (WRITE_SEISMOGRAMS_BY_MAIN) then
-      write(IMAIN,*) '  seismograms written by main process only'
+    if (do_save_seismograms) then
+      if (WRITE_SEISMOGRAMS_BY_MAIN) then
+        write(IMAIN,*) '  seismograms written by main process only'
+      else
+        write(IMAIN,*) '  seismograms written by all processes'
+      endif
     else
-      write(IMAIN,*) '  seismograms written by all processes'
+      write(IMAIN,*) '  seismograms will not be saved'
     endif
+    write(IMAIN,*) '  Total number of simulation steps (NSTEP)                       = ',NSTEP
     write(IMAIN,*) '  writing out seismograms at every NTSTEP_BETWEEN_OUTPUT_SEISMOS = ',NTSTEP_BETWEEN_OUTPUT_SEISMOS
+    write(IMAIN,*) '  number of subsampling steps for seismograms                    = ',NTSTEP_BETWEEN_OUTPUT_SAMPLE
+    write(IMAIN,*) '  Total number of samples for seismograms                        = ',NSTEP/NTSTEP_BETWEEN_OUTPUT_SAMPLE
+    write(IMAIN,*)
     write(IMAIN,*) '  maximum number of local receivers is ',maxrec,' in slice ',maxproc(1)
     write(IMAIN,*) '  size of maximum seismogram array       = ', sngl(sizeval),'MB'
     write(IMAIN,*) '                                         = ', sngl(sizeval/1024.d0),'GB'
@@ -1777,11 +1833,11 @@ subroutine setup_receivers_precompute_intp()
 
     ! allocates seismogram array
     if (SIMULATION_TYPE == 1 .or. SIMULATION_TYPE == 3) then
-      allocate(seismograms(NDIM,nrec_local,NTSTEP_BETWEEN_OUTPUT_SEISMOS),stat=ier)
+      allocate(seismograms(NDIM,nrec_local,nlength_seismogram),stat=ier)
       if (ier /= 0) stop 'Error while allocating seismograms'
     else
       ! adjoint seismograms
-      allocate(seismograms(NDIM*NDIM,nrec_local,NTSTEP_BETWEEN_OUTPUT_SEISMOS),stat=ier)
+      allocate(seismograms(NDIM*NDIM,nrec_local,nlength_seismogram),stat=ier)
       if (ier /= 0) stop 'Error while allocating adjoint seismograms'
 
       ! allocates Frechet derivatives array
@@ -1796,15 +1852,15 @@ subroutine setup_receivers_precompute_intp()
       stshift_der(:) = 0._CUSTOM_REAL
       shdur_der(:) = 0._CUSTOM_REAL
     endif
+
     ! initializes seismograms
     seismograms(:,:,:) = 0._CUSTOM_REAL
-    ! adjoint seismograms
-    it_adj_written = 0
+
   else
     ! dummy arrays
     ! allocates dummy array since we need it to pass as argument e.g. in write_seismograms() routine
     ! note: nrec_local is zero, Fortran 90/95 should allow zero-sized array allocation...
-    allocate(seismograms(NDIM,0,NTSTEP_BETWEEN_OUTPUT_SEISMOS),stat=ier)
+    allocate(seismograms(NDIM,0,nlength_seismogram),stat=ier)
     if (ier /= 0) stop 'Error while allocating zero seismograms'
     ! dummy allocation
     allocate(hxir_store(1,1), &
@@ -1816,7 +1872,7 @@ subroutine setup_receivers_precompute_intp()
   ! strain seismograms
   if (SAVE_SEISMOGRAMS_STRAIN) then
     if (nrec_local > 0) then
-      allocate(seismograms_eps(6,nrec_local,NTSTEP_BETWEEN_OUTPUT_SEISMOS),stat=ier)
+      allocate(seismograms_eps(6,nrec_local,nlength_seismogram),stat=ier)
       if (ier /= 0) stop 'Error while allocating strain seismograms'
       seismograms_eps(:,:,:) = 0._CUSTOM_REAL
     else
@@ -1928,7 +1984,7 @@ subroutine setup_receivers_precompute_intp()
 
   ! ASDF seismograms
   if (OUTPUT_SEISMOS_ASDF) then
-    if (.not. (SIMULATION_TYPE == 3 .and. (.not. SAVE_SEISMOGRAMS_IN_ADJOINT_RUN)) ) then
+    if (do_save_seismograms) then
       ! initializes the ASDF data structure by allocating arrays
       call init_asdf_data(nrec_local)
       call synchronize_all()
diff --git a/src/specfem3D/specfem3D_par.F90 b/src/specfem3D/specfem3D_par.F90
index d52c25681..b1b6e73aa 100644
--- a/src/specfem3D/specfem3D_par.F90
+++ b/src/specfem3D/specfem3D_par.F90
@@ -352,16 +352,15 @@ module specfem_par
   !-----------------------------------------------------------------
 
   ! seismograms
-  integer :: it_begin,it_end
   real(kind=CUSTOM_REAL), dimension(:,:,:), allocatable :: seismograms
+
+  integer :: nlength_seismogram
   integer :: seismo_offset, seismo_current
+  logical :: do_save_seismograms
 
   ! strain seismograms
   real(kind=CUSTOM_REAL), dimension(:,:,:), allocatable :: seismograms_eps
 
-  ! adjoint seismograms
-  integer :: it_adj_written
-
   ! for SAC headers for seismograms
   integer :: yr_SAC,jda_SAC,mo_SAC,da_SAC,ho_SAC,mi_SAC
   double precision :: sec_SAC
@@ -469,6 +468,7 @@ module specfem_par
   !-----------------------------------------------------------------
 
   integer :: it
+  integer :: it_begin,it_end
 
   ! non-dimensionalization
   double precision :: scale_t,scale_t_inv,scale_displ,scale_displ_inv,scale_veloc
diff --git a/src/specfem3D/write_output_ASCII.f90 b/src/specfem3D/write_output_ASCII.f90
index b7299bc21..96622b2cf 100644
--- a/src/specfem3D/write_output_ASCII.f90
+++ b/src/specfem3D/write_output_ASCII.f90
@@ -37,29 +37,27 @@ subroutine write_output_ASCII(seismogram_tmp,iorientation,sisname,sisname_big_fi
 
   use specfem_par, only: &
     DT,t0,NSTEP, &
-    seismo_offset,seismo_current, NTSTEP_BETWEEN_OUTPUT_SAMPLE, &
-    NTSTEP_BETWEEN_OUTPUT_SEISMOS,OUTPUT_FILES,SIMULATION_TYPE, &
+    seismo_offset,seismo_current, &
+    NTSTEP_BETWEEN_OUTPUT_SAMPLE, &
+    nlength_seismogram, &
+    OUTPUT_FILES,SIMULATION_TYPE, &
     SAVE_ALL_SEISMOS_IN_ONE_FILE,USE_BINARY_FOR_LARGE_FILE, &
     myrank
 
   implicit none
 
-  real(kind=CUSTOM_REAL), dimension(5,NTSTEP_BETWEEN_OUTPUT_SEISMOS) :: seismogram_tmp
+  real(kind=CUSTOM_REAL), dimension(5,nlength_seismogram) :: seismogram_tmp
 
   integer :: iorientation
   character(len=MAX_STRING_LEN) :: sisname,sisname_big_file
 
   ! local parameters
-  integer :: it
+  integer :: it_current
   integer :: ier,isample
-  integer :: seismo_current_used
-  double precision :: value
-  double precision :: timeval
+  real(kind=CUSTOM_REAL) :: value,time_t
+  double precision :: time_t_db
   character(len=MAX_STRING_LEN) :: sisname_2
 
-  ! actual seismogram length
-  seismo_current_used = ceiling(real(seismo_current) / NTSTEP_BETWEEN_OUTPUT_SAMPLE)
-
   ! add .ascii extension to seismogram file name for ASCII seismograms
   write(sisname_2,"('/',a,'.ascii')") trim(sisname)
 
@@ -85,28 +83,31 @@ subroutine write_output_ASCII(seismogram_tmp,iorientation,sisname,sisname_big_fi
   endif
 
   ! subtract half duration of the source to make sure travel time is correct
-  do isample = 1,seismo_current_used
+  do isample = 1,seismo_current
 
     ! seismogram value
-    value = dble(seismogram_tmp(iorientation,isample))
+    value = seismogram_tmp(iorientation,isample)
 
     ! current time increment
-    it = seismo_offset + isample
+    it_current = seismo_offset + isample
 
     ! current time
     if (SIMULATION_TYPE == 3) then
-      timeval = dble((NSTEP-it)*NTSTEP_BETWEEN_OUTPUT_SAMPLE)*DT - t0
+      time_t_db = dble(NSTEP - it_current * NTSTEP_BETWEEN_OUTPUT_SAMPLE) * DT - t0
     else
-      timeval = dble((it-1)*NTSTEP_BETWEEN_OUTPUT_SAMPLE)*DT - t0
+      time_t_db = dble( (it_current-1) * NTSTEP_BETWEEN_OUTPUT_SAMPLE) * DT - t0
     endif
 
+    ! converts time to CUSTOM_REAL for output
+    time_t = real(time_t_db,kind=CUSTOM_REAL)
+
     ! writes out to file
     if (SAVE_ALL_SEISMOS_IN_ONE_FILE .and. USE_BINARY_FOR_LARGE_FILE) then
       ! distinguish between single and double precision for reals
-      write(IOUT) real(timeval, kind=CUSTOM_REAL), real(value, kind=CUSTOM_REAL)
+      write(IOUT) time_t,value
     else
       ! distinguish between single and double precision for reals
-      write(IOUT,*) real(timeval, kind=CUSTOM_REAL), ' ', real(value, kind=CUSTOM_REAL)
+      write(IOUT,*) time_t,value
     endif
   enddo
 
diff --git a/src/specfem3D/write_output_ASDF.f90 b/src/specfem3D/write_output_ASDF.f90
index fbe8bf419..7c784d7b3 100644
--- a/src/specfem3D/write_output_ASDF.f90
+++ b/src/specfem3D/write_output_ASDF.f90
@@ -29,8 +29,8 @@
 !! \param nrec_local The number of receivers on the local processor
   subroutine init_asdf_data(nrec_local)
 
-  use constants, only: NDIM
-  use specfem_par, only: myrank,NSTEP,NTSTEP_BETWEEN_OUTPUT_SEISMOS
+  use constants, only: NDIM,myrank
+  use specfem_par, only: NSTEP,NTSTEP_BETWEEN_OUTPUT_SEISMOS
 
   use asdf_data, only: asdf_container
 
@@ -94,10 +94,11 @@ subroutine store_asdf_data(seismogram_tmp, irec_local, irec, chn, iorientation)
   use constants, only: CUSTOM_REAL,NDIM,myrank
 
   use specfem_par, only: &
+    seismo_current, &
+    nlength_seismogram, &
     station_name,network_name, &
-    seismo_current, NTSTEP_BETWEEN_OUTPUT_SEISMOS, myrank, &
     stlat, stlon, stele, stbur, &
-    ROTATE_SEISMOGRAMS_RT,NTSTEP_BETWEEN_OUTPUT_SAMPLE
+    ROTATE_SEISMOGRAMS_RT
 
   use asdf_data, only: asdf_container
 
@@ -106,17 +107,13 @@ subroutine store_asdf_data(seismogram_tmp, irec_local, irec, chn, iorientation)
   ! Parameters
   character(len=4),intent(in) :: chn
   integer,intent(in) :: irec_local, irec, iorientation
-  real(kind=CUSTOM_REAL),dimension(5,NTSTEP_BETWEEN_OUTPUT_SEISMOS),intent(in) :: seismogram_tmp
+  real(kind=CUSTOM_REAL),dimension(5,nlength_seismogram),intent(in) :: seismogram_tmp
 
   ! local Variables
   integer :: length_station_name, length_network_name
   integer :: ier, i, index_increment
-  integer :: seismo_current_used
-
-  ! actual seismogram length
-  seismo_current_used = ceiling(real(seismo_current) / NTSTEP_BETWEEN_OUTPUT_SAMPLE)
 
-  ! BS BS: In case that components are rotated to radial and transverse, we need to
+  ! In case that components are rotated to radial and transverse, we need to
   ! change the way the trace index is incremented below. Otherwise, the code crashes
   ! in close_asdf_data() when trying to deallocate the records in asdf_container
   if (ROTATE_SEISMOGRAMS_RT) then
@@ -140,14 +137,14 @@ subroutine store_asdf_data(seismogram_tmp, irec_local, irec, chn, iorientation)
   asdf_container%receiver_el(irec_local) = stele(irec)
   asdf_container%receiver_dpt(irec_local) = stbur(irec)
 
-  allocate(asdf_container%records(i)%record(seismo_current_used),stat=ier)
+  allocate(asdf_container%records(i)%record(seismo_current),stat=ier)
   if (ier /= 0) call exit_MPI (myrank, 'Allocating ASDF container failed.')
 
   ! initializes
   asdf_container%records(i)%record(:) = 0.0
 
   ! seismogram as real data
-  asdf_container%records(i)%record(1:seismo_current_used) = real(seismogram_tmp(iorientation, 1:seismo_current_used),kind=4)
+  asdf_container%records(i)%record(1:seismo_current) = real(seismogram_tmp(iorientation, 1:seismo_current),kind=4)
 
   end subroutine store_asdf_data
 
@@ -210,7 +207,6 @@ subroutine write_asdf()
   integer :: num_stations
   integer :: stationxml_length
   integer :: nsamples  ! constant, as in SPECFEM
-  integer :: seismo_current_used
   double precision :: sampling_rate
   double precision :: startTime
   integer(kind=8) :: start_time
@@ -276,13 +272,10 @@ subroutine write_asdf()
   call world_duplicate(comm)
   call world_size(mysize)
 
-  ! actual seismogram length
-  seismo_current_used = ceiling(real(seismo_current) / NTSTEP_BETWEEN_OUTPUT_SAMPLE)
-
   num_stations = asdf_container%nrec_local
   sampling_rate = 1.0/(DT*NTSTEP_BETWEEN_OUTPUT_SAMPLE)
 
-  ! BS BS: The total number of samples to be written to the ASDF file should be NSTEP.
+  ! note:  The total number of samples to be written to the ASDF file should be NSTEP.
   !        seismo_current is equivalent to NTSTEP_BETWEEN_OUTPUT_SEISMOS (except when it==it_end), as only in this case
   !        the routine is called from write_seismograms()
   !nsamples = seismo_current * (NSTEP / NTSTEP_BETWEEN_OUTPUT_SEISMOS)
@@ -440,13 +433,13 @@ subroutine write_asdf()
   enddo
 
   call all_gather_all_ch(networks_names, &
-                        num_stations * MAX_LENGTH_NETWORK_NAME, &
-                        network_names_gather, &
-                        rcounts, &
-                        displs, &
-                        max_num_stations_gather, &
-                        MAX_LENGTH_NETWORK_NAME, &
-                        mysize)
+                         num_stations * MAX_LENGTH_NETWORK_NAME, &
+                         network_names_gather, &
+                         rcounts, &
+                         displs, &
+                         max_num_stations_gather, &
+                         MAX_LENGTH_NETWORK_NAME, &
+                         mysize)
 
   do i = 1, mysize
     displs(i) = (i-1) * max_num_stations_gather * NDIM
@@ -454,13 +447,13 @@ subroutine write_asdf()
   enddo
 
   call all_gather_all_ch(component_names, &
-                        num_stations*NDIM*NDIM, &
-                        component_names_gather, &
-                        rcounts*NDIM, &
-                        displs*NDIM, &
-                        max_num_stations_gather*NDIM, &
-                        NDIM, &
-                        mysize)
+                         num_stations*NDIM*NDIM, &
+                         component_names_gather, &
+                         rcounts*NDIM, &
+                         displs*NDIM, &
+                         max_num_stations_gather*NDIM, &
+                         NDIM, &
+                         mysize)
 
   ! Now gather all the coordiante information for these stations
   do i = 1, mysize
@@ -503,7 +496,7 @@ subroutine write_asdf()
   deallocate(displs)
   deallocate(rcounts)
 
-  allocate(one_seismogram(NDIM,seismo_current_used),stat=ier)
+  allocate(one_seismogram(NDIM,seismo_current),stat=ier)
   if (ier /= 0) call exit_MPI(myrank,'error allocating one_seismogram array')
   one_seismogram(:,:) = 0.0
 
@@ -515,7 +508,7 @@ subroutine write_asdf()
   if (seismo_offset == 0) then
     if (myrank == 0) then
       ! user output
-      write(IMAIN,*) 'creating ASDF file: ',trim(OUTPUT_FILES) // "synthetic.h5"
+      write(IMAIN,*) '  seismogram ASDF file: ',trim(OUTPUT_FILES) // "synthetic.h5"
       call flush_IMAIN()
 
       ! initialize HDF5
@@ -669,7 +662,7 @@ subroutine write_asdf()
           if (myrank == 0) then
             do i = 1, NDIM
               !write(*,*) j, l, l+i, size(asdf_container%records)
-              one_seismogram(i,:) = asdf_container%records(l+i)%record(1:seismo_current_used)
+              one_seismogram(i,:) = asdf_container%records(l+i)%record(1:seismo_current)
             enddo
           endif
 
@@ -678,14 +671,14 @@ subroutine write_asdf()
           if (myrank == current_proc) then
             !one_seismogram(:,:) = seismograms(:,j,:)
             do i = 1, NDIM
-              one_seismogram(i,:) = asdf_container%records(l+i)%record(1:seismo_current_used)
+              one_seismogram(i,:) = asdf_container%records(l+i)%record(1:seismo_current)
             enddo
             ! send (real) data
-            call send_r(one_seismogram,NDIM*seismo_current_used,receiver,itag)
+            call send_r(one_seismogram,NDIM*seismo_current,receiver,itag)
 
           else if (myrank == 0) then
             ! receive (real) data
-            call recv_r(one_seismogram,NDIM*seismo_current_used,sender,itag)
+            call recv_r(one_seismogram,NDIM*seismo_current,sender,itag)
 
           endif
         endif
@@ -714,7 +707,7 @@ subroutine write_asdf()
 
             ! writes (float) data
             call ASDF_write_partial_waveform_f(data_ids(i), &
-                                               one_seismogram(i,1:seismo_current_used), seismo_offset, seismo_current_used, ier)
+                                               one_seismogram(i,1:seismo_current), seismo_offset, seismo_current, ier)
             if (ier /= 0) call exit_MPI(myrank,'Error ASDF write partial waveform failed')
 
             call ASDF_close_dataset_f(data_ids(i), ier)
@@ -793,11 +786,11 @@ subroutine write_asdf()
                                     data_ids(i))
 
           if (myrank == current_proc) then
-            one_seismogram(i,:) = asdf_container%records(l+i)%record(1:seismo_current_used)
+            one_seismogram(i,:) = asdf_container%records(l+i)%record(1:seismo_current)
 
             ! writes (float) data
             call ASDF_write_partial_waveform_f(data_ids(i), &
-                                               one_seismogram(i,1:seismo_current_used), seismo_offset, seismo_current_used, ier)
+                                               one_seismogram(i,1:seismo_current), seismo_offset, seismo_current, ier)
             if (ier /= 0) call exit_MPI(myrank,'Error ASDF parallel write partial waveform failed')
           endif
 
diff --git a/src/specfem3D/write_output_SAC.f90 b/src/specfem3D/write_output_SAC.f90
index 09c86e10c..e863774f8 100644
--- a/src/specfem3D/write_output_SAC.f90
+++ b/src/specfem3D/write_output_SAC.f90
@@ -39,8 +39,9 @@ subroutine write_output_SAC(seismogram_tmp,irec,iorientation,sisname,chn,phi)
           DT,t0, &
           seismo_offset,seismo_current,it_end, &
           OUTPUT_SEISMOS_SAC_ALPHANUM,OUTPUT_SEISMOS_SAC_BINARY, &
-          NTSTEP_BETWEEN_OUTPUT_SEISMOS, &
-          MODEL,OUTPUT_FILES,NTSTEP_BETWEEN_OUTPUT_SAMPLE
+          MODEL,OUTPUT_FILES, &
+          NTSTEP_BETWEEN_OUTPUT_SAMPLE, &
+          nlength_seismogram
 
   use specfem_par, only: &
           yr => yr_SAC,jda => jda_SAC,ho => ho_SAC,mi => mi_SAC,sec => sec_SAC, &
@@ -51,7 +52,7 @@ subroutine write_output_SAC(seismogram_tmp,irec,iorientation,sisname,chn,phi)
 
   implicit none
 
-  real(kind=CUSTOM_REAL), dimension(5,NTSTEP_BETWEEN_OUTPUT_SEISMOS) :: seismogram_tmp
+  real(kind=CUSTOM_REAL), dimension(5,nlength_seismogram) :: seismogram_tmp
 
   integer :: irec
   integer :: iorientation
@@ -63,7 +64,7 @@ subroutine write_output_SAC(seismogram_tmp,irec,iorientation,sisname,chn,phi)
 
   ! local parameters
   double precision :: btime
-  real, dimension(NTSTEP_BETWEEN_OUTPUT_SEISMOS) :: tmp
+  real, dimension(nlength_seismogram) :: tmp
   integer :: time_sec,isample
   character(len=MAX_STRING_LEN) :: sisname_2
 
@@ -116,7 +117,6 @@ subroutine write_output_SAC(seismogram_tmp,irec,iorientation,sisname,chn,phi)
   logical, external :: is_leap_year
 
   integer :: imodulo_5
-  integer :: seismo_current_used
 
   !----------------------------------------------------------------
 
@@ -219,9 +219,6 @@ subroutine write_output_SAC(seismogram_tmp,irec,iorientation,sisname,chn,phi)
   !USER2  = sngl(depth)
   !USER3  = sngl(cmt_hdur) !half duration from CMT if not changed to t0 = 0.d0 (point source)
 
-  ! actual seismogram length
-  seismo_current_used = ceiling(real(seismo_current) / NTSTEP_BETWEEN_OUTPUT_SAMPLE)
-
   ! to avoid compiler warnings
   value1 = elat
   value1 = elon
@@ -451,10 +448,10 @@ subroutine write_output_SAC(seismogram_tmp,irec,iorientation,sisname,chn,phi)
 
     ! now write data - with five values per row:
     ! ---------------
-    imodulo_5 = mod(seismo_current_used,5)
+    imodulo_5 = mod(seismo_current,5)
     if (imodulo_5 == 0) then
       ! five values per row
-      do isample = 1+5,seismo_current_used+1,5
+      do isample = 1+5,seismo_current+1,5
         value1 = dble(seismogram_tmp(iorientation,isample-5))
         value2 = dble(seismogram_tmp(iorientation,isample-4))
         value3 = dble(seismogram_tmp(iorientation,isample-3))
@@ -464,7 +461,7 @@ subroutine write_output_SAC(seismogram_tmp,irec,iorientation,sisname,chn,phi)
       enddo
     else
       ! five values per row as long as possible
-      do isample = 1+5,(seismo_current_used-imodulo_5)+1,5
+      do isample = 1+5,(seismo_current-imodulo_5)+1,5
         value1 = dble(seismogram_tmp(iorientation,isample-5))
         value2 = dble(seismogram_tmp(iorientation,isample-4))
         value3 = dble(seismogram_tmp(iorientation,isample-3))
@@ -473,10 +470,10 @@ subroutine write_output_SAC(seismogram_tmp,irec,iorientation,sisname,chn,phi)
         write(IOUT_SAC,510) sngl(value1),sngl(value2),sngl(value3),sngl(value4),sngl(value5)
       enddo
       ! loads remaining values
-      if (imodulo_5 >= 1) value1 = dble(seismogram_tmp(iorientation,seismo_current_used-imodulo_5+1))
-      if (imodulo_5 >= 2) value2 = dble(seismogram_tmp(iorientation,seismo_current_used-imodulo_5+2))
-      if (imodulo_5 >= 3) value3 = dble(seismogram_tmp(iorientation,seismo_current_used-imodulo_5+3))
-      if (imodulo_5 >= 4) value4 = dble(seismogram_tmp(iorientation,seismo_current_used-imodulo_5+4))
+      if (imodulo_5 >= 1) value1 = dble(seismogram_tmp(iorientation,seismo_current-imodulo_5+1))
+      if (imodulo_5 >= 2) value2 = dble(seismogram_tmp(iorientation,seismo_current-imodulo_5+2))
+      if (imodulo_5 >= 3) value3 = dble(seismogram_tmp(iorientation,seismo_current-imodulo_5+3))
+      if (imodulo_5 >= 4) value4 = dble(seismogram_tmp(iorientation,seismo_current-imodulo_5+4))
       ! writes out last data line
       select case(imodulo_5)
       case (1)
@@ -626,7 +623,6 @@ subroutine write_output_SAC(seismogram_tmp,irec,iorientation,sisname,chn,phi)
       call write_integer(LCALDA)        !(109)
       call write_integer(int(UNUSED))   !(110)
 
-
       ! write character header variables 111:302
       call write_character(KSTNM,8)         !(111:118)
       call write_character(KEVNM,16)         !(119:134)
@@ -655,10 +651,10 @@ subroutine write_output_SAC(seismogram_tmp,irec,iorientation,sisname,chn,phi)
     endif
 
     ! now write SAC time series to file
-    ! BS BS write whole time series at once (hope to increase I/O performance
-    ! compared to using a loop on it)
-    tmp(1:seismo_current_used) = real(seismogram_tmp(iorientation,1:seismo_current_used))
-    call write_n_real(tmp(1:seismo_current_used),seismo_current_used)
+    ! write whole time series at once (hope to increase I/O performance compared to using a loop on it)
+    tmp(1:seismo_current) = real(seismogram_tmp(iorientation,1:seismo_current))
+
+    call write_n_real(tmp(1:seismo_current),seismo_current)
 
     call close_file()
 
diff --git a/src/specfem3D/write_seismograms.f90 b/src/specfem3D/write_seismograms.f90
index 4f7e7d52e..f42ecbbc5 100644
--- a/src/specfem3D/write_seismograms.f90
+++ b/src/specfem3D/write_seismograms.f90
@@ -33,10 +33,16 @@ subroutine write_seismograms()
 
   use specfem_par, only: myrank,Mesh_pointer,GPU_MODE,GPU_ASYNC_COPY,SIMULATION_TYPE, &
     nrec_local,number_receiver_global,ispec_selected_rec,ispec_selected_source, &
-    it,it_begin,it_end,seismo_current,seismo_offset, seismograms,NTSTEP_BETWEEN_OUTPUT_SEISMOS, &
+    it,it_end, &
+    seismo_current,seismo_offset, &
+    seismograms, &
+    nlength_seismogram, &
+    NTSTEP_BETWEEN_OUTPUT_SEISMOS,NTSTEP_BETWEEN_OUTPUT_SAMPLE, &
+    do_save_seismograms, &
     WRITE_SEISMOGRAMS_BY_MAIN,OUTPUT_SEISMOS_ASDF, &
     SAVE_SEISMOGRAMS_IN_ADJOINT_RUN,SAVE_SEISMOGRAMS_STRAIN, &
-    it_adj_written,moment_der,sloc_der,shdur_der,stshift_der,scale_displ,NSTEP
+    moment_der,sloc_der,shdur_der,stshift_der, &
+    scale_displ
 
   use specfem_par_crustmantle, only: displ_crust_mantle,b_displ_crust_mantle, &
     eps_trace_over_3_crust_mantle,epsilondev_xx_crust_mantle,epsilondev_xy_crust_mantle,epsilondev_xz_crust_mantle, &
@@ -50,63 +56,68 @@ subroutine write_seismograms()
   double precision, external :: wtime
   double precision :: write_time_begin,write_time
 
-  ! update position in seismograms
-  seismo_current = seismo_current + 1
-
-  ! compute & store the seismograms only if there is at least one receiver located in this slice
-  if (nrec_local > 0) then
-
-    ! gets resulting array values onto CPU
-    if (GPU_MODE) then
-      ! for forward and kernel simulations, seismograms are computed by the GPU, thus no need to transfer the wavefield
-      ! gets field values from GPU
-      if (SIMULATION_TYPE == 2 .or. SAVE_SEISMOGRAMS_STRAIN) then
-        ! this transfers fields only in elements with stations for efficiency
-        call write_seismograms_transfer_gpu(Mesh_pointer, &
-                                            displ_crust_mantle,b_displ_crust_mantle, &
-                                            eps_trace_over_3_crust_mantle, &
-                                            epsilondev_xx_crust_mantle,epsilondev_yy_crust_mantle,epsilondev_xy_crust_mantle, &
-                                            epsilondev_xz_crust_mantle,epsilondev_yz_crust_mantle, &
-                                            number_receiver_global, &
-                                            ispec_selected_rec,ispec_selected_source, &
-                                            ibool_crust_mantle)
-        ! synchronizes field values from GPU
-        if (GPU_ASYNC_COPY) then
-          call transfer_seismo_from_device_async(Mesh_pointer, &
-                                                 displ_crust_mantle,b_displ_crust_mantle, &
-                                                 number_receiver_global,ispec_selected_rec,ispec_selected_source, &
-                                                 ibool_crust_mantle)
+  ! checks if anything to do
+  if (.not. do_save_seismograms) return
+
+  ! checks subsampling recurrence
+  if (mod(it-1,NTSTEP_BETWEEN_OUTPUT_SAMPLE) == 0) then
+
+    ! update position in seismograms
+    seismo_current = seismo_current + 1
+
+    ! check for edge effects
+    if (seismo_current < 1 .or. seismo_current > nlength_seismogram) &
+      call exit_mpi(myrank,'Error: seismo_current out of bounds in recording of seismograms')
+
+    ! compute & store the seismograms only if there is at least one receiver located in this slice
+    if (nrec_local > 0) then
+      ! gets resulting array values onto CPU
+      if (GPU_MODE) then
+        ! for forward and kernel simulations, seismograms are computed by the GPU, thus no need to transfer the wavefield
+        ! gets field values from GPU
+        if (SIMULATION_TYPE == 2 .or. SAVE_SEISMOGRAMS_STRAIN) then
+          ! this transfers fields only in elements with stations for efficiency
+          call write_seismograms_transfer_gpu(Mesh_pointer, &
+                                              displ_crust_mantle,b_displ_crust_mantle, &
+                                              eps_trace_over_3_crust_mantle, &
+                                              epsilondev_xx_crust_mantle,epsilondev_yy_crust_mantle,epsilondev_xy_crust_mantle, &
+                                              epsilondev_xz_crust_mantle,epsilondev_yz_crust_mantle, &
+                                              number_receiver_global, &
+                                              ispec_selected_rec,ispec_selected_source, &
+                                              ibool_crust_mantle)
+          ! synchronizes field values from GPU
+          if (GPU_ASYNC_COPY) then
+            call transfer_seismo_from_device_async(Mesh_pointer, &
+                                                   displ_crust_mantle,b_displ_crust_mantle, &
+                                                   number_receiver_global,ispec_selected_rec,ispec_selected_source, &
+                                                   ibool_crust_mantle)
+          endif
         endif
-      endif
-    endif ! GPU_MODE
+      endif ! GPU_MODE
 
-    ! computes traces at interpolated receiver locations
-    select case (SIMULATION_TYPE)
-    case (1)
-      ! forward run
-      if (.not. GPU_MODE) then
-        ! on CPU
-        call compute_seismograms(NGLOB_CRUST_MANTLE,displ_crust_mantle,seismo_current,seismograms)
-      else
-        ! on GPU
-        call compute_seismograms_gpu(Mesh_pointer,seismograms,seismo_current,it,it_end, &
-                                     scale_displ,NTSTEP_BETWEEN_OUTPUT_SEISMOS,NSTEP)
-
-      endif
-    case (2)
-      ! adjoint run
-      call compute_seismograms_adjoint(displ_crust_mantle, &
-                                       eps_trace_over_3_crust_mantle, &
-                                       epsilondev_xx_crust_mantle,epsilondev_yy_crust_mantle,epsilondev_xy_crust_mantle, &
-                                       epsilondev_xz_crust_mantle,epsilondev_yz_crust_mantle, &
-                                       it_adj_written, &
-                                       moment_der,sloc_der,stshift_der,shdur_der, &
-                                       seismograms)
-    case (3)
-      ! kernel run
-      if (.not. GPU_MODE) then
-        ! on CPU
-        if (.not. ( SIMULATION_TYPE == 3 .and. (.not. SAVE_SEISMOGRAMS_IN_ADJOINT_RUN)) ) then
+      ! computes traces at interpolated receiver locations
+      select case (SIMULATION_TYPE)
+      case (1)
+        ! forward run
+        if (.not. GPU_MODE) then
+          ! on CPU
+          call compute_seismograms(NGLOB_CRUST_MANTLE,displ_crust_mantle,seismo_current,seismograms)
+        else
+          ! on GPU
+          call compute_seismograms_gpu(Mesh_pointer,seismograms,seismo_current,it,it_end,scale_displ,nlength_seismogram)
+        endif
+      case (2)
+        ! adjoint run
+        call compute_seismograms_adjoint(displ_crust_mantle, &
+                                         eps_trace_over_3_crust_mantle, &
+                                         epsilondev_xx_crust_mantle,epsilondev_yy_crust_mantle,epsilondev_xy_crust_mantle, &
+                                         epsilondev_xz_crust_mantle,epsilondev_yz_crust_mantle, &
+                                         moment_der,sloc_der,stshift_der,shdur_der, &
+                                         seismograms)
+      case (3)
+        ! kernel run
+        if (.not. GPU_MODE) then
+          ! on CPU
           if (OUTPUT_ADJOINT_WAVEFIELD_SEISMOGRAMS) then
             ! uncomment to output adjoint wavefield instead for seismogram output
             call compute_seismograms(NGLOB_CRUST_MANTLE_ADJOINT,displ_crust_mantle,seismo_current,seismograms)
@@ -114,32 +125,43 @@ subroutine write_seismograms()
             ! default, backward reconstructed wavefield seismos
             call compute_seismograms(NGLOB_CRUST_MANTLE_ADJOINT,b_displ_crust_mantle,seismo_current,seismograms)
           endif
+        else
+          ! on GPU
+          call compute_seismograms_gpu(Mesh_pointer,seismograms,seismo_current,it,it_end,scale_displ,nlength_seismogram)
         endif
-      else
-        ! on GPU
-        if (.not. ( SIMULATION_TYPE == 3 .and. (.not. SAVE_SEISMOGRAMS_IN_ADJOINT_RUN)) ) then
-          call compute_seismograms_gpu(Mesh_pointer,seismograms,seismo_current,it,it_end, &
-                                       scale_displ,NTSTEP_BETWEEN_OUTPUT_SEISMOS,NSTEP)
-        endif
-      endif
-    end select
-
-    ! strain seismograms
-    if (SAVE_SEISMOGRAMS_STRAIN) then
-      select case (SIMULATION_TYPE)
-      case (1)
-        ! forward run
-        call compute_seismograms_strain(NGLOB_CRUST_MANTLE,displ_crust_mantle)
-      case (3)
-        ! kernel run
-        call compute_seismograms_strain(NGLOB_CRUST_MANTLE,b_displ_crust_mantle)
       end select
-    endif
 
-  endif ! nrec_local
+      ! strain seismograms
+      if (SAVE_SEISMOGRAMS_STRAIN) then
+        select case (SIMULATION_TYPE)
+        case (1)
+          ! forward run
+          call compute_seismograms_strain(NGLOB_CRUST_MANTLE,displ_crust_mantle)
+        case (3)
+          ! kernel run
+          call compute_seismograms_strain(NGLOB_CRUST_MANTLE,b_displ_crust_mantle)
+        end select
+      endif
+
+    endif ! nrec_local
+  endif
 
   ! write the current or final seismograms
-  if (seismo_current == NTSTEP_BETWEEN_OUTPUT_SEISMOS .or. it == it_end) then
+  !
+  ! for example:
+  !   NTSTEP_BETWEEN_OUTPUT_SEISMOS = 10 , NTSTEP_BETWEEN_OUTPUT_SAMPLE = 5
+  !       -> 10 / 5 == 2 steps (nlength_seismogram)
+  !          we will store samples at it==1,it==6 and output at it==10
+  !   NTSTEP_BETWEEN_OUTPUT_SEISMOS = 10 , NTSTEP_BETWEEN_OUTPUT_SAMPLE = 1
+  !       -> 10 / 1 == 10 steps
+  !          we will store samples at it==1,2,3,..,10 and output at it==10
+  !
+  !   that is for NTSTEP_BETWEEN_OUTPUT_SEISMOS = 10 , NTSTEP_BETWEEN_OUTPUT_SAMPLE = 5,
+  !   the seismograms have samples for the wavefield at mod((it-1),NTSTEP_BETWEEN_OUTPUT_SAMPLE) == 0,
+  !   which is at it==1,it==6. for writing the seismograms however,
+  !   we would write out at mod(it,NTSTEP_BETWEEN_OUTPUT_SEISMOS) == 0, which is at it==10
+  !
+  if (mod(it,NTSTEP_BETWEEN_OUTPUT_SEISMOS) == 0 .or. it == it_end) then
     ! timing
     write_time_begin = wtime()
 
@@ -158,7 +180,6 @@ subroutine write_seismograms()
       case (2)
         ! adjoint wavefield
         call write_adj_seismograms()
-        it_adj_written = it
       end select
     endif
 
@@ -174,13 +195,11 @@ subroutine write_seismograms()
       ! timing
       write_time = wtime() - write_time_begin
       ! output
-      write(IMAIN,*)
-      write(IMAIN,*) 'Total number of time steps written: ', it-it_begin+1
-      write(IMAIN,*)
+      write(IMAIN,*) 'Total number of time steps written: ', seismo_offset
       if (WRITE_SEISMOGRAMS_BY_MAIN) then
-        write(IMAIN,*) 'Writing the seismograms by main proc alone took ',write_time,' seconds'
+        write(IMAIN,*) 'Writing the seismograms by main proc alone took ',sngl(write_time),' seconds'
       else
-        write(IMAIN,*) 'Writing the seismograms in parallel took ',write_time,' seconds'
+        write(IMAIN,*) 'Writing the seismograms in parallel took ',sngl(write_time),' seconds'
       endif
       write(IMAIN,*)
       call flush_IMAIN()
@@ -199,20 +218,21 @@ subroutine write_seismograms_to_file()
   use constants_solver, only: MAX_STRING_LEN,CUSTOM_REAL,NDIM,IMAIN,IOUT,itag
 
   use specfem_par, only: &
-          NPROCTOT_VAL,myrank,nrec,nrec_local, &
-          number_receiver_global,seismograms, &
-          islice_num_rec_local, &
-          seismo_offset,seismo_current, &
-          station_name,network_name, &
-          OUTPUT_SEISMOS_ASCII_TEXT, &
-          OUTPUT_SEISMOS_SAC_ALPHANUM,OUTPUT_SEISMOS_SAC_BINARY, &
-          OUTPUT_SEISMOS_ASDF, &
-          OUTPUT_SEISMOS_3D_ARRAY, &
-          NTSTEP_BETWEEN_OUTPUT_SEISMOS, &
-          SAVE_ALL_SEISMOS_IN_ONE_FILE,USE_BINARY_FOR_LARGE_FILE, &
-          OUTPUT_FILES, &
-          WRITE_SEISMOGRAMS_BY_MAIN, &
-          DT,NSTEP,NTSTEP_BETWEEN_OUTPUT_SAMPLE
+    NPROCTOT_VAL,myrank,nrec,nrec_local, &
+    number_receiver_global, &
+    seismograms, &
+    nlength_seismogram, &
+    seismo_offset,seismo_current, &
+    islice_num_rec_local, &
+    station_name,network_name, &
+    OUTPUT_SEISMOS_ASCII_TEXT, &
+    OUTPUT_SEISMOS_SAC_ALPHANUM,OUTPUT_SEISMOS_SAC_BINARY, &
+    OUTPUT_SEISMOS_ASDF, &
+    OUTPUT_SEISMOS_3D_ARRAY, &
+    SAVE_ALL_SEISMOS_IN_ONE_FILE,USE_BINARY_FOR_LARGE_FILE, &
+    OUTPUT_FILES, &
+    WRITE_SEISMOGRAMS_BY_MAIN, &
+    DT,NSTEP,NTSTEP_BETWEEN_OUTPUT_SAMPLE
 
   implicit none
 
@@ -225,7 +245,7 @@ subroutine write_seismograms_to_file()
   character(len=MAX_STRING_LEN) :: sisname,staname
 
   ! allocates single station seismogram
-  allocate(one_seismogram(NDIM,NTSTEP_BETWEEN_OUTPUT_SEISMOS),stat=ier)
+  allocate(one_seismogram(NDIM,nlength_seismogram),stat=ier)
   if (ier /= 0) call exit_mpi(myrank,'Error while allocating one temporary seismogram')
   one_seismogram(:,:) = 0.0_CUSTOM_REAL
 
@@ -237,7 +257,7 @@ subroutine write_seismograms_to_file()
       ! get global number of that receiver
       irec = number_receiver_global(irec_local)
 
-      one_seismogram(:,:) = seismograms(:,irec_local,::NTSTEP_BETWEEN_OUTPUT_SAMPLE)
+      one_seismogram(:,:) = seismograms(:,irec_local,:)
 
       ! write this seismogram
       ! note: ASDF data structure is given in module
@@ -342,7 +362,7 @@ subroutine write_seismograms_to_file()
         ! get global number of that receiver
         irec = number_receiver_global(irec_local)
 
-        one_seismogram(:,:) = seismograms(:,irec_local,::NTSTEP_BETWEEN_OUTPUT_SAMPLE)
+        one_seismogram(:,:) = seismograms(:,irec_local,:)
 
         ! write this seismogram
         ! note: ASDF data structure is given in module
@@ -391,40 +411,44 @@ subroutine write_seismograms_to_file()
         ! loop on all the slices
         do iproc = 0,NPROCTOT_VAL-1
 
-         ! communicates only with processes that contain local receivers (to minimize MPI chatter)
-         if (islice_num_rec_local(iproc) == 0 ) cycle
-
-         ! receive except from proc 0, which is me and therefore I already have this value
-         sender = iproc
-         if (iproc == 0) then
-           ! main is current slice
-           nrec_local_received = nrec_local
-         else
-           ! receives info from secondary processes
-           call recv_singlei(nrec_local_received,sender,itag)
-           if (nrec_local_received <= 0) call exit_MPI(myrank,'Error while receiving local number of receivers')
-         endif
-         if (nrec_local_received > 0) then
-           do irec_local = 1,nrec_local_received
-             ! receive except from proc 0, which is myself and therefore I already have these values
-             if (iproc == 0) then
-               ! get global number of that receiver
-               irec = number_receiver_global(irec_local)
-               one_seismogram(:,:) = seismograms(:,irec_local,::NTSTEP_BETWEEN_OUTPUT_SAMPLE)
-             else
-               ! receives info from secondary processes
-               call recv_singlei(irec,sender,itag)
-               if (irec < 1 .or. irec > nrec) call exit_MPI(myrank,'Error while receiving global receiver number')
-               call recv_cr(one_seismogram,NDIM*seismo_current,sender,itag)
-             endif
-
-             ! write this seismogram
-             call write_one_seismogram(one_seismogram,irec,irec_local,.false.)
-
-             ! counts seismos written
-             total_seismos = total_seismos + 1
-           enddo
-         endif
+          ! communicates only with processes that contain local receivers (to minimize MPI chatter)
+          if (islice_num_rec_local(iproc) == 0 ) cycle
+
+          ! receive except from proc 0, which is me and therefore I already have this value
+          sender = iproc
+          if (iproc == 0) then
+            ! main is current slice
+            nrec_local_received = nrec_local
+          else
+            ! receives info from secondary processes
+            call recv_singlei(nrec_local_received,sender,itag)
+            if (nrec_local_received <= 0) call exit_MPI(myrank,'Error while receiving local number of receivers')
+          endif
+
+          if (nrec_local_received > 0) then
+            do irec_local = 1,nrec_local_received
+              ! init trace
+              one_seismogram(:,:) = 0._CUSTOM_REAL
+
+              ! receive except from proc 0, which is myself and therefore I already have these values
+              if (iproc == 0) then
+                ! get global number of that receiver
+                irec = number_receiver_global(irec_local)
+                one_seismogram(:,:) = seismograms(:,irec_local,:)
+              else
+                ! receives info from secondary processes
+                call recv_singlei(irec,sender,itag)
+                if (irec < 1 .or. irec > nrec) call exit_MPI(myrank,'Error while receiving global receiver number')
+                call recv_cr(one_seismogram,NDIM*seismo_current,sender,itag)
+              endif
+
+              ! write this seismogram
+              call write_one_seismogram(one_seismogram,irec,irec_local,.false.)
+
+              ! counts seismos written
+              total_seismos = total_seismos + 1
+            enddo
+          endif
         enddo
 
       else
@@ -438,7 +462,7 @@ subroutine write_seismograms_to_file()
             irec = number_receiver_global(irec_local)
             call send_singlei(irec,receiver,itag)
 
-            one_seismogram(:,:) = seismograms(:,irec_local,::NTSTEP_BETWEEN_OUTPUT_SAMPLE)
+            one_seismogram(:,:) = seismograms(:,irec_local,:)
             call send_cr(one_seismogram,NDIM*seismo_current,receiver,itag)
           enddo
         endif
@@ -447,10 +471,10 @@ subroutine write_seismograms_to_file()
       ! only main process
       if (myrank == 0) then
         ! output info
-        write(IMAIN,*)
         write(IMAIN,*) 'Total number of receivers saved is ',total_seismos,' out of ',nrec
-        write(IMAIN,*)
+        call flush_IMAIN()
 
+        ! checks
         if (total_seismos /= nrec) call exit_MPI(myrank,'incorrect total number of receivers saved')
 
         ! create one large file instead of one small file per station to avoid file system overload
@@ -475,25 +499,25 @@ subroutine write_one_seismogram(one_seismogram,irec,irec_local,is_for_asdf)
     MAX_LENGTH_STATION_NAME,MAX_LENGTH_NETWORK_NAME
 
   use specfem_par, only: &
-          myrank, &
-          station_name,network_name,stlat,stlon, &
-          DT, &
-          seismo_current, &
-          OUTPUT_SEISMOS_ASCII_TEXT,OUTPUT_SEISMOS_SAC_ALPHANUM,OUTPUT_SEISMOS_ASDF, &
-          OUTPUT_SEISMOS_SAC_BINARY,ROTATE_SEISMOGRAMS_RT,NTSTEP_BETWEEN_OUTPUT_SEISMOS, &
-          NTSTEP_BETWEEN_OUTPUT_SAMPLE
+    myrank, &
+    station_name,network_name,stlat,stlon, &
+    DT, &
+    seismo_current, &
+    OUTPUT_SEISMOS_ASCII_TEXT,OUTPUT_SEISMOS_SAC_ALPHANUM,OUTPUT_SEISMOS_ASDF, &
+    OUTPUT_SEISMOS_SAC_BINARY,ROTATE_SEISMOGRAMS_RT, &
+    nlength_seismogram
 
   use specfem_par, only: &
-          cmt_lat => cmt_lat_SAC,cmt_lon => cmt_lon_SAC
+    cmt_lat => cmt_lat_SAC,cmt_lon => cmt_lon_SAC
 
   implicit none
 
   integer,intent(in) :: irec,irec_local
   logical,intent(in) :: is_for_asdf
-  real(kind=CUSTOM_REAL), dimension(NDIM,NTSTEP_BETWEEN_OUTPUT_SEISMOS),intent(in) :: one_seismogram
+  real(kind=CUSTOM_REAL), dimension(NDIM,nlength_seismogram),intent(in) :: one_seismogram
 
   ! local parameters
-  real(kind=CUSTOM_REAL), dimension(5,NTSTEP_BETWEEN_OUTPUT_SEISMOS) :: seismogram_tmp
+  real(kind=CUSTOM_REAL), dimension(5,nlength_seismogram) :: seismogram_tmp
   integer :: iorientation,length_station_name,length_network_name
 
   character(len=4) :: chn
@@ -507,11 +531,9 @@ subroutine write_one_seismogram(one_seismogram,irec,irec_local,is_for_asdf)
   double precision :: phi
   real(kind=CUSTOM_REAL) :: cphi,sphi
   integer :: isample
-  integer :: seismo_current_used
 
   ! initializes
   seismogram_tmp(:,:) = 0.0_CUSTOM_REAL
-  seismo_current_used = ceiling(real(seismo_current) / NTSTEP_BETWEEN_OUTPUT_SAMPLE)
 
   ! get band code
   call band_instrument_code(DT,bic)
@@ -524,7 +546,7 @@ subroutine write_one_seismogram(one_seismogram,irec,irec_local,is_for_asdf)
     ior_end   = 3    ! ending with Z => NEZ
   endif
 
-  do iorientation = ior_start,ior_end      ! BS BS changed according to ROTATE_SEISMOGRAMS_RT
+  do iorientation = ior_start,ior_end    ! changes according to ROTATE_SEISMOGRAMS_RT
 
     select case (iorientation)
     case (1)
@@ -546,9 +568,8 @@ subroutine write_one_seismogram(one_seismogram,irec,irec_local,is_for_asdf)
       call exit_MPI(myrank,'incorrect channel value')
     end select
 
-    if (iorientation == 4 .or. iorientation == 5) then        ! LMU BS BS
-
-      ! BS BS calculate backazimuth needed to rotate East and North
+    if (iorientation == 4 .or. iorientation == 5) then
+      ! calculate backazimuth needed to rotate East and North
       ! components to Radial and Transverse components
       ! (back-azimuth returned in degrees between [0,360])
       call get_backazimuth(cmt_lat,cmt_lon,stlat(irec),stlon(irec),backaz)
@@ -571,25 +592,24 @@ subroutine write_one_seismogram(one_seismogram,irec,irec_local,is_for_asdf)
       cphi = cos(phi*DEGREES_TO_RADIANS)
       sphi = sin(phi*DEGREES_TO_RADIANS)
 
-      ! BS BS do the rotation of the components and put result in
+      ! do the rotation of the components and put result in
       ! new variable seismogram_tmp
       if (iorientation == 4) then ! radial component
-         do isample = 1,seismo_current_used
-            seismogram_tmp(iorientation,isample) = &
+        do isample = 1,seismo_current
+          seismogram_tmp(iorientation,isample) = &
                cphi * one_seismogram(1,isample) + sphi * one_seismogram(2,isample)
-         enddo
+        enddo
       else if (iorientation == 5) then ! transverse component
-         do isample = 1,seismo_current_used
-            seismogram_tmp(iorientation,isample) = &
+        do isample = 1,seismo_current
+          seismogram_tmp(iorientation,isample) = &
             -1*sphi * one_seismogram(1,isample) + cphi * one_seismogram(2,isample)
-         enddo
+        enddo
       endif
-
-    else ! keep NEZ components
-      do isample = 1,seismo_current_used
+    else
+      ! keep NEZ components
+      do isample = 1,seismo_current
         seismogram_tmp(iorientation,isample) = one_seismogram(iorientation,isample)
       enddo
-
     endif
 
     ! create the name of the seismogram file for each slice
@@ -644,17 +664,17 @@ subroutine write_adj_seismograms()
 
   use constants, only: MAX_STRING_LEN,CUSTOM_REAL,IOUT
 
-  use specfem_par, only: NSTEP,NTSTEP_BETWEEN_OUTPUT_SEISMOS,NTSTEP_BETWEEN_OUTPUT_SAMPLE, &
+  use specfem_par, only: NTSTEP_BETWEEN_OUTPUT_SEISMOS,NTSTEP_BETWEEN_OUTPUT_SAMPLE, &
     DT,t0,OUTPUT_FILES, &
     seismograms,number_receiver_global,nrec_local, &
-    it,it_adj_written, &
+    it,seismo_current,seismo_offset, &
     myrank,WRITE_SEISMOGRAMS_BY_MAIN
 
   implicit none
 
   ! local parameters
   integer :: irec,irec_local,ier
-  integer :: iorientation,isample
+  integer :: iorientation,isample,it_tmp
   real(kind=CUSTOM_REAL) :: time_t
 
   character(len=4) :: chn
@@ -719,13 +739,17 @@ subroutine write_adj_seismograms()
       if (ier /= 0) call exit_mpi(myrank,'Error opening file: '//trim(OUTPUT_FILES)//trim(sisname))
 
       ! make sure we never write more than the maximum number of time steps
-      do isample = it_adj_written+1,min(it,NSTEP)
-        ! time
+      do isample = 1,seismo_current
+        ! current time
+        ! current time increment
+        it_tmp = seismo_offset + isample
+
         ! subtract onset time to make sure travel time is correct
         ! distinguish between single and double precision for reals
-        time_t = real(dble(isample-1)*DT*NTSTEP_BETWEEN_OUTPUT_SAMPLE - t0,kind=CUSTOM_REAL)
+        time_t = real(dble((it_tmp-1) * NTSTEP_BETWEEN_OUTPUT_SAMPLE) * DT - t0,kind=CUSTOM_REAL)
+
         ! output
-        write(IOUT,*) time_t,seismograms(iorientation,irec_local,isample-it_adj_written)
+        write(IOUT,*) time_t,seismograms(iorientation,irec_local,isample)
       enddo
 
       close(IOUT)