From 8b3ba603b1dfd73a3360647982ff0e72f3f685a8 Mon Sep 17 00:00:00 2001 From: Chaithya G R Date: Thu, 19 Sep 2024 22:35:17 +0200 Subject: [PATCH 01/32] WIP, setup --- include/cufinufft/impl.h | 15 +++++++++------ include/finufft_errors.h | 3 ++- src/cuda/1d/cufinufft1d.cu | 39 ++++++++++++++++++++++++-------------- src/cuda/2d/cufinufft2d.cu | 33 +++++++++++++++++++++----------- src/cuda/3d/cufinufft3d.cu | 38 +++++++++++++++++++++++-------------- src/cuda/cufinufft.cu | 2 +- src/cuda/spreadinterp.cpp | 2 +- 7 files changed, 84 insertions(+), 48 deletions(-) diff --git a/include/cufinufft/impl.h b/include/cufinufft/impl.h index 0913a404e..9942eb932 100644 --- a/include/cufinufft/impl.h +++ b/include/cufinufft/impl.h @@ -311,13 +311,16 @@ int cufinufft_makeplan_impl(int type, int dim, int *nmodes, int iflag, int ntran d_fseries_precomp_phase.begin()); thrust::copy(fseries_precomp_f, fseries_precomp_f + 3 * MAX_NQUAD, d_fseries_precomp_f.begin()); + if(!d_plan->opts.gpu_spreadinterponly) { // the full fseries is done on the GPU here - if ((ier = fseries_kernel_compute( - d_plan->dim, d_plan->nf1, d_plan->nf2, d_plan->nf3, - d_fseries_precomp_f.data().get(), d_fseries_precomp_phase.data().get(), - d_plan->fwkerhalf1, d_plan->fwkerhalf2, d_plan->fwkerhalf3, - d_plan->spopts.nspread, stream))) - goto finalize; + if ((ier = fseries_kernel_compute( + d_plan->dim, d_plan->nf1, d_plan->nf2, d_plan->nf3, + d_fseries_precomp_f.data().get(), d_fseries_precomp_phase.data().get(), + d_plan->fwkerhalf1, d_plan->fwkerhalf2, d_plan->fwkerhalf3, + d_plan->spopts.nspread, stream))) + goto finalize; + } + } finalize: if (ier > 1) { diff --git a/include/finufft_errors.h b/include/finufft_errors.h index 10465aa13..86158c219 100644 --- a/include/finufft_errors.h +++ b/include/finufft_errors.h @@ -25,6 +25,7 @@ enum { FINUFFT_ERR_INSUFFICIENT_SHMEM = 19, FINUFFT_ERR_NUM_NU_PTS_INVALID = 20, FINUFFT_ERR_INVALID_ARGUMENT = 21, - FINUFFT_ERR_LOCK_FUNS_INVALID = 22 + FINUFFT_ERR_LOCK_FUNS_INVALID = 22, + FINUFFT_ERR_SPREADONLY_UPSAMP_INVALID = 23, }; #endif diff --git a/src/cuda/1d/cufinufft1d.cu b/src/cuda/1d/cufinufft1d.cu index 06389ef75..3689b1b3b 100644 --- a/src/cuda/1d/cufinufft1d.cu +++ b/src/cuda/1d/cufinufft1d.cu @@ -43,8 +43,10 @@ int cufinufft1d1_exec(cuda_complex *d_c, cuda_complex *d_fk, d_fkstart = d_fk + i * d_plan->batchsize * d_plan->ms; d_plan->c = d_cstart; d_plan->fk = d_fkstart; - - // this is needed + if (d_plan->opts.gpu_spreadinterponly) + d_plan->fw = d_fkstart; + + // this is needed if ((ier = checkCudaErrors(cudaMemsetAsync( d_plan->fw, 0, d_plan->batchsize * d_plan->nf1 * sizeof(cuda_complex), stream)))) @@ -52,7 +54,11 @@ int cufinufft1d1_exec(cuda_complex *d_c, cuda_complex *d_fk, // Step 1: Spread if ((ier = cuspread1d(d_plan, blksize))) return ier; + // Step 1.5: if spreadonly, skip the rest + if (d_plan->opts.gpu_spreadinterponly) + continue; + // Step 2: FFT cufftResult cufft_status = cufft_ex(d_plan->fftplan, d_plan->fw, d_plan->fw, d_plan->iflag); @@ -97,19 +103,24 @@ int cufinufft1d2_exec(cuda_complex *d_c, cuda_complex *d_fk, d_plan->c = d_cstart; d_plan->fk = d_fkstart; - - // Step 1: amplify Fourier coeffs fk and copy into upsampled array fw - if (d_plan->opts.modeord == 0) { - if ((ier = cudeconvolve1d(d_plan, blksize))) return ier; - } else { - if ((ier = cudeconvolve1d(d_plan, blksize))) return ier; + + // Skip all steps if interponly + if (!d_plan->opts.gpu_spreadinterponly) { + // Step 1: amplify Fourier coeffs fk and copy into upsampled array fw + if (d_plan->opts.modeord 
== 0) { + if ((ier = cudeconvolve1d(d_plan, blksize))) return ier; + } else { + if ((ier = cudeconvolve1d(d_plan, blksize))) return ier; + } + + // Step 2: FFT + cufftResult cufft_status = + cufft_ex(d_plan->fftplan, d_plan->fw, d_plan->fw, d_plan->iflag); + if (cufft_status != CUFFT_SUCCESS) return FINUFFT_ERR_CUDA_FAILURE; } - - // Step 2: FFT - cufftResult cufft_status = - cufft_ex(d_plan->fftplan, d_plan->fw, d_plan->fw, d_plan->iflag); - if (cufft_status != CUFFT_SUCCESS) return FINUFFT_ERR_CUDA_FAILURE; - + else + d_plan->fw = d_fkstart; + // Step 3: deconvolve and shuffle if ((ier = cuinterp1d(d_plan, blksize))) return ier; } diff --git a/src/cuda/2d/cufinufft2d.cu b/src/cuda/2d/cufinufft2d.cu index 8c165edbf..97202e33d 100644 --- a/src/cuda/2d/cufinufft2d.cu +++ b/src/cuda/2d/cufinufft2d.cu @@ -44,6 +44,8 @@ int cufinufft2d1_exec(cuda_complex *d_c, cuda_complex *d_fk, d_fkstart = d_fk + i * d_plan->batchsize * d_plan->ms * d_plan->mt; d_plan->c = d_cstart; d_plan->fk = d_fkstart; + if (d_plan->opts.gpu_spreadinterponly) + d_plan->fw = d_fkstart; // this is needed if ((ier = checkCudaErrors(cudaMemsetAsync( @@ -55,6 +57,10 @@ int cufinufft2d1_exec(cuda_complex *d_c, cuda_complex *d_fk, // Step 1: Spread if ((ier = cuspread2d(d_plan, blksize))) return ier; + // Step 1.5: if spreadonly, skip the rest + if (d_plan->opts.gpu_spreadinterponly) + continue; + // Step 2: FFT cufftResult cufft_status = cufft_ex(d_plan->fftplan, d_plan->fw, d_plan->fw, d_plan->iflag); @@ -100,18 +106,23 @@ int cufinufft2d2_exec(cuda_complex *d_c, cuda_complex *d_fk, d_plan->c = d_cstart; d_plan->fk = d_fkstart; - // Step 1: amplify Fourier coeffs fk and copy into upsampled array fw - if (d_plan->opts.modeord == 0) { - if ((ier = cudeconvolve2d(d_plan, blksize))) return ier; - } else { - if ((ier = cudeconvolve2d(d_plan, blksize))) return ier; + // Skip all steps if interponly + if (!d_plan->opts.gpu_spreadinterponly) { + // Step 1: amplify Fourier coeffs fk and copy into upsampled array fw + if (d_plan->opts.modeord == 0) { + if ((ier = cudeconvolve2d(d_plan, blksize))) return ier; + } else { + if ((ier = cudeconvolve2d(d_plan, blksize))) return ier; + } + + // Step 2: FFT + cufftResult cufft_status = + cufft_ex(d_plan->fftplan, d_plan->fw, d_plan->fw, d_plan->iflag); + if (cufft_status != CUFFT_SUCCESS) return FINUFFT_ERR_CUDA_FAILURE; } - - // Step 2: FFT - cufftResult cufft_status = - cufft_ex(d_plan->fftplan, d_plan->fw, d_plan->fw, d_plan->iflag); - if (cufft_status != CUFFT_SUCCESS) return FINUFFT_ERR_CUDA_FAILURE; - + else + d_plan->fw = d_fkstart; + // Step 3: deconvolve and shuffle if ((ier = cuinterp2d(d_plan, blksize))) return ier; } diff --git a/src/cuda/3d/cufinufft3d.cu b/src/cuda/3d/cufinufft3d.cu index ce1f37c0e..6572243c6 100644 --- a/src/cuda/3d/cufinufft3d.cu +++ b/src/cuda/3d/cufinufft3d.cu @@ -42,6 +42,8 @@ int cufinufft3d1_exec(cuda_complex *d_c, cuda_complex *d_fk, d_plan->c = d_cstart; d_plan->fk = d_fkstart; + if (d_plan->opts.gpu_spreadinterponly) + d_plan->fw = d_fkstart; if ((ier = checkCudaErrors(cudaMemsetAsync( d_plan->fw, 0, d_plan->batchsize * d_plan->nf * sizeof(cuda_complex), @@ -50,8 +52,12 @@ int cufinufft3d1_exec(cuda_complex *d_c, cuda_complex *d_fk, // Step 1: Spread if ((ier = cuspread3d(d_plan, blksize))) return ier; - - // Step 2: FFT + + // Step 1.5: if spreadonly, skip the rest + if (d_plan->opts.gpu_spreadinterponly) + continue; + + // Step 2: FFT cufftResult cufft_status = cufft_ex(d_plan->fftplan, d_plan->fw, d_plan->fw, d_plan->iflag); if (cufft_status != 
CUFFT_SUCCESS) return FINUFFT_ERR_CUDA_FAILURE; @@ -94,19 +100,23 @@ int cufinufft3d2_exec(cuda_complex *d_c, cuda_complex *d_fk, d_plan->c = d_cstart; d_plan->fk = d_fkstart; - // Step 1: amplify Fourier coeffs fk and copy into upsampled array fw - if (d_plan->opts.modeord == 0) { - if ((ier = cudeconvolve3d(d_plan, blksize))) return ier; - } else { - if ((ier = cudeconvolve3d(d_plan, blksize))) return ier; + // Skip all steps if interponly + if (!d_plan->opts.gpu_spreadinterponly) { + // Step 1: amplify Fourier coeffs fk and copy into upsampled array fw + if (d_plan->opts.modeord == 0) { + if ((ier = cudeconvolve3d(d_plan, blksize))) return ier; + } else { + if ((ier = cudeconvolve3d(d_plan, blksize))) return ier; + } + // Step 2: FFT + RETURN_IF_CUDA_ERROR + cufftResult cufft_status = + cufft_ex(d_plan->fftplan, d_plan->fw, d_plan->fw, d_plan->iflag); + if (cufft_status != CUFFT_SUCCESS) return FINUFFT_ERR_CUDA_FAILURE; } - - // Step 2: FFT - RETURN_IF_CUDA_ERROR - cufftResult cufft_status = - cufft_ex(d_plan->fftplan, d_plan->fw, d_plan->fw, d_plan->iflag); - if (cufft_status != CUFFT_SUCCESS) return FINUFFT_ERR_CUDA_FAILURE; - + else + d_plan->fw = d_fkstart; + // Step 3: deconvolve and shuffle if ((ier = cuinterp3d(d_plan, blksize))) return ier; } diff --git a/src/cuda/cufinufft.cu b/src/cuda/cufinufft.cu index 534fa5358..85b382055 100644 --- a/src/cuda/cufinufft.cu +++ b/src/cuda/cufinufft.cu @@ -121,7 +121,7 @@ void cufinufft_default_opts(cufinufft_opts *opts) opts->gpu_binsizey = 0; opts->gpu_binsizez = 0; opts->gpu_maxbatchsize = 0; - opts->debug = 0; + opts->debug = 1; opts->gpu_stream = cudaStreamDefault; // sphinx tag (don't remove): @gpu_defopts_end } diff --git a/src/cuda/spreadinterp.cpp b/src/cuda/spreadinterp.cpp index 646ffa434..dba38ffb2 100644 --- a/src/cuda/spreadinterp.cpp +++ b/src/cuda/spreadinterp.cpp @@ -26,7 +26,7 @@ int setup_spreader(finufft_spread_opts &opts, T eps, T upsampfac, int kerevalmet __func__, upsampfac); return FINUFFT_ERR_HORNER_WRONG_BETA; } - if (upsampfac <= 1.0) { + if (upsampfac < 1.0) { fprintf(stderr, "[%s] error: upsampfac=%.3g is <=1.0\n", __func__, upsampfac); return FINUFFT_ERR_UPSAMPFAC_TOO_SMALL; } From f322afe6dbeb800abac0328f633ab42de305c186 Mon Sep 17 00:00:00 2001 From: Chaithya G R Date: Fri, 20 Sep 2024 10:36:08 +0200 Subject: [PATCH 02/32] Minor updates --- python/cufinufft/pyproject.toml | 2 +- src/cuda/memtransfer_wrapper.cu | 13 +++++++------ 2 files changed, 8 insertions(+), 7 deletions(-) diff --git a/python/cufinufft/pyproject.toml b/python/cufinufft/pyproject.toml index 694635c00..f9c74afb2 100644 --- a/python/cufinufft/pyproject.toml +++ b/python/cufinufft/pyproject.toml @@ -41,7 +41,7 @@ build-dir = "build/{wheel_tag}" # Tell skbuild to where to find CMakeLists.txt. 
cmake.source-dir = "../../" cmake.targets = ["cufinufft"] -cmake.define = {"FINUFFT_BUILD_PYTHON" = "ON", "FINUFFT_USE_CUDA" = "ON", "FINUFFT_USE_CPU" = "OFF"} +cmake.define = {"FINUFFT_BUILD_PYTHON" = "ON", "FINUFFT_USE_CUDA" = "ON", "FINUFFT_USE_CPU" = "OFF", "CMAKE_BUILD_TYPE"="Debug"} wheel.packages = ["cufinufft"] diff --git a/src/cuda/memtransfer_wrapper.cu b/src/cuda/memtransfer_wrapper.cu index e5308e8bc..f84d8fb64 100644 --- a/src/cuda/memtransfer_wrapper.cu +++ b/src/cuda/memtransfer_wrapper.cu @@ -434,12 +434,13 @@ void freegpumemory(cufinufft_plan_t *d_plan) utils::WithCudaDevice device_swapper(d_plan->opts.gpu_device_id); // Fixes a crash whewre the plan itself is deleted before the stream const auto stream = d_plan->stream; - - CUDA_FREE_AND_NULL(d_plan->fw, stream, d_plan->supports_pools); - CUDA_FREE_AND_NULL(d_plan->fwkerhalf1, stream, d_plan->supports_pools); - CUDA_FREE_AND_NULL(d_plan->fwkerhalf2, stream, d_plan->supports_pools); - CUDA_FREE_AND_NULL(d_plan->fwkerhalf3, stream, d_plan->supports_pools); - + if(!d_plan->opts.gpu_spreadinterponly) { + CUDA_FREE_AND_NULL(d_plan->fw, stream, d_plan->supports_pools); + CUDA_FREE_AND_NULL(d_plan->fwkerhalf1, stream, d_plan->supports_pools); + CUDA_FREE_AND_NULL(d_plan->fwkerhalf2, stream, d_plan->supports_pools); + CUDA_FREE_AND_NULL(d_plan->fwkerhalf3, stream, d_plan->supports_pools); + } + CUDA_FREE_AND_NULL(d_plan->idxnupts, stream, d_plan->supports_pools); CUDA_FREE_AND_NULL(d_plan->sortidx, stream, d_plan->supports_pools); CUDA_FREE_AND_NULL(d_plan->numsubprob, stream, d_plan->supports_pools); From 135d504730f2864c7fb930c803329ed1d236efec Mon Sep 17 00:00:00 2001 From: Chaithya G R Date: Fri, 20 Sep 2024 11:13:56 +0200 Subject: [PATCH 03/32] remove wip --- python/cufinufft/pyproject.toml | 2 +- src/cuda/cufinufft.cu | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/python/cufinufft/pyproject.toml b/python/cufinufft/pyproject.toml index f9c74afb2..b20b3d4dc 100644 --- a/python/cufinufft/pyproject.toml +++ b/python/cufinufft/pyproject.toml @@ -41,7 +41,7 @@ build-dir = "build/{wheel_tag}" # Tell skbuild to where to find CMakeLists.txt. 
cmake.source-dir = "../../" cmake.targets = ["cufinufft"] -cmake.define = {"FINUFFT_BUILD_PYTHON" = "ON", "FINUFFT_USE_CUDA" = "ON", "FINUFFT_USE_CPU" = "OFF", "CMAKE_BUILD_TYPE"="Debug"} +cmake.define = {"FINUFFT_BUILD_PYTHON" = "ON", "FINUFFT_USE_CUDA" = "ON", "FINUFFT_USE_CPU" = "OFF"}#, "CMAKE_BUILD_TYPE"="Debug"} wheel.packages = ["cufinufft"] diff --git a/src/cuda/cufinufft.cu b/src/cuda/cufinufft.cu index 85b382055..534fa5358 100644 --- a/src/cuda/cufinufft.cu +++ b/src/cuda/cufinufft.cu @@ -121,7 +121,7 @@ void cufinufft_default_opts(cufinufft_opts *opts) opts->gpu_binsizey = 0; opts->gpu_binsizez = 0; opts->gpu_maxbatchsize = 0; - opts->debug = 1; + opts->debug = 0; opts->gpu_stream = cudaStreamDefault; // sphinx tag (don't remove): @gpu_defopts_end } From 909f3f5f28a6af61b840b680d92f292453a43bbc Mon Sep 17 00:00:00 2001 From: Chaithya G R Date: Fri, 20 Sep 2024 11:30:18 +0200 Subject: [PATCH 04/32] update debug and also remove unwanted --- include/cufinufft/impl.h | 4 +++- python/cufinufft/pyproject.toml | 2 +- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/include/cufinufft/impl.h b/include/cufinufft/impl.h index 9942eb932..ab14a0bfb 100644 --- a/include/cufinufft/impl.h +++ b/include/cufinufft/impl.h @@ -107,7 +107,9 @@ int cufinufft_makeplan_impl(int type, int dim, int *nmodes, int iflag, int ntran d_plan->ms = nmodes[0]; d_plan->mt = nmodes[1]; d_plan->mu = nmodes[2]; - printf("[cufinufft] (ms,mt,mu): %d %d %d\n", d_plan->ms, d_plan->mt, d_plan->mu); + if (d_plan->opts.debug) { + printf("[cufinufft] (ms,mt,mu): %d %d %d\n", d_plan->ms, d_plan->mt, d_plan->mu); + } } else { d_plan->opts.gpu_spreadinterponly = 1; } diff --git a/python/cufinufft/pyproject.toml b/python/cufinufft/pyproject.toml index b20b3d4dc..694635c00 100644 --- a/python/cufinufft/pyproject.toml +++ b/python/cufinufft/pyproject.toml @@ -41,7 +41,7 @@ build-dir = "build/{wheel_tag}" # Tell skbuild to where to find CMakeLists.txt. 
cmake.source-dir = "../../" cmake.targets = ["cufinufft"] -cmake.define = {"FINUFFT_BUILD_PYTHON" = "ON", "FINUFFT_USE_CUDA" = "ON", "FINUFFT_USE_CPU" = "OFF"}#, "CMAKE_BUILD_TYPE"="Debug"} +cmake.define = {"FINUFFT_BUILD_PYTHON" = "ON", "FINUFFT_USE_CUDA" = "ON", "FINUFFT_USE_CPU" = "OFF"} wheel.packages = ["cufinufft"] From 64ee309ed570f69692c8cd5d68730ba6cf612b49 Mon Sep 17 00:00:00 2001 From: Chaithya G R Date: Fri, 20 Sep 2024 12:03:50 +0200 Subject: [PATCH 05/32] Added error --- include/cufinufft/impl.h | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/include/cufinufft/impl.h b/include/cufinufft/impl.h index ab14a0bfb..6c395a8b2 100644 --- a/include/cufinufft/impl.h +++ b/include/cufinufft/impl.h @@ -155,7 +155,10 @@ int cufinufft_makeplan_impl(int type, int dim, int *nmodes, int iflag, int ntran printf("[cufinufft] upsampfac automatically set to %.3g\n", d_plan->opts.upsampfac); } } - + if (d_plan->opts.gpu_spreadinterponly && d_plan->opts.upsampfac != 1) { + ier = FINUFFT_ERR_SPREADONLY_UPSAMP_INVALID; + goto finalize; + } /* Setup Spreader */ if ((ier = setup_spreader_for_nufft(d_plan->spopts, tol, d_plan->opts)) > 1) { // can return FINUFFT_WARN_EPS_TOO_SMALL=1, which is OK From 70fa5c60477e8426a228b92d842c707a4e23bf31 Mon Sep 17 00:00:00 2001 From: Chaithya G R Date: Fri, 20 Sep 2024 12:04:45 +0200 Subject: [PATCH 06/32] Add error rst update --- docs/error.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/docs/error.rst b/docs/error.rst index 9d9edc9a4..6e12d2859 100644 --- a/docs/error.rst +++ b/docs/error.rst @@ -29,6 +29,7 @@ has the following meanings (see codes in ``include/finufft_errors.h``): 18 size of bins for subprob/blockgather invalid 19 GPU shmem too small for subprob/blockgather parameters 20 invalid number of nonuniform points: nj or nk negative, or too big (see defs.h) + 23 invalid upsampfac set while using gpu_spreadinterponly mode. When ``ier=1`` (warning only) the transform(s) is/are still completed, at the smallest epsilon achievable, so, with that caveat, the answer should still be usable. 
From 8bb5e8e89ebc1cca8ce2d831492889a44e5c9d36 Mon Sep 17 00:00:00 2001 From: Chaithya G R Date: Mon, 23 Sep 2024 10:42:54 +0200 Subject: [PATCH 07/32] Update and take in comments --- src/cuda/1d/cufinufft1d.cu | 6 +++--- src/cuda/2d/cufinufft2d.cu | 6 +++--- src/cuda/3d/cufinufft3d.cu | 8 ++++---- src/cuda/spreadinterp.cpp | 4 +++- 4 files changed, 13 insertions(+), 11 deletions(-) diff --git a/src/cuda/1d/cufinufft1d.cu b/src/cuda/1d/cufinufft1d.cu index 3689b1b3b..d0328236f 100644 --- a/src/cuda/1d/cufinufft1d.cu +++ b/src/cuda/1d/cufinufft1d.cu @@ -54,7 +54,7 @@ int cufinufft1d1_exec(cuda_complex *d_c, cuda_complex *d_fk, // Step 1: Spread if ((ier = cuspread1d(d_plan, blksize))) return ier; - // Step 1.5: if spreadonly, skip the rest + // if spreadonly, skip the rest if (d_plan->opts.gpu_spreadinterponly) continue; @@ -104,7 +104,7 @@ int cufinufft1d2_exec(cuda_complex *d_c, cuda_complex *d_fk, d_plan->c = d_cstart; d_plan->fk = d_fkstart; - // Skip all steps if interponly + // Skip steps 1 and 2 if interponly if (!d_plan->opts.gpu_spreadinterponly) { // Step 1: amplify Fourier coeffs fk and copy into upsampled array fw if (d_plan->opts.modeord == 0) { @@ -121,7 +121,7 @@ int cufinufft1d2_exec(cuda_complex *d_c, cuda_complex *d_fk, else d_plan->fw = d_fkstart; - // Step 3: deconvolve and shuffle + // Step 3: Interpolate if ((ier = cuinterp1d(d_plan, blksize))) return ier; } diff --git a/src/cuda/2d/cufinufft2d.cu b/src/cuda/2d/cufinufft2d.cu index 97202e33d..916c4c71d 100644 --- a/src/cuda/2d/cufinufft2d.cu +++ b/src/cuda/2d/cufinufft2d.cu @@ -57,7 +57,7 @@ int cufinufft2d1_exec(cuda_complex *d_c, cuda_complex *d_fk, // Step 1: Spread if ((ier = cuspread2d(d_plan, blksize))) return ier; - // Step 1.5: if spreadonly, skip the rest + // if spreadonly, skip the rest if (d_plan->opts.gpu_spreadinterponly) continue; @@ -106,7 +106,7 @@ int cufinufft2d2_exec(cuda_complex *d_c, cuda_complex *d_fk, d_plan->c = d_cstart; d_plan->fk = d_fkstart; - // Skip all steps if interponly + // Skip steps 1 and 2 if interponly if (!d_plan->opts.gpu_spreadinterponly) { // Step 1: amplify Fourier coeffs fk and copy into upsampled array fw if (d_plan->opts.modeord == 0) { @@ -123,7 +123,7 @@ int cufinufft2d2_exec(cuda_complex *d_c, cuda_complex *d_fk, else d_plan->fw = d_fkstart; - // Step 3: deconvolve and shuffle + // Step 3: Interpolate if ((ier = cuinterp2d(d_plan, blksize))) return ier; } diff --git a/src/cuda/3d/cufinufft3d.cu b/src/cuda/3d/cufinufft3d.cu index 6572243c6..9f376297c 100644 --- a/src/cuda/3d/cufinufft3d.cu +++ b/src/cuda/3d/cufinufft3d.cu @@ -53,7 +53,7 @@ int cufinufft3d1_exec(cuda_complex *d_c, cuda_complex *d_fk, // Step 1: Spread if ((ier = cuspread3d(d_plan, blksize))) return ier; - // Step 1.5: if spreadonly, skip the rest + // if spreadonly, skip the rest if (d_plan->opts.gpu_spreadinterponly) continue; @@ -100,9 +100,9 @@ int cufinufft3d2_exec(cuda_complex *d_c, cuda_complex *d_fk, d_plan->c = d_cstart; d_plan->fk = d_fkstart; - // Skip all steps if interponly + // Skip steps 1 and 2 if interponly if (!d_plan->opts.gpu_spreadinterponly) { - // Step 1: amplify Fourier coeffs fk and copy into upsampled array fw + // Step 1: amplify Fourier coeffs fk and copy into upsampled array fw if (d_plan->opts.modeord == 0) { if ((ier = cudeconvolve3d(d_plan, blksize))) return ier; } else { @@ -117,7 +117,7 @@ int cufinufft3d2_exec(cuda_complex *d_c, cuda_complex *d_fk, else d_plan->fw = d_fkstart; - // Step 3: deconvolve and shuffle + // Step 3: Interpolate if ((ier = cuinterp3d(d_plan, 
blksize))) return ier; } diff --git a/src/cuda/spreadinterp.cpp b/src/cuda/spreadinterp.cpp index dba38ffb2..9cc9b0427 100644 --- a/src/cuda/spreadinterp.cpp +++ b/src/cuda/spreadinterp.cpp @@ -26,7 +26,9 @@ int setup_spreader(finufft_spread_opts &opts, T eps, T upsampfac, int kerevalmet __func__, upsampfac); return FINUFFT_ERR_HORNER_WRONG_BETA; } - if (upsampfac < 1.0) { + if (upsampfac <= 1.0) { + if (upsampfac == 1.0 && opts.gpu_spreadinterponly) + continue; // Upsampfac == 1.0 is valid for spreadinterponly mode. fprintf(stderr, "[%s] error: upsampfac=%.3g is <=1.0\n", __func__, upsampfac); return FINUFFT_ERR_UPSAMPFAC_TOO_SMALL; } From eb0c310f93af3f7836bec301e4c5dd72b7e9eaec Mon Sep 17 00:00:00 2001 From: Chaithya G R Date: Mon, 23 Sep 2024 11:13:28 +0200 Subject: [PATCH 08/32] Update the docs --- docs/c_gpu.rst | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/docs/c_gpu.rst b/docs/c_gpu.rst index 97c50b31a..a5f050220 100644 --- a/docs/c_gpu.rst +++ b/docs/c_gpu.rst @@ -311,11 +311,8 @@ while ``modeord=1`` selects FFT-style ordering starting at zero and wrapping ove **gpu_device_id**: Sets the GPU device ID. Leave at default unless you know what you're doing. [To be documented] -Diagnostic options -~~~~~~~~~~~~~~~~~~ - **gpu_spreadinterponly**: if ``0`` do the NUFFT as intended. If ``1``, omit the FFT and kernel FT deconvolution steps and return garbage answers. -Nonzero value is *only* to be used to aid timing tests (although currently there are no timing codes that exploit this option), and will give wrong or undefined answers for the NUFFT transforms! +Nonzero value can be used *only* with `upsampfac=1` and `kerevalmeth=1` for using cufinufft for only spread and interpolate mode. This has applications into estimating the density compensation, which is conventionally used in MRI. (See [MRI-NUFFT](https://mind-inria.github.io/mri-nufft/nufft.html)) Algorithm performance options From c0da6763faf5364420d0863b16e24f0214c78ff4 Mon Sep 17 00:00:00 2001 From: Chaithya G R Date: Mon, 23 Sep 2024 11:25:04 +0200 Subject: [PATCH 09/32] some fixes --- python/cufinufft/pyproject.toml | 2 +- src/cuda/spreadinterp.cpp | 7 +++---- 2 files changed, 4 insertions(+), 5 deletions(-) diff --git a/python/cufinufft/pyproject.toml b/python/cufinufft/pyproject.toml index 694635c00..cf3f46310 100644 --- a/python/cufinufft/pyproject.toml +++ b/python/cufinufft/pyproject.toml @@ -41,7 +41,7 @@ build-dir = "build/{wheel_tag}" # Tell skbuild to where to find CMakeLists.txt. cmake.source-dir = "../../" cmake.targets = ["cufinufft"] -cmake.define = {"FINUFFT_BUILD_PYTHON" = "ON", "FINUFFT_USE_CUDA" = "ON", "FINUFFT_USE_CPU" = "OFF"} +cmake.define = {"FINUFFT_BUILD_PYTHON" = "ON", "FINUFFT_USE_CUDA" = "ON", "FINUFFT_USE_CPU" = "OFF", "CMAKE_BUILD_TYPE" = "Debug"} wheel.packages = ["cufinufft"] diff --git a/src/cuda/spreadinterp.cpp b/src/cuda/spreadinterp.cpp index 9cc9b0427..f9c1de2ad 100644 --- a/src/cuda/spreadinterp.cpp +++ b/src/cuda/spreadinterp.cpp @@ -26,10 +26,9 @@ int setup_spreader(finufft_spread_opts &opts, T eps, T upsampfac, int kerevalmet __func__, upsampfac); return FINUFFT_ERR_HORNER_WRONG_BETA; } - if (upsampfac <= 1.0) { - if (upsampfac == 1.0 && opts.gpu_spreadinterponly) - continue; // Upsampfac == 1.0 is valid for spreadinterponly mode. - fprintf(stderr, "[%s] error: upsampfac=%.3g is <=1.0\n", __func__, upsampfac); + // Upsampfac == 1.0 is valid for spreadinterponly mode. 
+ if (upsampfac < 1.0) { + fprintf(stderr, "[%s] error: upsampfac=%.3g is <1.0\n", __func__, upsampfac); return FINUFFT_ERR_UPSAMPFAC_TOO_SMALL; } // calling routine must abort on above errors, since opts is garbage! From feff14bd69f9cbeab5df040da7caf336bcd3932a Mon Sep 17 00:00:00 2001 From: Chaithya G R Date: Mon, 23 Sep 2024 11:46:36 +0200 Subject: [PATCH 10/32] Remove by mistake commit --- python/cufinufft/pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/cufinufft/pyproject.toml b/python/cufinufft/pyproject.toml index cf3f46310..694635c00 100644 --- a/python/cufinufft/pyproject.toml +++ b/python/cufinufft/pyproject.toml @@ -41,7 +41,7 @@ build-dir = "build/{wheel_tag}" # Tell skbuild to where to find CMakeLists.txt. cmake.source-dir = "../../" cmake.targets = ["cufinufft"] -cmake.define = {"FINUFFT_BUILD_PYTHON" = "ON", "FINUFFT_USE_CUDA" = "ON", "FINUFFT_USE_CPU" = "OFF", "CMAKE_BUILD_TYPE" = "Debug"} +cmake.define = {"FINUFFT_BUILD_PYTHON" = "ON", "FINUFFT_USE_CUDA" = "ON", "FINUFFT_USE_CPU" = "OFF"} wheel.packages = ["cufinufft"] From cc0f931463b892a9d1aa613c02bb3f591351305f Mon Sep 17 00:00:00 2001 From: Chaithya G R Date: Mon, 23 Sep 2024 12:12:51 +0200 Subject: [PATCH 11/32] Type 3 conflict fix and update comments --- include/cufinufft/impl.h | 2 +- src/cuda/memtransfer_wrapper.cu | 10 ++++------ 2 files changed, 5 insertions(+), 7 deletions(-) diff --git a/include/cufinufft/impl.h b/include/cufinufft/impl.h index 6c395a8b2..dcb927d8a 100644 --- a/include/cufinufft/impl.h +++ b/include/cufinufft/impl.h @@ -155,7 +155,7 @@ int cufinufft_makeplan_impl(int type, int dim, int *nmodes, int iflag, int ntran printf("[cufinufft] upsampfac automatically set to %.3g\n", d_plan->opts.upsampfac); } } - if (d_plan->opts.gpu_spreadinterponly && d_plan->opts.upsampfac != 1) { + if (d_plan->opts.gpu_spreadinterponly && d_plan->opts.upsampfac != 1 && d_plan->type != 3) { ier = FINUFFT_ERR_SPREADONLY_UPSAMP_INVALID; goto finalize; } diff --git a/src/cuda/memtransfer_wrapper.cu b/src/cuda/memtransfer_wrapper.cu index f84d8fb64..fe1817fe1 100644 --- a/src/cuda/memtransfer_wrapper.cu +++ b/src/cuda/memtransfer_wrapper.cu @@ -434,12 +434,10 @@ void freegpumemory(cufinufft_plan_t *d_plan) utils::WithCudaDevice device_swapper(d_plan->opts.gpu_device_id); // Fixes a crash whewre the plan itself is deleted before the stream const auto stream = d_plan->stream; - if(!d_plan->opts.gpu_spreadinterponly) { - CUDA_FREE_AND_NULL(d_plan->fw, stream, d_plan->supports_pools); - CUDA_FREE_AND_NULL(d_plan->fwkerhalf1, stream, d_plan->supports_pools); - CUDA_FREE_AND_NULL(d_plan->fwkerhalf2, stream, d_plan->supports_pools); - CUDA_FREE_AND_NULL(d_plan->fwkerhalf3, stream, d_plan->supports_pools); - } + CUDA_FREE_AND_NULL(d_plan->fw, stream, d_plan->supports_pools); + CUDA_FREE_AND_NULL(d_plan->fwkerhalf1, stream, d_plan->supports_pools); + CUDA_FREE_AND_NULL(d_plan->fwkerhalf2, stream, d_plan->supports_pools); + CUDA_FREE_AND_NULL(d_plan->fwkerhalf3, stream, d_plan->supports_pools); CUDA_FREE_AND_NULL(d_plan->idxnupts, stream, d_plan->supports_pools); CUDA_FREE_AND_NULL(d_plan->sortidx, stream, d_plan->supports_pools); From 0271162d31c5e49fff918ce50f1d1ebb1bb23734 Mon Sep 17 00:00:00 2001 From: Chaithya G R Date: Mon, 23 Sep 2024 12:30:54 +0200 Subject: [PATCH 12/32] Add back earlier line --- src/cuda/memtransfer_wrapper.cu | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/src/cuda/memtransfer_wrapper.cu b/src/cuda/memtransfer_wrapper.cu index 
fe1817fe1..f84d8fb64 100644 --- a/src/cuda/memtransfer_wrapper.cu +++ b/src/cuda/memtransfer_wrapper.cu @@ -434,10 +434,12 @@ void freegpumemory(cufinufft_plan_t *d_plan) utils::WithCudaDevice device_swapper(d_plan->opts.gpu_device_id); // Fixes a crash whewre the plan itself is deleted before the stream const auto stream = d_plan->stream; - CUDA_FREE_AND_NULL(d_plan->fw, stream, d_plan->supports_pools); - CUDA_FREE_AND_NULL(d_plan->fwkerhalf1, stream, d_plan->supports_pools); - CUDA_FREE_AND_NULL(d_plan->fwkerhalf2, stream, d_plan->supports_pools); - CUDA_FREE_AND_NULL(d_plan->fwkerhalf3, stream, d_plan->supports_pools); + if(!d_plan->opts.gpu_spreadinterponly) { + CUDA_FREE_AND_NULL(d_plan->fw, stream, d_plan->supports_pools); + CUDA_FREE_AND_NULL(d_plan->fwkerhalf1, stream, d_plan->supports_pools); + CUDA_FREE_AND_NULL(d_plan->fwkerhalf2, stream, d_plan->supports_pools); + CUDA_FREE_AND_NULL(d_plan->fwkerhalf3, stream, d_plan->supports_pools); + } CUDA_FREE_AND_NULL(d_plan->idxnupts, stream, d_plan->supports_pools); CUDA_FREE_AND_NULL(d_plan->sortidx, stream, d_plan->supports_pools); From 79f7c1f2c1ff6ade8d8d699249f41063b2adae83 Mon Sep 17 00:00:00 2001 From: Chaithya G R Date: Mon, 23 Sep 2024 21:13:46 +0200 Subject: [PATCH 13/32] Fix for type3 --- src/cuda/1d/cufinufft1d.cu | 4 ++-- src/cuda/2d/cufinufft2d.cu | 4 ++-- src/cuda/3d/cufinufft3d.cu | 4 ++-- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/src/cuda/1d/cufinufft1d.cu b/src/cuda/1d/cufinufft1d.cu index d0328236f..028583936 100644 --- a/src/cuda/1d/cufinufft1d.cu +++ b/src/cuda/1d/cufinufft1d.cu @@ -104,8 +104,8 @@ int cufinufft1d2_exec(cuda_complex *d_c, cuda_complex *d_fk, d_plan->c = d_cstart; d_plan->fk = d_fkstart; - // Skip steps 1 and 2 if interponly - if (!d_plan->opts.gpu_spreadinterponly) { + // Skip steps 1 and 2 if interponly, but if it is type3 (upsampfac > 1.0), we need to do step 1 + if (!d_plan->opts.gpu_spreadinterponly || d_plan->opts.upsampfac > 1.0) { // Step 1: amplify Fourier coeffs fk and copy into upsampled array fw if (d_plan->opts.modeord == 0) { if ((ier = cudeconvolve1d(d_plan, blksize))) return ier; diff --git a/src/cuda/2d/cufinufft2d.cu b/src/cuda/2d/cufinufft2d.cu index 916c4c71d..67013549c 100644 --- a/src/cuda/2d/cufinufft2d.cu +++ b/src/cuda/2d/cufinufft2d.cu @@ -106,8 +106,8 @@ int cufinufft2d2_exec(cuda_complex *d_c, cuda_complex *d_fk, d_plan->c = d_cstart; d_plan->fk = d_fkstart; - // Skip steps 1 and 2 if interponly - if (!d_plan->opts.gpu_spreadinterponly) { + // Skip steps 1 and 2 if interponly, but if it is type3 (upsampfac > 1.0), we need to do step 1 + if (!d_plan->opts.gpu_spreadinterponly || d_plan->opts.upsampfac > 1.0) { // Step 1: amplify Fourier coeffs fk and copy into upsampled array fw if (d_plan->opts.modeord == 0) { if ((ier = cudeconvolve2d(d_plan, blksize))) return ier; diff --git a/src/cuda/3d/cufinufft3d.cu b/src/cuda/3d/cufinufft3d.cu index 9f376297c..8d897334c 100644 --- a/src/cuda/3d/cufinufft3d.cu +++ b/src/cuda/3d/cufinufft3d.cu @@ -100,8 +100,8 @@ int cufinufft3d2_exec(cuda_complex *d_c, cuda_complex *d_fk, d_plan->c = d_cstart; d_plan->fk = d_fkstart; - // Skip steps 1 and 2 if interponly - if (!d_plan->opts.gpu_spreadinterponly) { + // Skip steps 1 and 2 if interponly, but if it is type3 (upsampfac > 1.0), we need to do step 1 + if (!d_plan->opts.gpu_spreadinterponly || d_plan->opts.upsampfac > 1.0) { // Step 1: amplify Fourier coeffs fk and copy into upsampled array fw if (d_plan->opts.modeord == 0) { if ((ier = cudeconvolve3d(d_plan, blksize))) 
return ier; From 7845b9c37ad14930d2b8f0cbd76f1bd1a12233bf Mon Sep 17 00:00:00 2001 From: Chaithya G R Date: Mon, 23 Sep 2024 21:28:13 +0200 Subject: [PATCH 14/32] move to XOR, missed it earlier --- include/cufinufft/impl.h | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/include/cufinufft/impl.h b/include/cufinufft/impl.h index dcb927d8a..26fe12d35 100644 --- a/include/cufinufft/impl.h +++ b/include/cufinufft/impl.h @@ -155,9 +155,11 @@ int cufinufft_makeplan_impl(int type, int dim, int *nmodes, int iflag, int ntran printf("[cufinufft] upsampfac automatically set to %.3g\n", d_plan->opts.upsampfac); } } - if (d_plan->opts.gpu_spreadinterponly && d_plan->opts.upsampfac != 1 && d_plan->type != 3) { - ier = FINUFFT_ERR_SPREADONLY_UPSAMP_INVALID; - goto finalize; + if (d_plan->opts.gpu_spreadinterponly) { + if (d_plan->opts.upsampfac != 1 ^^ d_plan->type != 3) { + ier = FINUFFT_ERR_SPREADONLY_UPSAMP_INVALID; + goto finalize; + } } /* Setup Spreader */ if ((ier = setup_spreader_for_nufft(d_plan->spopts, tol, d_plan->opts)) > 1) { From 95f1a74ae69e867af8cd8ff71ec2027ddf0dfb38 Mon Sep 17 00:00:00 2001 From: Chaithya G R Date: Mon, 23 Sep 2024 21:41:04 +0200 Subject: [PATCH 15/32] It was actually XNOR :P --- include/cufinufft/impl.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/include/cufinufft/impl.h b/include/cufinufft/impl.h index 26fe12d35..c43510dd8 100644 --- a/include/cufinufft/impl.h +++ b/include/cufinufft/impl.h @@ -156,7 +156,8 @@ int cufinufft_makeplan_impl(int type, int dim, int *nmodes, int iflag, int ntran } } if (d_plan->opts.gpu_spreadinterponly) { - if (d_plan->opts.upsampfac != 1 ^^ d_plan->type != 3) { + // XNOR implementation below with boolean logic. + if ((d_plan->opts.upsampfac !=1) == (d_plan->type != 3)) { ier = FINUFFT_ERR_SPREADONLY_UPSAMP_INVALID; goto finalize; } From b13a07f3025fe4166680cc144d1dbb5724cd329c Mon Sep 17 00:00:00 2001 From: Chaithya G R Date: Tue, 24 Sep 2024 09:46:10 +0200 Subject: [PATCH 16/32] revert fixes, problem was elsewhere --- include/cufinufft/impl.h | 2 +- src/cuda/1d/cufinufft1d.cu | 4 ++-- src/cuda/2d/cufinufft2d.cu | 4 ++-- src/cuda/3d/cufinufft3d.cu | 4 ++-- src/cuda/memtransfer_wrapper.cu | 10 ++++------ 5 files changed, 11 insertions(+), 13 deletions(-) diff --git a/include/cufinufft/impl.h b/include/cufinufft/impl.h index c43510dd8..e70ad0b86 100644 --- a/include/cufinufft/impl.h +++ b/include/cufinufft/impl.h @@ -157,7 +157,7 @@ int cufinufft_makeplan_impl(int type, int dim, int *nmodes, int iflag, int ntran } if (d_plan->opts.gpu_spreadinterponly) { // XNOR implementation below with boolean logic. 
- if ((d_plan->opts.upsampfac !=1) == (d_plan->type != 3)) { + if ((d_plan->opts.upsampfac !=1) == (type != 3)) { ier = FINUFFT_ERR_SPREADONLY_UPSAMP_INVALID; goto finalize; } diff --git a/src/cuda/1d/cufinufft1d.cu b/src/cuda/1d/cufinufft1d.cu index 028583936..d0328236f 100644 --- a/src/cuda/1d/cufinufft1d.cu +++ b/src/cuda/1d/cufinufft1d.cu @@ -104,8 +104,8 @@ int cufinufft1d2_exec(cuda_complex *d_c, cuda_complex *d_fk, d_plan->c = d_cstart; d_plan->fk = d_fkstart; - // Skip steps 1 and 2 if interponly, but if it is type3 (upsampfac > 1.0), we need to do step 1 - if (!d_plan->opts.gpu_spreadinterponly || d_plan->opts.upsampfac > 1.0) { + // Skip steps 1 and 2 if interponly + if (!d_plan->opts.gpu_spreadinterponly) { // Step 1: amplify Fourier coeffs fk and copy into upsampled array fw if (d_plan->opts.modeord == 0) { if ((ier = cudeconvolve1d(d_plan, blksize))) return ier; diff --git a/src/cuda/2d/cufinufft2d.cu b/src/cuda/2d/cufinufft2d.cu index 67013549c..916c4c71d 100644 --- a/src/cuda/2d/cufinufft2d.cu +++ b/src/cuda/2d/cufinufft2d.cu @@ -106,8 +106,8 @@ int cufinufft2d2_exec(cuda_complex *d_c, cuda_complex *d_fk, d_plan->c = d_cstart; d_plan->fk = d_fkstart; - // Skip steps 1 and 2 if interponly, but if it is type3 (upsampfac > 1.0), we need to do step 1 - if (!d_plan->opts.gpu_spreadinterponly || d_plan->opts.upsampfac > 1.0) { + // Skip steps 1 and 2 if interponly + if (!d_plan->opts.gpu_spreadinterponly) { // Step 1: amplify Fourier coeffs fk and copy into upsampled array fw if (d_plan->opts.modeord == 0) { if ((ier = cudeconvolve2d(d_plan, blksize))) return ier; diff --git a/src/cuda/3d/cufinufft3d.cu b/src/cuda/3d/cufinufft3d.cu index 8d897334c..9f376297c 100644 --- a/src/cuda/3d/cufinufft3d.cu +++ b/src/cuda/3d/cufinufft3d.cu @@ -100,8 +100,8 @@ int cufinufft3d2_exec(cuda_complex *d_c, cuda_complex *d_fk, d_plan->c = d_cstart; d_plan->fk = d_fkstart; - // Skip steps 1 and 2 if interponly, but if it is type3 (upsampfac > 1.0), we need to do step 1 - if (!d_plan->opts.gpu_spreadinterponly || d_plan->opts.upsampfac > 1.0) { + // Skip steps 1 and 2 if interponly + if (!d_plan->opts.gpu_spreadinterponly) { // Step 1: amplify Fourier coeffs fk and copy into upsampled array fw if (d_plan->opts.modeord == 0) { if ((ier = cudeconvolve3d(d_plan, blksize))) return ier; diff --git a/src/cuda/memtransfer_wrapper.cu b/src/cuda/memtransfer_wrapper.cu index f84d8fb64..fe1817fe1 100644 --- a/src/cuda/memtransfer_wrapper.cu +++ b/src/cuda/memtransfer_wrapper.cu @@ -434,12 +434,10 @@ void freegpumemory(cufinufft_plan_t *d_plan) utils::WithCudaDevice device_swapper(d_plan->opts.gpu_device_id); // Fixes a crash whewre the plan itself is deleted before the stream const auto stream = d_plan->stream; - if(!d_plan->opts.gpu_spreadinterponly) { - CUDA_FREE_AND_NULL(d_plan->fw, stream, d_plan->supports_pools); - CUDA_FREE_AND_NULL(d_plan->fwkerhalf1, stream, d_plan->supports_pools); - CUDA_FREE_AND_NULL(d_plan->fwkerhalf2, stream, d_plan->supports_pools); - CUDA_FREE_AND_NULL(d_plan->fwkerhalf3, stream, d_plan->supports_pools); - } + CUDA_FREE_AND_NULL(d_plan->fw, stream, d_plan->supports_pools); + CUDA_FREE_AND_NULL(d_plan->fwkerhalf1, stream, d_plan->supports_pools); + CUDA_FREE_AND_NULL(d_plan->fwkerhalf2, stream, d_plan->supports_pools); + CUDA_FREE_AND_NULL(d_plan->fwkerhalf3, stream, d_plan->supports_pools); CUDA_FREE_AND_NULL(d_plan->idxnupts, stream, d_plan->supports_pools); CUDA_FREE_AND_NULL(d_plan->sortidx, stream, d_plan->supports_pools); From ffd9b9f77237007240536fb3e511a8d304320d68 Mon Sep 
17 00:00:00 2001 From: Chaithya G R Date: Tue, 24 Sep 2024 10:07:08 +0200 Subject: [PATCH 17/32] Minor memory fix --- src/cuda/memtransfer_wrapper.cu | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/cuda/memtransfer_wrapper.cu b/src/cuda/memtransfer_wrapper.cu index fe1817fe1..20bbb6355 100644 --- a/src/cuda/memtransfer_wrapper.cu +++ b/src/cuda/memtransfer_wrapper.cu @@ -434,7 +434,9 @@ void freegpumemory(cufinufft_plan_t *d_plan) utils::WithCudaDevice device_swapper(d_plan->opts.gpu_device_id); // Fixes a crash whewre the plan itself is deleted before the stream const auto stream = d_plan->stream; - CUDA_FREE_AND_NULL(d_plan->fw, stream, d_plan->supports_pools); + // Dont clear fw if spreadinterponly for type 1 and 2 as fw belongs to original program (it is d_fk) + if(!d_plan->opts.gpu_spreadinterponly || d_plan->type == 3) + CUDA_FREE_AND_NULL(d_plan->fw, stream, d_plan->supports_pools); CUDA_FREE_AND_NULL(d_plan->fwkerhalf1, stream, d_plan->supports_pools); CUDA_FREE_AND_NULL(d_plan->fwkerhalf2, stream, d_plan->supports_pools); CUDA_FREE_AND_NULL(d_plan->fwkerhalf3, stream, d_plan->supports_pools); From 22faffaa8ce3993468329f25c04e182b76ae1d90 Mon Sep 17 00:00:00 2001 From: Chaithya G R Date: Tue, 24 Sep 2024 10:09:03 +0200 Subject: [PATCH 18/32] Actually, we dont even need cufft plans! --- include/cufinufft/impl.h | 116 +++++++++++++++++++-------------------- 1 file changed, 58 insertions(+), 58 deletions(-) diff --git a/include/cufinufft/impl.h b/include/cufinufft/impl.h index e70ad0b86..ea5a91a4e 100644 --- a/include/cufinufft/impl.h +++ b/include/cufinufft/impl.h @@ -262,65 +262,65 @@ int cufinufft_makeplan_impl(int type, int dim, int *nmodes, int iflag, int ntran } break; } - cufftHandle fftplan; - cufftResult_t cufft_status; - switch (d_plan->dim) { - case 1: { - int n[] = {(int)nf1}; - int inembed[] = {(int)nf1}; - - cufft_status = cufftPlanMany(&fftplan, 1, n, inembed, 1, inembed[0], inembed, 1, - inembed[0], cufft_type(), batchsize); - } break; - case 2: { - int n[] = {(int)nf2, (int)nf1}; - int inembed[] = {(int)nf2, (int)nf1}; - - cufft_status = - cufftPlanMany(&fftplan, 2, n, inembed, 1, inembed[0] * inembed[1], inembed, 1, - inembed[0] * inembed[1], cufft_type(), batchsize); - } break; - case 3: { - int n[] = {(int)nf3, (int)nf2, (int)nf1}; - int inembed[] = {(int)nf3, (int)nf2, (int)nf1}; - - cufft_status = cufftPlanMany( - &fftplan, 3, n, inembed, 1, inembed[0] * inembed[1] * inembed[2], inembed, 1, - inembed[0] * inembed[1] * inembed[2], cufft_type(), batchsize); - } break; - } - - if (cufft_status != CUFFT_SUCCESS) { - fprintf(stderr, "[%s] cufft makeplan error: %s", __func__, - cufftGetErrorString(cufft_status)); - ier = FINUFFT_ERR_CUDA_FAILURE; - goto finalize; - } - cufftSetStream(fftplan, stream); - - d_plan->fftplan = fftplan; - - // compute up to 3 * NQUAD precomputed values on CPU - T fseries_precomp_phase[3 * MAX_NQUAD]; - T fseries_precomp_f[3 * MAX_NQUAD]; - thrust::device_vector d_fseries_precomp_phase(3 * MAX_NQUAD); - thrust::device_vector d_fseries_precomp_f(3 * MAX_NQUAD); - onedim_fseries_kernel_precomp(d_plan->nf1, fseries_precomp_f, - fseries_precomp_phase, d_plan->spopts); - if (d_plan->dim > 1) - onedim_fseries_kernel_precomp(d_plan->nf2, fseries_precomp_f + MAX_NQUAD, - fseries_precomp_phase + MAX_NQUAD, d_plan->spopts); - if (d_plan->dim > 2) - onedim_fseries_kernel_precomp(d_plan->nf3, fseries_precomp_f + 2 * MAX_NQUAD, - fseries_precomp_phase + 2 * MAX_NQUAD, - d_plan->spopts); - // copy the precomputed data to the 
device using thrust - thrust::copy(fseries_precomp_phase, fseries_precomp_phase + 3 * MAX_NQUAD, - d_fseries_precomp_phase.begin()); - thrust::copy(fseries_precomp_f, fseries_precomp_f + 3 * MAX_NQUAD, - d_fseries_precomp_f.begin()); if(!d_plan->opts.gpu_spreadinterponly) { - // the full fseries is done on the GPU here + cufftHandle fftplan; + cufftResult_t cufft_status; + switch (d_plan->dim) { + case 1: { + int n[] = {(int)nf1}; + int inembed[] = {(int)nf1}; + + cufft_status = cufftPlanMany(&fftplan, 1, n, inembed, 1, inembed[0], inembed, 1, + inembed[0], cufft_type(), batchsize); + } break; + case 2: { + int n[] = {(int)nf2, (int)nf1}; + int inembed[] = {(int)nf2, (int)nf1}; + + cufft_status = + cufftPlanMany(&fftplan, 2, n, inembed, 1, inembed[0] * inembed[1], inembed, 1, + inembed[0] * inembed[1], cufft_type(), batchsize); + } break; + case 3: { + int n[] = {(int)nf3, (int)nf2, (int)nf1}; + int inembed[] = {(int)nf3, (int)nf2, (int)nf1}; + + cufft_status = cufftPlanMany( + &fftplan, 3, n, inembed, 1, inembed[0] * inembed[1] * inembed[2], inembed, 1, + inembed[0] * inembed[1] * inembed[2], cufft_type(), batchsize); + } break; + } + + if (cufft_status != CUFFT_SUCCESS) { + fprintf(stderr, "[%s] cufft makeplan error: %s", __func__, + cufftGetErrorString(cufft_status)); + ier = FINUFFT_ERR_CUDA_FAILURE; + goto finalize; + } + cufftSetStream(fftplan, stream); + + d_plan->fftplan = fftplan; + + // compute up to 3 * NQUAD precomputed values on CPU + T fseries_precomp_phase[3 * MAX_NQUAD]; + T fseries_precomp_f[3 * MAX_NQUAD]; + thrust::device_vector d_fseries_precomp_phase(3 * MAX_NQUAD); + thrust::device_vector d_fseries_precomp_f(3 * MAX_NQUAD); + onedim_fseries_kernel_precomp(d_plan->nf1, fseries_precomp_f, + fseries_precomp_phase, d_plan->spopts); + if (d_plan->dim > 1) + onedim_fseries_kernel_precomp(d_plan->nf2, fseries_precomp_f + MAX_NQUAD, + fseries_precomp_phase + MAX_NQUAD, d_plan->spopts); + if (d_plan->dim > 2) + onedim_fseries_kernel_precomp(d_plan->nf3, fseries_precomp_f + 2 * MAX_NQUAD, + fseries_precomp_phase + 2 * MAX_NQUAD, + d_plan->spopts); + // copy the precomputed data to the device using thrust + thrust::copy(fseries_precomp_phase, fseries_precomp_phase + 3 * MAX_NQUAD, + d_fseries_precomp_phase.begin()); + thrust::copy(fseries_precomp_f, fseries_precomp_f + 3 * MAX_NQUAD, + d_fseries_precomp_f.begin()); + // the full fseries is done on the GPU here if ((ier = fseries_kernel_compute( d_plan->dim, d_plan->nf1, d_plan->nf2, d_plan->nf3, d_fseries_precomp_f.data().get(), d_fseries_precomp_phase.data().get(), From 3d85c5942ac9c08b6179ed039908813cf411fcee Mon Sep 17 00:00:00 2001 From: Chaithya G R Date: Tue, 24 Sep 2024 10:10:56 +0200 Subject: [PATCH 19/32] Actually, we dont even need cufft plans! 
---
 include/cufinufft/impl.h | 1 +
 1 file changed, 1 insertion(+)

diff --git a/include/cufinufft/impl.h b/include/cufinufft/impl.h
index ea5a91a4e..0ec67c818 100644
--- a/include/cufinufft/impl.h
+++ b/include/cufinufft/impl.h
@@ -262,6 +262,7 @@ int cufinufft_makeplan_impl(int type, int dim, int *nmodes, int iflag, int ntran
     } break;
   }
 
+  // We dont need any cuFFT plans or kernel values if we are only spreading / interpolating
   if(!d_plan->opts.gpu_spreadinterponly) {
     cufftHandle fftplan;
     cufftResult_t cufft_status;

From 9e88e801e995953a20fa56c02a7212dbdf8f7b62 Mon Sep 17 00:00:00 2001
From: ahbarnett
Date: Thu, 26 Sep 2024 17:09:10 -0400
Subject: [PATCH 20/32] add packaging Py pkg requirement to both CPU + GPU

---
 python/cufinufft/requirements.txt | 1 +
 python/finufft/requirements.txt | 1 +
 2 files changed, 2 insertions(+)

diff --git a/python/cufinufft/requirements.txt b/python/cufinufft/requirements.txt
index bc2cbbd1c..f63d1ba35 100644
--- a/python/cufinufft/requirements.txt
+++ b/python/cufinufft/requirements.txt
@@ -1,2 +1,3 @@
 numpy
 six
+packaging
diff --git a/python/finufft/requirements.txt b/python/finufft/requirements.txt
index 3bfbce9d8..10c84a7b9 100644
--- a/python/finufft/requirements.txt
+++ b/python/finufft/requirements.txt
@@ -1 +1,2 @@
 numpy >= 1.12.0
+packaging

From 63bcb4a77d3fd8860ca8a14dd2871159352f6958 Mon Sep 17 00:00:00 2001
From: ahbarnett
Date: Thu, 26 Sep 2024 17:22:40 -0400
Subject: [PATCH 21/32] makefile skip FFTW-related test(s) if FFT=DUCC; would have been caught in CI if CI tested DUCC :) Fixes #569

---
 makefile | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/makefile b/makefile
index 4a91506db..d30afccf9 100644
--- a/makefile
+++ b/makefile
@@ -284,6 +284,10 @@ test/testutilsf: test/testutils.cpp src/utils_32.o src/utils_precindep.o
 
 # make sure all double-prec test executables ready for testing
 TESTS := $(basename $(wildcard test/*.cpp))
+# kill off FFTW-specific tests if it's not the FFT we build with...
+ifeq ($(FFT),DUCC)
+  TESTS := $(filter-out $(basename $(wildcard test/*fftw*.cpp)),$(TESTS))
+endif
 # also need single-prec
 TESTS += $(TESTS:%=%f)
 test: $(TESTS)

From 5f019b04b1e3c0de61ece05f9ea33fe834237208 Mon Sep 17 00:00:00 2001
From: ahbarnett
Date: Fri, 4 Oct 2024 17:08:43 -0400
Subject: [PATCH 22/32] fix CFLAGS in make.inc.macosx_clang_matlab to find omp.h as per Manas

---
 make.inc.macosx_clang_matlab | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/make.inc.macosx_clang_matlab b/make.inc.macosx_clang_matlab
index c856d537b..739215476 100644
--- a/make.inc.macosx_clang_matlab
+++ b/make.inc.macosx_clang_matlab
@@ -21,7 +21,7 @@ CXX=clang++
 CC=clang
 
 # taken from makefile...
-CFLAGS += -I include -I/usr/local/include -I/opt/homebrew/include
+CFLAGS += -I include -I/usr/local/include -I/usr/local/opt/libomp/include -I/opt/homebrew/include
 FFLAGS = $(CFLAGS)
 CXXFLAGS = $(CFLAGS)
 LIBS += -L/usr/local/lib -L/opt/homebrew/lib

From cf25e433ac2945b4a7c7ba6ca4bc671dd0673ab0 Mon Sep 17 00:00:00 2001
From: ahbarnett
Date: Tue, 8 Oct 2024 13:52:59 -0400
Subject: [PATCH 23/32] improved docs for gpu_spreadinterponly and upsampfac=1 advanced use case; CHANGELOG

---
 CHANGELOG | 12 +++++++-----
 docs/c_gpu.rst | 9 ++++++---
 docs/error.rst | 2 +-
 docs/opts.rst | 10 ++++++----
 4 files changed, 20 insertions(+), 13 deletions(-)

diff --git a/CHANGELOG b/CHANGELOG
index 37f4c9c80..ab1eead32 100644
--- a/CHANGELOG
+++ b/CHANGELOG
@@ -1,10 +1,12 @@
 List of features / changes made / release notes, in reverse chronological order.
 If not stated, FINUFFT is assumed (cuFINUFFT <=1.3 is listed separately).
 
-Master (9/10/24)
+Master (10/8/24)
 
+* Support and docs for opts.gpu_spreadinterponly=1 for MRI "density compensation
+  estimation" type 1&2 use-case with upsampfac=1.0 PR564 (Chaithya G R).
 * reduced roundoff error in a[n] phase calc in CPU onedim_fseries_kernel().
-  #534 (Barnett).
+  PR534 (Barnett).
 * GPU code type 1,2 also reduced round-off error in phases, to match CPU code;
   rationalized onedim_{fseries,nuft}_* GPU codes to match CPU (Barbone, Barnett)
 * Added type 3 in 1D, 2D, and 3D, in the GPU library cufinufft. PR #517, Barbone
   - Minor fixes on the GPU code:
     a) removed memory leaks in case of errors
     b) renamed maxbatchsize to batchsize
-* Add options for user-provided FFTW locker (PR548, Blackwell). These options can be be
-used to prevent crashes when a user is creating/destroying FFTW plans and
-FINUFFT plans in threads simultaneously.
+* Add options for user-provided FFTW locker (PR548, Blackwell). These options
+  can be be used to prevent crashes when a user is creating/destroying FFTW
+  plans and FINUFFT plans in threads simultaneously.
 
 
 V 2.3.0 (9/5/24)
diff --git a/docs/c_gpu.rst b/docs/c_gpu.rst
index a5f050220..e67a9c533 100644
--- a/docs/c_gpu.rst
+++ b/docs/c_gpu.rst
@@ -1,3 +1,5 @@
+.. _c_gpu:
+
 C interface (GPU)
 =================
 
@@ -311,8 +313,9 @@ while ``modeord=1`` selects FFT-style ordering starting at zero and wrapping ove
 
 **gpu_device_id**: Sets the GPU device ID. Leave at default unless you know what you're doing. [To be documented]
 
-**gpu_spreadinterponly**: if ``0`` do the NUFFT as intended. If ``1``, omit the FFT and kernel FT deconvolution steps and return garbage answers.
-Nonzero value can be used *only* with `upsampfac=1` and `kerevalmeth=1` for using cufinufft for only spread and interpolate mode. This has applications into estimating the density compensation, which is conventionally used in MRI. (See [MRI-NUFFT](https://mind-inria.github.io/mri-nufft/nufft.html))
+**gpu_spreadinterponly**: if ``0`` do the NUFFT as intended. If ``1``, omit the FFT and deconvolution (diagonal division by kernel Fourier transform) steps, which returns *garbage answers as a NUFFT*, but allows advanced users to perform an isolated spreading or interpolation using the usual type 1 or type 2 ``cufinufft`` interface. To do this, the nonzero flag value must be used *only* with ``upsampfac=1.0`` (since the input and output grids are the same size, and neither represents Fourier coefficients), and ``kerevalmeth=1``. The known use-case here is estimating so-called density compensation, conventionally used in MRI. (See [MRI-NUFFT](https://mind-inria.github.io/mri-nufft/nufft.html)) Please note that this flag is also internally used by type 3 transforms (which was its original use case).
+
+
 
 
 Algorithm performance options
@@ -323,7 +326,7 @@ Algorithm performance options
 
 * ``gpu_method=0`` : makes an automatic choice of one of the below methods, based on our heuristics.
 
 * ``gpu_method=1`` : uses a nonuniform points-driven method, either unsorted which is referred to as GM in our paper, or sorted which is called GM-sort in our paper, depending on option ``gpu_sort`` below
-
+
 * ``gpu_method=2`` : for spreading only, ie, type 1 transforms, uses a shared memory output-block driven method, referred to as SM in our paper. Has no effect for interpolation (type 2 transforms)
 
 * ``gpu_method>2`` : (various upsupported experimental methods due to Melody Shih, not for regular users. Eg ``3`` tests an idea of Paul Springer's to group NU points when spreading, ``4`` is a block gather method of possible interest.)
diff --git a/docs/error.rst b/docs/error.rst
index 6e12d2859..447fda939 100644
--- a/docs/error.rst
+++ b/docs/error.rst
@@ -29,7 +29,7 @@ has the following meanings (see codes in ``include/finufft_errors.h``):
  18 size of bins for subprob/blockgather invalid
  19 GPU shmem too small for subprob/blockgather parameters
  20 invalid number of nonuniform points: nj or nk negative, or too big (see defs.h)
- 23 invalid upsampfac set while using gpu_spreadinterponly mode.
+ 23 invalid upsampfac set while using gpu_spreadinterponly mode
 
 When ``ier=1`` (warning only) the transform(s) is/are still completed, at the smallest
 epsilon achievable, so, with that caveat, the answer should still be usable.
diff --git a/docs/opts.rst b/docs/opts.rst
index d659b8b20..5a1defe84 100644
--- a/docs/opts.rst
+++ b/docs/opts.rst
@@ -160,16 +160,18 @@ There is thus little reason for the nonexpert to mess with this option.
 
 **upsampfac**: This is the internal real factor by which the FFT (fine grid)
 is chosen larger than
-the number of requested modes in each dimension, for type 1 and 2 transforms.
+the number of requested modes in each dimension, for type 1 and 2 transforms. It must be greater than 1.
 We have built efficient kernels
-for only two settings, as follows. Otherwise, setting it to zero chooses a good heuristic:
+for only two settings that are greater than 1, as follows. Otherwise, setting it to zero chooses a good heuristic:
 
-* ``upsampfac=0.0`` : use heuristics to choose ``upsampfac`` as one of the below values, and use this value internally. The value chosen is visible in the text output via setting ``debug>=2``. This setting is recommended for basic users; however, if you seek more performance it is quick to try the other of the below.
+* ``upsampfac=0.0`` : use heuristics to choose ``upsampfac`` as one of the below values exceeding 1, and use this value internally. The value chosen is visible in the text output via setting ``debug>=2``. This setting is recommended for basic users; however, if you seek more performance it is quick to try the other of the values exceeding 1.
 
-* ``upsampfac=2.0`` : standard setting of upsampling. This is necessary if you need to exceed 9 digits of accuracy.
+* ``upsampfac=2.0`` : standard setting of upsampling. Due to kernel width restrictions, this is necessary if you need to exceed 9 digits of accuracy.
 
 * ``upsampfac=1.25`` : low-upsampling option, with lower RAM, smaller FFTs, but wider spreading kernel. The latter can be much faster than the standard when the number of nonuniform points is similar or smaller to the number of modes, and/or if low accuracy is required. It is especially much (2 to 3 times) faster for type 3 transforms. However, the kernel widths :math:`w` are about 50% larger in each dimension, which can lead to slower spreading (it can also be faster due to the smaller size of the fine grid). Because the kernel width is limited to 16, currently, thus only 9-digit accuracy can currently be reached when using ``upsampfac=1.25``.
 
+* ``upsampfac=1.0`` : an obscure advanced setting peculiar to a "spread/interpolate only" mode specific to the GPU (see :ref:`GPU usage`), which does not even in fact execute a NUFFT. Thus, do not use this unless you know what you are doing!
+
 **spread_thread**: in the case of multiple transforms per call (``ntr>1``, or the "many" interfaces), controls how multithreading is used to spread/interpolate each batch of data.
 
 * ``spread_thread=0`` : makes an automatic choice between the below. Recommended.

From c7d8ac41de49e2d2ee37d4c5d058748f649f6a54 Mon Sep 17 00:00:00 2001
From: ahbarnett
Date: Tue, 8 Oct 2024 15:14:41 -0400
Subject: [PATCH 24/32] docs: fix URL in c_gpu, and opts explain upsampfac choices better

---
 docs/c_gpu.rst | 2 +-
 docs/opts.rst | 7 +-
 include/cufinufft/impl.h | 147 ++++++++++++++++++++-------------------
 3 files changed, 79 insertions(+), 77 deletions(-)

diff --git a/docs/c_gpu.rst b/docs/c_gpu.rst
index e67a9c533..966426607 100644
--- a/docs/c_gpu.rst
+++ b/docs/c_gpu.rst
@@ -313,7 +313,7 @@ while ``modeord=1`` selects FFT-style ordering starting at zero and wrapping ove
 
 **gpu_device_id**: Sets the GPU device ID. Leave at default unless you know what you're doing. [To be documented]
 
-**gpu_spreadinterponly**: if ``0`` do the NUFFT as intended. If ``1``, omit the FFT and deconvolution (diagonal division by kernel Fourier transform) steps, which returns *garbage answers as a NUFFT*, but allows advanced users to perform an isolated spreading or interpolation using the usual type 1 or type 2 ``cufinufft`` interface. To do this, the nonzero flag value must be used *only* with ``upsampfac=1.0`` (since the input and output grids are the same size, and neither represents Fourier coefficients), and ``kerevalmeth=1``. The known use-case here is estimating so-called density compensation, conventionally used in MRI. (See [MRI-NUFFT](https://mind-inria.github.io/mri-nufft/nufft.html)) Please note that this flag is also internally used by type 3 transforms (which was its original use case).
+**gpu_spreadinterponly**: if ``0`` do the NUFFT as intended. If ``1``, omit the FFT and deconvolution (diagonal division by kernel Fourier transform) steps, which returns *garbage answers as a NUFFT*, but allows advanced users to perform an isolated spreading or interpolation using the usual type 1 or type 2 ``cufinufft`` interface. To do this, the nonzero flag value must be used *only* with ``upsampfac=1.0`` (since the input and output grids are the same size, and neither represents Fourier coefficients), and ``kerevalmeth=1``. The known use-case here is estimating so-called density compensation, conventionally used in MRI (see `MRI-NUFFT <https://mind-inria.github.io/mri-nufft/nufft.html>`_). Please note that this flag is also internally used by type 3 transforms (being its original use case).
diff --git a/docs/opts.rst b/docs/opts.rst
index 5a1defe84..2fc86b22b 100644
--- a/docs/opts.rst
+++ b/docs/opts.rst
@@ -54,10 +54,11 @@ Here are their default settings (from ``src/finufft.cpp:finufft_default_opts``):
 
 As for quick advice, the main options you'll want to play with are:
 
+- ``upsampfac`` to trade-off between spread/interpolate vs FFT speed and RAM
 - ``modeord`` to flip ("fftshift") the Fourier mode ordering
 - ``debug`` to look at timing output (to determine if your problem is spread/interpolation dominated, vs FFT dominated)
 - ``nthreads`` to run with a different number of threads than the current maximum available through OpenMP (a large number can sometimes be detrimental, and very small problems can sometimes run faster on 1 thread)
-- ``fftw`` to try slower plan modes which give faster transforms. The next natural one to try is ``FFTW_MEASURE`` (look at the FFTW3 docs)
+- ``fftw`` to try slower FFTW plan modes which give faster transforms. The next natural one to try is ``FFTW_MEASURE`` (look at the FFTW3 docs)
 
 See :ref:`Troubleshooting ` for good advice on trying options, and read the full options descriptions below.
@@ -158,9 +159,9 @@ There is thus little reason for the nonexpert to mess with this option.
 
 * ``spread_kerpad=1`` : pad to next multiple of four
 
-**upsampfac**: This is the internal real factor by which the FFT (fine grid)
+**upsampfac**: This is the internal factor, greater than 1, by which the FFT (fine grid)
 is chosen larger than
-the number of requested modes in each dimension, for type 1 and 2 transforms. It must be greater than 1.
+the number of requested modes in each dimension, for type 1 and 2 transforms. For type 3 transforms this factor gets squared, due to type 2 nested in a type-1-style spreading operation, so has even more effect.
 We have built efficient kernels
 for only two settings that are greater than 1, as follows. Otherwise, setting it to zero chooses a good heuristic:
diff --git a/include/cufinufft/impl.h b/include/cufinufft/impl.h
index 0ec67c818..012050b15 100644
--- a/include/cufinufft/impl.h
+++ b/include/cufinufft/impl.h
@@ -108,9 +108,9 @@ int cufinufft_makeplan_impl(int type, int dim, int *nmodes, int iflag, int ntran
     d_plan->mt = nmodes[1];
     d_plan->mu = nmodes[2];
     if (d_plan->opts.debug) {
-      printf("[cufinufft] (ms,mt,mu): %d %d %d\n", d_plan->ms, d_plan->mt, d_plan->mu);
+      printf("[cufinufft] (ms,mt,mu): %d %d %d\n", d_plan->ms, d_plan->mt, d_plan->mu);
     }
-  } else {
+  } else { // type 3 turns its outer type 1 into spreading-only
     d_plan->opts.gpu_spreadinterponly = 1;
   }
 
@@ -157,9 +157,9 @@ int cufinufft_makeplan_impl(int type, int dim, int *nmodes, int iflag, int ntran
   }
   if (d_plan->opts.gpu_spreadinterponly) {
     // XNOR implementation below with boolean logic.
- if ((d_plan->opts.upsampfac !=1) == (type != 3)) { - ier = FINUFFT_ERR_SPREADONLY_UPSAMP_INVALID; - goto finalize; + if ((d_plan->opts.upsampfac != 1) == (type != 3)) { + ier = FINUFFT_ERR_SPREADONLY_UPSAMP_INVALID; + goto finalize; } } /* Setup Spreader */ @@ -262,74 +262,75 @@ int cufinufft_makeplan_impl(int type, int dim, int *nmodes, int iflag, int ntran } break; } - // We dont need any cuFFT plans or kernel values if we are only spreading / interpolating - if(!d_plan->opts.gpu_spreadinterponly) { - cufftHandle fftplan; - cufftResult_t cufft_status; - switch (d_plan->dim) { - case 1: { - int n[] = {(int)nf1}; - int inembed[] = {(int)nf1}; - - cufft_status = cufftPlanMany(&fftplan, 1, n, inembed, 1, inembed[0], inembed, 1, - inembed[0], cufft_type(), batchsize); - } break; - case 2: { - int n[] = {(int)nf2, (int)nf1}; - int inembed[] = {(int)nf2, (int)nf1}; - - cufft_status = - cufftPlanMany(&fftplan, 2, n, inembed, 1, inembed[0] * inembed[1], inembed, 1, - inembed[0] * inembed[1], cufft_type(), batchsize); - } break; - case 3: { - int n[] = {(int)nf3, (int)nf2, (int)nf1}; - int inembed[] = {(int)nf3, (int)nf2, (int)nf1}; - - cufft_status = cufftPlanMany( - &fftplan, 3, n, inembed, 1, inembed[0] * inembed[1] * inembed[2], inembed, 1, - inembed[0] * inembed[1] * inembed[2], cufft_type(), batchsize); - } break; - } + // We dont need any cuFFT plans or kernel values if we are only spreading / + // interpolating + if (!d_plan->opts.gpu_spreadinterponly) { + cufftHandle fftplan; + cufftResult_t cufft_status; + switch (d_plan->dim) { + case 1: { + int n[] = {(int)nf1}; + int inembed[] = {(int)nf1}; + + cufft_status = cufftPlanMany(&fftplan, 1, n, inembed, 1, inembed[0], inembed, 1, + inembed[0], cufft_type(), batchsize); + } break; + case 2: { + int n[] = {(int)nf2, (int)nf1}; + int inembed[] = {(int)nf2, (int)nf1}; + + cufft_status = + cufftPlanMany(&fftplan, 2, n, inembed, 1, inembed[0] * inembed[1], inembed, 1, + inembed[0] * inembed[1], cufft_type(), batchsize); + } break; + case 3: { + int n[] = {(int)nf3, (int)nf2, (int)nf1}; + int inembed[] = {(int)nf3, (int)nf2, (int)nf1}; + + cufft_status = cufftPlanMany( + &fftplan, 3, n, inembed, 1, inembed[0] * inembed[1] * inembed[2], inembed, 1, + inembed[0] * inembed[1] * inembed[2], cufft_type(), batchsize); + } break; + } - if (cufft_status != CUFFT_SUCCESS) { - fprintf(stderr, "[%s] cufft makeplan error: %s", __func__, - cufftGetErrorString(cufft_status)); - ier = FINUFFT_ERR_CUDA_FAILURE; - goto finalize; - } - cufftSetStream(fftplan, stream); - - d_plan->fftplan = fftplan; - - // compute up to 3 * NQUAD precomputed values on CPU - T fseries_precomp_phase[3 * MAX_NQUAD]; - T fseries_precomp_f[3 * MAX_NQUAD]; - thrust::device_vector d_fseries_precomp_phase(3 * MAX_NQUAD); - thrust::device_vector d_fseries_precomp_f(3 * MAX_NQUAD); - onedim_fseries_kernel_precomp(d_plan->nf1, fseries_precomp_f, - fseries_precomp_phase, d_plan->spopts); - if (d_plan->dim > 1) - onedim_fseries_kernel_precomp(d_plan->nf2, fseries_precomp_f + MAX_NQUAD, - fseries_precomp_phase + MAX_NQUAD, d_plan->spopts); - if (d_plan->dim > 2) - onedim_fseries_kernel_precomp(d_plan->nf3, fseries_precomp_f + 2 * MAX_NQUAD, - fseries_precomp_phase + 2 * MAX_NQUAD, - d_plan->spopts); - // copy the precomputed data to the device using thrust - thrust::copy(fseries_precomp_phase, fseries_precomp_phase + 3 * MAX_NQUAD, - d_fseries_precomp_phase.begin()); - thrust::copy(fseries_precomp_f, fseries_precomp_f + 3 * MAX_NQUAD, - d_fseries_precomp_f.begin()); - // the full fseries is 
done on the GPU here - if ((ier = fseries_kernel_compute( - d_plan->dim, d_plan->nf1, d_plan->nf2, d_plan->nf3, - d_fseries_precomp_f.data().get(), d_fseries_precomp_phase.data().get(), - d_plan->fwkerhalf1, d_plan->fwkerhalf2, d_plan->fwkerhalf3, - d_plan->spopts.nspread, stream))) - goto finalize; + if (cufft_status != CUFFT_SUCCESS) { + fprintf(stderr, "[%s] cufft makeplan error: %s", __func__, + cufftGetErrorString(cufft_status)); + ier = FINUFFT_ERR_CUDA_FAILURE; + goto finalize; + } + cufftSetStream(fftplan, stream); + + d_plan->fftplan = fftplan; + + // compute up to 3 * NQUAD precomputed values on CPU + T fseries_precomp_phase[3 * MAX_NQUAD]; + T fseries_precomp_f[3 * MAX_NQUAD]; + thrust::device_vector d_fseries_precomp_phase(3 * MAX_NQUAD); + thrust::device_vector d_fseries_precomp_f(3 * MAX_NQUAD); + onedim_fseries_kernel_precomp(d_plan->nf1, fseries_precomp_f, + fseries_precomp_phase, d_plan->spopts); + if (d_plan->dim > 1) + onedim_fseries_kernel_precomp(d_plan->nf2, fseries_precomp_f + MAX_NQUAD, + fseries_precomp_phase + MAX_NQUAD, + d_plan->spopts); + if (d_plan->dim > 2) + onedim_fseries_kernel_precomp(d_plan->nf3, fseries_precomp_f + 2 * MAX_NQUAD, + fseries_precomp_phase + 2 * MAX_NQUAD, + d_plan->spopts); + // copy the precomputed data to the device using thrust + thrust::copy(fseries_precomp_phase, fseries_precomp_phase + 3 * MAX_NQUAD, + d_fseries_precomp_phase.begin()); + thrust::copy(fseries_precomp_f, fseries_precomp_f + 3 * MAX_NQUAD, + d_fseries_precomp_f.begin()); + // the full fseries is done on the GPU here + if ((ier = fseries_kernel_compute( + d_plan->dim, d_plan->nf1, d_plan->nf2, d_plan->nf3, + d_fseries_precomp_f.data().get(), d_fseries_precomp_phase.data().get(), + d_plan->fwkerhalf1, d_plan->fwkerhalf2, d_plan->fwkerhalf3, + d_plan->spopts.nspread, stream))) + goto finalize; } - } finalize: if (ier > 1) { @@ -760,8 +761,8 @@ int cufinufft_setpts_impl(int M, T *d_kx, T *d_ky, T *d_kz, int N, T *d_s, T *d_ thrust::cuda::par.on(stream), phase_iterator, phase_iterator + N, d_plan->deconv, d_plan->deconv, [c1, c2, c3, d1, d2, d3, realsign] __host__ __device__( - const thrust::tuple tuple, - cuda_complex deconv) -> cuda_complex { + const thrust::tuple tuple, cuda_complex deconv) + -> cuda_complex { // d2 and d3 are 0 if dim < 2 and dim < 3 const auto phase = c1 * (thrust::get<0>(tuple) + d1) + c2 * (thrust::get<1>(tuple) + d2) + From 6e6d04f29bb6c230d896ce8326212b0503d58b38 Mon Sep 17 00:00:00 2001 From: ahbarnett Date: Tue, 8 Oct 2024 21:37:45 -0400 Subject: [PATCH 25/32] fix gpu_spreadinterponly docs since I'd forgot that cufinufft_opts are distinct from finufft_opts. It is only part of the former. --- docs/c_gpu.rst | 6 ++++-- docs/opts.rst | 9 ++++----- 2 files changed, 8 insertions(+), 7 deletions(-) diff --git a/docs/c_gpu.rst b/docs/c_gpu.rst index 966426607..7a1b25ff7 100644 --- a/docs/c_gpu.rst +++ b/docs/c_gpu.rst @@ -291,6 +291,8 @@ This deallocates all arrays inside the ``plan`` struct, freeing all internal mem Note: the plan (being just a pointer to the plan struct) is not actually "destroyed"; rather, its internal struct is destroyed. There is no need for further deallocation of the plan. +.. _opts_gpu: + Options for GPU code -------------------- @@ -313,7 +315,7 @@ while ``modeord=1`` selects FFT-style ordering starting at zero and wrapping ove **gpu_device_id**: Sets the GPU device ID. Leave at default unless you know what you're doing. [To be documented] -**gpu_spreadinterponly**: if ``0`` do the NUFFT as intended. 
If ``1``, omit the FFT and deconvolution (diagonal division by kernel Fourier transform) steps, which returns *garbage answers as a NUFFT*, but allows advanced users to perform an isolated spreading or interpolation using the usual type 1 or type 2 ``cufinufft`` interface. To do this, the nonzero flag value must be used *only* with ``upsampfac=1.0`` (since the input and output grids are the same size, and neither represents Fourier coefficients), and ``kerevalmeth=1``. The known use-case here is estimating so-called density compensation, conventionally used in MRI (see `MRI-NUFFT `_). Please note that this flag is also internally used by type 3 transforms (being its original use case). +**gpu_spreadinterponly**: If ``0`` do the NUFFT as intended. If ``1``, omit the FFT and deconvolution (diagonal division by kernel Fourier transform) steps, which returns *garbage answers as a NUFFT*, but allows advanced users to perform an isolated spreading or interpolation using the usual type 1 or type 2 ``cufinufft`` interface. To do this, the nonzero flag value must be used *only* with ``upsampfac=1.0`` (since no upsampling takes place), and ``kerevalmeth=1``. The known use-case here is estimating so-called density compensation, conventionally used in MRI (see `MRI-NUFFT `_), although it might also be useful in spectral Ewald. Please note that this flag is also internally used by type 3 transforms (although it was originally a debug flag). @@ -335,7 +337,7 @@ Algorithm performance options **gpu_kerevalmeth**: ``0`` use direct (reference) kernel evaluation, which is not recommended for speed (however, it allows nonstandard ``opts.upsampfac`` to be used). ``1`` use Horner piecewise polynomial evaluation (recommended, and enforces ``upsampfac=2.0``) -**upsampfac**: set upsampling factor. For the recommended ``kerevalmeth=1`` you must choose the standard ``upsampfac=2.0``. If you are willing to risk a slower kernel evaluation, you may set any ``upsampfac>1.0``, but this is experimental and unsupported. +**upsampfac**: set upsampling factor. For the recommended ``kerevalmeth=1`` you must choose the standard ``upsampfac=2.0``. If you are willing to risk a slower kernel evaluation, you may set any ``upsampfac>1.0``, but this is experimental and unsupported. Finally, ``upsampfac=1.0`` is an advanced GPU setting only to be paired with the "spread/interpolate only" mode triggered by setting ``gpu_spreadinterponly=1`` (see options above); do not use this unless you know what you are doing! **gpu_maxsubprobsize**: maximum number of NU points to be handled in a single subproblem in the spreading SM method (``gpu_method=2`` only) diff --git a/docs/opts.rst b/docs/opts.rst index 2fc86b22b..66a5b5a5c 100644 --- a/docs/opts.rst +++ b/docs/opts.rst @@ -159,19 +159,18 @@ There is thus little reason for the nonexpert to mess with this option. * ``spread_kerpad=1`` : pad to next multiple of four -**upsampfac**: This is the internal factor, greater than 1, by which the FFT (fine grid) +**upsampfac**: This is the internal factor by which the FFT (fine grid) is chosen larger than -the number of requested modes in each dimension, for type 1 and 2 transforms. For type 3 transforms this factor gets squared, due to type 2 nested in a type-1-style spreading operation, so has even more effect. +the number of requested modes in each dimension, for type 1 and 2 transforms. For type 3 transforms this factor gets squared, due to type 2 nested in a type-1-spreading operation, so has even more influence. 
We have built efficient kernels -for only two settings that are greater than 1, as follows. Otherwise, setting it to zero chooses a good heuristic: +for only two settings, as follows. Otherwise, setting it to zero chooses a good heuristic: -* ``upsampfac=0.0`` : use heuristics to choose ``upsampfac`` as one of the below values exceeding 1, and use this value internally. The value chosen is visible in the text output via setting ``debug>=2``. This setting is recommended for basic users; however, if you seek more performance it is quick to try the other of the values exceeding 1. +* ``upsampfac=0.0`` : use heuristics to choose ``upsampfac`` as one of the below values, and use this value internally. The value chosen is visible in the text output via setting ``debug>=2``. This setting is recommended for basic users; however, if you seek more performance it is quick to try the other of the values. * ``upsampfac=2.0`` : standard setting of upsampling. Due to kernel width restrictions, this is necessary if you need to exceed 9 digits of accuracy. * ``upsampfac=1.25`` : low-upsampling option, with lower RAM, smaller FFTs, but wider spreading kernel. The latter can be much faster than the standard when the number of nonuniform points is similar or smaller to the number of modes, and/or if low accuracy is required. It is especially much (2 to 3 times) faster for type 3 transforms. However, the kernel widths :math:`w` are about 50% larger in each dimension, which can lead to slower spreading (it can also be faster due to the smaller size of the fine grid). Because the kernel width is limited to 16, currently, thus only 9-digit accuracy can currently be reached when using ``upsampfac=1.25``. -* ``upsampfac=1.0`` : an obscure advanced setting peculiar to a "spread/interpolate only" mode specific to the GPU (see :ref:`GPU usage`), which does not even in fact execute a NUFFT. Thus, do not use this unless you know what you are doing! **spread_thread**: in the case of multiple transforms per call (``ntr>1``, or the "many" interfaces), controls how multithreading is used to spread/interpolate each batch of data. From cfe1e330c14229e4bf5ed05ddb6cff6ee1545b1c Mon Sep 17 00:00:00 2001 From: ahbarnett Date: Sun, 13 Oct 2024 23:01:31 -0400 Subject: [PATCH 26/32] fix include/fortran.fh crash when changing opts due to Robert's new fftw lock fields of opts --- include/finufft.fh | 1 + 1 file changed, 1 insertion(+) diff --git a/include/finufft.fh b/include/finufft.fh index e68b82611..ce1ce95b8 100644 --- a/include/finufft.fh +++ b/include/finufft.fh @@ -8,4 +8,5 @@ c Also see ../fortran/finufftfort.cpp. Barnett 5/29/20. One prec 7/2/20. 
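Relating the ``upsampfac=1.0`` bullet removed above back to the GPU options it pairs with: a hedged configuration sketch for the documented spread/interpolate-only mode (e.g. estimating MRI density compensation). Only option fields named in docs/c_gpu.rst are used; the surrounding makeplan/setpts/execute calls follow the usual cufinufft plan interface and are assumed rather than quoted here.

    // Sketch only: configure cufinufft_opts for isolated spreading/interpolation.
    #include <cufinufft.h>
    void configure_spreadonly(cufinufft_opts &opts) {
      cufinufft_default_opts(&opts);
      opts.gpu_spreadinterponly = 1;   // skip the FFT and deconvolution steps
      opts.upsampfac            = 1.0; // no upsampling: input and output grids match
      opts.gpu_kerevalmeth      = 1;   // Horner kernel evaluation, as the docs require
    }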
real*8 upsampfac integer spread_thread,maxbatchsize,showwarn,nthreads, $ spread_nthr_atomic,spread_max_sp_size + integer fftw_lock_fun,fftw_unlock_fun,fftw_lock_data end type From 9374600a9a9260a23124744b8ef39ee9bf4cbd8d Mon Sep 17 00:00:00 2001 From: Martin Reinecke Date: Fri, 18 Oct 2024 17:36:16 +0200 Subject: [PATCH 27/32] remove finufft.cpp; remove blanket use of namspace std --- src/finufft.cpp | 38 ------------------------------ src/finufft_core.cpp | 55 +++++++++++++++++++++++--------------------- 2 files changed, 29 insertions(+), 64 deletions(-) delete mode 100644 src/finufft.cpp diff --git a/src/finufft.cpp b/src/finufft.cpp deleted file mode 100644 index 758fcb723..000000000 --- a/src/finufft.cpp +++ /dev/null @@ -1,38 +0,0 @@ -// public header -#include - -// private headers for lib build -// (must come after finufft.h which clobbers FINUFFT* macros) -#include - -void FINUFFT_DEFAULT_OPTS(finufft_opts *o) { finufft_default_opts_t(o); } - -int FINUFFT_MAKEPLAN(int type, int dim, const BIGINT *n_modes, int iflag, int ntrans, - FLT tol, FINUFFT_PLAN *pp, finufft_opts *opts) { - return finufft_makeplan_t(type, dim, n_modes, iflag, ntrans, tol, - reinterpret_cast **>(pp), opts); -} - -int FINUFFT_SETPTS(FINUFFT_PLAN p, BIGINT nj, FLT *xj, FLT *yj, FLT *zj, BIGINT nk, - FLT *s, FLT *t, FLT *u) { - return finufft_setpts_t(reinterpret_cast *>(p), nj, xj, yj, zj, - nk, s, t, u); -} - -int FINUFFT_EXECUTE(FINUFFT_PLAN p, CPX *cj, CPX *fk) { - return finufft_execute_t(reinterpret_cast *>(p), cj, fk); -} - -int FINUFFT_DESTROY(FINUFFT_PLAN p) -// Free everything we allocated inside of finufft_plan pointed to by p. -// Also must not crash if called immediately after finufft_makeplan. -// Thus either each thing free'd here is guaranteed to be nullptr or correctly -// allocated. -{ - if (!p) // nullptr, so not a ptr to a plan, report error - return 1; - - delete reinterpret_cast *>(p); - p = nullptr; - return 0; // success -} diff --git a/src/finufft_core.cpp b/src/finufft_core.cpp index 834420bbe..60f62f1e5 100644 --- a/src/finufft_core.cpp +++ b/src/finufft_core.cpp @@ -12,7 +12,6 @@ #include #include -using namespace std; using namespace finufft; using namespace finufft::utils; using namespace finufft::spreadinterp; @@ -150,12 +149,12 @@ static void set_nhg_type3(T S, T X, finufft_opts opts, finufft_spread_opts spopt Xsafe = 1.0; Ssafe = 1.0; } else - Xsafe = max(Xsafe, 1 / S); + Xsafe = std::max(Xsafe, 1 / S); else - Ssafe = max(Ssafe, 1 / X); + Ssafe = std::max(Ssafe, 1 / X); // use the safe X and S... auto nfd = T(2.0 * opts.upsampfac * Ssafe * Xsafe / PI + nss); - if (!isfinite(nfd)) nfd = 0.0; // use T to catch inf + if (!std::isfinite(nfd)) nfd = 0.0; // use T to catch inf *nf = (BIGINT)nfd; // printf("initial nf=%lld, ns=%d\n",*nf,spopts.nspread); // catch too small nf, and nan or +-inf, otherwise spread fails... 
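As a rough standalone illustration of the type-3 sizing rule visible in the hunk above (simplified: the real code folds the kernel width into ``nss`` and applies further guards for tiny grids), the core arithmetic is:

    // Illustrative only; PI matches the constant defined in finufft_core.cpp.
    #include <cmath>
    #include <cstdint>
    static constexpr double PI = 3.14159265358979329;
    static int64_t type3_grid_size(double upsampfac, double Ssafe, double Xsafe, int nspread) {
      double nfd = 2.0 * upsampfac * Ssafe * Xsafe / PI + nspread; // nss ~ nspread here
      if (!std::isfinite(nfd)) nfd = 0.0;                          // mirror the nan/inf guard
      return (int64_t)nfd;
    }
    // e.g. upsampfac=2.0, Ssafe=50, Xsafe=PI, nspread=7 gives nf = 207.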
@@ -209,10 +208,10 @@ static void onedim_fseries_kernel(BIGINT nf, std::vector &fwkerhalf, a[n] = -exp(2 * PI * std::complex(0, 1) * z[n] / double(nf)); // phase winding // rates } - BIGINT nout = nf / 2 + 1; // how many values we're writing to - int nt = min(nout, (BIGINT)opts.nthreads); // how many chunks - std::vector brk(nt + 1); // start indices for each thread - for (int t = 0; t <= nt; ++t) // split nout mode indices btw threads + BIGINT nout = nf / 2 + 1; // how many values we're writing to + int nt = std::min(nout, (BIGINT)opts.nthreads); // how many chunks + std::vector brk(nt + 1); // start indices for each thread + for (int t = 0; t <= nt; ++t) // split nout mode indices btw threads brk[t] = (BIGINT)(0.5 + nout * t / (double)nt); #pragma omp parallel num_threads(nt) { // each thread gets own chunk to do @@ -618,7 +617,7 @@ int finufft_makeplan_t(int type, int dim, const BIGINT *n_modes, int iflag, int p->nbatch = 1 + (ntrans - 1) / nthr; // min # batches poss p->batchSize = 1 + (ntrans - 1) / p->nbatch; // then cut # thr in each b } else { // batchSize override by user - p->batchSize = min(p->opts.maxbatchsize, ntrans); + p->batchSize = std::min(p->opts.maxbatchsize, ntrans); p->nbatch = 1 + (ntrans - 1) / p->batchSize; // resulting # batches } if (p->opts.spread_thread == 0) p->opts.spread_thread = 2; // our auto choice @@ -907,8 +906,8 @@ int FINUFFT_PLAN_T::setpts(BIGINT nj, TF *xj, TF *yj, TF *zj, BIGINT nk, TF TF phase = t3P.D1 * xj[j]; if (d > 1) phase += t3P.D2 * yj[j]; if (d > 2) phase += t3P.D3 * zj[j]; - prephase[j] = cos(phase) + imasign * sin(phase); // Euler - // e^{+-i.phase} + prephase[j] = std::cos(phase) + imasign * std::sin(phase); // Euler + // e^{+-i.phase} } } else for (BIGINT j = 0; j < nj; ++j) @@ -939,12 +938,16 @@ int FINUFFT_PLAN_T::setpts(BIGINT nj, TF *xj, TF *yj, TF *zj, BIGINT nk, TF phiHatk3.resize(nk); onedim_nuft_kernel(nk, Up, phiHatk3, spopts); // fill phiHat3 } - int Cfinite = isfinite(t3P.C1) && isfinite(t3P.C2) && isfinite(t3P.C3); // C can be - // nan or inf - // if M=0, no - // input NU - // pts - int Cnonzero = t3P.C1 != 0.0 || t3P.C2 != 0.0 || t3P.C3 != 0.0; // cen + int Cfinite = + std::isfinite(t3P.C1) && std::isfinite(t3P.C2) && std::isfinite(t3P.C3); // C can + // be nan + // or inf + // if + // M=0, + // no + // input + // NU pts + int Cnonzero = t3P.C1 != 0.0 || t3P.C2 != 0.0 || t3P.C3 != 0.0; // cen #pragma omp parallel for num_threads(opts.nthreads) schedule(static) for (BIGINT k = 0; k < nk; ++k) { // .... loop over NU targ freqs TF phiHat = phiHatk1[k]; @@ -955,7 +958,7 @@ int FINUFFT_PLAN_T::setpts(BIGINT nj, TF *xj, TF *yj, TF *zj, BIGINT nk, TF TF phase = (s[k] - t3P.D1) * t3P.C1; if (d > 1) phase += (t[k] - t3P.D2) * t3P.C2; if (d > 2) phase += (u[k] - t3P.D3) * t3P.C3; - deconv[k] *= cos(phase) + imasign * sin(phase); // Euler e^{+-i.phase} + deconv[k] *= std::cos(phase) + imasign * std::sin(phase); // Euler e^{+-i.phase} } } if (opts.debug) @@ -971,12 +974,12 @@ int FINUFFT_PLAN_T::setpts(BIGINT nj, TF *xj, TF *yj, TF *zj, BIGINT nk, TF // Plan and setpts once, for the (repeated) inner type 2 finufft call... timer.restart(); - BIGINT t2nmodes[] = {nf1, nf2, nf3}; // t2 input is actually fw - finufft_opts t2opts = opts; // deep copy, since not ptrs - t2opts.modeord = 0; // needed for correct t3! 
- t2opts.debug = max(0, opts.debug - 1); // don't print as much detail - t2opts.spread_debug = max(0, opts.spread_debug - 1); - t2opts.showwarn = 0; // so don't see warnings 2x + BIGINT t2nmodes[] = {nf1, nf2, nf3}; // t2 input is actually fw + finufft_opts t2opts = opts; // deep copy, since not ptrs + t2opts.modeord = 0; // needed for correct t3! + t2opts.debug = std::max(0, opts.debug - 1); // don't print as much detail + t2opts.spread_debug = std::max(0, opts.spread_debug - 1); + t2opts.showwarn = 0; // so don't see warnings 2x // (...could vary other t2opts here?) if (innerT2plan) { delete innerT2plan; @@ -1049,7 +1052,7 @@ int FINUFFT_PLAN_T::execute(std::complex *cj, std::complex *fk) { for (int b = 0; b * batchSize < ntrans; b++) { // .....loop b over batches // current batch is either batchSize, or possibly truncated if last one - int thisBatchSize = min(ntrans - b * batchSize, batchSize); + int thisBatchSize = std::min(ntrans - b * batchSize, batchSize); int bB = b * batchSize; // index of vector, since batchsizes same std::complex *cjb = cj + bB * nj; // point to batch of weights std::complex *fkb = fk + bB * N; // point to batch of mode coeffs @@ -1110,7 +1113,7 @@ int FINUFFT_PLAN_T::execute(std::complex *cj, std::complex *fk) { for (int b = 0; b * batchSize < ntrans; b++) { // .....loop b over batches // batching and pointers to this batch, identical to t1,2 above... - int thisBatchSize = min(ntrans - b * batchSize, batchSize); + int thisBatchSize = std::min(ntrans - b * batchSize, batchSize); int bB = b * batchSize; std::complex *cjb = cj + bB * nj; // batch of input strengths std::complex *fkb = fk + bB * nk; // batch of output strengths From 622855cfcbefa05597436ea93ec005a8cf637649 Mon Sep 17 00:00:00 2001 From: Martin Reinecke Date: Fri, 18 Oct 2024 20:57:17 +0200 Subject: [PATCH 28/32] add explanation for explicit template instantiations --- src/finufft_core.cpp | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/src/finufft_core.cpp b/src/finufft_core.cpp index 60f62f1e5..c2e32852a 100644 --- a/src/finufft_core.cpp +++ b/src/finufft_core.cpp @@ -758,6 +758,14 @@ int finufft_makeplan_t(int type, int dim, const BIGINT *n_modes, int iflag, int } return ier; // report setup_spreader status (could be warning) } + +// For this function and the following ones (i.e. everything that is accessible +// from outside), we need to state for which data types we want the template +// to be instantiated. At the current location in the code, the compiler knows +// how exactly it can construct the function "finufft_makeplan_t" for any given +// type TF, but it doesn't know for which types it actually should do so. +// The following two statements instruct it to do that for TF=float and +// TF=double. 
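A generic, self-contained illustration of the pattern this new comment describes (the function below is invented for the example and is not library code):

    // One translation unit holds the template definition...
    template <typename T> T twice(T x) { return x + x; }
    // ...and explicit instantiation statements make the compiler emit the concrete
    // versions into this object file, so other translation units can link against
    // them from a declaration alone.
    template float  twice<float>(float);
    template double twice<double>(double);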
template int finufft_makeplan_t(int type, int dim, const BIGINT *n_modes, int iflag, int ntrans, float tol, FINUFFT_PLAN_T **pp, finufft_opts *opts); From 8238568a03dbc5c5b9815d091780ec422016b9f5 Mon Sep 17 00:00:00 2001 From: Martin Reinecke Date: Fri, 18 Oct 2024 21:02:53 +0200 Subject: [PATCH 29/32] shorten simpleinterfaces --- src/simpleinterfaces.cpp | 311 ++++++++++++++++++--------------------- 1 file changed, 140 insertions(+), 171 deletions(-) diff --git a/src/simpleinterfaces.cpp b/src/simpleinterfaces.cpp index 4b3630d93..d2606ee6b 100644 --- a/src/simpleinterfaces.cpp +++ b/src/simpleinterfaces.cpp @@ -20,37 +20,42 @@ using namespace std; --------------------------------------------------------------------------- */ +using f32 = float; +using f64 = double; +using c64 = std::complex; +using c128 = std::complex; +using i64 = BIGINT; + void finufft_default_opts(finufft_opts *o) { finufft_default_opts_t(o); } void finufftf_default_opts(finufft_opts *o) { finufft_default_opts_t(o); } -int finufft_makeplan(int type, int dim, const BIGINT *n_modes, int iflag, int ntrans, - double tol, finufft_plan *pp, finufft_opts *opts) { - return finufft_makeplan_t(type, dim, n_modes, iflag, ntrans, tol, - reinterpret_cast **>(pp), - opts); +int finufft_makeplan(int type, int dim, const i64 *n_modes, int iflag, int ntrans, + f64 tol, finufft_plan *pp, finufft_opts *opts) { + return finufft_makeplan_t(type, dim, n_modes, iflag, ntrans, tol, + reinterpret_cast **>(pp), opts); } -int finufftf_makeplan(int type, int dim, const BIGINT *n_modes, int iflag, int ntrans, - float tol, finufftf_plan *pp, finufft_opts *opts) { - return finufft_makeplan_t(type, dim, n_modes, iflag, ntrans, tol, - reinterpret_cast **>(pp), opts); +int finufftf_makeplan(int type, int dim, const i64 *n_modes, int iflag, int ntrans, + f32 tol, finufftf_plan *pp, finufft_opts *opts) { + return finufft_makeplan_t(type, dim, n_modes, iflag, ntrans, tol, + reinterpret_cast **>(pp), opts); } -int finufft_setpts(finufft_plan p, BIGINT nj, double *xj, double *yj, double *zj, - BIGINT nk, double *s, double *t, double *u) { - return finufft_setpts_t(reinterpret_cast *>(p), nj, xj, - yj, zj, nk, s, t, u); +int finufft_setpts(finufft_plan p, i64 nj, f64 *xj, f64 *yj, f64 *zj, i64 nk, f64 *s, + f64 *t, f64 *u) { + return finufft_setpts_t(reinterpret_cast *>(p), nj, xj, yj, zj, + nk, s, t, u); } -int finufftf_setpts(finufftf_plan p, BIGINT nj, float *xj, float *yj, float *zj, - BIGINT nk, float *s, float *t, float *u) { - return finufft_setpts_t(reinterpret_cast *>(p), nj, xj, yj, - zj, nk, s, t, u); +int finufftf_setpts(finufftf_plan p, i64 nj, f32 *xj, f32 *yj, f32 *zj, i64 nk, f32 *s, + f32 *t, f32 *u) { + return finufft_setpts_t(reinterpret_cast *>(p), nj, xj, yj, zj, + nk, s, t, u); } -int finufft_execute(finufft_plan p, std::complex *cj, std::complex *fk) { - return finufft_execute_t(reinterpret_cast *>(p), cj, fk); +int finufft_execute(finufft_plan p, c128 *cj, c128 *fk) { + return finufft_execute_t(reinterpret_cast *>(p), cj, fk); } -int finufftf_execute(finufftf_plan p, std::complex *cj, std::complex *fk) { - return finufft_execute_t(reinterpret_cast *>(p), cj, fk); +int finufftf_execute(finufftf_plan p, c64 *cj, c64 *fk) { + return finufft_execute_t(reinterpret_cast *>(p), cj, fk); } int finufft_destroy(finufft_plan p) @@ -62,7 +67,7 @@ int finufft_destroy(finufft_plan p) if (!p) // nullptr ptr, so not a ptr to a plan, report error return 1; - delete reinterpret_cast *>(p); + delete reinterpret_cast *>(p); p = nullptr; return 0; // 
success } @@ -75,7 +80,7 @@ int finufftf_destroy(finufftf_plan p) if (!p) // nullptr ptr, so not a ptr to a plan, report error return 1; - delete reinterpret_cast *>(p); + delete reinterpret_cast *>(p); p = nullptr; return 0; // success } @@ -85,10 +90,10 @@ namespace finufft { namespace common { template -static int invokeGuruInterface(int n_dims, int type, int n_transf, BIGINT nj, T *xj, - T *yj, T *zj, std::complex *cj, int iflag, T eps, - const std::array &n_modes, BIGINT nk, T *s, - T *t, T *u, std::complex *fk, finufft_opts *popts) +static int guruCall(int n_dims, int type, int n_transf, i64 nj, T *xj, T *yj, T *zj, + std::complex *cj, int iflag, T eps, + const std::array &n_modes, i64 nk, T *s, T *t, T *u, + std::complex *fk, finufft_opts *popts) // Helper layer between simple interfaces (with opts) and the guru functions. // Author: Andrea Malleo, 2019. { @@ -127,94 +132,84 @@ using namespace finufft::common; // Dimension 1111111111111111111111111111111111111111111111111111111111111111 -int finufft1d1many(int n_transf, BIGINT nj, double *xj, std::complex *cj, - int iflag, double eps, BIGINT ms, std::complex *fk, - finufft_opts *opts) +int finufft1d1many(int n_transf, i64 nj, f64 *xj, c128 *cj, int iflag, f64 eps, i64 ms, + c128 *fk, finufft_opts *opts) // Type-1 1D complex nonuniform FFT for many vectors. See ../docs/usage.rst { - return invokeGuruInterface(1, 1, n_transf, nj, xj, nullptr, nullptr, cj, iflag, - eps, {ms, 1, 1}, 0, nullptr, nullptr, nullptr, fk, - opts); + return guruCall(1, 1, n_transf, nj, xj, nullptr, nullptr, cj, iflag, eps, + {ms, 1, 1}, 0, nullptr, nullptr, nullptr, fk, opts); } -int finufftf1d1many(int n_transf, BIGINT nj, float *xj, std::complex *cj, - int iflag, float eps, BIGINT ms, std::complex *fk, - finufft_opts *opts) +int finufftf1d1many(int n_transf, i64 nj, f32 *xj, c64 *cj, int iflag, f32 eps, i64 ms, + c64 *fk, finufft_opts *opts) // Type-1 1D complex nonuniform FFT for many vectors. See ../docs/usage.rst { - return invokeGuruInterface(1, 1, n_transf, nj, xj, nullptr, nullptr, cj, iflag, - eps, {ms, 1, 1}, 0, nullptr, nullptr, nullptr, fk, - opts); + return guruCall(1, 1, n_transf, nj, xj, nullptr, nullptr, cj, iflag, eps, + {ms, 1, 1}, 0, nullptr, nullptr, nullptr, fk, opts); } -int finufft1d1(BIGINT nj, double *xj, std::complex *cj, int iflag, double eps, - BIGINT ms, std::complex *fk, finufft_opts *opts) +int finufft1d1(i64 nj, f64 *xj, c128 *cj, int iflag, f64 eps, i64 ms, c128 *fk, + finufft_opts *opts) // Type-1 1D complex nonuniform FFT. See ../docs/usage.rst { return finufft1d1many(1, nj, xj, cj, iflag, eps, ms, fk, opts); } -int finufftf1d1(BIGINT nj, float *xj, std::complex *cj, int iflag, float eps, - BIGINT ms, std::complex *fk, finufft_opts *opts) +int finufftf1d1(i64 nj, f32 *xj, c64 *cj, int iflag, f32 eps, i64 ms, c64 *fk, + finufft_opts *opts) // Type-1 1D complex nonuniform FFT. See ../docs/usage.rst { return finufftf1d1many(1, nj, xj, cj, iflag, eps, ms, fk, opts); } -int finufft1d2many(int n_transf, BIGINT nj, double *xj, std::complex *cj, - int iflag, double eps, BIGINT ms, std::complex *fk, - finufft_opts *opts) +int finufft1d2many(int n_transf, i64 nj, f64 *xj, c128 *cj, int iflag, f64 eps, i64 ms, + c128 *fk, finufft_opts *opts) // Type-2 1D complex nonuniform FFT, many vectors. 
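The body of ``guruCall`` is unchanged by this patch and therefore not shown; conceptually (a sketch under that assumption, not the actual helper), each simple-interface call reduces to one guru plan lifecycle, e.g. for the 1D type-1 case:

    #include <complex>
    #include <finufft.h>
    static int simple1d1_sketch(int64_t nj, double *xj, std::complex<double> *cj,
                                int iflag, double eps, int64_t ms,
                                std::complex<double> *fk, finufft_opts *opts) {
      finufft_plan p;
      int64_t nmodes[3] = {ms, 1, 1};
      int ier = finufft_makeplan(1, 1, nmodes, iflag, 1, eps, &p, opts);
      if (ier > 1) return ier;            // >1 is an error; 1 is only a warning
      ier = finufft_setpts(p, nj, xj, nullptr, nullptr, 0, nullptr, nullptr, nullptr);
      if (!ier) ier = finufft_execute(p, cj, fk);
      finufft_destroy(p);
      return ier;
    }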
See ../docs/usage.rst { - return invokeGuruInterface(1, 2, n_transf, nj, xj, nullptr, nullptr, cj, iflag, - eps, {ms, 1, 1}, 0, nullptr, nullptr, nullptr, fk, - opts); + return guruCall(1, 2, n_transf, nj, xj, nullptr, nullptr, cj, iflag, eps, + {ms, 1, 1}, 0, nullptr, nullptr, nullptr, fk, opts); } -int finufftf1d2many(int n_transf, BIGINT nj, float *xj, std::complex *cj, - int iflag, float eps, BIGINT ms, std::complex *fk, - finufft_opts *opts) +int finufftf1d2many(int n_transf, i64 nj, f32 *xj, c64 *cj, int iflag, f32 eps, i64 ms, + c64 *fk, finufft_opts *opts) // Type-2 1D complex nonuniform FFT, many vectors. See ../docs/usage.rst { - return invokeGuruInterface(1, 2, n_transf, nj, xj, nullptr, nullptr, cj, iflag, - eps, {ms, 1, 1}, 0, nullptr, nullptr, nullptr, fk, - opts); + return guruCall(1, 2, n_transf, nj, xj, nullptr, nullptr, cj, iflag, eps, + {ms, 1, 1}, 0, nullptr, nullptr, nullptr, fk, opts); } -int finufft1d2(BIGINT nj, double *xj, std::complex *cj, int iflag, double eps, - BIGINT ms, std::complex *fk, finufft_opts *opts) +int finufft1d2(i64 nj, f64 *xj, c128 *cj, int iflag, f64 eps, i64 ms, c128 *fk, + finufft_opts *opts) // Type-2 1D complex nonuniform FFT. See ../docs/usage.rst { return finufft1d2many(1, nj, xj, cj, iflag, eps, ms, fk, opts); } -int finufftf1d2(BIGINT nj, float *xj, std::complex *cj, int iflag, float eps, - BIGINT ms, std::complex *fk, finufft_opts *opts) +int finufftf1d2(i64 nj, f32 *xj, c64 *cj, int iflag, f32 eps, i64 ms, c64 *fk, + finufft_opts *opts) // Type-2 1D complex nonuniform FFT. See ../docs/usage.rst { return finufftf1d2many(1, nj, xj, cj, iflag, eps, ms, fk, opts); } -int finufft1d3many(int n_transf, BIGINT nj, double *xj, std::complex *cj, - int iflag, double eps, BIGINT nk, double *s, std::complex *fk, - finufft_opts *opts) +int finufft1d3many(int n_transf, i64 nj, f64 *xj, c128 *cj, int iflag, f64 eps, i64 nk, + f64 *s, c128 *fk, finufft_opts *opts) // Type-3 1D complex nonuniform FFT, many vectors. See ../docs/usage.rst { - return invokeGuruInterface(1, 3, n_transf, nj, xj, nullptr, nullptr, cj, iflag, - eps, {0, 0, 0}, nk, s, nullptr, nullptr, fk, opts); + return guruCall(1, 3, n_transf, nj, xj, nullptr, nullptr, cj, iflag, eps, + {0, 0, 0}, nk, s, nullptr, nullptr, fk, opts); } -int finufftf1d3many(int n_transf, BIGINT nj, float *xj, std::complex *cj, - int iflag, float eps, BIGINT nk, float *s, std::complex *fk, - finufft_opts *opts) +int finufftf1d3many(int n_transf, i64 nj, f32 *xj, c64 *cj, int iflag, f32 eps, i64 nk, + f32 *s, c64 *fk, finufft_opts *opts) // Type-3 1D complex nonuniform FFT, many vectors. See ../docs/usage.rst { - return invokeGuruInterface(1, 3, n_transf, nj, xj, nullptr, nullptr, cj, iflag, - eps, {0, 0, 0}, nk, s, nullptr, nullptr, fk, opts); + return guruCall(1, 3, n_transf, nj, xj, nullptr, nullptr, cj, iflag, eps, + {0, 0, 0}, nk, s, nullptr, nullptr, fk, opts); } -int finufft1d3(BIGINT nj, double *xj, std::complex *cj, int iflag, double eps, - BIGINT nk, double *s, std::complex *fk, finufft_opts *opts) +int finufft1d3(i64 nj, f64 *xj, c128 *cj, int iflag, f64 eps, i64 nk, f64 *s, c128 *fk, + finufft_opts *opts) // Type-3 1D complex nonuniform FFT. 
See ../docs/usage.rst { return finufft1d3many(1, nj, xj, cj, iflag, eps, nk, s, fk, opts); } -int finufftf1d3(BIGINT nj, float *xj, std::complex *cj, int iflag, float eps, - BIGINT nk, float *s, std::complex *fk, finufft_opts *opts) +int finufftf1d3(i64 nj, f32 *xj, c64 *cj, int iflag, f32 eps, i64 nk, f32 *s, c64 *fk, + finufft_opts *opts) // Type-3 1D complex nonuniform FFT. See ../docs/usage.rst { return finufftf1d3many(1, nj, xj, cj, iflag, eps, nk, s, fk, opts); @@ -222,94 +217,82 @@ int finufftf1d3(BIGINT nj, float *xj, std::complex *cj, int iflag, float // Dimension 22222222222222222222222222222222222222222222222222222222222222222 -int finufft2d1many(int n_transf, BIGINT nj, double *xj, double *yj, - std::complex *c, int iflag, double eps, BIGINT ms, BIGINT mt, - std::complex *fk, finufft_opts *opts) +int finufft2d1many(int n_transf, i64 nj, f64 *xj, f64 *yj, c128 *c, int iflag, f64 eps, + i64 ms, i64 mt, c128 *fk, finufft_opts *opts) // Type-1 2D complex nonuniform FFT, many vectors. See ../docs/usage.rst { - return invokeGuruInterface(2, 1, n_transf, nj, xj, yj, nullptr, c, iflag, eps, - {ms, mt, 1}, 0, nullptr, nullptr, nullptr, fk, opts); + return guruCall(2, 1, n_transf, nj, xj, yj, nullptr, c, iflag, eps, {ms, mt, 1}, 0, + nullptr, nullptr, nullptr, fk, opts); } -int finufftf2d1many(int n_transf, BIGINT nj, float *xj, float *yj, std::complex *c, - int iflag, float eps, BIGINT ms, BIGINT mt, std::complex *fk, - finufft_opts *opts) +int finufftf2d1many(int n_transf, i64 nj, f32 *xj, f32 *yj, c64 *c, int iflag, f32 eps, + i64 ms, i64 mt, c64 *fk, finufft_opts *opts) // Type-1 2D complex nonuniform FFT, many vectors. See ../docs/usage.rst { - return invokeGuruInterface(2, 1, n_transf, nj, xj, yj, nullptr, c, iflag, eps, - {ms, mt, 1}, 0, nullptr, nullptr, nullptr, fk, opts); + return guruCall(2, 1, n_transf, nj, xj, yj, nullptr, c, iflag, eps, {ms, mt, 1}, 0, + nullptr, nullptr, nullptr, fk, opts); } -int finufft2d1(BIGINT nj, double *xj, double *yj, std::complex *cj, int iflag, - double eps, BIGINT ms, BIGINT mt, std::complex *fk, - finufft_opts *opts) +int finufft2d1(i64 nj, f64 *xj, f64 *yj, c128 *cj, int iflag, f64 eps, i64 ms, i64 mt, + c128 *fk, finufft_opts *opts) // Type-1 2D complex nonuniform FFT. See ../docs/usage.rst { return finufft2d1many(1, nj, xj, yj, cj, iflag, eps, ms, mt, fk, opts); } -int finufftf2d1(BIGINT nj, float *xj, float *yj, std::complex *cj, int iflag, - float eps, BIGINT ms, BIGINT mt, std::complex *fk, - finufft_opts *opts) +int finufftf2d1(i64 nj, f32 *xj, f32 *yj, c64 *cj, int iflag, f32 eps, i64 ms, i64 mt, + c64 *fk, finufft_opts *opts) // Type-1 2D complex nonuniform FFT. See ../docs/usage.rst { return finufftf2d1many(1, nj, xj, yj, cj, iflag, eps, ms, mt, fk, opts); } -int finufft2d2many(int n_transf, BIGINT nj, double *xj, double *yj, - std::complex *c, int iflag, double eps, BIGINT ms, BIGINT mt, - std::complex *fk, finufft_opts *opts) +int finufft2d2many(int n_transf, i64 nj, f64 *xj, f64 *yj, c128 *c, int iflag, f64 eps, + i64 ms, i64 mt, c128 *fk, finufft_opts *opts) // Type-2 2D complex nonuniform FFT, many vectors. 
See ../docs/usage.rst { - return invokeGuruInterface(2, 2, n_transf, nj, xj, yj, nullptr, c, iflag, eps, - {ms, mt, 1}, 0, nullptr, nullptr, nullptr, fk, opts); + return guruCall(2, 2, n_transf, nj, xj, yj, nullptr, c, iflag, eps, {ms, mt, 1}, 0, + nullptr, nullptr, nullptr, fk, opts); } -int finufftf2d2many(int n_transf, BIGINT nj, float *xj, float *yj, std::complex *c, - int iflag, float eps, BIGINT ms, BIGINT mt, std::complex *fk, - finufft_opts *opts) +int finufftf2d2many(int n_transf, i64 nj, f32 *xj, f32 *yj, c64 *c, int iflag, f32 eps, + i64 ms, i64 mt, c64 *fk, finufft_opts *opts) // Type-2 2D complex nonuniform FFT, many vectors. See ../docs/usage.rst { - return invokeGuruInterface(2, 2, n_transf, nj, xj, yj, nullptr, c, iflag, eps, - {ms, mt, 1}, 0, nullptr, nullptr, nullptr, fk, opts); + return guruCall(2, 2, n_transf, nj, xj, yj, nullptr, c, iflag, eps, {ms, mt, 1}, 0, + nullptr, nullptr, nullptr, fk, opts); } -int finufft2d2(BIGINT nj, double *xj, double *yj, std::complex *cj, int iflag, - double eps, BIGINT ms, BIGINT mt, std::complex *fk, - finufft_opts *opts) +int finufft2d2(i64 nj, f64 *xj, f64 *yj, c128 *cj, int iflag, f64 eps, i64 ms, i64 mt, + c128 *fk, finufft_opts *opts) // Type-2 2D complex nonuniform FFT. See ../docs/usage.rst { return finufft2d2many(1, nj, xj, yj, cj, iflag, eps, ms, mt, fk, opts); } -int finufftf2d2(BIGINT nj, float *xj, float *yj, std::complex *cj, int iflag, - float eps, BIGINT ms, BIGINT mt, std::complex *fk, - finufft_opts *opts) +int finufftf2d2(i64 nj, f32 *xj, f32 *yj, c64 *cj, int iflag, f32 eps, i64 ms, i64 mt, + c64 *fk, finufft_opts *opts) // Type-2 2D complex nonuniform FFT. See ../docs/usage.rst { return finufftf2d2many(1, nj, xj, yj, cj, iflag, eps, ms, mt, fk, opts); } -int finufft2d3many(int n_transf, BIGINT nj, double *xj, double *yj, - std::complex *cj, int iflag, double eps, BIGINT nk, double *s, - double *t, std::complex *fk, finufft_opts *opts) +int finufft2d3many(int n_transf, i64 nj, f64 *xj, f64 *yj, c128 *cj, int iflag, f64 eps, + i64 nk, f64 *s, f64 *t, c128 *fk, finufft_opts *opts) // Type-3 2D complex nonuniform FFT, many vectors. See ../docs/usage.rst { - return invokeGuruInterface(2, 3, n_transf, nj, xj, yj, nullptr, cj, iflag, eps, - {0, 0, 0}, nk, s, t, nullptr, fk, opts); + return guruCall(2, 3, n_transf, nj, xj, yj, nullptr, cj, iflag, eps, {0, 0, 0}, nk, + s, t, nullptr, fk, opts); } -int finufftf2d3many(int n_transf, BIGINT nj, float *xj, float *yj, - std::complex *cj, int iflag, float eps, BIGINT nk, float *s, - float *t, std::complex *fk, finufft_opts *opts) +int finufftf2d3many(int n_transf, i64 nj, f32 *xj, f32 *yj, c64 *cj, int iflag, f32 eps, + i64 nk, f32 *s, f32 *t, c64 *fk, finufft_opts *opts) // Type-3 2D complex nonuniform FFT, many vectors. See ../docs/usage.rst { - return invokeGuruInterface(2, 3, n_transf, nj, xj, yj, nullptr, cj, iflag, eps, - {0, 0, 0}, nk, s, t, nullptr, fk, opts); + return guruCall(2, 3, n_transf, nj, xj, yj, nullptr, cj, iflag, eps, {0, 0, 0}, nk, + s, t, nullptr, fk, opts); } -int finufft2d3(BIGINT nj, double *xj, double *yj, std::complex *cj, int iflag, - double eps, BIGINT nk, double *s, double *t, std::complex *fk, - finufft_opts *opts) +int finufft2d3(i64 nj, f64 *xj, f64 *yj, c128 *cj, int iflag, f64 eps, i64 nk, f64 *s, + f64 *t, c128 *fk, finufft_opts *opts) // Type-3 2D complex nonuniform FFT. 
See ../docs/usage.rst { return finufft2d3many(1, nj, xj, yj, cj, iflag, eps, nk, s, t, fk, opts); } -int finufftf2d3(BIGINT nj, float *xj, float *yj, std::complex *cj, int iflag, - float eps, BIGINT nk, float *s, float *t, std::complex *fk, - finufft_opts *opts) +int finufftf2d3(i64 nj, f32 *xj, f32 *yj, c64 *cj, int iflag, f32 eps, i64 nk, f32 *s, + f32 *t, c64 *fk, finufft_opts *opts) // Type-3 2D complex nonuniform FFT. See ../docs/usage.rst { return finufftf2d3many(1, nj, xj, yj, cj, iflag, eps, nk, s, t, fk, opts); @@ -317,96 +300,82 @@ int finufftf2d3(BIGINT nj, float *xj, float *yj, std::complex *cj, int if // Dimension 3333333333333333333333333333333333333333333333333333333333333333 -int finufft3d1many(int n_transf, BIGINT nj, double *xj, double *yj, double *zj, - std::complex *cj, int iflag, double eps, BIGINT ms, BIGINT mt, - BIGINT mu, std::complex *fk, finufft_opts *opts) +int finufft3d1many(int n_transf, i64 nj, f64 *xj, f64 *yj, f64 *zj, c128 *cj, int iflag, + f64 eps, i64 ms, i64 mt, i64 mu, c128 *fk, finufft_opts *opts) // Type-1 3D complex nonuniform FFT, many vectors. See ../docs/usage.rst { - return invokeGuruInterface(3, 1, n_transf, nj, xj, yj, zj, cj, iflag, eps, - {ms, mt, mu}, 0, nullptr, nullptr, nullptr, fk, - opts); + return guruCall(3, 1, n_transf, nj, xj, yj, zj, cj, iflag, eps, {ms, mt, mu}, 0, + nullptr, nullptr, nullptr, fk, opts); } -int finufftf3d1many(int n_transf, BIGINT nj, float *xj, float *yj, float *zj, - std::complex *cj, int iflag, float eps, BIGINT ms, BIGINT mt, - BIGINT mu, std::complex *fk, finufft_opts *opts) +int finufftf3d1many(int n_transf, i64 nj, f32 *xj, f32 *yj, f32 *zj, c64 *cj, int iflag, + f32 eps, i64 ms, i64 mt, i64 mu, c64 *fk, finufft_opts *opts) // Type-1 3D complex nonuniform FFT, many vectors. See ../docs/usage.rst { - return invokeGuruInterface(3, 1, n_transf, nj, xj, yj, zj, cj, iflag, eps, - {ms, mt, mu}, 0, nullptr, nullptr, nullptr, fk, opts); + return guruCall(3, 1, n_transf, nj, xj, yj, zj, cj, iflag, eps, {ms, mt, mu}, 0, + nullptr, nullptr, nullptr, fk, opts); } -int finufft3d1(BIGINT nj, double *xj, double *yj, double *zj, std::complex *cj, - int iflag, double eps, BIGINT ms, BIGINT mt, BIGINT mu, - std::complex *fk, finufft_opts *opts) +int finufft3d1(i64 nj, f64 *xj, f64 *yj, f64 *zj, c128 *cj, int iflag, f64 eps, i64 ms, + i64 mt, i64 mu, c128 *fk, finufft_opts *opts) // Type-1 3D complex nonuniform FFT. See ../docs/usage.rst { return finufft3d1many(1, nj, xj, yj, zj, cj, iflag, eps, ms, mt, mu, fk, opts); } -int finufftf3d1(BIGINT nj, float *xj, float *yj, float *zj, std::complex *cj, - int iflag, float eps, BIGINT ms, BIGINT mt, BIGINT mu, - std::complex *fk, finufft_opts *opts) +int finufftf3d1(i64 nj, f32 *xj, f32 *yj, f32 *zj, c64 *cj, int iflag, f32 eps, i64 ms, + i64 mt, i64 mu, c64 *fk, finufft_opts *opts) // Type-1 3D complex nonuniform FFT. See ../docs/usage.rst { return finufftf3d1many(1, nj, xj, yj, zj, cj, iflag, eps, ms, mt, mu, fk, opts); } -int finufft3d2many(int n_transf, BIGINT nj, double *xj, double *yj, double *zj, - std::complex *cj, int iflag, double eps, BIGINT ms, BIGINT mt, - BIGINT mu, std::complex *fk, finufft_opts *opts) +int finufft3d2many(int n_transf, i64 nj, f64 *xj, f64 *yj, f64 *zj, c128 *cj, int iflag, + f64 eps, i64 ms, i64 mt, i64 mu, c128 *fk, finufft_opts *opts) // Type-2 3D complex nonuniform FFT, many vectors. 
See ../docs/usage.rst { - return invokeGuruInterface(3, 2, n_transf, nj, xj, yj, zj, cj, iflag, eps, - {ms, mt, mu}, 0, nullptr, nullptr, nullptr, fk, - opts); + return guruCall(3, 2, n_transf, nj, xj, yj, zj, cj, iflag, eps, {ms, mt, mu}, 0, + nullptr, nullptr, nullptr, fk, opts); } -int finufftf3d2many(int n_transf, BIGINT nj, float *xj, float *yj, float *zj, - std::complex *cj, int iflag, float eps, BIGINT ms, BIGINT mt, - BIGINT mu, std::complex *fk, finufft_opts *opts) +int finufftf3d2many(int n_transf, i64 nj, f32 *xj, f32 *yj, f32 *zj, c64 *cj, int iflag, + f32 eps, i64 ms, i64 mt, i64 mu, c64 *fk, finufft_opts *opts) // Type-2 3D complex nonuniform FFT, many vectors. See ../docs/usage.rst { - return invokeGuruInterface(3, 2, n_transf, nj, xj, yj, zj, cj, iflag, eps, - {ms, mt, mu}, 0, nullptr, nullptr, nullptr, fk, opts); + return guruCall(3, 2, n_transf, nj, xj, yj, zj, cj, iflag, eps, {ms, mt, mu}, 0, + nullptr, nullptr, nullptr, fk, opts); } -int finufft3d2(BIGINT nj, double *xj, double *yj, double *zj, std::complex *cj, - int iflag, double eps, BIGINT ms, BIGINT mt, BIGINT mu, - std::complex *fk, finufft_opts *opts) +int finufft3d2(i64 nj, f64 *xj, f64 *yj, f64 *zj, c128 *cj, int iflag, f64 eps, i64 ms, + i64 mt, i64 mu, c128 *fk, finufft_opts *opts) // Type-2 3D complex nonuniform FFT. See ../docs/usage.rst { return finufft3d2many(1, nj, xj, yj, zj, cj, iflag, eps, ms, mt, mu, fk, opts); } -int finufftf3d2(BIGINT nj, float *xj, float *yj, float *zj, std::complex *cj, - int iflag, float eps, BIGINT ms, BIGINT mt, BIGINT mu, - std::complex *fk, finufft_opts *opts) +int finufftf3d2(i64 nj, f32 *xj, f32 *yj, f32 *zj, c64 *cj, int iflag, f32 eps, i64 ms, + i64 mt, i64 mu, c64 *fk, finufft_opts *opts) // Type-2 3D complex nonuniform FFT. See ../docs/usage.rst { return finufftf3d2many(1, nj, xj, yj, zj, cj, iflag, eps, ms, mt, mu, fk, opts); } -int finufft3d3many(int n_transf, BIGINT nj, double *xj, double *yj, double *zj, - std::complex *cj, int iflag, double eps, BIGINT nk, double *s, - double *t, double *u, std::complex *fk, finufft_opts *opts) +int finufft3d3many(int n_transf, i64 nj, f64 *xj, f64 *yj, f64 *zj, c128 *cj, int iflag, + f64 eps, i64 nk, f64 *s, f64 *t, f64 *u, c128 *fk, finufft_opts *opts) // Type-3 3D complex nonuniform FFT, many vectors. See ../docs/usage.rst { - return invokeGuruInterface(3, 3, n_transf, nj, xj, yj, zj, cj, iflag, eps, - {0, 0, 0}, nk, s, t, u, fk, opts); + return guruCall(3, 3, n_transf, nj, xj, yj, zj, cj, iflag, eps, {0, 0, 0}, nk, s, + t, u, fk, opts); } -int finufftf3d3many(int n_transf, BIGINT nj, float *xj, float *yj, float *zj, - std::complex *cj, int iflag, float eps, BIGINT nk, float *s, - float *t, float *u, std::complex *fk, finufft_opts *opts) +int finufftf3d3many(int n_transf, i64 nj, f32 *xj, f32 *yj, f32 *zj, c64 *cj, int iflag, + f32 eps, i64 nk, f32 *s, f32 *t, f32 *u, c64 *fk, finufft_opts *opts) // Type-3 3D complex nonuniform FFT, many vectors. 
See ../docs/usage.rst { - return invokeGuruInterface(3, 3, n_transf, nj, xj, yj, zj, cj, iflag, eps, - {0, 0, 0}, nk, s, t, u, fk, opts); + return guruCall(3, 3, n_transf, nj, xj, yj, zj, cj, iflag, eps, {0, 0, 0}, nk, s, + t, u, fk, opts); } -int finufft3d3(BIGINT nj, double *xj, double *yj, double *zj, std::complex *cj, - int iflag, double eps, BIGINT nk, double *s, double *t, double *u, - std::complex *fk, finufft_opts *opts) +int finufft3d3(i64 nj, f64 *xj, f64 *yj, f64 *zj, c128 *cj, int iflag, f64 eps, i64 nk, + f64 *s, f64 *t, f64 *u, c128 *fk, finufft_opts *opts) // Type-3 3D complex nonuniform FFT. See ../docs/usage.rst { return finufft3d3many(1, nj, xj, yj, zj, cj, iflag, eps, nk, s, t, u, fk, opts); } -int finufftf3d3(BIGINT nj, float *xj, float *yj, float *zj, std::complex *cj, - int iflag, float eps, BIGINT nk, float *s, float *t, float *u, - std::complex *fk, finufft_opts *opts) +int finufftf3d3(i64 nj, f32 *xj, f32 *yj, f32 *zj, c64 *cj, int iflag, f32 eps, i64 nk, + f32 *s, f32 *t, f32 *u, c64 *fk, finufft_opts *opts) // Type-3 3D complex nonuniform FFT. See ../docs/usage.rst { return finufftf3d3many(1, nj, xj, yj, zj, cj, iflag, eps, nk, s, t, u, fk, opts); From ca140632eb96701b23e7c616130287e1f72a36d3 Mon Sep 17 00:00:00 2001 From: Martin Reinecke Date: Fri, 18 Oct 2024 21:13:31 +0200 Subject: [PATCH 30/32] pass options by reference --- src/finufft_core.cpp | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/src/finufft_core.cpp b/src/finufft_core.cpp index c2e32852a..b5ce38068 100644 --- a/src/finufft_core.cpp +++ b/src/finufft_core.cpp @@ -80,8 +80,8 @@ namespace common { static constexpr double PI = 3.14159265358979329; -static int set_nf_type12(BIGINT ms, finufft_opts opts, finufft_spread_opts spopts, - BIGINT *nf) +static int set_nf_type12(BIGINT ms, const finufft_opts &opts, + const finufft_spread_opts &spopts, BIGINT *nf) // Type 1 & 2 recipe for how to set 1d size of upsampled array, nf, given opts // and requested number of Fourier modes ms. Returns 0 if success, else an // error code if nf was unreasonably big (& tell the world). @@ -101,8 +101,8 @@ static int set_nf_type12(BIGINT ms, finufft_opts opts, finufft_spread_opts spopt } template -static int setup_spreader_for_nufft(finufft_spread_opts &spopts, T eps, finufft_opts opts, - int dim) +static int setup_spreader_for_nufft(finufft_spread_opts &spopts, T eps, + const finufft_opts &opts, int dim) // Set up the spreader parameters given eps, and pass across various nufft // options. Return status of setup_spreader. Uses pass-by-ref. Barnett 10/30/17 { @@ -127,8 +127,8 @@ static int setup_spreader_for_nufft(finufft_spread_opts &spopts, T eps, finufft_ } template -static void set_nhg_type3(T S, T X, finufft_opts opts, finufft_spread_opts spopts, - BIGINT *nf, T *h, T *gam) +static void set_nhg_type3(T S, T X, const finufft_opts &opts, + const finufft_spread_opts &spopts, BIGINT *nf, T *h, T *gam) /* sets nf, h (upsampled grid spacing), and gamma (x_j rescaling factor), for type 3 only. 
Inputs: @@ -167,7 +167,7 @@ static void set_nhg_type3(T S, T X, finufft_opts opts, finufft_spread_opts spopt template static void onedim_fseries_kernel(BIGINT nf, std::vector &fwkerhalf, - finufft_spread_opts opts) + const finufft_spread_opts &opts) /* Approximates exact Fourier series coeffs of cnufftspread's real symmetric kernel, directly via q-node quadrature on Euler-Fourier formula, exploiting @@ -232,7 +232,7 @@ static void onedim_fseries_kernel(BIGINT nf, std::vector &fwkerhalf, template static void onedim_nuft_kernel(BIGINT nk, const std::vector &k, std::vector &phihat, - finufft_spread_opts opts) + const finufft_spread_opts &opts) /* Approximates exact 1D Fourier transform of cnufftspread's real symmetric kernel, directly via q-node quadrature on Euler-Fourier formula, exploiting From 74ef403bebd64b0bded50a343d5aeb75e7a5aa91 Mon Sep 17 00:00:00 2001 From: Martin Reinecke Date: Sat, 19 Oct 2024 10:04:23 +0200 Subject: [PATCH 31/32] address review comments --- include/finufft/defs.h | 2 -- makefile | 34 ++++++++++++++++------------------ src/finufft_core.cpp | 7 ++++--- src/spreadinterp.cpp | 2 +- 4 files changed, 21 insertions(+), 24 deletions(-) diff --git a/include/finufft/defs.h b/include/finufft/defs.h index 084ffa41c..633a7c1f3 100644 --- a/include/finufft/defs.h +++ b/include/finufft/defs.h @@ -26,8 +26,6 @@ // All indexing in library that potentially can exceed 2^31 uses 64-bit signed. // This includes all calling arguments (eg M,N) that could be huge someday. -// using BIGINT = int64_t; -// using UBIGINT = uint64_t; // Precision-independent real and complex types, for private lib/test compile #ifdef SINGLE using FLT = float; diff --git a/makefile b/makefile index 4c07f522e..c9754a376 100644 --- a/makefile +++ b/makefile @@ -133,13 +133,11 @@ STATICLIB = lib-static/$(LIBNAME).a # absolute path to the .so, useful for linking so executables portable... ABSDYNLIB = $(FINUFFT)$(DYNLIB) -# spreader dual-precision objs -SOBJSD = src/utils.o src/spreadinterp.o +# spreader objs +SOBJS = src/utils.o src/spreadinterp.o -# precision-independent library object files (compiled & linked only once)... -OBJS_PI = $(SOBJSD) contrib/legendre_rule_fast.o src/fft.o src/finufft_core.o src/simpleinterfaces.o fortran/finufftfort.o # all lib dual-precision objs (note DUCC_OBJS empty if unused) -OBJSD = $(OBJS_PI) $(DUCC_OBJS) +OBJS = $(SOBJS) contrib/legendre_rule_fast.o src/fft.o src/finufft_core.o src/simpleinterfaces.o fortran/finufftfort.o $(DUCC_OBJS) .PHONY: usage lib examples test perftest spreadtest spreadtestall fortran matlab octave all mex python clean objclean pyclean mexclean wheel docker-wheel gurutime docs setup setupclean @@ -199,17 +197,17 @@ src/spreadinterp.o: $(SHEAD) # lib ----------------------------------------------------------------------- # build library with double/single prec both bundled in... lib: $(STATICLIB) $(DYNLIB) -$(STATICLIB): $(OBJSD) - ar rcs $(STATICLIB) $(OBJSD) +$(STATICLIB): $(OBJS) + ar rcs $(STATICLIB) $(OBJS) ifeq ($(OMP),OFF) @echo "$(STATICLIB) built, single-thread version" else @echo "$(STATICLIB) built, multithreaded version" endif -$(DYNLIB): $(OBJSD) +$(DYNLIB): $(OBJS) # using *absolute* path in the -o here is needed to make portable executables # when compiled against it, in mac OSX, strangely... 
- $(CXX) -shared ${LDFLAGS} $(OMPFLAGS) $(OBJSD) -o $(ABSDYNLIB) $(LIBSFFT) + $(CXX) -shared ${LDFLAGS} $(OMPFLAGS) $(OBJS) -o $(ABSDYNLIB) $(LIBSFFT) ifeq ($(OMP),OFF) @echo "$(DYNLIB) built, single-thread version" else @@ -313,14 +311,14 @@ ST=perftest/spreadtestnd STA=perftest/spreadtestndall STF=$(ST)f STAF=$(STA)f -$(ST): $(ST).cpp $(SOBJSD) - $(CXX) $(CXXFLAGS) ${LDFLAGS} $< $(SOBJSD) $(LIBS) -o $@ -$(STF): $(ST).cpp $(SOBJSD) - $(CXX) $(CXXFLAGS) ${LDFLAGS} -DSINGLE $< $(SOBJSD) $(LIBS) -o $@ -$(STA): $(STA).cpp $(SOBJSD) - $(CXX) $(CXXFLAGS) ${LDFLAGS} $< $(SOBJSD) $(LIBS) -o $@ -$(STAF): $(STA).cpp $(SOBJSD) - $(CXX) $(CXXFLAGS) ${LDFLAGS} -DSINGLE $< $(SOBJSD) $(LIBS) -o $@ +$(ST): $(ST).cpp $(SOBJS) + $(CXX) $(CXXFLAGS) ${LDFLAGS} $< $(SOBJS) $(LIBS) -o $@ +$(STF): $(ST).cpp $(SOBJS) + $(CXX) $(CXXFLAGS) ${LDFLAGS} -DSINGLE $< $(SOBJS) $(LIBS) -o $@ +$(STA): $(STA).cpp $(SOBJS) + $(CXX) $(CXXFLAGS) ${LDFLAGS} $< $(SOBJS) $(LIBS) -o $@ +$(STAF): $(STA).cpp $(SOBJS) + $(CXX) $(CXXFLAGS) ${LDFLAGS} -DSINGLE $< $(SOBJS) $(LIBS) -o $@ spreadtest: $(ST) $(STF) # run one thread per core... (escape the $ to get single $ in bash; one big cmd) (export OMP_NUM_THREADS=$$(perftest/mynumcores.sh) ;\ @@ -424,7 +422,7 @@ endif # python --------------------------------------------------------------------- python: $(STATICLIB) $(DYNLIB) - FINUFFT_DIR=$(FINUFFT) $(PYTHON) -m pip -v install --break-system-packages python/finufft + FINUFFT_DIR=$(FINUFFT) $(PYTHON) -m pip -v install python/finufft # note to devs: if trouble w/ NumPy, use: pip install ./python --no-deps $(PYTHON) python/finufft/test/run_accuracy_tests.py $(PYTHON) python/finufft/examples/simple1d1.py diff --git a/src/finufft_core.cpp b/src/finufft_core.cpp index b5ce38068..e950e2160 100644 --- a/src/finufft_core.cpp +++ b/src/finufft_core.cpp @@ -205,8 +205,9 @@ static void onedim_fseries_kernel(BIGINT nf, std::vector &fwkerhalf, for (int n = 0; n < q; ++n) { // set up nodes z_n and vals f_n z[n] *= J2; // rescale nodes f[n] = J2 * (T)w[n] * evaluate_kernel((T)z[n], opts); // vals & quadr wei - a[n] = -exp(2 * PI * std::complex(0, 1) * z[n] / double(nf)); // phase winding - // rates + a[n] = -std::exp(2 * PI * std::complex(0, 1) * z[n] / double(nf)); // phase + // winding + // rates } BIGINT nout = nf / 2 + 1; // how many values we're writing to int nt = std::min(nout, (BIGINT)opts.nthreads); // how many chunks @@ -218,7 +219,7 @@ static void onedim_fseries_kernel(BIGINT nf, std::vector &fwkerhalf, int t = MY_OMP_GET_THREAD_NUM(); std::complex aj[MAX_NQUAD]; // phase rotator for this thread for (int n = 0; n < q; ++n) - aj[n] = pow(a[n], (T)brk[t]); // init phase factors for chunk + aj[n] = std::pow(a[n], (T)brk[t]); // init phase factors for chunk for (BIGINT j = brk[t]; j < brk[t + 1]; ++j) { // loop along output array T x = 0.0; // accumulator for answer at this j for (int n = 0; n < q; ++n) { diff --git a/src/spreadinterp.cpp b/src/spreadinterp.cpp index 7c7309de2..191f7e0bc 100644 --- a/src/spreadinterp.cpp +++ b/src/spreadinterp.cpp @@ -225,7 +225,7 @@ static FINUFFT_ALWAYS_INLINE T fold_rescale(const T x, const UBIGINT N) noexcept template static FINUFFT_ALWAYS_INLINE simd_type fold_rescale(const simd_type &x, - const BIGINT N) noexcept { + const UBIGINT N) noexcept { const simd_type x2pi = T(M_1_2PI); const simd_type result = xsimd::fma(x, x2pi, simd_type(0.5)); return (result - xsimd::floor(result)) * simd_type(T(N)); From 60611ebd8a4891120a6dec24c5747e41015bab90 Mon Sep 17 00:00:00 2001 From: Martin Reinecke Date: Sat, 19 
Oct 2024 10:11:48 +0200 Subject: [PATCH 32/32] revert BIGINT->UBIGINT change, seems to have bigger ramifications --- src/spreadinterp.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/spreadinterp.cpp b/src/spreadinterp.cpp index 191f7e0bc..7c7309de2 100644 --- a/src/spreadinterp.cpp +++ b/src/spreadinterp.cpp @@ -225,7 +225,7 @@ static FINUFFT_ALWAYS_INLINE T fold_rescale(const T x, const UBIGINT N) noexcept template static FINUFFT_ALWAYS_INLINE simd_type fold_rescale(const simd_type &x, - const UBIGINT N) noexcept { + const BIGINT N) noexcept { const simd_type x2pi = T(M_1_2PI); const simd_type result = xsimd::fma(x, x2pi, simd_type(0.5)); return (result - xsimd::floor(result)) * simd_type(T(N));
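For reference, a scalar restatement of the ``fold_rescale`` arithmetic in this final hunk (illustrative only; it takes ``M_1_2PI`` to be 1/(2*pi), as its use here implies): a periodic coordinate x is mapped into grid units on [0, N) by taking the fractional part of x/(2*pi) + 1/2 and scaling by N.

    #include <cmath>
    #include <cstdint>
    template <typename T> static T fold_rescale_scalar(T x, int64_t N) {
      const T inv2pi = T(0.15915494309189535); // 1/(2*pi), playing the role of M_1_2PI
      T r = x * inv2pi + T(0.5);               // x/(2*pi) + 1/2
      return (r - std::floor(r)) * T(N);       // fold into [0,1), then scale to [0,N)
    }
    // e.g. x = 0 with N = 100 maps to 50.0; x = -pi maps to 0.0, matching the SIMD code.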