massive debugging petapm.c, almost works

MP-Gadget · Dec 27, 2024 · 4704c61 · 4704c61
1 parent 3ff7a85
commit 4704c61
Showing 1 changed file with 102 additions and 36 deletions.
diff --git a/libgadget/petapm.c b/libgadget/petapm.c
@@ -110,12 +110,17 @@ petapm_init(PetaPM * pm, double BoxSize, double Asmth, int Nmesh, double G, MPI_
 
     int ThisTask;
     int NTask;
+    pm->Mesh2Task[0] = (int *) mymalloc2("Mesh2Task", 2*sizeof(int) * Nmesh);
+    pm->Mesh2Task[1] = pm->Mesh2Task[0] + Nmesh;
     MPI_Comm_rank(comm, &ThisTask);
     MPI_Comm_size(comm, &NTask);
+
     int ndevices;
     cudaGetDeviceCount(&ndevices);
     cudaSetDevice(ThisTask % ndevices);
 
+    message(0, "Cuda Devices %d \n", ndevices);
+
     /* try to find a square 2d decomposition */
     /* CUDA NOTE: CufftMp only supports square decomposition,
     so Ntask has to be a perfect square*/
@@ -126,6 +131,7 @@ petapm_init(PetaPM * pm, double BoxSize, double Asmth, int Nmesh, double G, MPI_
     }
 
     message(0, "Using 2D Task mesh %td x %td \n", nranks1d, nranks1d);
+
     // Define custom data distribution
     int64 nx               = Nmesh;
     int64 ny               = Nmesh;
@@ -134,44 +140,54 @@ petapm_init(PetaPM * pm, double BoxSize, double Asmth, int Nmesh, double G, MPI_
     int64 nz_complex       = (nz/2+1);
     int64 nz_real_padded   = 2*nz_complex;
 
-    // Describe the data distribution using boxes
-    auto make_box = [](int64 lower[3], int64 upper[3], int64 strides[3]) {
-        Box3D box;
-        for(int i = 0; i < 3; i++) {
-            box.lower[i] = lower[i];
-            box.upper[i] = upper[i];
-            box.strides[i] = strides[i];
-        }
-        return box;
-    };
+    // create 2D cartesian MPI comm without pfft
+    int dims[2] = {nranks1d, nranks1d};
+    int periods[2] = {0, 0};  // non-periodic in both dimensions
+    // Allow the ranks to be reordered by MPI for efficiency
+    // Actually don't allow reordering for now to be safe
+    int reorder = 0;
+    MPI_Cart_create(comm, 2, dims, periods, reorder, &pm->priv->comm_cart_2d);
+    if (pm->priv->comm_cart_2d == MPI_COMM_NULL) {
+        endrun(0, "Error: comm_cart_2d is MPI_COMM_NULL\n");
+    }
+    MPI_Cart_get(pm->priv->comm_cart_2d, 2, pm->NTask2d, periods, pm->ThisTask2d);
+    message(1, "Task = %d ThisTask2d = (%d, %d) Ntask2d = (%d, %d) \n",
+        ThisTask, pm->ThisTask2d[0], pm->ThisTask2d[1], pm->NTask2d[0], pm->NTask2d[1]);
 
+
+    // compute offset, size and strides
     auto displacement = [](int64 length, int rank, int size) {
         int ranks_cutoff = length % size;
-        return (rank < ranks_cutoff ? rank * (length / size + 1) : ranks_cutoff * (length / size + 1) + (rank - ranks_cutoff) * (length / size));
+        int chunk_size = length / size;
+        return (rank < ranks_cutoff ? rank * (chunk_size + 1) : ranks_cutoff * (chunk_size + 1) + (rank - ranks_cutoff) * chunk_size);
     };
 
-    // Input data are real pencils in X & Y, along Z
-    // Strides are packed and in-place (i.e., real is padded)
-    Box3D box_real;
-    Box3D box_complex;
-    int i,j;
-    i = ThisTask / nranks1d;
-    j = ThisTask % nranks1d;
-
-    int64 lower[3]   = {displacement(nx, i,   nranks1d), displacement(ny, j,   nranks1d), 0};
-    int64 upper[3]   = {displacement(nx, i+1, nranks1d), displacement(ny, j+1, nranks1d), nz_real};
-    int64 strides[3] = {(upper[1]-lower[1])*nz_real_padded, nz_real_padded, 1};
-    box_real = make_box(lower, upper, strides);
-
-    // Output data are complex pencils in X & Z, along Y (picked arbitrarily)
-    // Strides are packed
-    // For best performances, the local dimension in the input (Z, here) and output (Y, here) should be different
-    // to ensure cuFFTMp will only perform two communication phases.
-    // If Z was also local in the output, cuFFTMp would perform three communication phases, decreasing performances.
-    int64 lower_c[3]   = {displacement(nx, i,   nranks1d), 0,  displacement(nz_complex, j,   nranks1d)};
-    int64 upper_c[3]   = {displacement(nx, i+1, nranks1d), ny, displacement(nz_complex, j+1, nranks1d)};
-    int64 strides_c[3] = {(upper[1]-lower[1])*(upper[2]-lower[2]), (upper[2]-lower[2]), 1};
-    box_complex = make_box(lower_c, upper_c, strides_c);
+    // update region properties
+    auto update_region = [](int64 lower[3], int64 upper[3], int64 strides[3], PetaPMRegion &region) {
+        for (int i = 0; i < 3; i++) {
+            region.offset[i]  = lower[i];
+            region.upper[i]   = upper[i];
+            region.size[i]    = upper[i] - lower[i];
+            region.strides[i] = strides[i];
+        }
+    };
+
+    int i = ThisTask / nranks1d;
+    int j = ThisTask % nranks1d;
+
+    // real region setup
+    // note the petapm->region has non-padded strides, while cufft takes in padded strides
+    int64 lower_real[3]   = {displacement(nx, i, nranks1d), displacement(ny, j, nranks1d), 0};
+    int64 upper_real[3]   = {displacement(nx, i+1, nranks1d), displacement(ny, j+1, nranks1d), nz_real};
+    int64 strides_real[3] = {(upper_real[1] - lower_real[1]) * nz_real_padded, nz_real_padded, 1};
+    int64 strides_real_nopad[3] = {(upper_real[1] - lower_real[1]) * nz_real, nz_real, 1};
+    update_region(lower_real, upper_real, strides_real_nopad, pm->real_space_region);
+
+    // complex region setup
+    int64 lower_fourier[3]   = {displacement(nx, i, nranks1d), 0, displacement(nz_complex, j, nranks1d)};
+    int64 upper_fourier[3]   = {displacement(nx, i+1, nranks1d), ny, displacement(nz_complex, j+1, nranks1d)};
+    int64 strides_fourier[3] = {(upper_fourier[1] - lower_fourier[1]) * (upper_fourier[2] - lower_fourier[2]), (upper_fourier[2] - lower_fourier[2]), 1};
+    update_region(lower_fourier, upper_fourier, strides_fourier, pm->fourier_space_region);
 
     //===============================================================================================
     cudaStreamCreate(&pm->priv->stream);
@@ -186,8 +202,8 @@ petapm_init(PetaPM * pm, double BoxSize, double Asmth, int Nmesh, double G, MPI_
     // C2R plans only support CUFFT_XT_FORMAT_DISTRIBUTED_OUTPUT ans always perform a CUFFT_INVERSE transform
     // So, in both, the "input" box should be the real box and the "output" box should be the complex box
 
-    cufftXtSetDistribution(pm->priv->plan_forw, 3, box_real.lower, box_real.upper, box_complex.lower, box_complex.upper, box_real.strides, box_complex.strides);
-    cufftXtSetDistribution(pm->priv->plan_back, 3, box_complex.lower, box_complex.upper, box_real.lower, box_real.upper, box_complex.strides, box_real.strides);
+    cufftXtSetDistribution(pm->priv->plan_forw, 3, lower_real, upper_real, lower_fourier, upper_fourier, strides_real, strides_fourier);
+    cufftXtSetDistribution(pm->priv->plan_back, 3, lower_real, upper_real, lower_fourier, upper_fourier, strides_real, strides_fourier);
 
     // Set the stream
     cufftSetStream(pm->priv->plan_forw, pm->priv->stream);
@@ -197,7 +213,56 @@ petapm_init(PetaPM * pm, double BoxSize, double Asmth, int Nmesh, double G, MPI_
     size_t workspace;
     cufftMakePlan3d(pm->priv->plan_forw, Nmesh, Nmesh, Nmesh, CUFFT_R2C, &workspace);
     cufftMakePlan3d(pm->priv->plan_back, Nmesh, Nmesh, Nmesh, CUFFT_C2R, &workspace);
+
+    // Allocate GPU memory, copy CPU data to GPU
+    // Data is initially distributed according to CUFFT_XT_FORMAT_DISTRIBUTED_INPUT, i.e., box_real
+    cudaLibXtDesc *desc;
+    cufftXtMalloc(pm->priv->plan_forw, &desc, CUFFT_XT_FORMAT_DISTRIBUTED_INPUT);
+    pm->priv->fftsize = desc->descriptor->size[0];
     //===============================================================================================
+
+    message(1, "Task %d NGPUs=%d, pfftsize=%d \n", ThisTask, desc->descriptor->nGPUs, pm->priv->fftsize);
+    /* now lets fill up the mesh2task arrays */
+    #if 1
+        message(1, "Real Region %d lower=(%td %td %td) upper=(%td %td %td) strides=(%td %td %td)\n", ThisTask,
+                pm->real_space_region.offset[0],
+                pm->real_space_region.offset[1],
+                pm->real_space_region.offset[2],
+                pm->real_space_region.upper[0],
+                pm->real_space_region.upper[1],
+                pm->real_space_region.upper[2],
+                pm->real_space_region.strides[0],
+                pm->real_space_region.strides[1],
+                pm->real_space_region.strides[2]);
+        message(1, "Complex Region %d lower=(%td %td %td) upper=(%td %td %td) strides=(%td %td %td)\n", ThisTask,
+                pm->fourier_space_region.offset[0],
+                pm->fourier_space_region.offset[1],
+                pm->fourier_space_region.offset[2],
+                pm->fourier_space_region.upper[0],
+                pm->fourier_space_region.upper[1],
+                pm->fourier_space_region.upper[2],
+                pm->fourier_space_region.strides[0],
+                pm->fourier_space_region.strides[1],
+                pm->fourier_space_region.strides[2]);
+    #endif
+
+        int * tmp = (int *) mymalloc("tmp", sizeof(int) * Nmesh);
+        int k;
+        for(k = 0; k < 2; k ++) {
+            for(i = 0; i < Nmesh; i ++) {
+                tmp[i] = 0;
+            }
+            for(i = 0; i < pm->real_space_region.size[k]; i ++) {
+                tmp[i + pm->real_space_region.offset[k]] = pm->ThisTask2d[k];
+            }
+            /* which column / row hosts this tile? */
+            /* FIXME: this is very inefficient */
+            MPI_Allreduce(tmp, pm->Mesh2Task[k], Nmesh, MPI_INT, MPI_MAX, comm);
+            for(i = 0; i < Nmesh; i ++) {
+                message(0, "Mesh2Task[%d][%d] == %d\n", k, i, pm->Mesh2Task[k][i]);
+            }
+        }
+        myfree(tmp);
 }
 
 void
@@ -643,7 +708,7 @@ layout_build_and_exchange_cells_to_fft(
             L->BufRecv, L->NcRecv, L->DcRecv, MPI_DOUBLE,
             L->comm);
 
-#if 0
+#if 1
     double massExport = 0;
     for(i = 0; i < L->NcExport; i ++) {
         massExport += L->BufSend[i];
@@ -950,3 +1015,4 @@ static int64_t reduce_int64(int64_t input, MPI_Comm comm) {
  * iCFT(CFT) = 2pi
  *
  * **************************8*/
+