Skip to content

Commit

Permalink
massive debugging petapm.c, almost works
Browse files Browse the repository at this point in the history
  • Loading branch information
nianyic7 authored and sbird committed Nov 1, 2024
1 parent 488dec6 commit f23d950
Showing 1 changed file with 103 additions and 38 deletions.
141 changes: 103 additions & 38 deletions libgadget/petapm.c
Original file line number Diff line number Diff line change
Expand Up @@ -110,12 +110,17 @@ petapm_init(PetaPM * pm, double BoxSize, double Asmth, int Nmesh, double G, MPI_

int ThisTask;
int NTask;
pm->Mesh2Task[0] = (int *) mymalloc2("Mesh2Task", 2*sizeof(int) * Nmesh);
pm->Mesh2Task[1] = pm->Mesh2Task[0] + Nmesh;
MPI_Comm_rank(comm, &ThisTask);
MPI_Comm_size(comm, &NTask);

int ndevices;
cudaGetDeviceCount(&ndevices);
cudaSetDevice(ThisTask % ndevices);

message(0, "Cuda Devices %d \n", ndevices);

/* try to find a square 2d decomposition */
/* CUDA NOTE: CufftMp only supports square decomposition,
so Ntask has to be a perfect square*/
Expand All @@ -126,6 +131,7 @@ petapm_init(PetaPM * pm, double BoxSize, double Asmth, int Nmesh, double G, MPI_
}

message(0, "Using 2D Task mesh %td x %td \n", nranks1d, nranks1d);

// Define custom data distribution
int64 nx = Nmesh;
int64 ny = Nmesh;
Expand All @@ -134,45 +140,54 @@ petapm_init(PetaPM * pm, double BoxSize, double Asmth, int Nmesh, double G, MPI_
int64 nz_complex = (nz/2+1);
int64 nz_real_padded = 2*nz_complex;

// Describe the data distribution using boxes
auto make_box = [](int64 lower[3], int64 upper[3], int64 strides[3]) {
Box3D box;
for(int i = 0; i < 3; i++) {
box.lower[i] = lower[i];
box.upper[i] = upper[i];
box.strides[i] = strides[i];
}
return box;
};
// create 2D cartesian MPI comm without pfft
int dims[2] = {nranks1d, nranks1d};
int periods[2] = {0, 0}; // non-periodic in both dimensions
// Allow the ranks to be reordered by MPI for efficiency
// Actually don't allow reordering for now to be safe
int reorder = 0;
MPI_Cart_create(comm, 2, dims, periods, reorder, &pm->priv->comm_cart_2d);
if (pm->priv->comm_cart_2d == MPI_COMM_NULL) {
endrun(0, "Error: comm_cart_2d is MPI_COMM_NULL\n");
}
MPI_Cart_get(pm->priv->comm_cart_2d, 2, pm->NTask2d, periods, pm->ThisTask2d);
message(1, "Task = %d ThisTask2d = (%d, %d) Ntask2d = (%d, %d) \n",
ThisTask, pm->ThisTask2d[0], pm->ThisTask2d[1], pm->NTask2d[0], pm->NTask2d[1]);


// compute offset, size and strides
auto displacement = [](int64 length, int rank, int size) {
int ranks_cutoff = length % size;
return (rank < ranks_cutoff ? rank * (length / size + 1) : ranks_cutoff * (length / size + 1) + (rank - ranks_cutoff) * (length / size));
int chunk_size = length / size;
return (rank < ranks_cutoff ? rank * (chunk_size + 1) : ranks_cutoff * (chunk_size + 1) + (rank - ranks_cutoff) * chunk_size);
};

// Input data are real pencils in X & Y, along Z
// Strides are packed and in-place (i.e., real is padded)
Box3D box_real;
Box3D box_complex;
int i,j;
i = ThisTask / nranks1d;
j = ThisTask % nranks1d;

int64 lower[3] = {displacement(nx, i, nranks1d), displacement(ny, j, nranks1d), 0};
int64 upper[3] = {displacement(nx, i+1, nranks1d), displacement(ny, j+1, nranks1d), nz_real};
int64 strides[3] = {(upper[1]-lower[1])*nz_real_padded, nz_real_padded, 1};
box_real = make_box(lower, upper, strides);

// Output data are complex pencils in X & Z, along Y (picked arbitrarily)
// Strides are packed
// For best performances, the local dimension in the input (Z, here) and output (Y, here) should be different
// to ensure cuFFTMp will only perform two communication phases.
// If Z was also local in the output, cuFFTMp would perform three communication phases, decreasing performances.
int64 lower_c[3] = {displacement(nx, i, nranks1d), 0, displacement(nz_complex, j, nranks1d)};
int64 upper_c[3] = {displacement(nx, i+1, nranks1d), ny, displacement(nz_complex, j+1, nranks1d)};
int64 strides_c[3] = {(upper[1]-lower[1])*(upper[2]-lower[2]), (upper[2]-lower[2]), 1};
box_complex = make_box(lower_c, upper_c, strides_c);


// update region properties
auto update_region = [](int64 lower[3], int64 upper[3], int64 strides[3], PetaPMRegion &region) {
for (int i = 0; i < 3; i++) {
region.offset[i] = lower[i];
region.upper[i] = upper[i];
region.size[i] = upper[i] - lower[i];
region.strides[i] = strides[i];
}
};

int i = ThisTask / nranks1d;
int j = ThisTask % nranks1d;

// real region setup
// note the petapm->region has non-padded strides, while cufft takes in padded strides
int64 lower_real[3] = {displacement(nx, i, nranks1d), displacement(ny, j, nranks1d), 0};
int64 upper_real[3] = {displacement(nx, i+1, nranks1d), displacement(ny, j+1, nranks1d), nz_real};
int64 strides_real[3] = {(upper_real[1] - lower_real[1]) * nz_real_padded, nz_real_padded, 1};
int64 strides_real_nopad[3] = {(upper_real[1] - lower_real[1]) * nz_real, nz_real, 1};
update_region(lower_real, upper_real, strides_real_nopad, pm->real_space_region);

// complex region setup
int64 lower_fourier[3] = {displacement(nx, i, nranks1d), 0, displacement(nz_complex, j, nranks1d)};
int64 upper_fourier[3] = {displacement(nx, i+1, nranks1d), ny, displacement(nz_complex, j+1, nranks1d)};
int64 strides_fourier[3] = {(upper_fourier[1] - lower_fourier[1]) * (upper_fourier[2] - lower_fourier[2]), (upper_fourier[2] - lower_fourier[2]), 1};
update_region(lower_fourier, upper_fourier, strides_fourier, pm->fourier_space_region);

//===============================================================================================
cudaStreamCreate(&pm->priv->stream);
Expand All @@ -187,8 +202,8 @@ petapm_init(PetaPM * pm, double BoxSize, double Asmth, int Nmesh, double G, MPI_
// C2R plans only support CUFFT_XT_FORMAT_DISTRIBUTED_OUTPUT ans always perform a CUFFT_INVERSE transform
// So, in both, the "input" box should be the real box and the "output" box should be the complex box

cufftXtSetDistribution(pm->priv->plan_forw, 3, box_real.lower, box_real.upper, box_complex.lower, box_complex.upper, box_real.strides, box_complex.strides);
cufftXtSetDistribution(pm->priv->plan_back, 3, box_complex.lower, box_complex.upper, box_real.lower, box_real.upper, box_complex.strides, box_real.strides);
cufftXtSetDistribution(pm->priv->plan_forw, 3, lower_real, upper_real, lower_fourier, upper_fourier, strides_real, strides_fourier);
cufftXtSetDistribution(pm->priv->plan_back, 3, lower_real, upper_real, lower_fourier, upper_fourier, strides_real, strides_fourier);

// Set the stream
cufftSetStream(pm->priv->plan_forw, pm->priv->stream);
Expand All @@ -198,7 +213,56 @@ petapm_init(PetaPM * pm, double BoxSize, double Asmth, int Nmesh, double G, MPI_
size_t workspace;
cufftMakePlan3d(pm->priv->plan_forw, Nmesh, Nmesh, Nmesh, CUFFT_R2C, &workspace);
cufftMakePlan3d(pm->priv->plan_back, Nmesh, Nmesh, Nmesh, CUFFT_C2R, &workspace);

// Allocate GPU memory, copy CPU data to GPU
// Data is initially distributed according to CUFFT_XT_FORMAT_DISTRIBUTED_INPUT, i.e., box_real
cudaLibXtDesc *desc;
cufftXtMalloc(pm->priv->plan_forw, &desc, CUFFT_XT_FORMAT_DISTRIBUTED_INPUT);
pm->priv->fftsize = desc->descriptor->size[0];
//===============================================================================================

message(1, "Task %d NGPUs=%d, pfftsize=%d \n", ThisTask, desc->descriptor->nGPUs, pm->priv->fftsize);
/* now lets fill up the mesh2task arrays */
#if 1
message(1, "Real Region %d lower=(%td %td %td) upper=(%td %td %td) strides=(%td %td %td)\n", ThisTask,
pm->real_space_region.offset[0],
pm->real_space_region.offset[1],
pm->real_space_region.offset[2],
pm->real_space_region.upper[0],
pm->real_space_region.upper[1],
pm->real_space_region.upper[2],
pm->real_space_region.strides[0],
pm->real_space_region.strides[1],
pm->real_space_region.strides[2]);
message(1, "Complex Region %d lower=(%td %td %td) upper=(%td %td %td) strides=(%td %td %td)\n", ThisTask,
pm->fourier_space_region.offset[0],
pm->fourier_space_region.offset[1],
pm->fourier_space_region.offset[2],
pm->fourier_space_region.upper[0],
pm->fourier_space_region.upper[1],
pm->fourier_space_region.upper[2],
pm->fourier_space_region.strides[0],
pm->fourier_space_region.strides[1],
pm->fourier_space_region.strides[2]);
#endif

int * tmp = (int *) mymalloc("tmp", sizeof(int) * Nmesh);
int k;
for(k = 0; k < 2; k ++) {
for(i = 0; i < Nmesh; i ++) {
tmp[i] = 0;
}
for(i = 0; i < pm->real_space_region.size[k]; i ++) {
tmp[i + pm->real_space_region.offset[k]] = pm->ThisTask2d[k];
}
/* which column / row hosts this tile? */
/* FIXME: this is very inefficient */
MPI_Allreduce(tmp, pm->Mesh2Task[k], Nmesh, MPI_INT, MPI_MAX, comm);
for(i = 0; i < Nmesh; i ++) {
message(0, "Mesh2Task[%d][%d] == %d\n", k, i, pm->Mesh2Task[k][i]);
}
}
myfree(tmp);
}

void
Expand Down Expand Up @@ -644,7 +708,7 @@ layout_build_and_exchange_cells_to_fft(
L->BufRecv, L->NcRecv, L->DcRecv, MPI_DOUBLE,
L->comm);

#if 0
#if 1
double massExport = 0;
for(i = 0; i < L->NcExport; i ++) {
massExport += L->BufSend[i];
Expand Down Expand Up @@ -951,3 +1015,4 @@ static int64_t reduce_int64(int64_t input, MPI_Comm comm) {
* iCFT(CFT) = 2pi
*
* **************************8*/

0 comments on commit f23d950

Please sign in to comment.