
Commit

Tabs --> Spaces
AndiH committed Jun 1, 2022
1 parent 4a83027 commit 16505f7
Showing 6 changed files with 59 additions and 59 deletions.
08-H_NCCL_NVSHMEM/.master/NCCL/jacobi.cpp (20 changes: 10 additions & 10 deletions)
@@ -293,13 +293,13 @@ int main(int argc, char* argv[]) {
NCCL_CALL(ncclGroupEnd());
CUDA_RT_CALL(cudaStreamSynchronize(compute_stream));
#else
- MPI_CALL(MPI_Sendrecv(a_new + iy_start * nx, nx, MPI_REAL_TYPE, top, 0,
+ MPI_CALL(MPI_Sendrecv(a_new + iy_start * nx, nx, MPI_REAL_TYPE, top, 0,
a_new + (iy_end * nx), nx, MPI_REAL_TYPE, bottom, 0, MPI_COMM_WORLD,
MPI_STATUS_IGNORE));
MPI_CALL(MPI_Sendrecv(a_new + (iy_end - 1) * nx, nx, MPI_REAL_TYPE, bottom, 0, a_new, nx,
- MPI_REAL_TYPE, top, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE));
+ MPI_REAL_TYPE, top, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE));
#endif
- std::swap(a_new, a);
+ std::swap(a_new, a);
}
POP_RANGE

@@ -326,7 +326,7 @@ int main(int argc, char* argv[]) {
CUDA_RT_CALL(cudaStreamWaitEvent(push_stream, reset_l2norm_done, 0));
calculate_norm = (iter % nccheck) == 0 || (!csv && (iter % 100) == 0);

- launch_jacobi_kernel(a_new, a, l2_norm_d, (iy_start + 1), (iy_end - 1), nx, calculate_norm,
+ launch_jacobi_kernel(a_new, a, l2_norm_d, (iy_start + 1), (iy_end - 1), nx, calculate_norm,
compute_stream);

launch_jacobi_kernel(a_new, a, l2_norm_d, iy_start, (iy_start + 1), nx, calculate_norm,
@@ -346,7 +346,7 @@ int main(int argc, char* argv[]) {
const int bottom = (rank + 1) % size;

// Apply periodic boundary conditions
- //TODO: Modify the lable for the RANGE, and replace MPI_Sendrecv with ncclSend and ncclRecv calls
+ //TODO: Modify the lable for the RANGE, and replace MPI_Sendrecv with ncclSend and ncclRecv calls
// using the nccl communicator and push_stream.
// Remember to use ncclGroupStart() and ncclGroupEnd()
#ifdef SOLUTION
@@ -358,14 +358,14 @@ int main(int argc, char* argv[]) {
NCCL_CALL(ncclSend(a_new + iy_start * nx, nx, NCCL_REAL_TYPE, top, nccl_comm, push_stream));
NCCL_CALL(ncclGroupEnd());
#else
- PUSH_RANGE("MPI", 5)
+ PUSH_RANGE("MPI", 5)
MPI_CALL(MPI_Sendrecv(a_new + iy_start * nx, nx, MPI_REAL_TYPE, top, 0,
a_new + (iy_end * nx), nx, MPI_REAL_TYPE, bottom, 0, MPI_COMM_WORLD,
MPI_STATUS_IGNORE));
MPI_CALL(MPI_Sendrecv(a_new + (iy_end - 1) * nx, nx, MPI_REAL_TYPE, bottom, 0, a_new, nx,
MPI_REAL_TYPE, top, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE));
#endif
- CUDA_RT_CALL(cudaEventRecord(push_done, push_stream));
+ CUDA_RT_CALL(cudaEventRecord(push_done, push_stream));
POP_RANGE

if (calculate_norm) {
@@ -410,13 +410,13 @@ int main(int argc, char* argv[]) {

if (rank == 0 && result_correct) {
if (csv) {
- //TODO: Dont forget to change your output lable from mpi_overlap to nccl_overlap
+ //TODO: Dont forget to change your output lable from mpi_overlap to nccl_overlap
#ifdef SOLUTION
printf("nccl_overlap, %d, %d, %d, %d, %d, 1, %f, %f\n", nx, ny, iter_max, nccheck, size,
#else
- printf("mpi_overlap, %d, %d, %d, %d, %d, 1, %f, %f\n", nx, ny, iter_max, nccheck, size,
+ printf("mpi_overlap, %d, %d, %d, %d, %d, 1, %f, %f\n", nx, ny, iter_max, nccheck, size,
#endif
- (stop - start), runtime_serial);
+ (stop - start), runtime_serial);
} else {
printf("Num GPUs: %d.\n", size);
printf(
08-H_NCCL_NVSHMEM/.master/NVSHMEM/jacobi.cu (24 changes: 12 additions & 12 deletions)
@@ -341,25 +341,25 @@ int main(int argc, char* argv[]) {
CUDA_RT_CALL(cudaStreamWaitEvent(push_stream, reset_l2norm_done, 0));
calculate_norm = (iter % nccheck) == 0 || (!csv && (iter % 100) == 0);

- launch_jacobi_kernel(a_new, a, l2_norm_d, (iy_start + 1), (iy_end - 1), nx, calculate_norm, compute_stream);
+ launch_jacobi_kernel(a_new, a, l2_norm_d, (iy_start + 1), (iy_end - 1), nx, calculate_norm, compute_stream);

- launch_jacobi_kernel(a_new, a, l2_norm_d, iy_start, (iy_start + 1), nx, calculate_norm, push_stream);
- launch_jacobi_kernel(a_new, a, l2_norm_d, (iy_end - 1), iy_end, nx, calculate_norm, push_stream);
+ launch_jacobi_kernel(a_new, a, l2_norm_d, iy_start, (iy_start + 1), nx, calculate_norm, push_stream);
+ launch_jacobi_kernel(a_new, a, l2_norm_d, (iy_end - 1), iy_end, nx, calculate_norm, push_stream);

- CUDA_RT_CALL(cudaEventRecord(push_prep_done, push_stream));
+ CUDA_RT_CALL(cudaEventRecord(push_prep_done, push_stream));

if (calculate_norm) {
- CUDA_RT_CALL(cudaStreamWaitEvent(compute_stream, push_prep_done, 0));
+ CUDA_RT_CALL(cudaStreamWaitEvent(compute_stream, push_prep_done, 0));
CUDA_RT_CALL(cudaMemcpyAsync(l2_norm_h, l2_norm_d, sizeof(real), cudaMemcpyDeviceToHost,
compute_stream));
}

- //TODO: Replace MPI communication with Host initiated NVSHMEM calls
+ //TODO: Replace MPI communication with Host initiated NVSHMEM calls
// Apply periodic boundary conditions
#ifdef SOLUTION
- PUSH_RANGE("NVSHMEM", 5)
- nvshmemx_float_put_on_stream(a_new + iy_top_lower_boundary_idx * nx, a_new + iy_start * nx, nx, top, push_stream);
+ PUSH_RANGE("NVSHMEM", 5)
+ nvshmemx_float_put_on_stream(a_new + iy_top_lower_boundary_idx * nx, a_new + iy_start * nx, nx, top, push_stream);
nvshmemx_float_put_on_stream(a_new + iy_bottom_upper_boundary_idx * nx, a_new + (iy_end - 1) * nx, nx, bottom, push_stream);
#else
PUSH_RANGE("MPI", 5)
@@ -369,12 +369,12 @@ int main(int argc, char* argv[]) {
MPI_CALL(MPI_Sendrecv(a_new + (iy_end - 1) * nx, nx, MPI_REAL_TYPE, bottom, 0, a_new, nx,
MPI_REAL_TYPE, top, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE));
#endif
- CUDA_RT_CALL(cudaEventRecord(push_done, push_stream));
+ CUDA_RT_CALL(cudaEventRecord(push_done, push_stream));
POP_RANGE

CUDA_RT_CALL(cudaStreamWaitEvent(compute_stream, push_done, 0));

- //TODO: add necessary inter PE synchronization using the nvshmemx_barrier_all_on_stream(...)
+ //TODO: add necessary inter PE synchronization using the nvshmemx_barrier_all_on_stream(...)
#ifdef SOLUTION
nvshmemx_barrier_all_on_stream(compute_stream);
#endif
@@ -421,7 +421,7 @@ int main(int argc, char* argv[]) {
if (csv) {
//TODO: Replace MPI with NVSHMEM for your output
#ifdef SOLUTION
- printf("nvshmem, %d, %d, %d, %d, %d, 1, %f, %f\n", nx, ny, iter_max, nccheck, size,
+ printf("nvshmem, %d, %d, %d, %d, %d, 1, %f, %f\n", nx, ny, iter_max, nccheck, size,
#else
printf("mpi, %d, %d, %d, %d, %d, 1, %f, %f\n", nx, ny, iter_max, nccheck, size,
#endif
08-H_NCCL_NVSHMEM/solutions/NCCL/jacobi.cpp (12 changes: 6 additions & 6 deletions)
@@ -281,7 +281,7 @@ int main(int argc, char* argv[]) {
NCCL_CALL(ncclSend(a_new + iy_start * nx, nx, NCCL_REAL_TYPE, top, nccl_comm, compute_stream));
NCCL_CALL(ncclGroupEnd());
CUDA_RT_CALL(cudaStreamSynchronize(compute_stream));
- std::swap(a_new, a);
+ std::swap(a_new, a);
}
POP_RANGE

@@ -308,7 +308,7 @@ int main(int argc, char* argv[]) {
CUDA_RT_CALL(cudaStreamWaitEvent(push_stream, reset_l2norm_done, 0));
calculate_norm = (iter % nccheck) == 0 || (!csv && (iter % 100) == 0);

- launch_jacobi_kernel(a_new, a, l2_norm_d, (iy_start + 1), (iy_end - 1), nx, calculate_norm,
+ launch_jacobi_kernel(a_new, a, l2_norm_d, (iy_start + 1), (iy_end - 1), nx, calculate_norm,
compute_stream);

launch_jacobi_kernel(a_new, a, l2_norm_d, iy_start, (iy_start + 1), nx, calculate_norm,
@@ -328,7 +328,7 @@ int main(int argc, char* argv[]) {
const int bottom = (rank + 1) % size;

// Apply periodic boundary conditions
- //TODO: Modify the lable for the RANGE, and replace MPI_Sendrecv with ncclSend and ncclRecv calls
+ //TODO: Modify the lable for the RANGE, and replace MPI_Sendrecv with ncclSend and ncclRecv calls
// using the nccl communicator and push_stream.
// Remember to use ncclGroupStart() and ncclGroupEnd()
PUSH_RANGE("NCCL_LAUNCH", 5)
@@ -338,7 +338,7 @@ int main(int argc, char* argv[]) {
NCCL_CALL(ncclRecv(a_new + (iy_end * nx), nx, NCCL_REAL_TYPE, bottom, nccl_comm, push_stream));
NCCL_CALL(ncclSend(a_new + iy_start * nx, nx, NCCL_REAL_TYPE, top, nccl_comm, push_stream));
NCCL_CALL(ncclGroupEnd());
- CUDA_RT_CALL(cudaEventRecord(push_done, push_stream));
+ CUDA_RT_CALL(cudaEventRecord(push_done, push_stream));
POP_RANGE

if (calculate_norm) {
@@ -383,9 +383,9 @@ int main(int argc, char* argv[]) {

if (rank == 0 && result_correct) {
if (csv) {
- //TODO: Dont forget to change your output lable from mpi_overlap to nccl_overlap
+ //TODO: Dont forget to change your output lable from mpi_overlap to nccl_overlap
printf("nccl_overlap, %d, %d, %d, %d, %d, 1, %f, %f\n", nx, ny, iter_max, nccheck, size,
- (stop - start), runtime_serial);
+ (stop - start), runtime_serial);
} else {
printf("Num GPUs: %d.\n", size);
printf(
08-H_NCCL_NVSHMEM/solutions/NVSHMEM/jacobi.cu (24 changes: 12 additions & 12 deletions)
@@ -328,31 +328,31 @@ int main(int argc, char* argv[]) {
CUDA_RT_CALL(cudaStreamWaitEvent(push_stream, reset_l2norm_done, 0));
calculate_norm = (iter % nccheck) == 0 || (!csv && (iter % 100) == 0);

- launch_jacobi_kernel(a_new, a, l2_norm_d, (iy_start + 1), (iy_end - 1), nx, calculate_norm, compute_stream);
+ launch_jacobi_kernel(a_new, a, l2_norm_d, (iy_start + 1), (iy_end - 1), nx, calculate_norm, compute_stream);

- launch_jacobi_kernel(a_new, a, l2_norm_d, iy_start, (iy_start + 1), nx, calculate_norm, push_stream);
- launch_jacobi_kernel(a_new, a, l2_norm_d, (iy_end - 1), iy_end, nx, calculate_norm, push_stream);
+ launch_jacobi_kernel(a_new, a, l2_norm_d, iy_start, (iy_start + 1), nx, calculate_norm, push_stream);
+ launch_jacobi_kernel(a_new, a, l2_norm_d, (iy_end - 1), iy_end, nx, calculate_norm, push_stream);

- CUDA_RT_CALL(cudaEventRecord(push_prep_done, push_stream));
+ CUDA_RT_CALL(cudaEventRecord(push_prep_done, push_stream));

if (calculate_norm) {
- CUDA_RT_CALL(cudaStreamWaitEvent(compute_stream, push_prep_done, 0));
+ CUDA_RT_CALL(cudaStreamWaitEvent(compute_stream, push_prep_done, 0));
CUDA_RT_CALL(cudaMemcpyAsync(l2_norm_h, l2_norm_d, sizeof(real), cudaMemcpyDeviceToHost,
compute_stream));
}

- //TODO: Replace MPI communication with Host initiated NVSHMEM calls
+ //TODO: Replace MPI communication with Host initiated NVSHMEM calls
// Apply periodic boundary conditions
- PUSH_RANGE("NVSHMEM", 5)
- nvshmemx_float_put_on_stream(a_new + iy_top_lower_boundary_idx * nx, a_new + iy_start * nx, nx, top, push_stream);
+ PUSH_RANGE("NVSHMEM", 5)
+ nvshmemx_float_put_on_stream(a_new + iy_top_lower_boundary_idx * nx, a_new + iy_start * nx, nx, top, push_stream);
nvshmemx_float_put_on_stream(a_new + iy_bottom_upper_boundary_idx * nx, a_new + (iy_end - 1) * nx, nx, bottom, push_stream);
- CUDA_RT_CALL(cudaEventRecord(push_done, push_stream));
+ CUDA_RT_CALL(cudaEventRecord(push_done, push_stream));
POP_RANGE

CUDA_RT_CALL(cudaStreamWaitEvent(compute_stream, push_done, 0));

- //TODO: add necessary inter PE synchronization using the nvshmemx_barrier_all_on_stream(...)
+ //TODO: add necessary inter PE synchronization using the nvshmemx_barrier_all_on_stream(...)
nvshmemx_barrier_all_on_stream(compute_stream);

if (calculate_norm) {
@@ -396,7 +396,7 @@ int main(int argc, char* argv[]) {
if (rank == 0 && result_correct) {
if (csv) {
//TODO: Replace MPI with NVSHMEM for your output
- printf("nvshmem, %d, %d, %d, %d, %d, 1, %f, %f\n", nx, ny, iter_max, nccheck, size,
+ printf("nvshmem, %d, %d, %d, %d, %d, 1, %f, %f\n", nx, ny, iter_max, nccheck, size,
(stop - start), runtime_serial);
} else {
printf("Num GPUs: %d.\n", size);
08-H_NCCL_NVSHMEM/tasks/NCCL/jacobi.cpp (20 changes: 10 additions & 10 deletions)
@@ -260,12 +260,12 @@ int main(int argc, char* argv[]) {
// on the compute_stream.
// Remeber that a group of ncclRecv and ncclSend should be within a ncclGroupStart() and ncclGroupEnd()
// Also, Rember to stream synchronize on the compute_stream at the end
- MPI_CALL(MPI_Sendrecv(a_new + iy_start * nx, nx, MPI_REAL_TYPE, top, 0,
+ MPI_CALL(MPI_Sendrecv(a_new + iy_start * nx, nx, MPI_REAL_TYPE, top, 0,
a_new + (iy_end * nx), nx, MPI_REAL_TYPE, bottom, 0, MPI_COMM_WORLD,
MPI_STATUS_IGNORE));
MPI_CALL(MPI_Sendrecv(a_new + (iy_end - 1) * nx, nx, MPI_REAL_TYPE, bottom, 0, a_new, nx,
- MPI_REAL_TYPE, top, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE));
- std::swap(a_new, a);
+ MPI_REAL_TYPE, top, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE));
+ std::swap(a_new, a);
}
POP_RANGE

@@ -292,7 +292,7 @@ int main(int argc, char* argv[]) {
CUDA_RT_CALL(cudaStreamWaitEvent(push_stream, reset_l2norm_done, 0));
calculate_norm = (iter % nccheck) == 0 || (!csv && (iter % 100) == 0);

- launch_jacobi_kernel(a_new, a, l2_norm_d, (iy_start + 1), (iy_end - 1), nx, calculate_norm,
+ launch_jacobi_kernel(a_new, a, l2_norm_d, (iy_start + 1), (iy_end - 1), nx, calculate_norm,
compute_stream);

launch_jacobi_kernel(a_new, a, l2_norm_d, iy_start, (iy_start + 1), nx, calculate_norm,
@@ -312,16 +312,16 @@ int main(int argc, char* argv[]) {
const int bottom = (rank + 1) % size;

// Apply periodic boundary conditions
- //TODO: Modify the lable for the RANGE, and replace MPI_Sendrecv with ncclSend and ncclRecv calls
+ //TODO: Modify the lable for the RANGE, and replace MPI_Sendrecv with ncclSend and ncclRecv calls
// using the nccl communicator and push_stream.
// Remember to use ncclGroupStart() and ncclGroupEnd()
- PUSH_RANGE("MPI", 5)
+ PUSH_RANGE("MPI", 5)
MPI_CALL(MPI_Sendrecv(a_new + iy_start * nx, nx, MPI_REAL_TYPE, top, 0,
a_new + (iy_end * nx), nx, MPI_REAL_TYPE, bottom, 0, MPI_COMM_WORLD,
MPI_STATUS_IGNORE));
MPI_CALL(MPI_Sendrecv(a_new + (iy_end - 1) * nx, nx, MPI_REAL_TYPE, bottom, 0, a_new, nx,
MPI_REAL_TYPE, top, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE));
- CUDA_RT_CALL(cudaEventRecord(push_done, push_stream));
+ CUDA_RT_CALL(cudaEventRecord(push_done, push_stream));
POP_RANGE

if (calculate_norm) {
@@ -366,9 +366,9 @@ int main(int argc, char* argv[]) {

if (rank == 0 && result_correct) {
if (csv) {
- //TODO: Dont forget to change your output lable from mpi_overlap to nccl_overlap
- printf("mpi_overlap, %d, %d, %d, %d, %d, 1, %f, %f\n", nx, ny, iter_max, nccheck, size,
- (stop - start), runtime_serial);
+ //TODO: Dont forget to change your output lable from mpi_overlap to nccl_overlap
+ printf("mpi_overlap, %d, %d, %d, %d, %d, 1, %f, %f\n", nx, ny, iter_max, nccheck, size,
+ (stop - start), runtime_serial);
} else {
printf("Num GPUs: %d.\n", size);
printf(
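For orientation, the TODOs in tasks/NCCL/jacobi.cpp ask for the same halo exchange already shown in the solutions/NCCL/jacobi.cpp hunks above. A minimal sketch of the push_stream variant, mirroring the MPI_Sendrecv pattern and assuming the nccl_comm communicator, the NCCL_REAL_TYPE/NCCL_CALL/CUDA_RT_CALL macros, and the push_done event defined elsewhere in that file:

PUSH_RANGE("NCCL_LAUNCH", 5)
// Group the four point-to-point calls so NCCL can match them across ranks.
NCCL_CALL(ncclGroupStart());
// Receive the top halo row (row 0) from the top neighbor and send our last interior row down.
NCCL_CALL(ncclRecv(a_new, nx, NCCL_REAL_TYPE, top, nccl_comm, push_stream));
NCCL_CALL(ncclSend(a_new + (iy_end - 1) * nx, nx, NCCL_REAL_TYPE, bottom, nccl_comm, push_stream));
// Receive the bottom halo row (row iy_end) from the bottom neighbor and send our first interior row up.
NCCL_CALL(ncclRecv(a_new + (iy_end * nx), nx, NCCL_REAL_TYPE, bottom, nccl_comm, push_stream));
NCCL_CALL(ncclSend(a_new + iy_start * nx, nx, NCCL_REAL_TYPE, top, nccl_comm, push_stream));
NCCL_CALL(ncclGroupEnd());
CUDA_RT_CALL(cudaEventRecord(push_done, push_stream));
POP_RANGE

The earlier TODO (the periodic-boundary warm-up loop) follows the same pattern with compute_stream in place of push_stream, followed by cudaStreamSynchronize(compute_stream) before the std::swap(a_new, a), as the solutions hunk at @@ -281,7 shows.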
08-H_NCCL_NVSHMEM/tasks/NVSHMEM/jacobi.cu (18 changes: 9 additions & 9 deletions)
@@ -319,34 +319,34 @@ int main(int argc, char* argv[]) {
CUDA_RT_CALL(cudaStreamWaitEvent(push_stream, reset_l2norm_done, 0));
calculate_norm = (iter % nccheck) == 0 || (!csv && (iter % 100) == 0);

- launch_jacobi_kernel(a_new, a, l2_norm_d, (iy_start + 1), (iy_end - 1), nx, calculate_norm, compute_stream);
+ launch_jacobi_kernel(a_new, a, l2_norm_d, (iy_start + 1), (iy_end - 1), nx, calculate_norm, compute_stream);

- launch_jacobi_kernel(a_new, a, l2_norm_d, iy_start, (iy_start + 1), nx, calculate_norm, push_stream);
- launch_jacobi_kernel(a_new, a, l2_norm_d, (iy_end - 1), iy_end, nx, calculate_norm, push_stream);
+ launch_jacobi_kernel(a_new, a, l2_norm_d, iy_start, (iy_start + 1), nx, calculate_norm, push_stream);
+ launch_jacobi_kernel(a_new, a, l2_norm_d, (iy_end - 1), iy_end, nx, calculate_norm, push_stream);

- CUDA_RT_CALL(cudaEventRecord(push_prep_done, push_stream));
+ CUDA_RT_CALL(cudaEventRecord(push_prep_done, push_stream));

if (calculate_norm) {
- CUDA_RT_CALL(cudaStreamWaitEvent(compute_stream, push_prep_done, 0));
+ CUDA_RT_CALL(cudaStreamWaitEvent(compute_stream, push_prep_done, 0));
CUDA_RT_CALL(cudaMemcpyAsync(l2_norm_h, l2_norm_d, sizeof(real), cudaMemcpyDeviceToHost,
compute_stream));
}

- //TODO: Replace MPI communication with Host initiated NVSHMEM calls
+ //TODO: Replace MPI communication with Host initiated NVSHMEM calls
// Apply periodic boundary conditions
PUSH_RANGE("MPI", 5)
MPI_CALL(MPI_Sendrecv(a_new + iy_start * nx, nx, MPI_REAL_TYPE, top, 0,
a_new + (iy_end * nx), nx, MPI_REAL_TYPE, bottom, 0, MPI_COMM_WORLD,
MPI_STATUS_IGNORE));
MPI_CALL(MPI_Sendrecv(a_new + (iy_end - 1) * nx, nx, MPI_REAL_TYPE, bottom, 0, a_new, nx,
MPI_REAL_TYPE, top, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE));
- CUDA_RT_CALL(cudaEventRecord(push_done, push_stream));
+ CUDA_RT_CALL(cudaEventRecord(push_done, push_stream));
POP_RANGE

CUDA_RT_CALL(cudaStreamWaitEvent(compute_stream, push_done, 0));

- //TODO: add necessary inter PE synchronization using the nvshmemx_barrier_all_on_stream(...)
+ //TODO: add necessary inter PE synchronization using the nvshmemx_barrier_all_on_stream(...)

if (calculate_norm) {
CUDA_RT_CALL(cudaStreamSynchronize(compute_stream));
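The TODOs in tasks/NVSHMEM/jacobi.cu map onto the host-initiated NVSHMEM calls shown in the solutions/NVSHMEM/jacobi.cu hunks above. A minimal sketch, assuming the iy_top_lower_boundary_idx and iy_bottom_upper_boundary_idx indices and the push_done event defined elsewhere in that file:

PUSH_RANGE("NVSHMEM", 5)
// Write our first interior row into the top neighbor's boundary row on its copy of a_new ...
nvshmemx_float_put_on_stream(a_new + iy_top_lower_boundary_idx * nx, a_new + iy_start * nx, nx, top, push_stream);
// ... and our last interior row into the bottom neighbor's boundary row.
nvshmemx_float_put_on_stream(a_new + iy_bottom_upper_boundary_idx * nx, a_new + (iy_end - 1) * nx, nx, bottom, push_stream);
CUDA_RT_CALL(cudaEventRecord(push_done, push_stream));
POP_RANGE

CUDA_RT_CALL(cudaStreamWaitEvent(compute_stream, push_done, 0));
// Make sure every PE has finished its puts before the boundary rows are read in the next iteration.
nvshmemx_barrier_all_on_stream(compute_stream);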
