Check MPI error codes #2385

Merged · 18 commits · Oct 11, 2022
17 changes: 9 additions & 8 deletions README.md
@@ -17,8 +17,8 @@ DOLFINx is a new version of DOLFIN and is being actively developed.

Documentation can be viewed at:

- https://docs.fenicsproject.org/dolfinx/main/cpp/
- https://docs.fenicsproject.org/dolfinx/main/python/
- <https://docs.fenicsproject.org/dolfinx/main/cpp/>
- <https://docs.fenicsproject.org/dolfinx/main/python/>

## Installation

@@ -47,11 +47,12 @@ pip install .
(you may need to use ``pip3``, depending on your system).

For detailed instructions, see
https://docs.fenicsproject.org/dolfinx/main/python/installation.
<https://docs.fenicsproject.org/dolfinx/main/python/installation>.

### Binary

#### Operating System Recommendations

- Mac OS: use [conda](#conda).
- Linux: use [apt](#ubuntu-packages) ([Ubuntu](#ubuntu-packages)/[Debian](#debian-packages)), [docker](#docker-images) or [conda](#conda). See also [Spack](#spack).
- Windows: use [docker](#docker-images), or install Microsoft's [WSL2](https://docs.microsoft.com/en-us/windows/wsl/install) and use [Ubuntu](#ubuntu-packages).
@@ -170,7 +171,7 @@ docker run -ti dolfinx/dev-env:nightly
All Docker images support arm64 and amd64 architectures.

For a full list of tags, including versioned images, see
https://hub.docker.com/u/dolfinx
<https://hub.docker.com/u/dolfinx>

## Contributing

@@ -197,14 +198,14 @@ License along with DOLFINx. If not, see

For questions about using DOLFINx, visit the FEniCS Discourse page:

https://fenicsproject.discourse.group/
<https://fenicsproject.discourse.group/>

or use the FEniCS Slack channel:

https://fenicsproject.slack.com/
<https://fenicsproject.slack.com/>

(use https://fenicsproject-slack-invite.herokuapp.com/ to sign up)
(use <https://fenicsproject-slack-invite.herokuapp.com/> to sign up)

For bug reports visit:

https://github.com/FEniCS/dolfinx
<https://github.com/FEniCS/dolfinx>
206 changes: 125 additions & 81 deletions cpp/dolfinx/common/IndexMap.cpp

Large diffs are not rendered by default.

95 changes: 58 additions & 37 deletions cpp/dolfinx/common/MPI.cpp
@@ -15,11 +15,7 @@ dolfinx::MPI::Comm::Comm(MPI_Comm comm, bool duplicate)
if (duplicate and comm != MPI_COMM_NULL)
{
int err = MPI_Comm_dup(comm, &_comm);
if (err != MPI_SUCCESS)
{
throw std::runtime_error(
"Duplication of MPI communicator failed (MPI_Comm_dup)");
}
dolfinx::MPI::check_error(comm, err);
}
else
_comm = comm;
@@ -42,11 +38,7 @@ dolfinx::MPI::Comm::~Comm()
if (_comm != MPI_COMM_NULL)
{
int err = MPI_Comm_free(&_comm);
if (err != MPI_SUCCESS)
{
std::cout << "Error when destroying communicator (MPI_Comm_free)."
<< std::endl;
}
dolfinx::MPI::check_error(_comm, err);
}
}
//-----------------------------------------------------------------------------
@@ -57,11 +49,7 @@ dolfinx::MPI::Comm::operator=(dolfinx::MPI::Comm&& comm) noexcept
if (this->_comm != MPI_COMM_NULL)
{
int err = MPI_Comm_free(&this->_comm);
if (err != MPI_SUCCESS)
{
std::cout << "Error when destroying communicator (MPI_Comm_free)."
<< std::endl;
}
dolfinx::MPI::check_error(this->_comm, err);
}

// Move comm from other object
@@ -75,18 +63,35 @@ MPI_Comm dolfinx::MPI::Comm::comm() const noexcept { return _comm; }
int dolfinx::MPI::rank(const MPI_Comm comm)
{
int rank;
MPI_Comm_rank(comm, &rank);
int err = MPI_Comm_rank(comm, &rank);
dolfinx::MPI::check_error(comm, err);
return rank;
}
//-----------------------------------------------------------------------------
//-----------------------------------------------------------------------------
int dolfinx::MPI::size(const MPI_Comm comm)
{
int size;
MPI_Comm_size(comm, &size);
int err = MPI_Comm_size(comm, &size);
dolfinx::MPI::check_error(comm, err);
return size;
}
//-----------------------------------------------------------------------------
void dolfinx::MPI::check_error(MPI_Comm comm, int code)
{
if (code != MPI_SUCCESS)
{
int len = MPI_MAX_ERROR_STRING;
std::string error_string(MPI_MAX_ERROR_STRING, ' ');
MPI_Error_string(code, error_string.data(), &len);
error_string.resize(len);

std::cerr << error_string << std::endl;
MPI_Abort(comm, code);

std::abort();
}
}
//-----------------------------------------------------------------------------
std::vector<int>
dolfinx::MPI::compute_graph_edges_pcx(MPI_Comm comm,
const std::span<const int>& edges)
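
For context when reading the rest of this diff: the idiom introduced above captures every MPI return code and passes it to dolfinx::MPI::check_error, which prints the MPI error string and aborts. Below is a self-contained sketch of the same pattern outside DOLFINx; the demo_check_error helper and the MPI_ERRORS_RETURN opt-in are illustrative assumptions, not code from this PR (by default MPI's MPI_ERRORS_ARE_FATAL handler aborts before a nonzero code is ever returned).

```cpp
#include <cstdlib>
#include <iostream>
#include <string>
#include <mpi.h>

// Illustrative stand-in for dolfinx::MPI::check_error: print the MPI error
// string for a failed call and abort the job.
void demo_check_error(MPI_Comm comm, int code)
{
  if (code != MPI_SUCCESS)
  {
    int len = MPI_MAX_ERROR_STRING;
    std::string error_string(MPI_MAX_ERROR_STRING, ' ');
    MPI_Error_string(code, error_string.data(), &len);
    error_string.resize(len);
    std::cerr << error_string << std::endl;
    MPI_Abort(comm, code);
    std::abort();
  }
}

int main(int argc, char* argv[])
{
  MPI_Init(&argc, &argv);

  // Ask MPI to return error codes instead of aborting immediately, so the
  // check below can actually observe a failure (the default handler is
  // MPI_ERRORS_ARE_FATAL).
  MPI_Comm_set_errhandler(MPI_COMM_WORLD, MPI_ERRORS_RETURN);

  // The pattern applied throughout this PR: capture and check every return
  // code immediately after the call.
  int rank = -1;
  int err = MPI_Comm_rank(MPI_COMM_WORLD, &rank);
  demo_check_error(MPI_COMM_WORLD, err);

  std::cout << "Hello from rank " << rank << std::endl;
  MPI_Finalize();
  return 0;
}
```

Compile and run with an MPI toolchain, e.g. `mpicxx demo.cpp -o demo && mpirun -n 2 ./demo`.
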
@@ -107,34 +112,42 @@ dolfinx::MPI::compute_graph_edges_pcx(MPI_Comm comm,
std::vector<int> recvcounts(size, 1);
int in_edges = 0;
MPI_Request request_scatter;
MPI_Ireduce_scatter(edge_count_send.data(), &in_edges, recvcounts.data(),
MPI_INT, MPI_SUM, comm, &request_scatter);
int err = MPI_Ireduce_scatter(edge_count_send.data(), &in_edges,
recvcounts.data(), MPI_INT, MPI_SUM, comm,
&request_scatter);
dolfinx::MPI::check_error(comm, err);

std::vector<MPI_Request> send_requests(edges.size());
std::byte send_buffer;
for (std::size_t e = 0; e < edges.size(); ++e)
{
MPI_Isend(&send_buffer, 1, MPI_BYTE, edges[e],
static_cast<int>(tag::consensus_pcx), comm, &send_requests[e]);
int err = MPI_Isend(&send_buffer, 1, MPI_BYTE, edges[e],
static_cast<int>(tag::consensus_pcx), comm,
&send_requests[e]);
dolfinx::MPI::check_error(comm, err);
}

// Probe for incoming messages and store incoming rank
MPI_Wait(&request_scatter, MPI_STATUS_IGNORE);
err = MPI_Wait(&request_scatter, MPI_STATUS_IGNORE);
dolfinx::MPI::check_error(comm, err);
std::vector<int> other_ranks;
while (in_edges > 0)
{
// Check for message
int request_pending;
MPI_Status status;
MPI_Iprobe(MPI_ANY_SOURCE, static_cast<int>(tag::consensus_pcx), comm,
&request_pending, &status);
int err = MPI_Iprobe(MPI_ANY_SOURCE, static_cast<int>(tag::consensus_pcx),
comm, &request_pending, &status);
dolfinx::MPI::check_error(comm, err);
if (request_pending)
{
// Receive message and store rank
int other_rank = status.MPI_SOURCE;
std::byte buffer_recv;
MPI_Recv(&buffer_recv, 1, MPI_BYTE, other_rank,
static_cast<int>(tag::consensus_pcx), comm, MPI_STATUS_IGNORE);
int err = MPI_Recv(&buffer_recv, 1, MPI_BYTE, other_rank,
static_cast<int>(tag::consensus_pcx), comm,
MPI_STATUS_IGNORE);
dolfinx::MPI::check_error(comm, err);
other_ranks.push_back(other_rank);
--in_edges;
}
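
A compact, standalone sketch of the reduce-scatter step in the PCX variant above, for readers unfamiliar with the trick: each rank marks a 1 against every destination it sends to, and a reduce-scatter with per-rank block size 1 delivers to each rank the number of messages it should expect. The pcx_count_in_edges helper below is illustrative only (not DOLFINx code); it uses the blocking MPI_Reduce_scatter rather than the non-blocking MPI_Ireduce_scatter used above and omits error checking for brevity.

```cpp
#include <vector>
#include <mpi.h>

// Count how many ranks will send to this rank (its in-edges), given the
// ranks this rank sends to (its out-edges).
int pcx_count_in_edges(MPI_Comm comm, const std::vector<int>& out_edges)
{
  int size = 0;
  MPI_Comm_size(comm, &size);

  // Mark a 1 for every destination rank
  std::vector<int> edge_count_send(size, 0);
  for (int dest : out_edges)
    edge_count_send[dest] = 1;

  // Each rank receives a single int: the sum of the marks aimed at it
  std::vector<int> recvcounts(size, 1);
  int in_edges = 0;
  MPI_Reduce_scatter(edge_count_send.data(), &in_edges, recvcounts.data(),
                     MPI_INT, MPI_SUM, comm);
  return in_edges;
}
```
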
@@ -161,8 +174,10 @@ dolfinx::MPI::compute_graph_edges_nbx(MPI_Comm comm,
std::byte send_buffer;
for (std::size_t e = 0; e < edges.size(); ++e)
{
MPI_Issend(&send_buffer, 1, MPI_BYTE, edges[e],
static_cast<int>(tag::consensus_pex), comm, &send_requests[e]);
int err = MPI_Issend(&send_buffer, 1, MPI_BYTE, edges[e],
static_cast<int>(tag::consensus_pex), comm,
&send_requests[e]);
dolfinx::MPI::check_error(comm, err);
}

// Vector to hold ranks that send data to this rank
@@ -177,38 +192,44 @@ dolfinx::MPI::compute_graph_edges_nbx(MPI_Comm comm,
// Check for message
int request_pending;
MPI_Status status;
MPI_Iprobe(MPI_ANY_SOURCE, static_cast<int>(tag::consensus_pex), comm,
&request_pending, &status);
int err = MPI_Iprobe(MPI_ANY_SOURCE, static_cast<int>(tag::consensus_pex),
comm, &request_pending, &status);
dolfinx::MPI::check_error(comm, err);

// Check if message is waiting to be processed
if (request_pending)
{
// Receive it
int other_rank = status.MPI_SOURCE;
std::byte buffer_recv;
MPI_Recv(&buffer_recv, 1, MPI_BYTE, other_rank,
static_cast<int>(tag::consensus_pex), comm, MPI_STATUS_IGNORE);
int err = MPI_Recv(&buffer_recv, 1, MPI_BYTE, other_rank,
static_cast<int>(tag::consensus_pex), comm,
MPI_STATUS_IGNORE);
dolfinx::MPI::check_error(comm, err);
other_ranks.push_back(other_rank);
}

if (barrier_active)
{
// Check for barrier completion
int flag = 0;
MPI_Test(&barrier_request, &flag, MPI_STATUS_IGNORE);
int err = MPI_Test(&barrier_request, &flag, MPI_STATUS_IGNORE);
dolfinx::MPI::check_error(comm, err);
if (flag)
comm_complete = true;
}
else
{
// Check if all sends have completed
int flag = 0;
MPI_Testall(send_requests.size(), send_requests.data(), &flag,
MPI_STATUSES_IGNORE);
int err = MPI_Testall(send_requests.size(), send_requests.data(), &flag,
MPI_STATUSES_IGNORE);
dolfinx::MPI::check_error(comm, err);
if (flag)
{
// All sends have completed, start non-blocking barrier
MPI_Ibarrier(comm, &barrier_request);
int err = MPI_Ibarrier(comm, &barrier_request);
dolfinx::MPI::check_error(comm, err);
barrier_active = true;
}
}
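
For reference, the loop above implements the NBX (non-blocking consensus) algorithm: zero-byte synchronous sends to the known out-edges, probe-and-receive to discover unknown in-edges, and a non-blocking barrier once all local sends have completed. A minimal standalone sketch follows; the nbx_discover helper and its tag are illustrative assumptions, not DOLFINx code, and error checking is omitted for brevity.

```cpp
#include <cstddef>
#include <vector>
#include <mpi.h>

// Discover which ranks send to this rank, given the ranks it sends to.
std::vector<int> nbx_discover(MPI_Comm comm, const std::vector<int>& out_edges)
{
  const int tag = 999; // assumed tag for this sketch

  // Start a synchronous, zero-byte send to every known destination
  std::vector<MPI_Request> send_requests(out_edges.size());
  std::byte send_buffer{};
  for (std::size_t e = 0; e < out_edges.size(); ++e)
  {
    MPI_Issend(&send_buffer, 1, MPI_BYTE, out_edges[e], tag, comm,
               &send_requests[e]);
  }

  std::vector<int> in_edges;
  MPI_Request barrier_request;
  bool barrier_active = false;
  bool done = false;
  while (!done)
  {
    // Receive any incoming message and record its source rank
    int pending = 0;
    MPI_Status status;
    MPI_Iprobe(MPI_ANY_SOURCE, tag, comm, &pending, &status);
    if (pending)
    {
      std::byte recv_buffer{};
      MPI_Recv(&recv_buffer, 1, MPI_BYTE, status.MPI_SOURCE, tag, comm,
               MPI_STATUS_IGNORE);
      in_edges.push_back(status.MPI_SOURCE);
    }

    if (barrier_active)
    {
      // Barrier completion means every rank has finished sending
      int flag = 0;
      MPI_Test(&barrier_request, &flag, MPI_STATUS_IGNORE);
      done = (flag != 0);
    }
    else
    {
      // Once all local sends complete, enter the non-blocking barrier
      int flag = 0;
      MPI_Testall(static_cast<int>(send_requests.size()), send_requests.data(),
                  &flag, MPI_STATUSES_IGNORE);
      if (flag)
      {
        MPI_Ibarrier(comm, &barrier_request);
        barrier_active = true;
      }
    }
  }
  return in_edges;
}
```
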