Skip to content

Commit

Permalink
Rewrite error message when Pantsd is shut down during run (pantsbuild…
Browse files Browse the repository at this point in the history
…#12107)

Users don't know what Nailgun is, which is an implementation detail. We should mention Pantsd so that they have some context about what's going on, along with suggesting the possibility of OOM kills and a possible remedy.

Before:

> native_engine.NailgunClientException: Nailgun client error: "Nailgun client error: Client exited before the server\'s result could be returned."

After:

> native_engine.PantsdClientException: The pantsd process was killed during the run.
>
> If this was not intentionally done by you, Pants may have been killed by the operating system due to memory overconsumption (i.e. OOM-killed). You can set the global option `--pantsd-max-memory-usage` to reduce Pantsd's memory consumption by retaining less in its in-memory cache (run `./pants help-advanced global`). You can also disable pantsd with the global option `--pantsd` to avoid persisting memory across Pants runs, although you will miss out on additional caching.
>
> If neither of those help, please consider filing a GitHub issue or reaching out on Slack so that we can investigate the possible memory overconsumption (https://www.pantsbuild.org/docs/getting-help).

[ci skip-build-wheels]
  • Loading branch information
Eric-Arellano committed May 24, 2021
1 parent 1696576 commit 26c1f0e
Show file tree
Hide file tree
Showing 5 changed files with 42 additions and 28 deletions.
8 changes: 3 additions & 5 deletions src/python/pants/bin/remote_pants_runner.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@

from pants.base.exiter import ExitCode
from pants.engine.internals import native_engine
from pants.engine.internals.native_engine import NailgunConnectionException
from pants.engine.internals.native_engine import PantsdConnectionException
from pants.nailgun.nailgun_protocol import NailgunProtocol
from pants.option.global_options import GlobalOptions
from pants.option.options_bootstrapper import OptionsBootstrapper
Expand Down Expand Up @@ -132,10 +132,8 @@ def _connect_and_execute(self, pantsd_handle: PantsDaemonClient.Handle) -> ExitC
return native_engine.nailgun_client_create(executor, port).execute(
command, args, modified_env
)

# NailgunConnectionException represents a failure connecting to pantsd, so we retry
# up to the retry limit.
except NailgunConnectionException as e:
# Retry if we failed to connect to Pantsd.
except PantsdConnectionException as e:
if attempt > retries:
raise self.Fallback(e)

Expand Down
5 changes: 4 additions & 1 deletion src/python/pants/engine/internals/native_engine.pyi
Original file line number Diff line number Diff line change
Expand Up @@ -223,7 +223,10 @@ class PyStubCAS:
class PyStdioDestination:
pass

class NailgunConnectionException(Exception):
class PantsdConnectionException(Exception):
pass

class PantsdClientException(Exception):
pass

class PollTimeout(Exception):
Expand Down
9 changes: 7 additions & 2 deletions src/python/pants/option/global_options.py
Original file line number Diff line number Diff line change
Expand Up @@ -711,8 +711,13 @@ def register_bootstrap_options(cls, register):
type=int,
default=2 ** 30,
help=(
"The maximum memory usage of a pantsd process (in bytes). There is at most one "
"pantsd process per workspace."
"The maximum memory usage of the pantsd process (in bytes).\n\n"
"When the maximum memory is exceeded, the daemon will restart gracefully, "
"although all previous in-memory caching will be lost. Setting too low means that "
"you may miss out on some caching, whereas setting too high may over-consume "
"resources and may result in the operating system killing Pantsd due to memory "
"overconsumption (e.g. via the OOM killer).\n\n"
"There is at most one pantsd process per workspace."
),
)

Expand Down
29 changes: 20 additions & 9 deletions src/rust/engine/nailgun/src/client.rs
Original file line number Diff line number Diff line change
Expand Up @@ -143,10 +143,7 @@ pub async fn client_execute(
let socket = TcpStream::connect((Ipv4Addr::new(127, 0, 0, 1), port))
.await
.map_err(|err| {
NailgunClientError::PreConnect(format!(
"Nailgun client error connecting to localhost: {}",
err
))
NailgunClientError::PreConnect(format!("Failed to connect to localhost: {}", err))
})?;

let mut child = nails::client::handle_connection(config, socket, command, async {
Expand All @@ -155,7 +152,7 @@ pub async fn client_execute(
stdin_read
})
.await
.map_err(|err| NailgunClientError::PreConnect(format!("Failed to start remote task: {}", err)))?;
.map_err(|err| NailgunClientError::PreConnect(format!("Failed to start: {}", err)))?;

handle_client_output(
child.output_stream.take().unwrap(),
Expand All @@ -164,10 +161,24 @@ pub async fn client_execute(
)
.await?;

let exit_code: ExitCode = child
.wait()
.await
.map_err(|err| NailgunClientError::PostConnect(format!("Nailgun client error: {}", err)))?;
let exit_code: ExitCode = child.wait().await.map_err(|err| {
let err_str = match err.to_string().as_str() {
"Client exited before the server's result could be returned." => {
"The pantsd process was killed during the run.\n\nIf this was not intentionally done by you, \
Pants may have been killed by the operating system due to memory overconsumption \
(i.e. OOM-killed). You can set the global option `--pantsd-max-memory-usage` to reduce \
Pantsd's memory consumption by retaining less in its in-memory cache \
(run `./pants help-advanced global`). You can also disable pantsd with the global \
option `--pantsd` to avoid persisting memory across Pants runs, although you will miss \
out on additional caching.\n\nIf neither of those help, please consider filing a \
GitHub issue or reaching out on Slack so that we can investigate the possible memory \
overconsumption (https://www.pantsbuild.org/docs/getting-help)."
.to_owned()
}
_ => format!("Failed during execution: {}", err),
};
NailgunClientError::PostConnect(err_str)
})?;

Ok(exit_code.0)
}
19 changes: 8 additions & 11 deletions src/rust/engine/src/externs/interface.rs
Original file line number Diff line number Diff line change
Expand Up @@ -84,22 +84,22 @@ use crate::{
mod testutil;

py_exception!(native_engine, PollTimeout);
py_exception!(native_engine, NailgunConnectionException);
py_exception!(native_engine, NailgunClientException);
py_exception!(native_engine, PantsdConnectionException);
py_exception!(native_engine, PantsdClientException);

py_module_initializer!(native_engine, |py, m| {
m.add(py, "PollTimeout", py.get_type::<PollTimeout>())
.unwrap();

m.add(
py,
"NailgunClientException",
py.get_type::<NailgunClientException>(),
"PantsdClientException",
py.get_type::<PantsdClientException>(),
)?;
m.add(
py,
"NailgunConnectionException",
py.get_type::<NailgunConnectionException>(),
"PantsdConnectionException",
py.get_type::<PantsdConnectionException>(),
)?;

m.add(py, "default_cache_path", py_fn!(py, default_cache_path()))?;
Expand Down Expand Up @@ -699,11 +699,8 @@ py_class!(class PyNailgunClient |py| {
args,
env_list,
)).map(|code| code.to_py_object(py)).map_err(|e| match e{
NailgunClientError::PreConnect(err_str) => PyErr::new::<NailgunConnectionException, _>(py, (err_str,)),
NailgunClientError::PostConnect(s) => {
let err_str = format!("Nailgun client error: {:?}", s);
PyErr::new::<NailgunClientException, _>(py, (err_str,))
},
NailgunClientError::PreConnect(err_str) => PyErr::new::<PantsdConnectionException, _>(py, (err_str,)),
NailgunClientError::PostConnect(err_str) => PyErr::new::<PantsdClientException, _>(py, (err_str,)),
NailgunClientError::BrokenPipe => {
PyErr::new::<exc::BrokenPipeError, _>(py, NoArgs)
}
Expand Down

0 comments on commit 26c1f0e

Please sign in to comment.