Skip to content

Commit

Permalink
[7.4.0] Mark remote cache errors as retriable by expanding `--incompa…
Browse files Browse the repository at this point in the history
…tible_re… (#23426)

…mote_use_new_exit_code_for_lost_inputs`

If `--incompatible_remote_use_new_exit_code_for_lost_inputs` is set, or
if `--experimental_remote_cache_eviction_retries` is set, all remote
cache errors during cache lookup will be (eligible for) being retried by
restarting the build. This does not apply to remote cache errors during
remote execution, though.

Fixes #23033.

Closes #23079.

PiperOrigin-RevId: 667544073
Change-Id: Ia61c6e94b0d64842b930a7fbb62c68d6b86d7ce9

Commit
9392f8f

Co-authored-by: Cornelius Riemenschneider <[email protected]>
  • Loading branch information
iancha1992 and criemen authored Aug 27, 2024
1 parent d131e38 commit 5f5d70b
Show file tree
Hide file tree
Showing 6 changed files with 40 additions and 22 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -289,17 +289,26 @@ public ListenableFuture<Void> prefetchInputs()
Priority.MEDIUM),
BulkTransferException.class,
(BulkTransferException e) -> {
if (BulkTransferException.allCausedByCacheNotFoundException(e)) {
var code =
(executionOptions.useNewExitCodeForLostInputs
|| executionOptions.remoteRetryOnCacheEviction > 0)
? Code.REMOTE_CACHE_EVICTED
: Code.REMOTE_CACHE_FAILED;
if (executionOptions.useNewExitCodeForLostInputs
|| executionOptions.remoteRetryOnTransientCacheError > 0) {
var message =
BulkTransferException.allCausedByCacheNotFoundException(e)
? "Failed to fetch blobs because they do not exist remotely."
: "Failed to fetch blobs because of a remote cache error.";
throw new EnvironmentalExecException(
e,
FailureDetail.newBuilder()
.setMessage(message)
.setSpawn(
FailureDetails.Spawn.newBuilder().setCode(Code.REMOTE_CACHE_EVICTED))
.build());
} else if (BulkTransferException.allCausedByCacheNotFoundException(e)) {
throw new EnvironmentalExecException(
e,
FailureDetail.newBuilder()
.setMessage("Failed to fetch blobs because they do not exist remotely.")
.setSpawn(FailureDetails.Spawn.newBuilder().setCode(code))
.setSpawn(
FailureDetails.Spawn.newBuilder().setCode(Code.REMOTE_CACHE_FAILED))
.build());
} else {
throw e;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -511,24 +511,28 @@ public boolean usingLocalTestJobs() {
effectTags = {OptionEffectTag.UNKNOWN},
metadataTags = {OptionMetadataTag.INCOMPATIBLE_CHANGE},
help =
"If set to true, Bazel will use new exit code 39 instead of 34 if remote cache evicts"
+ " blobs during the build.")
"If set to true, Bazel will use new exit code 39 instead of 34 if remote cache"
+ "errors, including cache evictions, cause the build to fail.")
public boolean useNewExitCodeForLostInputs;

@Option(
// TODO: when this flag is moved to non-experimental, rename it to a more general name
// to reflect the new logic - it's not only about cache evictions.
name = "experimental_remote_cache_eviction_retries",
defaultValue = "0",
documentationCategory = OptionDocumentationCategory.REMOTE,
effectTags = {OptionEffectTag.EXECUTION},
help =
"The maximum number of attempts to retry if the build encountered remote cache eviction"
+ " error. A non-zero value will implicitly set"
"The maximum number of attempts to retry if the build encountered a transient remote"
+ " cache error that would otherwise fail the build. Applies for example when"
+ " artifacts are evicted from the remote cache, or in certain cache failure"
+ " conditions. A non-zero value will implicitly set"
+ " --incompatible_remote_use_new_exit_code_for_lost_inputs to true. A new invocation"
+ " id will be generated for each attempt. If you generate invocation id and provide"
+ " it to Bazel with --invocation_id, you should not use this flag. Instead, set flag"
+ " --incompatible_remote_use_new_exit_code_for_lost_inputs and check for the exit"
+ " code 39.")
public int remoteRetryOnCacheEviction;
public int remoteRetryOnTransientCacheError;

/** An enum for specifying different formats of test output. */
public enum TestOutputFormat {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -616,7 +616,7 @@ private SpawnResult handleError(
} else if (remoteCacheFailed) {
status = Status.REMOTE_CACHE_FAILED;
if (executionOptions.useNewExitCodeForLostInputs
|| executionOptions.remoteRetryOnCacheEviction > 0) {
|| executionOptions.remoteRetryOnTransientCacheError > 0) {
detailedCode = FailureDetails.Spawn.Code.REMOTE_CACHE_EVICTED;
} else {
detailedCode = FailureDetails.Spawn.Code.REMOTE_CACHE_FAILED;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -254,7 +254,7 @@ public BlazeCommandResult exec(
attemptedCommandIds,
lastResult);
break;
} catch (RemoteCacheEvictedException e) {
} catch (RemoteCacheTransientErrorException e) {
attemptedCommandIds.add(e.getCommandId());
lastResult = e.getResult();
}
Expand Down Expand Up @@ -308,7 +308,7 @@ private BlazeCommandResult execExclusively(
List<Any> commandExtensions,
Set<UUID> attemptedCommandIds,
@Nullable BlazeCommandResult lastResult)
throws RemoteCacheEvictedException {
throws RemoteCacheTransientErrorException {
// Record the start time for the profiler. Do not put anything before this!
long execStartTimeNanos = runtime.getClock().nanoTime();

Expand Down Expand Up @@ -344,7 +344,7 @@ private BlazeCommandResult execExclusively(
env.getCommandId()));
return Preconditions.checkNotNull(lastResult);
} else {
outErr.printErrLn("Found remote cache eviction error, retrying the build...");
outErr.printErrLn("Found transient remote cache error, retrying the build...");
}
}

Expand Down Expand Up @@ -689,13 +689,13 @@ private BlazeCommandResult execExclusively(
if (newResult.getExitCode().equals(ExitCode.REMOTE_CACHE_EVICTED)) {
var executionOptions =
Preconditions.checkNotNull(options.getOptions(ExecutionOptions.class));
if (attemptedCommandIds.size() < executionOptions.remoteRetryOnCacheEviction) {
throw new RemoteCacheEvictedException(env.getCommandId(), newResult);
if (attemptedCommandIds.size() < executionOptions.remoteRetryOnTransientCacheError) {
throw new RemoteCacheTransientErrorException(env.getCommandId(), newResult);
}
}

return newResult;
} catch (RemoteCacheEvictedException e) {
} catch (RemoteCacheTransientErrorException e) {
throw e;
} catch (Throwable e) {
logger.atSevere().withCause(e).log("Shutting down due to exception");
Expand Down Expand Up @@ -730,11 +730,11 @@ private BlazeCommandResult execExclusively(
}
}

private static class RemoteCacheEvictedException extends IOException {
private static class RemoteCacheTransientErrorException extends IOException {
private final UUID commandId;
private final BlazeCommandResult result;

private RemoteCacheEvictedException(UUID commandId, BlazeCommandResult result) {
private RemoteCacheTransientErrorException(UUID commandId, BlazeCommandResult result) {
this.commandId = commandId;
this.result = result;
}
Expand Down
5 changes: 5 additions & 0 deletions src/main/protobuf/failure_details.proto
Original file line number Diff line number Diff line change
Expand Up @@ -226,6 +226,11 @@ message Spawn {
// refactored to prohibit undetailed failures
UNSPECIFIED_EXECUTION_FAILURE = 12 [(metadata) = { exit_code: 1 }];
FORBIDDEN_INPUT = 13 [(metadata) = { exit_code: 1 }];
// This also includes other remote cache errors, not just evictions,
// if --incompatible_remote_use_new_exit_code_for_lost_inputs is set.
// TODO: Rename it to a more general name when
// --experimental_remote_cache_eviction_retries is moved to
// non-experimental.
REMOTE_CACHE_EVICTED = 14 [(metadata) = { exit_code: 39 }];
SPAWN_LOG_IO_EXCEPTION = 15 [(metadata) = { exit_code: 36 }];
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -2022,7 +2022,7 @@ EOF
//a:bar >& $TEST_log || fail "Failed to build"

expect_log 'Failed to fetch blobs because they do not exist remotely.'
expect_log "Found remote cache eviction error, retrying the build..."
expect_log "Found transient remote cache error, retrying the build..."

local invocation_ids=$(grep "Invocation ID:" $TEST_log)
local first_id=$(echo "$invocation_ids" | head -n 1)
Expand Down

0 comments on commit 5f5d70b

Please sign in to comment.