Skip to content

Commit

Permalink
Make HTTP cache works with cache eviction.
Browse files Browse the repository at this point in the history
Previously, it's possible for an HTTP cache to delete CAS entries referenced by AC without deleting the AC itself because HTTP cache doesn't understand the relationship between AC and CAS. This could result in permanent build errors because Bazel always trust the AC from remote cache assuming all referenced CAS entries exist.

Now, we record the digest of lost blobs before build rewinding, so that during the next build, we can ignore the stale AC and continue with execution.

Fixes #18696.

RELNOTES: Added support for using a remote cache that evicts blobs and doesn't have AC integrity check (e.g. HTTP cache).
PiperOrigin-RevId: 672536163
Change-Id: Ic1271431d28333f6d86e5963542d15a133075157
  • Loading branch information
coeuvre authored and iancha1992 committed Sep 9, 2024
1 parent 846802a commit ee47066
Show file tree
Hide file tree
Showing 14 changed files with 220 additions and 38 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -536,8 +536,9 @@ private Completable downloadFileNoCheckRx(
})
.doOnError(
error -> {
if (error instanceof CacheNotFoundException) {
reporter.post(new LostInputsEvent());
if (error instanceof CacheNotFoundException cacheNotFoundException) {
reporter.post(
new LostInputsEvent(cacheNotFoundException.getMissingDigest()));
}
}));

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@
import static com.google.common.base.Preconditions.checkNotNull;
import static com.google.common.util.concurrent.MoreExecutors.directExecutor;

import build.bazel.remote.execution.v2.Digest;
import com.google.common.base.Preconditions;
import com.google.common.util.concurrent.ListeningScheduledExecutorService;
import com.google.devtools.build.lib.exec.ExecutionOptions;
Expand All @@ -33,6 +34,7 @@
import com.google.devtools.build.lib.runtime.CommandEnvironment;
import com.google.devtools.build.lib.vfs.OutputService;
import com.google.devtools.build.lib.vfs.Path;
import java.util.Set;
import java.util.concurrent.Executor;
import javax.annotation.Nullable;

Expand All @@ -51,6 +53,7 @@ final class RemoteActionContextProvider {
@Nullable private RemoteActionInputFetcher actionInputFetcher;
@Nullable private final RemoteOutputChecker remoteOutputChecker;
@Nullable private final OutputService outputService;
private final Set<Digest> knownMissingCasDigests;

private RemoteActionContextProvider(
Executor executor,
Expand All @@ -61,7 +64,8 @@ private RemoteActionContextProvider(
DigestUtil digestUtil,
@Nullable Path logDir,
@Nullable RemoteOutputChecker remoteOutputChecker,
@Nullable OutputService outputService) {
@Nullable OutputService outputService,
Set<Digest> knownMissingCasDigests) {
this.executor = executor;
this.env = Preconditions.checkNotNull(env, "env");
this.remoteCache = remoteCache;
Expand All @@ -71,12 +75,14 @@ private RemoteActionContextProvider(
this.logDir = logDir;
this.remoteOutputChecker = remoteOutputChecker;
this.outputService = outputService;
this.knownMissingCasDigests = knownMissingCasDigests;
}

public static RemoteActionContextProvider createForPlaceholder(
CommandEnvironment env,
ListeningScheduledExecutorService retryScheduler,
DigestUtil digestUtil) {
DigestUtil digestUtil,
Set<Digest> knownMissingCasDigests) {
return new RemoteActionContextProvider(
directExecutor(),
env,
Expand All @@ -86,7 +92,8 @@ public static RemoteActionContextProvider createForPlaceholder(
digestUtil,
/* logDir= */ null,
/* remoteOutputChecker= */ null,
/* outputService= */ null);
/* outputService= */ null,
knownMissingCasDigests);
}

public static RemoteActionContextProvider createForRemoteCaching(
Expand All @@ -96,7 +103,8 @@ public static RemoteActionContextProvider createForRemoteCaching(
ListeningScheduledExecutorService retryScheduler,
DigestUtil digestUtil,
@Nullable RemoteOutputChecker remoteOutputChecker,
OutputService outputService) {
OutputService outputService,
Set<Digest> knownMissingCasDigests) {
return new RemoteActionContextProvider(
executor,
env,
Expand All @@ -106,7 +114,8 @@ public static RemoteActionContextProvider createForRemoteCaching(
digestUtil,
/* logDir= */ null,
remoteOutputChecker,
checkNotNull(outputService));
checkNotNull(outputService),
knownMissingCasDigests);
}

public static RemoteActionContextProvider createForRemoteExecution(
Expand All @@ -118,7 +127,8 @@ public static RemoteActionContextProvider createForRemoteExecution(
DigestUtil digestUtil,
Path logDir,
@Nullable RemoteOutputChecker remoteOutputChecker,
OutputService outputService) {
OutputService outputService,
Set<Digest> knownMissingCasDigests) {
return new RemoteActionContextProvider(
executor,
env,
Expand All @@ -128,7 +138,8 @@ public static RemoteActionContextProvider createForRemoteExecution(
digestUtil,
logDir,
remoteOutputChecker,
checkNotNull(outputService));
checkNotNull(outputService),
knownMissingCasDigests);
}

private RemotePathResolver createRemotePathResolver() {
Expand Down Expand Up @@ -180,7 +191,8 @@ private RemoteExecutionService getRemoteExecutionService() {
tempPathGenerator,
captureCorruptedOutputsDir,
remoteOutputChecker,
outputService);
outputService,
knownMissingCasDigests);
env.getEventBus().register(remoteExecutionService);
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -191,7 +191,7 @@ private ListenableFuture<Void> uploadBlob(
// If we get here, the remote input was determined to exist in the remote or disk cache at
// some point before action execution, but reported to be missing when querying the remote
// for missing action inputs; possibly because it was evicted in the interim.
reporter.post(new LostInputsEvent());
reporter.post(new LostInputsEvent(digest));
throw new CacheNotFoundException(digest, path.getPathString());
}
} catch (IOException e) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -74,6 +74,7 @@
import com.google.devtools.build.lib.actions.SpawnResult;
import com.google.devtools.build.lib.actions.Spawns;
import com.google.devtools.build.lib.analysis.platform.PlatformUtils;
import com.google.devtools.build.lib.buildtool.buildevent.BuildCompleteEvent;
import com.google.devtools.build.lib.buildtool.buildevent.BuildInterruptedEvent;
import com.google.devtools.build.lib.events.Event;
import com.google.devtools.build.lib.events.Reporter;
Expand All @@ -88,6 +89,7 @@
import com.google.devtools.build.lib.remote.RemoteExecutionService.ActionResultMetadata.SymlinkMetadata;
import com.google.devtools.build.lib.remote.Scrubber.SpawnScrubber;
import com.google.devtools.build.lib.remote.common.BulkTransferException;
import com.google.devtools.build.lib.remote.common.LostInputsEvent;
import com.google.devtools.build.lib.remote.common.OperationObserver;
import com.google.devtools.build.lib.remote.common.OutputDigestMismatchException;
import com.google.devtools.build.lib.remote.common.ProgressStatusListener;
Expand Down Expand Up @@ -189,6 +191,7 @@ public class RemoteExecutionService {
private final OutputService outputService;

@Nullable private final Scrubber scrubber;
private final Set<Digest> knownMissingCasDigests;

public RemoteExecutionService(
Executor executor,
Expand All @@ -205,7 +208,8 @@ public RemoteExecutionService(
TempPathGenerator tempPathGenerator,
@Nullable Path captureCorruptedOutputsDir,
@Nullable RemoteOutputChecker remoteOutputChecker,
OutputService outputService) {
OutputService outputService,
Set<Digest> knownMissingCasDigests) {
this.reporter = reporter;
this.verboseFailures = verboseFailures;
this.execRoot = execRoot;
Expand All @@ -231,6 +235,7 @@ public RemoteExecutionService(
this.scheduler = Schedulers.from(executor, /* interruptibleWorker= */ true);
this.remoteOutputChecker = remoteOutputChecker;
this.outputService = outputService;
this.knownMissingCasDigests = knownMissingCasDigests;
}

private Command buildCommand(
Expand Down Expand Up @@ -648,6 +653,7 @@ public static class RemoteActionResult {
private final ActionResult actionResult;
@Nullable private final ExecuteResponse executeResponse;
@Nullable private final String cacheName;
@Nullable private ActionResultMetadata metadata;

/** Creates a new {@link RemoteActionResult} instance from a cached result. */
public static RemoteActionResult createFromCache(CachedActionResult cachedActionResult) {
Expand Down Expand Up @@ -688,8 +694,20 @@ public List<OutputDirectory> getOutputDirectories() {
return actionResult.getOutputDirectoriesList();
}

public int getOutputDirectoriesCount() {
return actionResult.getOutputDirectoriesCount();
public ActionResultMetadata getOrParseActionResultMetadata(
RemoteCache remoteCache,
DigestUtil digestUtil,
RemoteActionExecutionContext context,
RemotePathResolver remotePathResolver)
throws IOException, InterruptedException {
if (metadata == null) {
try (SilentCloseable c = Profiler.instance().profile("Remote.parseActionResultMetadata")) {
metadata =
parseActionResultMetadata(
remoteCache, digestUtil, context, actionResult, remotePathResolver);
}
}
return metadata;
}

public List<OutputSymlink> getOutputDirectorySymlinks() {
Expand Down Expand Up @@ -783,7 +801,53 @@ public RemoteActionResult lookupCache(RemoteAction action)
return null;
}

return RemoteActionResult.createFromCache(cachedActionResult);
var result = RemoteActionResult.createFromCache(cachedActionResult);

// We only add digests to `knownMissingCasDigests` when LostInputsEvent occurs which will cause
// the build to abort and rewind, so there is no data race here. This allows us to avoid the
// check until cache eviction happens.
if (!knownMissingCasDigests.isEmpty()) {
var metadata =
result.getOrParseActionResultMetadata(
remoteCache,
digestUtil,
action.getRemoteActionExecutionContext(),
action.getRemotePathResolver());

// If we already know digests referenced by this AC is missing from remote cache, ignore it so
// that we can fall back to execution. This could happen when the remote cache is an HTTP
// cache, or doesn't implement AC integrity check.
//
// See https://github.com/bazelbuild/bazel/issues/18696.
if (updateKnownMissingCasDigests(knownMissingCasDigests, metadata)) {
return null;
}
}

return result;
}

/**
* Removes digests referenced by {@code metadata} from {@code knownMissingCasDigests} and returns
* whether any were removed
*/
private static boolean updateKnownMissingCasDigests(
Set<Digest> knownMissingCasDigests, ActionResultMetadata metadata) {
// Using `remove` below because we assume the missing blob will be uploaded afterwards.
var result = false;
for (var file : metadata.files()) {
if (knownMissingCasDigests.remove(file.digest())) {
result = true;
}
}
for (var entry : metadata.directories()) {
for (var file : entry.getValue().files()) {
if (knownMissingCasDigests.remove(file.digest())) {
result = true;
}
}
}
return result;
}

private ListenableFuture<FileMetadata> downloadFile(
Expand Down Expand Up @@ -995,7 +1059,7 @@ public Collection<SymlinkMetadata> symlinks() {
}
}

private DirectoryMetadata parseDirectory(
private static DirectoryMetadata parseDirectory(
Path parent, Directory dir, Map<Digest, Directory> childDirectoriesMap) {
ImmutableList.Builder<FileMetadata> filesBuilder = ImmutableList.builder();
for (FileNode file : dir.getFilesList()) {
Expand Down Expand Up @@ -1023,16 +1087,18 @@ private DirectoryMetadata parseDirectory(
return new DirectoryMetadata(filesBuilder.build(), symlinksBuilder.build());
}

ActionResultMetadata parseActionResultMetadata(
static ActionResultMetadata parseActionResultMetadata(
RemoteCache remoteCache,
DigestUtil digestUtil,
RemoteActionExecutionContext context,
RemoteActionResult result,
ActionResult result,
RemotePathResolver remotePathResolver)
throws IOException, InterruptedException {
checkNotNull(remoteCache, "remoteCache can't be null");

Map<Path, ListenableFuture<Tree>> dirMetadataDownloads =
Maps.newHashMapWithExpectedSize(result.getOutputDirectoriesCount());
for (OutputDirectory dir : result.getOutputDirectories()) {
for (OutputDirectory dir : result.getOutputDirectoriesList()) {
var outputPath = encodeBytestringUtf8(dir.getPath());
dirMetadataDownloads.put(
remotePathResolver.outputPathToLocalPath(outputPath),
Expand All @@ -1059,7 +1125,7 @@ ActionResultMetadata parseActionResultMetadata(
}

ImmutableMap.Builder<Path, FileMetadata> files = ImmutableMap.builder();
for (OutputFile outputFile : result.getOutputFiles()) {
for (OutputFile outputFile : result.getOutputFilesList()) {
Path localPath =
remotePathResolver.outputPathToLocalPath(encodeBytestringUtf8(outputFile.getPath()));
files.put(
Expand All @@ -1070,9 +1136,9 @@ ActionResultMetadata parseActionResultMetadata(
var symlinkMap = new HashMap<Path, SymlinkMetadata>();
var outputSymlinks =
Iterables.concat(
result.getOutputFileSymlinks(),
result.getOutputDirectorySymlinks(),
result.getOutputSymlinks());
result.getOutputFileSymlinksList(),
result.getOutputDirectorySymlinksList(),
result.getOutputSymlinksList());
for (var symlink : outputSymlinks) {
var localPath =
remotePathResolver.outputPathToLocalPath(encodeBytestringUtf8(symlink.getPath()));
Expand Down Expand Up @@ -1132,10 +1198,9 @@ public InMemoryOutput downloadOutputs(RemoteAction action, RemoteActionResult re
context = context.withReadCachePolicy(context.getReadCachePolicy().addRemoteCache());
}

ActionResultMetadata metadata;
try (SilentCloseable c = Profiler.instance().profile("Remote.parseActionResultMetadata")) {
metadata = parseActionResultMetadata(context, result, action.getRemotePathResolver());
}
ActionResultMetadata metadata =
result.getOrParseActionResultMetadata(
remoteCache, digestUtil, context, action.getRemotePathResolver());

// The expiration time for remote cache entries.
var expireAtEpochMilli = Instant.now().plus(remoteOptions.remoteCacheTtl).toEpochMilli();
Expand Down Expand Up @@ -1205,6 +1270,7 @@ public InMemoryOutput downloadOutputs(RemoteAction action, RemoteActionResult re
if (realToTmpPath.containsKey(file.path)) {
continue;
}

if (shouldDownload(result, file.path.relativeTo(execRoot))) {
Path tmpPath = tempPathGenerator.generateTempPath();
realToTmpPath.put(file.path, tmpPath);
Expand Down Expand Up @@ -1308,6 +1374,12 @@ public InMemoryOutput downloadOutputs(RemoteAction action, RemoteActionResult re
}
}

if (result.executeResponse != null && !knownMissingCasDigests.isEmpty()) {
// A succeeded execution uploads outputs to CAS. Refresh our knowledge about missing
// digests.
var unused = updateKnownMissingCasDigests(knownMissingCasDigests, metadata);
}

// When downloading outputs from just remotely executed action, the action result comes from
// Execution response which means, if disk cache is enabled, action result hasn't been
// uploaded to it. Upload action result to disk cache here so next build could hit it.
Expand Down Expand Up @@ -1591,6 +1663,20 @@ public void onBuildInterrupted(BuildInterruptedEvent event) {
buildInterrupted.set(true);
}

@Subscribe
public void onBuildComplete(BuildCompleteEvent event) {
if (event.getResult().getSuccess()) {
// If build succeeded, clear knownMissingCasDigests in case there are missing digests from
// other targets from previous builds which are not relevant anymore.
knownMissingCasDigests.clear();
}
}

@Subscribe
public void onLostInputs(LostInputsEvent event) {
knownMissingCasDigests.add(event.getMissingDigest());
}

/**
* Shuts the service down. Wait for active network I/O to finish but new requests are rejected.
*/
Expand Down Expand Up @@ -1625,7 +1711,6 @@ public void shutdown() {
}

void report(Event evt) {

synchronized (this) {
if (reportedErrors.contains(evt.getMessage())) {
return;
Expand Down
Loading

0 comments on commit ee47066

Please sign in to comment.