From 5b555410164bbc17ad39455242bdaa540aabf180 Mon Sep 17 00:00:00 2001 From: Derrick Stolee Date: Fri, 7 Jun 2024 12:18:50 -0400 Subject: [PATCH 1/6] backfill: add builtin boilerplate In anticipation of implementing 'git backfill', populate the necessary files with the boilerplate of a new builtin. RFC TODO: When preparing this for a full implementation, make sure it is based on the newest standards introduced by [1]. [1] https://lore.kernel.org/git/xmqqjzfq2f0f.fsf@gitster.g/T/#m606036ea2e75a6d6819d6b5c90e729643b0ff7f7 [PATCH 1/3] builtin: add a repository parameter for builtin functions Signed-off-by: Derrick Stolee --- .gitignore | 1 + Documentation/git-backfill.txt | 23 +++++++++++++++++++++++ Makefile | 1 + builtin.h | 1 + builtin/backfill.c | 29 +++++++++++++++++++++++++++++ command-list.txt | 1 + git.c | 1 + meson.build | 1 + 8 files changed, 58 insertions(+) create mode 100644 Documentation/git-backfill.txt create mode 100644 builtin/backfill.c diff --git a/.gitignore b/.gitignore index e82aa19df03fc8..95cd94c504460c 100644 --- a/.gitignore +++ b/.gitignore @@ -19,6 +19,7 @@ /git-apply /git-archimport /git-archive +/git-backfill /git-bisect /git-blame /git-branch diff --git a/Documentation/git-backfill.txt b/Documentation/git-backfill.txt new file mode 100644 index 00000000000000..640144187d33b7 --- /dev/null +++ b/Documentation/git-backfill.txt @@ -0,0 +1,23 @@ +git-backfill(1) +=============== + +NAME +---- +git-backfill - Download missing objects in a partial clone + + +SYNOPSIS +-------- +[verse] +'git backfill' [] + +DESCRIPTION +----------- + +SEE ALSO +-------- +linkgit:git-clone[1]. + +GIT +--- +Part of the linkgit:git[1] suite diff --git a/Makefile b/Makefile index d6d4945188b521..eb94cda0f8fb2b 100644 --- a/Makefile +++ b/Makefile @@ -1208,6 +1208,7 @@ BUILTIN_OBJS += builtin/am.o BUILTIN_OBJS += builtin/annotate.o BUILTIN_OBJS += builtin/apply.o BUILTIN_OBJS += builtin/archive.o +BUILTIN_OBJS += builtin/backfill.o BUILTIN_OBJS += builtin/bisect.o BUILTIN_OBJS += builtin/blame.o BUILTIN_OBJS += builtin/branch.o diff --git a/builtin.h b/builtin.h index f7b166b33484d3..89928ccf92f532 100644 --- a/builtin.h +++ b/builtin.h @@ -120,6 +120,7 @@ int cmd_am(int argc, const char **argv, const char *prefix, struct repository *r int cmd_annotate(int argc, const char **argv, const char *prefix, struct repository *repo); int cmd_apply(int argc, const char **argv, const char *prefix, struct repository *repo); int cmd_archive(int argc, const char **argv, const char *prefix, struct repository *repo); +int cmd_backfill(int argc, const char **argv, const char *prefix, struct repository *repo); int cmd_bisect(int argc, const char **argv, const char *prefix, struct repository *repo); int cmd_blame(int argc, const char **argv, const char *prefix, struct repository *repo); int cmd_branch(int argc, const char **argv, const char *prefix, struct repository *repo); diff --git a/builtin/backfill.c b/builtin/backfill.c new file mode 100644 index 00000000000000..38e6aaeaa030ec --- /dev/null +++ b/builtin/backfill.c @@ -0,0 +1,29 @@ +#include "builtin.h" +#include "config.h" +#include "parse-options.h" +#include "repository.h" +#include "object.h" + +static const char * const builtin_backfill_usage[] = { + N_("git backfill []"), + NULL +}; + +int cmd_backfill(int argc, const char **argv, const char *prefix, struct repository *repo) +{ + struct option options[] = { + OPT_END(), + }; + + if (argc == 2 && !strcmp(argv[1], "-h")) + usage_with_options(builtin_backfill_usage, options); + + argc = parse_options(argc, argv, prefix, options, builtin_backfill_usage, + 0); + + repo_config(repo, git_default_config, NULL); + + die(_("not implemented")); + + return 0; +} diff --git a/command-list.txt b/command-list.txt index e0bb87b3b5c278..c537114b4687b8 100644 --- a/command-list.txt +++ b/command-list.txt @@ -60,6 +60,7 @@ git-annotate ancillaryinterrogators git-apply plumbingmanipulators complete git-archimport foreignscminterface git-archive mainporcelain +git-backfill mainporcelain history git-bisect mainporcelain info git-blame ancillaryinterrogators complete git-branch mainporcelain history diff --git a/git.c b/git.c index 71d644dc1c5990..f78484e395d8fc 100644 --- a/git.c +++ b/git.c @@ -506,6 +506,7 @@ static struct cmd_struct commands[] = { { "annotate", cmd_annotate, RUN_SETUP }, { "apply", cmd_apply, RUN_SETUP_GENTLY }, { "archive", cmd_archive, RUN_SETUP_GENTLY }, + { "backfill", cmd_backfill, RUN_SETUP }, { "bisect", cmd_bisect, RUN_SETUP }, { "blame", cmd_blame, RUN_SETUP }, { "branch", cmd_branch, RUN_SETUP | DELAY_PAGER_CONFIG }, diff --git a/meson.build b/meson.build index 5cfe0045fb0d14..842bcedd38af35 100644 --- a/meson.build +++ b/meson.build @@ -487,6 +487,7 @@ builtin_sources = [ 'builtin/annotate.c', 'builtin/apply.c', 'builtin/archive.c', + 'builtin/backfill.c', 'builtin/bisect.c', 'builtin/blame.c', 'builtin/branch.c', From 948399a63f9cacca87946c1fe90c15a389db2a8f Mon Sep 17 00:00:00 2001 From: Derrick Stolee Date: Sun, 1 Sep 2024 12:07:10 -0400 Subject: [PATCH 2/6] backfill: basic functionality and tests The default behavior of 'git backfill' is to fetch all missing blobs that are reachable from HEAD. Document and test this behavior. The implementation is a very simple use of the path-walk API, initializing the revision walk at HEAD to start the path-walk from all commits reachable from HEAD. Ignore the object arrays that correspond to tree entries, assuming that they are all present already. Signed-off-by: Derrick Stolee --- Documentation/git-backfill.txt | 24 +++++ Documentation/technical/api-path-walk.txt | 1 + builtin/backfill.c | 105 +++++++++++++++++++++- t/meson.build | 1 + t/t5620-backfill.sh | 94 +++++++++++++++++++ 5 files changed, 222 insertions(+), 3 deletions(-) create mode 100755 t/t5620-backfill.sh diff --git a/Documentation/git-backfill.txt b/Documentation/git-backfill.txt index 640144187d33b7..0e10f066fef56f 100644 --- a/Documentation/git-backfill.txt +++ b/Documentation/git-backfill.txt @@ -14,6 +14,30 @@ SYNOPSIS DESCRIPTION ----------- +Blobless partial clones are created using `git clone --filter=blob:none` +and then configure the local repository such that the Git client avoids +downloading blob objects unless they are required for a local operation. +This initially means that the clone and later fetches download reachable +commits and trees but no blobs. Later operations that change the `HEAD` +pointer, such as `git checkout` or `git merge`, may need to download +missing blobs in order to complete their operation. + +In the worst cases, commands that compute blob diffs, such as `git blame`, +become very slow as they download the missing blobs in single-blob +requests to satisfy the missing object as the Git command needs it. This +leads to multiple download requests and no ability for the Git server to +provide delta compression across those objects. + +The `git backfill` command provides a way for the user to request that +Git downloads the missing blobs (with optional filters) such that the +missing blobs representing historical versions of files can be downloaded +in batches. The `backfill` command attempts to optimize the request by +grouping blobs that appear at the same path, hopefully leading to good +delta compression in the packfile sent by the server. + +By default, `git backfill` downloads all blobs reachable from the `HEAD` +commit. This set can be restricted or expanded using various options. + SEE ALSO -------- linkgit:git-clone[1]. diff --git a/Documentation/technical/api-path-walk.txt b/Documentation/technical/api-path-walk.txt index 2d25281774d43d..a371b9e6e67b84 100644 --- a/Documentation/technical/api-path-walk.txt +++ b/Documentation/technical/api-path-walk.txt @@ -70,4 +70,5 @@ Examples See example usages in: `t/helper/test-path-walk.c`, + `builtin/backfill.c`, `builtin/pack-objects.c` diff --git a/builtin/backfill.c b/builtin/backfill.c index 38e6aaeaa030ec..2c13478a25a95f 100644 --- a/builtin/backfill.c +++ b/builtin/backfill.c @@ -1,16 +1,117 @@ #include "builtin.h" +#include "git-compat-util.h" #include "config.h" #include "parse-options.h" #include "repository.h" +#include "commit.h" +#include "hex.h" +#include "tree.h" +#include "tree-walk.h" #include "object.h" +#include "object-store-ll.h" +#include "oid-array.h" +#include "oidset.h" +#include "promisor-remote.h" +#include "strmap.h" +#include "string-list.h" +#include "revision.h" +#include "trace2.h" +#include "progress.h" +#include "packfile.h" +#include "path-walk.h" static const char * const builtin_backfill_usage[] = { N_("git backfill []"), NULL }; +struct backfill_context { + struct repository *repo; + struct oid_array current_batch; + size_t batch_size; +}; + +static void clear_backfill_context(struct backfill_context *ctx) +{ + oid_array_clear(&ctx->current_batch); +} + +static void download_batch(struct backfill_context *ctx) +{ + promisor_remote_get_direct(ctx->repo, + ctx->current_batch.oid, + ctx->current_batch.nr); + oid_array_clear(&ctx->current_batch); + + /* + * We likely have a new packfile. Add it to the packed list to + * avoid possible duplicate downloads of the same objects. + */ + reprepare_packed_git(ctx->repo); +} + +static int fill_missing_blobs(const char *path UNUSED, + struct oid_array *list, + enum object_type type, + void *data) +{ + struct backfill_context *ctx = data; + + if (type != OBJ_BLOB) + return 0; + + for (size_t i = 0; i < list->nr; i++) { + off_t size = 0; + struct object_info info = OBJECT_INFO_INIT; + info.disk_sizep = &size; + if (oid_object_info_extended(ctx->repo, + &list->oid[i], + &info, + OBJECT_INFO_FOR_PREFETCH) || + !size) + oid_array_append(&ctx->current_batch, &list->oid[i]); + } + + if (ctx->current_batch.nr >= ctx->batch_size) + download_batch(ctx); + + return 0; +} + +static int do_backfill(struct backfill_context *ctx) +{ + struct rev_info revs; + struct path_walk_info info = PATH_WALK_INFO_INIT; + int ret; + + repo_init_revisions(ctx->repo, &revs, ""); + handle_revision_arg("HEAD", &revs, 0, 0); + + info.blobs = 1; + info.tags = info.commits = info.trees = 0; + + info.revs = &revs; + info.path_fn = fill_missing_blobs; + info.path_fn_data = ctx; + + ret = walk_objects_by_path(&info); + + /* Download the objects that did not fill a batch. */ + if (!ret) + download_batch(ctx); + + clear_backfill_context(ctx); + release_revisions(&revs); + return ret; +} + int cmd_backfill(int argc, const char **argv, const char *prefix, struct repository *repo) { + struct backfill_context ctx = { + .repo = repo, + .current_batch = OID_ARRAY_INIT, + .batch_size = 50000, + }; struct option options[] = { OPT_END(), }; @@ -23,7 +124,5 @@ int cmd_backfill(int argc, const char **argv, const char *prefix, struct reposit repo_config(repo, git_default_config, NULL); - die(_("not implemented")); - - return 0; + return do_backfill(&ctx); } diff --git a/t/meson.build b/t/meson.build index ea89d347f44e5d..8685e9633258fc 100644 --- a/t/meson.build +++ b/t/meson.build @@ -721,6 +721,7 @@ integration_tests = [ 't5617-clone-submodules-remote.sh', 't5618-alternate-refs.sh', 't5619-clone-local-ambiguous-transport.sh', + 't5620-backfill.sh', 't5700-protocol-v1.sh', 't5701-git-serve.sh', 't5702-protocol-v2.sh', diff --git a/t/t5620-backfill.sh b/t/t5620-backfill.sh new file mode 100755 index 00000000000000..64326362d80f8f --- /dev/null +++ b/t/t5620-backfill.sh @@ -0,0 +1,94 @@ +#!/bin/sh + +test_description='git backfill on partial clones' + +GIT_TEST_DEFAULT_INITIAL_BRANCH_NAME=main +export GIT_TEST_DEFAULT_INITIAL_BRANCH_NAME + +. ./test-lib.sh + +# We create objects in the 'src' repo. +test_expect_success 'setup repo for object creation' ' + echo "{print \$1}" >print_1.awk && + echo "{print \$2}" >print_2.awk && + + git init src && + + mkdir -p src/a/b/c && + mkdir -p src/d/e && + + for i in 1 2 + do + for n in 1 2 3 4 + do + echo "Version $i of file $n" > src/file.$n.txt && + echo "Version $i of file a/$n" > src/a/file.$n.txt && + echo "Version $i of file a/b/$n" > src/a/b/file.$n.txt && + echo "Version $i of file a/b/c/$n" > src/a/b/c/file.$n.txt && + echo "Version $i of file d/$n" > src/d/file.$n.txt && + echo "Version $i of file d/e/$n" > src/d/e/file.$n.txt && + git -C src add . && + git -C src commit -m "Iteration $n" || return 1 + done + done +' + +# Clone 'src' into 'srv.bare' so we have a bare repo to be our origin +# server for the partial clone. +test_expect_success 'setup bare clone for server' ' + git clone --bare "file://$(pwd)/src" srv.bare && + git -C srv.bare config --local uploadpack.allowfilter 1 && + git -C srv.bare config --local uploadpack.allowanysha1inwant 1 +' + +# do basic partial clone from "srv.bare" +test_expect_success 'do partial clone 1, backfill gets all objects' ' + git clone --no-checkout --filter=blob:none \ + --single-branch --branch=main \ + "file://$(pwd)/srv.bare" backfill1 && + + # Backfill with no options gets everything reachable from HEAD. + GIT_TRACE2_EVENT="$(pwd)/backfill-file-trace" git \ + -C backfill1 backfill && + + # We should have engaged the partial clone machinery + test_trace2_data promisor fetch_count 48 revs2 && + test_line_count = 0 revs2 +' + +. "$TEST_DIRECTORY"/lib-httpd.sh +start_httpd + +test_expect_success 'create a partial clone over HTTP' ' + SERVER="$HTTPD_DOCUMENT_ROOT_PATH/server" && + rm -rf "$SERVER" repo && + git clone --bare "file://$(pwd)/src" "$SERVER" && + test_config -C "$SERVER" uploadpack.allowfilter 1 && + test_config -C "$SERVER" uploadpack.allowanysha1inwant 1 && + + git clone --no-checkout --filter=blob:none \ + "$HTTPD_URL/smart/server" backfill-http +' + +test_expect_success 'backfilling over HTTP succeeds' ' + GIT_TRACE2_EVENT="$(pwd)/backfill-http-trace" git \ + -C backfill-http backfill && + + # We should have engaged the partial clone machinery + test_trace2_data promisor fetch_count 48 rev-list-out && + awk "{print \$1;}" oids && + GIT_TRACE2_EVENT="$(pwd)/walk-trace" git -C backfill-http \ + cat-file --batch-check batch-out && + ! grep missing batch-out +' + +# DO NOT add non-httpd-specific tests here, because the last part of this +# test script is only executed when httpd is available and enabled. + +test_done From 30f3c5e833c833b0e7f1c100b9b59dc1c0d58f7a Mon Sep 17 00:00:00 2001 From: Derrick Stolee Date: Sun, 1 Sep 2024 12:22:10 -0400 Subject: [PATCH 3/6] backfill: add --batch-size= option Users may want to specify a minimum batch size for their needs. This is only a minimum: the path-walk API provides a list of OIDs that correspond to the same path, and thus it is optimal to allow delta compression across those objects in a single server request. We could consider limiting the request to have a maximum batch size in the future. Signed-off-by: Derrick Stolee --- Documentation/git-backfill.txt | 10 +++++++++- builtin/backfill.c | 4 +++- t/t5620-backfill.sh | 18 ++++++++++++++++++ 3 files changed, 30 insertions(+), 2 deletions(-) diff --git a/Documentation/git-backfill.txt b/Documentation/git-backfill.txt index 0e10f066fef56f..9b0bae04e9d8f3 100644 --- a/Documentation/git-backfill.txt +++ b/Documentation/git-backfill.txt @@ -9,7 +9,7 @@ git-backfill - Download missing objects in a partial clone SYNOPSIS -------- [verse] -'git backfill' [] +'git backfill' [--batch-size=] DESCRIPTION ----------- @@ -38,6 +38,14 @@ delta compression in the packfile sent by the server. By default, `git backfill` downloads all blobs reachable from the `HEAD` commit. This set can be restricted or expanded using various options. +OPTIONS +------- + +--batch-size=:: + Specify a minimum size for a batch of missing objects to request + from the server. This size may be exceeded by the last set of + blobs seen at a given path. Default batch size is 16,000. + SEE ALSO -------- linkgit:git-clone[1]. diff --git a/builtin/backfill.c b/builtin/backfill.c index 2c13478a25a95f..ba8580e4b5e966 100644 --- a/builtin/backfill.c +++ b/builtin/backfill.c @@ -21,7 +21,7 @@ #include "path-walk.h" static const char * const builtin_backfill_usage[] = { - N_("git backfill []"), + N_("git backfill [--batch-size=]"), NULL }; @@ -113,6 +113,8 @@ int cmd_backfill(int argc, const char **argv, const char *prefix, struct reposit .batch_size = 50000, }; struct option options[] = { + OPT_INTEGER(0, "batch-size", &ctx.batch_size, + N_("Minimun number of objects to request at a time")), OPT_END(), }; diff --git a/t/t5620-backfill.sh b/t/t5620-backfill.sh index 64326362d80f8f..32e2bb1c1327fe 100755 --- a/t/t5620-backfill.sh +++ b/t/t5620-backfill.sh @@ -59,6 +59,24 @@ test_expect_success 'do partial clone 1, backfill gets all objects' ' test_line_count = 0 revs2 ' +test_expect_success 'do partial clone 2, backfill batch size' ' + git clone --no-checkout --filter=blob:none \ + --single-branch --branch=main \ + "file://$(pwd)/srv.bare" backfill2 && + + GIT_TRACE2_EVENT="$(pwd)/batch-trace" git \ + -C backfill2 backfill --batch-size=20 && + + # Batches were used + test_trace2_data promisor fetch_count 20 matches && + test_line_count = 2 matches && + test_trace2_data promisor fetch_count 8 revs2 && + test_line_count = 0 revs2 +' + . "$TEST_DIRECTORY"/lib-httpd.sh start_httpd From 1981fcad23721a884d0b73c6774efee89b8b40c1 Mon Sep 17 00:00:00 2001 From: Derrick Stolee Date: Sun, 1 Sep 2024 13:39:18 -0400 Subject: [PATCH 4/6] backfill: add --sparse option One way to significantly reduce the cost of a Git clone and later fetches is to use a blobless partial clone and combine that with a sparse-checkout that reduces the paths that need to be populated in the working directory. Not only does this reduce the cost of clones and fetches, the sparse-checkout reduces the number of objects needed to download from a promisor remote. However, history investigations can be expensie as computing blob diffs will trigger promisor remote requests for one object at a time. This can be avoided by downloading the blobs needed for the given sparse-checkout using 'git backfill' and its new '--sparse' mode, at a time that the user is willing to pay that extra cost. Note that this is distinctly different from the '--filter=sparse:' option, as this assumes that the partial clone has all reachable trees and we are using client-side logic to avoid downloading blobs outside of the sparse-checkout cone. This avoids the server-side cost of walking trees while also achieving a similar goal. It also downloads in batches based on similar path names, presenting a resumable download if things are interrupted. This augments the path-walk API to have a possibly-NULL 'pl' member that may point to a 'struct pattern_list'. This could be more general than the sparse-checkout definition at HEAD, but 'git backfill --sparse' is currently the only consumer. Be sure to test this in both cone mode and not cone mode. Cone mode has the benefit that the path-walk can skip certain paths once they would expand beyond the sparse-checkout. Signed-off-by: Derrick Stolee --- Documentation/git-backfill.txt | 6 ++- Documentation/technical/api-path-walk.txt | 8 ++++ builtin/backfill.c | 20 ++++++++- dir.c | 10 ++--- dir.h | 3 ++ path-walk.c | 18 ++++++++ path-walk.h | 11 +++++ t/helper/test-path-walk.c | 21 ++++++++- t/t5620-backfill.sh | 55 +++++++++++++++++++++++ t/t6601-path-walk.sh | 35 +++++++++++++++ 10 files changed, 177 insertions(+), 10 deletions(-) diff --git a/Documentation/git-backfill.txt b/Documentation/git-backfill.txt index 9b0bae04e9d8f3..ecf2ac428cefb7 100644 --- a/Documentation/git-backfill.txt +++ b/Documentation/git-backfill.txt @@ -9,7 +9,7 @@ git-backfill - Download missing objects in a partial clone SYNOPSIS -------- [verse] -'git backfill' [--batch-size=] +'git backfill' [--batch-size=] [--[no-]sparse] DESCRIPTION ----------- @@ -46,6 +46,10 @@ OPTIONS from the server. This size may be exceeded by the last set of blobs seen at a given path. Default batch size is 16,000. +--[no-]sparse:: + Only download objects if they appear at a path that matches the + current sparse-checkout. + SEE ALSO -------- linkgit:git-clone[1]. diff --git a/Documentation/technical/api-path-walk.txt b/Documentation/technical/api-path-walk.txt index a371b9e6e67b84..83bfe3d665e9fb 100644 --- a/Documentation/technical/api-path-walk.txt +++ b/Documentation/technical/api-path-walk.txt @@ -65,6 +65,14 @@ better off using the revision walk API instead. the revision walk so that the walk emits commits marked with the `UNINTERESTING` flag. +`pl`:: + This pattern list pointer allows focusing the path-walk search to + a set of patterns, only emitting paths that match the given + patterns. See linkgit:gitignore[5] or + linkgit:git-sparse-checkout[1] for details about pattern lists. + When the pattern list uses cone-mode patterns, then the path-walk + API can prune the set of paths it walks to improve performance. + Examples -------- diff --git a/builtin/backfill.c b/builtin/backfill.c index ba8580e4b5e966..b6b8e093eab1e0 100644 --- a/builtin/backfill.c +++ b/builtin/backfill.c @@ -4,6 +4,7 @@ #include "parse-options.h" #include "repository.h" #include "commit.h" +#include "dir.h" #include "hex.h" #include "tree.h" #include "tree-walk.h" @@ -21,7 +22,7 @@ #include "path-walk.h" static const char * const builtin_backfill_usage[] = { - N_("git backfill [--batch-size=]"), + N_("git backfill [--batch-size=] [--[no-]sparse]"), NULL }; @@ -29,6 +30,7 @@ struct backfill_context { struct repository *repo; struct oid_array current_batch; size_t batch_size; + int sparse; }; static void clear_backfill_context(struct backfill_context *ctx) @@ -84,6 +86,15 @@ static int do_backfill(struct backfill_context *ctx) struct path_walk_info info = PATH_WALK_INFO_INIT; int ret; + if (ctx->sparse) { + CALLOC_ARRAY(info.pl, 1); + if (get_sparse_checkout_patterns(info.pl)) { + clear_pattern_list(info.pl); + free(info.pl); + return error(_("problem loading sparse-checkout")); + } + } + repo_init_revisions(ctx->repo, &revs, ""); handle_revision_arg("HEAD", &revs, 0, 0); @@ -102,6 +113,10 @@ static int do_backfill(struct backfill_context *ctx) clear_backfill_context(ctx); release_revisions(&revs); + if (info.pl) { + clear_pattern_list(info.pl); + free(info.pl); + } return ret; } @@ -111,10 +126,13 @@ int cmd_backfill(int argc, const char **argv, const char *prefix, struct reposit .repo = repo, .current_batch = OID_ARRAY_INIT, .batch_size = 50000, + .sparse = 0, }; struct option options[] = { OPT_INTEGER(0, "batch-size", &ctx.batch_size, N_("Minimun number of objects to request at a time")), + OPT_BOOL(0, "sparse", &ctx.sparse, + N_("Restrict the missing objects to the current sparse-checkout")), OPT_END(), }; diff --git a/dir.c b/dir.c index 5b2181e5899ce9..16ccfe7e4e868d 100644 --- a/dir.c +++ b/dir.c @@ -1093,10 +1093,6 @@ static void invalidate_directory(struct untracked_cache *uc, dir->dirs[i]->recurse = 0; } -static int add_patterns_from_buffer(char *buf, size_t size, - const char *base, int baselen, - struct pattern_list *pl); - /* Flags for add_patterns() */ #define PATTERN_NOFOLLOW (1<<0) @@ -1186,9 +1182,9 @@ static int add_patterns(const char *fname, const char *base, int baselen, return 0; } -static int add_patterns_from_buffer(char *buf, size_t size, - const char *base, int baselen, - struct pattern_list *pl) +int add_patterns_from_buffer(char *buf, size_t size, + const char *base, int baselen, + struct pattern_list *pl) { char *orig = buf; int i, lineno = 1; diff --git a/dir.h b/dir.h index a3a2f00f5d9273..6cfef5df66091b 100644 --- a/dir.h +++ b/dir.h @@ -467,6 +467,9 @@ void add_patterns_from_file(struct dir_struct *, const char *fname); int add_patterns_from_blob_to_list(struct object_id *oid, const char *base, int baselen, struct pattern_list *pl); +int add_patterns_from_buffer(char *buf, size_t size, + const char *base, int baselen, + struct pattern_list *pl); void parse_path_pattern(const char **string, int *patternlen, unsigned *flags, int *nowildcardlen); void add_pattern(const char *string, const char *base, int baselen, struct pattern_list *pl, int srcpos); diff --git a/path-walk.c b/path-walk.c index dd1acb29d92b45..10c6dedbab82b1 100644 --- a/path-walk.c +++ b/path-walk.c @@ -10,6 +10,7 @@ #include "hex.h" #include "object.h" #include "oid-array.h" +#include "repository.h" #include "revision.h" #include "string-list.h" #include "strmap.h" @@ -119,6 +120,23 @@ static int add_children(struct path_walk_context *ctx, if (type == OBJ_TREE) strbuf_addch(&path, '/'); + if (ctx->info->pl) { + int dtype; + enum pattern_match_result match; + match = path_matches_pattern_list(path.buf, path.len, + path.buf + base_len, &dtype, + ctx->info->pl, + ctx->repo->index); + + if (ctx->info->pl->use_cone_patterns && + match == NOT_MATCHED) + continue; + else if (!ctx->info->pl->use_cone_patterns && + type == OBJ_BLOB && + match != MATCHED) + continue; + } + if (!(list = strmap_get(&ctx->paths_to_lists, path.buf))) { CALLOC_ARRAY(list, 1); list->type = type; diff --git a/path-walk.h b/path-walk.h index 3e44c4b8a588e9..090cda3b5cf8f4 100644 --- a/path-walk.h +++ b/path-walk.h @@ -6,6 +6,7 @@ struct rev_info; struct oid_array; +struct pattern_list; /** * The type of a function pointer for the method that is called on a list of @@ -46,6 +47,16 @@ struct path_walk_info { * walk the children of such trees. */ int prune_all_uninteresting; + + /** + * Specify a sparse-checkout definition to match our paths to. Do not + * walk outside of this sparse definition. If the patterns are in + * cone mode, then the search may prune directories that are outside + * of the cone. If not in cone mode, then all tree paths will be + * explored but the path_fn will only be called when the path matches + * the sparse-checkout patterns. + */ + struct pattern_list *pl; }; #define PATH_WALK_INFO_INIT { \ diff --git a/t/helper/test-path-walk.c b/t/helper/test-path-walk.c index fa3bfe46b5de1c..405c0f43be1694 100644 --- a/t/helper/test-path-walk.c +++ b/t/helper/test-path-walk.c @@ -1,6 +1,7 @@ #define USE_THE_REPOSITORY_VARIABLE #include "test-tool.h" +#include "dir.h" #include "environment.h" #include "hex.h" #include "object-name.h" @@ -9,6 +10,7 @@ #include "revision.h" #include "setup.h" #include "parse-options.h" +#include "strbuf.h" #include "path-walk.h" #include "oid-array.h" @@ -67,7 +69,7 @@ static int emit_block(const char *path, struct oid_array *oids, int cmd__path_walk(int argc, const char **argv) { - int res; + int res, stdin_pl = 0; struct rev_info revs = REV_INFO_INIT; struct path_walk_info info = PATH_WALK_INFO_INIT; struct path_walk_test_data data = { 0 }; @@ -82,6 +84,8 @@ int cmd__path_walk(int argc, const char **argv) N_("toggle inclusion of tree objects")), OPT_BOOL(0, "prune", &info.prune_all_uninteresting, N_("toggle pruning of uninteresting paths")), + OPT_BOOL(0, "stdin-pl", &stdin_pl, + N_("read a pattern list over stdin")), OPT_END(), }; @@ -101,6 +105,17 @@ int cmd__path_walk(int argc, const char **argv) info.path_fn = emit_block; info.path_fn_data = &data; + if (stdin_pl) { + struct strbuf in = STRBUF_INIT; + CALLOC_ARRAY(info.pl, 1); + + info.pl->use_cone_patterns = 1; + + strbuf_fread(&in, 2048, stdin); + add_patterns_from_buffer(in.buf, in.len, "", 0, info.pl); + strbuf_release(&in); + } + res = walk_objects_by_path(&info); printf("commits:%" PRIuMAX "\n" @@ -109,6 +124,10 @@ int cmd__path_walk(int argc, const char **argv) "tags:%" PRIuMAX "\n", data.commit_nr, data.tree_nr, data.blob_nr, data.tag_nr); + if (info.pl) { + clear_pattern_list(info.pl); + free(info.pl); + } release_revisions(&revs); return res; } diff --git a/t/t5620-backfill.sh b/t/t5620-backfill.sh index 32e2bb1c1327fe..c2acd1339bd454 100755 --- a/t/t5620-backfill.sh +++ b/t/t5620-backfill.sh @@ -77,6 +77,61 @@ test_expect_success 'do partial clone 2, backfill batch size' ' test_line_count = 0 revs2 ' +test_expect_success 'backfill --sparse' ' + git clone --sparse --filter=blob:none \ + --single-branch --branch=main \ + "file://$(pwd)/srv.bare" backfill3 && + + # Initial checkout includes four files at root. + git -C backfill3 rev-list --quiet --objects --missing=print HEAD >missing && + test_line_count = 44 missing && + + # Initial sparse-checkout is just the files at root, so we get the + # older versions of the four files at tip. + GIT_TRACE2_EVENT="$(pwd)/sparse-trace1" git \ + -C backfill3 backfill --sparse && + test_trace2_data promisor fetch_count 4 missing && + test_line_count = 40 missing && + + # Expand the sparse-checkout to include 'd' recursively. This + # engages the algorithm to skip the trees for 'a'. Note that + # the "sparse-checkout set" command downloads the objects at tip + # to satisfy the current checkout. + git -C backfill3 sparse-checkout set d && + GIT_TRACE2_EVENT="$(pwd)/sparse-trace2" git \ + -C backfill3 backfill --sparse && + test_trace2_data promisor fetch_count 8 missing && + test_line_count = 24 missing +' + +test_expect_success 'backfill --sparse without cone mode' ' + git clone --no-checkout --filter=blob:none \ + --single-branch --branch=main \ + "file://$(pwd)/srv.bare" backfill4 && + + # No blobs yet + git -C backfill4 rev-list --quiet --objects --missing=print HEAD >missing && + test_line_count = 48 missing && + + # Define sparse-checkout by filename regardless of parent directory. + # This downloads 6 blobs to satisfy the checkout. + git -C backfill4 sparse-checkout set --no-cone "**/file.1.txt" && + git -C backfill4 checkout main && + + GIT_TRACE2_EVENT="$(pwd)/no-cone-trace1" git \ + -C backfill4 backfill --sparse && + test_trace2_data promisor fetch_count 6 missing && + test_line_count = 36 missing +' + . "$TEST_DIRECTORY"/lib-httpd.sh start_httpd diff --git a/t/t6601-path-walk.sh b/t/t6601-path-walk.sh index 943adc6c8f132f..312bf3c19c176a 100755 --- a/t/t6601-path-walk.sh +++ b/t/t6601-path-walk.sh @@ -108,6 +108,41 @@ test_expect_success 'all' ' test_cmp expect.sorted out.sorted ' +test_expect_success 'base & topic, sparse' ' + cat >patterns <<-EOF && + /* + !/*/ + /left/ + EOF + + test-tool path-walk --stdin-pl -- base topic out && + + cat >expect <<-EOF && + COMMIT::$(git rev-parse topic) + COMMIT::$(git rev-parse base) + COMMIT::$(git rev-parse base~1) + COMMIT::$(git rev-parse base~2) + commits:4 + TREE::$(git rev-parse topic^{tree}) + TREE::$(git rev-parse base^{tree}) + TREE::$(git rev-parse base~1^{tree}) + TREE::$(git rev-parse base~2^{tree}) + TREE:left/:$(git rev-parse base:left) + TREE:left/:$(git rev-parse base~2:left) + trees:6 + BLOB:a:$(git rev-parse base~2:a) + BLOB:left/b:$(git rev-parse base~2:left/b) + BLOB:left/b:$(git rev-parse base:left/b) + blobs:3 + tags:0 + EOF + + sort expect >expect.sorted && + sort out >out.sorted && + + test_cmp expect.sorted out.sorted +' + test_expect_success 'topic only' ' test-tool path-walk -- topic >out && From 14804efc7922e3466e76b3714eda0e9cebb9607d Mon Sep 17 00:00:00 2001 From: Derrick Stolee Date: Sun, 1 Sep 2024 14:06:11 -0400 Subject: [PATCH 5/6] backfill: assume --sparse when sparse-checkout is enabled The previous change introduced the '--[no-]sparse' option for the 'git backfill' command, but did not assume it as enabled by default. However, this is likely the behavior that users will most often want to happen. Without this default, users with a small sparse-checkout may be confused when 'git backfill' downloads every version of every object in the full history. However, this is left as a separate change so this decision can be reviewed independently of the value of the '--[no-]sparse' option. Add a test of adding the '--sparse' option to a repo without sparse-checkout to make it clear that supplying it without a sparse-checkout is an error. Signed-off-by: Derrick Stolee --- Documentation/git-backfill.txt | 3 ++- builtin/backfill.c | 6 ++++++ t/t5620-backfill.sh | 13 ++++++++++++- 3 files changed, 20 insertions(+), 2 deletions(-) diff --git a/Documentation/git-backfill.txt b/Documentation/git-backfill.txt index ecf2ac428cefb7..066ec6b161a22c 100644 --- a/Documentation/git-backfill.txt +++ b/Documentation/git-backfill.txt @@ -48,7 +48,8 @@ OPTIONS --[no-]sparse:: Only download objects if they appear at a path that matches the - current sparse-checkout. + current sparse-checkout. If the sparse-checkout feature is enabled, + then `--sparse` is assumed and can be disabled with `--no-sparse`. SEE ALSO -------- diff --git a/builtin/backfill.c b/builtin/backfill.c index b6b8e093eab1e0..b893369ac67313 100644 --- a/builtin/backfill.c +++ b/builtin/backfill.c @@ -1,3 +1,5 @@ +#define USE_THE_REPOSITORY_VARIABLE /* for core_apply_sparse_checkout */ + #include "builtin.h" #include "git-compat-util.h" #include "config.h" @@ -5,6 +7,7 @@ #include "repository.h" #include "commit.h" #include "dir.h" +#include "environment.h" #include "hex.h" #include "tree.h" #include "tree-walk.h" @@ -144,5 +147,8 @@ int cmd_backfill(int argc, const char **argv, const char *prefix, struct reposit repo_config(repo, git_default_config, NULL); + if (ctx.sparse < 0) + ctx.sparse = core_apply_sparse_checkout; + return do_backfill(&ctx); } diff --git a/t/t5620-backfill.sh b/t/t5620-backfill.sh index c2acd1339bd454..eecf03d5199ea9 100755 --- a/t/t5620-backfill.sh +++ b/t/t5620-backfill.sh @@ -77,6 +77,12 @@ test_expect_success 'do partial clone 2, backfill batch size' ' test_line_count = 0 revs2 ' +test_expect_success 'backfill --sparse without sparse-checkout fails' ' + git init not-sparse && + test_must_fail git -C not-sparse backfill --sparse 2>err && + grep "problem loading sparse-checkout" err +' + test_expect_success 'backfill --sparse' ' git clone --sparse --filter=blob:none \ --single-branch --branch=main \ @@ -105,7 +111,12 @@ test_expect_success 'backfill --sparse' ' test_trace2_data promisor fetch_count 8 missing && - test_line_count = 24 missing + test_line_count = 24 missing && + + # Disabling the --sparse option (on by default) will download everything + git -C backfill3 backfill --no-sparse && + git -C backfill3 rev-list --quiet --objects --missing=print HEAD >missing && + test_line_count = 0 missing ' test_expect_success 'backfill --sparse without cone mode' ' From 086487d9bec0b9397e5d9d31e686b626daffff26 Mon Sep 17 00:00:00 2001 From: Johannes Schindelin Date: Thu, 26 Sep 2024 02:51:06 +0200 Subject: [PATCH 6/6] backfill: mark it as experimental This is a highly useful command, and we want it to get some testing "in the wild". However, the patches have not yet been reviewed on the Git mailing list, and are therefore subject to change. By marking the command as experimental, users will be warned to pay attention to those changes. Signed-off-by: Johannes Schindelin --- Documentation/git-backfill.txt | 2 +- builtin/backfill.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/Documentation/git-backfill.txt b/Documentation/git-backfill.txt index 066ec6b161a22c..2e1f8505209cbb 100644 --- a/Documentation/git-backfill.txt +++ b/Documentation/git-backfill.txt @@ -9,7 +9,7 @@ git-backfill - Download missing objects in a partial clone SYNOPSIS -------- [verse] -'git backfill' [--batch-size=] [--[no-]sparse] +(EXPERIMENTAL) 'git backfill' [--batch-size=] [--[no-]sparse] DESCRIPTION ----------- diff --git a/builtin/backfill.c b/builtin/backfill.c index b893369ac67313..6fab33fccf4f7e 100644 --- a/builtin/backfill.c +++ b/builtin/backfill.c @@ -25,7 +25,7 @@ #include "path-walk.h" static const char * const builtin_backfill_usage[] = { - N_("git backfill [--batch-size=] [--[no-]sparse]"), + N_("(EXPERIMENTAL) git backfill [--batch-size=] [--[no-]sparse]"), NULL };