]> git.ipfire.org Git - thirdparty/git.git/commitdiff
builtin/log: prefetch necessary blobs for `git cherry`
authorElijah Newren <newren@gmail.com>
Thu, 14 May 2026 16:25:27 +0000 (16:25 +0000)
committerJunio C Hamano <gitster@pobox.com>
Thu, 14 May 2026 19:24:54 +0000 (04:24 +0900)
In partial clones, `git cherry` fetches necessary blobs on-demand one
at a time, which can be very slow.  We would like to prefetch all
necessary blobs upfront.  To do so, we need to be able to first figure
out which blobs are needed.

`git cherry` does its work in a two-phase approach: first computing
header-only IDs (based on file paths and modes), then falling back to
full content-based IDs only when header-only IDs collide -- or, more
accurately, whenever the oidhash() of the header-only object_ids
collide.

patch-ids.c handles this by creating an ids->patches hashmap that has
all the data we need, but the problem is that any attempt to query the
hashmap will invoke the patch_id_neq() function on any colliding objects,
which causes the on-demand fetching.

Insert a new prefetch_cherry_blobs() function before checking for
collisions.  Use a temporary replacement on the ids->patches.cmpfn
in order to enumerate the blobs that would be needed without yet
fetching them, and then fetch them all at once, then restore the old
ids->patches.cmpfn.

Signed-off-by: Elijah Newren <newren@gmail.com>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
builtin/log.c
t/t3500-cherry.sh

index 8c0939dd42ada28bc4b7bd60aa33e6806dfbb067..e464b30af4bcaeefcef23f480c89de5fc2cf6b75 100644 (file)
 #include "color.h"
 #include "commit.h"
 #include "diff.h"
+#include "diffcore.h"
 #include "diff-merges.h"
 #include "revision.h"
 #include "log-tree.h"
 #include "oid-array.h"
+#include "oidset.h"
 #include "tag.h"
 #include "reflog-walk.h"
 #include "patch-ids.h"
 #include "utf8.h"
 
 #include "commit-reach.h"
+#include "promisor-remote.h"
 #include "range-diff.h"
 #include "tmp-objdir.h"
 #include "tree.h"
+#include "userdiff.h"
 #include "write-or-die.h"
 
 #define MAIL_DEFAULT_WRAP 72
@@ -2602,6 +2606,131 @@ static void print_commit(char sign, struct commit *commit, int verbose,
        }
 }
 
+/*
+ * Enumerate blob OIDs from a single commit's diff, inserting them into blobs.
+ * Skips files whose userdiff driver explicitly declares binary status
+ * (drv->binary > 0), since patch-ID uses oid_to_hex() for those and
+ * never reads blob content.  Use userdiff_find_by_path() since
+ * diff_filespec_load_driver() is static in diff.c.
+ *
+ * Clean up with diff_queue_clear() (from diffcore.h).
+ */
+static void collect_diff_blob_oids(struct commit *commit,
+                                  struct diff_options *opts,
+                                  struct oidset *blobs)
+{
+       struct diff_queue_struct *q;
+
+       /*
+        * Merge commits are filtered out by patch_id_defined() in patch-ids.c,
+        * so we'll never be called with one.
+        */
+       assert(!commit->parents || !commit->parents->next);
+
+       if (commit->parents)
+               diff_tree_oid(&commit->parents->item->object.oid,
+                             &commit->object.oid, "", opts);
+       else
+               diff_root_tree_oid(&commit->object.oid, "", opts);
+       diffcore_std(opts);
+
+       q = &diff_queued_diff;
+       for (int i = 0; i < q->nr; i++) {
+               struct diff_filepair *p = q->queue[i];
+               struct userdiff_driver *drv;
+
+               /* Skip binary files */
+               drv = userdiff_find_by_path(opts->repo->index, p->one->path);
+               if (drv && drv->binary > 0)
+                       continue;
+
+               if (DIFF_FILE_VALID(p->one) &&
+                   odb_read_object_info_extended(opts->repo->objects,
+                                                 &p->one->oid, NULL,
+                                                 OBJECT_INFO_FOR_PREFETCH))
+                       oidset_insert(blobs, &p->one->oid);
+               if (DIFF_FILE_VALID(p->two) &&
+                   odb_read_object_info_extended(opts->repo->objects,
+                                                 &p->two->oid, NULL,
+                                                 OBJECT_INFO_FOR_PREFETCH))
+                       oidset_insert(blobs, &p->two->oid);
+       }
+       diff_queue_clear(q);
+}
+
+static int always_match(const void *cmp_data UNUSED,
+                       const struct hashmap_entry *entry1 UNUSED,
+                       const struct hashmap_entry *entry2 UNUSED,
+                       const void *keydata UNUSED)
+{
+       return 0;
+}
+
+/*
+ * Prefetch blobs for git cherry in partial clones.
+ *
+ * Called between the revision walk (which builds the head-side
+ * commit list) and the has_commit_patch_id() comparison loop.
+ *
+ * Uses a cmpfn-swap trick to avoid reading blobs: temporarily
+ * replaces the hashmap's comparison function with a trivial
+ * always-match function, so hashmap_get()/hashmap_get_next() match
+ * any entry with the same oidhash bucket.  These are the set of oids
+ * that would trigger patch_id_neq() during normal lookup and cause
+ * blobs to be read on demand, and we want to prefetch them all at
+ * once instead.
+ */
+static void prefetch_cherry_blobs(struct repository *repo,
+                                 struct commit_list *list,
+                                 struct patch_ids *ids)
+{
+       struct oidset blobs = OIDSET_INIT;
+       hashmap_cmp_fn original_cmpfn;
+
+       /* Exit if we're not in a partial clone */
+       if (!repo_has_promisor_remote(repo))
+               return;
+
+       /* Save original cmpfn, replace with always_match */
+       original_cmpfn = ids->patches.cmpfn;
+       ids->patches.cmpfn = always_match;
+
+       /* Find header-only collisions, gather blobs from those commits */
+       for (struct commit_list *l = list; l; l = l->next) {
+               struct commit *c = l->item;
+               bool match_found = false;
+               for (struct patch_id *cur = patch_id_iter_first(c, ids);
+                    cur;
+                    cur = patch_id_iter_next(cur, ids)) {
+                       match_found = true;
+                       collect_diff_blob_oids(cur->commit, &ids->diffopts,
+                                              &blobs);
+               }
+               if (match_found)
+                       collect_diff_blob_oids(c, &ids->diffopts, &blobs);
+       }
+
+       /* Restore original cmpfn */
+       ids->patches.cmpfn = original_cmpfn;
+
+       /* If we have any blobs to fetch, fetch them */
+       if (oidset_size(&blobs)) {
+               struct oid_array to_fetch = OID_ARRAY_INIT;
+               struct oidset_iter iter;
+               const struct object_id *oid;
+
+               oidset_iter_init(&blobs, &iter);
+               while ((oid = oidset_iter_next(&iter)))
+                       oid_array_append(&to_fetch, oid);
+
+               promisor_remote_get_direct(repo, to_fetch.oid, to_fetch.nr);
+
+               oid_array_clear(&to_fetch);
+       }
+
+       oidset_clear(&blobs);
+}
+
 int cmd_cherry(int argc,
               const char **argv,
               const char *prefix,
@@ -2673,6 +2802,8 @@ int cmd_cherry(int argc,
                commit_list_insert(commit, &list);
        }
 
+       prefetch_cherry_blobs(the_repository, list, &ids);
+
        for (struct commit_list *l = list; l; l = l->next) {
                char sign = '+';
 
index 78c3eac54b599ef9cded9a042f4346621798b645..3e66827d7641d8618d2daced1d76b9f5986a43f3 100755 (executable)
@@ -78,4 +78,31 @@ test_expect_success 'cherry ignores whitespace' '
        test_cmp expect actual
 '
 
+# Reuse the expect file from the previous test, in a partial clone
+test_expect_success 'cherry in partial clone does bulk prefetch' '
+       test_config uploadpack.allowfilter 1 &&
+       test_config uploadpack.allowanysha1inwant 1 &&
+       test_when_finished "rm -rf copy" &&
+
+       git clone --bare --filter=blob:none file://"$(pwd)" copy &&
+       (
+               cd copy &&
+               GIT_TRACE2_EVENT="$(pwd)/trace.output" git cherry upstream-with-space feature-without-space >actual &&
+               test_cmp ../expect actual &&
+
+               grep "child_start.*fetch.negotiationAlgorithm" trace.output >fetches &&
+               test_line_count = 1 fetches &&
+               test_trace2_data promisor fetch_count 4 <trace.output &&
+
+               # A second invocation should not refetch any blobs, since
+               # the prefetch is expected to filter out OIDs that are
+               # already present locally.
+               GIT_TRACE2_EVENT="$(pwd)/trace2.output" git cherry upstream-with-space feature-without-space >actual &&
+               test_cmp ../expect actual &&
+
+               ! grep "child_start.*fetch.negotiationAlgorithm" trace2.output &&
+               ! grep "\"key\":\"fetch_count\"" trace2.output
+       )
+'
+
 test_done