]> git.ipfire.org Git - thirdparty/git.git/commitdiff
pack-objects: add --path-walk option
authorDerrick Stolee <stolee@gmail.com>
Fri, 16 May 2025 18:11:52 +0000 (18:11 +0000)
committerJunio C Hamano <gitster@pobox.com>
Fri, 16 May 2025 19:15:38 +0000 (12:15 -0700)
In order to more easily compute delta bases among objects that appear at
the exact same path, add a --path-walk option to 'git pack-objects'.

This option will use the path-walk API instead of the object walk given
by the revision machinery. Since objects will be provided in batches
representing a common path, those objects can be tested for delta bases
immediately instead of waiting for a sort of the full object list by
name-hash. This has multiple benefits, including avoiding collisions by
name-hash.

The objects marked as UNINTERESTING are included in these batches, so we
are guaranteeing some locality to find good delta bases.

After the individual passes are done on a per-path basis, the default
name-hash is used to find other opportunistic delta bases that did not
match exactly by the full path name.

The current implementation performs delta calculations while walking
objects, which is not ideal for a few reasons. First, this will cause
the "Enumerating objects" phase to be much longer than usual. Second, it
does not take advantage of threading during the path-scoped delta
calculations. Even with this lack of threading, the path-walk option is
sometimes faster than the usual approach. Future changes will refactor
this code to allow for threading, but that complexity is deferred until
later to keep this patch as simple as possible.

This new walk is incompatible with some features and is ignored by
others:

 * Object filters are not currently integrated with the path-walk API,
   such as sparse-checkout or tree depth. A blobless packfile could be
   integrated easily, but that is deferred for later.

 * Server-focused features such as delta islands, shallow packs, and
   using a bitmap index are incompatible with the path-walk API.

 * The path walk API is only compatible with the --revs option, not
   taking object lists or pack lists over stdin. These alternative ways
   to specify the objects currently ignores the --path-walk option
   without even a warning.

Future changes will create performance tests that demonstrate the power
of this approach.

Signed-off-by: Derrick Stolee <stolee@gmail.com>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
Documentation/git-pack-objects.adoc
Documentation/technical/api-path-walk.adoc
builtin/pack-objects.c
t/t5300-pack-object.sh

index 7f69ae4855f6cee515629ae5d5c372bb416b6790..3b803d3a7830302cb8ad299dc1606940e64eb2f3 100644 (file)
@@ -16,7 +16,7 @@ SYNOPSIS
        [--cruft] [--cruft-expiration=<time>]
        [--stdout [--filter=<filter-spec>] | <base-name>]
        [--shallow] [--keep-true-parents] [--[no-]sparse]
-       [--name-hash-version=<n>] < <object-list>
+       [--name-hash-version=<n>] [--path-walk] < <object-list>
 
 
 DESCRIPTION
@@ -375,6 +375,17 @@ many different directories. At the moment, this version is not allowed
 when writing reachability bitmap files with `--write-bitmap-index` and it
 will be automatically changed to version `1`.
 
+--path-walk::
+       Perform compression by first organizing objects by path, then a
+       second pass that compresses across paths as normal. This has the
+       potential to improve delta compression especially in the presence
+       of filenames that cause collisions in Git's default name-hash
+       algorithm.
++
+Incompatible with `--delta-islands`, `--shallow`, or `--filter`. The
+`--use-bitmap-index` option will be ignored in the presence of
+`--path-walk.`
+
 
 DELTA ISLANDS
 -------------
index 3e089211fb4d6967dca9866969d91b2d8e4e522a..e522695dd9fafec8bff0c8a4345745962776e87c 100644 (file)
@@ -69,4 +69,5 @@ Examples
 
 See example usages in:
        `t/helper/test-path-walk.c`,
+       `builtin/pack-objects.c`,
        `builtin/backfill.c`
index 7805429f5d1cb008c865848d5bc2259f63565467..bd088389037652a573ee7a15647f352c802f3ac5 100644 (file)
@@ -41,6 +41,9 @@
 #include "promisor-remote.h"
 #include "pack-mtimes.h"
 #include "parse-options.h"
+#include "blob.h"
+#include "tree.h"
+#include "path-walk.h"
 
 /*
  * Objects we are going to pack are collected in the `to_pack` structure.
@@ -217,6 +220,7 @@ static int delta_search_threads;
 static int pack_to_stdout;
 static int sparse;
 static int thin;
+static int path_walk;
 static int num_preferred_base;
 static struct progress *progress_state;
 
@@ -4191,6 +4195,105 @@ static void mark_bitmap_preferred_tips(void)
        }
 }
 
+static inline int is_oid_uninteresting(struct repository *repo,
+                                      struct object_id *oid)
+{
+       struct object *o = lookup_object(repo, oid);
+       return !o || (o->flags & UNINTERESTING);
+}
+
+static int add_objects_by_path(const char *path,
+                              struct oid_array *oids,
+                              enum object_type type,
+                              void *data)
+{
+       struct object_entry **delta_list = NULL;
+       size_t oe_start = to_pack.nr_objects;
+       size_t oe_end;
+       unsigned int sub_list_nr;
+       unsigned int *processed = data;
+
+       /*
+        * First, add all objects to the packing data, including the ones
+        * marked UNINTERESTING (translated to 'exclude') as they can be
+        * used as delta bases.
+        */
+       for (size_t i = 0; i < oids->nr; i++) {
+               int exclude;
+               struct object_info oi = OBJECT_INFO_INIT;
+               struct object_id *oid = &oids->oid[i];
+
+               /* Skip objects that do not exist locally. */
+               if (exclude_promisor_objects &&
+                   oid_object_info_extended(the_repository, oid, &oi,
+                                            OBJECT_INFO_FOR_PREFETCH) < 0)
+                       continue;
+
+               exclude = is_oid_uninteresting(the_repository, oid);
+
+               if (exclude && !thin)
+                       continue;
+
+               add_object_entry(oid, type, path, exclude);
+       }
+
+       oe_end = to_pack.nr_objects;
+
+       /* We can skip delta calculations if it is a no-op. */
+       if (oe_end == oe_start || !window)
+               return 0;
+
+       sub_list_nr = 0;
+       if (oe_end > oe_start)
+               ALLOC_ARRAY(delta_list, oe_end - oe_start);
+
+       for (size_t i = 0; i < oe_end - oe_start; i++) {
+               struct object_entry *entry = to_pack.objects + oe_start + i;
+
+               if (!should_attempt_deltas(entry))
+                       continue;
+
+               delta_list[sub_list_nr++] = entry;
+       }
+
+       /*
+        * Find delta bases among this list of objects that all match the same
+        * path. This causes the delta compression to be interleaved in the
+        * object walk, which can lead to confusing progress indicators. This is
+        * also incompatible with threaded delta calculations. In the future,
+        * consider creating a list of regions in the full to_pack.objects array
+        * that could be picked up by the threaded delta computation.
+        */
+       if (sub_list_nr && window) {
+               QSORT(delta_list, sub_list_nr, type_size_sort);
+               find_deltas(delta_list, &sub_list_nr, window, depth, processed);
+       }
+
+       free(delta_list);
+       return 0;
+}
+
+static void get_object_list_path_walk(struct rev_info *revs)
+{
+       struct path_walk_info info = PATH_WALK_INFO_INIT;
+       unsigned int processed = 0;
+
+       info.revs = revs;
+       info.path_fn = add_objects_by_path;
+       info.path_fn_data = &processed;
+
+       /*
+        * Allow the --[no-]sparse option to be interesting here, if only
+        * for testing purposes. Paths with no interesting objects will not
+        * contribute to the resulting pack, but only create noisy preferred
+        * base objects.
+        */
+       info.prune_all_uninteresting = sparse;
+
+       if (walk_objects_by_path(&info))
+               die(_("failed to pack objects via path-walk"));
+}
+
 static void get_object_list(struct rev_info *revs, int ac, const char **av)
 {
        struct setup_revision_opt s_r_opt = {
@@ -4246,15 +4349,19 @@ static void get_object_list(struct rev_info *revs, int ac, const char **av)
        if (write_bitmap_index)
                mark_bitmap_preferred_tips();
 
-       if (prepare_revision_walk(revs))
-               die(_("revision walk setup failed"));
-       mark_edges_uninteresting(revs, show_edge, sparse);
-
        if (!fn_show_object)
                fn_show_object = show_object;
-       traverse_commit_list(revs,
-                            show_commit, fn_show_object,
-                            NULL);
+
+       if (path_walk) {
+               get_object_list_path_walk(revs);
+       } else {
+               if (prepare_revision_walk(revs))
+                       die(_("revision walk setup failed"));
+               mark_edges_uninteresting(revs, show_edge, sparse);
+               traverse_commit_list(revs,
+                               show_commit, fn_show_object,
+                               NULL);
+       }
 
        if (unpack_unreachable_expiration) {
                revs->ignore_missing_links = 1;
@@ -4464,6 +4571,8 @@ int cmd_pack_objects(int argc,
                         N_("use the sparse reachability algorithm")),
                OPT_BOOL(0, "thin", &thin,
                         N_("create thin packs")),
+               OPT_BOOL(0, "path-walk", &path_walk,
+                        N_("use the path-walk API to walk objects when possible")),
                OPT_BOOL(0, "shallow", &shallow,
                         N_("create packs suitable for shallow fetches")),
                OPT_BOOL(0, "honor-pack-keep", &ignore_packed_keep_on_disk,
@@ -4549,7 +4658,30 @@ int cmd_pack_objects(int argc,
                window = 0;
 
        strvec_push(&rp, "pack-objects");
-       if (thin) {
+
+       if (path_walk) {
+               const char *option = NULL;
+               if (filter_options.choice)
+                       option = "--filter";
+               else if (use_delta_islands)
+                       option = "--delta-islands";
+               else if (shallow)
+                       option = "--shallow";
+
+               if (option) {
+                       warning(_("cannot use %s with %s"),
+                               option, "--path-walk");
+                       path_walk = 0;
+               }
+       }
+       if (path_walk) {
+               strvec_push(&rp, "--boundary");
+                /*
+                 * We must disable the bitmaps because we are removing
+                 * the --objects / --objects-edge[-aggressive] options.
+                 */
+               use_bitmap_index = 0;
+       } else if (thin) {
                use_internal_rev_list = 1;
                strvec_push(&rp, shallow
                                ? "--objects-edge-aggressive"
index 5ac8d39094b6537e94e6bc6c0f49953e82e69f46..16420d1286398a5a90e185eb7b4a2e41ff388f55 100755 (executable)
@@ -723,4 +723,19 @@ test_expect_success '--name-hash-version=2 and --write-bitmap-index are incompat
        ! test_grep "currently, --write-bitmap-index requires --name-hash-version=1" err
 '
 
+test_expect_success '--path-walk pack everything' '
+       git -C server rev-parse HEAD >in &&
+       git -C server pack-objects --stdout --revs --path-walk <in >out.pack &&
+       git -C server index-pack --stdin <out.pack
+'
+
+test_expect_success '--path-walk thin pack' '
+       cat >in <<-EOF &&
+       $(git -C server rev-parse HEAD)
+       ^$(git -C server rev-parse HEAD~2)
+       EOF
+       git -C server pack-objects --thin --stdout --revs --path-walk <in >out.pack &&
+       git -C server index-pack --fix-thin --stdin <out.pack
+'
+
 test_done