git.ipfire.org Git - thirdparty/git.git/commitdiff
repack: add --path-walk option
author: Derrick Stolee <stolee@gmail.com>
Fri, 16 May 2025 18:11:57 +0000 (18:11 +0000)
committer: Junio C Hamano <gitster@pobox.com>
Fri, 16 May 2025 19:15:39 +0000 (12:15 -0700)
Since 'git pack-objects' supports a --path-walk option, allow passing it
through in 'git repack'. This presents interesting testing opportunities for
comparing the different repacking strategies against each other.

Add the --path-walk option to the performance tests in p5313.

For the microsoft/fluentui repo [1] checked out at a specific commit [2],
the --path-walk tests in p5313 look like this:

Test                                                     this tree
-------------------------------------------------------------------------
5313.18: thin pack with --path-walk                      0.08(0.06+0.02)
5313.19: thin pack size with --path-walk                           18.4K
5313.20: big pack with --path-walk                       2.10(7.80+0.26)
5313.21: big pack size with --path-walk                            19.8M
5313.22: shallow fetch pack with --path-walk             1.62(3.38+0.17)
5313.23: shallow pack size with --path-walk                        33.6M
5313.24: repack with --path-walk                         81.29(96.08+0.71)
5313.25: repack size with --path-walk                             142.5M

[1] https://github.com/microsoft/fluentui
[2] e70848ebac1cd720875bccaa3026f4a9ed700e08

Combining these results with those of the earlier tests in p5313, I'll
reformat the comparison as follows:

Repack Method    Pack Size       Time
---------------------------------------
Hash v1             439.4M      87.24s
Hash v2             161.7M      21.51s
Path Walk           142.5M      81.29s

There are a few things to notice here:

 1. The benefits of --name-hash-version=2 over --name-hash-version=1 are
    significant, but --path-walk still compresses better than that
    option.

 2. The --path-walk command is still using --name-hash-version=1 for the
    second pass of delta computation, using the increased name hash
    collisions as a potential method for opportunistic compression on
    top of the path-focused compression.

 3. The --path-walk algorithm is currently sequential and does not use
    multiple threads for delta compression. Threading will be
    implemented in a future change so the computation time will improve
    to better compete in this metric.

There are small benefits in size for my copy of the Git repository:

Repack Method    Pack Size       Time
---------------------------------------
Hash v1             248.8M      30.44s
Hash v2             249.0M      30.15s
Path Walk           213.2M     142.50s

As well as in the nodejs/node repository [3]:

Repack Method    Pack Size       Time
---------------------------------------
Hash v1             739.9M      71.18s
Hash v2             764.6M      67.82s
Path Walk           698.1M     208.10s

[3] https://github.com/nodejs/node

This benefit also repeats in my copy of the Linux kernel repository:

Repack Method    Pack Size       Time
---------------------------------------
Hash v1               2.5G     554.41s
Hash v2               2.5G     549.62s
Path Walk             2.2G    1562.36s

It is important to see that even when the repository shape does not have
many name-hash collisions, there is a slight space boost to be found
using this method.

As this repacking strategy was released in Git for Windows 2.47.0, some
users have reported cases where the --path-walk compression is slightly
worse than the --name-hash-version=2 option. In those cases, it may be
beneficial to combine the two options. However, there has not been a
released version of Git that has both options and I don't have access to
these repos for testing.

Signed-off-by: Derrick Stolee <stolee@gmail.com>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
Documentation/git-repack.adoc
builtin/repack.c
t/perf/p5313-pack-objects.sh

index 5852a5c9736875889a71a3820e2d98e3f4826b2f..aa1bc081e50aa05e713274d38f6d6442b04071ee 100644 (file)
@@ -11,7 +11,7 @@ SYNOPSIS
 [verse]
 'git repack' [-a] [-A] [-d] [-f] [-F] [-l] [-n] [-q] [-b] [-m]
        [--window=<n>] [--depth=<n>] [--threads=<n>] [--keep-pack=<pack-name>]
-       [--write-midx] [--name-hash-version=<n>]
+       [--write-midx] [--name-hash-version=<n>] [--path-walk]
 
 DESCRIPTION
 -----------
@@ -255,6 +255,9 @@ linkgit:git-multi-pack-index[1]).
        Provide this argument to the underlying `git pack-objects` process.
        See linkgit:git-pack-objects[1] for full details.
 
+--path-walk::
+       Pass the `--path-walk` option to the underlying `git pack-objects`
+       process. See linkgit:git-pack-objects[1] for full details.
 
 CONFIGURATION
 -------------
index 75e3752353a27fdc881490158584e8dbfe0d627e..d7f798280c0e426489d51a6e66287977f50d5419 100644 (file)
@@ -43,7 +43,7 @@ static char *packdir, *packtmp_name, *packtmp;
 static const char *const git_repack_usage[] = {
        N_("git repack [-a] [-A] [-d] [-f] [-F] [-l] [-n] [-q] [-b] [-m]\n"
           "[--window=<n>] [--depth=<n>] [--threads=<n>] [--keep-pack=<pack-name>]\n"
-          "[--write-midx] [--name-hash-version=<n>]"),
+          "[--write-midx] [--name-hash-version=<n>] [--path-walk]"),
        NULL
 };
 
@@ -63,6 +63,7 @@ struct pack_objects_args {
        int quiet;
        int local;
        int name_hash_version;
+       int path_walk;
        struct list_objects_filter_options filter_options;
 };
 
@@ -313,6 +314,8 @@ static void prepare_pack_objects(struct child_process *cmd,
                strvec_pushf(&cmd->args, "--no-reuse-object");
        if (args->name_hash_version)
                strvec_pushf(&cmd->args, "--name-hash-version=%d", args->name_hash_version);
+       if (args->path_walk)
+               strvec_pushf(&cmd->args, "--path-walk");
        if (args->local)
                strvec_push(&cmd->args,  "--local");
        if (args->quiet)
@@ -1212,6 +1215,8 @@ int cmd_repack(int argc,
                                N_("pass --no-reuse-object to git-pack-objects")),
                OPT_INTEGER(0, "name-hash-version", &po_args.name_hash_version,
                                N_("specify the name hash version to use for grouping similar objects by path")),
+               OPT_BOOL(0, "path-walk", &po_args.path_walk,
+                               N_("pass --path-walk to git-pack-objects")),
                OPT_NEGBIT('n', NULL, &run_update_server_info,
                                N_("do not run git-update-server-info"), 1),
                OPT__QUIET(&po_args.quiet, N_("be quiet")),
index cd6dd3abb710345a558c40e7e08e64fbdfd365b7..98748b0e203a686738aa204c8be9bd8c394f3f98 100755 (executable)
@@ -55,23 +55,21 @@ test_all_with_args () {
        test_size "shallow pack size with $parameter" '
                test_file_size out
        '
-}
-
-for version in 1 2
-do
-       export version
-
-       test_all_with_args --name-hash-version=$version
 
-       test_perf "repack with --name-hash-version=$version" '
-               git repack -adf --name-hash-version=$version
+       test_perf "repack with $parameter" '
+               git repack -adf $parameter
        '
 
-       test_size "repack size with --name-hash-version=$version" '
+       test_size "repack size with $parameter" '
                gitdir=$(git rev-parse --git-dir) &&
                pack=$(ls $gitdir/objects/pack/pack-*.pack) &&
                test_file_size "$pack"
        '
+}
+
+for version in 1 2
+do
+       test_all_with_args --name-hash-version=$version
 done
 
 test_all_with_args --path-walk