]> git.ipfire.org Git - thirdparty/git.git/commitdiff
contrib/subtree: detect rewritten subtree commits
authorColin Stagner <ask+git@howdoi.land>
Sat, 10 Jan 2026 01:18:11 +0000 (19:18 -0600)
committerJunio C Hamano <gitster@pobox.com>
Sat, 10 Jan 2026 04:21:43 +0000 (20:21 -0800)
    git subtree split --prefix P

detects splits that are outside of path prefix `P` and prunes
them from history graph processing. This improves the performance
of repeated `split --rejoin` with many different prefixes.

Both before and after 83f9dad7d6 (contrib/subtree: fix split with
squashed subtrees, 2025-09-09), the pruning logic does not detect
**rebased** or **cherry-picked** git-subtree commits. If `split`
encounters any of these commits, the split output may have
incomplete history.

All commits authored by

    git subtree merge [--squash] --prefix Q

have a first or second parent that has *only* subtree commits
as ancestors. When splitting a completely different path `P/`,
it is safe to ignore:

1. the merged tree
2. the subtree parent
3. *all* of that parent's ancestry, which applies only to
   path `Q/` and not `P/`.

But this relationship no longer holds if the git-subtree commit
is rebased or otherwise reauthored. After a rebase, the former
git-subtree commit will have other unrelated commits as ancestors.
Ignoring these commits may exclude the history of `P/`,
leading to incomplete `subtree split` output.

The pruning logic relies solely on the `git-subtree-*:` trailers
to detect git-subtree commits, which it blindly accepts without
further validation. The split logic also takes its time about
being wrong: `cmd_split()` execs a `git show` for *every* commit
in the split range… twice. This is inefficient in a shell script.

Add a "reality check" to ignore rebased or rewritten commits:

* Rewrites of non-merge commits cannot be detected, so the new
  detector no longer looks for them.

* Merges carry a `git-subtree-mainline:` trailer with the hash of
  the **first parent**. If this hash differs, or if the "merge"
  commit no longer has multiple parents, a rewrite has occurred.

To increase speed, package this logic in a new method,
`find_other_splits()`. Perform the check up-front by iterating
over a single `git log`. Add ignored subtrees to:

1. the `notree` cache, which excludes them from the `split` history

2. a `prune` negative refs list. The negative refs prevent
   recursing into other subtrees. Since there are potentially a
   *lot* of these, cache them on disk and use rev-list's
   `--stdin` mode.

Reported-by: George <george@mail.dietrich.pub>
Signed-off-by: Colin Stagner <ask+git@howdoi.land>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
contrib/subtree/git-subtree.sh
contrib/subtree/t/t7900-subtree.sh

index 17106d1a721519978ef78b63e9b7763e78813674..3ebe88cbeadb021ef908cf40f05e0e4d023f0138 100755 (executable)
@@ -325,6 +325,12 @@ check_parents () {
        done
 }
 
+# Usage: get_notree REV
+get_notree () {
+       assert test $# = 1
+       test -r "$cachedir/notree/$1"
+}
+
 # Usage: set_notree REV
 set_notree () {
        assert test $# = 1
@@ -511,6 +517,71 @@ find_existing_splits () {
        done || exit $?
 }
 
+# Usage: find_other_splits DIR REV UNREVS...
+#
+# Scan history in REV UNREVS for other `git subtree split --rejoin`
+# merge commits belonging to prefixes outside of DIR. These
+# "other splits" don't contribute to DIR and can be ignored.
+#
+# If any such rejoins are found,
+#
+#   * emit their second-parent as an UNREV, avoiding a
+#     potentially costly history traversal
+#
+#   * mark the merge commit as "notree" to ignore it
+find_other_splits () {
+       assert test $# -ge 2
+       dir="${1%/}"
+       rev="$2"
+       shift 2
+       debug "Looking for other splits with dir != $dir..."
+
+       git log \
+               --grep '^git-subtree-mainline:' \
+               --no-patch \
+               --no-show-signature \
+               --format='hash: %H%nparents: %P%n%(trailers:key=git-subtree-dir,key=git-subtree-mainline,key=git-subtree-split)%nEND' \
+               "$rev" ${@:+"$@"} |
+       while read -r key val
+       do
+               case "$key" in
+               hash:)
+                       commit_hash="${val}"
+                       commit_parents=
+                       subtree_dir=
+                       subtree_mainline=
+                       subtree_split=
+                       ;;
+               parents:)
+                       commit_parents="${val}" ;;
+               git-subtree-dir:)
+                       subtree_dir="${val%/}/" ;;
+               git-subtree-mainline:)
+                       subtree_mainline="${val}" ;;
+               git-subtree-split:)
+                       subtree_split="${val}" ;;
+               END)
+                       # verify:
+                       # * all git-subtree-* trailers are present
+                       # * this subtree is outside of $dir
+                       # * the first parent is the git-subtree-mainline:
+                       # * the commit has at least two parents
+                       if test -n "${subtree_dir}" &&
+                               test -n "${subtree_split}" &&
+                               test -n "${subtree_mainline}" &&
+                               test "${subtree_dir}" = "${subtree_dir#"${dir}/"}" &&
+                               test "${commit_parents}" != "${commit_parents#"$subtree_mainline "}" &&
+                               rev_exists "${commit_hash}^2"
+                       then
+                               debug "find_other_splits excluding dir=$subtree_dir merged in ${commit_hash}"
+                               echo "^${commit_hash}^2"
+                               set_notree "${commit_hash}"
+                       fi
+                       ;;
+               esac
+       done
+}
+
 # Usage: copy_commit REV TREE FLAGS_STR
 copy_commit () {
        assert test $# = 3
@@ -785,42 +856,6 @@ ensure_valid_ref_format () {
                die "fatal: '$1' does not look like a ref"
 }
 
-# Usage: should_ignore_subtree_split_commit REV
-#
-# Check if REV is a commit from another subtree and should be
-# ignored from processing for splits
-should_ignore_subtree_split_commit () {
-       assert test $# = 1
-
-       git show \
-               --no-patch \
-               --no-show-signature \
-               --format='%(trailers:key=git-subtree-dir,key=git-subtree-mainline)' \
-               "$1" |
-       (
-       have_mainline=
-       subtree_dir=
-
-       while read -r trailer val
-       do
-               case "$trailer" in
-               git-subtree-dir:)
-                       subtree_dir="${val%/}" ;;
-               git-subtree-mainline:)
-                       have_mainline=y ;;
-               esac
-       done
-
-       if test -n "${subtree_dir}" &&
-               test -z "${have_mainline}" &&
-               test "${subtree_dir}" != "$arg_prefix"
-       then
-               return 0
-       fi
-       return 1
-       )
-}
-
 # Usage: process_split_commit REV PARENTS
 process_split_commit () {
        assert test $# = 2
@@ -994,31 +1029,39 @@ cmd_split () {
        fi
 
        unrevs="$(find_existing_splits "$dir" "$rev" "$repository")" || exit $?
+       (find_other_splits >"$cachedir/prune" "$dir" "$rev" $unrevs) || exit $?
 
        # We can't restrict rev-list to only $dir here, because some of our
        # parents have the $dir contents the root, and those won't match.
        # (and rev-list --follow doesn't seem to solve this)
-       grl='git rev-list --topo-order --reverse --parents $rev $unrevs'
-       revmax=$(eval "$grl" | wc -l)
+       revmax="$(git rev-list \
+               <"$cachedir/prune" \
+               --topo-order \
+               --reverse \
+               --parents \
+               --stdin \
+               --count \
+               "$rev" \
+               $unrevs
+       )"
        revcount=0
        createcount=0
        extracount=0
-       eval "$grl" |
+       git rev-list \
+               <"$cachedir/prune" \
+               --topo-order \
+               --reverse \
+               --parents \
+               --stdin \
+               "$rev" \
+               $unrevs |
        while read rev parents
        do
-               if should_ignore_subtree_split_commit "$rev"
+               if get_notree "$rev"
                then
                        continue
                fi
-               parsedparents=''
-               for parent in $parents
-               do
-                       if ! should_ignore_subtree_split_commit "$parent"
-                       then
-                               parsedparents="$parsedparents$parent "
-                       fi
-               done
-               process_split_commit "$rev" "$parsedparents"
+               process_split_commit "$rev" "$parents"
        done || exit $?
 
        latest_new=$(cache_get latest_new) || exit $?
index 316dc5269e2b6f47dd01f28e854dd9a68dbde493..4db3a6eff37c4d8dc3bd303172185ed9399b3cc0 100755 (executable)
@@ -411,8 +411,9 @@ test_expect_success 'split sub dir/ with --rejoin' '
                git fetch ./"sub proj" HEAD &&
                git subtree merge --prefix="sub dir" FETCH_HEAD &&
                split_hash=$(git subtree split --prefix="sub dir" --annotate="*") &&
-               git subtree split --prefix="sub dir" --annotate="*" --rejoin &&
-               test "$(last_commit_subject)" = "Split '\''sub dir/'\'' into commit '\''$split_hash'\''"
+               git subtree split --prefix="sub dir" --annotate="*" -b spl --rejoin &&
+               test "$(last_commit_subject)" = "Split '\''sub dir/'\'' into commit '\''$split_hash'\''" &&
+               test "$(git rev-list --count spl)" -eq 5
        )
 '
 
@@ -442,18 +443,25 @@ test_expect_success 'split with multiple subtrees' '
        git -C "$test_count" subtree add --prefix=subADir FETCH_HEAD &&
        git -C "$test_count" fetch ./subB HEAD &&
        git -C "$test_count" subtree add --prefix=subBDir FETCH_HEAD &&
+       test "$(git -C "$test_count" rev-list --count main)" -eq 7 &&
        test_create_commit "$test_count" subADir/main-subA1 &&
        test_create_commit "$test_count" subBDir/main-subB1 &&
        git -C "$test_count" subtree split --prefix=subADir \
-               --squash --rejoin -m "Sub A Split 1" &&
+               --squash --rejoin -m "Sub A Split 1" -b a1 &&
+       test "$(git -C "$test_count" rev-list --count main..a1)" -eq 1 &&
        git -C "$test_count" subtree split --prefix=subBDir \
-               --squash --rejoin -m "Sub B Split 1" &&
+               --squash --rejoin -m "Sub B Split 1" -b b1 &&
+       test "$(git -C "$test_count" rev-list --count main..b1)" -eq 1 &&
        test_create_commit "$test_count" subADir/main-subA2 &&
        test_create_commit "$test_count" subBDir/main-subB2 &&
        git -C "$test_count" subtree split --prefix=subADir \
-               --squash --rejoin -m "Sub A Split 2" &&
+               --squash --rejoin -m "Sub A Split 2" -b a2 &&
+       test "$(git -C "$test_count" rev-list --count main..a2)" -eq 2 &&
+       test "$(git -C "$test_count" rev-list --count a1..a2)" -eq 1 &&
        test "$(git -C "$test_count" subtree split --prefix=subBDir \
-               --squash --rejoin -d -m "Sub B Split 1" 2>&1 | grep -w "\[1\]")" = ""
+               --squash --rejoin -d -m "Sub B Split 1" -b b2 2>&1 | grep -w "\[1\]")" = "" &&
+       test "$(git -C "$test_count" rev-list --count main..b2)" -eq 2 &&
+       test "$(git -C "$test_count" rev-list --count b1..b2)" -eq 1
 '
 
 # When subtree split-ing a directory that has other subtree
@@ -477,6 +485,7 @@ do
                        test_path_is_file subA/file1.t &&
                        test_path_is_file subA/subB/file2.t &&
                        git subtree split --prefix=subA --branch=bsplit &&
+                       test "$(git rev-list --count bsplit)" -eq 2 &&
                        git checkout bsplit &&
                        test_path_is_file file1.t &&
                        test_path_is_file subB/file2.t &&
@@ -489,6 +498,7 @@ do
                                --prefix=subA/subB mksubtree &&
                        test_path_is_file subA/subB/file3.t &&
                        git subtree split --prefix=subA --branch=bsplit &&
+                       test "$(git rev-list --count bsplit)" -eq 3 &&
                        git checkout bsplit &&
                        test_path_is_file file1.t &&
                        test_path_is_file subB/file2.t &&
@@ -497,6 +507,67 @@ do
        '
 done
 
+# Usually,
+#
+#    git subtree merge -P subA --squash f00...
+#
+# makes two commits, in this order:
+#
+# 1. Squashed 'subA/' content from commit f00...
+# 2. Merge commit (1) as 'subA'
+#
+# Commit 1 updates the subtree but does *not* rewrite paths.
+# Commit 2 rewrites all trees to start with `subA/`
+#
+# Commit 1 either has no parents or depends only on other
+# "Squashed 'subA/' content" commits.
+#
+# For merge without --squash, subtree produces just one commit:
+# a merge commit with git-subtree trailers.
+#
+# In either case, if the user rebases these commits, they will
+# still have the git-subtree-* trailers… but will NOT have
+# the layout described above.
+#
+# Test that subsequent `git subtree split` are not confused by this.
+test_expect_success 'split with rebased subtree commit' '
+       subtree_test_create_repo "$test_count" &&
+       (
+               cd "$test_count" &&
+               test_commit file0 &&
+               test_create_subtree_add \
+                       . mksubtree subA file1 --squash &&
+               test_path_is_file subA/file1.t &&
+               mkdir subB &&
+               test_commit subB/bfile &&
+               git commit --amend -F - <<'EOF' &&
+Squashed '\''subB/'\'' content from commit '\''badf00da911bbe895347b4b236f5461d55dc9877'\''
+
+Simulate a cherry-picked or rebased subtree commit.
+
+git-subtree-dir: subB
+git-subtree-split: badf00da911bbe895347b4b236f5461d55dc9877
+EOF
+               test_commit subA/file2 &&
+               test_commit subB/bfile2 &&
+               git commit --amend -F - <<'EOF' &&
+Split '\''subB/'\'' into commit '\''badf00da911bbe895347b4b236f5461d55dc9877'\''
+
+Simulate a cherry-picked or rebased subtree commit.
+
+git-subtree-dir: subB
+git-subtree-mainline: aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa
+git-subtree-split: badf00da911bbe895347b4b236f5461d55dc9877
+EOF
+               git subtree split --prefix=subA --branch=bsplit &&
+               git checkout bsplit &&
+               test_path_is_file file1.t &&
+               test_path_is_file file2.t &&
+               test "$(last_commit_subject)" = "subA/file2" &&
+               test "$(git rev-list --count bsplit)" -eq 2
+       )
+'
+
 test_expect_success 'split sub dir/ with --rejoin from scratch' '
        subtree_test_create_repo "$test_count" &&
        test_create_commit "$test_count" main1 &&