git.ipfire.org Git - thirdparty/git.git/commitdiff
diffcore-rename: provide basic implementation of idx_possible_rename()
author Elijah Newren <newren@gmail.com>
Sat, 27 Feb 2021 00:30:40 +0000 (00:30 +0000)
committer Junio C Hamano <gitster@pobox.com>
Sat, 27 Feb 2021 01:53:11 +0000 (17:53 -0800)
Add a new struct dir_rename_info with various values we need inside our
idx_possible_rename() function introduced in the previous commit.  Add a
basic implementation for this function showing how we plan to use the
variables, but which will just return early with a value of -1 (not
found) when those variables are not set up.

Future commits will do the work necessary to set up those other
variables so that idx_possible_rename() does not always return -1.

Reviewed-by: Derrick Stolee <dstolee@microsoft.com>
Signed-off-by: Elijah Newren <newren@gmail.com>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
diffcore-rename.c

index b3055683bac2d616ae01f617211e4f16719ff42b..edb0effb6ef4f34a4cb570556d67861206f496b8 100644 (file)
@@ -367,6 +367,19 @@ static int find_exact_renames(struct diff_options *options)
        return renames;
 }
 
+struct dir_rename_info {
+       struct strintmap idx_map;
+       struct strmap dir_rename_guess;
+       struct strmap *dir_rename_count;
+       unsigned setup;
+};
+
+static char *get_dirname(const char *filename)
+{
+       char *slash = strrchr(filename, '/');
+       return slash ? xstrndup(filename, slash - filename) : xstrdup("");
+}
+
 static const char *get_basename(const char *filename)
 {
        /*
@@ -379,14 +392,86 @@ static const char *get_basename(const char *filename)
        return base ? base + 1 : filename;
 }
 
-static int idx_possible_rename(char *filename)
+static int idx_possible_rename(char *filename, struct dir_rename_info *info)
 {
-       /* Unconditionally return -1, "not found", for now */
-       return -1;
+       /*
+        * Our comparison of files with the same basename (see
+        * find_basename_matches() below), is only helpful when after exact
+        * rename detection we have exactly one file with a given basename
+        * among the rename sources and also only exactly one file with
+        * that basename among the rename destinations.  When we have
+        * multiple files with the same basename in either set, we do not
+        * know which to compare against.  However, there are some
+        * filenames that occur in large numbers (particularly
+        * build-related filenames such as 'Makefile', '.gitignore', or
+        * 'build.gradle' that potentially exist within every single
+        * subdirectory), and for performance we want to be able to quickly
+        * find renames for these files too.
+        *
+        * The reason basename comparisons are a useful heuristic is that it
+        * is common for people to move files across directories while keeping
+        * their filename the same.  If we had a way of determining or even
+        * making a good educated guess about which directory these files
+        * with non-unique basenames had been moved to, we could check it.
+        * Luckily...
+        *
+        * When an entire directory is in fact renamed, we have two factors
+        * helping us out:
+        *   (a) the original directory disappeared giving us a hint
+        *       about when we can apply an extra heuristic.
+        *   (b) we often have several files within that directory and
+        *       subdirectories that are renamed without changes
+        * So, rules for a heuristic:
+        *   (0) If the basename matches are non-unique (the condition under
+        *       which this function is called) AND
+        *   (1) the directory in which the file was found has disappeared
+        *       (i.e. dirs_removed is non-NULL and has a relevant entry) THEN
+        *   (2) use exact renames of files within the directory to determine
+        *       where the directory is likely to have been renamed to.  IF
+        *       there is at least one exact rename from within that
+        *       directory, we can proceed.
+        *   (3) If there are multiple places the directory could have been
+        *       renamed to based on exact renames, ignore all but one of them.
+        *       Just use the destination with the most renames going to it.
+        *   (4) Check if applying that directory rename to the original file
+        *       would result in a destination filename that is in the
+        *       potential rename set.  If so, return the index of the
+        *       destination file (the index within rename_dst).
+        *   (5) Compare the original file and returned destination for
+        *       similarity, and if they are sufficiently similar, record the
+        *       rename.
+        *
+        * This function, idx_possible_rename(), is only responsible for (4).
+        * The conditions/steps in (1)-(3) will be handled via setting up
+        * dir_rename_count and dir_rename_guess in a future
+        * initialize_dir_rename_info() function.  Steps (0) and (5) are
+        * handled by the caller of this function.
+        */
+       char *old_dir, *new_dir;
+       struct strbuf new_path = STRBUF_INIT;
+       int idx;
+
+       if (!info->setup)
+               return -1;
+
+       old_dir = get_dirname(filename);
+       new_dir = strmap_get(&info->dir_rename_guess, old_dir);
+       free(old_dir);
+       if (!new_dir)
+               return -1;
+
+       strbuf_addstr(&new_path, new_dir);
+       strbuf_addch(&new_path, '/');
+       strbuf_addstr(&new_path, get_basename(filename));
+
+       idx = strintmap_get(&info->idx_map, new_path.buf);
+       strbuf_release(&new_path);
+       return idx;
 }
 
 static int find_basename_matches(struct diff_options *options,
-                                int minimum_score)
+                                int minimum_score,
+                                struct dir_rename_info *info)
 {
        /*
         * When I checked in early 2020, over 76% of file renames in linux
@@ -494,7 +579,7 @@ static int find_basename_matches(struct diff_options *options,
                        dst_index = strintmap_get(&dests, base);
                        if (src_index == -1 || dst_index == -1) {
                                src_index = i;
-                               dst_index = idx_possible_rename(filename);
+                               dst_index = idx_possible_rename(filename, info);
                        }
                        if (dst_index == -1)
                                continue;
@@ -677,8 +762,10 @@ void diffcore_rename(struct diff_options *options)
        int num_destinations, dst_cnt;
        int num_sources, want_copies;
        struct progress *progress = NULL;
+       struct dir_rename_info info;
 
        trace2_region_enter("diff", "setup", options->repo);
+       info.setup = 0;
        want_copies = (detect_rename == DIFF_DETECT_COPY);
        if (!minimum_score)
                minimum_score = DEFAULT_RENAME_SCORE;
@@ -774,7 +861,8 @@ void diffcore_rename(struct diff_options *options)
                /* Utilize file basenames to quickly find renames. */
                trace2_region_enter("diff", "basename matches", options->repo);
                rename_count += find_basename_matches(options,
-                                                     min_basename_score);
+                                                     min_basename_score,
+                                                     &info);
                trace2_region_leave("diff", "basename matches", options->repo);
 
                /*