struct strintmap idx_map;
struct strmap dir_rename_guess;
struct strmap *dir_rename_count;
+ struct strset *relevant_source_dirs;
unsigned setup;
};
*slash = '\0';
}
-static void increment_count(struct strmap *dir_rename_count,
+/*
+ * Scan every (destination directory -> count) entry in counts and
+ * return the key whose count is largest.  Only counts strictly greater
+ * than the best seen so far replace it, so on a tie the entry visited
+ * first is kept.  Returns NULL when no entry has a count above zero.
+ */
+static const char *get_highest_rename_path(struct strintmap *counts)
+{
+	struct strmap_entry *entry;
+	struct hashmap_iter iter;
+	const char *best_dir = NULL;
+	int best_count = 0;
+
+	strintmap_for_each_entry(counts, &iter, entry) {
+		/* Values are stored as pointer-sized integers. */
+		intptr_t cur = (intptr_t)entry->value;
+
+		if (cur > best_count) {
+			best_dir = entry->key;
+			best_count = cur;
+		}
+	}
+	return best_dir;
+}
+
+static void increment_count(struct dir_rename_info *info,
char *old_dir,
char *new_dir)
{
struct strmap_entry *e;
/* Get the {new_dirs -> counts} mapping using old_dir */
- e = strmap_get_entry(dir_rename_count, old_dir);
+ e = strmap_get_entry(info->dir_rename_count, old_dir);
if (e) {
counts = e->value;
} else {
counts = xmalloc(sizeof(*counts));
strintmap_init_with_options(counts, 0, NULL, 1);
- strmap_put(dir_rename_count, old_dir, counts);
+ strmap_put(info->dir_rename_count, old_dir, counts);
}
/* Increment the count for new_dir */
strintmap_incr(counts, new_dir, 1);
}
-static void update_dir_rename_counts(struct strmap *dir_rename_count,
+static void update_dir_rename_counts(struct dir_rename_info *info,
struct strset *dirs_removed,
const char *oldname,
const char *newname)
char new_dir_first_char = new_dir[0];
int first_time_in_loop = 1;
+ if (!info->setup)
+ /*
+ * info->setup is 0 here in two cases: (1) all auxiliary
+ * vars (like dirs_removed) were NULL so
+ * initialize_dir_rename_info() returned early, or (2)
+ * either break detection or copy detection is active so
+ * that we never called initialize_dir_rename_info(). In
+ * the former case, we don't have enough info to know if
+ * directories were renamed (because dirs_removed lets us
+ * know about a necessary prerequisite, namely if they were
+ * removed), and in the latter, we don't care about
+ * directory renames or find_basename_matches.
+ *
+ * This matters because both basename and inexact matching
+ * will also call update_dir_rename_counts(). In either of
+ * the above two cases info->dir_rename_count will not
+ * have been properly initialized which prevents us from
+ * updating it, but in these two cases we don't care about
+ * dir_rename_count anyway, so we can just exit early.
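+ *
+ * NOTE(review): new_dir is read above and freed at the end
+ * of this function; if old_dir/new_dir are allocated before
+ * this check, returning here would leak them -- confirm
+ * against the full function.
+ */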
+ return;
+
while (1) {
+ /* Get old_dir, skip if its directory isn't relevant. */
dirname_munge(old_dir);
+ if (info->relevant_source_dirs &&
+ !strset_contains(info->relevant_source_dirs, old_dir))
+ break;
+
+ /* Get new_dir */
dirname_munge(new_dir);
/*
}
if (strset_contains(dirs_removed, old_dir))
- increment_count(dir_rename_count, old_dir, new_dir);
+ increment_count(info, old_dir, new_dir);
else
break;
free(new_dir);
}
-static void compute_dir_rename_counts(struct strmap *dir_rename_count,
- struct strset *dirs_removed)
+static void initialize_dir_rename_info(struct dir_rename_info *info,
+ struct strset *dirs_removed,
+ struct strmap *dir_rename_count)
{
+ struct hashmap_iter iter;
+ struct strmap_entry *entry;
int i;
- /* Set up dir_rename_count */
- for (i = 0; i < rename_dst_nr; ++i) {
- /* File not part of directory rename counts if not a rename */
- if (!rename_dst[i].is_rename)
- continue;
-
- /*
- * Make dir_rename_count contain a map of a map:
- * old_directory -> {new_directory -> count}
- * In other words, for every pair look at the directories for
- * the old filename and the new filename and count how many
- * times that pairing occurs.
- */
- update_dir_rename_counts(dir_rename_count, dirs_removed,
- rename_dst[i].p->one->path,
- rename_dst[i].p->two->path);
+ if (!dirs_removed) {
+ info->setup = 0;
+ return;
}
-}
-
-static void initialize_dir_rename_info(struct dir_rename_info *info)
-{
- int i;
-
info->setup = 1;
+ info->dir_rename_count = dir_rename_count;
+ if (!info->dir_rename_count) {
+ info->dir_rename_count = xmalloc(sizeof(*dir_rename_count));
+ strmap_init(info->dir_rename_count);
+ }
strintmap_init_with_options(&info->idx_map, -1, NULL, 0);
strmap_init_with_options(&info->dir_rename_guess, NULL, 0);
- info->dir_rename_count = NULL;
+
+ /* Setup info->relevant_source_dirs */
+ info->relevant_source_dirs = dirs_removed;
/*
- * Loop setting up both info->idx_map.
+ * Loop setting up info->idx_map and populating
+ * info->dir_rename_count.
*/
for (i = 0; i < rename_dst_nr; ++i) {
/*
if (!rename_dst[i].is_rename) {
char *filename = rename_dst[i].p->two->path;
strintmap_set(&info->idx_map, filename, i);
+ continue;
}
+
+ /*
+ * For everything else (i.e. renamed files), make
+ * dir_rename_count contain a map of a map:
+ * old_directory -> {new_directory -> count}
+ * In other words, for every pair look at the directories for
+ * the old filename and the new filename and count how many
+ * times that pairing occurs.
+ */
+ update_dir_rename_counts(info, dirs_removed,
+ rename_dst[i].p->one->path,
+ rename_dst[i].p->two->path);
+ }
+
+ /*
+ * Now we collapse
+ * dir_rename_count: old_directory -> {new_directory -> count}
+ * down to
+ * dir_rename_guess: old_directory -> best_new_directory
+ * where best_new_directory is the one with the highest count.
+ */
+ strmap_for_each_entry(info->dir_rename_count, &iter, entry) {
+ /* entry->key is source_dir */
+ struct strintmap *counts = entry->value;
+ char *best_newdir;
+
+ best_newdir = xstrdup(get_highest_rename_path(counts));
+ strmap_put(&info->dir_rename_guess, entry->key,
+ best_newdir);
+ }
+}
+
+/*
+ * Empty dir_rename_count without tearing it down completely: first
+ * clear each per-source-directory strintmap stored as a value, then
+ * call strmap_partial_clear() on the outer map with its second
+ * argument set to 1.
+ */
+void partial_clear_dir_rename_count(struct strmap *dir_rename_count)
+{
+	struct strmap_entry *e;
+	struct hashmap_iter it;
+
+	strmap_for_each_entry(dir_rename_count, &it, e) {
+		/* Each value is a {new_dir -> count} strintmap. */
+		struct strintmap *per_dir = e->value;
+
+		strintmap_clear(per_dir);
 	}
+	strmap_partial_clear(dir_rename_count, 1);
 }
-static void cleanup_dir_rename_info(struct dir_rename_info *info)
+static void cleanup_dir_rename_info(struct dir_rename_info *info,
+ struct strset *dirs_removed,
+ int keep_dir_rename_count)
{
+ struct hashmap_iter iter;
+ struct strmap_entry *entry;
+ struct string_list to_remove = STRING_LIST_INIT_NODUP;
+ int i;
+
if (!info->setup)
return;
/* dir_rename_guess */
strmap_clear(&info->dir_rename_guess, 1);
- /* Nothing to do for dir_rename_count, yet */
+ /* dir_rename_count */
+ if (!keep_dir_rename_count) {
+ partial_clear_dir_rename_count(info->dir_rename_count);
+ strmap_clear(info->dir_rename_count, 1);
+ FREE_AND_NULL(info->dir_rename_count);
+ return;
+ }
+
+ /*
+ * Although dir_rename_count was passed in
+ * diffcore_rename_extended() and we want to keep it around and
+ * return it to that caller, we first want to remove any data
+ * associated with directories that weren't renamed.
+ */
+ strmap_for_each_entry(info->dir_rename_count, &iter, entry) {
+ const char *source_dir = entry->key;
+ struct strintmap *counts = entry->value;
+
+ if (!strset_contains(dirs_removed, source_dir)) {
+ string_list_append(&to_remove, source_dir);
+ strintmap_clear(counts);
+ continue;
+ }
+ }
+ for (i = 0; i < to_remove.nr; ++i)
+ strmap_remove(info->dir_rename_count,
+ to_remove.items[i].string, 1);
+ string_list_clear(&to_remove, 0);
}
static const char *get_basename(const char *filename)
* rename.
*
* This function, idx_possible_rename(), is only responsible for (4).
- * The conditions/steps in (1)-(3) will be handled via setting up
- * dir_rename_count and dir_rename_guess in a future
- * initialize_dir_rename_info() function. Steps (0) and (5) are
- * handled by the caller of this function.
+ * The conditions/steps in (1)-(3) are handled via setting up
+ * dir_rename_count and dir_rename_guess in
+ * initialize_dir_rename_info(). Steps (0) and (5) are handled by
+ * the caller of this function.
*/
char *old_dir, *new_dir;
struct strbuf new_path = STRBUF_INIT;
static int find_basename_matches(struct diff_options *options,
int minimum_score,
- struct dir_rename_info *info)
+ struct dir_rename_info *info,
+ struct strset *dirs_removed)
{
/*
* When I checked in early 2020, over 76% of file renames in linux
continue;
record_rename_pair(dst_index, src_index, score);
renames++;
+ update_dir_rename_counts(info, dirs_removed,
+ one->path, two->path);
/*
* Found a rename so don't need text anymore; if we
return 1;
}
-static int find_renames(struct diff_score *mx, int dst_cnt, int minimum_score, int copies)
+static int find_renames(struct diff_score *mx,
+ int dst_cnt,
+ int minimum_score,
+ int copies,
+ struct dir_rename_info *info,
+ struct strset *dirs_removed)
{
int count = 0, i;
continue;
record_rename_pair(mx[i].dst, mx[i].src, mx[i].score);
count++;
+ update_dir_rename_counts(info, dirs_removed,
+ rename_src[mx[i].src].p->one->path,
+ rename_dst[mx[i].dst].p->two->path);
}
return count;
}
info.setup = 0;
assert(!dir_rename_count || strmap_empty(dir_rename_count));
want_copies = (detect_rename == DIFF_DETECT_COPY);
+ if (dirs_removed && (break_idx || want_copies))
+ BUG("dirs_removed incompatible with break/copy detection");
if (!minimum_score)
minimum_score = DEFAULT_RENAME_SCORE;
/* Preparation for basename-driven matching. */
trace2_region_enter("diff", "dir rename setup", options->repo);
- initialize_dir_rename_info(&info);
+ initialize_dir_rename_info(&info,
+ dirs_removed, dir_rename_count);
trace2_region_leave("diff", "dir rename setup", options->repo);
/* Utilize file basenames to quickly find renames. */
trace2_region_enter("diff", "basename matches", options->repo);
rename_count += find_basename_matches(options,
min_basename_score,
- &info);
+ &info, dirs_removed);
trace2_region_leave("diff", "basename matches", options->repo);
/*
(uint64_t)num_destinations * (uint64_t)num_sources);
}
- mx = xcalloc(st_mult(NUM_CANDIDATE_PER_DST, num_destinations),
- sizeof(*mx));
+ CALLOC_ARRAY(mx, st_mult(NUM_CANDIDATE_PER_DST, num_destinations));
for (dst_cnt = i = 0; i < rename_dst_nr; i++) {
struct diff_filespec *two = rename_dst[i].p->two;
struct diff_score *m;
/* cost matrix sorted by most to least similar pair */
STABLE_QSORT(mx, dst_cnt * NUM_CANDIDATE_PER_DST, score_compare);
- rename_count += find_renames(mx, dst_cnt, minimum_score, 0);
+ rename_count += find_renames(mx, dst_cnt, minimum_score, 0,
+ &info, dirs_removed);
if (want_copies)
- rename_count += find_renames(mx, dst_cnt, minimum_score, 1);
+ rename_count += find_renames(mx, dst_cnt, minimum_score, 1,
+ &info, dirs_removed);
free(mx);
trace2_region_leave("diff", "inexact renames", options->repo);
cleanup:
- /*
- * Now that renames have been computed, compute dir_rename_count */
- if (dirs_removed && dir_rename_count)
- compute_dir_rename_counts(dir_rename_count, dirs_removed);
-
/* At this point, we have found some renames and copies and they
* are recorded in rename_dst. The original list is still in *q.
*/
if (rename_dst[i].filespec_to_free)
free_filespec(rename_dst[i].filespec_to_free);
- cleanup_dir_rename_info(&info);
+ cleanup_dir_rename_info(&info, dirs_removed, dir_rename_count != NULL);
FREE_AND_NULL(rename_dst);
rename_dst_nr = rename_dst_alloc = 0;
FREE_AND_NULL(rename_src);