From: Kenneth Finnegan Date: Thu, 15 Sep 2022 17:12:02 +0000 (-0700) Subject: Use string length diff heuristic to skip Levenshtein Algo (#369) X-Git-Tag: v3.2.7pre1~11 X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=8fe8cfd60af417f5467f7723075f5ad050b806c8;p=thirdparty%2Frsync.git Use string length diff heuristic to skip Levenshtein Algo (#369) When using the --fuzzy option to try to find close matches locally, the edit distance algorithm used is O(N^2), which can get painful on CPU-constrained systems when working in folders with tens of thousands of files in them. A lower bound on the Levenshtein distance between two strings is the difference in their lengths, so if that difference is already larger than the distance of the current best match, the calculation of the exact edit distance between the two strings can be skipped. Testing on the OpenSUSE package repo has shown a 50% reduction in the CPU time required to plan the rsync transaction. --- diff --git a/generator.c b/generator.c index 935c84f9..21c4a595 100644 --- a/generator.c +++ b/generator.c @@ -875,9 +875,12 @@ static struct file_struct *find_fuzzy(struct file_struct *file, struct file_list len = strlen(name); suf = find_filename_suffix(name, len, &suf_len); - dist = fuzzy_distance(name, len, fname, fname_len); - /* Add some extra weight to how well the suffixes match. */ - dist += fuzzy_distance(suf, suf_len, fname_suf, fname_suf_len) * 10; + dist = fuzzy_distance(name, len, fname, fname_len, lowest_dist); + /* Add some extra weight to how well the suffixes match unless we've already disqualified + * this file based on a heuristic. 
*/ + if (dist < 0xFFFF0000U) { + dist += fuzzy_distance(suf, suf_len, fname_suf, fname_suf_len, 0xFFFF0000U) * 10; + } if (DEBUG_GTE(FUZZY, 2)) { rprintf(FINFO, "fuzzy distance for %s = %d.%05d\n", f_name(fp, NULL), (int)(dist>>16), (int)(dist&0xFFFF)); diff --git a/util1.c b/util1.c index 671f3c75..da50ff1e 100644 --- a/util1.c +++ b/util1.c @@ -1487,12 +1487,19 @@ const char *find_filename_suffix(const char *fn, int fn_len, int *len_ptr) #define UNIT (1 << 16) -uint32 fuzzy_distance(const char *s1, unsigned len1, const char *s2, unsigned len2) +uint32 fuzzy_distance(const char *s1, unsigned len1, const char *s2, unsigned len2, uint32 upperlimit) { uint32 a[MAXPATHLEN], diag, above, left, diag_inc, above_inc, left_inc; int32 cost; unsigned i1, i2; + /* Check to see if the Levenshtein distance must be greater than the + * upper limit defined by the previously found lowest distance using + * the heuristic that the Levenshtein distance is greater than the + * difference in length of the two strings */ + if ((len1 > len2 ? len1 - len2 : len2 - len1) * UNIT > upperlimit) + return 0xFFFFU * UNIT + 1; + if (!len1 || !len2) { if (!len1) { s1 = s2;