From e385e1b7d2d7f531a0006131e7f1d974de351df5 Mon Sep 17 00:00:00 2001 From: Ezekiel Newren Date: Fri, 26 Sep 2025 22:41:58 +0000 Subject: [PATCH] xdiff: add macros DISCARD(0), KEEP(1), INVESTIGATE(2) in xprepare.c This commit is refactor-only; no behavior is changed. A future commit will use bool literals for changed[i]. The functions xdl_clean_mmatch() and xdl_cleanup_records() will be cleaned up more in a future patch series. The changes to xdl_cleanup_records(), in this patch, are just to make it clear why `char rchg` is refactored to `bool changed`. Rename dis* to action* and replace literal numericals with macros. The old names came from when dis* (which I think was short for discard) was treated like a boolean, but over time it grew into a ternary state machine. The result was confusing because dis* and rchg* both used 0/1 values with different meanings. The new names and macros make the states explicit. nm is short for number of matches, and mlim is a heuristic limit: nm == 0 -> action[i] = DISCARD -> changed[i] = true 0 < nm < mlim -> action[i] = KEEP -> changed[i] = false nm >= mlim -> action[i] = INVESTIGATE -> changed[i] = xdl_clean_mmatch() When need_min is true, only DISCARD and KEEP occur because the limit is effectively infinite. Best-viewed-with: --color-words Signed-off-by: Ezekiel Newren Signed-off-by: Junio C Hamano --- xdiff/xprepare.c | 106 ++++++++++++++++++++++++++++++----------------- 1 file changed, 69 insertions(+), 37 deletions(-) diff --git a/xdiff/xprepare.c b/xdiff/xprepare.c index b9b19c36de..55e3b50ce6 100644 --- a/xdiff/xprepare.c +++ b/xdiff/xprepare.c @@ -29,6 +29,9 @@ #define XDL_GUESS_NLINES1 256 #define XDL_GUESS_NLINES2 20 +#define DISCARD 0 +#define KEEP 1 +#define INVESTIGATE 2 typedef struct s_xdlclass { struct s_xdlclass *next; @@ -190,15 +193,15 @@ void xdl_free_env(xdfenv_t *xe) { } -static int xdl_clean_mmatch(char const *dis, long i, long s, long e) { +static bool xdl_clean_mmatch(uint8_t const *action, long i, long s, long e) { long r, rdis0, rpdis0, rdis1, rpdis1; /* - * Limits the window the is examined during the similar-lines - * scan. The loops below stops when dis[i - r] == 1 (line that - * has no match), but there are corner cases where the loop - * proceed all the way to the extremities by causing huge - * performance penalties in case of big files. + * Limits the window that is examined during the similar-lines + * scan. The loops below stops when action[i - r] == KEEP + * (line that has no match), but there are corner cases where + * the loop proceed all the way to the extremities by causing + * huge performance penalties in case of big files. */ if (i - s > XDL_SIMSCAN_WINDOW) s = i - XDL_SIMSCAN_WINDOW; @@ -207,40 +210,47 @@ static int xdl_clean_mmatch(char const *dis, long i, long s, long e) { /* * Scans the lines before 'i' to find a run of lines that either - * have no match (dis[j] == 0) or have multiple matches (dis[j] > 1). - * Note that we always call this function with dis[i] > 1, so the - * current line (i) is already a multimatch line. + * have no match (action[j] == DISCARD) or have multiple matches + * (action[j] == INVESTIGATE). Note that we always call this + * function with action[i] == INVESTIGATE, so the current line + * (i) is already a multimatch line. */ for (r = 1, rdis0 = 0, rpdis0 = 1; (i - r) >= s; r++) { - if (!dis[i - r]) + if (action[i - r] == DISCARD) rdis0++; - else if (dis[i - r] == 2) + else if (action[i - r] == INVESTIGATE) rpdis0++; - else + else if (action[i - r] == KEEP) break; + else + BUG("Illegal value for action[i - r]"); } /* - * If the run before the line 'i' found only multimatch lines, we - * return 0 and hence we don't make the current line (i) discarded. - * We want to discard multimatch lines only when they appear in the - * middle of runs with nomatch lines (dis[j] == 0). + * If the run before the line 'i' found only multimatch lines, + * we return false and hence we don't make the current line (i) + * discarded. We want to discard multimatch lines only when + * they appear in the middle of runs with nomatch lines + * (action[j] == DISCARD). */ if (rdis0 == 0) return 0; for (r = 1, rdis1 = 0, rpdis1 = 1; (i + r) <= e; r++) { - if (!dis[i + r]) + if (action[i + r] == DISCARD) rdis1++; - else if (dis[i + r] == 2) + else if (action[i + r] == INVESTIGATE) rpdis1++; - else + else if (action[i + r] == KEEP) break; + else + BUG("Illegal value for action[i + r]"); } /* - * If the run after the line 'i' found only multimatch lines, we - * return 0 and hence we don't make the current line (i) discarded. + * If the run after the line 'i' found only multimatch lines, + * we return false and hence we don't make the current line (i) + * discarded. */ if (rdis1 == 0) - return 0; + return false; rdis1 += rdis0; rpdis1 += rpdis0; @@ -251,26 +261,38 @@ static int xdl_clean_mmatch(char const *dis, long i, long s, long e) { /* * Try to reduce the problem complexity, discard records that have no * matches on the other file. Also, lines that have multiple matches - * might be potentially discarded if they happear in a run of discardable. + * might be potentially discarded if they appear in a run of discardable. */ static int xdl_cleanup_records(xdlclassifier_t *cf, xdfile_t *xdf1, xdfile_t *xdf2) { long i, nm, nreff, mlim; xrecord_t *recs; xdlclass_t *rcrec; - char *dis, *dis1, *dis2; - int need_min = !!(cf->flags & XDF_NEED_MINIMAL); + uint8_t *action1 = NULL, *action2 = NULL; + bool need_min = !!(cf->flags & XDF_NEED_MINIMAL); + int ret = 0; - if (!XDL_CALLOC_ARRAY(dis, xdf1->nrec + xdf2->nrec + 2)) - return -1; - dis1 = dis; - dis2 = dis1 + xdf1->nrec + 1; + /* + * Create temporary arrays that will help us decide if + * changed[i] should remain 0 or become 1. + */ + if (!XDL_CALLOC_ARRAY(action1, xdf1->nrec + 1)) { + ret = -1; + goto cleanup; + } + if (!XDL_CALLOC_ARRAY(action2, xdf2->nrec + 1)) { + ret = -1; + goto cleanup; + } + /* + * Initialize temporary arrays with DISCARD, KEEP, or INVESTIGATE. + */ if ((mlim = xdl_bogosqrt(xdf1->nrec)) > XDL_MAX_EQLIMIT) mlim = XDL_MAX_EQLIMIT; for (i = xdf1->dstart, recs = &xdf1->recs[xdf1->dstart]; i <= xdf1->dend; i++, recs++) { rcrec = cf->rcrecs[recs->ha]; nm = rcrec ? rcrec->len2 : 0; - dis1[i] = (nm == 0) ? 0: (nm >= mlim && !need_min) ? 2: 1; + action1[i] = (nm == 0) ? DISCARD: (nm >= mlim && !need_min) ? INVESTIGATE: KEEP; } if ((mlim = xdl_bogosqrt(xdf2->nrec)) > XDL_MAX_EQLIMIT) @@ -278,32 +300,42 @@ static int xdl_cleanup_records(xdlclassifier_t *cf, xdfile_t *xdf1, xdfile_t *xd for (i = xdf2->dstart, recs = &xdf2->recs[xdf2->dstart]; i <= xdf2->dend; i++, recs++) { rcrec = cf->rcrecs[recs->ha]; nm = rcrec ? rcrec->len1 : 0; - dis2[i] = (nm == 0) ? 0: (nm >= mlim && !need_min) ? 2: 1; + action2[i] = (nm == 0) ? DISCARD: (nm >= mlim && !need_min) ? INVESTIGATE: KEEP; } + /* + * Use temporary arrays to decide if changed[i] should remain + * 0 or become 1. + */ for (nreff = 0, i = xdf1->dstart, recs = &xdf1->recs[xdf1->dstart]; i <= xdf1->dend; i++, recs++) { - if (dis1[i] == 1 || - (dis1[i] == 2 && !xdl_clean_mmatch(dis1, i, xdf1->dstart, xdf1->dend))) { + if (action1[i] == KEEP || + (action1[i] == INVESTIGATE && !xdl_clean_mmatch(action1, i, xdf1->dstart, xdf1->dend))) { xdf1->rindex[nreff++] = i; + /* changed[i] remains 0, i.e. keep */ } else xdf1->changed[i] = 1; + /* i.e. discard */ } xdf1->nreff = nreff; for (nreff = 0, i = xdf2->dstart, recs = &xdf2->recs[xdf2->dstart]; i <= xdf2->dend; i++, recs++) { - if (dis2[i] == 1 || - (dis2[i] == 2 && !xdl_clean_mmatch(dis2, i, xdf2->dstart, xdf2->dend))) { + if (action2[i] == KEEP || + (action2[i] == INVESTIGATE && !xdl_clean_mmatch(action2, i, xdf2->dstart, xdf2->dend))) { xdf2->rindex[nreff++] = i; + /* changed[i] remains 0, i.e. keep */ } else xdf2->changed[i] = 1; + /* i.e. discard */ } xdf2->nreff = nreff; - xdl_free(dis); +cleanup: + xdl_free(action1); + xdl_free(action2); - return 0; + return ret; } -- 2.47.3