From: Bruno Haible Date: Sat, 18 Aug 2007 11:11:06 +0000 (+0000) Subject: Move module fstrcmp to gnulib. X-Git-Tag: v0.17~289 X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=7117ce746fe02dde115d5ba2906c82f91854b902;p=thirdparty%2Fgettext.git Move module fstrcmp to gnulib. --- diff --git a/gnulib-local/ChangeLog b/gnulib-local/ChangeLog index 426534456..10c9d11c6 100644 --- a/gnulib-local/ChangeLog +++ b/gnulib-local/ChangeLog @@ -1,3 +1,11 @@ +2007-08-18 Bruno Haible + + * modules/fstrcmp: Remove file, moved to gnulib. + * lib/fstrcmp.h: Remove file, moved to gnulib. + * lib/fstrcmp.c: Remove file, moved to gnulib. + * lib/diffseq.h: Remove file, moved to gnulib. + * Makefile.am (EXTRA_DIST): Remove them. + 2007-07-01 Bruno Haible * build-aux/moopp (func_version): Use the standard --version output, diff --git a/gnulib-local/lib/diffseq.h b/gnulib-local/lib/diffseq.h deleted file mode 100644 index 59da5fc5d..000000000 --- a/gnulib-local/lib/diffseq.h +++ /dev/null @@ -1,481 +0,0 @@ -/* Analyze differences between two vectors. - Copyright (C) 1988-1989, 1992-1995, 2001-2004, 2006 Free Software Foundation, Inc. - - This program is free software; you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation; either version 2, or (at your option) - any later version. - - This program is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program; if not, write to the Free Software Foundation, - Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. */ - - -/* The basic idea is to consider two vectors as similar if, when - transforming the first vector into the second vector through a - sequence of edits (inserts and deletes of one element each), - this sequence is short - or equivalently, if the ordered list - of elements that are untouched by these edits is long. For a - good introduction to the subject, read about the "Levenshtein - distance" in Wikipedia. - - The basic algorithm is described in: - "An O(ND) Difference Algorithm and its Variations", Eugene Myers, - Algorithmica Vol. 1 No. 2, 1986, pp. 251-266; - see especially section 4.2, which describes the variation used below. - - The basic algorithm was independently discovered as described in: - "Algorithms for Approximate String Matching", E. Ukkonen, - Information and Control Vol. 64, 1985, pp. 100-118. - - Unless the 'find_minimal' flag is set, this code uses the TOO_EXPENSIVE - heuristic, by Paul Eggert, to limit the cost to O(N**1.5 log N) - at the price of producing suboptimal output for large inputs with - many differences. */ - -/* Before including this file, you need to define: - ELEMENT The element type of the vectors being compared. - EQUAL A two-argument macro that tests two elements for - equality. - OFFSET A signed integer type sufficient to hold the - difference between two indices. Usually - something like ssize_t. - EXTRA_CONTEXT_FIELDS Declarations of fields for 'struct context'. - NOTE_DELETE(ctxt, xoff) Record the removal of the object xvec[xoff]. - NOTE_INSERT(ctxt, yoff) Record the insertion of the object yvec[yoff]. - USE_HEURISTIC (Optional) Define if you want to support the - heuristic for large vectors. */ - -/* Maximum value of type OFFSET. */ -#define OFFSET_MAX \ - ((((OFFSET)1 << (sizeof (OFFSET) * CHAR_BIT - 2)) - 1) * 2 + 1) - -/* Use this to suppress gcc's `...may be used before initialized' warnings. */ -#ifndef IF_LINT -# ifdef lint -# define IF_LINT(Code) Code -# else -# define IF_LINT(Code) /* empty */ -# endif -#endif - -/* - * Context of comparison operation. - */ -struct context -{ - /* Vectors being compared. */ - const ELEMENT *xvec; - const ELEMENT *yvec; - - /* Extra fields. */ - EXTRA_CONTEXT_FIELDS - - /* Vector, indexed by diagonal, containing 1 + the X coordinate of the point - furthest along the given diagonal in the forward search of the edit - matrix. */ - OFFSET *fdiag; - - /* Vector, indexed by diagonal, containing the X coordinate of the point - furthest along the given diagonal in the backward search of the edit - matrix. */ - OFFSET *bdiag; - - #ifdef USE_HEURISTIC - /* This corresponds to the diff -H flag. With this heuristic, for - vectors with a constant small density of changes, the algorithm is - linear in the vectors size. */ - bool heuristic; - #endif - - /* Edit scripts longer than this are too expensive to compute. */ - OFFSET too_expensive; - - /* Snakes bigger than this are considered `big'. */ - #define SNAKE_LIMIT 20 -}; - -struct partition -{ - /* Midpoints of this partition. */ - OFFSET xmid; - OFFSET ymid; - - /* True if low half will be analyzed minimally. */ - bool lo_minimal; - - /* Likewise for high half. */ - bool hi_minimal; -}; - - -/* Find the midpoint of the shortest edit script for a specified portion - of the two vectors. - - Scan from the beginnings of the vectors, and simultaneously from the ends, - doing a breadth-first search through the space of edit-sequence. - When the two searches meet, we have found the midpoint of the shortest - edit sequence. - - If FIND_MINIMAL is true, find the minimal edit script regardless of - expense. Otherwise, if the search is too expensive, use heuristics to - stop the search and report a suboptimal answer. - - Set PART->(xmid,ymid) to the midpoint (XMID,YMID). The diagonal number - XMID - YMID equals the number of inserted elements minus the number - of deleted elements (counting only elements before the midpoint). - - Set PART->lo_minimal to true iff the minimal edit script for the - left half of the partition is known; similarly for PART->hi_minimal. - - This function assumes that the first elements of the specified portions - of the two vectors do not match, and likewise that the last elements do not - match. The caller must trim matching elements from the beginning and end - of the portions it is going to specify. - - If we return the "wrong" partitions, the worst this can do is cause - suboptimal diff output. It cannot cause incorrect diff output. */ - -static void -diag (OFFSET xoff, OFFSET xlim, OFFSET yoff, OFFSET ylim, bool find_minimal, - struct partition *part, struct context *ctxt) -{ - OFFSET *const fd = ctxt->fdiag; /* Give the compiler a chance. */ - OFFSET *const bd = ctxt->bdiag; /* Additional help for the compiler. */ - const ELEMENT *const xv = ctxt->xvec; /* Still more help for the compiler. */ - const ELEMENT *const yv = ctxt->yvec; /* And more and more . . . */ - const OFFSET dmin = xoff - ylim; /* Minimum valid diagonal. */ - const OFFSET dmax = xlim - yoff; /* Maximum valid diagonal. */ - const OFFSET fmid = xoff - yoff; /* Center diagonal of top-down search. */ - const OFFSET bmid = xlim - ylim; /* Center diagonal of bottom-up search. */ - OFFSET fmin = fmid; - OFFSET fmax = fmid; /* Limits of top-down search. */ - OFFSET bmin = bmid; - OFFSET bmax = bmid; /* Limits of bottom-up search. */ - OFFSET c; /* Cost. */ - bool odd = (fmid - bmid) & 1; /* True if southeast corner is on an odd - diagonal with respect to the northwest. */ - - fd[fmid] = xoff; - bd[bmid] = xlim; - - for (c = 1;; ++c) - { - OFFSET d; /* Active diagonal. */ - bool big_snake = false; - - /* Extend the top-down search by an edit step in each diagonal. */ - if (fmin > dmin) - fd[--fmin - 1] = -1; - else - ++fmin; - if (fmax < dmax) - fd[++fmax + 1] = -1; - else - --fmax; - for (d = fmax; d >= fmin; d -= 2) - { - OFFSET x; - OFFSET y; - OFFSET oldx; - OFFSET tlo = fd[d - 1]; - OFFSET thi = fd[d + 1]; - - if (tlo >= thi) - x = tlo + 1; - else - x = thi; - oldx = x; - y = x - d; - while (x < xlim && y < ylim && EQUAL (xv[x], yv[y])) - { - ++x; - ++y; - } - if (x - oldx > SNAKE_LIMIT) - big_snake = true; - fd[d] = x; - if (odd && bmin <= d && d <= bmax && bd[d] <= x) - { - part->xmid = x; - part->ymid = y; - part->lo_minimal = part->hi_minimal = true; - return; - } - } - - /* Similarly extend the bottom-up search. */ - if (bmin > dmin) - bd[--bmin - 1] = OFFSET_MAX; - else - ++bmin; - if (bmax < dmax) - bd[++bmax + 1] = OFFSET_MAX; - else - --bmax; - for (d = bmax; d >= bmin; d -= 2) - { - OFFSET x; - OFFSET y; - OFFSET oldx; - OFFSET tlo = bd[d - 1]; - OFFSET thi = bd[d + 1]; - - if (tlo < thi) - x = tlo; - else - x = thi - 1; - oldx = x; - y = x - d; - while (x > xoff && y > yoff && EQUAL (xv[x - 1], yv[y - 1])) - { - --x; - --y; - } - if (oldx - x > SNAKE_LIMIT) - big_snake = true; - bd[d] = x; - if (!odd && fmin <= d && d <= fmax && x <= fd[d]) - { - part->xmid = x; - part->ymid = y; - part->lo_minimal = part->hi_minimal = true; - return; - } - } - - if (find_minimal) - continue; - -#ifdef USE_HEURISTIC - /* Heuristic: check occasionally for a diagonal that has made lots - of progress compared with the edit distance. If we have any - such, find the one that has made the most progress and return it - as if it had succeeded. - - With this heuristic, for vectors with a constant small density - of changes, the algorithm is linear in the vector size. */ - - if (c > 200 && big_snake && ctxt->heuristic) - { - OFFSET best; - - best = 0; - for (d = fmax; d >= fmin; d -= 2) - { - OFFSET dd = d - fmid; - OFFSET x = fd[d]; - OFFSET y = x - d; - OFFSET v = (x - xoff) * 2 - dd; - - if (v > 12 * (c + (dd < 0 ? -dd : dd))) - { - if (v > best - && xoff + SNAKE_LIMIT <= x && x < xlim - && yoff + SNAKE_LIMIT <= y && y < ylim) - { - /* We have a good enough best diagonal; now insist - that it end with a significant snake. */ - int k; - - for (k = 1; EQUAL (xv[x - k], yv[y - k]); k++) - if (k == SNAKE_LIMIT) - { - best = v; - part->xmid = x; - part->ymid = y; - break; - } - } - } - } - if (best > 0) - { - part->lo_minimal = true; - part->hi_minimal = false; - return; - } - - best = 0; - for (d = bmax; d >= bmin; d -= 2) - { - OFFSET dd = d - bmid; - OFFSET x = bd[d]; - OFFSET y = x - d; - OFFSET v = (xlim - x) * 2 + dd; - - if (v > 12 * (c + (dd < 0 ? -dd : dd))) - { - if (v > best - && xoff < x && x <= xlim - SNAKE_LIMIT - && yoff < y && y <= ylim - SNAKE_LIMIT) - { - /* We have a good enough best diagonal; now insist - that it end with a significant snake. */ - int k; - - for (k = 0; EQUAL (xv[x + k], yv[y + k]); k++) - if (k == SNAKE_LIMIT - 1) - { - best = v; - part->xmid = x; - part->ymid = y; - break; - } - } - } - } - if (best > 0) - { - part->lo_minimal = false; - part->hi_minimal = true; - return; - } - } -#endif /* USE_HEURISTIC */ - - /* Heuristic: if we've gone well beyond the call of duty, give up - and report halfway between our best results so far. */ - if (c >= ctxt->too_expensive) - { - OFFSET fxybest; - OFFSET fxbest IF_LINT (= 0); - OFFSET bxybest; - OFFSET bxbest IF_LINT (= 0); - - /* Find forward diagonal that maximizes X + Y. */ - fxybest = -1; - for (d = fmax; d >= fmin; d -= 2) - { - OFFSET x; - OFFSET y; - - x = MIN (fd[d], xlim); - y = x - d; - if (ylim < y) - { - x = ylim + d; - y = ylim; - } - if (fxybest < x + y) - { - fxybest = x + y; - fxbest = x; - } - } - - /* Find backward diagonal that minimizes X + Y. */ - bxybest = OFFSET_MAX; - for (d = bmax; d >= bmin; d -= 2) - { - OFFSET x; - OFFSET y; - - x = MAX (xoff, bd[d]); - y = x - d; - if (y < yoff) - { - x = yoff + d; - y = yoff; - } - if (x + y < bxybest) - { - bxybest = x + y; - bxbest = x; - } - } - - /* Use the better of the two diagonals. */ - if ((xlim + ylim) - bxybest < fxybest - (xoff + yoff)) - { - part->xmid = fxbest; - part->ymid = fxybest - fxbest; - part->lo_minimal = true; - part->hi_minimal = false; - } - else - { - part->xmid = bxbest; - part->ymid = bxybest - bxbest; - part->lo_minimal = false; - part->hi_minimal = true; - } - return; - } - } -} - - -/* Compare in detail contiguous subsequences of the two vectors - which are known, as a whole, to match each other. - - The subsequence of vector 0 is [XOFF, XLIM) and likewise for vector 1. - - Note that XLIM, YLIM are exclusive bounds. All indices into the vectors - are origin-0. - - If FIND_MINIMAL, find a minimal difference no matter how - expensive it is. - - The results are recorded by invoking NOTE_DELETE and NOTE_INSERT. */ - -static void -compareseq (OFFSET xoff, OFFSET xlim, OFFSET yoff, OFFSET ylim, - bool find_minimal, struct context *ctxt) -{ - const ELEMENT *const xv = ctxt->xvec; /* Help the compiler. */ - const ELEMENT *const yv = ctxt->yvec; - - /* Slide down the bottom initial diagonal. */ - while (xoff < xlim && yoff < ylim && EQUAL (xv[xoff], yv[yoff])) - { - ++xoff; - ++yoff; - } - - /* Slide up the top initial diagonal. */ - while (xlim > xoff && ylim > yoff && EQUAL (xv[xlim - 1], yv[ylim - 1])) - { - --xlim; - --ylim; - } - - /* Handle simple cases. */ - if (xoff == xlim) - while (yoff < ylim) - { - NOTE_INSERT (ctxt, yoff); - yoff++; - } - else if (yoff == ylim) - while (xoff < xlim) - { - NOTE_DELETE (ctxt, xoff); - xoff++; - } - else - { - struct partition part; - - /* Find a point of correspondence in the middle of the vectors. */ - diag (xoff, xlim, yoff, ylim, find_minimal, &part, ctxt); - - /* Use the partitions to split this problem into subproblems. */ - compareseq (xoff, part.xmid, yoff, part.ymid, part.lo_minimal, ctxt); - compareseq (part.xmid, xlim, part.ymid, ylim, part.hi_minimal, ctxt); - } -} - -#undef ELEMENT -#undef EQUAL -#undef OFFSET -#undef EXTRA_CONTEXT_FIELDS -#undef NOTE_DELETE -#undef NOTE_INSERT -#undef USE_HEURISTIC -#undef OFFSET_MAX diff --git a/gnulib-local/lib/fstrcmp.c b/gnulib-local/lib/fstrcmp.c deleted file mode 100644 index 90141d9d7..000000000 --- a/gnulib-local/lib/fstrcmp.c +++ /dev/null @@ -1,186 +0,0 @@ -/* Functions to make fuzzy comparisons between strings - Copyright (C) 1988-1989, 1992-1993, 1995, 2001-2003, 2006 Free Software Foundation, Inc. - - This program is free software; you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation; either version 2, or (at your option) - any later version. - - This program is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program; if not, write to the Free Software Foundation, - Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. - - - Derived from GNU diff 2.7, analyze.c et al. - - The basic idea is to consider two vectors as similar if, when - transforming the first vector into the second vector through a - sequence of edits (inserts and deletes of one element each), - this sequence is short - or equivalently, if the ordered list - of elements that are untouched by these edits is long. For a - good introduction to the subject, read about the "Levenshtein - distance" in Wikipedia. - - The basic algorithm is described in: - "An O(ND) Difference Algorithm and its Variations", Eugene Myers, - Algorithmica Vol. 1 No. 2, 1986, pp. 251-266; - see especially section 4.2, which describes the variation used below. - - The basic algorithm was independently discovered as described in: - "Algorithms for Approximate String Matching", E. Ukkonen, - Information and Control Vol. 64, 1985, pp. 100-118. - - Unless the 'find_minimal' flag is set, this code uses the TOO_EXPENSIVE - heuristic, by Paul Eggert, to limit the cost to O(N**1.5 log N) - at the price of producing suboptimal output for large inputs with - many differences. */ - -#include - -/* Specification. */ -#include "fstrcmp.h" - -#include -#include -#include -#include -#include - -#include "lock.h" -#include "tls.h" -#include "minmax.h" -#include "xalloc.h" - -#ifndef uintptr_t -# define uintptr_t unsigned long -#endif - - -#define ELEMENT char -#define EQUAL(x,y) ((x) == (y)) -#define OFFSET int -#define EXTRA_CONTEXT_FIELDS \ - /* The number of elements inserted or deleted. */ \ - int xvec_edit_count; \ - int yvec_edit_count; -#define NOTE_DELETE(ctxt, xoff) ctxt->xvec_edit_count++ -#define NOTE_INSERT(ctxt, yoff) ctxt->yvec_edit_count++ -/* We don't need USE_HEURISTIC, since it is unlikely in typical uses of - fstrcmp(). */ -#include "diffseq.h" - - -/* Because fstrcmp is typically called multiple times, attempt to minimize - the number of memory allocations performed. Thus, let a call reuse the - memory already allocated by the previous call, if it is sufficient. - To make it multithread-safe, without need for a lock that protects the - already allocated memory, store the allocated memory per thread. Free - it only when the thread exits. */ - -static gl_tls_key_t buffer_key; /* TLS key for a 'int *' */ -static gl_tls_key_t bufmax_key; /* TLS key for a 'size_t' */ - -static void -keys_init (void) -{ - gl_tls_key_init (buffer_key, free); - gl_tls_key_init (bufmax_key, NULL); - /* The per-thread initial values are NULL and 0, respectively. */ -} - -/* Ensure that keys_init is called once only. */ -gl_once_define(static, keys_init_once) - - -/* NAME - fstrcmp - fuzzy string compare - - SYNOPSIS - double fstrcmp(const char *, const char *); - - DESCRIPTION - The fstrcmp function may be used to compare two string for - similarity. It is very useful in reducing "cascade" or - "secondary" errors in compilers or other situations where - symbol tables occur. - - RETURNS - double; 0 if the strings are entirly dissimilar, 1 if the - strings are identical, and a number in between if they are - similar. */ - -double -fstrcmp (const char *string1, const char *string2) -{ - struct context ctxt; - int xvec_length; - int yvec_length; - int i; - - size_t fdiag_len; - int *buffer; - size_t bufmax; - - /* set the info for each string. */ - ctxt.xvec = string1; - xvec_length = strlen (string1); - ctxt.yvec = string2; - yvec_length = strlen (string2); - - /* short-circuit obvious comparisons */ - if (xvec_length == 0 && yvec_length == 0) - return 1.0; - if (xvec_length == 0 || yvec_length == 0) - return 0.0; - - /* Set TOO_EXPENSIVE to be approximate square root of input size, - bounded below by 256. */ - ctxt.too_expensive = 1; - for (i = xvec_length + yvec_length; - i != 0; - i >>= 2) - ctxt.too_expensive <<= 1; - if (ctxt.too_expensive < 256) - ctxt.too_expensive = 256; - - /* Allocate memory for fdiag and bdiag from a thread-local pool. */ - fdiag_len = xvec_length + yvec_length + 3; - gl_once (keys_init_once, keys_init); - buffer = (int *) gl_tls_get (buffer_key); - bufmax = (size_t) (uintptr_t) gl_tls_get (bufmax_key); - if (fdiag_len > bufmax) - { - /* Need more memory. */ - bufmax = 2 * bufmax; - if (fdiag_len > bufmax) - bufmax = fdiag_len; - /* Calling xrealloc would be a waste: buffer's contents does not need - to be preserved. */ - if (buffer != NULL) - free (buffer); - buffer = (int *) xnmalloc (bufmax, 2 * sizeof (int)); - gl_tls_set (buffer_key, buffer); - gl_tls_set (bufmax_key, (void *) (uintptr_t) bufmax); - } - ctxt.fdiag = buffer + yvec_length + 1; - ctxt.bdiag = ctxt.fdiag + fdiag_len; - - /* Now do the main comparison algorithm */ - ctxt.xvec_edit_count = 0; - ctxt.yvec_edit_count = 0; - compareseq (0, xvec_length, 0, yvec_length, 0, - &ctxt); - - /* The result is - ((number of chars in common) / (average length of the strings)). - This is admittedly biased towards finding that the strings are - similar, however it does produce meaningful results. */ - return ((double) (xvec_length + yvec_length - - ctxt.yvec_edit_count - ctxt.xvec_edit_count) - / (xvec_length + yvec_length)); -} diff --git a/gnulib-local/lib/fstrcmp.h b/gnulib-local/lib/fstrcmp.h deleted file mode 100644 index 4671c55d0..000000000 --- a/gnulib-local/lib/fstrcmp.h +++ /dev/null @@ -1,35 +0,0 @@ -/* Fuzzy string comparison. - Copyright (C) 1995, 2000, 2002-2003, 2006 Free Software Foundation, Inc. - - This file was written by Peter Miller - -This program is free software; you can redistribute it and/or modify -it under the terms of the GNU General Public License as published by -the Free Software Foundation; either version 2, or (at your option) -any later version. - -This program is distributed in the hope that it will be useful, -but WITHOUT ANY WARRANTY; without even the implied warranty of -MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -GNU General Public License for more details. - -You should have received a copy of the GNU General Public License -along with this program; if not, write to the Free Software Foundation, -Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. */ - -#ifndef _FSTRCMP_H -#define _FSTRCMP_H - -#ifdef __cplusplus -extern "C" { -#endif - -/* Fuzzy compare of S1 and S2. Return a measure for the similarity of S1 - and S1. The higher the result, the more similar the strings are. */ -extern double fstrcmp (const char *s1, const char *s2); - -#ifdef __cplusplus -} -#endif - -#endif diff --git a/gnulib-local/modules/fstrcmp b/gnulib-local/modules/fstrcmp deleted file mode 100644 index db14d5d40..000000000 --- a/gnulib-local/modules/fstrcmp +++ /dev/null @@ -1,28 +0,0 @@ -Description: -Fuzzy string comparison. - -Files: -lib/fstrcmp.h -lib/fstrcmp.c -lib/diffseq.h - -Depends-on: -lock -tls -minmax -xalloc - -configure.ac: - -Makefile.am: -lib_SOURCES += fstrcmp.h fstrcmp.c - -Include: -"fstrcmp.h" - -License: -GPL - -Maintainer: -Bruno Haible -