Update comments. Talk about vectors instead of files/strings, and about

author Bruno Haible <bruno@clisp.org>

Sat, 7 Oct 2006 16:25:52 +0000 (16:25 +0000)

committer Bruno Haible <bruno@clisp.org>

Sat, 7 Oct 2006 16:25:52 +0000 (16:25 +0000)
author Bruno Haible <bruno@clisp.org>
Sat, 7 Oct 2006 16:25:52 +0000 (16:25 +0000)
committer Bruno Haible <bruno@clisp.org>
Sat, 7 Oct 2006 16:25:52 +0000 (16:25 +0000)
diff --git a/lib/diffseq.h b/lib/diffseq.h

index ce4e3ed91d4b65088a4bdb78187b17468ba9d0d2..6a348ea8937ec460c818ebd4505334dc1a250f9c 100644 (file)
--- a/lib/diffseq.h
+++ b/lib/diffseq.h
@@ -1,4 +1,4 @@
-/* Analyze differences between two sequences.
+/* Analyze differences between two vectors.
     Copyright (C) 1988-1989, 1992-1995, 2001-2004, 2006 Free Software Foundation, Inc.
  
     This program is free software; you can redistribute it and/or modify
@@ -16,21 +16,30 @@
     Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.  */
  
  
-/* The basic algorithm is described in:
+/* The basic idea is to consider two vectors as similar if, when
+   transforming the first vector into the second vector through a
+   sequence of edits (inserts and deletes of one character each),
+   this sequence is short - or equivalently, if the ordered list
+   of elements that are untouched by these edits is long.  For a
+   good introduction to the subject, read about the "Levenshtein
+   distance" in Wikipedia.
+
+   The basic algorithm is described in:
     "An O(ND) Difference Algorithm and its Variations", Eugene Myers,
     Algorithmica Vol. 1 No. 2, 1986, pp. 251-266;
     see especially section 4.2, which describes the variation used below.
-   Unless the --minimal option is specified, this code uses the TOO_EXPENSIVE
-   heuristic, by Paul Eggert, to limit the cost to O(N**1.5 log N)
-   at the price of producing suboptimal output for large inputs with
-   many differences.
  
     The basic algorithm was independently discovered as described in:
     "Algorithms for Approximate String Matching", E. Ukkonen,
-   Information and Control Vol. 64, 1985, pp. 100-118.  */
+   Information and Control Vol. 64, 1985, pp. 100-118.
+
+   Unless the 'find_minimal' flag is set, this code uses the TOO_EXPENSIVE
+   heuristic, by Paul Eggert, to limit the cost to O(N**1.5 log N)
+   at the price of producing suboptimal output for large inputs with
+   many differences.  */
  
  /* Before including this file, you need to define:
-     ELEMENT                 The element type of the sequences being compared.
+     ELEMENT                 The element type of the vectors being compared.
       EQUAL                   A two-argument macro that tests two elements for
                               equality.
       OFFSET                  A signed integer type sufficient to hold the
@@ -70,10 +79,10 @@ struct partition
    bool hi_minimal;
  };
  
-/* Find the midpoint of the shortest edit script for a specified
-   portion of the two files.
+/* Find the midpoint of the shortest edit script for a specified portion
+   of the two vectors.
  
-   Scan from the beginnings of the files, and simultaneously from the ends,
+   Scan from the beginnings of the vectors, and simultaneously from the ends,
     doing a breadth-first search through the space of edit-sequence.
     When the two searches meet, we have found the midpoint of the shortest
     edit sequence.
@@ -83,20 +92,19 @@ struct partition
     heuristics to stop the search and report a suboptimal answer.
  
     Set PART->(xmid,ymid) to the midpoint (XMID,YMID).  The diagonal number
-   XMID - YMID equals the number of inserted lines minus the number
-   of deleted lines (counting only lines before the midpoint).
+   XMID - YMID equals the number of inserted elements minus the number
+   of deleted elements (counting only elements before the midpoint).
  
     Set PART->lo_minimal to true iff the minimal edit script for the
     left half of the partition is known; similarly for PART->hi_minimal.
  
-   This function assumes that the first lines of the specified portions
-   of the two files do not match, and likewise that the last lines do not
-   match.  The caller must trim matching lines from the beginning and end
+   This function assumes that the first elements of the specified portions
+   of the two vectors do not match, and likewise that the last elements do not
+   match.  The caller must trim matching elements from the beginning and end
     of the portions it is going to specify.
  
-   If we return the "wrong" partitions,
-   the worst this can do is cause suboptimal diff output.
-   It cannot cause incorrect diff output.  */
+   If we return the "wrong" partitions, the worst this can do is cause
+   suboptimal diff output.  It cannot cause incorrect diff output.  */
  
  static void
  diag (OFFSET xoff, OFFSET xlim, OFFSET yoff, OFFSET ylim, bool find_minimal,
@@ -204,8 +212,8 @@ diag (OFFSET xoff, OFFSET xlim, OFFSET yoff, OFFSET ylim, bool find_minimal,
          If we have any such, find the one that has made the most
          progress and return it as if it had succeeded.
  
-        With this heuristic, for files with a constant small density
-        of changes, the algorithm is linear in the file size.  */
+        With this heuristic, for vectors with a constant small density
+        of changes, the algorithm is linear in the vector size.  */
  
        if (c > 200 && big_snake && speed_large_files)
         {
@@ -340,16 +348,16 @@ diag (OFFSET xoff, OFFSET xlim, OFFSET yoff, OFFSET ylim, bool find_minimal,
      }
  }
  
-/* Compare in detail contiguous subsequences of the two files
+/* Compare in detail contiguous subsequences of the two vectors
     which are known, as a whole, to match each other.
  
     The results are recorded in the vectors files[N].changed, by
     storing 1 in the element for each line that is an insertion or deletion.
  
-   The subsequence of file 0 is [XOFF, XLIM) and likewise for file 1.
+   The subsequence of vector 0 is [XOFF, XLIM) and likewise for vector 1.
  
     Note that XLIM, YLIM are exclusive bounds.
-   All line numbers are origin-0 and discarded lines are not counted.
+   All line numbers are origin-0 and discarded elements are not counted.
  
     If FIND_MINIMAL, find a minimal difference no matter how
     expensive it is.  */
@@ -384,7 +392,7 @@ compareseq (OFFSET xoff, OFFSET xlim, OFFSET yoff, OFFSET ylim, bool find_minima
      {
        struct partition part IF_LINT (= {0});
  
-      /* Find a point of correspondence in the middle of the files.  */
+      /* Find a point of correspondence in the middle of the vectors.  */
        diag (xoff, xlim, yoff, ylim, find_minimal, &part);
  
        /* Use the partitions to split this problem into subproblems.  */
diff --git a/lib/fstrcmp.c b/lib/fstrcmp.c

index 3368a4e01c0ab50d46c5736c7f8206eba762eab5..e9b4d0ee20575acddc5dd9d9c3a9d645cddd4e15 100644 (file)
--- a/lib/fstrcmp.c
+++ b/lib/fstrcmp.c
@@ -18,11 +18,11 @@
  
     Derived from GNU diff 2.7, analyze.c et al.
  
-   The basic idea is to consider two strings as similar if, when
-   transforming the first string into the second string through a
+   The basic idea is to consider two sequences as similar if, when
+   transforming the first sequence into the second sequence through a
     sequence of edits (inserts and deletes of one character each),
     this sequence is short - or equivalently, if the ordered list
-   of characters that are untouched by these edits is long.  For a
+   of elements that are untouched by these edits is long.  For a
     good introduction to the subject, read about the "Levenshtein
     distance" in Wikipedia.
  
@@ -35,13 +35,10 @@
     "Algorithms for Approximate String Matching", E. Ukkonen,
     Information and Control Vol. 64, 1985, pp. 100-118.
  
-   Unless the 'minimal' flag is set, this code uses the TOO_EXPENSIVE
+   Unless the 'find_minimal' flag is set, this code uses the TOO_EXPENSIVE
     heuristic, by Paul Eggert, to limit the cost to O(N**1.5 log N)
     at the price of producing suboptimal output for large inputs with
-   many differences.
-
-   Modified to work on strings rather than files
-   by Peter Miller <pmiller@agso.gov.au>, October 1995 */
+   many differences.  */
  
  #include <config.h>
  
@@ -67,10 +64,6 @@
  #define EQUAL(x,y) ((x) == (y))
  #define OFFSET int
  
-/* Maximum value of type OFFSET.  */
-#define OFFSET_MAX \
-  ((((OFFSET)1 << (sizeof (OFFSET_MAX) * CHAR_BIT - 2)) - 1) * 2 + 1)
-
  /* Before including this file, you need to define:
       ELEMENT                 The element type of the sequences being compared.
       EQUAL                   A two-argument macro that tests two elements for
@@ -79,6 +72,10 @@
                               difference between two indices. Usually
                               something like ssize_t.  */
  
+/* Maximum value of type OFFSET.  */
+#define OFFSET_MAX \
+  ((((OFFSET)1 << (sizeof (OFFSET_MAX) * CHAR_BIT - 2)) - 1) * 2 + 1)
+
  /*
   * Context of comparison operation.
   */
@@ -95,7 +92,7 @@ struct context
      /* The length of the string to be compared. */
      int data_length;
  
-    /* The number of characters inserted or deleted. */
+    /* The number of elements inserted or deleted. */
      int edit_count;
    }
    string[2];
@@ -103,8 +100,8 @@ struct context
    #ifdef MINUS_H_FLAG
  
    /* This corresponds to the diff -H flag.  With this heuristic, for
-     strings with a constant small density of changes, the algorithm is
-     linear in the strings size.  This is unlikely in typical uses of
+     vectors with a constant small density of changes, the algorithm is
+     linear in the vectors size.  This is unlikely in typical uses of
       fstrcmp, and so is usually compiled out.  Besides, there is no
       interface to set it true.  */
    int heuristic;
@@ -150,9 +147,9 @@ struct partition
  
     DESCRIPTION
         Find the midpoint of the shortest edit script for a specified
-       portion of the two strings.
+       portion of the two vectors.
  
-       Scan from the beginnings of the strings, and simultaneously from
+       Scan from the beginnings of the vectors, and simultaneously from
         the ends, doing a breadth-first search through the space of
         edit-sequence.  When the two searches meet, we have found the
         midpoint of the shortest edit sequence.
@@ -163,22 +160,22 @@ struct partition
  
     RETURNS
         Set PART->(XMID,YMID) to the midpoint (XMID,YMID).  The diagonal
-       number XMID - YMID equals the number of inserted characters
-       minus the number of deleted characters (counting only characters
+       number XMID - YMID equals the number of inserted elements
+       minus the number of deleted elements (counting only elements
         before the midpoint).  Return the approximate edit cost; this is
-       the total number of characters inserted or deleted (counting
-       only characters before the midpoint), unless a heuristic is used
+       the total number of elements inserted or deleted (counting
+       only elements before the midpoint), unless a heuristic is used
         to terminate the search prematurely.
  
-       Set PART->LEFT_MINIMAL to nonzero iff the minimal edit script
+       Set PART->lo_minimal to nonzero iff the minimal edit script
         for the left half of the partition is known; similarly for
-       PART->RIGHT_MINIMAL.
+       PART->hi_minimal.
  
     CAVEAT
-       This function assumes that the first characters of the specified
-       portions of the two strings do not match, and likewise that the
-       last characters do not match.  The caller must trim matching
-       characters from the beginning and end of the portions it is
+       This function assumes that the first elements of the specified
+       portions of the two vectors do not match, and likewise that the
+       last elements do not match.  The caller must trim matching
+       elements from the beginning and end of the portions it is
         going to specify.
  
         If we return the "wrong" partitions, the worst this can do is
@@ -310,8 +307,8 @@ diag (OFFSET xoff, OFFSET xlim, OFFSET yoff, OFFSET ylim, bool find_minimal,
           such, find the one that has made the most progress and return
           it as if it had succeeded.
  
-         With this heuristic, for strings with a constant small density
-         of changes, the algorithm is linear in the strings size.  */
+         With this heuristic, for vectors with a constant small density
+         of changes, the algorithm is linear in the vector size.  */
        if (c > 200 && big_snake && ctxt->heuristic)
         {
           OFFSET best;
@@ -487,11 +484,11 @@ diag (OFFSET xoff, OFFSET xlim, OFFSET yoff, OFFSET ylim, bool find_minimal,
                         struct context *ctxt);
  
     DESCRIPTION
-       Compare in detail contiguous subsequences of the two strings
+       Compare in detail contiguous subsequences of the two vectors
         which are known, as a whole, to match each other.
  
-       The subsequence of string 0 is [XOFF, XLIM) and likewise for
-       string 1.
+       The subsequence of vector 0 is [XOFF, XLIM) and likewise for
+       vector 1.
  
         Note that XLIM, YLIM are exclusive bounds.  All character
         numbers are origin-0.
@@ -542,7 +539,7 @@ compareseq (OFFSET xoff, OFFSET xlim, OFFSET yoff, OFFSET ylim, bool find_minima
        OFFSET c;
        struct partition part;
  
-      /* Find a point of correspondence in the middle of the strings.  */
+      /* Find a point of correspondence in the middle of the vectors.  */
        c = diag (xoff, xlim, yoff, ylim, find_minimal, &part, ctxt);
        if (c == 1)
         {
author	Bruno Haible <bruno@clisp.org>
	Sat, 7 Oct 2006 16:25:52 +0000 (16:25 +0000)
committer	Bruno Haible <bruno@clisp.org>
	Sat, 7 Oct 2006 16:25:52 +0000 (16:25 +0000)
lib/diffseq.h		patch \| blob \| blame \| history
lib/fstrcmp.c		patch \| blob \| blame \| history