]>
Commit | Line | Data |
---|---|---|
8af84dad JS |
1 | #include "cache.h" |
2 | #include "levenshtein.h" | |
3 | ||
850fb6ff JS |
4 | /* |
5 | * This function implements the Damerau-Levenshtein algorithm to | |
6 | * calculate a distance between strings. | |
7 | * | |
8 | * Basically, it says how many letters need to be swapped, substituted, | |
9 | * deleted from, or added to string1, at least, to get string2. | |
10 | * | |
11 | * The idea is to build a distance matrix for the substrings of both | |
12 | * strings. To avoid a large space complexity, only the last three rows | |
13 | * are kept in memory (if swaps had the same or higher cost as one deletion | |
14 | * plus one insertion, only two rows would be needed). | |
15 | * | |
16 | * At any stage, "i + 1" denotes the length of the current substring of | |
17 | * string1 that the distance is calculated for. | |
18 | * | |
19 | * row2 holds the current row, row1 the previous row (i.e. for the substring | |
20 | * of string1 of length "i"), and row0 the row before that. | |
21 | * | |
22 | * In other words, at the start of the big loop, row2[j + 1] contains the | |
23 | * Damerau-Levenshtein distance between the substring of string1 of length | |
24 | * "i" and the substring of string2 of length "j + 1". | |
25 | * | |
26 | * All the big loop does is determine the partial minimum-cost paths. | |
27 | * | |
28 | * It does so by calculating the costs of the path ending in characters | |
29 | * i (in string1) and j (in string2), respectively, given that the last | |
3ea3c215 | 30 | * operation is a substitution, a swap, a deletion, or an insertion. |
850fb6ff JS |
31 | * |
32 | * This implementation allows the costs to be weighted: | |
33 | * | |
34 | * - w (as in "sWap") | |
35 | * - s (as in "Substitution") | |
36 | * - a (for insertion, AKA "Add") | |
37 | * - d (as in "Deletion") | |
38 | * | |
39 | * Note that this algorithm calculates a distance _iff_ d == a. | |
40 | */ | |
8af84dad JS |
41 | int levenshtein(const char *string1, const char *string2, |
42 | int w, int s, int a, int d) | |
43 | { | |
44 | int len1 = strlen(string1), len2 = strlen(string2); | |
b32fa95f | 45 | int *row0, *row1, *row2; |
8af84dad JS |
46 | int i, j; |
47 | ||
b32fa95f JK |
48 | ALLOC_ARRAY(row0, len2 + 1); |
49 | ALLOC_ARRAY(row1, len2 + 1); | |
50 | ALLOC_ARRAY(row2, len2 + 1); | |
51 | ||
8af84dad JS |
52 | for (j = 0; j <= len2; j++) |
53 | row1[j] = j * a; | |
54 | for (i = 0; i < len1; i++) { | |
55 | int *dummy; | |
56 | ||
57 | row2[0] = (i + 1) * d; | |
58 | for (j = 0; j < len2; j++) { | |
59 | /* substitution */ | |
60 | row2[j + 1] = row1[j] + s * (string1[i] != string2[j]); | |
61 | /* swap */ | |
62 | if (i > 0 && j > 0 && string1[i - 1] == string2[j] && | |
63 | string1[i] == string2[j - 1] && | |
64 | row2[j + 1] > row0[j - 1] + w) | |
65 | row2[j + 1] = row0[j - 1] + w; | |
66 | /* deletion */ | |
13c6bcd4 | 67 | if (row2[j + 1] > row1[j + 1] + d) |
8af84dad JS |
68 | row2[j + 1] = row1[j + 1] + d; |
69 | /* insertion */ | |
70 | if (row2[j + 1] > row2[j] + a) | |
71 | row2[j + 1] = row2[j] + a; | |
72 | } | |
73 | ||
74 | dummy = row0; | |
75 | row0 = row1; | |
76 | row1 = row2; | |
77 | row2 = dummy; | |
78 | } | |
79 | ||
80 | i = row1[len2]; | |
81 | free(row0); | |
82 | free(row1); | |
83 | free(row2); | |
84 | ||
85 | return i; | |
86 | } |