]>
Commit | Line | Data |
---|---|---|
b279775f | 1 | /* Find near-matches for strings. |
f1717362 | 2 | Copyright (C) 2015-2016 Free Software Foundation, Inc. |
b67f0560 | 3 | |
4 | This file is part of GCC. | |
5 | ||
6 | GCC is free software; you can redistribute it and/or modify it under | |
7 | the terms of the GNU General Public License as published by the Free | |
8 | Software Foundation; either version 3, or (at your option) any later | |
9 | version. | |
10 | ||
11 | GCC is distributed in the hope that it will be useful, but WITHOUT ANY | |
12 | WARRANTY; without even the implied warranty of MERCHANTABILITY or | |
13 | FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License | |
14 | for more details. | |
15 | ||
16 | You should have received a copy of the GNU General Public License | |
17 | along with GCC; see the file COPYING3. If not see | |
18 | <http://www.gnu.org/licenses/>. */ | |
19 | ||
20 | #include "config.h" | |
21 | #include "system.h" | |
22 | #include "coretypes.h" | |
23 | #include "tm.h" | |
24 | #include "tree.h" | |
25 | #include "spellcheck.h" | |
26 | ||
27 | /* The Levenshtein distance is an "edit-distance": the minimal | |
28 | number of one-character insertions, removals or substitutions | |
29 | that are needed to change one string into another. | |
30 | ||
31 | This implementation uses the Wagner-Fischer algorithm. */ | |
32 | ||
b279775f | 33 | edit_distance_t |
b67f0560 | 34 | levenshtein_distance (const char *s, int len_s, |
35 | const char *t, int len_t) | |
36 | { | |
37 | const bool debug = false; | |
38 | ||
39 | if (debug) | |
40 | { | |
41 | printf ("s: \"%s\" (len_s=%i)\n", s, len_s); | |
42 | printf ("t: \"%s\" (len_t=%i)\n", t, len_t); | |
43 | } | |
44 | ||
45 | if (len_s == 0) | |
46 | return len_t; | |
47 | if (len_t == 0) | |
48 | return len_s; | |
49 | ||
50 | /* We effectively build a matrix where each (i, j) contains the | |
51 | Levenshtein distance between the prefix strings s[0:j] | |
52 | and t[0:i]. | |
53 | Rather than actually build an (len_t + 1) * (len_s + 1) matrix, | |
54 | we simply keep track of the last row, v0 and a new row, v1, | |
55 | which avoids an (len_t + 1) * (len_s + 1) allocation and memory accesses | |
56 | in favor of two (len_s + 1) allocations. These could potentially be | |
57 | statically-allocated if we impose a maximum length on the | |
58 | strings of interest. */ | |
59 | edit_distance_t *v0 = new edit_distance_t[len_s + 1]; | |
60 | edit_distance_t *v1 = new edit_distance_t[len_s + 1]; | |
61 | ||
62 | /* The first row is for the case of an empty target string, which | |
63 | we can reach by deleting every character in the source string. */ | |
64 | for (int i = 0; i < len_s + 1; i++) | |
65 | v0[i] = i; | |
66 | ||
67 | /* Build successive rows. */ | |
68 | for (int i = 0; i < len_t; i++) | |
69 | { | |
70 | if (debug) | |
71 | { | |
72 | printf ("i:%i v0 = ", i); | |
73 | for (int j = 0; j < len_s + 1; j++) | |
74 | printf ("%i ", v0[j]); | |
75 | printf ("\n"); | |
76 | } | |
77 | ||
78 | /* The initial column is for the case of an empty source string; we | |
79 | can reach prefixes of the target string of length i | |
80 | by inserting i characters. */ | |
81 | v1[0] = i + 1; | |
82 | ||
83 | /* Build the rest of the row by considering neighbours to | |
84 | the north, west and northwest. */ | |
85 | for (int j = 0; j < len_s; j++) | |
86 | { | |
87 | edit_distance_t cost = (s[j] == t[i] ? 0 : 1); | |
88 | edit_distance_t deletion = v1[j] + 1; | |
89 | edit_distance_t insertion = v0[j + 1] + 1; | |
90 | edit_distance_t substitution = v0[j] + cost; | |
91 | edit_distance_t cheapest = MIN (deletion, insertion); | |
92 | cheapest = MIN (cheapest, substitution); | |
93 | v1[j + 1] = cheapest; | |
94 | } | |
95 | ||
96 | /* Prepare to move on to next row. */ | |
97 | for (int j = 0; j < len_s + 1; j++) | |
98 | v0[j] = v1[j]; | |
99 | } | |
100 | ||
101 | if (debug) | |
102 | { | |
103 | printf ("final v1 = "); | |
104 | for (int j = 0; j < len_s + 1; j++) | |
105 | printf ("%i ", v1[j]); | |
106 | printf ("\n"); | |
107 | } | |
108 | ||
109 | edit_distance_t result = v1[len_s]; | |
110 | delete[] v0; | |
111 | delete[] v1; | |
112 | return result; | |
113 | } | |
114 | ||
b279775f | 115 | /* Calculate Levenshtein distance between two nil-terminated strings. */ |
b67f0560 | 116 | |
117 | edit_distance_t | |
118 | levenshtein_distance (const char *s, const char *t) | |
119 | { | |
120 | return levenshtein_distance (s, strlen (s), t, strlen (t)); | |
121 | } |