]>
Commit | Line | Data |
---|---|---|
93ebf1fd | 1 | /* Find near-matches for strings. |
85ec4feb | 2 | Copyright (C) 2015-2018 Free Software Foundation, Inc. |
277fe616 DM |
3 | |
4 | This file is part of GCC. | |
5 | ||
6 | GCC is free software; you can redistribute it and/or modify it under | |
7 | the terms of the GNU General Public License as published by the Free | |
8 | Software Foundation; either version 3, or (at your option) any later | |
9 | version. | |
10 | ||
11 | GCC is distributed in the hope that it will be useful, but WITHOUT ANY | |
12 | WARRANTY; without even the implied warranty of MERCHANTABILITY or | |
13 | FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License | |
14 | for more details. | |
15 | ||
16 | You should have received a copy of the GNU General Public License | |
17 | along with GCC; see the file COPYING3. If not see | |
18 | <http://www.gnu.org/licenses/>. */ | |
19 | ||
20 | #include "config.h" | |
21 | #include "system.h" | |
22 | #include "coretypes.h" | |
23 | #include "tm.h" | |
24 | #include "tree.h" | |
25 | #include "spellcheck.h" | |
d9b950dd | 26 | #include "selftest.h" |
277fe616 DM |
27 | |
28 | /* The Levenshtein distance is an "edit-distance": the minimal | |
29 | number of one-character insertions, removals or substitutions | |
30 | that are needed to change one string into another. | |
31 | ||
32 | This implementation uses the Wagner-Fischer algorithm. */ | |
33 | ||
93ebf1fd | 34 | edit_distance_t |
277fe616 DM |
35 | levenshtein_distance (const char *s, int len_s, |
36 | const char *t, int len_t) | |
37 | { | |
38 | const bool debug = false; | |
39 | ||
40 | if (debug) | |
41 | { | |
42 | printf ("s: \"%s\" (len_s=%i)\n", s, len_s); | |
43 | printf ("t: \"%s\" (len_t=%i)\n", t, len_t); | |
44 | } | |
45 | ||
46 | if (len_s == 0) | |
47 | return len_t; | |
48 | if (len_t == 0) | |
49 | return len_s; | |
50 | ||
51 | /* We effectively build a matrix where each (i, j) contains the | |
52 | Levenshtein distance between the prefix strings s[0:j] | |
53 | and t[0:i]. | |
54 | Rather than actually build an (len_t + 1) * (len_s + 1) matrix, | |
55 | we simply keep track of the last row, v0 and a new row, v1, | |
56 | which avoids an (len_t + 1) * (len_s + 1) allocation and memory accesses | |
57 | in favor of two (len_s + 1) allocations. These could potentially be | |
58 | statically-allocated if we impose a maximum length on the | |
59 | strings of interest. */ | |
60 | edit_distance_t *v0 = new edit_distance_t[len_s + 1]; | |
61 | edit_distance_t *v1 = new edit_distance_t[len_s + 1]; | |
62 | ||
63 | /* The first row is for the case of an empty target string, which | |
64 | we can reach by deleting every character in the source string. */ | |
65 | for (int i = 0; i < len_s + 1; i++) | |
66 | v0[i] = i; | |
67 | ||
68 | /* Build successive rows. */ | |
69 | for (int i = 0; i < len_t; i++) | |
70 | { | |
71 | if (debug) | |
72 | { | |
73 | printf ("i:%i v0 = ", i); | |
74 | for (int j = 0; j < len_s + 1; j++) | |
75 | printf ("%i ", v0[j]); | |
76 | printf ("\n"); | |
77 | } | |
78 | ||
79 | /* The initial column is for the case of an empty source string; we | |
80 | can reach prefixes of the target string of length i | |
81 | by inserting i characters. */ | |
82 | v1[0] = i + 1; | |
83 | ||
9c582551 | 84 | /* Build the rest of the row by considering neighbors to |
277fe616 DM |
85 | the north, west and northwest. */ |
86 | for (int j = 0; j < len_s; j++) | |
87 | { | |
88 | edit_distance_t cost = (s[j] == t[i] ? 0 : 1); | |
89 | edit_distance_t deletion = v1[j] + 1; | |
90 | edit_distance_t insertion = v0[j + 1] + 1; | |
91 | edit_distance_t substitution = v0[j] + cost; | |
92 | edit_distance_t cheapest = MIN (deletion, insertion); | |
93 | cheapest = MIN (cheapest, substitution); | |
94 | v1[j + 1] = cheapest; | |
95 | } | |
96 | ||
97 | /* Prepare to move on to next row. */ | |
98 | for (int j = 0; j < len_s + 1; j++) | |
99 | v0[j] = v1[j]; | |
100 | } | |
101 | ||
102 | if (debug) | |
103 | { | |
104 | printf ("final v1 = "); | |
105 | for (int j = 0; j < len_s + 1; j++) | |
106 | printf ("%i ", v1[j]); | |
107 | printf ("\n"); | |
108 | } | |
109 | ||
110 | edit_distance_t result = v1[len_s]; | |
111 | delete[] v0; | |
112 | delete[] v1; | |
113 | return result; | |
114 | } | |
115 | ||
93ebf1fd | 116 | /* Calculate Levenshtein distance between two nil-terminated strings. */ |
277fe616 DM |
117 | |
118 | edit_distance_t | |
119 | levenshtein_distance (const char *s, const char *t) | |
120 | { | |
121 | return levenshtein_distance (s, strlen (s), t, strlen (t)); | |
122 | } | |
61789eed DM |
123 | |
124 | /* Given TARGET, a non-NULL string, and CANDIDATES, a non-NULL ptr to | |
125 | an autovec of non-NULL strings, determine which element within | |
126 | CANDIDATES has the lowest edit distance to TARGET. If there are | |
127 | multiple elements with the same minimal distance, the first in the | |
128 | vector wins. | |
129 | ||
130 | If more than half of the letters were misspelled, the suggestion is | |
131 | likely to be meaningless, so return NULL for this case. */ | |
132 | ||
133 | const char * | |
134 | find_closest_string (const char *target, | |
135 | const auto_vec<const char *> *candidates) | |
136 | { | |
137 | gcc_assert (target); | |
138 | gcc_assert (candidates); | |
139 | ||
140 | int i; | |
141 | const char *candidate; | |
6a3f203c | 142 | best_match<const char *, const char *> bm (target); |
61789eed DM |
143 | FOR_EACH_VEC_ELT (*candidates, i, candidate) |
144 | { | |
145 | gcc_assert (candidate); | |
6a3f203c | 146 | bm.consider (candidate); |
61789eed DM |
147 | } |
148 | ||
6a3f203c | 149 | return bm.get_best_meaningful_candidate (); |
61789eed | 150 | } |
d9b950dd DM |
151 | |
152 | #if CHECKING_P | |
153 | ||
154 | namespace selftest { | |
155 | ||
156 | /* Selftests. */ | |
157 | ||
158 | /* Verify that the levenshtein_distance (A, B) equals the expected | |
159 | value. */ | |
160 | ||
161 | static void | |
162 | levenshtein_distance_unit_test_oneway (const char *a, const char *b, | |
163 | edit_distance_t expected) | |
164 | { | |
165 | edit_distance_t actual = levenshtein_distance (a, b); | |
166 | ASSERT_EQ (actual, expected); | |
167 | } | |
168 | ||
169 | /* Verify that both | |
170 | levenshtein_distance (A, B) | |
171 | and | |
172 | levenshtein_distance (B, A) | |
173 | equal the expected value, to ensure that the function is symmetric. */ | |
174 | ||
175 | static void | |
176 | levenshtein_distance_unit_test (const char *a, const char *b, | |
177 | edit_distance_t expected) | |
178 | { | |
179 | levenshtein_distance_unit_test_oneway (a, b, expected); | |
180 | levenshtein_distance_unit_test_oneway (b, a, expected); | |
181 | } | |
182 | ||
484b59c4 DM |
183 | /* Verify that find_closest_string is sane. */ |
184 | ||
185 | static void | |
186 | test_find_closest_string () | |
187 | { | |
188 | auto_vec<const char *> candidates; | |
189 | ||
190 | /* Verify that it can handle an empty vec. */ | |
191 | ASSERT_EQ (NULL, find_closest_string ("", &candidates)); | |
192 | ||
193 | /* Verify that it works sanely for non-empty vecs. */ | |
194 | candidates.safe_push ("apple"); | |
195 | candidates.safe_push ("banana"); | |
196 | candidates.safe_push ("cherry"); | |
197 | ||
198 | ASSERT_STREQ ("apple", find_closest_string ("app", &candidates)); | |
199 | ASSERT_STREQ ("banana", find_closest_string ("banyan", &candidates)); | |
200 | ASSERT_STREQ ("cherry", find_closest_string ("berry", &candidates)); | |
201 | ASSERT_EQ (NULL, find_closest_string ("not like the others", &candidates)); | |
f254671f DM |
202 | |
203 | /* The order of the vec can matter, but it should not matter for these | |
204 | inputs. */ | |
205 | candidates.truncate (0); | |
206 | candidates.safe_push ("cherry"); | |
207 | candidates.safe_push ("banana"); | |
208 | candidates.safe_push ("apple"); | |
209 | ASSERT_STREQ ("apple", find_closest_string ("app", &candidates)); | |
210 | ASSERT_STREQ ("banana", find_closest_string ("banyan", &candidates)); | |
211 | ASSERT_STREQ ("cherry", find_closest_string ("berry", &candidates)); | |
212 | ASSERT_EQ (NULL, find_closest_string ("not like the others", &candidates)); | |
8bf3cdff DM |
213 | |
214 | /* If the goal string somehow makes it into the candidate list, offering | |
215 | it as a suggestion will be nonsensical. Verify that we don't offer such | |
216 | suggestions. */ | |
217 | ASSERT_EQ (NULL, find_closest_string ("banana", &candidates)); | |
f254671f DM |
218 | } |
219 | ||
220 | /* Test data for test_metric_conditions. */ | |
221 | ||
222 | static const char * const test_data[] = { | |
223 | "", | |
bc4519ed | 224 | "foo", |
f254671f DM |
225 | "food", |
226 | "boo", | |
227 | "1234567890123456789012345678901234567890123456789012345678901234567890" | |
228 | }; | |
229 | ||
230 | /* Verify that levenshtein_distance appears to be a sane distance function, | |
231 | i.e. the conditions for being a metric. This is done directly for a | |
232 | small set of examples, using test_data above. This is O(N^3) in the size | |
233 | of the array, due to the test for the triangle inequality, so we keep the | |
234 | array small. */ | |
235 | ||
236 | static void | |
237 | test_metric_conditions () | |
238 | { | |
239 | const int num_test_cases = sizeof (test_data) / sizeof (test_data[0]); | |
240 | ||
241 | for (int i = 0; i < num_test_cases; i++) | |
242 | { | |
243 | for (int j = 0; j < num_test_cases; j++) | |
244 | { | |
245 | edit_distance_t dist_ij | |
246 | = levenshtein_distance (test_data[i], test_data[j]); | |
247 | ||
248 | /* Identity of indiscernibles: d(i, j) > 0 iff i == j. */ | |
249 | if (i == j) | |
250 | ASSERT_EQ (dist_ij, 0); | |
251 | else | |
252 | ASSERT_TRUE (dist_ij > 0); | |
253 | ||
254 | /* Symmetry: d(i, j) == d(j, i). */ | |
255 | edit_distance_t dist_ji | |
256 | = levenshtein_distance (test_data[j], test_data[i]); | |
257 | ASSERT_EQ (dist_ij, dist_ji); | |
258 | ||
259 | /* Triangle inequality. */ | |
260 | for (int k = 0; k < num_test_cases; k++) | |
261 | { | |
262 | edit_distance_t dist_ik | |
263 | = levenshtein_distance (test_data[i], test_data[k]); | |
264 | edit_distance_t dist_jk | |
265 | = levenshtein_distance (test_data[j], test_data[k]); | |
266 | ASSERT_TRUE (dist_ik <= dist_ij + dist_jk); | |
267 | } | |
268 | } | |
269 | } | |
484b59c4 DM |
270 | } |
271 | ||
d9b950dd DM |
272 | /* Verify levenshtein_distance for a variety of pairs of pre-canned |
273 | inputs, comparing against known-good values. */ | |
274 | ||
275 | void | |
276 | spellcheck_c_tests () | |
277 | { | |
278 | levenshtein_distance_unit_test ("", "nonempty", strlen ("nonempty")); | |
279 | levenshtein_distance_unit_test ("saturday", "sunday", 3); | |
280 | levenshtein_distance_unit_test ("foo", "m_foo", 2); | |
281 | levenshtein_distance_unit_test ("hello_world", "HelloWorld", 3); | |
282 | levenshtein_distance_unit_test | |
283 | ("the quick brown fox jumps over the lazy dog", "dog", 40); | |
284 | levenshtein_distance_unit_test | |
285 | ("the quick brown fox jumps over the lazy dog", | |
286 | "the quick brown dog jumps over the lazy fox", | |
287 | 4); | |
288 | levenshtein_distance_unit_test | |
289 | ("Lorem ipsum dolor sit amet, consectetur adipiscing elit,", | |
290 | "All your base are belong to us", | |
291 | 44); | |
f254671f | 292 | levenshtein_distance_unit_test ("foo", "FOO", 3); |
484b59c4 DM |
293 | |
294 | test_find_closest_string (); | |
f254671f | 295 | test_metric_conditions (); |
d9b950dd DM |
296 | } |
297 | ||
298 | } // namespace selftest | |
299 | ||
300 | #endif /* #if CHECKING_P */ |