]>
Commit | Line | Data |
---|---|---|
93ebf1fd | 1 | /* Find near-matches for strings. |
85ec4feb | 2 | Copyright (C) 2015-2018 Free Software Foundation, Inc. |
277fe616 DM |
3 | |
4 | This file is part of GCC. | |
5 | ||
6 | GCC is free software; you can redistribute it and/or modify it under | |
7 | the terms of the GNU General Public License as published by the Free | |
8 | Software Foundation; either version 3, or (at your option) any later | |
9 | version. | |
10 | ||
11 | GCC is distributed in the hope that it will be useful, but WITHOUT ANY | |
12 | WARRANTY; without even the implied warranty of MERCHANTABILITY or | |
13 | FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License | |
14 | for more details. | |
15 | ||
16 | You should have received a copy of the GNU General Public License | |
17 | along with GCC; see the file COPYING3. If not see | |
18 | <http://www.gnu.org/licenses/>. */ | |
19 | ||
20 | #include "config.h" | |
21 | #include "system.h" | |
22 | #include "coretypes.h" | |
23 | #include "tm.h" | |
24 | #include "tree.h" | |
25 | #include "spellcheck.h" | |
d9b950dd | 26 | #include "selftest.h" |
277fe616 | 27 | |
b80a188b DM |
28 | /* Get the edit distance between the two strings: the minimal |
29 | number of edits that are needed to change one string into another, | |
30 | where edits can be one-character insertions, removals, or substitutions, | |
31 | or transpositions of two adjacent characters (counting as one "edit"). | |
277fe616 | 32 | |
b80a188b DM |
33 | This implementation uses the Wagner-Fischer algorithm for the |
34 | Damerau-Levenshtein distance; specifically, the "optimal string alignment | |
35 | distance" or "restricted edit distance" variant. */ | |
277fe616 | 36 | |
93ebf1fd | 37 | edit_distance_t |
b80a188b DM |
38 | get_edit_distance (const char *s, int len_s, |
39 | const char *t, int len_t) | |
277fe616 DM |
40 | { |
41 | const bool debug = false; | |
42 | ||
43 | if (debug) | |
44 | { | |
45 | printf ("s: \"%s\" (len_s=%i)\n", s, len_s); | |
46 | printf ("t: \"%s\" (len_t=%i)\n", t, len_t); | |
47 | } | |
48 | ||
49 | if (len_s == 0) | |
50 | return len_t; | |
51 | if (len_t == 0) | |
52 | return len_s; | |
53 | ||
54 | /* We effectively build a matrix where each (i, j) contains the | |
b80a188b | 55 | distance between the prefix strings s[0:j] and t[0:i]. |
277fe616 | 56 | Rather than actually build an (len_t + 1) * (len_s + 1) matrix, |
b80a188b DM |
57 | we simply keep track of the last two rows, v_one_ago and v_two_ago, |
58 | and a new row, v_next, which avoids an (len_t + 1) * (len_s + 1) | |
59 | allocation and memory accesses in favor of three (len_s + 1) | |
60 | allocations. These could potentially be | |
277fe616 DM |
61 | statically-allocated if we impose a maximum length on the |
62 | strings of interest. */ | |
b80a188b DM |
63 | edit_distance_t *v_two_ago = new edit_distance_t[len_s + 1]; |
64 | edit_distance_t *v_one_ago = new edit_distance_t[len_s + 1]; | |
65 | edit_distance_t *v_next = new edit_distance_t[len_s + 1]; | |
277fe616 DM |
66 | |
67 | /* The first row is for the case of an empty target string, which | |
68 | we can reach by deleting every character in the source string. */ | |
69 | for (int i = 0; i < len_s + 1; i++) | |
b80a188b | 70 | v_one_ago[i] = i; |
277fe616 DM |
71 | |
72 | /* Build successive rows. */ | |
73 | for (int i = 0; i < len_t; i++) | |
74 | { | |
75 | if (debug) | |
76 | { | |
b80a188b | 77 | printf ("i:%i v_one_ago = ", i); |
277fe616 | 78 | for (int j = 0; j < len_s + 1; j++) |
b80a188b | 79 | printf ("%i ", v_one_ago[j]); |
277fe616 DM |
80 | printf ("\n"); |
81 | } | |
82 | ||
83 | /* The initial column is for the case of an empty source string; we | |
84 | can reach prefixes of the target string of length i | |
85 | by inserting i characters. */ | |
b80a188b | 86 | v_next[0] = i + 1; |
277fe616 | 87 | |
9c582551 | 88 | /* Build the rest of the row by considering neighbors to |
277fe616 DM |
89 | the north, west and northwest. */ |
90 | for (int j = 0; j < len_s; j++) | |
91 | { | |
92 | edit_distance_t cost = (s[j] == t[i] ? 0 : 1); | |
b80a188b DM |
93 | edit_distance_t deletion = v_next[j] + 1; |
94 | edit_distance_t insertion = v_one_ago[j + 1] + 1; | |
95 | edit_distance_t substitution = v_one_ago[j] + cost; | |
277fe616 DM |
96 | edit_distance_t cheapest = MIN (deletion, insertion); |
97 | cheapest = MIN (cheapest, substitution); | |
b80a188b DM |
98 | if (i > 0 && j > 0 && s[j] == t[i - 1] && s[j - 1] == t[i]) |
99 | { | |
100 | edit_distance_t transposition = v_two_ago[j - 1] + 1; | |
101 | cheapest = MIN (cheapest, transposition); | |
102 | } | |
103 | v_next[j + 1] = cheapest; | |
277fe616 DM |
104 | } |
105 | ||
106 | /* Prepare to move on to next row. */ | |
107 | for (int j = 0; j < len_s + 1; j++) | |
b80a188b DM |
108 | { |
109 | v_two_ago[j] = v_one_ago[j]; | |
110 | v_one_ago[j] = v_next[j]; | |
111 | } | |
277fe616 DM |
112 | } |
113 | ||
114 | if (debug) | |
115 | { | |
b80a188b | 116 | printf ("final v_next = "); |
277fe616 | 117 | for (int j = 0; j < len_s + 1; j++) |
b80a188b | 118 | printf ("%i ", v_next[j]); |
277fe616 DM |
119 | printf ("\n"); |
120 | } | |
121 | ||
b80a188b DM |
122 | edit_distance_t result = v_next[len_s]; |
123 | delete[] v_two_ago; | |
124 | delete[] v_one_ago; | |
125 | delete[] v_next; | |
277fe616 DM |
126 | return result; |
127 | } | |
128 | ||
b80a188b | 129 | /* Get the edit distance between two nil-terminated strings. */ |
277fe616 DM |
130 | |
131 | edit_distance_t | |
b80a188b | 132 | get_edit_distance (const char *s, const char *t) |
277fe616 | 133 | { |
b80a188b | 134 | return get_edit_distance (s, strlen (s), t, strlen (t)); |
277fe616 | 135 | } |
61789eed DM |
136 | |
137 | /* Given TARGET, a non-NULL string, and CANDIDATES, a non-NULL ptr to | |
138 | an autovec of non-NULL strings, determine which element within | |
139 | CANDIDATES has the lowest edit distance to TARGET. If there are | |
140 | multiple elements with the same minimal distance, the first in the | |
141 | vector wins. | |
142 | ||
143 | If more than half of the letters were misspelled, the suggestion is | |
144 | likely to be meaningless, so return NULL for this case. */ | |
145 | ||
146 | const char * | |
147 | find_closest_string (const char *target, | |
148 | const auto_vec<const char *> *candidates) | |
149 | { | |
150 | gcc_assert (target); | |
151 | gcc_assert (candidates); | |
152 | ||
153 | int i; | |
154 | const char *candidate; | |
6a3f203c | 155 | best_match<const char *, const char *> bm (target); |
61789eed DM |
156 | FOR_EACH_VEC_ELT (*candidates, i, candidate) |
157 | { | |
158 | gcc_assert (candidate); | |
6a3f203c | 159 | bm.consider (candidate); |
61789eed DM |
160 | } |
161 | ||
6a3f203c | 162 | return bm.get_best_meaningful_candidate (); |
61789eed | 163 | } |
d9b950dd DM |
164 | |
165 | #if CHECKING_P | |
166 | ||
167 | namespace selftest { | |
168 | ||
169 | /* Selftests. */ | |
170 | ||
b80a188b | 171 | /* Verify that get_edit_distance (A, B) equals the expected value. */ |
d9b950dd DM |
172 | |
173 | static void | |
b80a188b DM |
174 | test_edit_distance_unit_test_oneway (const char *a, const char *b, |
175 | edit_distance_t expected) | |
d9b950dd | 176 | { |
b80a188b | 177 | edit_distance_t actual = get_edit_distance (a, b); |
d9b950dd DM |
178 | ASSERT_EQ (actual, expected); |
179 | } | |
180 | ||
/* Verify that both
     get_edit_distance (A, B)
   and
     get_edit_distance (B, A)
   equal the expected value, to ensure that the function is symmetric.

   Each direction is checked via test_edit_distance_unit_test_oneway.  */

static void
test_get_edit_distance_unit (const char *a, const char *b,
                             edit_distance_t expected)
{
  /* Forward direction: A to B.  */
  test_edit_distance_unit_test_oneway (a, b, expected);
  /* Reverse direction: B to A, which must give the same distance.  */
  test_edit_distance_unit_test_oneway (b, a, expected);
}
194 | ||
/* Verify that find_closest_string is sane: that it copes with an empty
   candidate list, picks the expected candidate regardless of the order
   of the list, rejects targets that resemble none of the candidates,
   never suggests the target itself, and benefits from transpositions
   being counted as a single edit.  */

static void
test_find_closest_string ()
{
  auto_vec<const char *> candidates;

  /* Verify that it can handle an empty vec.  */
  ASSERT_EQ (NULL, find_closest_string ("", &candidates));

  /* Verify that it works sanely for non-empty vecs.  */
  candidates.safe_push ("apple");
  candidates.safe_push ("banana");
  candidates.safe_push ("cherry");

  ASSERT_STREQ ("apple", find_closest_string ("app", &candidates));
  ASSERT_STREQ ("banana", find_closest_string ("banyan", &candidates));
  ASSERT_STREQ ("cherry", find_closest_string ("berry", &candidates));
  /* A target unlike every candidate must yield no suggestion at all.  */
  ASSERT_EQ (NULL, find_closest_string ("not like the others", &candidates));

  /* The order of the vec can matter, but it should not matter for these
     inputs.  Repeat the same checks with the candidates reversed.  */
  candidates.truncate (0);
  candidates.safe_push ("cherry");
  candidates.safe_push ("banana");
  candidates.safe_push ("apple");
  ASSERT_STREQ ("apple", find_closest_string ("app", &candidates));
  ASSERT_STREQ ("banana", find_closest_string ("banyan", &candidates));
  ASSERT_STREQ ("cherry", find_closest_string ("berry", &candidates));
  ASSERT_EQ (NULL, find_closest_string ("not like the others", &candidates));

  /* If the goal string somehow makes it into the candidate list, offering
     it as a suggestion will be nonsensical.  Verify that we don't offer such
     suggestions.  ("banana" is still in the vec at this point.)  */
  ASSERT_EQ (NULL, find_closest_string ("banana", &candidates));

  /* Example from PR 69968 where transposition helps: "coorzd1" is
     "coordz1" with two adjacent characters swapped, so "coordz1" must be
     preferred over the other, similarly-spelled candidates.  */
  candidates.truncate (0);
  candidates.safe_push("coordx");
  candidates.safe_push("coordy");
  candidates.safe_push("coordz");
  candidates.safe_push("coordx1");
  candidates.safe_push("coordy1");
  candidates.safe_push("coordz1");
  ASSERT_STREQ ("coordz1", find_closest_string ("coorzd1", &candidates));
}
241 | ||
/* Test data for test_metric_conditions: a deliberately short list of
   distinct strings (every triple of entries gets checked), mixing the
   empty string, a few short near-identical words, and one much longer
   string.  */

static const char * const test_data[] = {
  /* The empty string.  */
  "",
  /* Short words that differ from one another by a single edit or two.  */
  "foo",
  "food",
  "boo",
  /* A long run of digits, far longer than the other entries.  */
  "1234567890123456789012345678901234567890123456789012345678901234567890"
};
251 | ||
b80a188b | 252 | /* Verify that get_edit_distance appears to be a sane distance function, |
f254671f DM |
253 | i.e. the conditions for being a metric. This is done directly for a |
254 | small set of examples, using test_data above. This is O(N^3) in the size | |
255 | of the array, due to the test for the triangle inequality, so we keep the | |
256 | array small. */ | |
257 | ||
258 | static void | |
259 | test_metric_conditions () | |
260 | { | |
261 | const int num_test_cases = sizeof (test_data) / sizeof (test_data[0]); | |
262 | ||
263 | for (int i = 0; i < num_test_cases; i++) | |
264 | { | |
265 | for (int j = 0; j < num_test_cases; j++) | |
266 | { | |
267 | edit_distance_t dist_ij | |
b80a188b | 268 | = get_edit_distance (test_data[i], test_data[j]); |
f254671f DM |
269 | |
270 | /* Identity of indiscernibles: d(i, j) > 0 iff i == j. */ | |
271 | if (i == j) | |
272 | ASSERT_EQ (dist_ij, 0); | |
273 | else | |
274 | ASSERT_TRUE (dist_ij > 0); | |
275 | ||
276 | /* Symmetry: d(i, j) == d(j, i). */ | |
277 | edit_distance_t dist_ji | |
b80a188b | 278 | = get_edit_distance (test_data[j], test_data[i]); |
f254671f DM |
279 | ASSERT_EQ (dist_ij, dist_ji); |
280 | ||
281 | /* Triangle inequality. */ | |
282 | for (int k = 0; k < num_test_cases; k++) | |
283 | { | |
284 | edit_distance_t dist_ik | |
b80a188b | 285 | = get_edit_distance (test_data[i], test_data[k]); |
f254671f | 286 | edit_distance_t dist_jk |
b80a188b | 287 | = get_edit_distance (test_data[j], test_data[k]); |
f254671f DM |
288 | ASSERT_TRUE (dist_ik <= dist_ij + dist_jk); |
289 | } | |
290 | } | |
291 | } | |
484b59c4 DM |
292 | } |
293 | ||
b80a188b | 294 | /* Verify get_edit_distance for a variety of pairs of pre-canned |
d9b950dd DM |
295 | inputs, comparing against known-good values. */ |
296 | ||
297 | void | |
298 | spellcheck_c_tests () | |
299 | { | |
b80a188b DM |
300 | test_get_edit_distance_unit ("", "nonempty", strlen ("nonempty")); |
301 | test_get_edit_distance_unit ("saturday", "sunday", 3); | |
302 | test_get_edit_distance_unit ("foo", "m_foo", 2); | |
303 | test_get_edit_distance_unit ("hello_world", "HelloWorld", 3); | |
304 | test_get_edit_distance_unit | |
d9b950dd | 305 | ("the quick brown fox jumps over the lazy dog", "dog", 40); |
b80a188b | 306 | test_get_edit_distance_unit |
d9b950dd DM |
307 | ("the quick brown fox jumps over the lazy dog", |
308 | "the quick brown dog jumps over the lazy fox", | |
309 | 4); | |
b80a188b | 310 | test_get_edit_distance_unit |
d9b950dd DM |
311 | ("Lorem ipsum dolor sit amet, consectetur adipiscing elit,", |
312 | "All your base are belong to us", | |
313 | 44); | |
b80a188b DM |
314 | test_get_edit_distance_unit ("foo", "FOO", 3); |
315 | test_get_edit_distance_unit ("fee", "deed", 2); | |
316 | test_get_edit_distance_unit ("coorzd1", "coordx1", 2); | |
317 | ||
318 | /* Examples where transposition helps. */ | |
319 | test_get_edit_distance_unit ("ab", "ba", 1); | |
320 | test_get_edit_distance_unit ("ba", "abc", 2); | |
321 | test_get_edit_distance_unit ("coorzd1", "coordz1", 1); | |
322 | test_get_edit_distance_unit ("abcdefghijklmnopqrstuvwxyz", | |
323 | "bacdefghijklmnopqrstuvwxzy", 2); | |
324 | test_get_edit_distance_unit ("saturday", "sundya", 4); | |
325 | test_get_edit_distance_unit ("signed", "singed", 1); | |
484b59c4 DM |
326 | |
327 | test_find_closest_string (); | |
f254671f | 328 | test_metric_conditions (); |
d9b950dd DM |
329 | } |
330 | ||
331 | } // namespace selftest | |
332 | ||
333 | #endif /* #if CHECKING_P */ |