]> git.ipfire.org Git - thirdparty/gcc.git/blob - gcc/spellcheck.c
[Ada] Update headers
[thirdparty/gcc.git] / gcc / spellcheck.c
1 /* Find near-matches for strings.
2 Copyright (C) 2015-2020 Free Software Foundation, Inc.
3
4 This file is part of GCC.
5
6 GCC is free software; you can redistribute it and/or modify it under
7 the terms of the GNU General Public License as published by the Free
8 Software Foundation; either version 3, or (at your option) any later
9 version.
10
11 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
12 WARRANTY; without even the implied warranty of MERCHANTABILITY or
13 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
14 for more details.
15
16 You should have received a copy of the GNU General Public License
17 along with GCC; see the file COPYING3. If not see
18 <http://www.gnu.org/licenses/>. */
19
20 #include "config.h"
21 #include "system.h"
22 #include "coretypes.h"
23 #include "tm.h"
24 #include "tree.h"
25 #include "spellcheck.h"
26 #include "selftest.h"
27
28 /* Cost of a case transformation. */
29 #define CASE_COST 1
30
31 /* Cost of another kind of edit. */
32 #define BASE_COST 2
33
34 /* Get the edit distance between the two strings: the minimal
35 number of edits that are needed to change one string into another,
36 where edits can be one-character insertions, removals, or substitutions,
37 or transpositions of two adjacent characters (counting as one "edit").
38
39 This implementation uses a modified variant of the Wagner-Fischer
40 algorithm for the Damerau-Levenshtein distance; specifically, the
41 "optimal string alignment distance" or "restricted edit distance"
42 variant. This implementation has been further modified to take
43 case into account. */
44
45 edit_distance_t
46 get_edit_distance (const char *s, int len_s,
47 const char *t, int len_t)
48 {
49 const bool debug = false;
50
51 if (debug)
52 {
53 printf ("s: \"%s\" (len_s=%i)\n", s, len_s);
54 printf ("t: \"%s\" (len_t=%i)\n", t, len_t);
55 }
56
57 if (len_s == 0)
58 return BASE_COST * len_t;
59 if (len_t == 0)
60 return BASE_COST * len_s;
61
62 /* We effectively build a matrix where each (i, j) contains the
63 distance between the prefix strings s[0:j] and t[0:i].
64 Rather than actually build an (len_t + 1) * (len_s + 1) matrix,
65 we simply keep track of the last two rows, v_one_ago and v_two_ago,
66 and a new row, v_next, which avoids an (len_t + 1) * (len_s + 1)
67 allocation and memory accesses in favor of three (len_s + 1)
68 allocations. These could potentially be
69 statically-allocated if we impose a maximum length on the
70 strings of interest. */
71 edit_distance_t *v_two_ago = new edit_distance_t[len_s + 1];
72 edit_distance_t *v_one_ago = new edit_distance_t[len_s + 1];
73 edit_distance_t *v_next = new edit_distance_t[len_s + 1];
74
75 /* The first row is for the case of an empty target string, which
76 we can reach by deleting every character in the source string. */
77 for (int i = 0; i < len_s + 1; i++)
78 v_one_ago[i] = i * BASE_COST;
79
80 /* Build successive rows. */
81 for (int i = 0; i < len_t; i++)
82 {
83 if (debug)
84 {
85 printf ("i:%i v_one_ago = ", i);
86 for (int j = 0; j < len_s + 1; j++)
87 printf ("%i ", v_one_ago[j]);
88 printf ("\n");
89 }
90
91 /* The initial column is for the case of an empty source string; we
92 can reach prefixes of the target string of length i
93 by inserting i characters. */
94 v_next[0] = (i + 1) * BASE_COST;
95
96 /* Build the rest of the row by considering neighbors to
97 the north, west and northwest. */
98 for (int j = 0; j < len_s; j++)
99 {
100 edit_distance_t cost;
101
102 if (s[j] == t[i])
103 cost = 0;
104 else if (TOLOWER (s[j]) == TOLOWER (t[i]))
105 cost = CASE_COST;
106 else
107 cost = BASE_COST;
108 edit_distance_t deletion = v_next[j] + BASE_COST;
109 edit_distance_t insertion = v_one_ago[j + 1] + BASE_COST;
110 edit_distance_t substitution = v_one_ago[j] + cost;
111 edit_distance_t cheapest = MIN (deletion, insertion);
112 cheapest = MIN (cheapest, substitution);
113 if (i > 0 && j > 0 && s[j] == t[i - 1] && s[j - 1] == t[i])
114 {
115 edit_distance_t transposition = v_two_ago[j - 1] + BASE_COST;
116 cheapest = MIN (cheapest, transposition);
117 }
118 v_next[j + 1] = cheapest;
119 }
120
121 /* Prepare to move on to next row. */
122 for (int j = 0; j < len_s + 1; j++)
123 {
124 v_two_ago[j] = v_one_ago[j];
125 v_one_ago[j] = v_next[j];
126 }
127 }
128
129 if (debug)
130 {
131 printf ("final v_next = ");
132 for (int j = 0; j < len_s + 1; j++)
133 printf ("%i ", v_next[j]);
134 printf ("\n");
135 }
136
137 edit_distance_t result = v_next[len_s];
138 delete[] v_two_ago;
139 delete[] v_one_ago;
140 delete[] v_next;
141 return result;
142 }
143
144 /* Get the edit distance between two nil-terminated strings. */
145
146 edit_distance_t
147 get_edit_distance (const char *s, const char *t)
148 {
149 return get_edit_distance (s, strlen (s), t, strlen (t));
150 }
151
152 /* Given TARGET, a non-NULL string, and CANDIDATES, a non-NULL ptr to
153 an autovec of non-NULL strings, determine which element within
154 CANDIDATES has the lowest edit distance to TARGET. If there are
155 multiple elements with the same minimal distance, the first in the
156 vector wins.
157
158 If more than half of the letters were misspelled, the suggestion is
159 likely to be meaningless, so return NULL for this case. */
160
161 const char *
162 find_closest_string (const char *target,
163 const auto_vec<const char *> *candidates)
164 {
165 gcc_assert (target);
166 gcc_assert (candidates);
167
168 int i;
169 const char *candidate;
170 best_match<const char *, const char *> bm (target);
171 FOR_EACH_VEC_ELT (*candidates, i, candidate)
172 {
173 gcc_assert (candidate);
174 bm.consider (candidate);
175 }
176
177 return bm.get_best_meaningful_candidate ();
178 }
179
180 /* Generate the maximum edit distance for which we consider a suggestion
181 to be meaningful, given a goal of length GOAL_LEN and a candidate of
182 length CANDIDATE_LEN.
183
184 This is a third of the length of the candidate or of the goal,
185 whichever is bigger. */
186
187 edit_distance_t
188 get_edit_distance_cutoff (size_t goal_len, size_t candidate_len)
189 {
190 size_t max_length = MAX (goal_len, candidate_len);
191 size_t min_length = MIN (goal_len, candidate_len);
192
193 gcc_assert (max_length >= min_length);
194
195 /* Special case: don't offer suggestions for a pair of
196 length == 1 strings (or empty strings). */
197 if (max_length <= 1)
198 return 0;
199
200 /* If the lengths are close, then round down. */
201 if (max_length - min_length <= 1)
202 /* ...but allow an edit distance of at least 1. */
203 return BASE_COST * MAX (max_length / 3, 1);
204
205 /* Otherwise, round up (thus giving a little extra leeway to some cases
206 involving insertions/deletions). */
207 return BASE_COST * (max_length + 2) / 3;
208 }
209
210 #if CHECKING_P
211
212 namespace selftest {
213
214 /* Selftests. */
215
216 /* Verify that get_edit_distance (A, B) equals the expected value. */
217
218 static void
219 test_get_edit_distance_one_way (const char *a, const char *b,
220 edit_distance_t expected)
221 {
222 edit_distance_t actual = get_edit_distance (a, b);
223 ASSERT_EQ (actual, expected);
224 }
225
226 /* Verify that both
227 get_edit_distance (A, B)
228 and
229 get_edit_distance (B, A)
230 equal the expected value, to ensure that the function is symmetric. */
231
232 static void
233 test_get_edit_distance_both_ways (const char *a, const char *b,
234 edit_distance_t expected)
235 {
236 test_get_edit_distance_one_way (a, b, expected);
237 test_get_edit_distance_one_way (b, a, expected);
238 }
239
240 /* Verify get_edit_distance for a variety of pairs of pre-canned
241 inputs, comparing against known-good values. */
242
243 static void
244 test_edit_distances ()
245 {
246 test_get_edit_distance_both_ways ("", "nonempty",
247 BASE_COST * strlen ("nonempty"));
248 test_get_edit_distance_both_ways ("saturday", "sunday",
249 BASE_COST * 3);
250 test_get_edit_distance_both_ways ("foo", "m_foo", BASE_COST * 2);
251 test_get_edit_distance_both_ways ("hello_world", "HelloWorld", 4);
252 test_get_edit_distance_both_ways
253 ("the quick brown fox jumps over the lazy dog", "dog", BASE_COST * 40);
254 test_get_edit_distance_both_ways
255 ("the quick brown fox jumps over the lazy dog",
256 "the quick brown dog jumps over the lazy fox",
257 BASE_COST * 4);
258 test_get_edit_distance_both_ways
259 ("Lorem ipsum dolor sit amet, consectetur adipiscing elit,",
260 "All your base are belong to us",
261 BASE_COST * 44);
262 test_get_edit_distance_both_ways ("foo", "FOO", 3);
263 test_get_edit_distance_both_ways ("fee", "deed", BASE_COST * 2);
264 test_get_edit_distance_both_ways ("coorzd1", "coordx1", BASE_COST * 2);
265 test_get_edit_distance_both_ways ("assert", "sqrt", BASE_COST * 3);
266 test_get_edit_distance_both_ways ("PATH_MAX", "INT8_MAX", BASE_COST * 3);
267 test_get_edit_distance_both_ways ("time", "nice", BASE_COST * 2);
268 test_get_edit_distance_both_ways ("bar", "carg", BASE_COST * 2);
269 test_get_edit_distance_both_ways ("gtk_widget_show_all",
270 "GtkWidgetShowAll", 10);
271 test_get_edit_distance_both_ways ("m_bar", "bar", BASE_COST * 2);
272 test_get_edit_distance_both_ways ("MACRO", "MACRAME", BASE_COST * 3);
273 test_get_edit_distance_both_ways ("ab", "ac", BASE_COST * 1);
274 test_get_edit_distance_both_ways ("ab", "a", BASE_COST * 1);
275 test_get_edit_distance_both_ways ("a", "b", BASE_COST * 1);
276 test_get_edit_distance_both_ways ("nanl", "name", BASE_COST * 2);
277 test_get_edit_distance_both_ways ("char", "bar", BASE_COST * 2);
278 test_get_edit_distance_both_ways ("-optimize", "fsanitize", BASE_COST * 5);
279 test_get_edit_distance_both_ways ("__DATE__", "__i386__", BASE_COST * 4);
280
281 /* Examples where transposition helps. */
282 test_get_edit_distance_both_ways ("ab", "ba", BASE_COST * 1);
283 test_get_edit_distance_both_ways ("ba", "abc", BASE_COST * 2);
284 test_get_edit_distance_both_ways ("coorzd1", "coordz1", BASE_COST * 1);
285 test_get_edit_distance_both_ways ("abcdefghijklmnopqrstuvwxyz",
286 "bacdefghijklmnopqrstuvwxzy",
287 BASE_COST * 2);
288 test_get_edit_distance_both_ways ("saturday", "sundya", BASE_COST * 4);
289 test_get_edit_distance_both_ways ("signed", "singed", BASE_COST * 1);
290 }
291
292 /* Subroutine of test_get_edit_distance_cutoff, for emulating the
293 spellchecking cutoff in up to GCC 8. */
294
295 static edit_distance_t
296 get_old_cutoff (size_t goal_len, size_t candidate_len)
297 {
298 return BASE_COST * MAX (goal_len, candidate_len) / 2;
299 }
300
301 /* Verify that the cutoff for "meaningfulness" of suggestions is at least as
302 conservative as in older GCC releases.
303
304 This should ensure that we don't offer additional meaningless
305 suggestions (apart from those for which transposition has helped). */
306
307 static void
308 test_get_edit_distance_cutoff ()
309 {
310 for (size_t goal_len = 0; goal_len < 30; goal_len++)
311 for (size_t candidate_len = 0; candidate_len < 30; candidate_len++)
312 ASSERT_TRUE (get_edit_distance_cutoff (goal_len, candidate_len)
313 <= get_old_cutoff (goal_len, candidate_len));
314 }
315
316 /* Assert that CANDIDATE is offered as a suggestion for TARGET. */
317
318 static void
319 assert_suggested_for (const location &loc, const char *candidate,
320 const char *target)
321 {
322 auto_vec<const char *> candidates;
323 candidates.safe_push (candidate);
324 ASSERT_EQ_AT (loc, candidate, find_closest_string (target, &candidates));
325 }
326
327 /* Assert that CANDIDATE is offered as a suggestion for TARGET. */
328
329 #define ASSERT_SUGGESTED_FOR(CANDIDATE, TARGET) \
330 SELFTEST_BEGIN_STMT \
331 assert_suggested_for (SELFTEST_LOCATION, CANDIDATE, TARGET); \
332 SELFTEST_END_STMT
333
334 /* Assert that CANDIDATE is not offered as a suggestion for TARGET. */
335
336 static void
337 assert_not_suggested_for (const location &loc, const char *candidate,
338 const char *target)
339 {
340 auto_vec<const char *> candidates;
341 candidates.safe_push (candidate);
342 ASSERT_EQ_AT (loc, NULL, find_closest_string (target, &candidates));
343 }
344
345 /* Assert that CANDIDATE is not offered as a suggestion for TARGET. */
346
347 #define ASSERT_NOT_SUGGESTED_FOR(CANDIDATE, TARGET) \
348 SELFTEST_BEGIN_STMT \
349 assert_not_suggested_for (SELFTEST_LOCATION, CANDIDATE, TARGET); \
350 SELFTEST_END_STMT
351
352 /* Verify that we offer varous suggestions that are meaningful,
353 and that we don't offer various other ones that aren't (PR c/82967). */
354
355 static void
356 test_suggestions ()
357 {
358 /* Good suggestions. */
359
360 ASSERT_SUGGESTED_FOR ("m_bar", "bar");
361 // dist == 2, max_length == 5, min_length == 3
362
363 ASSERT_SUGGESTED_FOR ("MACRO", "MACRAME");
364 // dist == 3, max_length == 7, min_length == 5
365
366 ASSERT_SUGGESTED_FOR ("gtk_widget_show_all", "GtkWidgetShowAll");
367 // dist == 7, max_length == 16, min_length = 19
368
369 ASSERT_SUGGESTED_FOR ("ab", "ac");
370 // dist == 1, max_length == min_length = 2
371
372 ASSERT_SUGGESTED_FOR ("ab", "a");
373 // dist == 1, max_length == 2, min_length = 1
374
375 /* Bad suggestions. */
376
377 ASSERT_NOT_SUGGESTED_FOR ("a", "b");
378 // dist == 1, max_length == min_length = 1
379
380 ASSERT_NOT_SUGGESTED_FOR ("sqrt", "assert");
381 // dist == 3, max_length 6, min_length == 4
382
383 ASSERT_NOT_SUGGESTED_FOR ("INT8_MAX", "PATH_MAX");
384 // dist == 3, max_length == min_length == 8
385
386 ASSERT_NOT_SUGGESTED_FOR ("nice", "time");
387 ASSERT_NOT_SUGGESTED_FOR ("nanl", "name");
388 // dist == 2, max_length == min_length == 4
389
390 ASSERT_NOT_SUGGESTED_FOR ("carg", "bar");
391 ASSERT_NOT_SUGGESTED_FOR ("char", "bar");
392 // dist == 2, max_length == 4, min_length == 3
393
394 ASSERT_NOT_SUGGESTED_FOR ("-optimize", "fsanitize");
395 // dist == 5, max_length == min_length == 9
396
397 ASSERT_NOT_SUGGESTED_FOR ("__DATE__", "__i386__");
398 // dist == 4, max_length == min_length == 8
399
400 ASSERT_NOT_SUGGESTED_FOR ("start_input_device", "InputDevice");
401 // dist == 9, max_length == 18, min_length == 11
402 }
403
404 /* Verify that find_closest_string is sane. */
405
406 static void
407 test_find_closest_string ()
408 {
409 auto_vec<const char *> candidates;
410
411 /* Verify that it can handle an empty vec. */
412 ASSERT_EQ (NULL, find_closest_string ("", &candidates));
413
414 /* Verify that it works sanely for non-empty vecs. */
415 candidates.safe_push ("apple");
416 candidates.safe_push ("banana");
417 candidates.safe_push ("cherry");
418
419 ASSERT_STREQ ("apple", find_closest_string ("app", &candidates));
420 ASSERT_STREQ ("banana", find_closest_string ("banyan", &candidates));
421 ASSERT_STREQ ("cherry", find_closest_string ("berry", &candidates));
422 ASSERT_EQ (NULL, find_closest_string ("not like the others", &candidates));
423
424 /* The order of the vec can matter, but it should not matter for these
425 inputs. */
426 candidates.truncate (0);
427 candidates.safe_push ("cherry");
428 candidates.safe_push ("banana");
429 candidates.safe_push ("apple");
430 ASSERT_STREQ ("apple", find_closest_string ("app", &candidates));
431 ASSERT_STREQ ("banana", find_closest_string ("banyan", &candidates));
432 ASSERT_STREQ ("cherry", find_closest_string ("berry", &candidates));
433 ASSERT_EQ (NULL, find_closest_string ("not like the others", &candidates));
434
435 /* If the goal string somehow makes it into the candidate list, offering
436 it as a suggestion will be nonsensical. Verify that we don't offer such
437 suggestions. */
438 ASSERT_EQ (NULL, find_closest_string ("banana", &candidates));
439
440 /* Example from PR 69968 where transposition helps. */
441 candidates.truncate (0);
442 candidates.safe_push("coordx");
443 candidates.safe_push("coordy");
444 candidates.safe_push("coordz");
445 candidates.safe_push("coordx1");
446 candidates.safe_push("coordy1");
447 candidates.safe_push("coordz1");
448 ASSERT_STREQ ("coordz1", find_closest_string ("coorzd1", &candidates));
449
450 candidates.truncate (0);
451 candidates.safe_push ("DWARF_GNAT_ENCODINGS_GDB");
452 candidates.safe_push ("DWARF_GNAT_ENCODINGS_ALL");
453 candidates.safe_push ("DWARF_GNAT_ENCODINGS_MINIMAL");
454 ASSERT_STREQ ("DWARF_GNAT_ENCODINGS_ALL",
455 find_closest_string ("DWARF_GNAT_ENCODINGS_all",
456 &candidates));
457
458 /* The same as the previous test, but with a different order of
459 candidates. */
460 candidates.truncate (0);
461 candidates.safe_push ("DWARF_GNAT_ENCODINGS_ALL");
462 candidates.safe_push ("DWARF_GNAT_ENCODINGS_GDB");
463 candidates.safe_push ("DWARF_GNAT_ENCODINGS_MINIMAL");
464 ASSERT_STREQ ("DWARF_GNAT_ENCODINGS_ALL",
465 find_closest_string ("DWARF_GNAT_ENCODINGS_all",
466 &candidates));
467 }
468
469 /* Test data for test_metric_conditions. */
470
471 static const char * const test_data[] = {
472 "",
473 "foo",
474 "food",
475 "boo",
476 "1234567890123456789012345678901234567890123456789012345678901234567890"
477 };
478
479 /* Verify that get_edit_distance appears to be a sane distance function,
480 i.e. the conditions for being a metric. This is done directly for a
481 small set of examples, using test_data above. This is O(N^3) in the size
482 of the array, due to the test for the triangle inequality, so we keep the
483 array small. */
484
485 static void
486 test_metric_conditions ()
487 {
488 const int num_test_cases = sizeof (test_data) / sizeof (test_data[0]);
489
490 for (int i = 0; i < num_test_cases; i++)
491 {
492 for (int j = 0; j < num_test_cases; j++)
493 {
494 edit_distance_t dist_ij
495 = get_edit_distance (test_data[i], test_data[j]);
496
497 /* Identity of indiscernibles: d(i, j) > 0 iff i == j. */
498 if (i == j)
499 ASSERT_EQ (dist_ij, 0);
500 else
501 ASSERT_TRUE (dist_ij > 0);
502
503 /* Symmetry: d(i, j) == d(j, i). */
504 edit_distance_t dist_ji
505 = get_edit_distance (test_data[j], test_data[i]);
506 ASSERT_EQ (dist_ij, dist_ji);
507
508 /* Triangle inequality. */
509 for (int k = 0; k < num_test_cases; k++)
510 {
511 edit_distance_t dist_ik
512 = get_edit_distance (test_data[i], test_data[k]);
513 edit_distance_t dist_jk
514 = get_edit_distance (test_data[j], test_data[k]);
515 ASSERT_TRUE (dist_ik <= dist_ij + dist_jk);
516 }
517 }
518 }
519 }
520
521 /* Run all of the selftests within this file. */
522
523 void
524 spellcheck_c_tests ()
525 {
526 test_edit_distances ();
527 test_get_edit_distance_cutoff ();
528 test_suggestions ();
529 test_find_closest_string ();
530 test_metric_conditions ();
531 }
532
533 } // namespace selftest
534
535 #endif /* #if CHECKING_P */