]>
Commit | Line | Data |
---|---|---|
1c1af145 | 1 | /* |
2 | * Wildcard matching engine for use with SFTP-based file transfer | |
3 | * programs (PSFTP, new-look PSCP): since SFTP has no notion of | |
4 | * getting the remote side to do globbing (and rightly so) we have | |
5 | * to do it locally, by retrieving all the filenames in a directory | |
6 | * and checking each against the wildcard pattern. | |
7 | */ | |
8 | ||
9 | #include <assert.h> | |
10 | #include <stdlib.h> | |
11 | #include <string.h> | |
12 | ||
13 | #include "putty.h" | |
14 | ||
15 | /* | |
16 | * Definition of wildcard syntax: | |
17 | * | |
18 | * - * matches any sequence of characters, including zero. | |
19 | * - ? matches exactly one character which can be anything. | |
20 | * - [abc] matches exactly one character which is a, b or c. | |
21 | * - [a-f] matches anything from a through f. | |
22 | * - [^a-f] matches anything _except_ a through f. | |
23 | * - [-_] matches - or _; [^-_] matches anything else. (The - is | |
24 | * non-special if it occurs immediately after the opening | |
25 | * bracket or ^.) | |
26 | * - [a^] matches an a or a ^. (The ^ is non-special if it does | |
27 | * _not_ occur immediately after the opening bracket.) | |
28 | * - \*, \?, \[, \], \\ match the single characters *, ?, [, ], \. | |
29 | * - All other characters are non-special and match themselves. | |
30 | */ | |
31 | ||
32 | /* | |
33 | * Some notes on differences from POSIX globs (IEEE Std 1003.1, 2003 ed.): | |
34 | * - backslashes act as escapes even within [] bracket expressions | |
35 | * - does not support [!...] for non-matching list (POSIX are weird); | |
36 | * NB POSIX allows [^...] as well via "A bracket expression starting | |
37 | * with an unquoted circumflex character produces unspecified | |
38 | * results". If we wanted to allow [!...] we might want to define | |
39 | * [^!] as having its literal meaning (match '^' or '!'). | |
40 | * - none of the scary [[:class:]] stuff, etc | |
41 | */ | |
42 | ||
43 | /* | |
44 | * The wildcard matching technique we use is very simple and | |
45 | * potentially O(N^2) in running time, but I don't anticipate it | |
46 | * being that bad in reality (particularly since N will be the size | |
47 | * of a filename, which isn't all that much). Perhaps one day, once | |
48 | * PuTTY has grown a regexp matcher for some other reason, I might | |
49 | * come back and reimplement wildcards by translating them into | |
50 | * regexps or directly into NFAs; but for the moment, in the | |
51 | * absence of any other need for the NFA->DFA translation engine, | |
52 | * anything more than the simplest possible wildcard matcher is | |
53 | * vast code-size overkill. | |
54 | * | |
55 | * Essentially, these wildcards are much simpler than regexps in | |
56 | * that they consist of a sequence of rigid fragments (? and [...] | |
57 | * can never match more or less than one character) separated by | |
58 | * asterisks. It is therefore extremely simple to look at a rigid | |
59 | * fragment and determine whether or not it begins at a particular | |
60 | * point in the test string; so we can search along the string | |
61 | * until we find each fragment, then search for the next. As long | |
62 | * as we find each fragment in the _first_ place it occurs, there | |
63 | * will never be a danger of having to backpedal and try to find it | |
64 | * again somewhere else. | |
65 | */ | |
66 | ||
67 | enum { | |
68 | WC_TRAILINGBACKSLASH = 1, | |
69 | WC_UNCLOSEDCLASS, | |
70 | WC_INVALIDRANGE | |
71 | }; | |
72 | ||
73 | /* | |
74 | * Error reporting is done by returning various negative values | |
75 | * from the wildcard routines. Passing any such value to wc_error | |
76 | * will give a human-readable message. | |
77 | */ | |
78 | const char *wc_error(int value) | |
79 | { | |
80 | value = abs(value); | |
81 | switch (value) { | |
82 | case WC_TRAILINGBACKSLASH: | |
83 | return "'\' occurred at end of string (expected another character)"; | |
84 | case WC_UNCLOSEDCLASS: | |
85 | return "expected ']' to close character class"; | |
86 | case WC_INVALIDRANGE: | |
87 | return "character range was not terminated (']' just after '-')"; | |
88 | } | |
89 | return "INTERNAL ERROR: unrecognised wildcard error number"; | |
90 | } | |
91 | ||
92 | /* | |
93 | * This is the routine that tests a target string to see if an | |
94 | * initial substring of it matches a fragment. If successful, it | |
95 | * returns 1, and advances both `fragment' and `target' past the | |
96 | * fragment and matching substring respectively. If unsuccessful it | |
97 | * returns zero. If the wildcard fragment suffers a syntax error, | |
98 | * it returns <0 and the precise value indexes into wc_error. | |
99 | */ | |
100 | static int wc_match_fragment(const char **fragment, const char **target) | |
101 | { | |
102 | const char *f, *t; | |
103 | ||
104 | f = *fragment; | |
105 | t = *target; | |
106 | /* | |
107 | * The fragment terminates at either the end of the string, or | |
108 | * the first (unescaped) *. | |
109 | */ | |
110 | while (*f && *f != '*' && *t) { | |
111 | /* | |
112 | * Extract one character from t, and one character's worth | |
113 | * of pattern from f, and step along both. Return 0 if they | |
114 | * fail to match. | |
115 | */ | |
116 | if (*f == '\\') { | |
117 | /* | |
118 | * Backslash, which means f[1] is to be treated as a | |
119 | * literal character no matter what it is. It may not | |
120 | * be the end of the string. | |
121 | */ | |
122 | if (!f[1]) | |
123 | return -WC_TRAILINGBACKSLASH; /* error */ | |
124 | if (f[1] != *t) | |
125 | return 0; /* failed to match */ | |
126 | f += 2; | |
127 | } else if (*f == '?') { | |
128 | /* | |
129 | * Question mark matches anything. | |
130 | */ | |
131 | f++; | |
132 | } else if (*f == '[') { | |
133 | int invert = 0; | |
134 | int matched = 0; | |
135 | /* | |
136 | * Open bracket introduces a character class. | |
137 | */ | |
138 | f++; | |
139 | if (*f == '^') { | |
140 | invert = 1; | |
141 | f++; | |
142 | } | |
143 | while (*f != ']') { | |
144 | if (*f == '\\') | |
145 | f++; /* backslashes still work */ | |
146 | if (!*f) | |
147 | return -WC_UNCLOSEDCLASS; /* error again */ | |
148 | if (f[1] == '-') { | |
149 | int lower, upper, ourchr; | |
150 | lower = (unsigned char) *f++; | |
151 | f++; /* eat the minus */ | |
152 | if (*f == ']') | |
153 | return -WC_INVALIDRANGE; /* different error! */ | |
154 | if (*f == '\\') | |
155 | f++; /* backslashes _still_ work */ | |
156 | if (!*f) | |
157 | return -WC_UNCLOSEDCLASS; /* error again */ | |
158 | upper = (unsigned char) *f++; | |
159 | ourchr = (unsigned char) *t; | |
160 | if (lower > upper) { | |
161 | int t = lower; lower = upper; upper = t; | |
162 | } | |
163 | if (ourchr >= lower && ourchr <= upper) | |
164 | matched = 1; | |
165 | } else { | |
166 | matched |= (*t == *f++); | |
167 | } | |
168 | } | |
169 | if (invert == matched) | |
170 | return 0; /* failed to match character class */ | |
171 | f++; /* eat the ] */ | |
172 | } else { | |
173 | /* | |
174 | * Non-special character matches itself. | |
175 | */ | |
176 | if (*f != *t) | |
177 | return 0; | |
178 | f++; | |
179 | } | |
180 | /* | |
181 | * Now we've done that, increment t past the character we | |
182 | * matched. | |
183 | */ | |
184 | t++; | |
185 | } | |
186 | if (!*f || *f == '*') { | |
187 | /* | |
188 | * We have reached the end of f without finding a mismatch; | |
189 | * so we're done. Update the caller pointers and return 1. | |
190 | */ | |
191 | *fragment = f; | |
192 | *target = t; | |
193 | return 1; | |
194 | } | |
195 | /* | |
196 | * Otherwise, we must have reached the end of t before we | |
197 | * reached the end of f; so we've failed. Return 0. | |
198 | */ | |
199 | return 0; | |
200 | } | |
201 | ||
202 | /* | |
203 | * This is the real wildcard matching routine. It returns 1 for a | |
204 | * successful match, 0 for an unsuccessful match, and <0 for a | |
205 | * syntax error in the wildcard. | |
206 | */ | |
207 | int wc_match(const char *wildcard, const char *target) | |
208 | { | |
209 | int ret; | |
210 | ||
211 | /* | |
212 | * Every time we see a '*' _followed_ by a fragment, we just | |
213 | * search along the string for a location at which the fragment | |
214 | * matches. The only special case is when we see a fragment | |
215 | * right at the start, in which case we just call the matching | |
216 | * routine once and give up if it fails. | |
217 | */ | |
218 | if (*wildcard != '*') { | |
219 | ret = wc_match_fragment(&wildcard, &target); | |
220 | if (ret <= 0) | |
221 | return ret; /* pass back failure or error alike */ | |
222 | } | |
223 | ||
224 | while (*wildcard) { | |
225 | assert(*wildcard == '*'); | |
226 | while (*wildcard == '*') | |
227 | wildcard++; | |
228 | ||
229 | /* | |
230 | * It's possible we've just hit the end of the wildcard | |
231 | * after seeing a *, in which case there's no need to | |
232 | * bother searching any more because we've won. | |
233 | */ | |
234 | if (!*wildcard) | |
235 | return 1; | |
236 | ||
237 | /* | |
238 | * Now `wildcard' points at the next fragment. So we | |
239 | * attempt to match it against `target', and if that fails | |
240 | * we increment `target' and try again, and so on. When we | |
241 | * find we're about to try matching against the empty | |
242 | * string, we give up and return 0. | |
243 | */ | |
244 | ret = 0; | |
245 | while (*target) { | |
246 | const char *save_w = wildcard, *save_t = target; | |
247 | ||
248 | ret = wc_match_fragment(&wildcard, &target); | |
249 | ||
250 | if (ret < 0) | |
251 | return ret; /* syntax error */ | |
252 | ||
253 | if (ret > 0 && !*wildcard && *target) { | |
254 | /* | |
255 | * Final special case - literally. | |
256 | * | |
257 | * This situation arises when we are matching a | |
258 | * _terminal_ fragment of the wildcard (that is, | |
259 | * there is nothing after it, e.g. "*a"), and it | |
260 | * has matched _too early_. For example, matching | |
261 | * "*a" against "parka" will match the "a" fragment | |
262 | * against the _first_ a, and then (if it weren't | |
263 | * for this special case) matching would fail | |
264 | * because we're at the end of the wildcard but not | |
265 | * at the end of the target string. | |
266 | * | |
267 | * In this case what we must do is measure the | |
268 | * length of the fragment in the target (which is | |
269 | * why we saved `target'), jump straight to that | |
270 | * distance from the end of the string using | |
271 | * strlen, and match the same fragment again there | |
272 | * (which is why we saved `wildcard'). Then we | |
273 | * return whatever that operation returns. | |
274 | */ | |
275 | target = save_t + strlen(save_t) - (target - save_t); | |
276 | wildcard = save_w; | |
277 | return wc_match_fragment(&wildcard, &target); | |
278 | } | |
279 | ||
280 | if (ret > 0) | |
281 | break; | |
282 | target++; | |
283 | } | |
284 | if (ret > 0) | |
285 | continue; | |
286 | return 0; | |
287 | } | |
288 | ||
289 | /* | |
290 | * If we reach here, it must be because we successfully matched | |
291 | * a fragment and then found ourselves right at the end of the | |
292 | * wildcard. Hence, we return 1 if and only if we are also | |
293 | * right at the end of the target. | |
294 | */ | |
295 | return (*target ? 0 : 1); | |
296 | } | |
297 | ||
298 | /* | |
299 | * Another utility routine that translates a non-wildcard string | |
300 | * into its raw equivalent by removing any escaping backslashes. | |
301 | * Expects a target string buffer of anything up to the length of | |
302 | * the original wildcard. You can also pass NULL as the output | |
303 | * buffer if you're only interested in the return value. | |
304 | * | |
305 | * Returns 1 on success, or 0 if a wildcard character was | |
306 | * encountered. In the latter case the output string MAY not be | |
307 | * zero-terminated and you should not use it for anything! | |
308 | */ | |
309 | int wc_unescape(char *output, const char *wildcard) | |
310 | { | |
311 | while (*wildcard) { | |
312 | if (*wildcard == '\\') { | |
313 | wildcard++; | |
314 | /* We are lenient about trailing backslashes in non-wildcards. */ | |
315 | if (*wildcard) { | |
316 | if (output) | |
317 | *output++ = *wildcard; | |
318 | wildcard++; | |
319 | } | |
320 | } else if (*wildcard == '*' || *wildcard == '?' || | |
321 | *wildcard == '[' || *wildcard == ']') { | |
322 | return 0; /* it's a wildcard! */ | |
323 | } else { | |
324 | if (output) | |
325 | *output++ = *wildcard; | |
326 | wildcard++; | |
327 | } | |
328 | } | |
329 | *output = '\0'; | |
330 | return 1; /* it's clean */ | |
331 | } | |
332 | ||
333 | #ifdef TESTMODE | |
334 | ||
335 | struct test { | |
336 | const char *wildcard; | |
337 | const char *target; | |
338 | int expected_result; | |
339 | }; | |
340 | ||
341 | const struct test fragment_tests[] = { | |
342 | /* | |
343 | * We exhaustively unit-test the fragment matching routine | |
344 | * itself, which should save us the need to test all its | |
345 | * intricacies during the full wildcard tests. | |
346 | */ | |
347 | {"abc", "abc", 1}, | |
348 | {"abc", "abd", 0}, | |
349 | {"abc", "abcd", 1}, | |
350 | {"abcd", "abc", 0}, | |
351 | {"ab[cd]", "abc", 1}, | |
352 | {"ab[cd]", "abd", 1}, | |
353 | {"ab[cd]", "abe", 0}, | |
354 | {"ab[^cd]", "abc", 0}, | |
355 | {"ab[^cd]", "abd", 0}, | |
356 | {"ab[^cd]", "abe", 1}, | |
357 | {"ab\\", "abc", -WC_TRAILINGBACKSLASH}, | |
358 | {"ab\\*", "ab*", 1}, | |
359 | {"ab\\?", "ab*", 0}, | |
360 | {"ab?", "abc", 1}, | |
361 | {"ab?", "ab", 0}, | |
362 | {"ab[", "abc", -WC_UNCLOSEDCLASS}, | |
363 | {"ab[c-", "abb", -WC_UNCLOSEDCLASS}, | |
364 | {"ab[c-]", "abb", -WC_INVALIDRANGE}, | |
365 | {"ab[c-e]", "abb", 0}, | |
366 | {"ab[c-e]", "abc", 1}, | |
367 | {"ab[c-e]", "abd", 1}, | |
368 | {"ab[c-e]", "abe", 1}, | |
369 | {"ab[c-e]", "abf", 0}, | |
370 | {"ab[e-c]", "abb", 0}, | |
371 | {"ab[e-c]", "abc", 1}, | |
372 | {"ab[e-c]", "abd", 1}, | |
373 | {"ab[e-c]", "abe", 1}, | |
374 | {"ab[e-c]", "abf", 0}, | |
375 | {"ab[^c-e]", "abb", 1}, | |
376 | {"ab[^c-e]", "abc", 0}, | |
377 | {"ab[^c-e]", "abd", 0}, | |
378 | {"ab[^c-e]", "abe", 0}, | |
379 | {"ab[^c-e]", "abf", 1}, | |
380 | {"ab[^e-c]", "abb", 1}, | |
381 | {"ab[^e-c]", "abc", 0}, | |
382 | {"ab[^e-c]", "abd", 0}, | |
383 | {"ab[^e-c]", "abe", 0}, | |
384 | {"ab[^e-c]", "abf", 1}, | |
385 | {"ab[a^]", "aba", 1}, | |
386 | {"ab[a^]", "ab^", 1}, | |
387 | {"ab[a^]", "abb", 0}, | |
388 | {"ab[^a^]", "aba", 0}, | |
389 | {"ab[^a^]", "ab^", 0}, | |
390 | {"ab[^a^]", "abb", 1}, | |
391 | {"ab[-c]", "ab-", 1}, | |
392 | {"ab[-c]", "abc", 1}, | |
393 | {"ab[-c]", "abd", 0}, | |
394 | {"ab[^-c]", "ab-", 0}, | |
395 | {"ab[^-c]", "abc", 0}, | |
396 | {"ab[^-c]", "abd", 1}, | |
397 | {"ab[\\[-\\]]", "abZ", 0}, | |
398 | {"ab[\\[-\\]]", "ab[", 1}, | |
399 | {"ab[\\[-\\]]", "ab\\", 1}, | |
400 | {"ab[\\[-\\]]", "ab]", 1}, | |
401 | {"ab[\\[-\\]]", "ab^", 0}, | |
402 | {"ab[^\\[-\\]]", "abZ", 1}, | |
403 | {"ab[^\\[-\\]]", "ab[", 0}, | |
404 | {"ab[^\\[-\\]]", "ab\\", 0}, | |
405 | {"ab[^\\[-\\]]", "ab]", 0}, | |
406 | {"ab[^\\[-\\]]", "ab^", 1}, | |
407 | {"ab[a-fA-F]", "aba", 1}, | |
408 | {"ab[a-fA-F]", "abF", 1}, | |
409 | {"ab[a-fA-F]", "abZ", 0}, | |
410 | }; | |
411 | ||
412 | const struct test full_tests[] = { | |
413 | {"a", "argh", 0}, | |
414 | {"a", "ba", 0}, | |
415 | {"a", "a", 1}, | |
416 | {"a*", "aardvark", 1}, | |
417 | {"a*", "badger", 0}, | |
418 | {"*a", "park", 0}, | |
419 | {"*a", "pArka", 1}, | |
420 | {"*a", "parka", 1}, | |
421 | {"*a*", "park", 1}, | |
422 | {"*a*", "perk", 0}, | |
423 | {"?b*r?", "abracadabra", 1}, | |
424 | {"?b*r?", "abracadabr", 0}, | |
425 | {"?b*r?", "abracadabzr", 0}, | |
426 | }; | |
427 | ||
428 | int main(void) | |
429 | { | |
430 | int i; | |
431 | int fails, passes; | |
432 | ||
433 | fails = passes = 0; | |
434 | ||
435 | for (i = 0; i < sizeof(fragment_tests)/sizeof(*fragment_tests); i++) { | |
436 | const char *f, *t; | |
437 | int eret, aret; | |
438 | f = fragment_tests[i].wildcard; | |
439 | t = fragment_tests[i].target; | |
440 | eret = fragment_tests[i].expected_result; | |
441 | aret = wc_match_fragment(&f, &t); | |
442 | if (aret != eret) { | |
443 | printf("failed test: /%s/ against /%s/ returned %d not %d\n", | |
444 | fragment_tests[i].wildcard, fragment_tests[i].target, | |
445 | aret, eret); | |
446 | fails++; | |
447 | } else | |
448 | passes++; | |
449 | } | |
450 | ||
451 | for (i = 0; i < sizeof(full_tests)/sizeof(*full_tests); i++) { | |
452 | const char *f, *t; | |
453 | int eret, aret; | |
454 | f = full_tests[i].wildcard; | |
455 | t = full_tests[i].target; | |
456 | eret = full_tests[i].expected_result; | |
457 | aret = wc_match(f, t); | |
458 | if (aret != eret) { | |
459 | printf("failed test: /%s/ against /%s/ returned %d not %d\n", | |
460 | full_tests[i].wildcard, full_tests[i].target, | |
461 | aret, eret); | |
462 | fails++; | |
463 | } else | |
464 | passes++; | |
465 | } | |
466 | ||
467 | printf("passed %d, failed %d\n", passes, fails); | |
468 | ||
469 | return 0; | |
470 | } | |
471 | ||
472 | #endif |