/* Regular expression tests.
   Copyright (C) 2003-2018 Free Software Foundation, Inc.
   This file is part of the GNU C Library.
   Contributed by Jakub Jelinek <jakub@redhat.com>, 2003.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <http://www.gnu.org/licenses/>.  */

#include <sys/types.h>
#include <errno.h>
#include <getopt.h>
#include <locale.h>
#include <mcheck.h>
#include <regex.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
28 | ||
/* Expand the test file's one-letter escapes in STR in place:
   'N' becomes a newline, 'T' a tab, 'S' a space and 'Z' a NUL byte.
   Scanning resumes at the byte after each replacement, so letters that
   follow a 'Z' (and are therefore behind the new terminator) are still
   rewritten as long as they lie before the original terminator.  */
static void
replace_special_chars (char *str)
{
  for (; (str = strpbrk (str, "NTSZ")) != NULL; ++str)
    switch (*str)
      {
      case 'N': *str = '\n'; break;
      case 'T': *str = '\t'; break;
      case 'S': *str = ' '; break;
      case 'Z': *str = '\0'; break;
      }
}
41 | ||
/* Rewrite the word-boundary syntax used by the test file into the
   syntax glibc understands, editing STR in place: every "[[:<:]]"
   becomes "\<" and every "[[:>:]]" becomes "\>".  The result is never
   longer than the input.  */
static void
glibc_re_syntax (char *str)
{
  /* END points one past the terminating NUL so that the memmove below
     carries the terminator along with the tail.  */
  char *p, *end = strchr (str, '\0') + 1;

  /* Replace [[:<:]] with \< and [[:>:]] with \>.  */
  for (p = str; (p = strstr (p, "[[:")) != NULL; )
    if ((p[3] == '<' || p[3] == '>') && strncmp (p + 4, ":]]", 3) == 0)
      {
        p[0] = '\\';
        p[1] = p[3];
        /* Seven bytes were replaced by two: close the five-byte gap.  */
        memmove (p + 2, p + 7, end - p - 7);
        end -= 5;
        p += 2;
      }
    else
      /* Some other bracket construct; step over the "[[:" prefix.  */
      p += 3;
}
60 | ||
/* Store at DST the two-byte UTF-8 sequence that stands in for the
   ASCII letter C and return the position just past what was written.
   Only a-d and A-D have a replacement; for any other C nothing is
   written and DST is returned unchanged.  */
static char *
mb_replace (char *dst, const char c)
{
  switch (c)
    {
    /* Replace a with \'a and A with \'A.  */
    case 'a':
      *dst++ = '\xc3';
      *dst++ = '\xa1';
      break;
    case 'A':
      *dst++ = '\xc3';
      *dst++ = '\x81';
      break;
    /* Replace b with \v{c} and B with \v{C}.  */
    case 'b':
      *dst++ = '\xc4';
      *dst++ = '\x8d';
      break;
    case 'B':
      *dst++ = '\xc4';
      *dst++ = '\x8c';
      break;
    /* Replace c with \v{d} and C with \v{D}.  */
    case 'c':
      *dst++ = '\xc4';
      *dst++ = '\x8f';
      break;
    case 'C':
      *dst++ = '\xc4';
      *dst++ = '\x8e';
      break;
    /* Replace d with \'e and D with \'E.  */
    case 'd':
      *dst++ = '\xc3';
      *dst++ = '\xa9';
      break;
    case 'D':
      *dst++ = '\xc3';
      *dst++ = '\x89';
      break;
    default:
      /* No replacement defined: emit nothing.  */
      break;
    }
  return dst;
}
105 | ||
/* Return a newly malloc'ed copy of STR in which every character that
   occurs in LETTERS has been replaced by its multibyte stand-in (see
   mb_replace).  Returns NULL if STR is NULL or if allocation fails.
   The caller owns the result and must free it.  */
static char *
mb_frob_string (const char *str, const char *letters)
{
  char *ret, *dst;
  const char *src;

  if (str == NULL)
    return NULL;

  /* Each input byte expands to at most two output bytes.  */
  ret = malloc (2 * strlen (str) + 1);
  if (ret == NULL)
    return NULL;

  for (src = str, dst = ret; *src; ++src)
    if (strchr (letters, *src))
      dst = mb_replace (dst, *src);
    else
      *dst++ = *src;
  *dst = '\0';
  return ret;
}
127 | ||
/* Like mb_frob_string, but don't replace anything between
   [: and :], [. and .] or [= and =] or characters escaped
   with a backslash.

   Returns a newly malloc'ed string owned by the caller, or NULL if STR
   is NULL or if allocation fails.  */
static char *
mb_frob_pattern (const char *str, const char *letters)
{
  char *ret, *dst;
  const char *src;
  int in_class = 0, escaped = 0;

  if (str == NULL)
    return NULL;

  /* Each input byte expands to at most two output bytes.  */
  ret = malloc (2 * strlen (str) + 1);
  if (ret == NULL)
    return NULL;

  for (src = str, dst = ret; *src; ++src)
    if (*src == '\\')
      {
        /* Toggle rather than set, so that "\\" escapes the backslash
           itself and leaves the following character unescaped.  */
        escaped ^= 1;
        *dst++ = *src;
      }
    else if (escaped)
      {
        /* The character after a backslash is copied verbatim.  */
        escaped = 0;
        *dst++ = *src;
      }
    else if (!in_class && strchr (letters, *src))
      dst = mb_replace (dst, *src);
    else
      {
        /* Track entry into and exit from [: :], [. .] and [= =].
           src[-1] is only read while in_class is set, which requires
           at least one earlier character.  */
        if (!in_class && *src == '[' && strchr (":.=", src[1]))
          in_class = 1;
        else if (in_class && *src == ']' && strchr (":.=", src[-1]))
          in_class = 0;
        *dst++ = *src;
      }
  *dst = '\0';
  return ret;
}
171 | ||
/* Compare the match recorded in RM[IDX] for subject STRING against the
   expectation MATCH and return 0 if they agree, 1 otherwise.  On a
   mismatch a diagnostic prefixed with FAIL is printed to stdout.

   MATCH is one of:
     "-"      the subexpression must not have participated in the match;
     "@TEXT"  the match must be empty and STRING at that offset must
              start with TEXT (a bare "@" requires the end of STRING);
     TEXT     the matched substring must equal TEXT exactly.  */
static int
check_match (regmatch_t *rm, int idx, const char *string,
             const char *match, const char *fail)
{
  if (match[0] == '-' && match[1] == '\0')
    {
      if (rm[idx].rm_so == -1 && rm[idx].rm_eo == -1)
        return 0;
      printf ("%s rm[%d] unexpectedly matched\n", fail, idx);
      return 1;
    }

  if (rm[idx].rm_so == -1 || rm[idx].rm_eo == -1)
    {
      printf ("%s rm[%d] unexpectedly did not match\n", fail, idx);
      return 1;
    }

  if (match[0] == '@')
    {
      size_t follow_len;

      if (rm[idx].rm_so != rm[idx].rm_eo)
        {
          printf ("%s rm[%d] not empty\n", fail, idx);
          return 1;
        }

      /* With no text after '@', compare one byte so that the check
         only succeeds when the match sits at the terminating NUL.  */
      follow_len = strlen (match + 1);
      if (follow_len == 0)
        follow_len = 1;

      if (strncmp (string + rm[idx].rm_so, match + 1, follow_len))
        {
          printf ("%s rm[%d] not matching %s\n", fail, idx, match);
          return 1;
        }
      return 0;
    }

  if ((size_t) (rm[idx].rm_eo - rm[idx].rm_so) != strlen (match)
      || strncmp (string + rm[idx].rm_so, match,
                  rm[idx].rm_eo - rm[idx].rm_so))
    {
      printf ("%s rm[%d] not matching %s\n", fail, idx, match);
      return 1;
    }

  return 0;
}
216 | ||
/* Run one test case: compile PATTERN with CFLAGS, execute it on STRING
   with EFLAGS and compare the outcome with the expectation.
   Diagnostics are printed to stdout prefixed with FAIL.

   EFLAGS == -1 marks a case where regcomp is expected to fail; STRING
   then holds the expected error name without the "REG_" prefix.
   EXPECT is the expected overall match in check_match syntax, or NULL
   if regexec is expected to fail.  MATCHES, if not NULL, is a
   comma-separated list of expectations for subexpressions 1 to 9; it
   is modified temporarily while being split and restored afterwards.

   Returns 0 if everything is as expected, 1 otherwise.  */
static int
test (const char *pattern, int cflags, const char *string, int eflags,
      char *expect, char *matches, const char *fail)
{
  regex_t re;
  regmatch_t rm[10];
  int n, ret = 0;

  n = regcomp (&re, pattern, cflags);
  if (n != 0)
    {
      char buf[500];
      if (eflags == -1)
        {
          static struct { reg_errcode_t code; const char *name; } codes []
#define C(x) { REG_##x, #x }
            = { C(NOERROR), C(NOMATCH), C(BADPAT), C(ECOLLATE),
                C(ECTYPE), C(EESCAPE), C(ESUBREG), C(EBRACK),
                C(EPAREN), C(EBRACE), C(BADBR), C(ERANGE),
                C(ESPACE), C(BADRPT) };
#undef C

          for (size_t i = 0; i < sizeof (codes) / sizeof (codes[0]); ++i)
            if (n == codes[i].code)
              {
                if (strcmp (string, codes[i].name))
                  {
                    printf ("%s regcomp returned REG_%s (expected REG_%s)\n",
                            fail, codes[i].name, string);
                    return 1;
                  }
                return 0;
              }

          printf ("%s regcomp return value REG_%d\n", fail, n);
          return 1;
        }

      regerror (n, &re, buf, sizeof (buf));
      printf ("%s regcomp failed: %s\n", fail, buf);
      return 1;
    }

  if (eflags == -1)
    {
      regfree (&re);

      /* The test case file assumes something only guaranteed by the
         rxspencer regex implementation.  Namely that for empty
         expressions regcomp() return REG_EMPTY.  This is not the case
         for us and so we ignore this error.  */
      if (strcmp (string, "EMPTY") == 0)
        return 0;

      printf ("%s regcomp unexpectedly succeeded\n", fail);
      return 1;
    }

  if (regexec (&re, string, 10, rm, eflags))
    {
      regfree (&re);
      if (expect == NULL)
        return 0;
      printf ("%s regexec failed\n", fail);
      return 1;
    }

  /* RM has been filled in; the compiled pattern is no longer needed.  */
  regfree (&re);

  if (expect == NULL)
    {
      printf ("%s regexec unexpectedly succeeded\n", fail);
      return 1;
    }

  /* With REG_NOSUB no match offsets are reported, so nothing to check.  */
  if (cflags & REG_NOSUB)
    return 0;

  ret = check_match (rm, 0, string, expect, fail);
  if (matches == NULL)
    return ret;

  for (n = 1; ret == 0 && n < 10; ++n)
    {
      char *p = NULL;

      if (matches)
        {
          /* Temporarily terminate the current list element.  */
          p = strchr (matches, ',');
          if (p != NULL)
            *p = '\0';
        }
      /* Once the list is exhausted the remaining subexpressions are
         expected not to have matched.  */
      ret = check_match (rm, n, string, matches ? matches : "-", fail);
      if (p)
        {
          *p = ',';
          matches = p + 1;
        }
      else
        matches = NULL;
    }

  return ret;
}
320 | ||
/* Run one test case after substituting multibyte characters for the
   ASCII letters listed in LETTERS in PATTERN, STRING, EXPECT and
   MATCHES.  When EFLAGS is -1 STRING is an expected error name rather
   than subject text and is passed through unchanged.  Diagnostics are
   prefixed with FAIL.  Returns 0 on success, 1 on test failure or if a
   converted string could not be allocated.  */
static int
mb_test (const char *pattern, int cflags, const char *string, int eflags,
         char *expect, const char *matches, const char *letters,
         const char *fail)
{
  char *pattern_mb = mb_frob_pattern (pattern, letters);
  const char *string_mb
    = eflags == -1 ? string : mb_frob_string (string, letters);
  char *expect_mb = mb_frob_string (expect, letters);
  char *matches_mb = mb_frob_string (matches, letters);
  int ret = 0;

  /* EXPECT and MATCHES may legitimately be NULL, in which case their
     converted counterparts are NULL too; only treat NULL as an error
     when there was something to convert.  */
  if (!pattern_mb || !string_mb
      || (expect && !expect_mb) || (matches && !matches_mb))
    {
      printf ("%s %s\n", fail, strerror (errno));
      ret = 1;
    }
  else
    ret = test (pattern_mb, cflags, string_mb, eflags, expect_mb,
                matches_mb, fail);

  free (matches_mb);
  free (expect_mb);
  /* STRING_MB aliases STRING in the regcomp-error case; don't free it.  */
  if (string_mb != string)
    free ((char *) string_mb);
  free (pattern_mb);
  return ret;
}
350 | ||
/* Run the multibyte variants of one test case.  Every non-empty subset
   of the letter pairs a/A, b/B, c/C and d/D is tried, except subsets
   containing a pair of which neither letter occurs in PATTERN or
   STRING.  Returns 0 if all variants that were run passed, non-zero
   otherwise.  */
static int
mb_tests (const char *pattern, int cflags, const char *string, int eflags,
          char *expect, const char *matches)
{
  static const char pairs[4][2]
    = { { 'a', 'A' }, { 'b', 'B' }, { 'c', 'C' }, { 'd', 'D' } };
  int ret = 0;
  int i;
  char letters[9], fail[20];

  /* The tests aren't supposed to work with xdigit, since a-dA-D are
     hex digits while \'a \'A \v{c}\v{C}\v{d}\v{D}\'e \'E are not.  */
  if (strstr (pattern, "[:xdigit:]"))
    return 0;

  /* XXX: regex ATM handles only single byte equivalence classes.  */
  if (strstr (pattern, "[[=b=]]"))
    return 0;

  /* Bit N of I selects pairs[N].  */
  for (i = 1; i < 16; ++i)
    {
      char *p = letters;
      int bit, usable = 1;

      for (bit = 0; bit < 4; ++bit)
        {
          const char lower = pairs[bit][0], upper = pairs[bit][1];

          if ((i & (1 << bit)) == 0)
            continue;

          /* Skip the whole subset if a selected pair is unused.  */
          if (!strchr (pattern, lower) && !strchr (string, lower)
              && !strchr (pattern, upper) && !strchr (string, upper))
            {
              usable = 0;
              break;
            }
          *p++ = lower;
          *p++ = upper;
        }
      if (!usable)
        continue;

      *p = '\0';
      snprintf (fail, sizeof (fail), "UTF-8 %s FAIL", letters);
      ret |= mb_test (pattern, cflags, string, eflags, expect, matches,
                      letters, fail);
    }
  return ret;
}
406 | ||
/* Test driver.  Usage: PROGRAM [--utf8] TESTFILE

   Each line of TESTFILE that is neither empty nor a '#' comment holds
   tab-separated fields: pattern, flags, subject string and, optionally,
   the expected match and a comma-separated list of expected
   subexpression matches.  Every case is run in the C locale; with
   --utf8 it is additionally run in cs_CZ.UTF-8, both unmodified and
   with multibyte characters substituted.  Returns 0 if all cases
   passed, 1 otherwise.  */
int
main (int argc, char **argv)
{
  int ret = 0;
  char *line = NULL;
  size_t line_len = 0;
  ssize_t len;
  FILE *f;
  static int test_utf8 = 0;
  static const struct option options[] =
    {
      {"utf8", no_argument, &test_utf8, 1},
      {NULL, 0, NULL, 0 }
    };

  mtrace ();

  /* The only option just sets a flag, so there is nothing to do per
     option.  */
  while (getopt_long (argc, argv, "", options, NULL) >= 0)
    continue;

  if (optind + 1 != argc)
    {
      fprintf (stderr, "Missing test filename\n");
      return 1;
    }

  f = fopen (argv[optind], "r");
  if (f == NULL)
    {
      fprintf (stderr, "Couldn't open %s\n", argv[optind]);
      return 1;
    }

  while ((len = getline (&line, &line_len, f)) > 0)
    {
      char *pattern, *flagstr, *string, *expect, *matches, *p;
      int cflags = REG_EXTENDED, eflags = 0, try_bre_ere = 0;

      if (line[len - 1] == '\n')
        line[len - 1] = '\0';

      /* Skip comments and empty lines.  */
      if (*line == '#' || *line == '\0')
        continue;

      /* Echo the case before strtok starts cutting the line up.  */
      puts (line);
      fflush (stdout);

      pattern = strtok (line, "\t");
      if (pattern == NULL)
        continue;

      /* A literal "" in the file denotes the empty pattern.  */
      if (strcmp (pattern, "\"\"") == 0)
        pattern += 2;

      flagstr = strtok (NULL, "\t");
      if (flagstr == NULL)
        continue;

      string = strtok (NULL, "\t");
      if (string == NULL)
        continue;

      /* Likewise for the empty subject string.  */
      if (strcmp (string, "\"\"") == 0)
        string += 2;

      for (p = flagstr; *p; ++p)
        switch (*p)
          {
          case '-':
            break;
          case 'b':
            cflags &= ~REG_EXTENDED;
            break;
          case '&':
            try_bre_ere = 1;
            break;
          case 'C':
            /* regcomp is expected to fail; see test.  */
            eflags = -1;
            break;
          case 'i':
            cflags |= REG_ICASE;
            break;
          case 's':
            cflags |= REG_NOSUB;
            break;
          case 'n':
            cflags |= REG_NEWLINE;
            break;
          case '^':
            eflags |= REG_NOTBOL;
            break;
          case '$':
            eflags |= REG_NOTEOL;
            break;
          case 'm':
          case 'p':
          case '#':
            /* Not supported.  */
            flagstr = NULL;
            break;
          default:
            /* Unknown flag characters are ignored.  */
            break;
          }

      if (flagstr == NULL)
        continue;

      replace_special_chars (pattern);
      glibc_re_syntax (pattern);
      /* For regcomp-error cases STRING is an error name, not text.  */
      if (eflags != -1)
        replace_special_chars (string);

      expect = strtok (NULL, "\t");
      matches = NULL;
      if (expect != NULL)
        {
          replace_special_chars (expect);
          matches = strtok (NULL, "\t");
          if (matches != NULL)
            replace_special_chars (matches);
        }

      /* A previous iteration may have switched to the UTF-8 locale.  */
      if (setlocale (LC_ALL, "C") == NULL)
        {
          puts ("setlocale C failed");
          ret = 1;
        }
      if (test (pattern, cflags, string, eflags, expect, matches, "FAIL")
          || (try_bre_ere
              && test (pattern, cflags & ~REG_EXTENDED, string, eflags,
                       expect, matches, "FAIL")))
        ret = 1;
      else if (test_utf8)
        {
          if (setlocale (LC_ALL, "cs_CZ.UTF-8") == NULL)
            {
              puts ("setlocale cs_CZ.UTF-8 failed");
              ret = 1;
            }
          else if (test (pattern, cflags, string, eflags, expect, matches,
                         "UTF-8 FAIL")
                   || (try_bre_ere
                       && test (pattern, cflags & ~REG_EXTENDED, string,
                                eflags, expect, matches, "UTF-8 FAIL")))
            ret = 1;
          else if (mb_tests (pattern, cflags, string, eflags, expect, matches)
                   || (try_bre_ere
                       && mb_tests (pattern, cflags & ~REG_EXTENDED, string,
                                    eflags, expect, matches)))
            ret = 1;
        }
    }

  free (line);
  fclose (f);
  return ret;
}