]> git.ipfire.org Git - thirdparty/bash.git/blob - lib/glob/smatch.c
379c2d2e62d5e8452706a1b5a4ba5d5f364bf5fc
[thirdparty/bash.git] / lib / glob / smatch.c
1 /* strmatch.c -- ksh-like extended pattern matching for the shell and filename
2 globbing. */
3
4 /* Copyright (C) 1991-2021 Free Software Foundation, Inc.
5
6 This file is part of GNU Bash, the Bourne Again SHell.
7
8 Bash is free software: you can redistribute it and/or modify
9 it under the terms of the GNU General Public License as published by
10 the Free Software Foundation, either version 3 of the License, or
11 (at your option) any later version.
12
13 Bash is distributed in the hope that it will be useful,
14 but WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 GNU General Public License for more details.
17
18 You should have received a copy of the GNU General Public License
19 along with Bash. If not, see <http://www.gnu.org/licenses/>.
20 */
21
22 #include <config.h>
23
24 #include <stdio.h> /* for debugging */
25
26 #include "strmatch.h"
27 #include <chartypes.h>
28
29 #include "bashansi.h"
30 #include "shmbutil.h"
31 #include "xmalloc.h"
32
33 #include <errno.h>
34
35 #if !defined (errno)
36 extern int errno;
37 #endif
38
39 #if FNMATCH_EQUIV_FALLBACK
40 /* We don't include <fnmatch.h> in order to avoid namespace collisions; the
41 internal strmatch still uses the FNM_ constants. */
42 extern int fnmatch (const char *, const char *, int);
43 #endif
44
45 /* First, compile `sm_loop.c' for single-byte characters. */
46 #define CHAR unsigned char
47 #define U_CHAR unsigned char
48 #define XCHAR char
49 #define INT int
50 #define L(CS) CS
51 #define INVALID -1
52
53 #undef STREQ
54 #undef STREQN
55 #define STREQ(a, b) ((a)[0] == (b)[0] && strcmp(a, b) == 0)
56 #define STREQN(a, b, n) ((a)[0] == (b)[0] && strncmp(a, b, n) == 0)
57
58 #ifndef GLOBASCII_DEFAULT
59 # define GLOBASCII_DEFAULT 0
60 #endif
61
62 int glob_asciirange = GLOBASCII_DEFAULT;
63
64 #if FNMATCH_EQUIV_FALLBACK
/* Build the one-character string "S" and the bracket expression "[[=P=]]"
   and hand both to fnmatch(3), which decides whether the single-byte
   character S is a member of the equivalence class named by P.  The value
   returned is fnmatch's own: 0 on a match, non-zero otherwise.  There is no
   other portable way to ask this question. */
static int
_fnmatch_fallback (s, p)
     int s, p;                  /* string char, patchar */
{
  char subject[2];              /* the string handed to fnmatch */
  char equivpat[8];             /* "[[=" + P + "=]]" + NUL */
  char *ep;

  subject[0] = (unsigned char)s;
  subject[1] = '\0';

  /* Assemble the pattern one byte at a time. */
  ep = equivpat;
  *ep++ = '[';
  *ep++ = '[';
  *ep++ = '=';
  *ep++ = (unsigned char)p;
  *ep++ = '=';
  *ep++ = ']';
  *ep++ = ']';
  *ep = '\0';

  return (fnmatch ((const char *)equivpat, (const char *)subject, 0));
}
88 #endif
89
90 /* We use strcoll(3) for range comparisons in bracket expressions,
91 even though it can have unwanted side effects in locales
92 other than POSIX or US. For instance, in the de locale, [A-Z] matches
93 all characters. If GLOB_ASCIIRANGE is non-zero, and we're not forcing
94 the use of strcoll (e.g., for explicit collating symbols), we use
95 straight ordering as if in the C locale. */
96
97 #if defined (HAVE_STRCOLL)
98 /* Helper functions for collating symbol equivalence. */
99
100 /* Return 0 if C1 == C2 or collates equally if FORCECOLL is non-zero. */
101 static int
102 charcmp (c1, c2, forcecoll)
103 int c1, c2;
104 int forcecoll;
105 {
106 static char s1[2] = { ' ', '\0' };
107 static char s2[2] = { ' ', '\0' };
108 int ret;
109
110 /* Eight bits only. Period. */
111 c1 &= 0xFF;
112 c2 &= 0xFF;
113
114 if (c1 == c2)
115 return (0);
116
117 if (forcecoll == 0 && glob_asciirange)
118 return (c1 - c2);
119
120 s1[0] = c1;
121 s2[0] = c2;
122
123 return (strcoll (s1, s2));
124 }
125
/* Ordering function for the endpoints of a bracket-expression range.
   Returns whatever charcmp returns when that is non-zero.  When charcmp
   returns 0, returns C1 - C2 instead, so that two different characters
   that collate equally still have a definite order (a total ordering). */
static int
rangecmp (c1, c2, forcecoll)
     int c1, c2;
     int forcecoll;
{
  int order;

  order = charcmp (c1, c2, forcecoll);
  return (order ? order : (c1 - c2));
}
140 #else /* !HAVE_STRCOLL */
141 # define rangecmp(c1, c2, f) ((int)(c1) - (int)(c2))
142 #endif /* !HAVE_STRCOLL */
143
144 #if defined (HAVE_STRCOLL)
/* Return 1 if the characters C and EQUIV collate equally in the current
   locale, 0 otherwise.  charcmp is asked first with collation forced; when
   FNMATCH_EQUIV_FALLBACK is enabled and charcmp says the two differ,
   _fnmatch_fallback gets a second opinion. */
static int
collequiv (c, equiv)
     int c, equiv;
{
  int same;

  same = (charcmp (c, equiv, 1) == 0);

#if FNMATCH_EQUIV_FALLBACK
  if (same == 0)
    same = (_fnmatch_fallback (c, equiv) == 0);
#endif

  return same;
}
160 #else
161 # define collequiv(c, equiv) ((c) == (equiv))
162 #endif
163
164 #define _COLLSYM _collsym
165 #define __COLLSYM __collsym
166 #define POSIXCOLL posix_collsyms
167 #include "collsyms.h"
168
169 static int
170 collsym (s, len)
171 CHAR *s;
172 int len;
173 {
174 register struct _collsym *csp;
175 char *x;
176
177 x = (char *)s;
178 for (csp = posix_collsyms; csp->name; csp++)
179 {
180 if (STREQN(csp->name, x, len) && csp->name[len] == '\0')
181 return (csp->code);
182 }
183 if (len == 1)
184 return s[0];
185 return INVALID;
186 }
187
188 /* unibyte character classification */
189 #if !defined (isascii) && !defined (HAVE_ISASCII)
190 # define isascii(c) ((unsigned int)(c) <= 0177)
191 #endif
192
/* Identifiers for the character classes accepted in [:class:] expressions.
   CC_NO_CLASS (0) means `not a recognized class name'.  The enumerators
   must stay in the same order as the strings in cclass_name[] below, since
   an enumerator's value is used as the index of its name. */
enum char_class
  {
    CC_NO_CLASS = 0,
    CC_ASCII,
    CC_ALNUM,
    CC_ALPHA,
    CC_BLANK,
    CC_CNTRL,
    CC_DIGIT,
    CC_GRAPH,
    CC_LOWER,
    CC_PRINT,
    CC_PUNCT,
    CC_SPACE,
    CC_UPPER,
    CC_WORD,
    CC_XDIGIT
  };

/* Class names, indexed by enum char_class.  Slot 0 is a placeholder for
   CC_NO_CLASS. */
static char const *const cclass_name[] =
  {
    "",         /* CC_NO_CLASS */
    "ascii",    /* CC_ASCII */
    "alnum",    /* CC_ALNUM */
    "alpha",    /* CC_ALPHA */
    "blank",    /* CC_BLANK */
    "cntrl",    /* CC_CNTRL */
    "digit",    /* CC_DIGIT */
    "graph",    /* CC_GRAPH */
    "lower",    /* CC_LOWER */
    "print",    /* CC_PRINT */
    "punct",    /* CC_PUNCT */
    "space",    /* CC_SPACE */
    "upper",    /* CC_UPPER */
    "word",     /* CC_WORD */
    "xdigit"    /* CC_XDIGIT */
  };

/* Number of entries in cclass_name[], placeholder included. */
#define N_CHAR_CLASS (sizeof(cclass_name) / sizeof (cclass_name[0]))
208
209 static enum char_class
210 is_valid_cclass (name)
211 const char *name;
212 {
213 enum char_class ret;
214 int i;
215
216 ret = CC_NO_CLASS;
217
218 for (i = 1; i < N_CHAR_CLASS; i++)
219 {
220 if (STREQ (name, cclass_name[i]))
221 {
222 ret = (enum char_class)i;
223 break;
224 }
225 }
226
227 return ret;
228 }
229
230 static int
231 cclass_test (c, char_class)
232 int c;
233 enum char_class char_class;
234 {
235 int result;
236
237 switch (char_class)
238 {
239 case CC_ASCII:
240 result = isascii (c);
241 break;
242 case CC_ALNUM:
243 result = ISALNUM (c);
244 break;
245 case CC_ALPHA:
246 result = ISALPHA (c);
247 break;
248 case CC_BLANK:
249 result = ISBLANK (c);
250 break;
251 case CC_CNTRL:
252 result = ISCNTRL (c);
253 break;
254 case CC_DIGIT:
255 result = ISDIGIT (c);
256 break;
257 case CC_GRAPH:
258 result = ISGRAPH (c);
259 break;
260 case CC_LOWER:
261 result = ISLOWER (c);
262 break;
263 case CC_PRINT:
264 result = ISPRINT (c);
265 break;
266 case CC_PUNCT:
267 result = ISPUNCT (c);
268 break;
269 case CC_SPACE:
270 result = ISSPACE (c);
271 break;
272 case CC_UPPER:
273 result = ISUPPER (c);
274 break;
275 case CC_WORD:
276 result = (ISALNUM (c) || c == '_');
277 break;
278 case CC_XDIGIT:
279 result = ISXDIGIT (c);
280 break;
281 default:
282 result = -1;
283 break;
284 }
285
286 return result;
287 }
288
289 static int
290 is_cclass (c, name)
291 int c;
292 const char *name;
293 {
294 enum char_class char_class;
295 int result;
296
297 char_class = is_valid_cclass (name);
298 if (char_class == CC_NO_CLASS)
299 return -1;
300
301 result = cclass_test (c, char_class);
302 return (result);
303 }
304
305 /* Now include `sm_loop.c' for single-byte characters. */
306 /* The result of FOLD is an `unsigned char' */
307 # define FOLD(c) ((flags & FNM_CASEFOLD) \
308 ? TOLOWER ((unsigned char)c) \
309 : ((unsigned char)c))
310
311 #if !defined (__CYGWIN__)
312 # define ISDIRSEP(c) ((c) == '/')
313 #else
314 # define ISDIRSEP(c) ((c) == '/' || (c) == '\\')
315 #endif /* __CYGWIN__ */
316 #define PATHSEP(c) (ISDIRSEP(c) || (c) == 0)
317
318 # define PDOT_OR_DOTDOT(s) (s[0] == '.' && (PATHSEP (s[1]) || (s[1] == '.' && PATHSEP (s[2]))))
319 # define SDOT_OR_DOTDOT(s) (s[0] == '.' && (s[1] == 0 || (s[1] == '.' && s[2] == 0)))
320
321 #define FCT internal_strmatch
322 #define GMATCH gmatch
323 #define COLLSYM collsym
324 #define PARSE_COLLSYM parse_collsym
325 #define BRACKMATCH brackmatch
326 #define PATSCAN glob_patscan
327 #define STRCOMPARE strcompare
328 #define EXTMATCH extmatch
329 #define DEQUOTE_PATHNAME udequote_pathname
330 #define STRUCT smat_struct
331 #define STRCHR(S, C) strchr((S), (C))
332 #define MEMCHR(S, C, N) memchr((S), (C), (N))
333 #define STRCOLL(S1, S2) strcoll((S1), (S2))
334 #define STRLEN(S) strlen(S)
335 #define STRCMP(S1, S2) strcmp((S1), (S2))
336 #define RANGECMP(C1, C2, F) rangecmp((C1), (C2), (F))
337 #define COLLEQUIV(C1, C2) collequiv((C1), (C2))
338 #define CTYPE_T enum char_class
339 #define IS_CCLASS(C, S) is_cclass((C), (S))
340 #include "sm_loop.c"
341
342 #if HANDLE_MULTIBYTE
343
344 # define CHAR wchar_t
345 # define U_CHAR wint_t
346 # define XCHAR wchar_t
347 # define INT wint_t
348 # define L(CS) L##CS
349 # define INVALID WEOF
350
351 # undef STREQ
352 # undef STREQN
353 # define STREQ(s1, s2) ((wcscmp (s1, s2) == 0))
354 # define STREQN(a, b, n) ((a)[0] == (b)[0] && wcsncmp(a, b, n) == 0)
355
356 extern char *mbsmbchar PARAMS((const char *));
357
358 #if FNMATCH_EQUIV_FALLBACK
/* Convert the wide character C1 to a multibyte string, wrap the multibyte
   form of C2 in an equivalence-class bracket expression "[[=C2=]]", and
   pass both to fnmatch(3) to see whether C1 and C2 collate as members of
   the same equivalence class.  Returns fnmatch's result (0 on a match), or
   2 if either wide character cannot be converted by wctomb.  We can't
   really do this portably any other way. */
static int
_fnmatch_fallback_wc (c1, c2)
     wchar_t c1, c2;            /* string char, patchar */
{
  char subject[MB_LEN_MAX+1];   /* multibyte form of C1 */
  char equivpat[MB_LEN_MAX+8];  /* "[[=" + multibyte C2 + "=]]" + NUL */
  char *ep;
  int nbytes;

  nbytes = wctomb (subject, c1);
  if (nbytes == -1)
    return (2);
  subject[nbytes] = '\0';

  /* Assemble the pattern around the converted character. */
  ep = equivpat;
  *ep++ = '[';
  *ep++ = '[';
  *ep++ = '=';
  nbytes = wctomb (ep, c2);
  if (nbytes == -1)
    return (2);
  ep += nbytes;
  *ep++ = '=';
  *ep++ = ']';
  *ep++ = ']';
  *ep = '\0';

  return (fnmatch ((const char *)equivpat, (const char *)subject, 0));
}
387 #endif
388
389 static int
390 charcmp_wc (c1, c2, forcecoll)
391 wint_t c1, c2;
392 int forcecoll;
393 {
394 static wchar_t s1[2] = { L' ', L'\0' };
395 static wchar_t s2[2] = { L' ', L'\0' };
396 int r;
397
398 if (c1 == c2)
399 return 0;
400
401 if (forcecoll == 0 && glob_asciirange && c1 <= UCHAR_MAX && c2 <= UCHAR_MAX)
402 return ((int)(c1 - c2));
403
404 s1[0] = c1;
405 s2[0] = c2;
406
407 return (wcscoll (s1, s2));
408 }
409
/* Ordering function for the endpoints of a wide-character range.  Returns
   whatever charcmp_wc returns when that is non-zero or when FORCECOLL is
   set.  Only when FORCECOLL is zero and charcmp_wc returns 0 is the
   difference C1 - C2 returned instead, imposing a total ordering on
   characters that compare equal.  Note that, unlike the single-byte
   rangecmp, no tie-break is applied when FORCECOLL is non-zero. */
static int
rangecmp_wc (c1, c2, forcecoll)
     wint_t c1, c2;
     int forcecoll;
{
  int order;

  order = charcmp_wc (c1, c2, forcecoll);

  if (forcecoll == 0 && order == 0)
    return ((int)(c1 - c2));    /* impose total ordering */
  return order;
}
425
/* Return 1 if the wide characters C and EQUIV collate equally in the
   current locale, 0 otherwise.  charcmp_wc is asked first with collation
   forced; when FNMATCH_EQUIV_FALLBACK is enabled and charcmp_wc says the
   two differ, _fnmatch_fallback_wc gets a second opinion. */
static int
collequiv_wc (c, equiv)
     wint_t c, equiv;
{
  int same;

  same = (charcmp_wc (c, equiv, 1) == 0);

#if FNMATCH_EQUIV_FALLBACK
  /* We check explicitly for success (fnmatch returns 0) to avoid problems if
     our local definition of FNM_NOMATCH (strmatch.h) doesn't match the
     system's (fnmatch.h).  We don't care about error return values here. */
  if (same == 0)
    same = (_fnmatch_fallback_wc ((wchar_t)c, (wchar_t)equiv) == 0);
#endif

  return same;
}
448
449 /* Helper function for collating symbol. */
450 # define _COLLSYM _collwcsym
451 # define __COLLSYM __collwcsym
452 # define POSIXCOLL posix_collwcsyms
453 # include "collsyms.h"
454
455 static wint_t
456 collwcsym (s, len)
457 wchar_t *s;
458 int len;
459 {
460 register struct _collwcsym *csp;
461
462 for (csp = posix_collwcsyms; csp->name; csp++)
463 {
464 if (STREQN(csp->name, s, len) && csp->name[len] == L'\0')
465 return (csp->code);
466 }
467 if (len == 1)
468 return s[0];
469 return INVALID;
470 }
471
/* Test whether the wide character WC belongs to the character class whose
   (wide string) name is NAME.

   Two names get special treatment:
     "ascii"  if the C library has no such class (wctype returns 0), WC
              matches when wctob can convert it and the byte is <= 0x7F;
     "word"   tested as "alnum", with the underscore added.

   Every other name is converted to a multibyte string and looked up with
   wctype(3).  Returns -1 if memory cannot be allocated, the name cannot be
   converted, or the class is unknown; otherwise returns non-zero if WC is
   in the class and zero if it is not. */
static int
is_wcclass (wc, name)
     wint_t wc;
     wchar_t *name;
{
  char *mbname;
  mbstate_t ps;
  size_t bufsize, nconv;
  wctype_t ctype;
  int is_word, byte;

  if ((wctype ("ascii") == (wctype_t)0) && (wcscmp (name, L"ascii") == 0))
    {
      byte = wctob (wc);
      return ((byte == EOF) ? 0 : (byte <= 0x7F));
    }

  is_word = (wcscmp (name, L"word") == 0);
  if (is_word)
    name = L"alnum";

  /* Worst case: every wide character expands to MB_CUR_MAX bytes. */
  bufsize = wcslen (name) * MB_CUR_MAX + 1;
  mbname = (char *) malloc (bufsize);
  if (mbname == 0)
    return -1;

  memset (&ps, '\0', sizeof (mbstate_t));
  nconv = wcsrtombs (mbname, (const wchar_t **)&name, bufsize, &ps);

  /* A failed conversion is treated exactly like an unknown class. */
  if (nconv == (size_t)-1 || nconv == (size_t)-2)
    ctype = (wctype_t)0;
  else
    ctype = wctype (mbname);
  free (mbname);

  if (ctype == (wctype_t)0)
    return -1;

  if (is_word)
    return (iswctype (wc, ctype) || wc == L'_');
  return (iswctype (wc, ctype));
}
519
520 /* Return 1 if there are no char class [:class:] expressions (degenerate case)
521 or only posix-specified (C locale supported) char class expressions in
522 PATTERN. These are the ones where it's safe to punt to the single-byte
523 code, since wide character support allows locale-defined char classes.
524 This only uses single-byte code, but is only needed to support multibyte
525 locales. */
526 static int
527 posix_cclass_only (pattern)
528 char *pattern;
529 {
530 char *p, *p1;
531 char cc[16]; /* sufficient for all valid posix char class names */
532 enum char_class valid;
533
534 p = pattern;
535 while (p = strchr (p, '['))
536 {
537 if (p[1] != ':')
538 {
539 p++;
540 continue;
541 }
542 p += 2; /* skip past "[:" */
543 /* Find end of char class expression */
544 for (p1 = p; *p1; p1++)
545 if (*p1 == ':' && p1[1] == ']')
546 break;
547 if (*p1 == 0) /* no char class expression found */
548 break;
549 /* Find char class name and validate it against posix char classes */
550 if ((p1 - p) >= sizeof (cc))
551 return 0;
552 bcopy (p, cc, p1 - p);
553 cc[p1 - p] = '\0';
554 valid = is_valid_cclass (cc);
555 if (valid == CC_NO_CLASS)
556 return 0; /* found unrecognized char class name */
557
558 p = p1 + 2; /* found posix char class name */
559 }
560
561 return 1; /* no char class names or only posix */
562 }
563
564 /* Now include `sm_loop.c' for multibyte characters. */
565 #define FOLD(c) ((flags & FNM_CASEFOLD) && iswupper (c) ? towlower (c) : (c))
566
567 # if !defined (__CYGWIN__)
568 # define ISDIRSEP(c) ((c) == L'/')
569 # else
570 # define ISDIRSEP(c) ((c) == L'/' || (c) == L'\\')
571 # endif /* __CYGWIN__ */
572 # define PATHSEP(c) (ISDIRSEP(c) || (c) == L'\0')
573
574 # define PDOT_OR_DOTDOT(w) (w[0] == L'.' && (PATHSEP(w[1]) || (w[1] == L'.' && PATHSEP(w[2]))))
575 # define SDOT_OR_DOTDOT(w) (w[0] == L'.' && (w[1] == L'\0' || (w[1] == L'.' && w[2] == L'\0')))
576
577 #define FCT internal_wstrmatch
578 #define GMATCH gmatch_wc
579 #define COLLSYM collwcsym
580 #define PARSE_COLLSYM parse_collwcsym
581 #define BRACKMATCH brackmatch_wc
582 #define PATSCAN glob_patscan_wc
583 #define STRCOMPARE wscompare
584 #define EXTMATCH extmatch_wc
585 #define DEQUOTE_PATHNAME wcdequote_pathname
586 #define STRUCT wcsmat_struct
587 #define STRCHR(S, C) wcschr((S), (C))
588 #define MEMCHR(S, C, N) wmemchr((S), (C), (N))
589 #define STRCOLL(S1, S2) wcscoll((S1), (S2))
590 #define STRLEN(S) wcslen(S)
591 #define STRCMP(S1, S2) wcscmp((S1), (S2))
592 #define RANGECMP(C1, C2, F) rangecmp_wc((C1), (C2), (F))
593 #define COLLEQUIV(C1, C2) collequiv_wc((C1), (C2))
594 #define CTYPE_T enum char_class
595 #define IS_CCLASS(C, S) is_wcclass((C), (S))
596 #include "sm_loop.c"
597
#endif /* HANDLE_MULTIBYTE */
599
/* Match STRING against PATTERN according to FLAGS and return the matcher's
   result.

   Without HANDLE_MULTIBYTE this simply calls the single-byte matcher,
   internal_strmatch.  With it, the single-byte matcher is still used when
     - MB_CUR_MAX is 1;
     - mbsmbchar returns 0 for both STRING and PATTERN (presumably meaning
       neither contains a multibyte character -- confirm against its
       definition) and PATTERN uses only posix character classes; or
     - either argument cannot be converted to a wide-character string.
   Otherwise both arguments are converted with xdupmbstowcs, matched with
   internal_wstrmatch, and the converted copies are freed. */
int
xstrmatch (pattern, string, flags)
     char *pattern;
     char *string;
     int flags;
{
#if HANDLE_MULTIBYTE
  int ret;
  size_t n;
  wchar_t *wpattern, *wstring;

  if (MB_CUR_MAX == 1)
    return (internal_strmatch ((unsigned char *)pattern, (unsigned char *)string, flags));

  if (mbsmbchar (string) == 0 && mbsmbchar (pattern) == 0 && posix_cclass_only (pattern))
    return (internal_strmatch ((unsigned char *)pattern, (unsigned char *)string, flags));

  n = xdupmbstowcs (&wpattern, NULL, pattern);
  if (n == (size_t)-1 || n == (size_t)-2)
    return (internal_strmatch ((unsigned char *)pattern, (unsigned char *)string, flags));

  n = xdupmbstowcs (&wstring, NULL, string);
  if (n == (size_t)-1 || n == (size_t)-2)
    {
      /* The pattern converted but the string did not: release the pattern
         copy before falling back. */
      free (wpattern);
      return (internal_strmatch ((unsigned char *)pattern, (unsigned char *)string, flags));
    }

  ret = internal_wstrmatch (wpattern, wstring, flags);

  free (wpattern);
  free (wstring);

  return ret;
#else
  return (internal_strmatch ((unsigned char *)pattern, (unsigned char *)string, flags));
#endif /* !HANDLE_MULTIBYTE */
}