lib/glob/smatch.c

   1 /* strmatch.c -- ksh-like extended pattern matching for the shell and filename
   2                 globbing. */
   3
   4 /* Copyright (C) 1991-2021 Free Software Foundation, Inc.
   5
   6    This file is part of GNU Bash, the Bourne Again SHell.
   7
   8    Bash is free software: you can redistribute it and/or modify
   9    it under the terms of the GNU General Public License as published by
  10    the Free Software Foundation, either version 3 of the License, or
  11    (at your option) any later version.
  12
  13    Bash is distributed in the hope that it will be useful,
  14    but WITHOUT ANY WARRANTY; without even the implied warranty of
  15    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  16    GNU General Public License for more details.
  17
  18    You should have received a copy of the GNU General Public License
  19    along with Bash.  If not, see <http://www.gnu.org/licenses/>.
  20 */
  21
  22 #include <config.h>
  23
  24 #include <stdio.h>      /* for debugging */
  25
  26 #include "strmatch.h"
  27 #include <chartypes.h>
  28
  29 #include "bashansi.h"
  30 #include "shmbutil.h"
  31 #include "xmalloc.h"
  32
  33 #include <errno.h>
  34
  35 #if !defined (errno)
  36 extern int errno;
  37 #endif
  38
  39 #if FNMATCH_EQUIV_FALLBACK
  40 /* We don't include <fnmatch.h> in order to avoid namespace collisions; the
  41    internal strmatch still uses the FNM_ constants. */
  42 extern int fnmatch (const char *, const char *, int);
  43 #endif
  44
/* First, compile `sm_loop.c' for single-byte characters.  The macros below
   choose the character types, the literal form (L) and the "no such
   character" value (INVALID) for that instantiation; the same names are
   defined again for wide characters in the HANDLE_MULTIBYTE section. */
#define CHAR    unsigned char
#define U_CHAR  unsigned char
#define XCHAR   char
#define INT     int
#define L(CS)   CS
#define INVALID -1

/* String equality tests.  The first characters are compared inline so that
   strcmp/strncmp is only called when they already agree. */
#undef STREQ
#undef STREQN
#define STREQ(a, b) ((a)[0] == (b)[0] && strcmp(a, b) == 0)
#define STREQN(a, b, n) ((a)[0] == (b)[0] && strncmp(a, b, n) == 0)
  57
/* Build-time default for glob_asciirange when none is supplied. */
#ifndef GLOBASCII_DEFAULT
#  define GLOBASCII_DEFAULT 0
#endif

/* When non-zero, character comparisons that are not forced to collate
   (FORCECOLL == 0 in charcmp/charcmp_wc) compare character values directly
   instead of calling strcoll/wcscoll. */
int glob_asciirange = GLOBASCII_DEFAULT;
  63
  64 #if FNMATCH_EQUIV_FALLBACK
  65 /* Construct a string w1 = "c1" and a pattern w2 = "[[=c2=]]" and pass them
  66    to fnmatch to see if wide characters c1 and c2 collate as members of the
  67    same equivalence class. We can't really do this portably any other way */
  68 static int
  69 _fnmatch_fallback (s, p)
  70      int s, p;                  /* string char, patchar */
  71 {
  72   char s1[2];                   /* string */
  73   char s2[8];                   /* constructed pattern */
  74
  75   s1[0] = (unsigned char)s;
  76   s1[1] = '\0';
  77
  78   /* reconstruct the pattern */
  79   s2[0] = s2[1] = '[';
  80   s2[2] = '=';
  81   s2[3] = (unsigned char)p;
  82   s2[4] = '=';
  83   s2[5] = s2[6] = ']';
  84   s2[7] = '\0';
  85
  86   return (fnmatch ((const char *)s2, (const char *)s1, 0));
  87 }
  88 #endif
  89
  90 /* We use strcoll(3) for range comparisons in bracket expressions,
  91    even though it can have unwanted side effects in locales
  92    other than POSIX or US.  For instance, in the de locale, [A-Z] matches
  93    all characters.  If GLOB_ASCIIRANGE is non-zero, and we're not forcing
  94    the use of strcoll (e.g., for explicit collating symbols), we use
  95    straight ordering as if in the C locale. */
  96
  97 #if defined (HAVE_STRCOLL)
  98 /* Helper functions for collating symbol equivalence. */
  99
 100 /* Return 0 if C1 == C2 or collates equally if FORCECOLL is non-zero. */
 101 static int
 102 charcmp (c1, c2, forcecoll)
 103      int c1, c2;
 104      int forcecoll;
 105 {
 106   static char s1[2] = { ' ', '\0' };
 107   static char s2[2] = { ' ', '\0' };
 108   int ret;
 109
 110   /* Eight bits only.  Period. */
 111   c1 &= 0xFF;
 112   c2 &= 0xFF;
 113
 114   if (c1 == c2)
 115     return (0);
 116
 117   if (forcecoll == 0 && glob_asciirange)
 118     return (c1 - c2);
 119
 120   s1[0] = c1;
 121   s2[0] = c2;
 122
 123   return (strcoll (s1, s2));
 124 }
 125
 126 static int
 127 rangecmp (c1, c2, forcecoll)
 128      int c1, c2;
 129      int forcecoll;
 130 {
 131   int r;
 132
 133   r = charcmp (c1, c2, forcecoll);
 134
 135   /* We impose a total ordering here by returning c1-c2 if charcmp returns 0 */
 136   if (r != 0)
 137     return r;
 138   return (c1 - c2);             /* impose total ordering */
 139 }
 140 #else /* !HAVE_STRCOLL */
 141 #  define rangecmp(c1, c2, f)   ((int)(c1) - (int)(c2))
 142 #endif /* !HAVE_STRCOLL */
 143
 144 #if defined (HAVE_STRCOLL)
 145 /* Returns 1 if chars C and EQUIV collate equally in the current locale. */
 146 static int
 147 collequiv (c, equiv)
 148      int c, equiv;
 149 {
 150   if (charcmp (c, equiv, 1) == 0)
 151     return 1;
 152
 153 #if FNMATCH_EQUIV_FALLBACK
 154   return (_fnmatch_fallback (c, equiv) == 0);
 155 #else
 156   return 0;
 157 #endif
 158
 159 }
 160 #else
 161 #  define collequiv(c, equiv)   ((c) == (equiv))
 162 #endif
 163
/* Names handed to "collsyms.h" for the single-byte collating-symbol table.
   collsym() below relies on the result: a `struct _collsym' with `name' and
   `code' members and a `posix_collsyms' array ending in a null name.
   NOTE(review): how collsyms.h consumes these macros is not visible here. */
#define _COLLSYM        _collsym
#define __COLLSYM       __collsym
#define POSIXCOLL       posix_collsyms
#include "collsyms.h"
 168
 169 static int
 170 collsym (s, len)
 171      CHAR *s;
 172      int len;
 173 {
 174   register struct _collsym *csp;
 175   char *x;
 176
 177   x = (char *)s;
 178   for (csp = posix_collsyms; csp->name; csp++)
 179     {
 180       if (STREQN(csp->name, x, len) && csp->name[len] == '\0')
 181         return (csp->code);
 182     }
 183   if (len == 1)
 184     return s[0];
 185   return INVALID;
 186 }
 187
/* unibyte character classification */
/* Fallback for systems without isascii: true for values 0 through 0177. */
#if !defined (isascii) && !defined (HAVE_ISASCII)
#  define isascii(c)    ((unsigned int)(c) <= 0177)
#endif

/* Character classes recognized by the single-byte code.  The numeric value
   of each enumerator is the index of its name in cclass_name[] below, so
   the two lists must be kept in the same order.  CC_NO_CLASS (0) means
   "not a known class". */
enum char_class
  {
    CC_NO_CLASS = 0,
    CC_ASCII, CC_ALNUM, CC_ALPHA, CC_BLANK, CC_CNTRL, CC_DIGIT, CC_GRAPH,
    CC_LOWER, CC_PRINT, CC_PUNCT, CC_SPACE, CC_UPPER, CC_WORD, CC_XDIGIT
  };

/* Class names indexed by enum char_class; slot 0 is a placeholder for
   CC_NO_CLASS and is never matched by is_valid_cclass. */
static char const *const cclass_name[] =
  {
    "",
    "ascii", "alnum", "alpha", "blank", "cntrl", "digit", "graph",
    "lower", "print", "punct", "space", "upper", "word", "xdigit"
  };

/* Number of entries in cclass_name[], including the placeholder. */
#define N_CHAR_CLASS (sizeof(cclass_name) / sizeof (cclass_name[0]))
 208
 209 static enum char_class
 210 is_valid_cclass (name)
 211      const char *name;
 212 {
 213   enum char_class ret;
 214   int i;
 215
 216   ret = CC_NO_CLASS;
 217
 218   for (i = 1; i < N_CHAR_CLASS; i++)
 219     {
 220       if (STREQ (name, cclass_name[i]))
 221         {
 222           ret = (enum char_class)i;
 223           break;
 224         }
 225     }
 226
 227   return ret;
 228 }
 229
 230 static int
 231 cclass_test (c, char_class)
 232      int c;
 233      enum char_class char_class;
 234 {
 235   int result;
 236
 237   switch (char_class)
 238     {
 239       case CC_ASCII:
 240         result = isascii (c);
 241         break;
 242       case CC_ALNUM:
 243         result = ISALNUM (c);
 244         break;
 245       case CC_ALPHA:
 246         result = ISALPHA (c);
 247         break;
 248       case CC_BLANK:
 249         result = ISBLANK (c);
 250         break;
 251       case CC_CNTRL:
 252         result = ISCNTRL (c);
 253         break;
 254       case CC_DIGIT:
 255         result = ISDIGIT (c);
 256         break;
 257       case CC_GRAPH:
 258         result = ISGRAPH (c);
 259         break;
 260       case CC_LOWER:
 261         result = ISLOWER (c);
 262         break;
 263       case CC_PRINT:
 264         result = ISPRINT (c);
 265         break;
 266       case CC_PUNCT:
 267         result = ISPUNCT (c);
 268         break;
 269       case CC_SPACE:
 270         result = ISSPACE (c);
 271         break;
 272       case CC_UPPER:
 273         result = ISUPPER (c);
 274         break;
 275       case CC_WORD:
 276         result = (ISALNUM (c) || c == '_');
 277         break;
 278       case CC_XDIGIT:
 279         result = ISXDIGIT (c);
 280         break;
 281       default:
 282         result = -1;
 283         break;
 284     }
 285
 286   return result;
 287 }
 288
 289 static int
 290 is_cclass (c, name)
 291      int c;
 292      const char *name;
 293 {
 294   enum char_class char_class;
 295   int result;
 296
 297   char_class = is_valid_cclass (name);
 298   if (char_class == CC_NO_CLASS)
 299     return -1;
 300
 301   result = cclass_test (c, char_class);
 302   return (result);
 303 }
 304
 305 /* Now include `sm_loop.c' for single-byte characters. */
 306 /* The result of FOLD is an `unsigned char' */
 307 # define FOLD(c) ((flags & FNM_CASEFOLD) \
 308         ? TOLOWER ((unsigned char)c) \
 309         : ((unsigned char)c))
 310
 311 #if !defined (__CYGWIN__)
 312 #  define ISDIRSEP(c)   ((c) == '/')
 313 #else
 314 #  define ISDIRSEP(c)   ((c) == '/' || (c) == '\\')
 315 #endif /* __CYGWIN__ */
 316 #define PATHSEP(c)      (ISDIRSEP(c) || (c) == 0)
 317
 318 #  define PDOT_OR_DOTDOT(s)     (s[0] == '.' && (PATHSEP (s[1]) || (s[1] == '.' && PATHSEP (s[2]))))
 319 #  define SDOT_OR_DOTDOT(s)     (s[0] == '.' && (s[1] == 0 || (s[1] == '.' && s[2] == 0)))
 320
/* Bind the generic names used by `sm_loop.c' to the single-byte functions,
   types and C library routines, then include it.  The matcher this
   produces is internal_strmatch (FCT), which xstrmatch calls below.
   NOTE(review): the remaining names (gmatch, brackmatch, ...) are presumably
   helpers defined by sm_loop.c itself -- confirm there. */
#define FCT                     internal_strmatch
#define GMATCH                  gmatch
#define COLLSYM                 collsym
#define PARSE_COLLSYM           parse_collsym
#define BRACKMATCH              brackmatch
#define PATSCAN                 glob_patscan
#define STRCOMPARE              strcompare
#define EXTMATCH                extmatch
#define DEQUOTE_PATHNAME        udequote_pathname
#define STRUCT                  smat_struct
#define STRCHR(S, C)            strchr((S), (C))
#define MEMCHR(S, C, N)         memchr((S), (C), (N))
#define STRCOLL(S1, S2)         strcoll((S1), (S2))
#define STRLEN(S)               strlen(S)
#define STRCMP(S1, S2)          strcmp((S1), (S2))
#define RANGECMP(C1, C2, F)     rangecmp((C1), (C2), (F))
#define COLLEQUIV(C1, C2)       collequiv((C1), (C2))
#define CTYPE_T                 enum char_class
#define IS_CCLASS(C, S)         is_cclass((C), (S))
#include "sm_loop.c"
 341
 342 #if HANDLE_MULTIBYTE
 343
/* Character types, literal form and "no such character" value for the
   wide-character instantiation of `sm_loop.c'.
   NOTE(review): these names were already defined for the single-byte pass
   and are not #undef'd here; presumably sm_loop.c undefines them at its
   end -- confirm. */
#  define CHAR          wchar_t
#  define U_CHAR        wint_t
#  define XCHAR         wchar_t
#  define INT           wint_t
#  define L(CS)         L##CS
#  define INVALID       WEOF

/* Wide-string versions of the string equality tests. */
#  undef STREQ
#  undef STREQN
#  define STREQ(s1, s2) ((wcscmp (s1, s2) == 0))
#  define STREQN(a, b, n) ((a)[0] == (b)[0] && wcsncmp(a, b, n) == 0)

/* Defined elsewhere.  xstrmatch treats a zero return for both the string
   and the pattern as one condition for using the single-byte matcher. */
extern char *mbsmbchar PARAMS((const char *));
 357
 358 #if FNMATCH_EQUIV_FALLBACK
 359 /* Construct a string w1 = "c1" and a pattern w2 = "[[=c2=]]" and pass them
 360    to fnmatch to see if wide characters c1 and c2 collate as members of the
 361    same equivalence class. We can't really do this portably any other way */
 362 static int
 363 _fnmatch_fallback_wc (c1, c2)
 364      wchar_t c1, c2;                    /* string char, patchar */
 365 {
 366   char w1[MB_LEN_MAX+1];                /* string */
 367   char w2[MB_LEN_MAX+8];                /* constructed pattern */
 368   int l1, l2;
 369
 370   l1 = wctomb (w1, c1);
 371   if (l1 == -1)
 372     return (2);
 373   w1[l1] = '\0';
 374
 375   /* reconstruct the pattern */
 376   w2[0] = w2[1] = '[';
 377   w2[2] = '=';
 378   l2 = wctomb (w2+3, c2);
 379   if (l2 == -1)
 380     return (2);
 381   w2[l2+3] = '=';
 382   w2[l2+4] = w2[l2+5] = ']';
 383   w2[l2+6] = '\0';
 384
 385   return (fnmatch ((const char *)w2, (const char *)w1, 0));
 386 }
 387 #endif
 388
 389 static int
 390 charcmp_wc (c1, c2, forcecoll)
 391      wint_t c1, c2;
 392      int forcecoll;
 393 {
 394   static wchar_t s1[2] = { L' ', L'\0' };
 395   static wchar_t s2[2] = { L' ', L'\0' };
 396   int r;
 397
 398   if (c1 == c2)
 399     return 0;
 400
 401   if (forcecoll == 0 && glob_asciirange && c1 <= UCHAR_MAX && c2 <= UCHAR_MAX)
 402     return ((int)(c1 - c2));
 403
 404   s1[0] = c1;
 405   s2[0] = c2;
 406
 407   return (wcscoll (s1, s2));
 408 }
 409
 410 static int
 411 rangecmp_wc (c1, c2, forcecoll)
 412      wint_t c1, c2;
 413      int forcecoll;
 414 {
 415   int r;
 416
 417   r = charcmp_wc (c1, c2, forcecoll);
 418
 419   /* We impose a total ordering here by returning c1-c2 if charcmp returns 0,
 420      as we do above in the single-byte case. */
 421   if (r != 0 || forcecoll)
 422     return r;
 423   return ((int)(c1 - c2));              /* impose total ordering */
 424 }
 425
 426 /* Returns 1 if wide chars C and EQUIV collate equally in the current locale. */
 427 static int
 428 collequiv_wc (c, equiv)
 429      wint_t c, equiv;
 430 {
 431   wchar_t s, p;
 432
 433   if (charcmp_wc (c, equiv, 1) == 0)
 434     return 1;
 435
 436 #if FNMATCH_EQUIV_FALLBACK
 437 /* We check explicitly for success (fnmatch returns 0) to avoid problems if
 438    our local definition of FNM_NOMATCH (strmatch.h) doesn't match the
 439    system's (fnmatch.h). We don't care about error return values here. */
 440
 441   s = c;
 442   p = equiv;
 443   return (_fnmatch_fallback_wc (s, p) == 0);
 444 #else
 445   return 0;
 446 #endif
 447 }
 448
/* Helper function for collating symbol. */
/* Names handed to "collsyms.h" for the wide-character collating-symbol
   table.  collwcsym() below relies on the result: a `struct _collwcsym'
   with `name' and `code' members and a `posix_collwcsyms' array ending in
   a null name.
   NOTE(review): how collsyms.h consumes these macros is not visible here. */
#  define _COLLSYM      _collwcsym
#  define __COLLSYM     __collwcsym
#  define POSIXCOLL     posix_collwcsyms
#  include "collsyms.h"
 454
 455 static wint_t
 456 collwcsym (s, len)
 457      wchar_t *s;
 458      int len;
 459 {
 460   register struct _collwcsym *csp;
 461
 462   for (csp = posix_collwcsyms; csp->name; csp++)
 463     {
 464       if (STREQN(csp->name, s, len) && csp->name[len] == L'\0')
 465         return (csp->code);
 466     }
 467   if (len == 1)
 468     return s[0];
 469   return INVALID;
 470 }
 471
 472 static int
 473 is_wcclass (wc, name)
 474      wint_t wc;
 475      wchar_t *name;
 476 {
 477   char *mbs;
 478   mbstate_t state;
 479   size_t mbslength;
 480   wctype_t desc;
 481   int want_word;
 482
 483   if ((wctype ("ascii") == (wctype_t)0) && (wcscmp (name, L"ascii") == 0))
 484     {
 485       int c;
 486
 487       if ((c = wctob (wc)) == EOF)
 488         return 0;
 489       else
 490         return (c <= 0x7F);
 491     }
 492
 493   want_word = (wcscmp (name, L"word") == 0);
 494   if (want_word)
 495     name = L"alnum";
 496
 497   memset (&state, '\0', sizeof (mbstate_t));
 498   mbs = (char *) malloc (wcslen(name) * MB_CUR_MAX + 1);
 499   if (mbs == 0)
 500     return -1;
 501   mbslength = wcsrtombs (mbs, (const wchar_t **)&name, (wcslen(name) * MB_CUR_MAX + 1), &state);
 502
 503   if (mbslength == (size_t)-1 || mbslength == (size_t)-2)
 504     {
 505       free (mbs);
 506       return -1;
 507     }
 508   desc = wctype (mbs);
 509   free (mbs);
 510
 511   if (desc == (wctype_t)0)
 512     return -1;
 513
 514   if (want_word)
 515     return (iswctype (wc, desc) || wc == L'_');
 516   else
 517     return (iswctype (wc, desc));
 518 }
 519
 520 /* Return 1 if there are no char class [:class:] expressions (degenerate case)
 521    or only posix-specified (C locale supported) char class expressions in
 522    PATTERN.  These are the ones where it's safe to punt to the single-byte
 523    code, since wide character support allows locale-defined char classes.
 524    This only uses single-byte code, but is only needed to support multibyte
 525    locales. */
 526 static int
 527 posix_cclass_only (pattern)
 528      char *pattern;
 529 {
 530   char *p, *p1;
 531   char cc[16];          /* sufficient for all valid posix char class names */
 532   enum char_class valid;
 533
 534   p = pattern;
 535   while (p = strchr (p, '['))
 536     {
 537       if (p[1] != ':')
 538         {
 539           p++;
 540           continue;
 541         }
 542       p += 2;           /* skip past "[:" */
 543       /* Find end of char class expression */
 544       for (p1 = p; *p1;  p1++)
 545         if (*p1 == ':' && p1[1] == ']')
 546           break;
 547       if (*p1 == 0)     /* no char class expression found */
 548         break;
 549       /* Find char class name and validate it against posix char classes */
 550       if ((p1 - p) >= sizeof (cc))
 551         return 0;
 552       bcopy (p, cc, p1 - p);
 553       cc[p1 - p] = '\0';
 554       valid = is_valid_cclass (cc);
 555       if (valid == CC_NO_CLASS)
 556         return 0;               /* found unrecognized char class name */
 557
 558       p = p1 + 2;               /* found posix char class name */
 559     }
 560
 561   return 1;                     /* no char class names or only posix */
 562 }
 563
 564 /* Now include `sm_loop.c' for multibyte characters. */
 565 #define FOLD(c) ((flags & FNM_CASEFOLD) && iswupper (c) ? towlower (c) : (c))
 566
 567 #  if !defined (__CYGWIN__)
 568 #    define ISDIRSEP(c) ((c) == L'/')
 569 #  else
 570 #    define ISDIRSEP(c) ((c) == L'/' || (c) == L'\\')
 571 #  endif /* __CYGWIN__ */
 572 #  define PATHSEP(c)    (ISDIRSEP(c) || (c) == L'\0')
 573
 574 #  define PDOT_OR_DOTDOT(w)     (w[0] == L'.' && (PATHSEP(w[1]) || (w[1] == L'.' && PATHSEP(w[2]))))
 575 #  define SDOT_OR_DOTDOT(w)     (w[0] == L'.' && (w[1] == L'\0' || (w[1] == L'.' && w[2] == L'\0')))
 576
/* Bind the generic names used by `sm_loop.c' to the wide-character
   functions, types and C library routines, then include it a second time.
   The matcher this produces is internal_wstrmatch (FCT), which xstrmatch
   calls below.
   NOTE(review): these macros were already defined for the single-byte pass
   and are not #undef'd here; presumably sm_loop.c undefines them at its
   end -- confirm. */
#define FCT                     internal_wstrmatch
#define GMATCH                  gmatch_wc
#define COLLSYM                 collwcsym
#define PARSE_COLLSYM           parse_collwcsym
#define BRACKMATCH              brackmatch_wc
#define PATSCAN                 glob_patscan_wc
#define STRCOMPARE              wscompare
#define EXTMATCH                extmatch_wc
#define DEQUOTE_PATHNAME        wcdequote_pathname
#define STRUCT                  wcsmat_struct
#define STRCHR(S, C)            wcschr((S), (C))
#define MEMCHR(S, C, N)         wmemchr((S), (C), (N))
#define STRCOLL(S1, S2)         wcscoll((S1), (S2))
#define STRLEN(S)               wcslen(S)
#define STRCMP(S1, S2)          wcscmp((S1), (S2))
#define RANGECMP(C1, C2, F)     rangecmp_wc((C1), (C2), (F))
#define COLLEQUIV(C1, C2)       collequiv_wc((C1), (C2))
#define CTYPE_T                 enum char_class
#define IS_CCLASS(C, S)         is_wcclass((C), (S))
#include "sm_loop.c"
 597
#endif /* HANDLE_MULTIBYTE */
 599
 600 int
 601 xstrmatch (pattern, string, flags)
 602      char *pattern;
 603      char *string;
 604      int flags;
 605 {
 606 #if HANDLE_MULTIBYTE
 607   int ret;
 608   size_t n;
 609   wchar_t *wpattern, *wstring;
 610   size_t plen, slen, mplen, mslen;
 611
 612   if (MB_CUR_MAX == 1)
 613     return (internal_strmatch ((unsigned char *)pattern, (unsigned char *)string, flags));
 614
 615   if (mbsmbchar (string) == 0 && mbsmbchar (pattern) == 0 && posix_cclass_only (pattern))
 616     return (internal_strmatch ((unsigned char *)pattern, (unsigned char *)string, flags));
 617
 618   n = xdupmbstowcs (&wpattern, NULL, pattern);
 619   if (n == (size_t)-1 || n == (size_t)-2)
 620     return (internal_strmatch ((unsigned char *)pattern, (unsigned char *)string, flags));
 621
 622   n = xdupmbstowcs (&wstring, NULL, string);
 623   if (n == (size_t)-1 || n == (size_t)-2)
 624     {
 625       free (wpattern);
 626       return (internal_strmatch ((unsigned char *)pattern, (unsigned char *)string, flags));
 627     }
 628
 629   ret = internal_wstrmatch (wpattern, wstring, flags);
 630
 631   free (wpattern);
 632   free (wstring);
 633
 634   return ret;
 635 #else
 636   return (internal_strmatch ((unsigned char *)pattern, (unsigned char *)string, flags));
 637 #endif /* !HANDLE_MULTIBYTE */
 638 }