posix/regex_internal.c

   1 /* Extended regular expression matching and search library.
   2    Copyright (C) 2002-2021 Free Software Foundation, Inc.
   3    This file is part of the GNU C Library.
   4    Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
   5
   6    The GNU C Library is free software; you can redistribute it and/or
   7    modify it under the terms of the GNU Lesser General Public
   8    License as published by the Free Software Foundation; either
   9    version 2.1 of the License, or (at your option) any later version.
  10
  11    The GNU C Library is distributed in the hope that it will be useful,
  12    but WITHOUT ANY WARRANTY; without even the implied warranty of
  13    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  14    Lesser General Public License for more details.
  15
  16    You should have received a copy of the GNU Lesser General Public
  17    License along with the GNU C Library; if not, see
  18    <https://www.gnu.org/licenses/>.  */
  19
     /* Forward declarations of file-local (static) helpers.  The
        wide-character buffer builders are only declared when
        RE_ENABLE_I18N is defined.  */
  20 static void re_string_construct_common (const char *str, Idx len,
  21                                         re_string_t *pstr,
  22                                         RE_TRANSLATE_TYPE trans, bool icase,
  23                                         const re_dfa_t *dfa);
  24 static re_dfastate_t *create_ci_newstate (const re_dfa_t *dfa,
  25                                           const re_node_set *nodes,
  26                                           re_hashval_t hash);
  27 static re_dfastate_t *create_cd_newstate (const re_dfa_t *dfa,
  28                                           const re_node_set *nodes,
  29                                           unsigned int context,
  30                                           re_hashval_t hash);
  31 static reg_errcode_t re_string_realloc_buffers (re_string_t *pstr,
  32                                                 Idx new_buf_len);
  33 #ifdef RE_ENABLE_I18N
  34 static void build_wcs_buffer (re_string_t *pstr);
  35 static reg_errcode_t build_wcs_upper_buffer (re_string_t *pstr);
  36 #endif /* RE_ENABLE_I18N */
  37 static void build_upper_buffer (re_string_t *pstr);
  38 static void re_string_translate_buffer (re_string_t *pstr);
  39 static unsigned int re_string_context_at (const re_string_t *input, Idx idx,
  40                                           int eflags) __attribute__ ((pure));
  41 \f
  42 /* Functions for string operations.  */
  43
  44 /* This function allocate the buffers.  It is necessary to call
  45    re_string_reconstruct before using the object.  */
  46
  47 static reg_errcode_t
  48 __attribute_warn_unused_result__
  49 re_string_allocate (re_string_t *pstr, const char *str, Idx len, Idx init_len,
  50                     RE_TRANSLATE_TYPE trans, bool icase, const re_dfa_t *dfa)
  51 {
  52   reg_errcode_t ret;
  53   Idx init_buf_len;
  54
  55   /* Ensure at least one character fits into the buffers.  */
  56   if (init_len < dfa->mb_cur_max)
  57     init_len = dfa->mb_cur_max;
  58   init_buf_len = (len + 1 < init_len) ? len + 1: init_len;
  59   re_string_construct_common (str, len, pstr, trans, icase, dfa);
  60
  61   ret = re_string_realloc_buffers (pstr, init_buf_len);
  62   if (__glibc_unlikely (ret != REG_NOERROR))
  63     return ret;
  64
  65   pstr->word_char = dfa->word_char;
  66   pstr->word_ops_used = dfa->word_ops_used;
  67   pstr->mbs = pstr->mbs_allocated ? pstr->mbs : (unsigned char *) str;
  68   pstr->valid_len = (pstr->mbs_allocated || dfa->mb_cur_max > 1) ? 0 : len;
  69   pstr->valid_raw_len = pstr->valid_len;
  70   return REG_NOERROR;
  71 }
  72
  73 /* Set up PSTR from scratch for the LEN-byte string STR: zero the
        structure, allocate its buffers and fill them with the converted
        input.

        TRANS is an optional translation table (may be NULL) and ICASE
        selects conversion to upper case; DFA supplies the locale
        parameters copied by re_string_construct_common.

        Returns REG_NOERROR on success, otherwise the error reported by
        re_string_realloc_buffers or build_wcs_upper_buffer.  */
  74
  75 static reg_errcode_t
  76 __attribute_warn_unused_result__
  77 re_string_construct (re_string_t *pstr, const char *str, Idx len,
  78                      RE_TRANSLATE_TYPE trans, bool icase, const re_dfa_t *dfa)
  79 {
  80   reg_errcode_t ret;
  81   memset (pstr, '\0', sizeof (re_string_t));
  82   re_string_construct_common (str, len, pstr, trans, icase, dfa);
  83
       /* An empty string gets no buffers; otherwise reserve LEN + 1
          elements.  */
  84   if (len > 0)
  85     {
  86       ret = re_string_realloc_buffers (pstr, len + 1);
  87       if (__glibc_unlikely (ret != REG_NOERROR))
  88         return ret;
  89     }
       /* Without translation or case conversion no private byte buffer
          exists, so the raw string itself is used as MBS.  */
  90   pstr->mbs = pstr->mbs_allocated ? pstr->mbs : (unsigned char *) str;
  91
  92   if (icase)
  93     {
  94 #ifdef RE_ENABLE_I18N
  95       if (dfa->mb_cur_max > 1)
  96         {
               /* Convert repeatedly, doubling the buffers in between,
                  until one of the two exit conditions below holds.  */
  97           while (1)
  98             {
  99               ret = build_wcs_upper_buffer (pstr);
 100               if (__glibc_unlikely (ret != REG_NOERROR))
 101                 return ret;
                   /* Done once all LEN raw bytes have been converted.  */
 102               if (pstr->valid_raw_len >= len)
 103                 break;
                   /* Also stop when more than mb_cur_max elements are
                      still free, i.e. lack of room is presumably not
                      what stopped the builder.  */
 104               if (pstr->bufs_len > pstr->valid_len + dfa->mb_cur_max)
 105                 break;
 106               ret = re_string_realloc_buffers (pstr, pstr->bufs_len * 2);
 107               if (__glibc_unlikely (ret != REG_NOERROR))
 108                 return ret;
 109             }
 110         }
 111       else
 112 #endif /* RE_ENABLE_I18N  */
 113         build_upper_buffer (pstr);
 114     }
 115   else
 116     {
 117 #ifdef RE_ENABLE_I18N
 118       if (dfa->mb_cur_max > 1)
 119         build_wcs_buffer (pstr);
 120       else
 121 #endif /* RE_ENABLE_I18N  */
 122         {
 123           if (trans != NULL)
 124             re_string_translate_buffer (pstr);
 125           else
 126             {
                   /* Nothing to convert: MBS aliases STR, so the whole
                      buffer length counts as valid.  */
 127               pstr->valid_len = pstr->bufs_len;
 128               pstr->valid_raw_len = pstr->bufs_len;
 129             }
 130         }
 131     }
 132
 133   return REG_NOERROR;
 134 }
 135
 136 /* Helper functions for re_string_allocate and re_string_construct.  */
 137
 138 static reg_errcode_t
 139 __attribute_warn_unused_result__
 140 re_string_realloc_buffers (re_string_t *pstr, Idx new_buf_len)
 141 {
 142 #ifdef RE_ENABLE_I18N
 143   if (pstr->mb_cur_max > 1)
 144     {
 145       wint_t *new_wcs;
 146
 147       /* Avoid overflow in realloc.  */
 148       const size_t max_object_size = MAX (sizeof (wint_t), sizeof (Idx));
 149       if (__glibc_unlikely (MIN (IDX_MAX, SIZE_MAX / max_object_size)
 150                             < new_buf_len))
 151         return REG_ESPACE;
 152
 153       new_wcs = re_realloc (pstr->wcs, wint_t, new_buf_len);
 154       if (__glibc_unlikely (new_wcs == NULL))
 155         return REG_ESPACE;
 156       pstr->wcs = new_wcs;
 157       if (pstr->offsets != NULL)
 158         {
 159           Idx *new_offsets = re_realloc (pstr->offsets, Idx, new_buf_len);
 160           if (__glibc_unlikely (new_offsets == NULL))
 161             return REG_ESPACE;
 162           pstr->offsets = new_offsets;
 163         }
 164     }
 165 #endif /* RE_ENABLE_I18N  */
 166   if (pstr->mbs_allocated)
 167     {
 168       unsigned char *new_mbs = re_realloc (pstr->mbs, unsigned char,
 169                                            new_buf_len);
 170       if (__glibc_unlikely (new_mbs == NULL))
 171         return REG_ESPACE;
 172       pstr->mbs = new_mbs;
 173     }
 174   pstr->bufs_len = new_buf_len;
 175   return REG_NOERROR;
 176 }
 177
 178
 179 static void
 180 re_string_construct_common (const char *str, Idx len, re_string_t *pstr,
 181                             RE_TRANSLATE_TYPE trans, bool icase,
 182                             const re_dfa_t *dfa)
 183 {
 184   pstr->raw_mbs = (const unsigned char *) str;
 185   pstr->len = len;
 186   pstr->raw_len = len;
 187   pstr->trans = trans;
 188   pstr->icase = icase;
 189   pstr->mbs_allocated = (trans != NULL || icase);
 190   pstr->mb_cur_max = dfa->mb_cur_max;
 191   pstr->is_utf8 = dfa->is_utf8;
 192   pstr->map_notascii = dfa->map_notascii;
 193   pstr->stop = pstr->len;
 194   pstr->raw_stop = pstr->stop;
 195 }
 196
 197 #ifdef RE_ENABLE_I18N
 198
 199 /* Build wide character buffer PSTR->WCS.
 200    If the byte sequence of the string is:
 201      <mb1>(0), <mb1>(1), <mb2>(0), <mb2>(1), <sb3>
 202    Then wide character buffer will be:
 203      <wc1>   , WEOF    , <wc2>   , WEOF    , <wc3>
 204    We use WEOF for padding; it indicates that the position isn't
 205    the first byte of a multibyte character.
 206
 207    Note that this function assumes PSTR->VALID_LEN elements are already
 208    built and starts from PSTR->VALID_LEN.

        Conversion runs up to the smaller of PSTR->LEN and PSTR->BUFS_LEN,
        or stops earlier at an incomplete character that may still be
        completed once the buffers are enlarged.  If PSTR->TRANS is set the
        translated bytes are also stored in PSTR->MBS.  On return
        PSTR->VALID_LEN and PSTR->VALID_RAW_LEN hold the index reached.  */
 209
 210 static void
 211 build_wcs_buffer (re_string_t *pstr)
 212 {
       /* Scratch space holding the translated bytes of one character.  */
 213 #ifdef _LIBC
 214   unsigned char buf[MB_LEN_MAX];
 215   DEBUG_ASSERT (MB_LEN_MAX >= pstr->mb_cur_max);
 216 #else
 217   unsigned char buf[64];
 218 #endif
 219   mbstate_t prev_st;
 220   Idx byte_idx, end_idx, remain_len;
 221   size_t mbclen;
 222
 223   /* Build the buffers from pstr->valid_len to either pstr->len or
 224      pstr->bufs_len.  */
 225   end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
 226   for (byte_idx = pstr->valid_len; byte_idx < end_idx;)
 227     {
 228       wchar_t wc;
 229       const char *p;
 230
 231       remain_len = end_idx - byte_idx;
           /* Remember the shift state so a failed conversion can be
              undone.  */
 232       prev_st = pstr->cur_state;
 233       /* Apply the translation if we need.  */
 234       if (__glibc_unlikely (pstr->trans != NULL))
 235         {
 236           int i, ch;
 237
               /* Translate up to mb_cur_max raw bytes; they are stored in
                  PSTR->MBS and gathered in BUF for the conversion below.  */
 238           for (i = 0; i < pstr->mb_cur_max && i < remain_len; ++i)
 239             {
 240               ch = pstr->raw_mbs [pstr->raw_mbs_idx + byte_idx + i];
 241               buf[i] = pstr->mbs[byte_idx + i] = pstr->trans[ch];
 242             }
 243           p = (const char *) buf;
 244         }
 245       else
 246         p = (const char *) pstr->raw_mbs + pstr->raw_mbs_idx + byte_idx;
 247       mbclen = __mbrtowc (&wc, p, remain_len, &pstr->cur_state);
           /* Invalid sequence, NUL, or a sequence that is incomplete even
              though the buffers already cover the whole string.  */
 248       if (__glibc_unlikely (mbclen == (size_t) -1 || mbclen == 0
 249                             || (mbclen == (size_t) -2
 250                                 && pstr->bufs_len >= pstr->len)))
 251         {
 252           /* We treat these cases as a singlebyte character.  */
 253           mbclen = 1;
 254           wc = (wchar_t) pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx];
 255           if (__glibc_unlikely (pstr->trans != NULL))
 256             wc = pstr->trans[wc];
 257           pstr->cur_state = prev_st;
 258         }
 259       else if (__glibc_unlikely (mbclen == (size_t) -2))
 260         {
 261           /* The buffer doesn't have enough space, finish to build.  */
 262           pstr->cur_state = prev_st;
 263           break;
 264         }
 265
 266       /* Write wide character and padding.  */
 267       pstr->wcs[byte_idx++] = wc;
 268       /* Write paddings.  REMAIN_LEN is reused here as the index just
              past the last byte of this character.  */
 269       for (remain_len = byte_idx + mbclen - 1; byte_idx < remain_len ;)
 270         pstr->wcs[byte_idx++] = WEOF;
 271     }
 272   pstr->valid_len = byte_idx;
 273   pstr->valid_raw_len = byte_idx;
 274 }
 275
 276 /* Build wide character buffer PSTR->WCS like build_wcs_buffer,
 277    but for REG_ICASE.

        Each character is converted to upper case with __towupper and its
        (possibly changed) multibyte form is stored in PSTR->MBS.  When
        the upper-case form has a different byte length than the original,
        PSTR->OFFSETS is allocated if necessary and filled so that every
        converted byte index maps back to a raw byte index; PSTR->LEN and,
        where applicable, PSTR->STOP are adjusted by the length difference.

        On return PSTR->VALID_LEN is the number of converted bytes built
        and PSTR->VALID_RAW_LEN the number of raw bytes consumed.

        Returns REG_NOERROR, or REG_ESPACE if PSTR->OFFSETS cannot be
        allocated.  */
 278
 279 static reg_errcode_t
 280 __attribute_warn_unused_result__
 281 build_wcs_upper_buffer (re_string_t *pstr)
 282 {
 283   mbstate_t prev_st;
 284   Idx src_idx, byte_idx, end_idx, remain_len;
 285   size_t mbclen;
       /* Scratch space for one translated or upper-cased character.  */
 286 #ifdef _LIBC
 287   char buf[MB_LEN_MAX];
 288   DEBUG_ASSERT (pstr->mb_cur_max <= MB_LEN_MAX);
 289 #else
 290   char buf[64];
 291 #endif
 292
 293   byte_idx = pstr->valid_len;
 294   end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
 295
 296   /* The following optimization assumes that ASCII characters can be
 297      mapped to wide characters with a simple cast.  It is only taken
          when no translation table is set and no offsets table is in use,
          so raw and converted indices coincide (BYTE_IDX serves for
          both).  */
 298   if (! pstr->map_notascii && pstr->trans == NULL && !pstr->offsets_needed)
 299     {
 300       while (byte_idx < end_idx)
 301         {
 302           wchar_t wc;
 303           unsigned char ch = pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx];
 304
               /* An ASCII byte in the initial shift state is upper-cased
                  directly, without going through __mbrtowc.  */
 305           if (isascii (ch) && mbsinit (&pstr->cur_state))
 306             {
 307               /* The next step uses the assumption that wchar_t is encoded
 308                  ASCII-safe: all ASCII values can be converted like this.  */
 309               wchar_t wcu = __towupper (ch);
 310               if (isascii (wcu))
 311                 {
 312                   pstr->mbs[byte_idx] = wcu;
 313                   pstr->wcs[byte_idx] = wcu;
 314                   byte_idx++;
 315                   continue;
 316                 }
 317             }
 318
 319           remain_len = end_idx - byte_idx;
 320           prev_st = pstr->cur_state;
 321           mbclen = __mbrtowc (&wc,
 322                               ((const char *) pstr->raw_mbs + pstr->raw_mbs_idx
 323                                + byte_idx), remain_len, &pstr->cur_state);
 324           if (__glibc_likely (0 < mbclen && mbclen < (size_t) -2))
 325             {
 326               wchar_t wcu = __towupper (wc);
 327               if (wcu != wc)
 328                 {
 329                   size_t mbcdlen;
 330
 331                   mbcdlen = __wcrtomb (buf, wcu, &prev_st);
                       /* Same byte length: store the upper-case bytes in
                          place.  Otherwise continue in the general loop
                          below, which maintains PSTR->OFFSETS.  */
 332                   if (__glibc_likely (mbclen == mbcdlen))
 333                     memcpy (pstr->mbs + byte_idx, buf, mbclen);
 334                   else
 335                     {
 336                       src_idx = byte_idx;
 337                       goto offsets_needed;
 338                     }
 339                 }
 340               else
 341                 memcpy (pstr->mbs + byte_idx,
 342                         pstr->raw_mbs + pstr->raw_mbs_idx + byte_idx, mbclen);
 343               pstr->wcs[byte_idx++] = wcu;
 344               /* Write paddings.  */
 345               for (remain_len = byte_idx + mbclen - 1; byte_idx < remain_len ;)
 346                 pstr->wcs[byte_idx++] = WEOF;
 347             }
 348           else if (mbclen == (size_t) -1 || mbclen == 0
 349                    || (mbclen == (size_t) -2 && pstr->bufs_len >= pstr->len))
 350             {
 351               /* It is an invalid character, an incomplete character
 352                  at the end of the string, or '\0'.  Just use the byte.  */
 353               pstr->mbs[byte_idx] = ch;
 354               /* And also cast it to wide char.  */
 355               pstr->wcs[byte_idx++] = (wchar_t) ch;
 356               if (__glibc_unlikely (mbclen == (size_t) -1))
 357                 pstr->cur_state = prev_st;
 358             }
 359           else
 360             {
 361               /* The buffer doesn't have enough space, finish to build.  */
 362               pstr->cur_state = prev_st;
 363               break;
 364             }
 365         }
 366       pstr->valid_len = byte_idx;
 367       pstr->valid_raw_len = byte_idx;
 368       return REG_NOERROR;
 369     }
 370   else
         /* General path: SRC_IDX walks the raw input while BYTE_IDX walks
            the converted buffers; they diverge once an upper-case form
            has a different byte length than its source.  */
 371     for (src_idx = pstr->valid_raw_len; byte_idx < end_idx;)
 372       {
 373         wchar_t wc;
 374         const char *p;
           /* Also entered by goto from the fast path above, with SRC_IDX
              equal to BYTE_IDX.  */
 375       offsets_needed:
 376         remain_len = end_idx - byte_idx;
 377         prev_st = pstr->cur_state;
 378         if (__glibc_unlikely (pstr->trans != NULL))
 379           {
 380             int i, ch;
 381
                 /* Gather up to mb_cur_max translated bytes in BUF for the
                    conversion below.  */
 382             for (i = 0; i < pstr->mb_cur_max && i < remain_len; ++i)
 383               {
 384                 ch = pstr->raw_mbs [pstr->raw_mbs_idx + src_idx + i];
 385                 buf[i] = pstr->trans[ch];
 386               }
 387             p = (const char *) buf;
 388           }
 389         else
 390           p = (const char *) pstr->raw_mbs + pstr->raw_mbs_idx + src_idx;
 391         mbclen = __mbrtowc (&wc, p, remain_len, &pstr->cur_state);
 392         if (__glibc_likely (0 < mbclen && mbclen < (size_t) -2))
 393           {
 394             wchar_t wcu = __towupper (wc);
 395             if (wcu != wc)
 396               {
 397                 size_t mbcdlen;
 398
 399                 mbcdlen = __wcrtomb ((char *) buf, wcu, &prev_st);
 400                 if (__glibc_likely (mbclen == mbcdlen))
 401                   memcpy (pstr->mbs + byte_idx, buf, mbclen);
 402                 else if (mbcdlen != (size_t) -1)
 403                   {
                       /* The upper-case form has a different byte length
                          than the source character.  */
 404                     size_t i;
 405
                       /* Not enough room for the converted bytes: restore
                          the shift state and stop.  */
 406                     if (byte_idx + mbcdlen > pstr->bufs_len)
 407                       {
 408                         pstr->cur_state = prev_st;
 409                         break;
 410                       }
 411
 412                     if (pstr->offsets == NULL)
 413                       {
 414                         pstr->offsets = re_malloc (Idx, pstr->bufs_len);
 415
 416                         if (pstr->offsets == NULL)
 417                           return REG_ESPACE;
 418                       }
                       /* Everything built before this character is
                          recorded as mapping one-to-one.  */
 419                     if (!pstr->offsets_needed)
 420                       {
 421                         for (i = 0; i < (size_t) byte_idx; ++i)
 422                           pstr->offsets[i] = i;
 423                         pstr->offsets_needed = 1;
 424                       }
 425
 426                     memcpy (pstr->mbs + byte_idx, buf, mbcdlen);
 427                     pstr->wcs[byte_idx] = wcu;
 428                     pstr->offsets[byte_idx] = src_idx;
                       /* Map each further converted byte to a byte of the
                          source character, clamping to its last byte when
                          the converted form is longer.  */
 429                     for (i = 1; i < mbcdlen; ++i)
 430                       {
 431                         pstr->offsets[byte_idx + i]
 432                           = src_idx + (i < mbclen ? i : mbclen - 1);
 433                         pstr->wcs[byte_idx + i] = WEOF;
 434                       }
                       /* The converted string changes length: adjust LEN,
                          STOP (only if the raw stop position lies beyond
                          SRC_IDX) and END_IDX accordingly.  */
 435                     pstr->len += mbcdlen - mbclen;
 436                     if (pstr->raw_stop > src_idx)
 437                       pstr->stop += mbcdlen - mbclen;
 438                     end_idx = (pstr->bufs_len > pstr->len)
 439                               ? pstr->len : pstr->bufs_len;
 440                     byte_idx += mbcdlen;
 441                     src_idx += mbclen;
 442                     continue;
 443                   }
 444                 else
                     /* __wcrtomb failed: keep the source bytes.  */
 445                   memcpy (pstr->mbs + byte_idx, p, mbclen);
 446               }
 447             else
 448               memcpy (pstr->mbs + byte_idx, p, mbclen);
 449
 450             if (__glibc_unlikely (pstr->offsets_needed != 0))
 451               {
 452                 size_t i;
 453                 for (i = 0; i < mbclen; ++i)
 454                   pstr->offsets[byte_idx + i] = src_idx + i;
 455               }
 456             src_idx += mbclen;
 457
 458             pstr->wcs[byte_idx++] = wcu;
 459             /* Write paddings.  */
 460             for (remain_len = byte_idx + mbclen - 1; byte_idx < remain_len ;)
 461               pstr->wcs[byte_idx++] = WEOF;
 462           }
 463         else if (mbclen == (size_t) -1 || mbclen == 0
 464                  || (mbclen == (size_t) -2 && pstr->bufs_len >= pstr->len))
 465           {
 466             /* It is an invalid character or '\0'.  Just use the byte.  */
 467             int ch = pstr->raw_mbs[pstr->raw_mbs_idx + src_idx];
 468
 469             if (__glibc_unlikely (pstr->trans != NULL))
 470               ch = pstr->trans [ch];
 471             pstr->mbs[byte_idx] = ch;
 472
 473             if (__glibc_unlikely (pstr->offsets_needed != 0))
 474               pstr->offsets[byte_idx] = src_idx;
 475             ++src_idx;
 476
 477             /* And also cast it to wide char.  */
 478             pstr->wcs[byte_idx++] = (wchar_t) ch;
 479             if (__glibc_unlikely (mbclen == (size_t) -1))
 480               pstr->cur_state = prev_st;
 481           }
 482         else
 483           {
 484             /* The buffer doesn't have enough space, finish to build.  */
 485             pstr->cur_state = prev_st;
 486             break;
 487           }
 488       }
 489   pstr->valid_len = byte_idx;
 490   pstr->valid_raw_len = src_idx;
 491   return REG_NOERROR;
 492 }
 493
 494 /* Skip characters until the index becomes greater than NEW_RAW_IDX.
 495    Return the index.  */
 496
 497 static Idx
 498 re_string_skip_chars (re_string_t *pstr, Idx new_raw_idx, wint_t *last_wc)
 499 {
 500   mbstate_t prev_st;
 501   Idx rawbuf_idx;
 502   size_t mbclen;
 503   wint_t wc = WEOF;
 504
 505   /* Skip the characters which are not necessary to check.  */
 506   for (rawbuf_idx = pstr->raw_mbs_idx + pstr->valid_raw_len;
 507        rawbuf_idx < new_raw_idx;)
 508     {
 509       wchar_t wc2;
 510       Idx remain_len = pstr->raw_len - rawbuf_idx;
 511       prev_st = pstr->cur_state;
 512       mbclen = __mbrtowc (&wc2, (const char *) pstr->raw_mbs + rawbuf_idx,
 513                           remain_len, &pstr->cur_state);
 514       if (__glibc_unlikely (mbclen == (size_t) -2 || mbclen == (size_t) -1
 515                             || mbclen == 0))
 516         {
 517           /* We treat these cases as a single byte character.  */
 518           if (mbclen == 0 || remain_len == 0)
 519             wc = L'\0';
 520           else
 521             wc = *(unsigned char *) (pstr->raw_mbs + rawbuf_idx);
 522           mbclen = 1;
 523           pstr->cur_state = prev_st;
 524         }
 525       else
 526         wc = wc2;
 527       /* Then proceed the next character.  */
 528       rawbuf_idx += mbclen;
 529     }
 530   *last_wc = wc;
 531   return rawbuf_idx;
 532 }
 533 #endif /* RE_ENABLE_I18N  */
 534
 535 /* Build the buffer PSTR->MBS, and apply the translation if we need.
 536    This function is used in case of REG_ICASE.  */
 537
 538 static void
 539 build_upper_buffer (re_string_t *pstr)
 540 {
 541   Idx char_idx, end_idx;
 542   end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
 543
 544   for (char_idx = pstr->valid_len; char_idx < end_idx; ++char_idx)
 545     {
 546       int ch = pstr->raw_mbs[pstr->raw_mbs_idx + char_idx];
 547       if (__glibc_unlikely (pstr->trans != NULL))
 548         ch = pstr->trans[ch];
 549       pstr->mbs[char_idx] = toupper (ch);
 550     }
 551   pstr->valid_len = char_idx;
 552   pstr->valid_raw_len = char_idx;
 553 }
 554
 555 /* Apply TRANS to the buffer in PSTR.  */
 556
 557 static void
 558 re_string_translate_buffer (re_string_t *pstr)
 559 {
 560   Idx buf_idx, end_idx;
 561   end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
 562
 563   for (buf_idx = pstr->valid_len; buf_idx < end_idx; ++buf_idx)
 564     {
 565       int ch = pstr->raw_mbs[pstr->raw_mbs_idx + buf_idx];
 566       pstr->mbs[buf_idx] = pstr->trans[ch];
 567     }
 568
 569   pstr->valid_len = buf_idx;
 570   pstr->valid_raw_len = buf_idx;
 571 }
 572
 573 /* This function reconstructs the buffers so that they start at raw
 574    index IDX.  Concretely, convert to wide character in case of
 575    pstr->mb_cur_max > 1, convert to upper case in case of REG_ICASE,
        apply translation.

        If IDX is at or after the current buffer start, characters that
        were already converted from IDX on are kept where possible and
        moved to the front of the buffers; otherwise they are discarded.
        If IDX is before the current start, the state is reset and the
        buffers are rebuilt from the beginning of the raw string.
        PSTR->TIP_CONTEXT is updated for the new start; EFLAGS is consulted
        for REG_NOTBOL and passed on to re_string_context_at.

        Returns REG_NOERROR, or the error from build_wcs_upper_buffer.  */
 576
 577 static reg_errcode_t
 578 __attribute_warn_unused_result__
 579 re_string_reconstruct (re_string_t *pstr, Idx idx, int eflags)
 580 {
 581   Idx offset;
 582
       /* OFFSET is the distance from the current buffer start to IDX.  */
 583   if (__glibc_unlikely (pstr->raw_mbs_idx <= idx))
 584     offset = idx - pstr->raw_mbs_idx;
 585   else
 586     {
 587       /* Reset buffer.  IDX lies before the current start, so forget
              everything built so far and measure OFFSET from the beginning
              of the raw string.  */
 588 #ifdef RE_ENABLE_I18N
 589       if (pstr->mb_cur_max > 1)
 590         memset (&pstr->cur_state, '\0', sizeof (mbstate_t));
 591 #endif /* RE_ENABLE_I18N */
 592       pstr->len = pstr->raw_len;
 593       pstr->stop = pstr->raw_stop;
 594       pstr->valid_len = 0;
 595       pstr->raw_mbs_idx = 0;
 596       pstr->valid_raw_len = 0;
 597       pstr->offsets_needed = 0;
 598       pstr->tip_context = ((eflags & REG_NOTBOL) ? CONTEXT_BEGBUF
 599                            : CONTEXT_NEWLINE | CONTEXT_BEGBUF);
 600       if (!pstr->mbs_allocated)
 601         pstr->mbs = (unsigned char *) pstr->raw_mbs;
 602       offset = idx;
 603     }
 604
       /* The buffer start moves forward by OFFSET raw bytes.  */
 605   if (__glibc_likely (offset != 0))
 606     {
 607       /* Should the already checked characters be kept?  */
 608       if (__glibc_likely (offset < pstr->valid_raw_len))
 609         {
 610           /* Yes, move them to the front of the buffer.  */
 611 #ifdef RE_ENABLE_I18N
 612           if (__glibc_unlikely (pstr->offsets_needed))
 613             {
                   /* Converted and raw indices differ here.  Binary search
                      PSTR->OFFSETS for a converted index MID whose raw
                      offset equals OFFSET or, failing that, the first
                      index whose raw offset exceeds it.  */
 614               Idx low = 0, high = pstr->valid_len, mid;
 615               do
 616                 {
 617                   mid = (high + low) / 2;
 618                   if (pstr->offsets[mid] > offset)
 619                     high = mid;
 620                   else if (pstr->offsets[mid] < offset)
 621                     low = mid + 1;
 622                   else
 623                     break;
 624                 }
 625               while (low < high);
 626               if (pstr->offsets[mid] < offset)
 627                 ++mid;
 628               pstr->tip_context = re_string_context_at (pstr, mid - 1,
 629                                                         eflags);
 630               /* This can be quite complicated, so handle specially
 631                  only the common and easy case where the character with
 632                  different length representation of lower and upper
 633                  case is present at or after offset.  */
 634               if (pstr->valid_len > offset
 635                   && mid == offset && pstr->offsets[mid] == offset)
 636                 {
 637                   memmove (pstr->wcs, pstr->wcs + offset,
 638                            (pstr->valid_len - offset) * sizeof (wint_t));
 639                   memmove (pstr->mbs, pstr->mbs + offset, pstr->valid_len - offset);
 640                   pstr->valid_len -= offset;
 641                   pstr->valid_raw_len -= offset;
                       /* Rebase the kept offsets onto the new start.  */
 642                   for (low = 0; low < pstr->valid_len; low++)
 643                     pstr->offsets[low] = pstr->offsets[low + offset] - offset;
 644                 }
 645               else
 646                 {
 647                   /* Otherwise, just find out how long the partial multibyte
 648                      character at offset is and fill it with WEOF/255.  */
 649                   pstr->len = pstr->raw_len - idx + offset;
 650                   pstr->stop = pstr->raw_stop - idx + offset;
 651                   pstr->offsets_needed = 0;
                       /* Step back to the first converted index mapped to
                          OFFSET, then forward to the next position that
                          starts a character.  */
 652                   while (mid > 0 && pstr->offsets[mid - 1] == offset)
 653                     --mid;
 654                   while (mid < pstr->valid_len)
 655                     if (pstr->wcs[mid] != WEOF)
 656                       break;
 657                     else
 658                       ++mid;
 659                   if (mid == pstr->valid_len)
 660                     pstr->valid_len = 0;
 661                   else
 662                     {
 663                       pstr->valid_len = pstr->offsets[mid] - offset;
 664                       if (pstr->valid_len)
 665                         {
 666                           for (low = 0; low < pstr->valid_len; ++low)
 667                             pstr->wcs[low] = WEOF;
 668                           memset (pstr->mbs, 255, pstr->valid_len);
 669                         }
 670                     }
 671                   pstr->valid_raw_len = pstr->valid_len;
 672                 }
 673             }
 674           else
 675 #endif
 676             {
                   /* Indices map one-to-one: the context comes from the
                      element just before the new start, then the kept
                      part is shifted down.  */
 677               pstr->tip_context = re_string_context_at (pstr, offset - 1,
 678                                                         eflags);
 679 #ifdef RE_ENABLE_I18N
 680               if (pstr->mb_cur_max > 1)
 681                 memmove (pstr->wcs, pstr->wcs + offset,
 682                          (pstr->valid_len - offset) * sizeof (wint_t));
 683 #endif /* RE_ENABLE_I18N */
 684               if (__glibc_unlikely (pstr->mbs_allocated))
 685                 memmove (pstr->mbs, pstr->mbs + offset,
 686                          pstr->valid_len - offset);
 687               pstr->valid_len -= offset;
 688               pstr->valid_raw_len -= offset;
 689               DEBUG_ASSERT (pstr->valid_len > 0);
 690             }
 691         }
 692       else
 693         {
 694 #ifdef RE_ENABLE_I18N
 695           /* No, skip all characters until IDX.  */
 696           Idx prev_valid_len = pstr->valid_len;
 697
 698           if (__glibc_unlikely (pstr->offsets_needed))
 699             {
 700               pstr->len = pstr->raw_len - idx + offset;
 701               pstr->stop = pstr->raw_stop - idx + offset;
 702               pstr->offsets_needed = 0;
 703             }
 704 #endif
 705           pstr->valid_len = 0;
 706 #ifdef RE_ENABLE_I18N
 707           if (pstr->mb_cur_max > 1)
 708             {
 709               Idx wcs_idx;
                   /* WC will hold the character preceding the new start;
                      WEOF means it has not been determined yet.  */
 710               wint_t wc = WEOF;
 711
 712               if (pstr->is_utf8)
 713                 {
 714                   const unsigned char *raw, *p, *end;
 715
 716                   /* Special case UTF-8.  Multi-byte chars start with any
 717                      byte other than 0x80 - 0xbf.  */
 718                   raw = pstr->raw_mbs + pstr->raw_mbs_idx;
 719                   end = raw + (offset - pstr->mb_cur_max);
 720                   if (end < pstr->raw_mbs)
 721                     end = pstr->raw_mbs;
 722                   p = raw + offset - 1;
 723 #ifdef _LIBC
 724                   /* We know the wchar_t encoding is UCS4, so for the simple
 725                      case, ASCII characters, skip the conversion step.  */
 726                   if (isascii (*p) && __glibc_likely (pstr->trans == NULL))
 727                     {
 728                       memset (&pstr->cur_state, '\0', sizeof (mbstate_t));
 729                       /* pstr->valid_len = 0; */
 730                       wc = (wchar_t) *p;
 731                     }
 732                   else
 733 #endif
                       /* Scan backwards, no further than END, for the
                          first byte of the character that covers the byte
                          just before the new start.  */
 734                     for (; p >= end; --p)
 735                       if ((*p & 0xc0) != 0x80)
 736                         {
 737                           mbstate_t cur_state;
 738                           wchar_t wc2;
 739                           Idx mlen = raw + pstr->len - p;
 740                           unsigned char buf[6];
 741                           size_t mbclen;
 742
 743                           const unsigned char *pp = p;
 744                           if (__glibc_unlikely (pstr->trans != NULL))
 745                             {
 746                               int i = mlen < 6 ? mlen : 6;
 747                               while (--i >= 0)
 748                                 buf[i] = pstr->trans[p[i]];
 749                               pp = buf;
 750                             }
 751                           /* XXX Don't use mbrtowc, we know which conversion
 752                              to use (UTF-8 -> UCS4).  */
 753                           memset (&cur_state, 0, sizeof (cur_state));
 754                           mbclen = __mbrtowc (&wc2, (const char *) pp, mlen,
 755                                               &cur_state);
                               /* Accept the character only if it extends at
                                  least up to the new start; any bytes of it
                                  at or beyond the start become VALID_LEN
                                  placeholder positions.  */
 756                           if (raw + offset - p <= mbclen
 757                               && mbclen < (size_t) -2)
 758                             {
 759                               memset (&pstr->cur_state, '\0',
 760                                       sizeof (mbstate_t));
 761                               pstr->valid_len = mbclen - (raw + offset - p);
 762                               wc = wc2;
 763                             }
 764                           break;
 765                         }
 766                 }
 767
                   /* Still unknown: decode forward from the end of the
                      previously valid part up to IDX.  */
 768               if (wc == WEOF)
 769                 pstr->valid_len = re_string_skip_chars (pstr, idx, &wc) - idx;
 770               if (wc == WEOF)
 771                 pstr->tip_context
 772                   = re_string_context_at (pstr, prev_valid_len - 1, eflags);
 773               else
 774                 pstr->tip_context = ((__glibc_unlikely (pstr->word_ops_used != 0)
 775                                       && IS_WIDE_WORD_CHAR (wc))
 776                                      ? CONTEXT_WORD
 777                                      : ((IS_WIDE_NEWLINE (wc)
 778                                          && pstr->newline_anchor)
 779                                         ? CONTEXT_NEWLINE : 0));
                   /* Positions covered by a character that began before
                      the new start are filled with WEOF (and byte 255 in
                      an owned MBS).  */
 780               if (__glibc_unlikely (pstr->valid_len))
 781                 {
 782                   for (wcs_idx = 0; wcs_idx < pstr->valid_len; ++wcs_idx)
 783                     pstr->wcs[wcs_idx] = WEOF;
 784                   if (pstr->mbs_allocated)
 785                     memset (pstr->mbs, 255, pstr->valid_len);
 786                 }
 787               pstr->valid_raw_len = pstr->valid_len;
 788             }
 789           else
 790 #endif /* RE_ENABLE_I18N */
 791             {
                   /* Single-byte case: the context is determined by the
                      (translated) byte just before the new start.  */
 792               int c = pstr->raw_mbs[pstr->raw_mbs_idx + offset - 1];
 793               pstr->valid_raw_len = 0;
 794               if (pstr->trans)
 795                 c = pstr->trans[c];
 796               pstr->tip_context = (bitset_contain (pstr->word_char, c)
 797                                    ? CONTEXT_WORD
 798                                    : ((IS_NEWLINE (c) && pstr->newline_anchor)
 799                                       ? CONTEXT_NEWLINE : 0));
 800             }
 801         }
           /* When MBS is not an owned buffer (it was set from the raw
              string), it is simply advanced.  */
 802       if (!__glibc_unlikely (pstr->mbs_allocated))
 803         pstr->mbs += offset;
 804     }
       /* Record the new start and shrink the remaining lengths.  */
 805   pstr->raw_mbs_idx = idx;
 806   pstr->len -= offset;
 807   pstr->stop -= offset;
 808
 809   /* Then build the buffers.  */
 810 #ifdef RE_ENABLE_I18N
 811   if (pstr->mb_cur_max > 1)
 812     {
 813       if (pstr->icase)
 814         {
 815           reg_errcode_t ret = build_wcs_upper_buffer (pstr);
 816           if (__glibc_unlikely (ret != REG_NOERROR))
 817             return ret;
 818         }
 819       else
 820         build_wcs_buffer (pstr);
 821     }
 822   else
 823 #endif /* RE_ENABLE_I18N */
 824     if (__glibc_unlikely (pstr->mbs_allocated))
 825       {
 826         if (pstr->icase)
 827           build_upper_buffer (pstr);
 828         else if (pstr->trans != NULL)
 829           re_string_translate_buffer (pstr);
 830       }
 831     else
           /* Nothing to convert: the whole remaining string is valid.  */
 832       pstr->valid_len = pstr->len;
 833
 834   pstr->cur_idx = 0;
 835   return REG_NOERROR;
 836 }
 837
 838 static unsigned char
 839 __attribute__ ((pure))
 840 re_string_peek_byte_case (const re_string_t *pstr, Idx idx)
 841 {
 842   int ch;
 843   Idx off;
 844
 845   /* Handle the common (easiest) cases first.  */
 846   if (__glibc_likely (!pstr->mbs_allocated))
 847     return re_string_peek_byte (pstr, idx);
 848
 849 #ifdef RE_ENABLE_I18N
 850   if (pstr->mb_cur_max > 1
 851       && ! re_string_is_single_byte_char (pstr, pstr->cur_idx + idx))
 852     return re_string_peek_byte (pstr, idx);
 853 #endif
 854
 855   off = pstr->cur_idx + idx;
 856 #ifdef RE_ENABLE_I18N
 857   if (pstr->offsets_needed)
 858     off = pstr->offsets[off];
 859 #endif
 860
 861   ch = pstr->raw_mbs[pstr->raw_mbs_idx + off];
 862
 863 #ifdef RE_ENABLE_I18N
 864   /* Ensure that e.g. for tr_TR.UTF-8 BACKSLASH DOTLESS SMALL LETTER I
 865      this function returns CAPITAL LETTER I instead of first byte of
 866      DOTLESS SMALL LETTER I.  The latter would confuse the parser,
 867      since peek_byte_case doesn't advance cur_idx in any way.  */
 868   if (pstr->offsets_needed && !isascii (ch))
 869     return re_string_peek_byte (pstr, idx);
 870 #endif
 871
 872   return ch;
 873 }
 874
 875 static unsigned char
 876 re_string_fetch_byte_case (re_string_t *pstr)
 877 {
 878   if (__glibc_likely (!pstr->mbs_allocated))
 879     return re_string_fetch_byte (pstr);
 880
 881 #ifdef RE_ENABLE_I18N
 882   if (pstr->offsets_needed)
 883     {
 884       Idx off;
 885       int ch;
 886
 887       /* For tr_TR.UTF-8 [[:islower:]] there is
 888          [[: CAPITAL LETTER I WITH DOT lower:]] in mbs.  Skip
 889          in that case the whole multi-byte character and return
 890          the original letter.  On the other side, with
 891          [[: DOTLESS SMALL LETTER I return [[:I, as doing
 892          anything else would complicate things too much.  */
 893
 894       if (!re_string_first_byte (pstr, pstr->cur_idx))
 895         return re_string_fetch_byte (pstr);
 896
 897       off = pstr->offsets[pstr->cur_idx];
 898       ch = pstr->raw_mbs[pstr->raw_mbs_idx + off];
 899
 900       if (! isascii (ch))
 901         return re_string_fetch_byte (pstr);
 902
 903       re_string_skip_bytes (pstr,
 904                             re_string_char_size_at (pstr, pstr->cur_idx));
 905       return ch;
 906     }
 907 #endif
 908
 909   return pstr->raw_mbs[pstr->raw_mbs_idx + pstr->cur_idx++];
 910 }
 911
 912 static void
 913 re_string_destruct (re_string_t *pstr)
 914 {
 915 #ifdef RE_ENABLE_I18N
 916   re_free (pstr->wcs);
 917   re_free (pstr->offsets);
 918 #endif /* RE_ENABLE_I18N  */
 919   if (pstr->mbs_allocated)
 920     re_free (pstr->mbs);
 921 }
 922
 923 /* Return the context at IDX in INPUT.  */
 924
 925 static unsigned int
 926 re_string_context_at (const re_string_t *input, Idx idx, int eflags)
 927 {
 928   int c;
 929   if (__glibc_unlikely (idx < 0))
 930     /* In this case, we use the value stored in input->tip_context,
 931        since we can't know the character in input->mbs[-1] here.  */
 932     return input->tip_context;
 933   if (__glibc_unlikely (idx == input->len))
 934     return ((eflags & REG_NOTEOL) ? CONTEXT_ENDBUF
 935             : CONTEXT_NEWLINE | CONTEXT_ENDBUF);
 936 #ifdef RE_ENABLE_I18N
 937   if (input->mb_cur_max > 1)
 938     {
 939       wint_t wc;
 940       Idx wc_idx = idx;
 941       while(input->wcs[wc_idx] == WEOF)
 942         {
 943           DEBUG_ASSERT (wc_idx >= 0);
 944           --wc_idx;
 945           if (wc_idx < 0)
 946             return input->tip_context;
 947         }
 948       wc = input->wcs[wc_idx];
 949       if (__glibc_unlikely (input->word_ops_used != 0)
 950           && IS_WIDE_WORD_CHAR (wc))
 951         return CONTEXT_WORD;
 952       return (IS_WIDE_NEWLINE (wc) && input->newline_anchor
 953               ? CONTEXT_NEWLINE : 0);
 954     }
 955   else
 956 #endif
 957     {
 958       c = re_string_byte_at (input, idx);
 959       if (bitset_contain (input->word_char, c))
 960         return CONTEXT_WORD;
 961       return IS_NEWLINE (c) && input->newline_anchor ? CONTEXT_NEWLINE : 0;
 962     }
 963 }
 964 \f
/* Functions for set operations.  */
 966
 967 static reg_errcode_t
 968 __attribute_warn_unused_result__
 969 re_node_set_alloc (re_node_set *set, Idx size)
 970 {
 971   set->alloc = size;
 972   set->nelem = 0;
 973   set->elems = re_malloc (Idx, size);
 974   if (__glibc_unlikely (set->elems == NULL)
 975       && (MALLOC_0_IS_NONNULL || size != 0))
 976     return REG_ESPACE;
 977   return REG_NOERROR;
 978 }
 979
 980 static reg_errcode_t
 981 __attribute_warn_unused_result__
 982 re_node_set_init_1 (re_node_set *set, Idx elem)
 983 {
 984   set->alloc = 1;
 985   set->nelem = 1;
 986   set->elems = re_malloc (Idx, 1);
 987   if (__glibc_unlikely (set->elems == NULL))
 988     {
 989       set->alloc = set->nelem = 0;
 990       return REG_ESPACE;
 991     }
 992   set->elems[0] = elem;
 993   return REG_NOERROR;
 994 }
 995
 996 static reg_errcode_t
 997 __attribute_warn_unused_result__
 998 re_node_set_init_2 (re_node_set *set, Idx elem1, Idx elem2)
 999 {
1000   set->alloc = 2;
1001   set->elems = re_malloc (Idx, 2);
1002   if (__glibc_unlikely (set->elems == NULL))
1003     return REG_ESPACE;
1004   if (elem1 == elem2)
1005     {
1006       set->nelem = 1;
1007       set->elems[0] = elem1;
1008     }
1009   else
1010     {
1011       set->nelem = 2;
1012       if (elem1 < elem2)
1013         {
1014           set->elems[0] = elem1;
1015           set->elems[1] = elem2;
1016         }
1017       else
1018         {
1019           set->elems[0] = elem2;
1020           set->elems[1] = elem1;
1021         }
1022     }
1023   return REG_NOERROR;
1024 }
1025
1026 static reg_errcode_t
1027 __attribute_warn_unused_result__
1028 re_node_set_init_copy (re_node_set *dest, const re_node_set *src)
1029 {
1030   dest->nelem = src->nelem;
1031   if (src->nelem > 0)
1032     {
1033       dest->alloc = dest->nelem;
1034       dest->elems = re_malloc (Idx, dest->alloc);
1035       if (__glibc_unlikely (dest->elems == NULL))
1036         {
1037           dest->alloc = dest->nelem = 0;
1038           return REG_ESPACE;
1039         }
1040       memcpy (dest->elems, src->elems, src->nelem * sizeof (Idx));
1041     }
1042   else
1043     re_node_set_init_empty (dest);
1044   return REG_NOERROR;
1045 }
1046
1047 /* Calculate the intersection of the sets SRC1 and SRC2. And merge it to
1048    DEST. Return value indicate the error code or REG_NOERROR if succeeded.
1049    Note: We assume dest->elems is NULL, when dest->alloc is 0.  */
1050
1051 static reg_errcode_t
1052 __attribute_warn_unused_result__
1053 re_node_set_add_intersect (re_node_set *dest, const re_node_set *src1,
1054                            const re_node_set *src2)
1055 {
1056   Idx i1, i2, is, id, delta, sbase;
1057   if (src1->nelem == 0 || src2->nelem == 0)
1058     return REG_NOERROR;
1059
1060   /* We need dest->nelem + 2 * elems_in_intersection; this is a
1061      conservative estimate.  */
1062   if (src1->nelem + src2->nelem + dest->nelem > dest->alloc)
1063     {
1064       Idx new_alloc = src1->nelem + src2->nelem + dest->alloc;
1065       Idx *new_elems = re_realloc (dest->elems, Idx, new_alloc);
1066       if (__glibc_unlikely (new_elems == NULL))
1067         return REG_ESPACE;
1068       dest->elems = new_elems;
1069       dest->alloc = new_alloc;
1070     }
1071
1072   /* Find the items in the intersection of SRC1 and SRC2, and copy
1073      into the top of DEST those that are not already in DEST itself.  */
1074   sbase = dest->nelem + src1->nelem + src2->nelem;
1075   i1 = src1->nelem - 1;
1076   i2 = src2->nelem - 1;
1077   id = dest->nelem - 1;
1078   for (;;)
1079     {
1080       if (src1->elems[i1] == src2->elems[i2])
1081         {
1082           /* Try to find the item in DEST.  Maybe we could binary search?  */
1083           while (id >= 0 && dest->elems[id] > src1->elems[i1])
1084             --id;
1085
1086           if (id < 0 || dest->elems[id] != src1->elems[i1])
1087             dest->elems[--sbase] = src1->elems[i1];
1088
1089           if (--i1 < 0 || --i2 < 0)
1090             break;
1091         }
1092
1093       /* Lower the highest of the two items.  */
1094       else if (src1->elems[i1] < src2->elems[i2])
1095         {
1096           if (--i2 < 0)
1097             break;
1098         }
1099       else
1100         {
1101           if (--i1 < 0)
1102             break;
1103         }
1104     }
1105
1106   id = dest->nelem - 1;
1107   is = dest->nelem + src1->nelem + src2->nelem - 1;
1108   delta = is - sbase + 1;
1109
1110   /* Now copy.  When DELTA becomes zero, the remaining
1111      DEST elements are already in place; this is more or
1112      less the same loop that is in re_node_set_merge.  */
1113   dest->nelem += delta;
1114   if (delta > 0 && id >= 0)
1115     for (;;)
1116       {
1117         if (dest->elems[is] > dest->elems[id])
1118           {
1119             /* Copy from the top.  */
1120             dest->elems[id + delta--] = dest->elems[is--];
1121             if (delta == 0)
1122               break;
1123           }
1124         else
1125           {
1126             /* Slide from the bottom.  */
1127             dest->elems[id + delta] = dest->elems[id];
1128             if (--id < 0)
1129               break;
1130           }
1131       }
1132
1133   /* Copy remaining SRC elements.  */
1134   memcpy (dest->elems, dest->elems + sbase, delta * sizeof (Idx));
1135
1136   return REG_NOERROR;
1137 }
1138
1139 /* Calculate the union set of the sets SRC1 and SRC2. And store it to
1140    DEST. Return value indicate the error code or REG_NOERROR if succeeded.  */
1141
1142 static reg_errcode_t
1143 __attribute_warn_unused_result__
1144 re_node_set_init_union (re_node_set *dest, const re_node_set *src1,
1145                         const re_node_set *src2)
1146 {
1147   Idx i1, i2, id;
1148   if (src1 != NULL && src1->nelem > 0 && src2 != NULL && src2->nelem > 0)
1149     {
1150       dest->alloc = src1->nelem + src2->nelem;
1151       dest->elems = re_malloc (Idx, dest->alloc);
1152       if (__glibc_unlikely (dest->elems == NULL))
1153         return REG_ESPACE;
1154     }
1155   else
1156     {
1157       if (src1 != NULL && src1->nelem > 0)
1158         return re_node_set_init_copy (dest, src1);
1159       else if (src2 != NULL && src2->nelem > 0)
1160         return re_node_set_init_copy (dest, src2);
1161       else
1162         re_node_set_init_empty (dest);
1163       return REG_NOERROR;
1164     }
1165   for (i1 = i2 = id = 0 ; i1 < src1->nelem && i2 < src2->nelem ;)
1166     {
1167       if (src1->elems[i1] > src2->elems[i2])
1168         {
1169           dest->elems[id++] = src2->elems[i2++];
1170           continue;
1171         }
1172       if (src1->elems[i1] == src2->elems[i2])
1173         ++i2;
1174       dest->elems[id++] = src1->elems[i1++];
1175     }
1176   if (i1 < src1->nelem)
1177     {
1178       memcpy (dest->elems + id, src1->elems + i1,
1179              (src1->nelem - i1) * sizeof (Idx));
1180       id += src1->nelem - i1;
1181     }
1182   else if (i2 < src2->nelem)
1183     {
1184       memcpy (dest->elems + id, src2->elems + i2,
1185              (src2->nelem - i2) * sizeof (Idx));
1186       id += src2->nelem - i2;
1187     }
1188   dest->nelem = id;
1189   return REG_NOERROR;
1190 }
1191
1192 /* Calculate the union set of the sets DEST and SRC. And store it to
1193    DEST. Return value indicate the error code or REG_NOERROR if succeeded.  */
1194
1195 static reg_errcode_t
1196 __attribute_warn_unused_result__
1197 re_node_set_merge (re_node_set *dest, const re_node_set *src)
1198 {
1199   Idx is, id, sbase, delta;
1200   if (src == NULL || src->nelem == 0)
1201     return REG_NOERROR;
1202   if (dest->alloc < 2 * src->nelem + dest->nelem)
1203     {
1204       Idx new_alloc = 2 * (src->nelem + dest->alloc);
1205       Idx *new_buffer = re_realloc (dest->elems, Idx, new_alloc);
1206       if (__glibc_unlikely (new_buffer == NULL))
1207         return REG_ESPACE;
1208       dest->elems = new_buffer;
1209       dest->alloc = new_alloc;
1210     }
1211
1212   if (__glibc_unlikely (dest->nelem == 0))
1213     {
1214       dest->nelem = src->nelem;
1215       memcpy (dest->elems, src->elems, src->nelem * sizeof (Idx));
1216       return REG_NOERROR;
1217     }
1218
1219   /* Copy into the top of DEST the items of SRC that are not
1220      found in DEST.  Maybe we could binary search in DEST?  */
1221   for (sbase = dest->nelem + 2 * src->nelem,
1222        is = src->nelem - 1, id = dest->nelem - 1; is >= 0 && id >= 0; )
1223     {
1224       if (dest->elems[id] == src->elems[is])
1225         is--, id--;
1226       else if (dest->elems[id] < src->elems[is])
1227         dest->elems[--sbase] = src->elems[is--];
1228       else /* if (dest->elems[id] > src->elems[is]) */
1229         --id;
1230     }
1231
1232   if (is >= 0)
1233     {
1234       /* If DEST is exhausted, the remaining items of SRC must be unique.  */
1235       sbase -= is + 1;
1236       memcpy (dest->elems + sbase, src->elems, (is + 1) * sizeof (Idx));
1237     }
1238
1239   id = dest->nelem - 1;
1240   is = dest->nelem + 2 * src->nelem - 1;
1241   delta = is - sbase + 1;
1242   if (delta == 0)
1243     return REG_NOERROR;
1244
1245   /* Now copy.  When DELTA becomes zero, the remaining
1246      DEST elements are already in place.  */
1247   dest->nelem += delta;
1248   for (;;)
1249     {
1250       if (dest->elems[is] > dest->elems[id])
1251         {
1252           /* Copy from the top.  */
1253           dest->elems[id + delta--] = dest->elems[is--];
1254           if (delta == 0)
1255             break;
1256         }
1257       else
1258         {
1259           /* Slide from the bottom.  */
1260           dest->elems[id + delta] = dest->elems[id];
1261           if (--id < 0)
1262             {
1263               /* Copy remaining SRC elements.  */
1264               memcpy (dest->elems, dest->elems + sbase,
1265                       delta * sizeof (Idx));
1266               break;
1267             }
1268         }
1269     }
1270
1271   return REG_NOERROR;
1272 }
1273
1274 /* Insert the new element ELEM to the re_node_set* SET.
1275    SET should not already have ELEM.
1276    Return true if successful.  */
1277
1278 static bool
1279 __attribute_warn_unused_result__
1280 re_node_set_insert (re_node_set *set, Idx elem)
1281 {
1282   Idx idx;
1283   /* In case the set is empty.  */
1284   if (set->alloc == 0)
1285     return __glibc_likely (re_node_set_init_1 (set, elem) == REG_NOERROR);
1286
1287   if (__glibc_unlikely (set->nelem) == 0)
1288     {
1289       /* We already guaranteed above that set->alloc != 0.  */
1290       set->elems[0] = elem;
1291       ++set->nelem;
1292       return true;
1293     }
1294
1295   /* Realloc if we need.  */
1296   if (set->alloc == set->nelem)
1297     {
1298       Idx *new_elems;
1299       set->alloc = set->alloc * 2;
1300       new_elems = re_realloc (set->elems, Idx, set->alloc);
1301       if (__glibc_unlikely (new_elems == NULL))
1302         return false;
1303       set->elems = new_elems;
1304     }
1305
1306   /* Move the elements which follows the new element.  Test the
1307      first element separately to skip a check in the inner loop.  */
1308   if (elem < set->elems[0])
1309     {
1310       for (idx = set->nelem; idx > 0; idx--)
1311         set->elems[idx] = set->elems[idx - 1];
1312     }
1313   else
1314     {
1315       for (idx = set->nelem; set->elems[idx - 1] > elem; idx--)
1316         set->elems[idx] = set->elems[idx - 1];
1317     }
1318
1319   /* Insert the new element.  */
1320   set->elems[idx] = elem;
1321   ++set->nelem;
1322   return true;
1323 }
1324
1325 /* Insert the new element ELEM to the re_node_set* SET.
1326    SET should not already have any element greater than or equal to ELEM.
1327    Return true if successful.  */
1328
1329 static bool
1330 __attribute_warn_unused_result__
1331 re_node_set_insert_last (re_node_set *set, Idx elem)
1332 {
1333   /* Realloc if we need.  */
1334   if (set->alloc == set->nelem)
1335     {
1336       Idx *new_elems;
1337       set->alloc = (set->alloc + 1) * 2;
1338       new_elems = re_realloc (set->elems, Idx, set->alloc);
1339       if (__glibc_unlikely (new_elems == NULL))
1340         return false;
1341       set->elems = new_elems;
1342     }
1343
1344   /* Insert the new element.  */
1345   set->elems[set->nelem++] = elem;
1346   return true;
1347 }
1348
1349 /* Compare two node sets SET1 and SET2.
1350    Return true if SET1 and SET2 are equivalent.  */
1351
1352 static bool
1353 __attribute__ ((pure))
1354 re_node_set_compare (const re_node_set *set1, const re_node_set *set2)
1355 {
1356   Idx i;
1357   if (set1 == NULL || set2 == NULL || set1->nelem != set2->nelem)
1358     return false;
1359   for (i = set1->nelem ; --i >= 0 ; )
1360     if (set1->elems[i] != set2->elems[i])
1361       return false;
1362   return true;
1363 }
1364
1365 /* Return (idx + 1) if SET contains the element ELEM, return 0 otherwise.  */
1366
1367 static Idx
1368 __attribute__ ((pure))
1369 re_node_set_contains (const re_node_set *set, Idx elem)
1370 {
1371   __re_size_t idx, right, mid;
1372   if (set->nelem <= 0)
1373     return 0;
1374
1375   /* Binary search the element.  */
1376   idx = 0;
1377   right = set->nelem - 1;
1378   while (idx < right)
1379     {
1380       mid = (idx + right) / 2;
1381       if (set->elems[mid] < elem)
1382         idx = mid + 1;
1383       else
1384         right = mid;
1385     }
1386   return set->elems[idx] == elem ? idx + 1 : 0;
1387 }
1388
1389 static void
1390 re_node_set_remove_at (re_node_set *set, Idx idx)
1391 {
1392   if (idx < 0 || idx >= set->nelem)
1393     return;
1394   --set->nelem;
1395   for (; idx < set->nelem; idx++)
1396     set->elems[idx] = set->elems[idx + 1];
1397 }
1398 \f
1399
1400 /* Add the token TOKEN to dfa->nodes, and return the index of the token.
1401    Or return -1 if an error occurred.  */
1402
1403 static Idx
1404 re_dfa_add_node (re_dfa_t *dfa, re_token_t token)
1405 {
1406   if (__glibc_unlikely (dfa->nodes_len >= dfa->nodes_alloc))
1407     {
1408       size_t new_nodes_alloc = dfa->nodes_alloc * 2;
1409       Idx *new_nexts, *new_indices;
1410       re_node_set *new_edests, *new_eclosures;
1411       re_token_t *new_nodes;
1412
1413       /* Avoid overflows in realloc.  */
1414       const size_t max_object_size = MAX (sizeof (re_token_t),
1415                                           MAX (sizeof (re_node_set),
1416                                                sizeof (Idx)));
1417       if (__glibc_unlikely (MIN (IDX_MAX, SIZE_MAX / max_object_size)
1418                             < new_nodes_alloc))
1419         return -1;
1420
1421       new_nodes = re_realloc (dfa->nodes, re_token_t, new_nodes_alloc);
1422       if (__glibc_unlikely (new_nodes == NULL))
1423         return -1;
1424       dfa->nodes = new_nodes;
1425       new_nexts = re_realloc (dfa->nexts, Idx, new_nodes_alloc);
1426       new_indices = re_realloc (dfa->org_indices, Idx, new_nodes_alloc);
1427       new_edests = re_realloc (dfa->edests, re_node_set, new_nodes_alloc);
1428       new_eclosures = re_realloc (dfa->eclosures, re_node_set, new_nodes_alloc);
1429       if (__glibc_unlikely (new_nexts == NULL || new_indices == NULL
1430                             || new_edests == NULL || new_eclosures == NULL))
1431         {
1432            re_free (new_nexts);
1433            re_free (new_indices);
1434            re_free (new_edests);
1435            re_free (new_eclosures);
1436            return -1;
1437         }
1438       dfa->nexts = new_nexts;
1439       dfa->org_indices = new_indices;
1440       dfa->edests = new_edests;
1441       dfa->eclosures = new_eclosures;
1442       dfa->nodes_alloc = new_nodes_alloc;
1443     }
1444   dfa->nodes[dfa->nodes_len] = token;
1445   dfa->nodes[dfa->nodes_len].constraint = 0;
1446 #ifdef RE_ENABLE_I18N
1447   dfa->nodes[dfa->nodes_len].accept_mb =
1448     ((token.type == OP_PERIOD && dfa->mb_cur_max > 1)
1449      || token.type == COMPLEX_BRACKET);
1450 #endif
1451   dfa->nexts[dfa->nodes_len] = -1;
1452   re_node_set_init_empty (dfa->edests + dfa->nodes_len);
1453   re_node_set_init_empty (dfa->eclosures + dfa->nodes_len);
1454   return dfa->nodes_len++;
1455 }
1456
1457 static re_hashval_t
1458 calc_state_hash (const re_node_set *nodes, unsigned int context)
1459 {
1460   re_hashval_t hash = nodes->nelem + context;
1461   Idx i;
1462   for (i = 0 ; i < nodes->nelem ; i++)
1463     hash += nodes->elems[i];
1464   return hash;
1465 }
1466
1467 /* Search for the state whose node_set is equivalent to NODES.
1468    Return the pointer to the state, if we found it in the DFA.
1469    Otherwise create the new one and return it.  In case of an error
1470    return NULL and set the error code in ERR.
1471    Note: - We assume NULL as the invalid state, then it is possible that
1472            return value is NULL and ERR is REG_NOERROR.
1473          - We never return non-NULL value in case of any errors, it is for
1474            optimization.  */
1475
1476 static re_dfastate_t *
1477 __attribute_warn_unused_result__
1478 re_acquire_state (reg_errcode_t *err, const re_dfa_t *dfa,
1479                   const re_node_set *nodes)
1480 {
1481   re_hashval_t hash;
1482   re_dfastate_t *new_state;
1483   struct re_state_table_entry *spot;
1484   Idx i;
1485 #if defined GCC_LINT || defined lint
1486   /* Suppress bogus uninitialized-variable warnings.  */
1487   *err = REG_NOERROR;
1488 #endif
1489   if (__glibc_unlikely (nodes->nelem == 0))
1490     {
1491       *err = REG_NOERROR;
1492       return NULL;
1493     }
1494   hash = calc_state_hash (nodes, 0);
1495   spot = dfa->state_table + (hash & dfa->state_hash_mask);
1496
1497   for (i = 0 ; i < spot->num ; i++)
1498     {
1499       re_dfastate_t *state = spot->array[i];
1500       if (hash != state->hash)
1501         continue;
1502       if (re_node_set_compare (&state->nodes, nodes))
1503         return state;
1504     }
1505
1506   /* There are no appropriate state in the dfa, create the new one.  */
1507   new_state = create_ci_newstate (dfa, nodes, hash);
1508   if (__glibc_unlikely (new_state == NULL))
1509     *err = REG_ESPACE;
1510
1511   return new_state;
1512 }
1513
1514 /* Search for the state whose node_set is equivalent to NODES and
1515    whose context is equivalent to CONTEXT.
1516    Return the pointer to the state, if we found it in the DFA.
1517    Otherwise create the new one and return it.  In case of an error
1518    return NULL and set the error code in ERR.
1519    Note: - We assume NULL as the invalid state, then it is possible that
1520            return value is NULL and ERR is REG_NOERROR.
1521          - We never return non-NULL value in case of any errors, it is for
1522            optimization.  */
1523
1524 static re_dfastate_t *
1525 __attribute_warn_unused_result__
1526 re_acquire_state_context (reg_errcode_t *err, const re_dfa_t *dfa,
1527                           const re_node_set *nodes, unsigned int context)
1528 {
1529   re_hashval_t hash;
1530   re_dfastate_t *new_state;
1531   struct re_state_table_entry *spot;
1532   Idx i;
1533 #if defined GCC_LINT || defined lint
1534   /* Suppress bogus uninitialized-variable warnings.  */
1535   *err = REG_NOERROR;
1536 #endif
1537   if (nodes->nelem == 0)
1538     {
1539       *err = REG_NOERROR;
1540       return NULL;
1541     }
1542   hash = calc_state_hash (nodes, context);
1543   spot = dfa->state_table + (hash & dfa->state_hash_mask);
1544
1545   for (i = 0 ; i < spot->num ; i++)
1546     {
1547       re_dfastate_t *state = spot->array[i];
1548       if (state->hash == hash
1549           && state->context == context
1550           && re_node_set_compare (state->entrance_nodes, nodes))
1551         return state;
1552     }
1553   /* There are no appropriate state in 'dfa', create the new one.  */
1554   new_state = create_cd_newstate (dfa, nodes, context, hash);
1555   if (__glibc_unlikely (new_state == NULL))
1556     *err = REG_ESPACE;
1557
1558   return new_state;
1559 }
1560
1561 /* Finish initialization of the new state NEWSTATE, and using its hash value
1562    HASH put in the appropriate bucket of DFA's state table.  Return value
1563    indicates the error code if failed.  */
1564
1565 static reg_errcode_t
1566 __attribute_warn_unused_result__
1567 register_state (const re_dfa_t *dfa, re_dfastate_t *newstate,
1568                 re_hashval_t hash)
1569 {
1570   struct re_state_table_entry *spot;
1571   reg_errcode_t err;
1572   Idx i;
1573
1574   newstate->hash = hash;
1575   err = re_node_set_alloc (&newstate->non_eps_nodes, newstate->nodes.nelem);
1576   if (__glibc_unlikely (err != REG_NOERROR))
1577     return REG_ESPACE;
1578   for (i = 0; i < newstate->nodes.nelem; i++)
1579     {
1580       Idx elem = newstate->nodes.elems[i];
1581       if (!IS_EPSILON_NODE (dfa->nodes[elem].type))
1582         if (! re_node_set_insert_last (&newstate->non_eps_nodes, elem))
1583           return REG_ESPACE;
1584     }
1585
1586   spot = dfa->state_table + (hash & dfa->state_hash_mask);
1587   if (__glibc_unlikely (spot->alloc <= spot->num))
1588     {
1589       Idx new_alloc = 2 * spot->num + 2;
1590       re_dfastate_t **new_array = re_realloc (spot->array, re_dfastate_t *,
1591                                               new_alloc);
1592       if (__glibc_unlikely (new_array == NULL))
1593         return REG_ESPACE;
1594       spot->array = new_array;
1595       spot->alloc = new_alloc;
1596     }
1597   spot->array[spot->num++] = newstate;
1598   return REG_NOERROR;
1599 }
1600
1601 static void
1602 free_state (re_dfastate_t *state)
1603 {
1604   re_node_set_free (&state->non_eps_nodes);
1605   re_node_set_free (&state->inveclosure);
1606   if (state->entrance_nodes != &state->nodes)
1607     {
1608       re_node_set_free (state->entrance_nodes);
1609       re_free (state->entrance_nodes);
1610     }
1611   re_node_set_free (&state->nodes);
1612   re_free (state->word_trtable);
1613   re_free (state->trtable);
1614   re_free (state);
1615 }
1616
1617 /* Create the new state which is independent of contexts.
1618    Return the new state if succeeded, otherwise return NULL.  */
1619
1620 static re_dfastate_t *
1621 __attribute_warn_unused_result__
1622 create_ci_newstate (const re_dfa_t *dfa, const re_node_set *nodes,
1623                     re_hashval_t hash)
1624 {
1625   Idx i;
1626   reg_errcode_t err;
1627   re_dfastate_t *newstate;
1628
1629   newstate = (re_dfastate_t *) calloc (sizeof (re_dfastate_t), 1);
1630   if (__glibc_unlikely (newstate == NULL))
1631     return NULL;
1632   err = re_node_set_init_copy (&newstate->nodes, nodes);
1633   if (__glibc_unlikely (err != REG_NOERROR))
1634     {
1635       re_free (newstate);
1636       return NULL;
1637     }
1638
1639   newstate->entrance_nodes = &newstate->nodes;
1640   for (i = 0 ; i < nodes->nelem ; i++)
1641     {
1642       re_token_t *node = dfa->nodes + nodes->elems[i];
1643       re_token_type_t type = node->type;
1644       if (type == CHARACTER && !node->constraint)
1645         continue;
1646 #ifdef RE_ENABLE_I18N
1647       newstate->accept_mb |= node->accept_mb;
1648 #endif /* RE_ENABLE_I18N */
1649
1650       /* If the state has the halt node, the state is a halt state.  */
1651       if (type == END_OF_RE)
1652         newstate->halt = 1;
1653       else if (type == OP_BACK_REF)
1654         newstate->has_backref = 1;
1655       else if (type == ANCHOR || node->constraint)
1656         newstate->has_constraint = 1;
1657     }
1658   err = register_state (dfa, newstate, hash);
1659   if (__glibc_unlikely (err != REG_NOERROR))
1660     {
1661       free_state (newstate);
1662       newstate = NULL;
1663     }
1664   return newstate;
1665 }
1666
1667 /* Create the new state which is depend on the context CONTEXT.
1668    Return the new state if succeeded, otherwise return NULL.  */
1669
1670 static re_dfastate_t *
1671 __attribute_warn_unused_result__
1672 create_cd_newstate (const re_dfa_t *dfa, const re_node_set *nodes,
1673                     unsigned int context, re_hashval_t hash)
1674 {
1675   Idx i, nctx_nodes = 0;
1676   reg_errcode_t err;
1677   re_dfastate_t *newstate;
1678
1679   newstate = (re_dfastate_t *) calloc (sizeof (re_dfastate_t), 1);
1680   if (__glibc_unlikely (newstate == NULL))
1681     return NULL;
1682   err = re_node_set_init_copy (&newstate->nodes, nodes);
1683   if (__glibc_unlikely (err != REG_NOERROR))
1684     {
1685       re_free (newstate);
1686       return NULL;
1687     }
1688
1689   newstate->context = context;
1690   newstate->entrance_nodes = &newstate->nodes;
1691
1692   for (i = 0 ; i < nodes->nelem ; i++)
1693     {
1694       re_token_t *node = dfa->nodes + nodes->elems[i];
1695       re_token_type_t type = node->type;
1696       unsigned int constraint = node->constraint;
1697
1698       if (type == CHARACTER && !constraint)
1699         continue;
1700 #ifdef RE_ENABLE_I18N
1701       newstate->accept_mb |= node->accept_mb;
1702 #endif /* RE_ENABLE_I18N */
1703
1704       /* If the state has the halt node, the state is a halt state.  */
1705       if (type == END_OF_RE)
1706         newstate->halt = 1;
1707       else if (type == OP_BACK_REF)
1708         newstate->has_backref = 1;
1709
1710       if (constraint)
1711         {
1712           if (newstate->entrance_nodes == &newstate->nodes)
1713             {
1714               re_node_set *entrance_nodes = re_malloc (re_node_set, 1);
1715               if (__glibc_unlikely (entrance_nodes == NULL))
1716                 {
1717                   free_state (newstate);
1718                   return NULL;
1719                 }
1720               newstate->entrance_nodes = entrance_nodes;
1721               if (re_node_set_init_copy (newstate->entrance_nodes, nodes)
1722                   != REG_NOERROR)
1723                 {
1724                   free_state (newstate);
1725                   return NULL;
1726                 }
1727               nctx_nodes = 0;
1728               newstate->has_constraint = 1;
1729             }
1730
1731           if (NOT_SATISFY_PREV_CONSTRAINT (constraint,context))
1732             {
1733               re_node_set_remove_at (&newstate->nodes, i - nctx_nodes);
1734               ++nctx_nodes;
1735             }
1736         }
1737     }
1738   err = register_state (dfa, newstate, hash);
1739   if (__glibc_unlikely (err != REG_NOERROR))
1740     {
1741       free_state (newstate);
1742       newstate = NULL;
1743     }
1744   return  newstate;
1745 }