posix/regex_internal.c

   1 /* Extended regular expression matching and search library.
   2    Copyright (C) 2002, 2003, 2004, 2005, 2006 Free Software Foundation, Inc.
   3    This file is part of the GNU C Library.
   4    Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
   5
   6    The GNU C Library is free software; you can redistribute it and/or
   7    modify it under the terms of the GNU Lesser General Public
   8    License as published by the Free Software Foundation; either
   9    version 2.1 of the License, or (at your option) any later version.
  10
  11    The GNU C Library is distributed in the hope that it will be useful,
  12    but WITHOUT ANY WARRANTY; without even the implied warranty of
  13    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  14    Lesser General Public License for more details.
  15
  16    You should have received a copy of the GNU Lesser General Public
  17    License along with the GNU C Library; if not, write to the Free
  18    Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
  19    02111-1307 USA.  */
  20
  21 static void re_string_construct_common (const char *str, int len,
  22                                         re_string_t *pstr,
  23                                         RE_TRANSLATE_TYPE trans, int icase,
  24                                         const re_dfa_t *dfa) internal_function;
  25 static re_dfastate_t *create_ci_newstate (const re_dfa_t *dfa,
  26                                           const re_node_set *nodes,
  27                                           unsigned int hash) internal_function;
  28 static re_dfastate_t *create_cd_newstate (const re_dfa_t *dfa,
  29                                           const re_node_set *nodes,
  30                                           unsigned int context,
  31                                           unsigned int hash) internal_function;
  32 \f
  33 /* Functions for string operation.  */
  34
  35 /* This function allocate the buffers.  It is necessary to call
  36    re_string_reconstruct before using the object.  */
  37
  38 static reg_errcode_t
  39 internal_function
  40 re_string_allocate (re_string_t *pstr, const char *str, int len, int init_len,
  41                     RE_TRANSLATE_TYPE trans, int icase, const re_dfa_t *dfa)
  42 {
  43   reg_errcode_t ret;
  44   int init_buf_len;
  45
  46   /* Ensure at least one character fits into the buffers.  */
  47   if (init_len < dfa->mb_cur_max)
  48     init_len = dfa->mb_cur_max;
  49   init_buf_len = (len + 1 < init_len) ? len + 1: init_len;
  50   re_string_construct_common (str, len, pstr, trans, icase, dfa);
  51
  52   ret = re_string_realloc_buffers (pstr, init_buf_len);
  53   if (BE (ret != REG_NOERROR, 0))
  54     return ret;
  55
  56   pstr->word_char = dfa->word_char;
  57   pstr->word_ops_used = dfa->word_ops_used;
  58   pstr->mbs = pstr->mbs_allocated ? pstr->mbs : (unsigned char *) str;
  59   pstr->valid_len = (pstr->mbs_allocated || dfa->mb_cur_max > 1) ? 0 : len;
  60   pstr->valid_raw_len = pstr->valid_len;
  61   return REG_NOERROR;
  62 }
  63
  64 /* This function allocate the buffers, and initialize them.  */
  65
  66 static reg_errcode_t
  67 internal_function
  68 re_string_construct (re_string_t *pstr, const char *str, int len,
  69                      RE_TRANSLATE_TYPE trans, int icase, const re_dfa_t *dfa)
  70 {
  71   reg_errcode_t ret;
  72   memset (pstr, '\0', sizeof (re_string_t));
  73   re_string_construct_common (str, len, pstr, trans, icase, dfa);
  74
  75   if (len > 0)
  76     {
  77       ret = re_string_realloc_buffers (pstr, len + 1);
  78       if (BE (ret != REG_NOERROR, 0))
  79         return ret;
  80     }
  81   pstr->mbs = pstr->mbs_allocated ? pstr->mbs : (unsigned char *) str;
  82
  83   if (icase)
  84     {
  85 #ifdef RE_ENABLE_I18N
  86       if (dfa->mb_cur_max > 1)
  87         {
  88           while (1)
  89             {
  90               ret = build_wcs_upper_buffer (pstr);
  91               if (BE (ret != REG_NOERROR, 0))
  92                 return ret;
  93               if (pstr->valid_raw_len >= len)
  94                 break;
  95               if (pstr->bufs_len > pstr->valid_len + dfa->mb_cur_max)
  96                 break;
  97               ret = re_string_realloc_buffers (pstr, pstr->bufs_len * 2);
  98               if (BE (ret != REG_NOERROR, 0))
  99                 return ret;
 100             }
 101         }
 102       else
 103 #endif /* RE_ENABLE_I18N  */
 104         build_upper_buffer (pstr);
 105     }
 106   else
 107     {
 108 #ifdef RE_ENABLE_I18N
 109       if (dfa->mb_cur_max > 1)
 110         build_wcs_buffer (pstr);
 111       else
 112 #endif /* RE_ENABLE_I18N  */
 113         {
 114           if (trans != NULL)
 115             re_string_translate_buffer (pstr);
 116           else
 117             {
 118               pstr->valid_len = pstr->bufs_len;
 119               pstr->valid_raw_len = pstr->bufs_len;
 120             }
 121         }
 122     }
 123
 124   return REG_NOERROR;
 125 }
 126
 127 /* Helper functions for re_string_allocate, and re_string_construct.  */
 128
 129 static reg_errcode_t
 130 internal_function
 131 re_string_realloc_buffers (re_string_t *pstr, int new_buf_len)
 132 {
 133 #ifdef RE_ENABLE_I18N
 134   if (pstr->mb_cur_max > 1)
 135     {
 136       wint_t *new_wcs = re_realloc (pstr->wcs, wint_t, new_buf_len);
 137       if (BE (new_wcs == NULL, 0))
 138         return REG_ESPACE;
 139       pstr->wcs = new_wcs;
 140       if (pstr->offsets != NULL)
 141         {
 142           int *new_offsets = re_realloc (pstr->offsets, int, new_buf_len);
 143           if (BE (new_offsets == NULL, 0))
 144             return REG_ESPACE;
 145           pstr->offsets = new_offsets;
 146         }
 147     }
 148 #endif /* RE_ENABLE_I18N  */
 149   if (pstr->mbs_allocated)
 150     {
 151       unsigned char *new_mbs = re_realloc (pstr->mbs, unsigned char,
 152                                            new_buf_len);
 153       if (BE (new_mbs == NULL, 0))
 154         return REG_ESPACE;
 155       pstr->mbs = new_mbs;
 156     }
 157   pstr->bufs_len = new_buf_len;
 158   return REG_NOERROR;
 159 }
 160
 161
 162 static void
 163 internal_function
 164 re_string_construct_common (const char *str, int len, re_string_t *pstr,
 165                             RE_TRANSLATE_TYPE trans, int icase,
 166                             const re_dfa_t *dfa)
 167 {
 168   pstr->raw_mbs = (const unsigned char *) str;
 169   pstr->len = len;
 170   pstr->raw_len = len;
 171   pstr->trans = trans;
 172   pstr->icase = icase ? 1 : 0;
 173   pstr->mbs_allocated = (trans != NULL || icase);
 174   pstr->mb_cur_max = dfa->mb_cur_max;
 175   pstr->is_utf8 = dfa->is_utf8;
 176   pstr->map_notascii = dfa->map_notascii;
 177   pstr->stop = pstr->len;
 178   pstr->raw_stop = pstr->stop;
 179 }
 180
 181 #ifdef RE_ENABLE_I18N
 182
 183 /* Build wide character buffer PSTR->WCS.
 184    If the byte sequence of the string are:
 185      <mb1>(0), <mb1>(1), <mb2>(0), <mb2>(1), <sb3>
 186    Then wide character buffer will be:
 187      <wc1>   , WEOF    , <wc2>   , WEOF    , <wc3>
 188    We use WEOF for padding, they indicate that the position isn't
 189    a first byte of a multibyte character.
 190
 191    Note that this function assumes PSTR->VALID_LEN elements are already
 192    built and starts from PSTR->VALID_LEN.  */
 193
 194 static void
 195 internal_function
 196 build_wcs_buffer (re_string_t *pstr)
 197 {
 198 #ifdef _LIBC
 199   unsigned char buf[MB_LEN_MAX];
 200   assert (MB_LEN_MAX >= pstr->mb_cur_max);
 201 #else
 202   unsigned char buf[64];
 203 #endif
 204   mbstate_t prev_st;
 205   int byte_idx, end_idx, remain_len;
 206   size_t mbclen;
 207
 208   /* Build the buffers from pstr->valid_len to either pstr->len or
 209      pstr->bufs_len.  */
 210   end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
 211   for (byte_idx = pstr->valid_len; byte_idx < end_idx;)
 212     {
 213       wchar_t wc;
 214       const char *p;
 215
 216       remain_len = end_idx - byte_idx;
 217       prev_st = pstr->cur_state;
 218       /* Apply the translation if we need.  */
 219       if (BE (pstr->trans != NULL, 0))
 220         {
 221           int i, ch;
 222
 223           for (i = 0; i < pstr->mb_cur_max && i < remain_len; ++i)
 224             {
 225               ch = pstr->raw_mbs [pstr->raw_mbs_idx + byte_idx + i];
 226               buf[i] = pstr->mbs[byte_idx + i] = pstr->trans[ch];
 227             }
 228           p = (const char *) buf;
 229         }
 230       else
 231         p = (const char *) pstr->raw_mbs + pstr->raw_mbs_idx + byte_idx;
 232       mbclen = mbrtowc (&wc, p, remain_len, &pstr->cur_state);
 233       if (BE (mbclen == (size_t) -2, 0))
 234         {
 235           /* The buffer doesn't have enough space, finish to build.  */
 236           pstr->cur_state = prev_st;
 237           break;
 238         }
 239       else if (BE (mbclen == (size_t) -1 || mbclen == 0, 0))
 240         {
 241           /* We treat these cases as a singlebyte character.  */
 242           mbclen = 1;
 243           wc = (wchar_t) pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx];
 244           if (BE (pstr->trans != NULL, 0))
 245             wc = pstr->trans[wc];
 246           pstr->cur_state = prev_st;
 247         }
 248
 249       /* Write wide character and padding.  */
 250       pstr->wcs[byte_idx++] = wc;
 251       /* Write paddings.  */
 252       for (remain_len = byte_idx + mbclen - 1; byte_idx < remain_len ;)
 253         pstr->wcs[byte_idx++] = WEOF;
 254     }
 255   pstr->valid_len = byte_idx;
 256   pstr->valid_raw_len = byte_idx;
 257 }
 258
 259 /* Build wide character buffer PSTR->WCS like build_wcs_buffer,
 260    but for REG_ICASE.  */
 261
 262 static reg_errcode_t
 263 internal_function
 264 build_wcs_upper_buffer (re_string_t *pstr)
 265 {
 266   mbstate_t prev_st;
 267   int src_idx, byte_idx, end_idx, remain_len;
 268   size_t mbclen;
 269 #ifdef _LIBC
 270   char buf[MB_LEN_MAX];
 271   assert (MB_LEN_MAX >= pstr->mb_cur_max);
 272 #else
 273   char buf[64];
 274 #endif
 275
 276   byte_idx = pstr->valid_len;
 277   end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
 278
 279   /* The following optimization assumes that ASCII characters can be
 280      mapped to wide characters with a simple cast.  */
 281   if (! pstr->map_notascii && pstr->trans == NULL && !pstr->offsets_needed)
 282     {
 283       while (byte_idx < end_idx)
 284         {
 285           wchar_t wc;
 286
 287           if (isascii (pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx])
 288               && mbsinit (&pstr->cur_state))
 289             {
 290               /* In case of a singlebyte character.  */
 291               pstr->mbs[byte_idx]
 292                 = toupper (pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx]);
 293               /* The next step uses the assumption that wchar_t is encoded
 294                  ASCII-safe: all ASCII values can be converted like this.  */
 295               pstr->wcs[byte_idx] = (wchar_t) pstr->mbs[byte_idx];
 296               ++byte_idx;
 297               continue;
 298             }
 299
 300           remain_len = end_idx - byte_idx;
 301           prev_st = pstr->cur_state;
 302           mbclen = mbrtowc (&wc,
 303                             ((const char *) pstr->raw_mbs + pstr->raw_mbs_idx
 304                              + byte_idx), remain_len, &pstr->cur_state);
 305           if (BE (mbclen + 2 > 2, 1))
 306             {
 307               wchar_t wcu = wc;
 308               if (iswlower (wc))
 309                 {
 310                   size_t mbcdlen;
 311
 312                   wcu = towupper (wc);
 313                   mbcdlen = wcrtomb (buf, wcu, &prev_st);
 314                   if (BE (mbclen == mbcdlen, 1))
 315                     memcpy (pstr->mbs + byte_idx, buf, mbclen);
 316                   else
 317                     {
 318                       src_idx = byte_idx;
 319                       goto offsets_needed;
 320                     }
 321                 }
 322               else
 323                 memcpy (pstr->mbs + byte_idx,
 324                         pstr->raw_mbs + pstr->raw_mbs_idx + byte_idx, mbclen);
 325               pstr->wcs[byte_idx++] = wcu;
 326               /* Write paddings.  */
 327               for (remain_len = byte_idx + mbclen - 1; byte_idx < remain_len ;)
 328                 pstr->wcs[byte_idx++] = WEOF;
 329             }
 330           else if (mbclen == (size_t) -1 || mbclen == 0)
 331             {
 332               /* It is an invalid character or '\0'.  Just use the byte.  */
 333               int ch = pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx];
 334               pstr->mbs[byte_idx] = ch;
 335               /* And also cast it to wide char.  */
 336               pstr->wcs[byte_idx++] = (wchar_t) ch;
 337               if (BE (mbclen == (size_t) -1, 0))
 338                 pstr->cur_state = prev_st;
 339             }
 340           else
 341             {
 342               /* The buffer doesn't have enough space, finish to build.  */
 343               pstr->cur_state = prev_st;
 344               break;
 345             }
 346         }
 347       pstr->valid_len = byte_idx;
 348       pstr->valid_raw_len = byte_idx;
 349       return REG_NOERROR;
 350     }
 351   else
 352     for (src_idx = pstr->valid_raw_len; byte_idx < end_idx;)
 353       {
 354         wchar_t wc;
 355         const char *p;
 356       offsets_needed:
 357         remain_len = end_idx - byte_idx;
 358         prev_st = pstr->cur_state;
 359         if (BE (pstr->trans != NULL, 0))
 360           {
 361             int i, ch;
 362
 363             for (i = 0; i < pstr->mb_cur_max && i < remain_len; ++i)
 364               {
 365                 ch = pstr->raw_mbs [pstr->raw_mbs_idx + src_idx + i];
 366                 buf[i] = pstr->trans[ch];
 367               }
 368             p = (const char *) buf;
 369           }
 370         else
 371           p = (const char *) pstr->raw_mbs + pstr->raw_mbs_idx + src_idx;
 372         mbclen = mbrtowc (&wc, p, remain_len, &pstr->cur_state);
 373         if (BE (mbclen + 2 > 2, 1))
 374           {
 375             wchar_t wcu = wc;
 376             if (iswlower (wc))
 377               {
 378                 size_t mbcdlen;
 379
 380                 wcu = towupper (wc);
 381                 mbcdlen = wcrtomb ((char *) buf, wcu, &prev_st);
 382                 if (BE (mbclen == mbcdlen, 1))
 383                   memcpy (pstr->mbs + byte_idx, buf, mbclen);
 384                 else if (mbcdlen != (size_t) -1)
 385                   {
 386                     size_t i;
 387
 388                     if (byte_idx + mbcdlen > pstr->bufs_len)
 389                       {
 390                         pstr->cur_state = prev_st;
 391                         break;
 392                       }
 393
 394                     if (pstr->offsets == NULL)
 395                       {
 396                         pstr->offsets = re_malloc (int, pstr->bufs_len);
 397
 398                         if (pstr->offsets == NULL)
 399                           return REG_ESPACE;
 400                       }
 401                     if (!pstr->offsets_needed)
 402                       {
 403                         for (i = 0; i < (size_t) byte_idx; ++i)
 404                           pstr->offsets[i] = i;
 405                         pstr->offsets_needed = 1;
 406                       }
 407
 408                     memcpy (pstr->mbs + byte_idx, buf, mbcdlen);
 409                     pstr->wcs[byte_idx] = wcu;
 410                     pstr->offsets[byte_idx] = src_idx;
 411                     for (i = 1; i < mbcdlen; ++i)
 412                       {
 413                         pstr->offsets[byte_idx + i]
 414                           = src_idx + (i < mbclen ? i : mbclen - 1);
 415                         pstr->wcs[byte_idx + i] = WEOF;
 416                       }
 417                     pstr->len += mbcdlen - mbclen;
 418                     if (pstr->raw_stop > src_idx)
 419                       pstr->stop += mbcdlen - mbclen;
 420                     end_idx = (pstr->bufs_len > pstr->len)
 421                               ? pstr->len : pstr->bufs_len;
 422                     byte_idx += mbcdlen;
 423                     src_idx += mbclen;
 424                     continue;
 425                   }
 426                 else
 427                   memcpy (pstr->mbs + byte_idx, p, mbclen);
 428               }
 429             else
 430               memcpy (pstr->mbs + byte_idx, p, mbclen);
 431
 432             if (BE (pstr->offsets_needed != 0, 0))
 433               {
 434                 size_t i;
 435                 for (i = 0; i < mbclen; ++i)
 436                   pstr->offsets[byte_idx + i] = src_idx + i;
 437               }
 438             src_idx += mbclen;
 439
 440             pstr->wcs[byte_idx++] = wcu;
 441             /* Write paddings.  */
 442             for (remain_len = byte_idx + mbclen - 1; byte_idx < remain_len ;)
 443               pstr->wcs[byte_idx++] = WEOF;
 444           }
 445         else if (mbclen == (size_t) -1 || mbclen == 0)
 446           {
 447             /* It is an invalid character or '\0'.  Just use the byte.  */
 448             int ch = pstr->raw_mbs[pstr->raw_mbs_idx + src_idx];
 449
 450             if (BE (pstr->trans != NULL, 0))
 451               ch = pstr->trans [ch];
 452             pstr->mbs[byte_idx] = ch;
 453
 454             if (BE (pstr->offsets_needed != 0, 0))
 455               pstr->offsets[byte_idx] = src_idx;
 456             ++src_idx;
 457
 458             /* And also cast it to wide char.  */
 459             pstr->wcs[byte_idx++] = (wchar_t) ch;
 460             if (BE (mbclen == (size_t) -1, 0))
 461               pstr->cur_state = prev_st;
 462           }
 463         else
 464           {
 465             /* The buffer doesn't have enough space, finish to build.  */
 466             pstr->cur_state = prev_st;
 467             break;
 468           }
 469       }
 470   pstr->valid_len = byte_idx;
 471   pstr->valid_raw_len = src_idx;
 472   return REG_NOERROR;
 473 }
 474
 475 /* Skip characters until the index becomes greater than NEW_RAW_IDX.
 476    Return the index.  */
 477
 478 static int
 479 internal_function
 480 re_string_skip_chars (re_string_t *pstr, int new_raw_idx, wint_t *last_wc)
 481 {
 482   mbstate_t prev_st;
 483   int rawbuf_idx;
 484   size_t mbclen;
 485   wchar_t wc = WEOF;
 486
 487   /* Skip the characters which are not necessary to check.  */
 488   for (rawbuf_idx = pstr->raw_mbs_idx + pstr->valid_raw_len;
 489        rawbuf_idx < new_raw_idx;)
 490     {
 491       int remain_len;
 492       remain_len = pstr->len - rawbuf_idx;
 493       prev_st = pstr->cur_state;
 494       mbclen = mbrtowc (&wc, (const char *) pstr->raw_mbs + rawbuf_idx,
 495                         remain_len, &pstr->cur_state);
 496       if (BE (mbclen == (size_t) -2 || mbclen == (size_t) -1 || mbclen == 0, 0))
 497         {
 498           /* We treat these cases as a single byte character.  */
 499           if (mbclen == 0 || remain_len == 0)
 500             wc = L'\0';
 501           else
 502             wc = *(unsigned char *) (pstr->raw_mbs + rawbuf_idx);
 503           mbclen = 1;
 504           pstr->cur_state = prev_st;
 505         }
 506       /* Then proceed the next character.  */
 507       rawbuf_idx += mbclen;
 508     }
 509   *last_wc = (wint_t) wc;
 510   return rawbuf_idx;
 511 }
 512 #endif /* RE_ENABLE_I18N  */
 513
 514 /* Build the buffer PSTR->MBS, and apply the translation if we need.
 515    This function is used in case of REG_ICASE.  */
 516
 517 static void
 518 internal_function
 519 build_upper_buffer (re_string_t *pstr)
 520 {
 521   int char_idx, end_idx;
 522   end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
 523
 524   for (char_idx = pstr->valid_len; char_idx < end_idx; ++char_idx)
 525     {
 526       int ch = pstr->raw_mbs[pstr->raw_mbs_idx + char_idx];
 527       if (BE (pstr->trans != NULL, 0))
 528         ch = pstr->trans[ch];
 529       if (islower (ch))
 530         pstr->mbs[char_idx] = toupper (ch);
 531       else
 532         pstr->mbs[char_idx] = ch;
 533     }
 534   pstr->valid_len = char_idx;
 535   pstr->valid_raw_len = char_idx;
 536 }
 537
 538 /* Apply TRANS to the buffer in PSTR.  */
 539
 540 static void
 541 internal_function
 542 re_string_translate_buffer (re_string_t *pstr)
 543 {
 544   int buf_idx, end_idx;
 545   end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
 546
 547   for (buf_idx = pstr->valid_len; buf_idx < end_idx; ++buf_idx)
 548     {
 549       int ch = pstr->raw_mbs[pstr->raw_mbs_idx + buf_idx];
 550       pstr->mbs[buf_idx] = pstr->trans[ch];
 551     }
 552
 553   pstr->valid_len = buf_idx;
 554   pstr->valid_raw_len = buf_idx;
 555 }
 556
 557 /* This function re-construct the buffers.
 558    Concretely, convert to wide character in case of pstr->mb_cur_max > 1,
 559    convert to upper case in case of REG_ICASE, apply translation.  */
 560
 561 static reg_errcode_t
 562 internal_function
 563 re_string_reconstruct (re_string_t *pstr, int idx, int eflags)
 564 {
 565   int offset = idx - pstr->raw_mbs_idx;
 566   if (BE (offset < 0, 0))
 567     {
 568       /* Reset buffer.  */
 569 #ifdef RE_ENABLE_I18N
 570       if (pstr->mb_cur_max > 1)
 571         memset (&pstr->cur_state, '\0', sizeof (mbstate_t));
 572 #endif /* RE_ENABLE_I18N */
 573       pstr->len = pstr->raw_len;
 574       pstr->stop = pstr->raw_stop;
 575       pstr->valid_len = 0;
 576       pstr->raw_mbs_idx = 0;
 577       pstr->valid_raw_len = 0;
 578       pstr->offsets_needed = 0;
 579       pstr->tip_context = ((eflags & REG_NOTBOL) ? CONTEXT_BEGBUF
 580                            : CONTEXT_NEWLINE | CONTEXT_BEGBUF);
 581       if (!pstr->mbs_allocated)
 582         pstr->mbs = (unsigned char *) pstr->raw_mbs;
 583       offset = idx;
 584     }
 585
 586   if (BE (offset != 0, 1))
 587     {
 588       /* Are the characters which are already checked remain?  */
 589       if (BE (offset < pstr->valid_raw_len, 1)
 590 #ifdef RE_ENABLE_I18N
 591           /* Handling this would enlarge the code too much.
 592              Accept a slowdown in that case.  */
 593           && pstr->offsets_needed == 0
 594 #endif
 595          )
 596         {
 597           /* Yes, move them to the front of the buffer.  */
 598           pstr->tip_context = re_string_context_at (pstr, offset - 1, eflags);
 599 #ifdef RE_ENABLE_I18N
 600           if (pstr->mb_cur_max > 1)
 601             memmove (pstr->wcs, pstr->wcs + offset,
 602                      (pstr->valid_len - offset) * sizeof (wint_t));
 603 #endif /* RE_ENABLE_I18N */
 604           if (BE (pstr->mbs_allocated, 0))
 605             memmove (pstr->mbs, pstr->mbs + offset,
 606                      pstr->valid_len - offset);
 607           pstr->valid_len -= offset;
 608           pstr->valid_raw_len -= offset;
 609 #if DEBUG
 610           assert (pstr->valid_len > 0);
 611 #endif
 612         }
 613       else
 614         {
 615           /* No, skip all characters until IDX.  */
 616 #ifdef RE_ENABLE_I18N
 617           if (BE (pstr->offsets_needed, 0))
 618             {
 619               pstr->len = pstr->raw_len - idx + offset;
 620               pstr->stop = pstr->raw_stop - idx + offset;
 621               pstr->offsets_needed = 0;
 622             }
 623 #endif
 624           pstr->valid_len = 0;
 625 #ifdef RE_ENABLE_I18N
 626           if (pstr->mb_cur_max > 1)
 627             {
 628               int wcs_idx;
 629               wint_t wc = WEOF;
 630
 631               if (pstr->is_utf8)
 632                 {
 633                   const unsigned char *raw, *p, *q, *end;
 634
 635                   /* Special case UTF-8.  Multi-byte chars start with any
 636                      byte other than 0x80 - 0xbf.  */
 637                   raw = pstr->raw_mbs + pstr->raw_mbs_idx;
 638                   end = raw + (offset - pstr->mb_cur_max);
 639                   p = raw + offset - 1;
 640 #ifdef _LIBC
 641                   /* We know the wchar_t encoding is UCS4, so for the simple
 642                      case, ASCII characters, skip the conversion step.  */
 643                   if (isascii (*p) && BE (pstr->trans == NULL, 1))
 644                     {
 645                       memset (&pstr->cur_state, '\0', sizeof (mbstate_t));
 646                       pstr->valid_len = 0;
 647                       wc = (wchar_t) *p;
 648                     }
 649                   else
 650 #endif
 651                     for (; p >= end; --p)
 652                       if ((*p & 0xc0) != 0x80)
 653                         {
 654                           mbstate_t cur_state;
 655                           wchar_t wc2;
 656                           int mlen = raw + pstr->len - p;
 657                           unsigned char buf[6];
 658                           size_t mbclen;
 659
 660                           q = p;
 661                           if (BE (pstr->trans != NULL, 0))
 662                             {
 663                               int i = mlen < 6 ? mlen : 6;
 664                               while (--i >= 0)
 665                                 buf[i] = pstr->trans[p[i]];
 666                               q = buf;
 667                             }
 668                           /* XXX Don't use mbrtowc, we know which conversion
 669                              to use (UTF-8 -> UCS4).  */
 670                           memset (&cur_state, 0, sizeof (cur_state));
 671                           mbclen = mbrtowc (&wc2, (const char *) p, mlen,
 672                                             &cur_state);
 673                           if (raw + offset - p <= mbclen
 674                               && mbclen < (size_t) -2)
 675                             {
 676                               memset (&pstr->cur_state, '\0',
 677                                       sizeof (mbstate_t));
 678                               pstr->valid_len = mbclen - (raw + offset - p);
 679                               wc = wc2;
 680                             }
 681                           break;
 682                         }
 683                 }
 684
 685               if (wc == WEOF)
 686                 pstr->valid_len = re_string_skip_chars (pstr, idx, &wc) - idx;
 687               if (wc == WEOF)
 688                 pstr->tip_context
 689                   = re_string_context_at (pstr, pstr->valid_raw_len - 1, eflags);
 690               else
 691                 pstr->tip_context = ((BE (pstr->word_ops_used != 0, 0)
 692                                       && IS_WIDE_WORD_CHAR (wc))
 693                                      ? CONTEXT_WORD
 694                                      : ((IS_WIDE_NEWLINE (wc)
 695                                          && pstr->newline_anchor)
 696                                         ? CONTEXT_NEWLINE : 0));
 697               if (BE (pstr->valid_len, 0))
 698                 {
 699                   for (wcs_idx = 0; wcs_idx < pstr->valid_len; ++wcs_idx)
 700                     pstr->wcs[wcs_idx] = WEOF;
 701                   if (pstr->mbs_allocated)
 702                     memset (pstr->mbs, 255, pstr->valid_len);
 703                 }
 704               pstr->valid_raw_len = pstr->valid_len;
 705             }
 706           else
 707 #endif /* RE_ENABLE_I18N */
 708             {
 709               int c = pstr->raw_mbs[pstr->raw_mbs_idx + offset - 1];
 710               pstr->valid_raw_len = 0;
 711               if (pstr->trans)
 712                 c = pstr->trans[c];
 713               pstr->tip_context = (bitset_contain (pstr->word_char, c)
 714                                    ? CONTEXT_WORD
 715                                    : ((IS_NEWLINE (c) && pstr->newline_anchor)
 716                                       ? CONTEXT_NEWLINE : 0));
 717             }
 718         }
 719       if (!BE (pstr->mbs_allocated, 0))
 720         pstr->mbs += offset;
 721     }
 722   pstr->raw_mbs_idx = idx;
 723   pstr->len -= offset;
 724   pstr->stop -= offset;
 725
 726   /* Then build the buffers.  */
 727 #ifdef RE_ENABLE_I18N
 728   if (pstr->mb_cur_max > 1)
 729     {
 730       if (pstr->icase)
 731         {
 732           reg_errcode_t ret = build_wcs_upper_buffer (pstr);
 733           if (BE (ret != REG_NOERROR, 0))
 734             return ret;
 735         }
 736       else
 737         build_wcs_buffer (pstr);
 738     }
 739   else
 740 #endif /* RE_ENABLE_I18N */
 741     if (BE (pstr->mbs_allocated, 0))
 742       {
 743         if (pstr->icase)
 744           build_upper_buffer (pstr);
 745         else if (pstr->trans != NULL)
 746           re_string_translate_buffer (pstr);
 747       }
 748     else
 749       pstr->valid_len = pstr->len;
 750
 751   pstr->cur_idx = 0;
 752   return REG_NOERROR;
 753 }
 754
 755 static unsigned char
 756 internal_function __attribute ((pure))
 757 re_string_peek_byte_case (const re_string_t *pstr, int idx)
 758 {
 759   int ch, off;
 760
 761   /* Handle the common (easiest) cases first.  */
 762   if (BE (!pstr->mbs_allocated, 1))
 763     return re_string_peek_byte (pstr, idx);
 764
 765 #ifdef RE_ENABLE_I18N
 766   if (pstr->mb_cur_max > 1
 767       && ! re_string_is_single_byte_char (pstr, pstr->cur_idx + idx))
 768     return re_string_peek_byte (pstr, idx);
 769 #endif
 770
 771   off = pstr->cur_idx + idx;
 772 #ifdef RE_ENABLE_I18N
 773   if (pstr->offsets_needed)
 774     off = pstr->offsets[off];
 775 #endif
 776
 777   ch = pstr->raw_mbs[pstr->raw_mbs_idx + off];
 778
 779 #ifdef RE_ENABLE_I18N
 780   /* Ensure that e.g. for tr_TR.UTF-8 BACKSLASH DOTLESS SMALL LETTER I
 781      this function returns CAPITAL LETTER I instead of first byte of
 782      DOTLESS SMALL LETTER I.  The latter would confuse the parser,
 783      since peek_byte_case doesn't advance cur_idx in any way.  */
 784   if (pstr->offsets_needed && !isascii (ch))
 785     return re_string_peek_byte (pstr, idx);
 786 #endif
 787
 788   return ch;
 789 }
 790
 791 static unsigned char
 792 internal_function __attribute ((pure))
 793 re_string_fetch_byte_case (re_string_t *pstr)
 794 {
 795   if (BE (!pstr->mbs_allocated, 1))
 796     return re_string_fetch_byte (pstr);
 797
 798 #ifdef RE_ENABLE_I18N
 799   if (pstr->offsets_needed)
 800     {
 801       int off, ch;
 802
 803       /* For tr_TR.UTF-8 [[:islower:]] there is
 804          [[: CAPITAL LETTER I WITH DOT lower:]] in mbs.  Skip
 805          in that case the whole multi-byte character and return
 806          the original letter.  On the other side, with
 807          [[: DOTLESS SMALL LETTER I return [[:I, as doing
 808          anything else would complicate things too much.  */
 809
 810       if (!re_string_first_byte (pstr, pstr->cur_idx))
 811         return re_string_fetch_byte (pstr);
 812
 813       off = pstr->offsets[pstr->cur_idx];
 814       ch = pstr->raw_mbs[pstr->raw_mbs_idx + off];
 815
 816       if (! isascii (ch))
 817         return re_string_fetch_byte (pstr);
 818
 819       re_string_skip_bytes (pstr,
 820                             re_string_char_size_at (pstr, pstr->cur_idx));
 821       return ch;
 822     }
 823 #endif
 824
 825   return pstr->raw_mbs[pstr->raw_mbs_idx + pstr->cur_idx++];
 826 }
 827
 828 static void
 829 internal_function
 830 re_string_destruct (re_string_t *pstr)
 831 {
 832 #ifdef RE_ENABLE_I18N
 833   re_free (pstr->wcs);
 834   re_free (pstr->offsets);
 835 #endif /* RE_ENABLE_I18N  */
 836   if (pstr->mbs_allocated)
 837     re_free (pstr->mbs);
 838 }
 839
 840 /* Return the context at IDX in INPUT.  */
 841
 842 static unsigned int
 843 internal_function
 844 re_string_context_at (const re_string_t *input, int idx, int eflags)
 845 {
 846   int c;
 847   if (BE (idx < 0, 0))
 848     /* In this case, we use the value stored in input->tip_context,
 849        since we can't know the character in input->mbs[-1] here.  */
 850     return input->tip_context;
 851   if (BE (idx == input->len, 0))
 852     return ((eflags & REG_NOTEOL) ? CONTEXT_ENDBUF
 853             : CONTEXT_NEWLINE | CONTEXT_ENDBUF);
 854 #ifdef RE_ENABLE_I18N
 855   if (input->mb_cur_max > 1)
 856     {
 857       wint_t wc;
 858       int wc_idx = idx;
 859       while(input->wcs[wc_idx] == WEOF)
 860         {
 861 #ifdef DEBUG
 862           /* It must not happen.  */
 863           assert (wc_idx >= 0);
 864 #endif
 865           --wc_idx;
 866           if (wc_idx < 0)
 867             return input->tip_context;
 868         }
 869       wc = input->wcs[wc_idx];
 870       if (BE (input->word_ops_used != 0, 0) && IS_WIDE_WORD_CHAR (wc))
 871         return CONTEXT_WORD;
 872       return (IS_WIDE_NEWLINE (wc) && input->newline_anchor
 873               ? CONTEXT_NEWLINE : 0);
 874     }
 875   else
 876 #endif
 877     {
 878       c = re_string_byte_at (input, idx);
 879       if (bitset_contain (input->word_char, c))
 880         return CONTEXT_WORD;
 881       return IS_NEWLINE (c) && input->newline_anchor ? CONTEXT_NEWLINE : 0;
 882     }
 883 }
 884 \f
 885 /* Functions for set operation.  */
 886
 887 static reg_errcode_t
 888 internal_function
 889 re_node_set_alloc (re_node_set *set, int size)
 890 {
 891   set->alloc = size;
 892   set->nelem = 0;
 893   set->elems = re_malloc (int, size);
 894   if (BE (set->elems == NULL, 0))
 895     return REG_ESPACE;
 896   return REG_NOERROR;
 897 }
 898
 899 static reg_errcode_t
 900 internal_function
 901 re_node_set_init_1 (re_node_set *set, int elem)
 902 {
 903   set->alloc = 1;
 904   set->nelem = 1;
 905   set->elems = re_malloc (int, 1);
 906   if (BE (set->elems == NULL, 0))
 907     {
 908       set->alloc = set->nelem = 0;
 909       return REG_ESPACE;
 910     }
 911   set->elems[0] = elem;
 912   return REG_NOERROR;
 913 }
 914
 915 static reg_errcode_t
 916 internal_function
 917 re_node_set_init_2 (re_node_set *set, int elem1, int elem2)
 918 {
 919   set->alloc = 2;
 920   set->elems = re_malloc (int, 2);
 921   if (BE (set->elems == NULL, 0))
 922     return REG_ESPACE;
 923   if (elem1 == elem2)
 924     {
 925       set->nelem = 1;
 926       set->elems[0] = elem1;
 927     }
 928   else
 929     {
 930       set->nelem = 2;
 931       if (elem1 < elem2)
 932         {
 933           set->elems[0] = elem1;
 934           set->elems[1] = elem2;
 935         }
 936       else
 937         {
 938           set->elems[0] = elem2;
 939           set->elems[1] = elem1;
 940         }
 941     }
 942   return REG_NOERROR;
 943 }
 944
 945 static reg_errcode_t
 946 internal_function
 947 re_node_set_init_copy (re_node_set *dest, const re_node_set *src)
 948 {
 949   dest->nelem = src->nelem;
 950   if (src->nelem > 0)
 951     {
 952       dest->alloc = dest->nelem;
 953       dest->elems = re_malloc (int, dest->alloc);
 954       if (BE (dest->elems == NULL, 0))
 955         {
 956           dest->alloc = dest->nelem = 0;
 957           return REG_ESPACE;
 958         }
 959       memcpy (dest->elems, src->elems, src->nelem * sizeof (int));
 960     }
 961   else
 962     re_node_set_init_empty (dest);
 963   return REG_NOERROR;
 964 }
 965
 966 /* Calculate the intersection of the sets SRC1 and SRC2. And merge it to
 967    DEST. Return value indicate the error code or REG_NOERROR if succeeded.
 968    Note: We assume dest->elems is NULL, when dest->alloc is 0.  */
 969
 970 static reg_errcode_t
 971 internal_function
 972 re_node_set_add_intersect (re_node_set *dest, const re_node_set *src1,
 973                            const re_node_set *src2)
 974 {
 975   int i1, i2, is, id, delta, sbase;
 976   if (src1->nelem == 0 || src2->nelem == 0)
 977     return REG_NOERROR;
 978
 979   /* We need dest->nelem + 2 * elems_in_intersection; this is a
 980      conservative estimate.  */
 981   if (src1->nelem + src2->nelem + dest->nelem > dest->alloc)
 982     {
 983       int new_alloc = src1->nelem + src2->nelem + dest->alloc;
 984       int *new_elems = re_realloc (dest->elems, int, new_alloc);
 985       if (BE (new_elems == NULL, 0))
 986         return REG_ESPACE;
 987       dest->elems = new_elems;
 988       dest->alloc = new_alloc;
 989     }
 990
 991   /* Find the items in the intersection of SRC1 and SRC2, and copy
 992      into the top of DEST those that are not already in DEST itself.  */
 993   sbase = dest->nelem + src1->nelem + src2->nelem;
 994   i1 = src1->nelem - 1;
 995   i2 = src2->nelem - 1;
 996   id = dest->nelem - 1;
 997   for (;;)
 998     {
 999       if (src1->elems[i1] == src2->elems[i2])
1000         {
1001           /* Try to find the item in DEST.  Maybe we could binary search?  */
1002           while (id >= 0 && dest->elems[id] > src1->elems[i1])
1003             --id;
1004
1005           if (id < 0 || dest->elems[id] != src1->elems[i1])
1006             dest->elems[--sbase] = src1->elems[i1];
1007
1008           if (--i1 < 0 || --i2 < 0)
1009             break;
1010         }
1011
1012       /* Lower the highest of the two items.  */
1013       else if (src1->elems[i1] < src2->elems[i2])
1014         {
1015           if (--i2 < 0)
1016             break;
1017         }
1018       else
1019         {
1020           if (--i1 < 0)
1021             break;
1022         }
1023     }
1024
1025   id = dest->nelem - 1;
1026   is = dest->nelem + src1->nelem + src2->nelem - 1;
1027   delta = is - sbase + 1;
1028
1029   /* Now copy.  When DELTA becomes zero, the remaining
1030      DEST elements are already in place; this is more or
1031      less the same loop that is in re_node_set_merge.  */
1032   dest->nelem += delta;
1033   if (delta > 0 && id >= 0)
1034     for (;;)
1035       {
1036         if (dest->elems[is] > dest->elems[id])
1037           {
1038             /* Copy from the top.  */
1039             dest->elems[id + delta--] = dest->elems[is--];
1040             if (delta == 0)
1041               break;
1042           }
1043         else
1044           {
1045             /* Slide from the bottom.  */
1046             dest->elems[id + delta] = dest->elems[id];
1047             if (--id < 0)
1048               break;
1049           }
1050       }
1051
1052   /* Copy remaining SRC elements.  */
1053   memcpy (dest->elems, dest->elems + sbase, delta * sizeof (int));
1054
1055   return REG_NOERROR;
1056 }
1057
1058 /* Calculate the union set of the sets SRC1 and SRC2. And store it to
1059    DEST. Return value indicate the error code or REG_NOERROR if succeeded.  */
1060
1061 static reg_errcode_t
1062 internal_function
1063 re_node_set_init_union (re_node_set *dest, const re_node_set *src1,
1064                         const re_node_set *src2)
1065 {
1066   int i1, i2, id;
1067   if (src1 != NULL && src1->nelem > 0 && src2 != NULL && src2->nelem > 0)
1068     {
1069       dest->alloc = src1->nelem + src2->nelem;
1070       dest->elems = re_malloc (int, dest->alloc);
1071       if (BE (dest->elems == NULL, 0))
1072         return REG_ESPACE;
1073     }
1074   else
1075     {
1076       if (src1 != NULL && src1->nelem > 0)
1077         return re_node_set_init_copy (dest, src1);
1078       else if (src2 != NULL && src2->nelem > 0)
1079         return re_node_set_init_copy (dest, src2);
1080       else
1081         re_node_set_init_empty (dest);
1082       return REG_NOERROR;
1083     }
1084   for (i1 = i2 = id = 0 ; i1 < src1->nelem && i2 < src2->nelem ;)
1085     {
1086       if (src1->elems[i1] > src2->elems[i2])
1087         {
1088           dest->elems[id++] = src2->elems[i2++];
1089           continue;
1090         }
1091       if (src1->elems[i1] == src2->elems[i2])
1092         ++i2;
1093       dest->elems[id++] = src1->elems[i1++];
1094     }
1095   if (i1 < src1->nelem)
1096     {
1097       memcpy (dest->elems + id, src1->elems + i1,
1098              (src1->nelem - i1) * sizeof (int));
1099       id += src1->nelem - i1;
1100     }
1101   else if (i2 < src2->nelem)
1102     {
1103       memcpy (dest->elems + id, src2->elems + i2,
1104              (src2->nelem - i2) * sizeof (int));
1105       id += src2->nelem - i2;
1106     }
1107   dest->nelem = id;
1108   return REG_NOERROR;
1109 }
1110
1111 /* Calculate the union set of the sets DEST and SRC. And store it to
1112    DEST. Return value indicate the error code or REG_NOERROR if succeeded.  */
1113
1114 static reg_errcode_t
1115 internal_function
1116 re_node_set_merge (re_node_set *dest, const re_node_set *src)
1117 {
1118   int is, id, sbase, delta;
1119   if (src == NULL || src->nelem == 0)
1120     return REG_NOERROR;
1121   if (dest->alloc < 2 * src->nelem + dest->nelem)
1122     {
1123       int new_alloc = 2 * (src->nelem + dest->alloc);
1124       int *new_buffer = re_realloc (dest->elems, int, new_alloc);
1125       if (BE (new_buffer == NULL, 0))
1126         return REG_ESPACE;
1127       dest->elems = new_buffer;
1128       dest->alloc = new_alloc;
1129     }
1130
1131   if (BE (dest->nelem == 0, 0))
1132     {
1133       dest->nelem = src->nelem;
1134       memcpy (dest->elems, src->elems, src->nelem * sizeof (int));
1135       return REG_NOERROR;
1136     }
1137
1138   /* Copy into the top of DEST the items of SRC that are not
1139      found in DEST.  Maybe we could binary search in DEST?  */
1140   for (sbase = dest->nelem + 2 * src->nelem,
1141        is = src->nelem - 1, id = dest->nelem - 1; is >= 0 && id >= 0; )
1142     {
1143       if (dest->elems[id] == src->elems[is])
1144         is--, id--;
1145       else if (dest->elems[id] < src->elems[is])
1146         dest->elems[--sbase] = src->elems[is--];
1147       else /* if (dest->elems[id] > src->elems[is]) */
1148         --id;
1149     }
1150
1151   if (is >= 0)
1152     {
1153       /* If DEST is exhausted, the remaining items of SRC must be unique.  */
1154       sbase -= is + 1;
1155       memcpy (dest->elems + sbase, src->elems, (is + 1) * sizeof (int));
1156     }
1157
1158   id = dest->nelem - 1;
1159   is = dest->nelem + 2 * src->nelem - 1;
1160   delta = is - sbase + 1;
1161   if (delta == 0)
1162     return REG_NOERROR;
1163
1164   /* Now copy.  When DELTA becomes zero, the remaining
1165      DEST elements are already in place.  */
1166   dest->nelem += delta;
1167   for (;;)
1168     {
1169       if (dest->elems[is] > dest->elems[id])
1170         {
1171           /* Copy from the top.  */
1172           dest->elems[id + delta--] = dest->elems[is--];
1173           if (delta == 0)
1174             break;
1175         }
1176       else
1177         {
1178           /* Slide from the bottom.  */
1179           dest->elems[id + delta] = dest->elems[id];
1180           if (--id < 0)
1181             {
1182               /* Copy remaining SRC elements.  */
1183               memcpy (dest->elems, dest->elems + sbase,
1184                       delta * sizeof (int));
1185               break;
1186             }
1187         }
1188     }
1189
1190   return REG_NOERROR;
1191 }
1192
1193 /* Insert the new element ELEM to the re_node_set* SET.
1194    SET should not already have ELEM.
1195    return -1 if an error is occured, return 1 otherwise.  */
1196
1197 static int
1198 internal_function
1199 re_node_set_insert (re_node_set *set, int elem)
1200 {
1201   int idx;
1202   /* In case the set is empty.  */
1203   if (set->alloc == 0)
1204     {
1205       if (BE (re_node_set_init_1 (set, elem) == REG_NOERROR, 1))
1206         return 1;
1207       else
1208         return -1;
1209     }
1210
1211   if (BE (set->nelem, 0) == 0)
1212     {
1213       /* We already guaranteed above that set->alloc != 0.  */
1214       set->elems[0] = elem;
1215       ++set->nelem;
1216       return 1;
1217     }
1218
1219   /* Realloc if we need.  */
1220   if (set->alloc == set->nelem)
1221     {
1222       int *new_elems;
1223       set->alloc = set->alloc * 2;
1224       new_elems = re_realloc (set->elems, int, set->alloc);
1225       if (BE (new_elems == NULL, 0))
1226         return -1;
1227       set->elems = new_elems;
1228     }
1229
1230   /* Move the elements which follows the new element.  Test the
1231      first element separately to skip a check in the inner loop.  */
1232   if (elem < set->elems[0])
1233     {
1234       idx = 0;
1235       for (idx = set->nelem; idx > 0; idx--)
1236         set->elems[idx] = set->elems[idx - 1];
1237     }
1238   else
1239     {
1240       for (idx = set->nelem; set->elems[idx - 1] > elem; idx--)
1241         set->elems[idx] = set->elems[idx - 1];
1242     }
1243
1244   /* Insert the new element.  */
1245   set->elems[idx] = elem;
1246   ++set->nelem;
1247   return 1;
1248 }
1249
1250 /* Insert the new element ELEM to the re_node_set* SET.
1251    SET should not already have any element greater than or equal to ELEM.
1252    Return -1 if an error is occured, return 1 otherwise.  */
1253
1254 static int
1255 internal_function
1256 re_node_set_insert_last (re_node_set *set, int elem)
1257 {
1258   /* Realloc if we need.  */
1259   if (set->alloc == set->nelem)
1260     {
1261       int *new_elems;
1262       set->alloc = (set->alloc + 1) * 2;
1263       new_elems = re_realloc (set->elems, int, set->alloc);
1264       if (BE (new_elems == NULL, 0))
1265         return -1;
1266       set->elems = new_elems;
1267     }
1268
1269   /* Insert the new element.  */
1270   set->elems[set->nelem++] = elem;
1271   return 1;
1272 }
1273
1274 /* Compare two node sets SET1 and SET2.
1275    return 1 if SET1 and SET2 are equivalent, return 0 otherwise.  */
1276
1277 static int
1278 internal_function __attribute ((pure))
1279 re_node_set_compare (const re_node_set *set1, const re_node_set *set2)
1280 {
1281   int i;
1282   if (set1 == NULL || set2 == NULL || set1->nelem != set2->nelem)
1283     return 0;
1284   for (i = set1->nelem ; --i >= 0 ; )
1285     if (set1->elems[i] != set2->elems[i])
1286       return 0;
1287   return 1;
1288 }
1289
1290 /* Return (idx + 1) if SET contains the element ELEM, return 0 otherwise.  */
1291
1292 static int
1293 internal_function __attribute ((pure))
1294 re_node_set_contains (const re_node_set *set, int elem)
1295 {
1296   unsigned int idx, right, mid;
1297   if (set->nelem <= 0)
1298     return 0;
1299
1300   /* Binary search the element.  */
1301   idx = 0;
1302   right = set->nelem - 1;
1303   while (idx < right)
1304     {
1305       mid = (idx + right) / 2;
1306       if (set->elems[mid] < elem)
1307         idx = mid + 1;
1308       else
1309         right = mid;
1310     }
1311   return set->elems[idx] == elem ? idx + 1 : 0;
1312 }
1313
1314 static void
1315 internal_function
1316 re_node_set_remove_at (re_node_set *set, int idx)
1317 {
1318   if (idx < 0 || idx >= set->nelem)
1319     return;
1320   --set->nelem;
1321   for (; idx < set->nelem; idx++)
1322     set->elems[idx] = set->elems[idx + 1];
1323 }
1324 \f
1325
1326 /* Add the token TOKEN to dfa->nodes, and return the index of the token.
1327    Or return -1, if an error will be occured.  */
1328
1329 static int
1330 internal_function
1331 re_dfa_add_node (re_dfa_t *dfa, re_token_t token)
1332 {
1333   int type = token.type;
1334   if (BE (dfa->nodes_len >= dfa->nodes_alloc, 0))
1335     {
1336       size_t new_nodes_alloc = dfa->nodes_alloc * 2;
1337       int *new_nexts, *new_indices;
1338       re_node_set *new_edests, *new_eclosures;
1339       re_token_t *new_nodes;
1340
1341       /* Avoid overflows.  */
1342       if (BE (new_nodes_alloc < dfa->nodes_alloc, 0))
1343         return -1;
1344
1345       new_nodes = re_realloc (dfa->nodes, re_token_t, new_nodes_alloc);
1346       if (BE (new_nodes == NULL, 0))
1347         return -1;
1348       dfa->nodes = new_nodes;
1349       new_nexts = re_realloc (dfa->nexts, int, new_nodes_alloc);
1350       new_indices = re_realloc (dfa->org_indices, int, new_nodes_alloc);
1351       new_edests = re_realloc (dfa->edests, re_node_set, new_nodes_alloc);
1352       new_eclosures = re_realloc (dfa->eclosures, re_node_set, new_nodes_alloc);
1353       if (BE (new_nexts == NULL || new_indices == NULL
1354               || new_edests == NULL || new_eclosures == NULL, 0))
1355         return -1;
1356       dfa->nexts = new_nexts;
1357       dfa->org_indices = new_indices;
1358       dfa->edests = new_edests;
1359       dfa->eclosures = new_eclosures;
1360       dfa->nodes_alloc = new_nodes_alloc;
1361     }
1362   dfa->nodes[dfa->nodes_len] = token;
1363   dfa->nodes[dfa->nodes_len].constraint = 0;
1364 #ifdef RE_ENABLE_I18N
1365   dfa->nodes[dfa->nodes_len].accept_mb =
1366     (type == OP_PERIOD && dfa->mb_cur_max > 1) || type == COMPLEX_BRACKET;
1367 #endif
1368   dfa->nexts[dfa->nodes_len] = -1;
1369   re_node_set_init_empty (dfa->edests + dfa->nodes_len);
1370   re_node_set_init_empty (dfa->eclosures + dfa->nodes_len);
1371   return dfa->nodes_len++;
1372 }
1373
1374 static inline unsigned int
1375 internal_function
1376 calc_state_hash (const re_node_set *nodes, unsigned int context)
1377 {
1378   unsigned int hash = nodes->nelem + context;
1379   int i;
1380   for (i = 0 ; i < nodes->nelem ; i++)
1381     hash += nodes->elems[i];
1382   return hash;
1383 }
1384
1385 /* Search for the state whose node_set is equivalent to NODES.
1386    Return the pointer to the state, if we found it in the DFA.
1387    Otherwise create the new one and return it.  In case of an error
1388    return NULL and set the error code in ERR.
1389    Note: - We assume NULL as the invalid state, then it is possible that
1390            return value is NULL and ERR is REG_NOERROR.
1391          - We never return non-NULL value in case of any errors, it is for
1392            optimization.  */
1393
1394 static re_dfastate_t *
1395 internal_function
1396 re_acquire_state (reg_errcode_t *err, const re_dfa_t *dfa,
1397                   const re_node_set *nodes)
1398 {
1399   unsigned int hash;
1400   re_dfastate_t *new_state;
1401   struct re_state_table_entry *spot;
1402   int i;
1403   if (BE (nodes->nelem == 0, 0))
1404     {
1405       *err = REG_NOERROR;
1406       return NULL;
1407     }
1408   hash = calc_state_hash (nodes, 0);
1409   spot = dfa->state_table + (hash & dfa->state_hash_mask);
1410
1411   for (i = 0 ; i < spot->num ; i++)
1412     {
1413       re_dfastate_t *state = spot->array[i];
1414       if (hash != state->hash)
1415         continue;
1416       if (re_node_set_compare (&state->nodes, nodes))
1417         return state;
1418     }
1419
1420   /* There are no appropriate state in the dfa, create the new one.  */
1421   new_state = create_ci_newstate (dfa, nodes, hash);
1422   if (BE (new_state == NULL, 0))
1423     *err = REG_ESPACE;
1424
1425   return new_state;
1426 }
1427
1428 /* Search for the state whose node_set is equivalent to NODES and
1429    whose context is equivalent to CONTEXT.
1430    Return the pointer to the state, if we found it in the DFA.
1431    Otherwise create the new one and return it.  In case of an error
1432    return NULL and set the error code in ERR.
1433    Note: - We assume NULL as the invalid state, then it is possible that
1434            return value is NULL and ERR is REG_NOERROR.
1435          - We never return non-NULL value in case of any errors, it is for
1436            optimization.  */
1437
1438 static re_dfastate_t *
1439 internal_function
1440 re_acquire_state_context (reg_errcode_t *err, const re_dfa_t *dfa,
1441                           const re_node_set *nodes, unsigned int context)
1442 {
1443   unsigned int hash;
1444   re_dfastate_t *new_state;
1445   struct re_state_table_entry *spot;
1446   int i;
1447   if (nodes->nelem == 0)
1448     {
1449       *err = REG_NOERROR;
1450       return NULL;
1451     }
1452   hash = calc_state_hash (nodes, context);
1453   spot = dfa->state_table + (hash & dfa->state_hash_mask);
1454
1455   for (i = 0 ; i < spot->num ; i++)
1456     {
1457       re_dfastate_t *state = spot->array[i];
1458       if (state->hash == hash
1459           && state->context == context
1460           && re_node_set_compare (state->entrance_nodes, nodes))
1461         return state;
1462     }
1463   /* There are no appropriate state in `dfa', create the new one.  */
1464   new_state = create_cd_newstate (dfa, nodes, context, hash);
1465   if (BE (new_state == NULL, 0))
1466     *err = REG_ESPACE;
1467
1468   return new_state;
1469 }
1470
1471 /* Finish initialization of the new state NEWSTATE, and using its hash value
1472    HASH put in the appropriate bucket of DFA's state table.  Return value
1473    indicates the error code if failed.  */
1474
1475 static reg_errcode_t
1476 register_state (const re_dfa_t *dfa, re_dfastate_t *newstate,
1477                 unsigned int hash)
1478 {
1479   struct re_state_table_entry *spot;
1480   reg_errcode_t err;
1481   int i;
1482
1483   newstate->hash = hash;
1484   err = re_node_set_alloc (&newstate->non_eps_nodes, newstate->nodes.nelem);
1485   if (BE (err != REG_NOERROR, 0))
1486     return REG_ESPACE;
1487   for (i = 0; i < newstate->nodes.nelem; i++)
1488     {
1489       int elem = newstate->nodes.elems[i];
1490       if (!IS_EPSILON_NODE (dfa->nodes[elem].type))
1491         re_node_set_insert_last (&newstate->non_eps_nodes, elem);
1492     }
1493
1494   spot = dfa->state_table + (hash & dfa->state_hash_mask);
1495   if (BE (spot->alloc <= spot->num, 0))
1496     {
1497       int new_alloc = 2 * spot->num + 2;
1498       re_dfastate_t **new_array = re_realloc (spot->array, re_dfastate_t *,
1499                                               new_alloc);
1500       if (BE (new_array == NULL, 0))
1501         return REG_ESPACE;
1502       spot->array = new_array;
1503       spot->alloc = new_alloc;
1504     }
1505   spot->array[spot->num++] = newstate;
1506   return REG_NOERROR;
1507 }
1508
1509 static void
1510 free_state (re_dfastate_t *state)
1511 {
1512   re_node_set_free (&state->non_eps_nodes);
1513   re_node_set_free (&state->inveclosure);
1514   if (state->entrance_nodes != &state->nodes)
1515     {
1516       re_node_set_free (state->entrance_nodes);
1517       re_free (state->entrance_nodes);
1518     }
1519   re_node_set_free (&state->nodes);
1520   re_free (state->word_trtable);
1521   re_free (state->trtable);
1522   re_free (state);
1523 }
1524
1525 /* Create the new state which is independ of contexts.
1526    Return the new state if succeeded, otherwise return NULL.  */
1527
1528 static re_dfastate_t *
1529 internal_function
1530 create_ci_newstate (const re_dfa_t *dfa, const re_node_set *nodes,
1531                     unsigned int hash)
1532 {
1533   int i;
1534   reg_errcode_t err;
1535   re_dfastate_t *newstate;
1536
1537   newstate = (re_dfastate_t *) calloc (sizeof (re_dfastate_t), 1);
1538   if (BE (newstate == NULL, 0))
1539     return NULL;
1540   err = re_node_set_init_copy (&newstate->nodes, nodes);
1541   if (BE (err != REG_NOERROR, 0))
1542     {
1543       re_free (newstate);
1544       return NULL;
1545     }
1546
1547   newstate->entrance_nodes = &newstate->nodes;
1548   for (i = 0 ; i < nodes->nelem ; i++)
1549     {
1550       re_token_t *node = dfa->nodes + nodes->elems[i];
1551       re_token_type_t type = node->type;
1552       if (type == CHARACTER && !node->constraint)
1553         continue;
1554 #ifdef RE_ENABLE_I18N
1555       newstate->accept_mb |= node->accept_mb;
1556 #endif /* RE_ENABLE_I18N */
1557
1558       /* If the state has the halt node, the state is a halt state.  */
1559       if (type == END_OF_RE)
1560         newstate->halt = 1;
1561       else if (type == OP_BACK_REF)
1562         newstate->has_backref = 1;
1563       else if (type == ANCHOR || node->constraint)
1564         newstate->has_constraint = 1;
1565     }
1566   err = register_state (dfa, newstate, hash);
1567   if (BE (err != REG_NOERROR, 0))
1568     {
1569       free_state (newstate);
1570       newstate = NULL;
1571     }
1572   return newstate;
1573 }
1574
1575 /* Create the new state which is depend on the context CONTEXT.
1576    Return the new state if succeeded, otherwise return NULL.  */
1577
1578 static re_dfastate_t *
1579 internal_function
1580 create_cd_newstate (const re_dfa_t *dfa, const re_node_set *nodes,
1581                     unsigned int context, unsigned int hash)
1582 {
1583   int i, nctx_nodes = 0;
1584   reg_errcode_t err;
1585   re_dfastate_t *newstate;
1586
1587   newstate = (re_dfastate_t *) calloc (sizeof (re_dfastate_t), 1);
1588   if (BE (newstate == NULL, 0))
1589     return NULL;
1590   err = re_node_set_init_copy (&newstate->nodes, nodes);
1591   if (BE (err != REG_NOERROR, 0))
1592     {
1593       re_free (newstate);
1594       return NULL;
1595     }
1596
1597   newstate->context = context;
1598   newstate->entrance_nodes = &newstate->nodes;
1599
1600   for (i = 0 ; i < nodes->nelem ; i++)
1601     {
1602       unsigned int constraint = 0;
1603       re_token_t *node = dfa->nodes + nodes->elems[i];
1604       re_token_type_t type = node->type;
1605       if (node->constraint)
1606         constraint = node->constraint;
1607
1608       if (type == CHARACTER && !constraint)
1609         continue;
1610 #ifdef RE_ENABLE_I18N
1611       newstate->accept_mb |= node->accept_mb;
1612 #endif /* RE_ENABLE_I18N */
1613
1614       /* If the state has the halt node, the state is a halt state.  */
1615       if (type == END_OF_RE)
1616         newstate->halt = 1;
1617       else if (type == OP_BACK_REF)
1618         newstate->has_backref = 1;
1619       else if (type == ANCHOR)
1620         constraint = node->opr.ctx_type;
1621
1622       if (constraint)
1623         {
1624           if (newstate->entrance_nodes == &newstate->nodes)
1625             {
1626               newstate->entrance_nodes = re_malloc (re_node_set, 1);
1627               if (BE (newstate->entrance_nodes == NULL, 0))
1628                 {
1629                   free_state (newstate);
1630                   return NULL;
1631                 }
1632               re_node_set_init_copy (newstate->entrance_nodes, nodes);
1633               nctx_nodes = 0;
1634               newstate->has_constraint = 1;
1635             }
1636
1637           if (NOT_SATISFY_PREV_CONSTRAINT (constraint,context))
1638             {
1639               re_node_set_remove_at (&newstate->nodes, i - nctx_nodes);
1640               ++nctx_nodes;
1641             }
1642         }
1643     }
1644   err = register_state (dfa, newstate, hash);
1645   if (BE (err != REG_NOERROR, 0))
1646     {
1647       free_state (newstate);
1648       newstate = NULL;
1649     }
1650   return  newstate;
1651 }