posix/regex_internal.c

   1 /* Extended regular expression matching and search library.
   2    Copyright (C) 2002, 2003, 2004, 2005, 2006 Free Software Foundation, Inc.
   3    This file is part of the GNU C Library.
   4    Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
   5
   6    The GNU C Library is free software; you can redistribute it and/or
   7    modify it under the terms of the GNU Lesser General Public
   8    License as published by the Free Software Foundation; either
   9    version 2.1 of the License, or (at your option) any later version.
  10
  11    The GNU C Library is distributed in the hope that it will be useful,
  12    but WITHOUT ANY WARRANTY; without even the implied warranty of
  13    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  14    Lesser General Public License for more details.
  15
  16    You should have received a copy of the GNU Lesser General Public
  17    License along with the GNU C Library; if not, write to the Free
  18    Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
  19    02111-1307 USA.  */
  20
  21 static void re_string_construct_common (const char *str, int len,
  22                                         re_string_t *pstr,
  23                                         RE_TRANSLATE_TYPE trans, int icase,
  24                                         const re_dfa_t *dfa) internal_function;
  25 static re_dfastate_t *create_ci_newstate (const re_dfa_t *dfa,
  26                                           const re_node_set *nodes,
  27                                           unsigned int hash) internal_function;
  28 static re_dfastate_t *create_cd_newstate (const re_dfa_t *dfa,
  29                                           const re_node_set *nodes,
  30                                           unsigned int context,
  31                                           unsigned int hash) internal_function;
  32 \f
  33 /* Functions for string operation.  */
  34
  35 /* This function allocate the buffers.  It is necessary to call
  36    re_string_reconstruct before using the object.  */
  37
  38 static reg_errcode_t
  39 internal_function
  40 re_string_allocate (re_string_t *pstr, const char *str, int len, int init_len,
  41                     RE_TRANSLATE_TYPE trans, int icase, const re_dfa_t *dfa)
  42 {
  43   reg_errcode_t ret;
  44   int init_buf_len;
  45
  46   /* Ensure at least one character fits into the buffers.  */
  47   if (init_len < dfa->mb_cur_max)
  48     init_len = dfa->mb_cur_max;
  49   init_buf_len = (len + 1 < init_len) ? len + 1: init_len;
  50   re_string_construct_common (str, len, pstr, trans, icase, dfa);
  51
  52   ret = re_string_realloc_buffers (pstr, init_buf_len);
  53   if (BE (ret != REG_NOERROR, 0))
  54     return ret;
  55
  56   pstr->word_char = dfa->word_char;
  57   pstr->word_ops_used = dfa->word_ops_used;
  58   pstr->mbs = pstr->mbs_allocated ? pstr->mbs : (unsigned char *) str;
  59   pstr->valid_len = (pstr->mbs_allocated || dfa->mb_cur_max > 1) ? 0 : len;
  60   pstr->valid_raw_len = pstr->valid_len;
  61   return REG_NOERROR;
  62 }
  63
  64 /* This function allocate the buffers, and initialize them.  */
  65
  66 static reg_errcode_t
  67 internal_function
  68 re_string_construct (re_string_t *pstr, const char *str, int len,
  69                      RE_TRANSLATE_TYPE trans, int icase, const re_dfa_t *dfa)
  70 {
  71   reg_errcode_t ret;
  72   memset (pstr, '\0', sizeof (re_string_t));
  73   re_string_construct_common (str, len, pstr, trans, icase, dfa);
  74
  75   if (len > 0)
  76     {
  77       ret = re_string_realloc_buffers (pstr, len + 1);
  78       if (BE (ret != REG_NOERROR, 0))
  79         return ret;
  80     }
  81   pstr->mbs = pstr->mbs_allocated ? pstr->mbs : (unsigned char *) str;
  82
  83   if (icase)
  84     {
  85 #ifdef RE_ENABLE_I18N
  86       if (dfa->mb_cur_max > 1)
  87         {
  88           while (1)
  89             {
  90               ret = build_wcs_upper_buffer (pstr);
  91               if (BE (ret != REG_NOERROR, 0))
  92                 return ret;
  93               if (pstr->valid_raw_len >= len)
  94                 break;
  95               if (pstr->bufs_len > pstr->valid_len + dfa->mb_cur_max)
  96                 break;
  97               ret = re_string_realloc_buffers (pstr, pstr->bufs_len * 2);
  98               if (BE (ret != REG_NOERROR, 0))
  99                 return ret;
 100             }
 101         }
 102       else
 103 #endif /* RE_ENABLE_I18N  */
 104         build_upper_buffer (pstr);
 105     }
 106   else
 107     {
 108 #ifdef RE_ENABLE_I18N
 109       if (dfa->mb_cur_max > 1)
 110         build_wcs_buffer (pstr);
 111       else
 112 #endif /* RE_ENABLE_I18N  */
 113         {
 114           if (trans != NULL)
 115             re_string_translate_buffer (pstr);
 116           else
 117             {
 118               pstr->valid_len = pstr->bufs_len;
 119               pstr->valid_raw_len = pstr->bufs_len;
 120             }
 121         }
 122     }
 123
 124   return REG_NOERROR;
 125 }
 126
 127 /* Helper functions for re_string_allocate, and re_string_construct.  */
 128
 129 static reg_errcode_t
 130 internal_function
 131 re_string_realloc_buffers (re_string_t *pstr, int new_buf_len)
 132 {
 133 #ifdef RE_ENABLE_I18N
 134   if (pstr->mb_cur_max > 1)
 135     {
 136       wint_t *new_wcs = re_realloc (pstr->wcs, wint_t, new_buf_len);
 137       if (BE (new_wcs == NULL, 0))
 138         return REG_ESPACE;
 139       pstr->wcs = new_wcs;
 140       if (pstr->offsets != NULL)
 141         {
 142           int *new_offsets = re_realloc (pstr->offsets, int, new_buf_len);
 143           if (BE (new_offsets == NULL, 0))
 144             return REG_ESPACE;
 145           pstr->offsets = new_offsets;
 146         }
 147     }
 148 #endif /* RE_ENABLE_I18N  */
 149   if (pstr->mbs_allocated)
 150     {
 151       unsigned char *new_mbs = re_realloc (pstr->mbs, unsigned char,
 152                                            new_buf_len);
 153       if (BE (new_mbs == NULL, 0))
 154         return REG_ESPACE;
 155       pstr->mbs = new_mbs;
 156     }
 157   pstr->bufs_len = new_buf_len;
 158   return REG_NOERROR;
 159 }
 160
 161
 162 static void
 163 internal_function
 164 re_string_construct_common (const char *str, int len, re_string_t *pstr,
 165                             RE_TRANSLATE_TYPE trans, int icase,
 166                             const re_dfa_t *dfa)
 167 {
 168   pstr->raw_mbs = (const unsigned char *) str;
 169   pstr->len = len;
 170   pstr->raw_len = len;
 171   pstr->trans = trans;
 172   pstr->icase = icase ? 1 : 0;
 173   pstr->mbs_allocated = (trans != NULL || icase);
 174   pstr->mb_cur_max = dfa->mb_cur_max;
 175   pstr->is_utf8 = dfa->is_utf8;
 176   pstr->map_notascii = dfa->map_notascii;
 177   pstr->stop = pstr->len;
 178   pstr->raw_stop = pstr->stop;
 179 }
 180
 181 #ifdef RE_ENABLE_I18N
 182
 183 /* Build wide character buffer PSTR->WCS.
 184    If the byte sequence of the string are:
 185      <mb1>(0), <mb1>(1), <mb2>(0), <mb2>(1), <sb3>
 186    Then wide character buffer will be:
 187      <wc1>   , WEOF    , <wc2>   , WEOF    , <wc3>
 188    We use WEOF for padding, they indicate that the position isn't
 189    a first byte of a multibyte character.
 190
 191    Note that this function assumes PSTR->VALID_LEN elements are already
 192    built and starts from PSTR->VALID_LEN.  */
 193
 194 static void
 195 internal_function
 196 build_wcs_buffer (re_string_t *pstr)
 197 {
 198 #ifdef _LIBC
 199   unsigned char buf[MB_LEN_MAX];
 200   assert (MB_LEN_MAX >= pstr->mb_cur_max);
 201 #else
 202   unsigned char buf[64];
 203 #endif
 204   mbstate_t prev_st;
 205   int byte_idx, end_idx, remain_len;
 206   size_t mbclen;
 207
 208   /* Build the buffers from pstr->valid_len to either pstr->len or
 209      pstr->bufs_len.  */
 210   end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
 211   for (byte_idx = pstr->valid_len; byte_idx < end_idx;)
 212     {
 213       wchar_t wc;
 214       const char *p;
 215
 216       remain_len = end_idx - byte_idx;
 217       prev_st = pstr->cur_state;
 218       /* Apply the translation if we need.  */
 219       if (BE (pstr->trans != NULL, 0))
 220         {
 221           int i, ch;
 222
 223           for (i = 0; i < pstr->mb_cur_max && i < remain_len; ++i)
 224             {
 225               ch = pstr->raw_mbs [pstr->raw_mbs_idx + byte_idx + i];
 226               buf[i] = pstr->mbs[byte_idx + i] = pstr->trans[ch];
 227             }
 228           p = (const char *) buf;
 229         }
 230       else
 231         p = (const char *) pstr->raw_mbs + pstr->raw_mbs_idx + byte_idx;
 232       mbclen = mbrtowc (&wc, p, remain_len, &pstr->cur_state);
 233       if (BE (mbclen == (size_t) -2, 0))
 234         {
 235           /* The buffer doesn't have enough space, finish to build.  */
 236           pstr->cur_state = prev_st;
 237           break;
 238         }
 239       else if (BE (mbclen == (size_t) -1 || mbclen == 0, 0))
 240         {
 241           /* We treat these cases as a singlebyte character.  */
 242           mbclen = 1;
 243           wc = (wchar_t) pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx];
 244           if (BE (pstr->trans != NULL, 0))
 245             wc = pstr->trans[wc];
 246           pstr->cur_state = prev_st;
 247         }
 248
 249       /* Write wide character and padding.  */
 250       pstr->wcs[byte_idx++] = wc;
 251       /* Write paddings.  */
 252       for (remain_len = byte_idx + mbclen - 1; byte_idx < remain_len ;)
 253         pstr->wcs[byte_idx++] = WEOF;
 254     }
 255   pstr->valid_len = byte_idx;
 256   pstr->valid_raw_len = byte_idx;
 257 }
 258
 259 /* Build wide character buffer PSTR->WCS like build_wcs_buffer,
 260    but for REG_ICASE.  */
 261
 262 static reg_errcode_t
 263 internal_function
 264 build_wcs_upper_buffer (re_string_t *pstr)
 265 {
 266   mbstate_t prev_st;
 267   int src_idx, byte_idx, end_idx, remain_len;
 268   size_t mbclen;
 269 #ifdef _LIBC
 270   char buf[MB_LEN_MAX];
 271   assert (MB_LEN_MAX >= pstr->mb_cur_max);
 272 #else
 273   char buf[64];
 274 #endif
 275
 276   byte_idx = pstr->valid_len;
 277   end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
 278
 279   /* The following optimization assumes that ASCII characters can be
 280      mapped to wide characters with a simple cast.  */
 281   if (! pstr->map_notascii && pstr->trans == NULL && !pstr->offsets_needed)
 282     {
 283       while (byte_idx < end_idx)
 284         {
 285           wchar_t wc;
 286
 287           if (isascii (pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx])
 288               && mbsinit (&pstr->cur_state))
 289             {
 290               /* In case of a singlebyte character.  */
 291               pstr->mbs[byte_idx]
 292                 = toupper (pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx]);
 293               /* The next step uses the assumption that wchar_t is encoded
 294                  ASCII-safe: all ASCII values can be converted like this.  */
 295               pstr->wcs[byte_idx] = (wchar_t) pstr->mbs[byte_idx];
 296               ++byte_idx;
 297               continue;
 298             }
 299
 300           remain_len = end_idx - byte_idx;
 301           prev_st = pstr->cur_state;
 302           mbclen = mbrtowc (&wc,
 303                             ((const char *) pstr->raw_mbs + pstr->raw_mbs_idx
 304                              + byte_idx), remain_len, &pstr->cur_state);
 305           if (BE (mbclen + 2 > 2, 1))
 306             {
 307               wchar_t wcu = wc;
 308               if (iswlower (wc))
 309                 {
 310                   size_t mbcdlen;
 311
 312                   wcu = towupper (wc);
 313                   mbcdlen = wcrtomb (buf, wcu, &prev_st);
 314                   if (BE (mbclen == mbcdlen, 1))
 315                     memcpy (pstr->mbs + byte_idx, buf, mbclen);
 316                   else
 317                     {
 318                       src_idx = byte_idx;
 319                       goto offsets_needed;
 320                     }
 321                 }
 322               else
 323                 memcpy (pstr->mbs + byte_idx,
 324                         pstr->raw_mbs + pstr->raw_mbs_idx + byte_idx, mbclen);
 325               pstr->wcs[byte_idx++] = wcu;
 326               /* Write paddings.  */
 327               for (remain_len = byte_idx + mbclen - 1; byte_idx < remain_len ;)
 328                 pstr->wcs[byte_idx++] = WEOF;
 329             }
 330           else if (mbclen == (size_t) -1 || mbclen == 0)
 331             {
 332               /* It is an invalid character or '\0'.  Just use the byte.  */
 333               int ch = pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx];
 334               pstr->mbs[byte_idx] = ch;
 335               /* And also cast it to wide char.  */
 336               pstr->wcs[byte_idx++] = (wchar_t) ch;
 337               if (BE (mbclen == (size_t) -1, 0))
 338                 pstr->cur_state = prev_st;
 339             }
 340           else
 341             {
 342               /* The buffer doesn't have enough space, finish to build.  */
 343               pstr->cur_state = prev_st;
 344               break;
 345             }
 346         }
 347       pstr->valid_len = byte_idx;
 348       pstr->valid_raw_len = byte_idx;
 349       return REG_NOERROR;
 350     }
 351   else
 352     for (src_idx = pstr->valid_raw_len; byte_idx < end_idx;)
 353       {
 354         wchar_t wc;
 355         const char *p;
 356       offsets_needed:
 357         remain_len = end_idx - byte_idx;
 358         prev_st = pstr->cur_state;
 359         if (BE (pstr->trans != NULL, 0))
 360           {
 361             int i, ch;
 362
 363             for (i = 0; i < pstr->mb_cur_max && i < remain_len; ++i)
 364               {
 365                 ch = pstr->raw_mbs [pstr->raw_mbs_idx + src_idx + i];
 366                 buf[i] = pstr->trans[ch];
 367               }
 368             p = (const char *) buf;
 369           }
 370         else
 371           p = (const char *) pstr->raw_mbs + pstr->raw_mbs_idx + src_idx;
 372         mbclen = mbrtowc (&wc, p, remain_len, &pstr->cur_state);
 373         if (BE (mbclen + 2 > 2, 1))
 374           {
 375             wchar_t wcu = wc;
 376             if (iswlower (wc))
 377               {
 378                 size_t mbcdlen;
 379
 380                 wcu = towupper (wc);
 381                 mbcdlen = wcrtomb ((char *) buf, wcu, &prev_st);
 382                 if (BE (mbclen == mbcdlen, 1))
 383                   memcpy (pstr->mbs + byte_idx, buf, mbclen);
 384                 else if (mbcdlen != (size_t) -1)
 385                   {
 386                     size_t i;
 387
 388                     if (byte_idx + mbcdlen > pstr->bufs_len)
 389                       {
 390                         pstr->cur_state = prev_st;
 391                         break;
 392                       }
 393
 394                     if (pstr->offsets == NULL)
 395                       {
 396                         pstr->offsets = re_malloc (int, pstr->bufs_len);
 397
 398                         if (pstr->offsets == NULL)
 399                           return REG_ESPACE;
 400                       }
 401                     if (!pstr->offsets_needed)
 402                       {
 403                         for (i = 0; i < (size_t) byte_idx; ++i)
 404                           pstr->offsets[i] = i;
 405                         pstr->offsets_needed = 1;
 406                       }
 407
 408                     memcpy (pstr->mbs + byte_idx, buf, mbcdlen);
 409                     pstr->wcs[byte_idx] = wcu;
 410                     pstr->offsets[byte_idx] = src_idx;
 411                     for (i = 1; i < mbcdlen; ++i)
 412                       {
 413                         pstr->offsets[byte_idx + i]
 414                           = src_idx + (i < mbclen ? i : mbclen - 1);
 415                         pstr->wcs[byte_idx + i] = WEOF;
 416                       }
 417                     pstr->len += mbcdlen - mbclen;
 418                     if (pstr->raw_stop > src_idx)
 419                       pstr->stop += mbcdlen - mbclen;
 420                     end_idx = (pstr->bufs_len > pstr->len)
 421                               ? pstr->len : pstr->bufs_len;
 422                     byte_idx += mbcdlen;
 423                     src_idx += mbclen;
 424                     continue;
 425                   }
 426                 else
 427                   memcpy (pstr->mbs + byte_idx, p, mbclen);
 428               }
 429             else
 430               memcpy (pstr->mbs + byte_idx, p, mbclen);
 431
 432             if (BE (pstr->offsets_needed != 0, 0))
 433               {
 434                 size_t i;
 435                 for (i = 0; i < mbclen; ++i)
 436                   pstr->offsets[byte_idx + i] = src_idx + i;
 437               }
 438             src_idx += mbclen;
 439
 440             pstr->wcs[byte_idx++] = wcu;
 441             /* Write paddings.  */
 442             for (remain_len = byte_idx + mbclen - 1; byte_idx < remain_len ;)
 443               pstr->wcs[byte_idx++] = WEOF;
 444           }
 445         else if (mbclen == (size_t) -1 || mbclen == 0)
 446           {
 447             /* It is an invalid character or '\0'.  Just use the byte.  */
 448             int ch = pstr->raw_mbs[pstr->raw_mbs_idx + src_idx];
 449
 450             if (BE (pstr->trans != NULL, 0))
 451               ch = pstr->trans [ch];
 452             pstr->mbs[byte_idx] = ch;
 453
 454             if (BE (pstr->offsets_needed != 0, 0))
 455               pstr->offsets[byte_idx] = src_idx;
 456             ++src_idx;
 457
 458             /* And also cast it to wide char.  */
 459             pstr->wcs[byte_idx++] = (wchar_t) ch;
 460             if (BE (mbclen == (size_t) -1, 0))
 461               pstr->cur_state = prev_st;
 462           }
 463         else
 464           {
 465             /* The buffer doesn't have enough space, finish to build.  */
 466             pstr->cur_state = prev_st;
 467             break;
 468           }
 469       }
 470   pstr->valid_len = byte_idx;
 471   pstr->valid_raw_len = src_idx;
 472   return REG_NOERROR;
 473 }
 474
 475 /* Skip characters until the index becomes greater than NEW_RAW_IDX.
 476    Return the index.  */
 477
 478 static int
 479 internal_function
 480 re_string_skip_chars (re_string_t *pstr, int new_raw_idx, wint_t *last_wc)
 481 {
 482   mbstate_t prev_st;
 483   int rawbuf_idx;
 484   size_t mbclen;
 485   wchar_t wc = 0;
 486
 487   /* Skip the characters which are not necessary to check.  */
 488   for (rawbuf_idx = pstr->raw_mbs_idx + pstr->valid_raw_len;
 489        rawbuf_idx < new_raw_idx;)
 490     {
 491       int remain_len;
 492       remain_len = pstr->len - rawbuf_idx;
 493       prev_st = pstr->cur_state;
 494       mbclen = mbrtowc (&wc, (const char *) pstr->raw_mbs + rawbuf_idx,
 495                         remain_len, &pstr->cur_state);
 496       if (BE (mbclen == (size_t) -2 || mbclen == (size_t) -1 || mbclen == 0, 0))
 497         {
 498           /* We treat these cases as a singlebyte character.  */
 499           mbclen = 1;
 500           pstr->cur_state = prev_st;
 501         }
 502       /* Then proceed the next character.  */
 503       rawbuf_idx += mbclen;
 504     }
 505   *last_wc = (wint_t) wc;
 506   return rawbuf_idx;
 507 }
 508 #endif /* RE_ENABLE_I18N  */
 509
 510 /* Build the buffer PSTR->MBS, and apply the translation if we need.
 511    This function is used in case of REG_ICASE.  */
 512
 513 static void
 514 internal_function
 515 build_upper_buffer (re_string_t *pstr)
 516 {
 517   int char_idx, end_idx;
 518   end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
 519
 520   for (char_idx = pstr->valid_len; char_idx < end_idx; ++char_idx)
 521     {
 522       int ch = pstr->raw_mbs[pstr->raw_mbs_idx + char_idx];
 523       if (BE (pstr->trans != NULL, 0))
 524         ch = pstr->trans[ch];
 525       if (islower (ch))
 526         pstr->mbs[char_idx] = toupper (ch);
 527       else
 528         pstr->mbs[char_idx] = ch;
 529     }
 530   pstr->valid_len = char_idx;
 531   pstr->valid_raw_len = char_idx;
 532 }
 533
 534 /* Apply TRANS to the buffer in PSTR.  */
 535
 536 static void
 537 internal_function
 538 re_string_translate_buffer (re_string_t *pstr)
 539 {
 540   int buf_idx, end_idx;
 541   end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
 542
 543   for (buf_idx = pstr->valid_len; buf_idx < end_idx; ++buf_idx)
 544     {
 545       int ch = pstr->raw_mbs[pstr->raw_mbs_idx + buf_idx];
 546       pstr->mbs[buf_idx] = pstr->trans[ch];
 547     }
 548
 549   pstr->valid_len = buf_idx;
 550   pstr->valid_raw_len = buf_idx;
 551 }
 552
 553 /* This function re-construct the buffers.
 554    Concretely, convert to wide character in case of pstr->mb_cur_max > 1,
 555    convert to upper case in case of REG_ICASE, apply translation.  */
 556
 557 static reg_errcode_t
 558 internal_function
 559 re_string_reconstruct (re_string_t *pstr, int idx, int eflags)
 560 {
 561   int offset = idx - pstr->raw_mbs_idx;
 562   if (BE (offset < 0, 0))
 563     {
 564       /* Reset buffer.  */
 565 #ifdef RE_ENABLE_I18N
 566       if (pstr->mb_cur_max > 1)
 567         memset (&pstr->cur_state, '\0', sizeof (mbstate_t));
 568 #endif /* RE_ENABLE_I18N */
 569       pstr->len = pstr->raw_len;
 570       pstr->stop = pstr->raw_stop;
 571       pstr->valid_len = 0;
 572       pstr->raw_mbs_idx = 0;
 573       pstr->valid_raw_len = 0;
 574       pstr->offsets_needed = 0;
 575       pstr->tip_context = ((eflags & REG_NOTBOL) ? CONTEXT_BEGBUF
 576                            : CONTEXT_NEWLINE | CONTEXT_BEGBUF);
 577       if (!pstr->mbs_allocated)
 578         pstr->mbs = (unsigned char *) pstr->raw_mbs;
 579       offset = idx;
 580     }
 581
 582   if (BE (offset != 0, 1))
 583     {
 584       /* Are the characters which are already checked remain?  */
 585       if (BE (offset < pstr->valid_raw_len, 1)
 586 #ifdef RE_ENABLE_I18N
 587           /* Handling this would enlarge the code too much.
 588              Accept a slowdown in that case.  */
 589           && pstr->offsets_needed == 0
 590 #endif
 591          )
 592         {
 593           /* Yes, move them to the front of the buffer.  */
 594           pstr->tip_context = re_string_context_at (pstr, offset - 1, eflags);
 595 #ifdef RE_ENABLE_I18N
 596           if (pstr->mb_cur_max > 1)
 597             memmove (pstr->wcs, pstr->wcs + offset,
 598                      (pstr->valid_len - offset) * sizeof (wint_t));
 599 #endif /* RE_ENABLE_I18N */
 600           if (BE (pstr->mbs_allocated, 0))
 601             memmove (pstr->mbs, pstr->mbs + offset,
 602                      pstr->valid_len - offset);
 603           pstr->valid_len -= offset;
 604           pstr->valid_raw_len -= offset;
 605 #if DEBUG
 606           assert (pstr->valid_len > 0);
 607 #endif
 608         }
 609       else
 610         {
 611           /* No, skip all characters until IDX.  */
 612 #ifdef RE_ENABLE_I18N
 613           if (BE (pstr->offsets_needed, 0))
 614             {
 615               pstr->len = pstr->raw_len - idx + offset;
 616               pstr->stop = pstr->raw_stop - idx + offset;
 617               pstr->offsets_needed = 0;
 618             }
 619 #endif
 620           pstr->valid_len = 0;
 621           pstr->valid_raw_len = 0;
 622 #ifdef RE_ENABLE_I18N
 623           if (pstr->mb_cur_max > 1)
 624             {
 625               int wcs_idx;
 626               wint_t wc = WEOF;
 627
 628               if (pstr->is_utf8)
 629                 {
 630                   const unsigned char *raw, *p, *q, *end;
 631
 632                   /* Special case UTF-8.  Multi-byte chars start with any
 633                      byte other than 0x80 - 0xbf.  */
 634                   raw = pstr->raw_mbs + pstr->raw_mbs_idx;
 635                   end = raw + (offset - pstr->mb_cur_max);
 636                   p = raw + offset - 1;
 637 #ifdef _LIBC
 638                   /* We know the wchar_t encoding is UCS4, so for the simple
 639                      case, ASCII characters, skip the conversion step.  */
 640                   if (isascii (*p) && BE (pstr->trans == NULL, 1))
 641                     {
 642                       memset (&pstr->cur_state, '\0', sizeof (mbstate_t));
 643                       pstr->valid_len = 0;
 644                       wc = (wchar_t) *p;
 645                     }
 646                   else
 647 #endif
 648                     for (; p >= end; --p)
 649                       if ((*p & 0xc0) != 0x80)
 650                         {
 651                           mbstate_t cur_state;
 652                           wchar_t wc2;
 653                           int mlen = raw + pstr->len - p;
 654                           unsigned char buf[6];
 655                           size_t mbclen;
 656
 657                           q = p;
 658                           if (BE (pstr->trans != NULL, 0))
 659                             {
 660                               int i = mlen < 6 ? mlen : 6;
 661                               while (--i >= 0)
 662                                 buf[i] = pstr->trans[p[i]];
 663                               q = buf;
 664                             }
 665                           /* XXX Don't use mbrtowc, we know which conversion
 666                              to use (UTF-8 -> UCS4).  */
 667                           memset (&cur_state, 0, sizeof (cur_state));
 668                           mbclen = mbrtowc (&wc2, (const char *) p, mlen,
 669                                             &cur_state);
 670                           if (raw + offset - p <= mbclen
 671                               && mbclen < (size_t) -2)
 672                             {
 673                               memset (&pstr->cur_state, '\0',
 674                                       sizeof (mbstate_t));
 675                               pstr->valid_len = mbclen - (raw + offset - p);
 676                               wc = wc2;
 677                             }
 678                           break;
 679                         }
 680                 }
 681
 682               if (wc == WEOF)
 683                 pstr->valid_len = re_string_skip_chars (pstr, idx, &wc) - idx;
 684               if (BE (pstr->valid_len, 0))
 685                 {
 686                   for (wcs_idx = 0; wcs_idx < pstr->valid_len; ++wcs_idx)
 687                     pstr->wcs[wcs_idx] = WEOF;
 688                   if (pstr->mbs_allocated)
 689                     memset (pstr->mbs, 255, pstr->valid_len);
 690                 }
 691               pstr->valid_raw_len = pstr->valid_len;
 692               pstr->tip_context = ((BE (pstr->word_ops_used != 0, 0)
 693                                     && IS_WIDE_WORD_CHAR (wc))
 694                                    ? CONTEXT_WORD
 695                                    : ((IS_WIDE_NEWLINE (wc)
 696                                        && pstr->newline_anchor)
 697                                       ? CONTEXT_NEWLINE : 0));
 698             }
 699           else
 700 #endif /* RE_ENABLE_I18N */
 701             {
 702               int c = pstr->raw_mbs[pstr->raw_mbs_idx + offset - 1];
 703               if (pstr->trans)
 704                 c = pstr->trans[c];
 705               pstr->tip_context = (bitset_contain (pstr->word_char, c)
 706                                    ? CONTEXT_WORD
 707                                    : ((IS_NEWLINE (c) && pstr->newline_anchor)
 708                                       ? CONTEXT_NEWLINE : 0));
 709             }
 710         }
 711       if (!BE (pstr->mbs_allocated, 0))
 712         pstr->mbs += offset;
 713     }
 714   pstr->raw_mbs_idx = idx;
 715   pstr->len -= offset;
 716   pstr->stop -= offset;
 717
 718   /* Then build the buffers.  */
 719 #ifdef RE_ENABLE_I18N
 720   if (pstr->mb_cur_max > 1)
 721     {
 722       if (pstr->icase)
 723         {
 724           reg_errcode_t ret = build_wcs_upper_buffer (pstr);
 725           if (BE (ret != REG_NOERROR, 0))
 726             return ret;
 727         }
 728       else
 729         build_wcs_buffer (pstr);
 730     }
 731   else
 732 #endif /* RE_ENABLE_I18N */
 733     if (BE (pstr->mbs_allocated, 0))
 734       {
 735         if (pstr->icase)
 736           build_upper_buffer (pstr);
 737         else if (pstr->trans != NULL)
 738           re_string_translate_buffer (pstr);
 739       }
 740     else
 741       pstr->valid_len = pstr->len;
 742
 743   pstr->cur_idx = 0;
 744   return REG_NOERROR;
 745 }
 746
 747 static unsigned char
 748 internal_function __attribute ((pure))
 749 re_string_peek_byte_case (const re_string_t *pstr, int idx)
 750 {
 751   int ch, off;
 752
 753   /* Handle the common (easiest) cases first.  */
 754   if (BE (!pstr->mbs_allocated, 1))
 755     return re_string_peek_byte (pstr, idx);
 756
 757 #ifdef RE_ENABLE_I18N
 758   if (pstr->mb_cur_max > 1
 759       && ! re_string_is_single_byte_char (pstr, pstr->cur_idx + idx))
 760     return re_string_peek_byte (pstr, idx);
 761 #endif
 762
 763   off = pstr->cur_idx + idx;
 764 #ifdef RE_ENABLE_I18N
 765   if (pstr->offsets_needed)
 766     off = pstr->offsets[off];
 767 #endif
 768
 769   ch = pstr->raw_mbs[pstr->raw_mbs_idx + off];
 770
 771 #ifdef RE_ENABLE_I18N
 772   /* Ensure that e.g. for tr_TR.UTF-8 BACKSLASH DOTLESS SMALL LETTER I
 773      this function returns CAPITAL LETTER I instead of first byte of
 774      DOTLESS SMALL LETTER I.  The latter would confuse the parser,
 775      since peek_byte_case doesn't advance cur_idx in any way.  */
 776   if (pstr->offsets_needed && !isascii (ch))
 777     return re_string_peek_byte (pstr, idx);
 778 #endif
 779
 780   return ch;
 781 }
 782
 783 static unsigned char
 784 internal_function __attribute ((pure))
 785 re_string_fetch_byte_case (re_string_t *pstr)
 786 {
 787   if (BE (!pstr->mbs_allocated, 1))
 788     return re_string_fetch_byte (pstr);
 789
 790 #ifdef RE_ENABLE_I18N
 791   if (pstr->offsets_needed)
 792     {
 793       int off, ch;
 794
 795       /* For tr_TR.UTF-8 [[:islower:]] there is
 796          [[: CAPITAL LETTER I WITH DOT lower:]] in mbs.  Skip
 797          in that case the whole multi-byte character and return
 798          the original letter.  On the other side, with
 799          [[: DOTLESS SMALL LETTER I return [[:I, as doing
 800          anything else would complicate things too much.  */
 801
 802       if (!re_string_first_byte (pstr, pstr->cur_idx))
 803         return re_string_fetch_byte (pstr);
 804
 805       off = pstr->offsets[pstr->cur_idx];
 806       ch = pstr->raw_mbs[pstr->raw_mbs_idx + off];
 807
 808       if (! isascii (ch))
 809         return re_string_fetch_byte (pstr);
 810
 811       re_string_skip_bytes (pstr,
 812                             re_string_char_size_at (pstr, pstr->cur_idx));
 813       return ch;
 814     }
 815 #endif
 816
 817   return pstr->raw_mbs[pstr->raw_mbs_idx + pstr->cur_idx++];
 818 }
 819
 820 static void
 821 internal_function
 822 re_string_destruct (re_string_t *pstr)
 823 {
 824 #ifdef RE_ENABLE_I18N
 825   re_free (pstr->wcs);
 826   re_free (pstr->offsets);
 827 #endif /* RE_ENABLE_I18N  */
 828   if (pstr->mbs_allocated)
 829     re_free (pstr->mbs);
 830 }
 831
 832 /* Return the context at IDX in INPUT.  */
 833
 834 static unsigned int
 835 internal_function
 836 re_string_context_at (const re_string_t *input, int idx, int eflags)
 837 {
 838   int c;
 839   if (BE (idx < 0, 0))
 840     /* In this case, we use the value stored in input->tip_context,
 841        since we can't know the character in input->mbs[-1] here.  */
 842     return input->tip_context;
 843   if (BE (idx == input->len, 0))
 844     return ((eflags & REG_NOTEOL) ? CONTEXT_ENDBUF
 845             : CONTEXT_NEWLINE | CONTEXT_ENDBUF);
 846 #ifdef RE_ENABLE_I18N
 847   if (input->mb_cur_max > 1)
 848     {
 849       wint_t wc;
 850       int wc_idx = idx;
 851       while(input->wcs[wc_idx] == WEOF)
 852         {
 853 #ifdef DEBUG
 854           /* It must not happen.  */
 855           assert (wc_idx >= 0);
 856 #endif
 857           --wc_idx;
 858           if (wc_idx < 0)
 859             return input->tip_context;
 860         }
 861       wc = input->wcs[wc_idx];
 862       if (BE (input->word_ops_used != 0, 0) && IS_WIDE_WORD_CHAR (wc))
 863         return CONTEXT_WORD;
 864       return (IS_WIDE_NEWLINE (wc) && input->newline_anchor
 865               ? CONTEXT_NEWLINE : 0);
 866     }
 867   else
 868 #endif
 869     {
 870       c = re_string_byte_at (input, idx);
 871       if (bitset_contain (input->word_char, c))
 872         return CONTEXT_WORD;
 873       return IS_NEWLINE (c) && input->newline_anchor ? CONTEXT_NEWLINE : 0;
 874     }
 875 }
 876 \f
 877 /* Functions for set operation.  */
 878
 879 static reg_errcode_t
 880 internal_function
 881 re_node_set_alloc (re_node_set *set, int size)
 882 {
 883   set->alloc = size;
 884   set->nelem = 0;
 885   set->elems = re_malloc (int, size);
 886   if (BE (set->elems == NULL, 0))
 887     return REG_ESPACE;
 888   return REG_NOERROR;
 889 }
 890
 891 static reg_errcode_t
 892 internal_function
 893 re_node_set_init_1 (re_node_set *set, int elem)
 894 {
 895   set->alloc = 1;
 896   set->nelem = 1;
 897   set->elems = re_malloc (int, 1);
 898   if (BE (set->elems == NULL, 0))
 899     {
 900       set->alloc = set->nelem = 0;
 901       return REG_ESPACE;
 902     }
 903   set->elems[0] = elem;
 904   return REG_NOERROR;
 905 }
 906
 907 static reg_errcode_t
 908 internal_function
 909 re_node_set_init_2 (re_node_set *set, int elem1, int elem2)
 910 {
 911   set->alloc = 2;
 912   set->elems = re_malloc (int, 2);
 913   if (BE (set->elems == NULL, 0))
 914     return REG_ESPACE;
 915   if (elem1 == elem2)
 916     {
 917       set->nelem = 1;
 918       set->elems[0] = elem1;
 919     }
 920   else
 921     {
 922       set->nelem = 2;
 923       if (elem1 < elem2)
 924         {
 925           set->elems[0] = elem1;
 926           set->elems[1] = elem2;
 927         }
 928       else
 929         {
 930           set->elems[0] = elem2;
 931           set->elems[1] = elem1;
 932         }
 933     }
 934   return REG_NOERROR;
 935 }
 936
 937 static reg_errcode_t
 938 internal_function
 939 re_node_set_init_copy (re_node_set *dest, const re_node_set *src)
 940 {
 941   dest->nelem = src->nelem;
 942   if (src->nelem > 0)
 943     {
 944       dest->alloc = dest->nelem;
 945       dest->elems = re_malloc (int, dest->alloc);
 946       if (BE (dest->elems == NULL, 0))
 947         {
 948           dest->alloc = dest->nelem = 0;
 949           return REG_ESPACE;
 950         }
 951       memcpy (dest->elems, src->elems, src->nelem * sizeof (int));
 952     }
 953   else
 954     re_node_set_init_empty (dest);
 955   return REG_NOERROR;
 956 }
 957
 958 /* Calculate the intersection of the sets SRC1 and SRC2. And merge it to
 959    DEST. Return value indicate the error code or REG_NOERROR if succeeded.
 960    Note: We assume dest->elems is NULL, when dest->alloc is 0.  */
 961
 962 static reg_errcode_t
 963 internal_function
 964 re_node_set_add_intersect (re_node_set *dest, const re_node_set *src1,
 965                            const re_node_set *src2)
 966 {
 967   int i1, i2, is, id, delta, sbase;
 968   if (src1->nelem == 0 || src2->nelem == 0)
 969     return REG_NOERROR;
 970
 971   /* We need dest->nelem + 2 * elems_in_intersection; this is a
 972      conservative estimate.  */
 973   if (src1->nelem + src2->nelem + dest->nelem > dest->alloc)
 974     {
 975       int new_alloc = src1->nelem + src2->nelem + dest->alloc;
 976       int *new_elems = re_realloc (dest->elems, int, new_alloc);
 977       if (BE (new_elems == NULL, 0))
 978         return REG_ESPACE;
 979       dest->elems = new_elems;
 980       dest->alloc = new_alloc;
 981     }
 982
 983   /* Find the items in the intersection of SRC1 and SRC2, and copy
 984      into the top of DEST those that are not already in DEST itself.  */
 985   sbase = dest->nelem + src1->nelem + src2->nelem;
 986   i1 = src1->nelem - 1;
 987   i2 = src2->nelem - 1;
 988   id = dest->nelem - 1;
 989   for (;;)
 990     {
 991       if (src1->elems[i1] == src2->elems[i2])
 992         {
 993           /* Try to find the item in DEST.  Maybe we could binary search?  */
 994           while (id >= 0 && dest->elems[id] > src1->elems[i1])
 995             --id;
 996
 997           if (id < 0 || dest->elems[id] != src1->elems[i1])
 998             dest->elems[--sbase] = src1->elems[i1];
 999
1000           if (--i1 < 0 || --i2 < 0)
1001             break;
1002         }
1003
1004       /* Lower the highest of the two items.  */
1005       else if (src1->elems[i1] < src2->elems[i2])
1006         {
1007           if (--i2 < 0)
1008             break;
1009         }
1010       else
1011         {
1012           if (--i1 < 0)
1013             break;
1014         }
1015     }
1016
1017   id = dest->nelem - 1;
1018   is = dest->nelem + src1->nelem + src2->nelem - 1;
1019   delta = is - sbase + 1;
1020
1021   /* Now copy.  When DELTA becomes zero, the remaining
1022      DEST elements are already in place; this is more or
1023      less the same loop that is in re_node_set_merge.  */
1024   dest->nelem += delta;
1025   if (delta > 0 && id >= 0)
1026     for (;;)
1027       {
1028         if (dest->elems[is] > dest->elems[id])
1029           {
1030             /* Copy from the top.  */
1031             dest->elems[id + delta--] = dest->elems[is--];
1032             if (delta == 0)
1033               break;
1034           }
1035         else
1036           {
1037             /* Slide from the bottom.  */
1038             dest->elems[id + delta] = dest->elems[id];
1039             if (--id < 0)
1040               break;
1041           }
1042       }
1043
1044   /* Copy remaining SRC elements.  */
1045   memcpy (dest->elems, dest->elems + sbase, delta * sizeof (int));
1046
1047   return REG_NOERROR;
1048 }
1049
1050 /* Calculate the union set of the sets SRC1 and SRC2. And store it to
1051    DEST. Return value indicate the error code or REG_NOERROR if succeeded.  */
1052
1053 static reg_errcode_t
1054 internal_function
1055 re_node_set_init_union (re_node_set *dest, const re_node_set *src1,
1056                         const re_node_set *src2)
1057 {
1058   int i1, i2, id;
1059   if (src1 != NULL && src1->nelem > 0 && src2 != NULL && src2->nelem > 0)
1060     {
1061       dest->alloc = src1->nelem + src2->nelem;
1062       dest->elems = re_malloc (int, dest->alloc);
1063       if (BE (dest->elems == NULL, 0))
1064         return REG_ESPACE;
1065     }
1066   else
1067     {
1068       if (src1 != NULL && src1->nelem > 0)
1069         return re_node_set_init_copy (dest, src1);
1070       else if (src2 != NULL && src2->nelem > 0)
1071         return re_node_set_init_copy (dest, src2);
1072       else
1073         re_node_set_init_empty (dest);
1074       return REG_NOERROR;
1075     }
1076   for (i1 = i2 = id = 0 ; i1 < src1->nelem && i2 < src2->nelem ;)
1077     {
1078       if (src1->elems[i1] > src2->elems[i2])
1079         {
1080           dest->elems[id++] = src2->elems[i2++];
1081           continue;
1082         }
1083       if (src1->elems[i1] == src2->elems[i2])
1084         ++i2;
1085       dest->elems[id++] = src1->elems[i1++];
1086     }
1087   if (i1 < src1->nelem)
1088     {
1089       memcpy (dest->elems + id, src1->elems + i1,
1090              (src1->nelem - i1) * sizeof (int));
1091       id += src1->nelem - i1;
1092     }
1093   else if (i2 < src2->nelem)
1094     {
1095       memcpy (dest->elems + id, src2->elems + i2,
1096              (src2->nelem - i2) * sizeof (int));
1097       id += src2->nelem - i2;
1098     }
1099   dest->nelem = id;
1100   return REG_NOERROR;
1101 }
1102
1103 /* Calculate the union set of the sets DEST and SRC. And store it to
1104    DEST. Return value indicate the error code or REG_NOERROR if succeeded.  */
1105
1106 static reg_errcode_t
1107 internal_function
1108 re_node_set_merge (re_node_set *dest, const re_node_set *src)
1109 {
1110   int is, id, sbase, delta;
1111   if (src == NULL || src->nelem == 0)
1112     return REG_NOERROR;
1113   if (dest->alloc < 2 * src->nelem + dest->nelem)
1114     {
1115       int new_alloc = 2 * (src->nelem + dest->alloc);
1116       int *new_buffer = re_realloc (dest->elems, int, new_alloc);
1117       if (BE (new_buffer == NULL, 0))
1118         return REG_ESPACE;
1119       dest->elems = new_buffer;
1120       dest->alloc = new_alloc;
1121     }
1122
1123   if (BE (dest->nelem == 0, 0))
1124     {
1125       dest->nelem = src->nelem;
1126       memcpy (dest->elems, src->elems, src->nelem * sizeof (int));
1127       return REG_NOERROR;
1128     }
1129
1130   /* Copy into the top of DEST the items of SRC that are not
1131      found in DEST.  Maybe we could binary search in DEST?  */
1132   for (sbase = dest->nelem + 2 * src->nelem,
1133        is = src->nelem - 1, id = dest->nelem - 1; is >= 0 && id >= 0; )
1134     {
1135       if (dest->elems[id] == src->elems[is])
1136         is--, id--;
1137       else if (dest->elems[id] < src->elems[is])
1138         dest->elems[--sbase] = src->elems[is--];
1139       else /* if (dest->elems[id] > src->elems[is]) */
1140         --id;
1141     }
1142
1143   if (is >= 0)
1144     {
1145       /* If DEST is exhausted, the remaining items of SRC must be unique.  */
1146       sbase -= is + 1;
1147       memcpy (dest->elems + sbase, src->elems, (is + 1) * sizeof (int));
1148     }
1149
1150   id = dest->nelem - 1;
1151   is = dest->nelem + 2 * src->nelem - 1;
1152   delta = is - sbase + 1;
1153   if (delta == 0)
1154     return REG_NOERROR;
1155
1156   /* Now copy.  When DELTA becomes zero, the remaining
1157      DEST elements are already in place.  */
1158   dest->nelem += delta;
1159   for (;;)
1160     {
1161       if (dest->elems[is] > dest->elems[id])
1162         {
1163           /* Copy from the top.  */
1164           dest->elems[id + delta--] = dest->elems[is--];
1165           if (delta == 0)
1166             break;
1167         }
1168       else
1169         {
1170           /* Slide from the bottom.  */
1171           dest->elems[id + delta] = dest->elems[id];
1172           if (--id < 0)
1173             {
1174               /* Copy remaining SRC elements.  */
1175               memcpy (dest->elems, dest->elems + sbase,
1176                       delta * sizeof (int));
1177               break;
1178             }
1179         }
1180     }
1181
1182   return REG_NOERROR;
1183 }
1184
1185 /* Insert the new element ELEM to the re_node_set* SET.
1186    SET should not already have ELEM.
1187    return -1 if an error is occured, return 1 otherwise.  */
1188
1189 static int
1190 internal_function
1191 re_node_set_insert (re_node_set *set, int elem)
1192 {
1193   int idx;
1194   /* In case the set is empty.  */
1195   if (set->alloc == 0)
1196     {
1197       if (BE (re_node_set_init_1 (set, elem) == REG_NOERROR, 1))
1198         return 1;
1199       else
1200         return -1;
1201     }
1202
1203   if (BE (set->nelem, 0) == 0)
1204     {
1205       /* We already guaranteed above that set->alloc != 0.  */
1206       set->elems[0] = elem;
1207       ++set->nelem;
1208       return 1;
1209     }
1210
1211   /* Realloc if we need.  */
1212   if (set->alloc == set->nelem)
1213     {
1214       int *new_elems;
1215       set->alloc = set->alloc * 2;
1216       new_elems = re_realloc (set->elems, int, set->alloc);
1217       if (BE (new_elems == NULL, 0))
1218         return -1;
1219       set->elems = new_elems;
1220     }
1221
1222   /* Move the elements which follows the new element.  Test the
1223      first element separately to skip a check in the inner loop.  */
1224   if (elem < set->elems[0])
1225     {
1226       idx = 0;
1227       for (idx = set->nelem; idx > 0; idx--)
1228         set->elems[idx] = set->elems[idx - 1];
1229     }
1230   else
1231     {
1232       for (idx = set->nelem; set->elems[idx - 1] > elem; idx--)
1233         set->elems[idx] = set->elems[idx - 1];
1234     }
1235
1236   /* Insert the new element.  */
1237   set->elems[idx] = elem;
1238   ++set->nelem;
1239   return 1;
1240 }
1241
1242 /* Insert the new element ELEM to the re_node_set* SET.
1243    SET should not already have any element greater than or equal to ELEM.
1244    Return -1 if an error is occured, return 1 otherwise.  */
1245
1246 static int
1247 internal_function
1248 re_node_set_insert_last (re_node_set *set, int elem)
1249 {
1250   /* Realloc if we need.  */
1251   if (set->alloc == set->nelem)
1252     {
1253       int *new_elems;
1254       set->alloc = (set->alloc + 1) * 2;
1255       new_elems = re_realloc (set->elems, int, set->alloc);
1256       if (BE (new_elems == NULL, 0))
1257         return -1;
1258       set->elems = new_elems;
1259     }
1260
1261   /* Insert the new element.  */
1262   set->elems[set->nelem++] = elem;
1263   return 1;
1264 }
1265
1266 /* Compare two node sets SET1 and SET2.
1267    return 1 if SET1 and SET2 are equivalent, return 0 otherwise.  */
1268
1269 static int
1270 internal_function __attribute ((pure))
1271 re_node_set_compare (const re_node_set *set1, const re_node_set *set2)
1272 {
1273   int i;
1274   if (set1 == NULL || set2 == NULL || set1->nelem != set2->nelem)
1275     return 0;
1276   for (i = set1->nelem ; --i >= 0 ; )
1277     if (set1->elems[i] != set2->elems[i])
1278       return 0;
1279   return 1;
1280 }
1281
1282 /* Return (idx + 1) if SET contains the element ELEM, return 0 otherwise.  */
1283
1284 static int
1285 internal_function __attribute ((pure))
1286 re_node_set_contains (const re_node_set *set, int elem)
1287 {
1288   unsigned int idx, right, mid;
1289   if (set->nelem <= 0)
1290     return 0;
1291
1292   /* Binary search the element.  */
1293   idx = 0;
1294   right = set->nelem - 1;
1295   while (idx < right)
1296     {
1297       mid = (idx + right) / 2;
1298       if (set->elems[mid] < elem)
1299         idx = mid + 1;
1300       else
1301         right = mid;
1302     }
1303   return set->elems[idx] == elem ? idx + 1 : 0;
1304 }
1305
1306 static void
1307 internal_function
1308 re_node_set_remove_at (re_node_set *set, int idx)
1309 {
1310   if (idx < 0 || idx >= set->nelem)
1311     return;
1312   --set->nelem;
1313   for (; idx < set->nelem; idx++)
1314     set->elems[idx] = set->elems[idx + 1];
1315 }
1316 \f
1317
1318 /* Add the token TOKEN to dfa->nodes, and return the index of the token.
1319    Or return -1, if an error will be occured.  */
1320
1321 static int
1322 internal_function
1323 re_dfa_add_node (re_dfa_t *dfa, re_token_t token)
1324 {
1325   int type = token.type;
1326   if (BE (dfa->nodes_len >= dfa->nodes_alloc, 0))
1327     {
1328       size_t new_nodes_alloc = dfa->nodes_alloc * 2;
1329       int *new_nexts, *new_indices;
1330       re_node_set *new_edests, *new_eclosures;
1331       re_token_t *new_nodes;
1332
1333       /* Avoid overflows.  */
1334       if (BE (new_nodes_alloc < dfa->nodes_alloc, 0))
1335         return -1;
1336
1337       new_nodes = re_realloc (dfa->nodes, re_token_t, new_nodes_alloc);
1338       if (BE (new_nodes == NULL, 0))
1339         return -1;
1340       dfa->nodes = new_nodes;
1341       new_nexts = re_realloc (dfa->nexts, int, new_nodes_alloc);
1342       new_indices = re_realloc (dfa->org_indices, int, new_nodes_alloc);
1343       new_edests = re_realloc (dfa->edests, re_node_set, new_nodes_alloc);
1344       new_eclosures = re_realloc (dfa->eclosures, re_node_set, new_nodes_alloc);
1345       if (BE (new_nexts == NULL || new_indices == NULL
1346               || new_edests == NULL || new_eclosures == NULL, 0))
1347         return -1;
1348       dfa->nexts = new_nexts;
1349       dfa->org_indices = new_indices;
1350       dfa->edests = new_edests;
1351       dfa->eclosures = new_eclosures;
1352       dfa->nodes_alloc = new_nodes_alloc;
1353     }
1354   dfa->nodes[dfa->nodes_len] = token;
1355   dfa->nodes[dfa->nodes_len].constraint = 0;
1356 #ifdef RE_ENABLE_I18N
1357   dfa->nodes[dfa->nodes_len].accept_mb =
1358     (type == OP_PERIOD && dfa->mb_cur_max > 1) || type == COMPLEX_BRACKET;
1359 #endif
1360   dfa->nexts[dfa->nodes_len] = -1;
1361   re_node_set_init_empty (dfa->edests + dfa->nodes_len);
1362   re_node_set_init_empty (dfa->eclosures + dfa->nodes_len);
1363   return dfa->nodes_len++;
1364 }
1365
1366 static inline unsigned int
1367 internal_function
1368 calc_state_hash (const re_node_set *nodes, unsigned int context)
1369 {
1370   unsigned int hash = nodes->nelem + context;
1371   int i;
1372   for (i = 0 ; i < nodes->nelem ; i++)
1373     hash += nodes->elems[i];
1374   return hash;
1375 }
1376
1377 /* Search for the state whose node_set is equivalent to NODES.
1378    Return the pointer to the state, if we found it in the DFA.
1379    Otherwise create the new one and return it.  In case of an error
1380    return NULL and set the error code in ERR.
1381    Note: - We assume NULL as the invalid state, then it is possible that
1382            return value is NULL and ERR is REG_NOERROR.
1383          - We never return non-NULL value in case of any errors, it is for
1384            optimization.  */
1385
1386 static re_dfastate_t *
1387 internal_function
1388 re_acquire_state (reg_errcode_t *err, const re_dfa_t *dfa,
1389                   const re_node_set *nodes)
1390 {
1391   unsigned int hash;
1392   re_dfastate_t *new_state;
1393   struct re_state_table_entry *spot;
1394   int i;
1395   if (BE (nodes->nelem == 0, 0))
1396     {
1397       *err = REG_NOERROR;
1398       return NULL;
1399     }
1400   hash = calc_state_hash (nodes, 0);
1401   spot = dfa->state_table + (hash & dfa->state_hash_mask);
1402
1403   for (i = 0 ; i < spot->num ; i++)
1404     {
1405       re_dfastate_t *state = spot->array[i];
1406       if (hash != state->hash)
1407         continue;
1408       if (re_node_set_compare (&state->nodes, nodes))
1409         return state;
1410     }
1411
1412   /* There are no appropriate state in the dfa, create the new one.  */
1413   new_state = create_ci_newstate (dfa, nodes, hash);
1414   if (BE (new_state == NULL, 0))
1415     *err = REG_ESPACE;
1416
1417   return new_state;
1418 }
1419
1420 /* Search for the state whose node_set is equivalent to NODES and
1421    whose context is equivalent to CONTEXT.
1422    Return the pointer to the state, if we found it in the DFA.
1423    Otherwise create the new one and return it.  In case of an error
1424    return NULL and set the error code in ERR.
1425    Note: - We assume NULL as the invalid state, then it is possible that
1426            return value is NULL and ERR is REG_NOERROR.
1427          - We never return non-NULL value in case of any errors, it is for
1428            optimization.  */
1429
1430 static re_dfastate_t *
1431 internal_function
1432 re_acquire_state_context (reg_errcode_t *err, const re_dfa_t *dfa,
1433                           const re_node_set *nodes, unsigned int context)
1434 {
1435   unsigned int hash;
1436   re_dfastate_t *new_state;
1437   struct re_state_table_entry *spot;
1438   int i;
1439   if (nodes->nelem == 0)
1440     {
1441       *err = REG_NOERROR;
1442       return NULL;
1443     }
1444   hash = calc_state_hash (nodes, context);
1445   spot = dfa->state_table + (hash & dfa->state_hash_mask);
1446
1447   for (i = 0 ; i < spot->num ; i++)
1448     {
1449       re_dfastate_t *state = spot->array[i];
1450       if (state->hash == hash
1451           && state->context == context
1452           && re_node_set_compare (state->entrance_nodes, nodes))
1453         return state;
1454     }
1455   /* There are no appropriate state in `dfa', create the new one.  */
1456   new_state = create_cd_newstate (dfa, nodes, context, hash);
1457   if (BE (new_state == NULL, 0))
1458     *err = REG_ESPACE;
1459
1460   return new_state;
1461 }
1462
1463 /* Finish initialization of the new state NEWSTATE, and using its hash value
1464    HASH put in the appropriate bucket of DFA's state table.  Return value
1465    indicates the error code if failed.  */
1466
1467 static reg_errcode_t
1468 register_state (const re_dfa_t *dfa, re_dfastate_t *newstate,
1469                 unsigned int hash)
1470 {
1471   struct re_state_table_entry *spot;
1472   reg_errcode_t err;
1473   int i;
1474
1475   newstate->hash = hash;
1476   err = re_node_set_alloc (&newstate->non_eps_nodes, newstate->nodes.nelem);
1477   if (BE (err != REG_NOERROR, 0))
1478     return REG_ESPACE;
1479   for (i = 0; i < newstate->nodes.nelem; i++)
1480     {
1481       int elem = newstate->nodes.elems[i];
1482       if (!IS_EPSILON_NODE (dfa->nodes[elem].type))
1483         re_node_set_insert_last (&newstate->non_eps_nodes, elem);
1484     }
1485
1486   spot = dfa->state_table + (hash & dfa->state_hash_mask);
1487   if (BE (spot->alloc <= spot->num, 0))
1488     {
1489       int new_alloc = 2 * spot->num + 2;
1490       re_dfastate_t **new_array = re_realloc (spot->array, re_dfastate_t *,
1491                                               new_alloc);
1492       if (BE (new_array == NULL, 0))
1493         return REG_ESPACE;
1494       spot->array = new_array;
1495       spot->alloc = new_alloc;
1496     }
1497   spot->array[spot->num++] = newstate;
1498   return REG_NOERROR;
1499 }
1500
1501 static void
1502 free_state (re_dfastate_t *state)
1503 {
1504   re_node_set_free (&state->non_eps_nodes);
1505   re_node_set_free (&state->inveclosure);
1506   if (state->entrance_nodes != &state->nodes)
1507     {
1508       re_node_set_free (state->entrance_nodes);
1509       re_free (state->entrance_nodes);
1510     }
1511   re_node_set_free (&state->nodes);
1512   re_free (state->word_trtable);
1513   re_free (state->trtable);
1514   re_free (state);
1515 }
1516
1517 /* Create the new state which is independ of contexts.
1518    Return the new state if succeeded, otherwise return NULL.  */
1519
1520 static re_dfastate_t *
1521 internal_function
1522 create_ci_newstate (const re_dfa_t *dfa, const re_node_set *nodes,
1523                     unsigned int hash)
1524 {
1525   int i;
1526   reg_errcode_t err;
1527   re_dfastate_t *newstate;
1528
1529   newstate = (re_dfastate_t *) calloc (sizeof (re_dfastate_t), 1);
1530   if (BE (newstate == NULL, 0))
1531     return NULL;
1532   err = re_node_set_init_copy (&newstate->nodes, nodes);
1533   if (BE (err != REG_NOERROR, 0))
1534     {
1535       re_free (newstate);
1536       return NULL;
1537     }
1538
1539   newstate->entrance_nodes = &newstate->nodes;
1540   for (i = 0 ; i < nodes->nelem ; i++)
1541     {
1542       re_token_t *node = dfa->nodes + nodes->elems[i];
1543       re_token_type_t type = node->type;
1544       if (type == CHARACTER && !node->constraint)
1545         continue;
1546 #ifdef RE_ENABLE_I18N
1547       newstate->accept_mb |= node->accept_mb;
1548 #endif /* RE_ENABLE_I18N */
1549
1550       /* If the state has the halt node, the state is a halt state.  */
1551       if (type == END_OF_RE)
1552         newstate->halt = 1;
1553       else if (type == OP_BACK_REF)
1554         newstate->has_backref = 1;
1555       else if (type == ANCHOR || node->constraint)
1556         newstate->has_constraint = 1;
1557     }
1558   err = register_state (dfa, newstate, hash);
1559   if (BE (err != REG_NOERROR, 0))
1560     {
1561       free_state (newstate);
1562       newstate = NULL;
1563     }
1564   return newstate;
1565 }
1566
1567 /* Create the new state which is depend on the context CONTEXT.
1568    Return the new state if succeeded, otherwise return NULL.  */
1569
1570 static re_dfastate_t *
1571 internal_function
1572 create_cd_newstate (const re_dfa_t *dfa, const re_node_set *nodes,
1573                     unsigned int context, unsigned int hash)
1574 {
1575   int i, nctx_nodes = 0;
1576   reg_errcode_t err;
1577   re_dfastate_t *newstate;
1578
1579   newstate = (re_dfastate_t *) calloc (sizeof (re_dfastate_t), 1);
1580   if (BE (newstate == NULL, 0))
1581     return NULL;
1582   err = re_node_set_init_copy (&newstate->nodes, nodes);
1583   if (BE (err != REG_NOERROR, 0))
1584     {
1585       re_free (newstate);
1586       return NULL;
1587     }
1588
1589   newstate->context = context;
1590   newstate->entrance_nodes = &newstate->nodes;
1591
1592   for (i = 0 ; i < nodes->nelem ; i++)
1593     {
1594       unsigned int constraint = 0;
1595       re_token_t *node = dfa->nodes + nodes->elems[i];
1596       re_token_type_t type = node->type;
1597       if (node->constraint)
1598         constraint = node->constraint;
1599
1600       if (type == CHARACTER && !constraint)
1601         continue;
1602 #ifdef RE_ENABLE_I18N
1603       newstate->accept_mb |= node->accept_mb;
1604 #endif /* RE_ENABLE_I18N */
1605
1606       /* If the state has the halt node, the state is a halt state.  */
1607       if (type == END_OF_RE)
1608         newstate->halt = 1;
1609       else if (type == OP_BACK_REF)
1610         newstate->has_backref = 1;
1611       else if (type == ANCHOR)
1612         constraint = node->opr.ctx_type;
1613
1614       if (constraint)
1615         {
1616           if (newstate->entrance_nodes == &newstate->nodes)
1617             {
1618               newstate->entrance_nodes = re_malloc (re_node_set, 1);
1619               if (BE (newstate->entrance_nodes == NULL, 0))
1620                 {
1621                   free_state (newstate);
1622                   return NULL;
1623                 }
1624               re_node_set_init_copy (newstate->entrance_nodes, nodes);
1625               nctx_nodes = 0;
1626               newstate->has_constraint = 1;
1627             }
1628
1629           if (NOT_SATISFY_PREV_CONSTRAINT (constraint,context))
1630             {
1631               re_node_set_remove_at (&newstate->nodes, i - nctx_nodes);
1632               ++nctx_nodes;
1633             }
1634         }
1635     }
1636   err = register_state (dfa, newstate, hash);
1637   if (BE (err != REG_NOERROR, 0))
1638     {
1639       free_state (newstate);
1640       newstate = NULL;
1641     }
1642   return  newstate;
1643 }