posix/regex_internal.c

   1 /* Extended regular expression matching and search library.
   2    Copyright (C) 2002, 2003, 2004 Free Software Foundation, Inc.
   3    This file is part of the GNU C Library.
   4    Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
   5
   6    The GNU C Library is free software; you can redistribute it and/or
   7    modify it under the terms of the GNU Lesser General Public
   8    License as published by the Free Software Foundation; either
   9    version 2.1 of the License, or (at your option) any later version.
  10
  11    The GNU C Library is distributed in the hope that it will be useful,
  12    but WITHOUT ANY WARRANTY; without even the implied warranty of
  13    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  14    Lesser General Public License for more details.
  15
  16    You should have received a copy of the GNU Lesser General Public
  17    License along with the GNU C Library; if not, write to the Free
  18    Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
  19    02111-1307 USA.  */
  20
  21 static void re_string_construct_common (const char *str, int len,
  22                                         re_string_t *pstr,
  23                                         RE_TRANSLATE_TYPE trans, int icase,
  24                                         const re_dfa_t *dfa) internal_function;
  25 #ifdef RE_ENABLE_I18N
  26 static int re_string_skip_chars (re_string_t *pstr, int new_raw_idx,
  27                                  wint_t *last_wc) internal_function;
  28 #endif /* RE_ENABLE_I18N */
  29 static re_dfastate_t *create_newstate_common (re_dfa_t *dfa,
  30                                               const re_node_set *nodes,
  31                                               unsigned int hash) internal_function;
  32 static reg_errcode_t register_state (re_dfa_t *dfa, re_dfastate_t *newstate,
  33                                      unsigned int hash) internal_function;
  34 static re_dfastate_t *create_ci_newstate (re_dfa_t *dfa,
  35                                           const re_node_set *nodes,
  36                                           unsigned int hash) internal_function;
  37 static re_dfastate_t *create_cd_newstate (re_dfa_t *dfa,
  38                                           const re_node_set *nodes,
  39                                           unsigned int context,
  40                                           unsigned int hash) internal_function;
  41 static unsigned int inline calc_state_hash (const re_node_set *nodes,
  42                                             unsigned int context) internal_function;
  43 \f
  44 /* Functions for string operation.  */
  45
  46 /* This function allocate the buffers.  It is necessary to call
  47    re_string_reconstruct before using the object.  */
  48
  49 static reg_errcode_t
  50 re_string_allocate (pstr, str, len, init_len, trans, icase, dfa)
  51      re_string_t *pstr;
  52      const char *str;
  53      int len, init_len, icase;
  54      RE_TRANSLATE_TYPE trans;
  55      const re_dfa_t *dfa;
  56 {
  57   reg_errcode_t ret;
  58   int init_buf_len;
  59
  60   /* Ensure at least one character fits into the buffers.  */
  61   if (init_len < dfa->mb_cur_max)
  62     init_len = dfa->mb_cur_max;
  63   init_buf_len = (len + 1 < init_len) ? len + 1: init_len;
  64   re_string_construct_common (str, len, pstr, trans, icase, dfa);
  65
  66   ret = re_string_realloc_buffers (pstr, init_buf_len);
  67   if (BE (ret != REG_NOERROR, 0))
  68     return ret;
  69
  70   pstr->word_char = dfa->word_char;
  71   pstr->word_ops_used = dfa->word_ops_used;
  72   pstr->mbs = pstr->mbs_allocated ? pstr->mbs : (unsigned char *) str;
  73   pstr->valid_len = (pstr->mbs_allocated || dfa->mb_cur_max > 1) ? 0 : len;
  74   pstr->valid_raw_len = pstr->valid_len;
  75   return REG_NOERROR;
  76 }
  77
  78 /* This function allocate the buffers, and initialize them.  */
  79
  80 static reg_errcode_t
  81 re_string_construct (pstr, str, len, trans, icase, dfa)
  82      re_string_t *pstr;
  83      const char *str;
  84      int len, icase;
  85      RE_TRANSLATE_TYPE trans;
  86      const re_dfa_t *dfa;
  87 {
  88   reg_errcode_t ret;
  89   memset (pstr, '\0', sizeof (re_string_t));
  90   re_string_construct_common (str, len, pstr, trans, icase, dfa);
  91
  92   if (len > 0)
  93     {
  94       ret = re_string_realloc_buffers (pstr, len + 1);
  95       if (BE (ret != REG_NOERROR, 0))
  96         return ret;
  97     }
  98   pstr->mbs = pstr->mbs_allocated ? pstr->mbs : (unsigned char *) str;
  99
 100   if (icase)
 101     {
 102 #ifdef RE_ENABLE_I18N
 103       if (dfa->mb_cur_max > 1)
 104         {
 105           while (1)
 106             {
 107               ret = build_wcs_upper_buffer (pstr);
 108               if (BE (ret != REG_NOERROR, 0))
 109                 return ret;
 110               if (pstr->valid_raw_len >= len)
 111                 break;
 112               if (pstr->bufs_len > pstr->valid_len + dfa->mb_cur_max)
 113                 break;
 114               ret = re_string_realloc_buffers (pstr, pstr->bufs_len * 2);
 115               if (BE (ret != REG_NOERROR, 0))
 116                 return ret;
 117             }
 118         }
 119       else
 120 #endif /* RE_ENABLE_I18N  */
 121         build_upper_buffer (pstr);
 122     }
 123   else
 124     {
 125 #ifdef RE_ENABLE_I18N
 126       if (dfa->mb_cur_max > 1)
 127         build_wcs_buffer (pstr);
 128       else
 129 #endif /* RE_ENABLE_I18N  */
 130         {
 131           if (trans != NULL)
 132             re_string_translate_buffer (pstr);
 133           else
 134             {
 135               pstr->valid_len = pstr->bufs_len;
 136               pstr->valid_raw_len = pstr->bufs_len;
 137             }
 138         }
 139     }
 140
 141   return REG_NOERROR;
 142 }
 143
 144 /* Helper functions for re_string_allocate, and re_string_construct.  */
 145
 146 static reg_errcode_t
 147 re_string_realloc_buffers (pstr, new_buf_len)
 148      re_string_t *pstr;
 149      int new_buf_len;
 150 {
 151 #ifdef RE_ENABLE_I18N
 152   if (pstr->mb_cur_max > 1)
 153     {
 154       wint_t *new_array = re_realloc (pstr->wcs, wint_t, new_buf_len);
 155       if (BE (new_array == NULL, 0))
 156         return REG_ESPACE;
 157       pstr->wcs = new_array;
 158       if (pstr->offsets != NULL)
 159         {
 160           int *new_array = re_realloc (pstr->offsets, int, new_buf_len);
 161           if (BE (new_array == NULL, 0))
 162             return REG_ESPACE;
 163           pstr->offsets = new_array;
 164         }
 165     }
 166 #endif /* RE_ENABLE_I18N  */
 167   if (pstr->mbs_allocated)
 168     {
 169       unsigned char *new_array = re_realloc (pstr->mbs, unsigned char,
 170                                              new_buf_len);
 171       if (BE (new_array == NULL, 0))
 172         return REG_ESPACE;
 173       pstr->mbs = new_array;
 174     }
 175   pstr->bufs_len = new_buf_len;
 176   return REG_NOERROR;
 177 }
 178
 179
 180 static void
 181 re_string_construct_common (str, len, pstr, trans, icase, dfa)
 182      const char *str;
 183      int len;
 184      re_string_t *pstr;
 185      RE_TRANSLATE_TYPE trans;
 186      int icase;
 187      const re_dfa_t *dfa;
 188 {
 189   pstr->raw_mbs = (const unsigned char *) str;
 190   pstr->len = len;
 191   pstr->raw_len = len;
 192   pstr->trans = (unsigned RE_TRANSLATE_TYPE) trans;
 193   pstr->icase = icase ? 1 : 0;
 194   pstr->mbs_allocated = (trans != NULL || icase);
 195   pstr->mb_cur_max = dfa->mb_cur_max;
 196   pstr->is_utf8 = dfa->is_utf8;
 197   pstr->map_notascii = dfa->map_notascii;
 198   pstr->stop = pstr->len;
 199   pstr->raw_stop = pstr->stop;
 200 }
 201
 202 #ifdef RE_ENABLE_I18N
 203
 204 /* Build wide character buffer PSTR->WCS.
 205    If the byte sequence of the string are:
 206      <mb1>(0), <mb1>(1), <mb2>(0), <mb2>(1), <sb3>
 207    Then wide character buffer will be:
 208      <wc1>   , WEOF    , <wc2>   , WEOF    , <wc3>
 209    We use WEOF for padding, they indicate that the position isn't
 210    a first byte of a multibyte character.
 211
 212    Note that this function assumes PSTR->VALID_LEN elements are already
 213    built and starts from PSTR->VALID_LEN.  */
 214
 215 static void
 216 build_wcs_buffer (pstr)
 217      re_string_t *pstr;
 218 {
 219 #ifdef _LIBC
 220   unsigned char buf[pstr->mb_cur_max];
 221 #else
 222   unsigned char buf[64];
 223 #endif
 224   mbstate_t prev_st;
 225   int byte_idx, end_idx, mbclen, remain_len;
 226
 227   /* Build the buffers from pstr->valid_len to either pstr->len or
 228      pstr->bufs_len.  */
 229   end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
 230   for (byte_idx = pstr->valid_len; byte_idx < end_idx;)
 231     {
 232       wchar_t wc;
 233       const char *p;
 234
 235       remain_len = end_idx - byte_idx;
 236       prev_st = pstr->cur_state;
 237       /* Apply the translation if we need.  */
 238       if (BE (pstr->trans != NULL, 0))
 239         {
 240           int i, ch;
 241
 242           for (i = 0; i < pstr->mb_cur_max && i < remain_len; ++i)
 243             {
 244               ch = pstr->raw_mbs [pstr->raw_mbs_idx + byte_idx + i];
 245               buf[i] = pstr->mbs[byte_idx + i] = pstr->trans[ch];
 246             }
 247           p = (const char *) buf;
 248         }
 249       else
 250         p = (const char *) pstr->raw_mbs + pstr->raw_mbs_idx + byte_idx;
 251       mbclen = mbrtowc (&wc, p, remain_len, &pstr->cur_state);
 252       if (BE (mbclen == (size_t) -2, 0))
 253         {
 254           /* The buffer doesn't have enough space, finish to build.  */
 255           pstr->cur_state = prev_st;
 256           break;
 257         }
 258       else if (BE (mbclen == (size_t) -1 || mbclen == 0, 0))
 259         {
 260           /* We treat these cases as a singlebyte character.  */
 261           mbclen = 1;
 262           wc = (wchar_t) pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx];
 263           if (BE (pstr->trans != NULL, 0))
 264             wc = pstr->trans[wc];
 265           pstr->cur_state = prev_st;
 266         }
 267
 268       /* Write wide character and padding.  */
 269       pstr->wcs[byte_idx++] = wc;
 270       /* Write paddings.  */
 271       for (remain_len = byte_idx + mbclen - 1; byte_idx < remain_len ;)
 272         pstr->wcs[byte_idx++] = WEOF;
 273     }
 274   pstr->valid_len = byte_idx;
 275   pstr->valid_raw_len = byte_idx;
 276 }
 277
 278 /* Build wide character buffer PSTR->WCS like build_wcs_buffer,
 279    but for REG_ICASE.  */
 280
 281 static int
 282 build_wcs_upper_buffer (pstr)
 283      re_string_t *pstr;
 284 {
 285   mbstate_t prev_st;
 286   int src_idx, byte_idx, end_idx, mbclen, remain_len;
 287 #ifdef _LIBC
 288   unsigned char buf[pstr->mb_cur_max];
 289 #else
 290   unsigned char buf[64];
 291 #endif
 292
 293   byte_idx = pstr->valid_len;
 294   end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
 295
 296   /* The following optimization assumes that ASCII characters can be
 297      mapped to wide characters with a simple cast.  */
 298   if (! pstr->map_notascii && pstr->trans == NULL && !pstr->offsets_needed)
 299     {
 300       while (byte_idx < end_idx)
 301         {
 302           wchar_t wc;
 303
 304           if (isascii (pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx])
 305               && mbsinit (&pstr->cur_state))
 306             {
 307               /* In case of a singlebyte character.  */
 308               pstr->mbs[byte_idx]
 309                 = toupper (pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx]);
 310               /* The next step uses the assumption that wchar_t is encoded
 311                  ASCII-safe: all ASCII values can be converted like this.  */
 312               pstr->wcs[byte_idx] = (wchar_t) pstr->mbs[byte_idx];
 313               ++byte_idx;
 314               continue;
 315             }
 316
 317           remain_len = end_idx - byte_idx;
 318           prev_st = pstr->cur_state;
 319           mbclen = mbrtowc (&wc,
 320                             ((const char *) pstr->raw_mbs + pstr->raw_mbs_idx
 321                              + byte_idx), remain_len, &pstr->cur_state);
 322           if (BE (mbclen > 0, 1))
 323             {
 324               wchar_t wcu = wc;
 325               if (iswlower (wc))
 326                 {
 327                   int mbcdlen;
 328
 329                   wcu = towupper (wc);
 330                   mbcdlen = wcrtomb (buf, wcu, &prev_st);
 331                   if (BE (mbclen == mbcdlen, 1))
 332                     memcpy (pstr->mbs + byte_idx, buf, mbclen);
 333                   else
 334                     {
 335                       src_idx = byte_idx;
 336                       goto offsets_needed;
 337                     }
 338                 }
 339               else
 340                 memcpy (pstr->mbs + byte_idx,
 341                         pstr->raw_mbs + pstr->raw_mbs_idx + byte_idx, mbclen);
 342               pstr->wcs[byte_idx++] = wcu;
 343               /* Write paddings.  */
 344               for (remain_len = byte_idx + mbclen - 1; byte_idx < remain_len ;)
 345                 pstr->wcs[byte_idx++] = WEOF;
 346             }
 347           else if (mbclen == (size_t) -1 || mbclen == 0)
 348             {
 349               /* It is an invalid character or '\0'.  Just use the byte.  */
 350               int ch = pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx];
 351               pstr->mbs[byte_idx] = ch;
 352               /* And also cast it to wide char.  */
 353               pstr->wcs[byte_idx++] = (wchar_t) ch;
 354               if (BE (mbclen == (size_t) -1, 0))
 355                 pstr->cur_state = prev_st;
 356             }
 357           else
 358             {
 359               /* The buffer doesn't have enough space, finish to build.  */
 360               pstr->cur_state = prev_st;
 361               break;
 362             }
 363         }
 364       pstr->valid_len = byte_idx;
 365       pstr->valid_raw_len = byte_idx;
 366       return REG_NOERROR;
 367     }
 368   else
 369     for (src_idx = pstr->valid_raw_len; byte_idx < end_idx;)
 370       {
 371         wchar_t wc;
 372         const char *p;
 373       offsets_needed:
 374         remain_len = end_idx - byte_idx;
 375         prev_st = pstr->cur_state;
 376         if (BE (pstr->trans != NULL, 0))
 377           {
 378             int i, ch;
 379
 380             for (i = 0; i < pstr->mb_cur_max && i < remain_len; ++i)
 381               {
 382                 ch = pstr->raw_mbs [pstr->raw_mbs_idx + src_idx + i];
 383                 buf[i] = pstr->trans[ch];
 384               }
 385             p = (const char *) buf;
 386           }
 387         else
 388           p = (const char *) pstr->raw_mbs + pstr->raw_mbs_idx + src_idx;
 389         mbclen = mbrtowc (&wc, p, remain_len, &pstr->cur_state);
 390         if (BE (mbclen > 0, 1))
 391           {
 392             wchar_t wcu = wc;
 393             if (iswlower (wc))
 394               {
 395                 int mbcdlen;
 396
 397                 wcu = towupper (wc);
 398                 mbcdlen = wcrtomb ((char *) buf, wcu, &prev_st);
 399                 if (BE (mbclen == mbcdlen, 1))
 400                   memcpy (pstr->mbs + byte_idx, buf, mbclen);
 401                 else
 402                   {
 403                     int i;
 404
 405                     if (byte_idx + mbcdlen > pstr->bufs_len)
 406                       {
 407                         pstr->cur_state = prev_st;
 408                         break;
 409                       }
 410
 411                     if (pstr->offsets == NULL)
 412                       {
 413                         pstr->offsets = re_malloc (int, pstr->bufs_len);
 414
 415                         if (pstr->offsets == NULL)
 416                           return REG_ESPACE;
 417                       }
 418                     if (!pstr->offsets_needed)
 419                       {
 420                         for (i = 0; i < byte_idx; ++i)
 421                           pstr->offsets[i] = i;
 422                         pstr->offsets_needed = 1;
 423                       }
 424
 425                     memcpy (pstr->mbs + byte_idx, buf, mbcdlen);
 426                     pstr->wcs[byte_idx] = wcu;
 427                     pstr->offsets[byte_idx] = src_idx;
 428                     for (i = 1; i < mbcdlen; ++i)
 429                       {
 430                         pstr->offsets[byte_idx + i]
 431                           = src_idx + (i < mbclen ? i : mbclen - 1);
 432                         pstr->wcs[byte_idx + i] = WEOF;
 433                       }
 434                     pstr->len += mbcdlen - mbclen;
 435                     if (pstr->raw_stop > src_idx)
 436                       pstr->stop += mbcdlen - mbclen;
 437                     end_idx = (pstr->bufs_len > pstr->len)
 438                               ? pstr->len : pstr->bufs_len;
 439                     byte_idx += mbcdlen;
 440                     src_idx += mbclen;
 441                     continue;
 442                   }
 443               }
 444             else
 445               memcpy (pstr->mbs + byte_idx, p, mbclen);
 446
 447             if (BE (pstr->offsets_needed != 0, 0))
 448               {
 449                 int i;
 450                 for (i = 0; i < mbclen; ++i)
 451                   pstr->offsets[byte_idx + i] = src_idx + i;
 452               }
 453             src_idx += mbclen;
 454
 455             pstr->wcs[byte_idx++] = wcu;
 456             /* Write paddings.  */
 457             for (remain_len = byte_idx + mbclen - 1; byte_idx < remain_len ;)
 458               pstr->wcs[byte_idx++] = WEOF;
 459           }
 460         else if (mbclen == (size_t) -1 || mbclen == 0)
 461           {
 462             /* It is an invalid character or '\0'.  Just use the byte.  */
 463             int ch = pstr->raw_mbs[pstr->raw_mbs_idx + src_idx];
 464
 465             if (BE (pstr->trans != NULL, 0))
 466               ch = pstr->trans [ch];
 467             pstr->mbs[byte_idx] = ch;
 468
 469             if (BE (pstr->offsets_needed != 0, 0))
 470               pstr->offsets[byte_idx] = src_idx;
 471             ++src_idx;
 472
 473             /* And also cast it to wide char.  */
 474             pstr->wcs[byte_idx++] = (wchar_t) ch;
 475             if (BE (mbclen == (size_t) -1, 0))
 476               pstr->cur_state = prev_st;
 477           }
 478         else
 479           {
 480             /* The buffer doesn't have enough space, finish to build.  */
 481             pstr->cur_state = prev_st;
 482             break;
 483           }
 484       }
 485   pstr->valid_len = byte_idx;
 486   pstr->valid_raw_len = src_idx;
 487   return REG_NOERROR;
 488 }
 489
 490 /* Skip characters until the index becomes greater than NEW_RAW_IDX.
 491    Return the index.  */
 492
 493 static int
 494 re_string_skip_chars (pstr, new_raw_idx, last_wc)
 495      re_string_t *pstr;
 496      int new_raw_idx;
 497      wint_t *last_wc;
 498 {
 499   mbstate_t prev_st;
 500   int rawbuf_idx, mbclen;
 501   wchar_t wc = 0;
 502
 503   /* Skip the characters which are not necessary to check.  */
 504   for (rawbuf_idx = pstr->raw_mbs_idx + pstr->valid_raw_len;
 505        rawbuf_idx < new_raw_idx;)
 506     {
 507       int remain_len;
 508       remain_len = pstr->len - rawbuf_idx;
 509       prev_st = pstr->cur_state;
 510       mbclen = mbrtowc (&wc, (const char *) pstr->raw_mbs + rawbuf_idx,
 511                         remain_len, &pstr->cur_state);
 512       if (BE (mbclen == (size_t) -2 || mbclen == (size_t) -1 || mbclen == 0, 0))
 513         {
 514           /* We treat these cases as a singlebyte character.  */
 515           mbclen = 1;
 516           pstr->cur_state = prev_st;
 517         }
 518       /* Then proceed the next character.  */
 519       rawbuf_idx += mbclen;
 520     }
 521   *last_wc = (wint_t) wc;
 522   return rawbuf_idx;
 523 }
 524 #endif /* RE_ENABLE_I18N  */
 525
 526 /* Build the buffer PSTR->MBS, and apply the translation if we need.
 527    This function is used in case of REG_ICASE.  */
 528
 529 static void
 530 build_upper_buffer (pstr)
 531      re_string_t *pstr;
 532 {
 533   int char_idx, end_idx;
 534   end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
 535
 536   for (char_idx = pstr->valid_len; char_idx < end_idx; ++char_idx)
 537     {
 538       int ch = pstr->raw_mbs[pstr->raw_mbs_idx + char_idx];
 539       if (BE (pstr->trans != NULL, 0))
 540         ch = pstr->trans[ch];
 541       if (islower (ch))
 542         pstr->mbs[char_idx] = toupper (ch);
 543       else
 544         pstr->mbs[char_idx] = ch;
 545     }
 546   pstr->valid_len = char_idx;
 547   pstr->valid_raw_len = char_idx;
 548 }
 549
 550 /* Apply TRANS to the buffer in PSTR.  */
 551
 552 static void
 553 re_string_translate_buffer (pstr)
 554      re_string_t *pstr;
 555 {
 556   int buf_idx, end_idx;
 557   end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
 558
 559   for (buf_idx = pstr->valid_len; buf_idx < end_idx; ++buf_idx)
 560     {
 561       int ch = pstr->raw_mbs[pstr->raw_mbs_idx + buf_idx];
 562       pstr->mbs[buf_idx] = pstr->trans[ch];
 563     }
 564
 565   pstr->valid_len = buf_idx;
 566   pstr->valid_raw_len = buf_idx;
 567 }
 568
 569 /* This function re-construct the buffers.
 570    Concretely, convert to wide character in case of pstr->mb_cur_max > 1,
 571    convert to upper case in case of REG_ICASE, apply translation.  */
 572
 573 static reg_errcode_t
 574 re_string_reconstruct (pstr, idx, eflags)
 575      re_string_t *pstr;
 576      int idx, eflags;
 577 {
 578   int offset = idx - pstr->raw_mbs_idx;
 579   if (BE (offset < 0, 0))
 580     {
 581       /* Reset buffer.  */
 582 #ifdef RE_ENABLE_I18N
 583       if (pstr->mb_cur_max > 1)
 584         memset (&pstr->cur_state, '\0', sizeof (mbstate_t));
 585 #endif /* RE_ENABLE_I18N */
 586       pstr->len = pstr->raw_len;
 587       pstr->stop = pstr->raw_stop;
 588       pstr->valid_len = 0;
 589       pstr->raw_mbs_idx = 0;
 590       pstr->valid_raw_len = 0;
 591       pstr->offsets_needed = 0;
 592       pstr->tip_context = ((eflags & REG_NOTBOL) ? CONTEXT_BEGBUF
 593                            : CONTEXT_NEWLINE | CONTEXT_BEGBUF);
 594       if (!pstr->mbs_allocated)
 595         pstr->mbs = (unsigned char *) pstr->raw_mbs;
 596       offset = idx;
 597     }
 598
 599   if (BE (offset != 0, 1))
 600     {
 601       /* Are the characters which are already checked remain?  */
 602       if (BE (offset < pstr->valid_raw_len, 1)
 603 #ifdef RE_ENABLE_I18N
 604           /* Handling this would enlarge the code too much.
 605              Accept a slowdown in that case.  */
 606           && pstr->offsets_needed == 0
 607 #endif
 608          )
 609         {
 610           /* Yes, move them to the front of the buffer.  */
 611           pstr->tip_context = re_string_context_at (pstr, offset - 1, eflags);
 612 #ifdef RE_ENABLE_I18N
 613           if (pstr->mb_cur_max > 1)
 614             memmove (pstr->wcs, pstr->wcs + offset,
 615                      (pstr->valid_len - offset) * sizeof (wint_t));
 616 #endif /* RE_ENABLE_I18N */
 617           if (BE (pstr->mbs_allocated, 0))
 618             memmove (pstr->mbs, pstr->mbs + offset,
 619                      pstr->valid_len - offset);
 620           pstr->valid_len -= offset;
 621           pstr->valid_raw_len -= offset;
 622 #if DEBUG
 623           assert (pstr->valid_len > 0);
 624 #endif
 625         }
 626       else
 627         {
 628           /* No, skip all characters until IDX.  */
 629 #ifdef RE_ENABLE_I18N
 630           if (BE (pstr->offsets_needed, 0))
 631             {
 632               pstr->len = pstr->raw_len - idx + offset;
 633               pstr->stop = pstr->raw_stop - idx + offset;
 634               pstr->offsets_needed = 0;
 635             }
 636 #endif
 637           pstr->valid_len = 0;
 638           pstr->valid_raw_len = 0;
 639 #ifdef RE_ENABLE_I18N
 640           if (pstr->mb_cur_max > 1)
 641             {
 642               int wcs_idx;
 643               wint_t wc = WEOF;
 644
 645               if (pstr->is_utf8)
 646                 {
 647                   const unsigned char *raw, *p, *q, *end;
 648
 649                   /* Special case UTF-8.  Multi-byte chars start with any
 650                      byte other than 0x80 - 0xbf.  */
 651                   raw = pstr->raw_mbs + pstr->raw_mbs_idx;
 652                   end = raw + (offset - pstr->mb_cur_max);
 653                   for (p = raw + offset - 1; p >= end; --p)
 654                     if ((*p & 0xc0) != 0x80)
 655                       {
 656                         mbstate_t cur_state;
 657                         wchar_t wc2;
 658                         int mlen = raw + pstr->len - p;
 659                         unsigned char buf[6];
 660
 661                         q = p;
 662                         if (BE (pstr->trans != NULL, 0))
 663                           {
 664                             int i = mlen < 6 ? mlen : 6;
 665                             while (--i >= 0)
 666                               buf[i] = pstr->trans[p[i]];
 667                             q = buf;
 668                           }
 669                         /* XXX Don't use mbrtowc, we know which conversion
 670                            to use (UTF-8 -> UCS4).  */
 671                         memset (&cur_state, 0, sizeof (cur_state));
 672                         mlen = mbrtowc (&wc2, p, mlen, &cur_state)
 673                                - (raw + offset - p);
 674                         if (mlen >= 0)
 675                           {
 676                             memset (&pstr->cur_state, '\0',
 677                                     sizeof (mbstate_t));
 678                             pstr->valid_len = mlen;
 679                             wc = wc2;
 680                           }
 681                         break;
 682                       }
 683                 }
 684
 685               if (wc == WEOF)
 686                 pstr->valid_len = re_string_skip_chars (pstr, idx, &wc) - idx;
 687               if (BE (pstr->valid_len, 0))
 688                 {
 689                   for (wcs_idx = 0; wcs_idx < pstr->valid_len; ++wcs_idx)
 690                     pstr->wcs[wcs_idx] = WEOF;
 691                   if (pstr->mbs_allocated)
 692                     memset (pstr->mbs, 255, pstr->valid_len);
 693                 }
 694               pstr->valid_raw_len = pstr->valid_len;
 695               pstr->tip_context = ((BE (pstr->word_ops_used != 0, 0)
 696                                     && IS_WIDE_WORD_CHAR (wc))
 697                                    ? CONTEXT_WORD
 698                                    : ((IS_WIDE_NEWLINE (wc)
 699                                        && pstr->newline_anchor)
 700                                       ? CONTEXT_NEWLINE : 0));
 701             }
 702           else
 703 #endif /* RE_ENABLE_I18N */
 704             {
 705               int c = pstr->raw_mbs[pstr->raw_mbs_idx + offset - 1];
 706               if (pstr->trans)
 707                 c = pstr->trans[c];
 708               pstr->tip_context = (bitset_contain (pstr->word_char, c)
 709                                    ? CONTEXT_WORD
 710                                    : ((IS_NEWLINE (c) && pstr->newline_anchor)
 711                                       ? CONTEXT_NEWLINE : 0));
 712             }
 713         }
 714       if (!BE (pstr->mbs_allocated, 0))
 715         pstr->mbs += offset;
 716     }
 717   pstr->raw_mbs_idx = idx;
 718   pstr->len -= offset;
 719   pstr->stop -= offset;
 720
 721   /* Then build the buffers.  */
 722 #ifdef RE_ENABLE_I18N
 723   if (pstr->mb_cur_max > 1)
 724     {
 725       if (pstr->icase)
 726         {
 727           int ret = build_wcs_upper_buffer (pstr);
 728           if (BE (ret != REG_NOERROR, 0))
 729             return ret;
 730         }
 731       else
 732         build_wcs_buffer (pstr);
 733     }
 734   else
 735 #endif /* RE_ENABLE_I18N */
 736   if (BE (pstr->mbs_allocated, 0))
 737     {
 738       if (pstr->icase)
 739         build_upper_buffer (pstr);
 740       else if (pstr->trans != NULL)
 741         re_string_translate_buffer (pstr);
 742     }
 743   else
 744     pstr->valid_len = pstr->len;
 745
 746   pstr->cur_idx = 0;
 747   return REG_NOERROR;
 748 }
 749
 750 static unsigned char
 751 re_string_peek_byte_case (pstr, idx)
 752      const re_string_t *pstr;
 753      int idx;
 754 {
 755   int ch, off;
 756
 757   /* Handle the common (easiest) cases first.  */
 758   if (BE (!pstr->mbs_allocated, 1))
 759     return re_string_peek_byte (pstr, idx);
 760
 761 #ifdef RE_ENABLE_I18N
 762   if (pstr->mb_cur_max > 1
 763       && ! re_string_is_single_byte_char (pstr, pstr->cur_idx + idx))
 764     return re_string_peek_byte (pstr, idx);
 765 #endif
 766
 767   off = pstr->cur_idx + idx;
 768 #ifdef RE_ENABLE_I18N
 769   if (pstr->offsets_needed)
 770     off = pstr->offsets[off];
 771 #endif
 772
 773   ch = pstr->raw_mbs[pstr->raw_mbs_idx + off];
 774
 775 #ifdef RE_ENABLE_I18N
 776   /* Ensure that e.g. for tr_TR.UTF-8 BACKSLASH DOTLESS SMALL LETTER I
 777      this function returns CAPITAL LETTER I instead of first byte of
 778      DOTLESS SMALL LETTER I.  The latter would confuse the parser,
 779      since peek_byte_case doesn't advance cur_idx in any way.  */
 780   if (pstr->offsets_needed && !isascii (ch))
 781     return re_string_peek_byte (pstr, idx);
 782 #endif
 783
 784   return ch;
 785 }
 786
 787 static unsigned char
 788 re_string_fetch_byte_case (pstr)
 789      re_string_t *pstr;
 790 {
 791   if (BE (!pstr->mbs_allocated, 1))
 792     return re_string_fetch_byte (pstr);
 793
 794 #ifdef RE_ENABLE_I18N
 795   if (pstr->offsets_needed)
 796     {
 797       int off, ch;
 798
 799       /* For tr_TR.UTF-8 [[:islower:]] there is
 800          [[: CAPITAL LETTER I WITH DOT lower:]] in mbs.  Skip
 801          in that case the whole multi-byte character and return
 802          the original letter.  On the other side, with
 803          [[: DOTLESS SMALL LETTER I return [[:I, as doing
 804          anything else would complicate things too much.  */
 805
 806       if (!re_string_first_byte (pstr, pstr->cur_idx))
 807         return re_string_fetch_byte (pstr);
 808
 809       off = pstr->offsets[pstr->cur_idx];
 810       ch = pstr->raw_mbs[pstr->raw_mbs_idx + off];
 811
 812       if (! isascii (ch))
 813         return re_string_fetch_byte (pstr);
 814
 815       re_string_skip_bytes (pstr,
 816                             re_string_char_size_at (pstr, pstr->cur_idx));
 817       return ch;
 818     }
 819 #endif
 820
 821   return pstr->raw_mbs[pstr->raw_mbs_idx + pstr->cur_idx++];
 822 }
 823
 824 static void
 825 re_string_destruct (pstr)
 826      re_string_t *pstr;
 827 {
 828 #ifdef RE_ENABLE_I18N
 829   re_free (pstr->wcs);
 830   re_free (pstr->offsets);
 831 #endif /* RE_ENABLE_I18N  */
 832   if (pstr->mbs_allocated)
 833     re_free (pstr->mbs);
 834 }
 835
 836 /* Return the context at IDX in INPUT.  */
 837
 838 static unsigned int
 839 re_string_context_at (input, idx, eflags)
 840      const re_string_t *input;
 841      int idx, eflags;
 842 {
 843   int c;
 844   if (BE (idx < 0, 0))
 845     /* In this case, we use the value stored in input->tip_context,
 846        since we can't know the character in input->mbs[-1] here.  */
 847     return input->tip_context;
 848   if (BE (idx == input->len, 0))
 849     return ((eflags & REG_NOTEOL) ? CONTEXT_ENDBUF
 850             : CONTEXT_NEWLINE | CONTEXT_ENDBUF);
 851 #ifdef RE_ENABLE_I18N
 852   if (input->mb_cur_max > 1)
 853     {
 854       wint_t wc;
 855       int wc_idx = idx;
 856       while(input->wcs[wc_idx] == WEOF)
 857         {
 858 #ifdef DEBUG
 859           /* It must not happen.  */
 860           assert (wc_idx >= 0);
 861 #endif
 862           --wc_idx;
 863           if (wc_idx < 0)
 864             return input->tip_context;
 865         }
 866       wc = input->wcs[wc_idx];
 867       if (BE (input->word_ops_used != 0, 0) && IS_WIDE_WORD_CHAR (wc))
 868         return CONTEXT_WORD;
 869       return (IS_WIDE_NEWLINE (wc) && input->newline_anchor
 870               ? CONTEXT_NEWLINE : 0);
 871     }
 872   else
 873 #endif
 874     {
 875       c = re_string_byte_at (input, idx);
 876       if (bitset_contain (input->word_char, c))
 877         return CONTEXT_WORD;
 878       return IS_NEWLINE (c) && input->newline_anchor ? CONTEXT_NEWLINE : 0;
 879     }
 880 }
 881 \f
 882 /* Functions for set operation.  */
 883
 884 static reg_errcode_t
 885 re_node_set_alloc (set, size)
 886      re_node_set *set;
 887      int size;
 888 {
 889   set->alloc = size;
 890   set->nelem = 0;
 891   set->elems = re_malloc (int, size);
 892   if (BE (set->elems == NULL, 0))
 893     return REG_ESPACE;
 894   return REG_NOERROR;
 895 }
 896
 897 static reg_errcode_t
 898 re_node_set_init_1 (set, elem)
 899      re_node_set *set;
 900      int elem;
 901 {
 902   set->alloc = 1;
 903   set->nelem = 1;
 904   set->elems = re_malloc (int, 1);
 905   if (BE (set->elems == NULL, 0))
 906     {
 907       set->alloc = set->nelem = 0;
 908       return REG_ESPACE;
 909     }
 910   set->elems[0] = elem;
 911   return REG_NOERROR;
 912 }
 913
 914 static reg_errcode_t
 915 re_node_set_init_2 (set, elem1, elem2)
 916      re_node_set *set;
 917      int elem1, elem2;
 918 {
 919   set->alloc = 2;
 920   set->elems = re_malloc (int, 2);
 921   if (BE (set->elems == NULL, 0))
 922     return REG_ESPACE;
 923   if (elem1 == elem2)
 924     {
 925       set->nelem = 1;
 926       set->elems[0] = elem1;
 927     }
 928   else
 929     {
 930       set->nelem = 2;
 931       if (elem1 < elem2)
 932         {
 933           set->elems[0] = elem1;
 934           set->elems[1] = elem2;
 935         }
 936       else
 937         {
 938           set->elems[0] = elem2;
 939           set->elems[1] = elem1;
 940         }
 941     }
 942   return REG_NOERROR;
 943 }
 944
 945 static reg_errcode_t
 946 re_node_set_init_copy (dest, src)
 947      re_node_set *dest;
 948      const re_node_set *src;
 949 {
 950   dest->nelem = src->nelem;
 951   if (src->nelem > 0)
 952     {
 953       dest->alloc = dest->nelem;
 954       dest->elems = re_malloc (int, dest->alloc);
 955       if (BE (dest->elems == NULL, 0))
 956         {
 957           dest->alloc = dest->nelem = 0;
 958           return REG_ESPACE;
 959         }
 960       memcpy (dest->elems, src->elems, src->nelem * sizeof (int));
 961     }
 962   else
 963     re_node_set_init_empty (dest);
 964   return REG_NOERROR;
 965 }
 966
 967 /* Calculate the intersection of the sets SRC1 and SRC2. And merge it to
 968    DEST. Return value indicate the error code or REG_NOERROR if succeeded.
 969    Note: We assume dest->elems is NULL, when dest->alloc is 0.  */
 970
 971 static reg_errcode_t
 972 re_node_set_add_intersect (dest, src1, src2)
 973      re_node_set *dest;
 974      const re_node_set *src1, *src2;
 975 {
 976   int i1, i2, is, id, delta, sbase;
 977   if (src1->nelem == 0 || src2->nelem == 0)
 978     return REG_NOERROR;
 979
 980   /* We need dest->nelem + 2 * elems_in_intersection; this is a
 981      conservative estimate.  */
 982   if (src1->nelem + src2->nelem + dest->nelem > dest->alloc)
 983     {
 984       int new_alloc = src1->nelem + src2->nelem + dest->alloc;
 985       int *new_elems = re_realloc (dest->elems, int, new_alloc);
 986       if (BE (new_elems == NULL, 0))
 987         return REG_ESPACE;
 988       dest->elems = new_elems;
 989       dest->alloc = new_alloc;
 990     }
 991
 992   /* Find the items in the intersection of SRC1 and SRC2, and copy
 993      into the top of DEST those that are not already in DEST itself.  */
 994   sbase = dest->nelem + src1->nelem + src2->nelem;
 995   i1 = src1->nelem - 1;
 996   i2 = src2->nelem - 1;
 997   id = dest->nelem - 1;
 998   for (;;)
 999     {
1000       if (src1->elems[i1] == src2->elems[i2])
1001         {
1002           /* Try to find the item in DEST.  Maybe we could binary search?  */
1003           while (id >= 0 && dest->elems[id] > src1->elems[i1])
1004             --id;
1005
1006           if (id < 0 || dest->elems[id] != src1->elems[i1])
1007             dest->elems[--sbase] = src1->elems[i1];
1008
1009           if (--i1 < 0 || --i2 < 0)
1010             break;
1011         }
1012
1013       /* Lower the highest of the two items.  */
1014       else if (src1->elems[i1] < src2->elems[i2])
1015         {
1016           if (--i2 < 0)
1017             break;
1018         }
1019       else
1020         {
1021           if (--i1 < 0)
1022             break;
1023         }
1024     }
1025
1026   id = dest->nelem - 1;
1027   is = dest->nelem + src1->nelem + src2->nelem - 1;
1028   delta = is - sbase + 1;
1029
1030   /* Now copy.  When DELTA becomes zero, the remaining
1031      DEST elements are already in place; this is more or
1032      less the same loop that is in re_node_set_merge.  */
1033   dest->nelem += delta;
1034   if (delta > 0 && id >= 0)
1035     for (;;)
1036       {
1037         if (dest->elems[is] > dest->elems[id])
1038           {
1039             /* Copy from the top.  */
1040             dest->elems[id + delta--] = dest->elems[is--];
1041             if (delta == 0)
1042               break;
1043           }
1044         else
1045           {
1046             /* Slide from the bottom.  */
1047             dest->elems[id + delta] = dest->elems[id];
1048             if (--id < 0)
1049               break;
1050           }
1051       }
1052
1053   /* Copy remaining SRC elements.  */
1054   memcpy (dest->elems, dest->elems + sbase, delta * sizeof (int));
1055
1056   return REG_NOERROR;
1057 }
1058
1059 /* Calculate the union set of the sets SRC1 and SRC2. And store it to
1060    DEST. Return value indicate the error code or REG_NOERROR if succeeded.  */
1061
1062 static reg_errcode_t
1063 re_node_set_init_union (dest, src1, src2)
1064      re_node_set *dest;
1065      const re_node_set *src1, *src2;
1066 {
1067   int i1, i2, id;
1068   if (src1 != NULL && src1->nelem > 0 && src2 != NULL && src2->nelem > 0)
1069     {
1070       dest->alloc = src1->nelem + src2->nelem;
1071       dest->elems = re_malloc (int, dest->alloc);
1072       if (BE (dest->elems == NULL, 0))
1073         return REG_ESPACE;
1074     }
1075   else
1076     {
1077       if (src1 != NULL && src1->nelem > 0)
1078         return re_node_set_init_copy (dest, src1);
1079       else if (src2 != NULL && src2->nelem > 0)
1080         return re_node_set_init_copy (dest, src2);
1081       else
1082         re_node_set_init_empty (dest);
1083       return REG_NOERROR;
1084     }
1085   for (i1 = i2 = id = 0 ; i1 < src1->nelem && i2 < src2->nelem ;)
1086     {
1087       if (src1->elems[i1] > src2->elems[i2])
1088         {
1089           dest->elems[id++] = src2->elems[i2++];
1090           continue;
1091         }
1092       if (src1->elems[i1] == src2->elems[i2])
1093         ++i2;
1094       dest->elems[id++] = src1->elems[i1++];
1095     }
1096   if (i1 < src1->nelem)
1097     {
1098       memcpy (dest->elems + id, src1->elems + i1,
1099              (src1->nelem - i1) * sizeof (int));
1100       id += src1->nelem - i1;
1101     }
1102   else if (i2 < src2->nelem)
1103     {
1104       memcpy (dest->elems + id, src2->elems + i2,
1105              (src2->nelem - i2) * sizeof (int));
1106       id += src2->nelem - i2;
1107     }
1108   dest->nelem = id;
1109   return REG_NOERROR;
1110 }
1111
1112 /* Calculate the union set of the sets DEST and SRC. And store it to
1113    DEST. Return value indicate the error code or REG_NOERROR if succeeded.  */
1114
1115 static reg_errcode_t
1116 re_node_set_merge (dest, src)
1117      re_node_set *dest;
1118      const re_node_set *src;
1119 {
1120   int is, id, sbase, delta;
1121   if (src == NULL || src->nelem == 0)
1122     return REG_NOERROR;
1123   if (dest->alloc < 2 * src->nelem + dest->nelem)
1124     {
1125       int new_alloc = 2 * (src->nelem + dest->alloc);
1126       int *new_buffer = re_realloc (dest->elems, int, new_alloc);
1127       if (BE (new_buffer == NULL, 0))
1128         return REG_ESPACE;
1129       dest->elems = new_buffer;
1130       dest->alloc = new_alloc;
1131     }
1132
1133   if (BE (dest->nelem == 0, 0))
1134     {
1135       dest->nelem = src->nelem;
1136       memcpy (dest->elems, src->elems, src->nelem * sizeof (int));
1137       return REG_NOERROR;
1138     }
1139
1140   /* Copy into the top of DEST the items of SRC that are not
1141      found in DEST.  Maybe we could binary search in DEST?  */
1142   for (sbase = dest->nelem + 2 * src->nelem,
1143        is = src->nelem - 1, id = dest->nelem - 1; is >= 0 && id >= 0; )
1144     {
1145       if (dest->elems[id] == src->elems[is])
1146         is--, id--;
1147       else if (dest->elems[id] < src->elems[is])
1148         dest->elems[--sbase] = src->elems[is--];
1149       else /* if (dest->elems[id] > src->elems[is]) */
1150         --id;
1151     }
1152
1153   if (is >= 0)
1154     {
1155       /* If DEST is exhausted, the remaining items of SRC must be unique.  */
1156       sbase -= is + 1;
1157       memcpy (dest->elems + sbase, src->elems, (is + 1) * sizeof (int));
1158     }
1159
1160   id = dest->nelem - 1;
1161   is = dest->nelem + 2 * src->nelem - 1;
1162   delta = is - sbase + 1;
1163   if (delta == 0)
1164     return REG_NOERROR;
1165
1166   /* Now copy.  When DELTA becomes zero, the remaining
1167      DEST elements are already in place.  */
1168   dest->nelem += delta;
1169   for (;;)
1170     {
1171       if (dest->elems[is] > dest->elems[id])
1172         {
1173           /* Copy from the top.  */
1174           dest->elems[id + delta--] = dest->elems[is--];
1175           if (delta == 0)
1176             break;
1177         }
1178       else
1179         {
1180           /* Slide from the bottom.  */
1181           dest->elems[id + delta] = dest->elems[id];
1182           if (--id < 0)
1183             {
1184               /* Copy remaining SRC elements.  */
1185               memcpy (dest->elems, dest->elems + sbase,
1186                       delta * sizeof (int));
1187               break;
1188             }
1189         }
1190     }
1191
1192   return REG_NOERROR;
1193 }
1194
1195 /* Insert the new element ELEM to the re_node_set* SET.
1196    SET should not already have ELEM.
1197    return -1 if an error is occured, return 1 otherwise.  */
1198
1199 static int
1200 re_node_set_insert (set, elem)
1201      re_node_set *set;
1202      int elem;
1203 {
1204   int idx;
1205   /* In case the set is empty.  */
1206   if (set->alloc == 0)
1207     {
1208       if (BE (re_node_set_init_1 (set, elem) == REG_NOERROR, 1))
1209         return 1;
1210       else
1211         return -1;
1212     }
1213
1214   if (BE (set->nelem, 0) == 0)
1215     {
1216       /* We already guaranteed above that set->alloc != 0.  */
1217       set->elems[0] = elem;
1218       ++set->nelem;
1219       return 1;
1220     }
1221
1222   /* Realloc if we need.  */
1223   if (set->alloc == set->nelem)
1224     {
1225       int *new_array;
1226       set->alloc = set->alloc * 2;
1227       new_array = re_realloc (set->elems, int, set->alloc);
1228       if (BE (new_array == NULL, 0))
1229         return -1;
1230       set->elems = new_array;
1231     }
1232
1233   /* Move the elements which follows the new element.  Test the
1234      first element separately to skip a check in the inner loop.  */
1235   if (elem < set->elems[0])
1236     {
1237       idx = 0;
1238       for (idx = set->nelem; idx > 0; idx--)
1239         set->elems[idx] = set->elems[idx - 1];
1240     }
1241   else
1242     {
1243       for (idx = set->nelem; set->elems[idx - 1] > elem; idx--)
1244         set->elems[idx] = set->elems[idx - 1];
1245     }
1246
1247   /* Insert the new element.  */
1248   set->elems[idx] = elem;
1249   ++set->nelem;
1250   return 1;
1251 }
1252
1253 /* Insert the new element ELEM to the re_node_set* SET.
1254    SET should not already have any element greater than or equal to ELEM.
1255    Return -1 if an error is occured, return 1 otherwise.  */
1256
1257 static int
1258 re_node_set_insert_last (set, elem)
1259      re_node_set *set;
1260      int elem;
1261 {
1262   /* Realloc if we need.  */
1263   if (set->alloc == set->nelem)
1264     {
1265       int *new_array;
1266       set->alloc = (set->alloc + 1) * 2;
1267       new_array = re_realloc (set->elems, int, set->alloc);
1268       if (BE (new_array == NULL, 0))
1269         return -1;
1270       set->elems = new_array;
1271     }
1272
1273   /* Insert the new element.  */
1274   set->elems[set->nelem++] = elem;
1275   return 1;
1276 }
1277
1278 /* Compare two node sets SET1 and SET2.
1279    return 1 if SET1 and SET2 are equivalent, return 0 otherwise.  */
1280
1281 static int
1282 re_node_set_compare (set1, set2)
1283      const re_node_set *set1, *set2;
1284 {
1285   int i;
1286   if (set1 == NULL || set2 == NULL || set1->nelem != set2->nelem)
1287     return 0;
1288   for (i = set1->nelem ; --i >= 0 ; )
1289     if (set1->elems[i] != set2->elems[i])
1290       return 0;
1291   return 1;
1292 }
1293
1294 /* Return (idx + 1) if SET contains the element ELEM, return 0 otherwise.  */
1295
1296 static int
1297 re_node_set_contains (set, elem)
1298      const re_node_set *set;
1299      int elem;
1300 {
1301   int idx, right, mid;
1302   if (set->nelem <= 0)
1303     return 0;
1304
1305   /* Binary search the element.  */
1306   idx = 0;
1307   right = set->nelem - 1;
1308   while (idx < right)
1309     {
1310       mid = (idx + right) / 2;
1311       if (set->elems[mid] < elem)
1312         idx = mid + 1;
1313       else
1314         right = mid;
1315     }
1316   return set->elems[idx] == elem ? idx + 1 : 0;
1317 }
1318
1319 static void
1320 re_node_set_remove_at (set, idx)
1321      re_node_set *set;
1322      int idx;
1323 {
1324   if (idx < 0 || idx >= set->nelem)
1325     return;
1326   --set->nelem;
1327   for (; idx < set->nelem; idx++)
1328     set->elems[idx] = set->elems[idx + 1];
1329 }
1330 \f
1331
1332 /* Add the token TOKEN to dfa->nodes, and return the index of the token.
1333    Or return -1, if an error will be occured.  */
1334
1335 static int
1336 re_dfa_add_node (dfa, token, mode)
1337      re_dfa_t *dfa;
1338      re_token_t token;
1339      int mode;
1340 {
1341   if (BE (dfa->nodes_len >= dfa->nodes_alloc, 0))
1342     {
1343       int new_nodes_alloc = dfa->nodes_alloc * 2;
1344       re_token_t *new_array = re_realloc (dfa->nodes, re_token_t,
1345                                           new_nodes_alloc);
1346       if (BE (new_array == NULL, 0))
1347         return -1;
1348       dfa->nodes = new_array;
1349       if (mode)
1350         {
1351           int *new_nexts, *new_indices;
1352           re_node_set *new_edests, *new_eclosures, *new_inveclosures;
1353
1354           new_nexts = re_realloc (dfa->nexts, int, new_nodes_alloc);
1355           new_indices = re_realloc (dfa->org_indices, int, new_nodes_alloc);
1356           new_edests = re_realloc (dfa->edests, re_node_set, new_nodes_alloc);
1357           new_eclosures = re_realloc (dfa->eclosures, re_node_set,
1358                                       new_nodes_alloc);
1359           new_inveclosures = re_realloc (dfa->inveclosures, re_node_set,
1360                                          new_nodes_alloc);
1361           if (BE (new_nexts == NULL || new_indices == NULL
1362                   || new_edests == NULL || new_eclosures == NULL
1363                   || new_inveclosures == NULL, 0))
1364             return -1;
1365           dfa->nexts = new_nexts;
1366           dfa->org_indices = new_indices;
1367           dfa->edests = new_edests;
1368           dfa->eclosures = new_eclosures;
1369           dfa->inveclosures = new_inveclosures;
1370         }
1371       dfa->nodes_alloc = new_nodes_alloc;
1372     }
1373   dfa->nodes[dfa->nodes_len] = token;
1374   dfa->nodes[dfa->nodes_len].opt_subexp = 0;
1375   dfa->nodes[dfa->nodes_len].duplicated = 0;
1376   dfa->nodes[dfa->nodes_len].constraint = 0;
1377   return dfa->nodes_len++;
1378 }
1379
1380 static unsigned int inline
1381 calc_state_hash (nodes, context)
1382      const re_node_set *nodes;
1383      unsigned int context;
1384 {
1385   unsigned int hash = nodes->nelem + context;
1386   int i;
1387   for (i = 0 ; i < nodes->nelem ; i++)
1388     hash += nodes->elems[i];
1389   return hash;
1390 }
1391
1392 /* Search for the state whose node_set is equivalent to NODES.
1393    Return the pointer to the state, if we found it in the DFA.
1394    Otherwise create the new one and return it.  In case of an error
1395    return NULL and set the error code in ERR.
1396    Note: - We assume NULL as the invalid state, then it is possible that
1397            return value is NULL and ERR is REG_NOERROR.
1398          - We never return non-NULL value in case of any errors, it is for
1399            optimization.  */
1400
1401 static re_dfastate_t*
1402 re_acquire_state (err, dfa, nodes)
1403      reg_errcode_t *err;
1404      re_dfa_t *dfa;
1405      const re_node_set *nodes;
1406 {
1407   unsigned int hash;
1408   re_dfastate_t *new_state;
1409   struct re_state_table_entry *spot;
1410   int i;
1411   if (BE (nodes->nelem == 0, 0))
1412     {
1413       *err = REG_NOERROR;
1414       return NULL;
1415     }
1416   hash = calc_state_hash (nodes, 0);
1417   spot = dfa->state_table + (hash & dfa->state_hash_mask);
1418
1419   for (i = 0 ; i < spot->num ; i++)
1420     {
1421       re_dfastate_t *state = spot->array[i];
1422       if (hash != state->hash)
1423         continue;
1424       if (re_node_set_compare (&state->nodes, nodes))
1425         return state;
1426     }
1427
1428   /* There are no appropriate state in the dfa, create the new one.  */
1429   new_state = create_ci_newstate (dfa, nodes, hash);
1430   if (BE (new_state != NULL, 1))
1431     return new_state;
1432   else
1433     {
1434       *err = REG_ESPACE;
1435       return NULL;
1436     }
1437 }
1438
1439 /* Search for the state whose node_set is equivalent to NODES and
1440    whose context is equivalent to CONTEXT.
1441    Return the pointer to the state, if we found it in the DFA.
1442    Otherwise create the new one and return it.  In case of an error
1443    return NULL and set the error code in ERR.
1444    Note: - We assume NULL as the invalid state, then it is possible that
1445            return value is NULL and ERR is REG_NOERROR.
1446          - We never return non-NULL value in case of any errors, it is for
1447            optimization.  */
1448
1449 static re_dfastate_t*
1450 re_acquire_state_context (err, dfa, nodes, context)
1451      reg_errcode_t *err;
1452      re_dfa_t *dfa;
1453      const re_node_set *nodes;
1454      unsigned int context;
1455 {
1456   unsigned int hash;
1457   re_dfastate_t *new_state;
1458   struct re_state_table_entry *spot;
1459   int i;
1460   if (nodes->nelem == 0)
1461     {
1462       *err = REG_NOERROR;
1463       return NULL;
1464     }
1465   hash = calc_state_hash (nodes, context);
1466   spot = dfa->state_table + (hash & dfa->state_hash_mask);
1467
1468   for (i = 0 ; i < spot->num ; i++)
1469     {
1470       re_dfastate_t *state = spot->array[i];
1471       if (state->hash == hash
1472           && state->context == context
1473           && re_node_set_compare (state->entrance_nodes, nodes))
1474         return state;
1475     }
1476   /* There are no appropriate state in `dfa', create the new one.  */
1477   new_state = create_cd_newstate (dfa, nodes, context, hash);
1478   if (BE (new_state != NULL, 1))
1479     return new_state;
1480   else
1481     {
1482       *err = REG_ESPACE;
1483       return NULL;
1484     }
1485 }
1486
1487 /* Allocate memory for DFA state and initialize common properties.
1488    Return the new state if succeeded, otherwise return NULL.  */
1489
1490 static re_dfastate_t *
1491 create_newstate_common (dfa, nodes, hash)
1492      re_dfa_t *dfa;
1493      const re_node_set *nodes;
1494      unsigned int hash;
1495 {
1496   re_dfastate_t *newstate;
1497   reg_errcode_t err;
1498   newstate = (re_dfastate_t *) calloc (sizeof (re_dfastate_t), 1);
1499   if (BE (newstate == NULL, 0))
1500     return NULL;
1501   err = re_node_set_init_copy (&newstate->nodes, nodes);
1502   if (BE (err != REG_NOERROR, 0))
1503     {
1504       re_free (newstate);
1505       return NULL;
1506     }
1507   newstate->trtable = NULL;
1508   newstate->hash = hash;
1509   return newstate;
1510 }
1511
1512 /* Store the new state NEWSTATE whose hash value is HASH in appropriate
1513    position.  Return value indicate the error code if failed.  */
1514
1515 static reg_errcode_t
1516 register_state (dfa, newstate, hash)
1517      re_dfa_t *dfa;
1518      re_dfastate_t *newstate;
1519      unsigned int hash;
1520 {
1521   struct re_state_table_entry *spot;
1522   spot = dfa->state_table + (hash & dfa->state_hash_mask);
1523
1524   if (BE (spot->alloc <= spot->num, 0))
1525     {
1526       int new_alloc = 2 * spot->num + 2;
1527       re_dfastate_t **new_array = re_realloc (spot->array, re_dfastate_t *,
1528                                               new_alloc);
1529       if (BE (new_array == NULL, 0))
1530         return REG_ESPACE;
1531       spot->array = new_array;
1532       spot->alloc = new_alloc;
1533     }
1534   spot->array[spot->num++] = newstate;
1535   return REG_NOERROR;
1536 }
1537
1538 /* Create the new state which is independ of contexts.
1539    Return the new state if succeeded, otherwise return NULL.  */
1540
1541 static re_dfastate_t *
1542 create_ci_newstate (dfa, nodes, hash)
1543      re_dfa_t *dfa;
1544      const re_node_set *nodes;
1545      unsigned int hash;
1546 {
1547   int i;
1548   reg_errcode_t err;
1549   re_dfastate_t *newstate;
1550   newstate = create_newstate_common (dfa, nodes, hash);
1551   if (BE (newstate == NULL, 0))
1552     return NULL;
1553   newstate->entrance_nodes = &newstate->nodes;
1554
1555   for (i = 0 ; i < nodes->nelem ; i++)
1556     {
1557       re_token_t *node = dfa->nodes + nodes->elems[i];
1558       re_token_type_t type = node->type;
1559       if (type == CHARACTER && !node->constraint)
1560         continue;
1561
1562       /* If the state has the halt node, the state is a halt state.  */
1563       else if (type == END_OF_RE)
1564         newstate->halt = 1;
1565 #ifdef RE_ENABLE_I18N
1566       else if (type == COMPLEX_BRACKET
1567                || type == OP_UTF8_PERIOD
1568                || (type == OP_PERIOD && dfa->mb_cur_max > 1))
1569         newstate->accept_mb = 1;
1570 #endif /* RE_ENABLE_I18N */
1571       else if (type == OP_BACK_REF)
1572         newstate->has_backref = 1;
1573       else if (type == ANCHOR || node->constraint)
1574         newstate->has_constraint = 1;
1575     }
1576   err = register_state (dfa, newstate, hash);
1577   if (BE (err != REG_NOERROR, 0))
1578     {
1579       free_state (newstate);
1580       newstate = NULL;
1581     }
1582   return newstate;
1583 }
1584
1585 /* Create the new state which is depend on the context CONTEXT.
1586    Return the new state if succeeded, otherwise return NULL.  */
1587
1588 static re_dfastate_t *
1589 create_cd_newstate (dfa, nodes, context, hash)
1590      re_dfa_t *dfa;
1591      const re_node_set *nodes;
1592      unsigned int context, hash;
1593 {
1594   int i, nctx_nodes = 0;
1595   reg_errcode_t err;
1596   re_dfastate_t *newstate;
1597
1598   newstate = create_newstate_common (dfa, nodes, hash);
1599   if (BE (newstate == NULL, 0))
1600     return NULL;
1601   newstate->context = context;
1602   newstate->entrance_nodes = &newstate->nodes;
1603
1604   for (i = 0 ; i < nodes->nelem ; i++)
1605     {
1606       unsigned int constraint = 0;
1607       re_token_t *node = dfa->nodes + nodes->elems[i];
1608       re_token_type_t type = node->type;
1609       if (node->constraint)
1610         constraint = node->constraint;
1611
1612       if (type == CHARACTER && !constraint)
1613         continue;
1614       /* If the state has the halt node, the state is a halt state.  */
1615       else if (type == END_OF_RE)
1616         newstate->halt = 1;
1617 #ifdef RE_ENABLE_I18N
1618       else if (type == COMPLEX_BRACKET
1619                || type == OP_UTF8_PERIOD
1620                || (type == OP_PERIOD && dfa->mb_cur_max > 1))
1621         newstate->accept_mb = 1;
1622 #endif /* RE_ENABLE_I18N */
1623       else if (type == OP_BACK_REF)
1624         newstate->has_backref = 1;
1625       else if (type == ANCHOR)
1626         constraint = node->opr.ctx_type;
1627
1628       if (constraint)
1629         {
1630           if (newstate->entrance_nodes == &newstate->nodes)
1631             {
1632               newstate->entrance_nodes = re_malloc (re_node_set, 1);
1633               if (BE (newstate->entrance_nodes == NULL, 0))
1634                 {
1635                   free_state (newstate);
1636                   return NULL;
1637                 }
1638               re_node_set_init_copy (newstate->entrance_nodes, nodes);
1639               nctx_nodes = 0;
1640               newstate->has_constraint = 1;
1641             }
1642
1643           if (NOT_SATISFY_PREV_CONSTRAINT (constraint,context))
1644             {
1645               re_node_set_remove_at (&newstate->nodes, i - nctx_nodes);
1646               ++nctx_nodes;
1647             }
1648         }
1649     }
1650   err = register_state (dfa, newstate, hash);
1651   if (BE (err != REG_NOERROR, 0))
1652     {
1653       free_state (newstate);
1654       newstate = NULL;
1655     }
1656   return  newstate;
1657 }
1658
1659 static void
1660 free_state (state)
1661      re_dfastate_t *state;
1662 {
1663   if (state->entrance_nodes != &state->nodes)
1664     {
1665       re_node_set_free (state->entrance_nodes);
1666       re_free (state->entrance_nodes);
1667     }
1668   re_node_set_free (&state->nodes);
1669   re_free (state->trtable);
1670   re_free (state);
1671 }