posix/regex_internal.c

   1 /* Extended regular expression matching and search library.
   2    Copyright (C) 2002, 2003, 2004, 2005 Free Software Foundation, Inc.
   3    This file is part of the GNU C Library.
   4    Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
   5
   6    The GNU C Library is free software; you can redistribute it and/or
   7    modify it under the terms of the GNU Lesser General Public
   8    License as published by the Free Software Foundation; either
   9    version 2.1 of the License, or (at your option) any later version.
  10
  11    The GNU C Library is distributed in the hope that it will be useful,
  12    but WITHOUT ANY WARRANTY; without even the implied warranty of
  13    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  14    Lesser General Public License for more details.
  15
  16    You should have received a copy of the GNU Lesser General Public
  17    License along with the GNU C Library; if not, write to the Free
  18    Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
  19    02111-1307 USA.  */
  20
  21 static void re_string_construct_common (const char *str, int len,
  22                                         re_string_t *pstr,
  23                                         RE_TRANSLATE_TYPE trans, int icase,
  24                                         const re_dfa_t *dfa) internal_function;
  25 static re_dfastate_t *create_ci_newstate (const re_dfa_t *dfa,
  26                                           const re_node_set *nodes,
  27                                           unsigned int hash) internal_function;
  28 static re_dfastate_t *create_cd_newstate (const re_dfa_t *dfa,
  29                                           const re_node_set *nodes,
  30                                           unsigned int context,
  31                                           unsigned int hash) internal_function;
  32 \f
  33 /* Functions for string operation.  */
  34
  35 /* This function allocate the buffers.  It is necessary to call
  36    re_string_reconstruct before using the object.  */
  37
  38 static reg_errcode_t
  39 internal_function
  40 re_string_allocate (re_string_t *pstr, const char *str, int len, int init_len,
  41                     RE_TRANSLATE_TYPE trans, int icase, const re_dfa_t *dfa)
  42 {
  43   reg_errcode_t ret;
  44   int init_buf_len;
  45
  46   /* Ensure at least one character fits into the buffers.  */
  47   if (init_len < dfa->mb_cur_max)
  48     init_len = dfa->mb_cur_max;
  49   init_buf_len = (len + 1 < init_len) ? len + 1: init_len;
  50   re_string_construct_common (str, len, pstr, trans, icase, dfa);
  51
  52   ret = re_string_realloc_buffers (pstr, init_buf_len);
  53   if (BE (ret != REG_NOERROR, 0))
  54     return ret;
  55
  56   pstr->word_char = dfa->word_char;
  57   pstr->word_ops_used = dfa->word_ops_used;
  58   pstr->mbs = pstr->mbs_allocated ? pstr->mbs : (unsigned char *) str;
  59   pstr->valid_len = (pstr->mbs_allocated || dfa->mb_cur_max > 1) ? 0 : len;
  60   pstr->valid_raw_len = pstr->valid_len;
  61   return REG_NOERROR;
  62 }
  63
  64 /* This function allocate the buffers, and initialize them.  */
  65
  66 static reg_errcode_t
  67 internal_function
  68 re_string_construct (re_string_t *pstr, const char *str, int len,
  69                      RE_TRANSLATE_TYPE trans, int icase, const re_dfa_t *dfa)
  70 {
  71   reg_errcode_t ret;
  72   memset (pstr, '\0', sizeof (re_string_t));
  73   re_string_construct_common (str, len, pstr, trans, icase, dfa);
  74
  75   if (len > 0)
  76     {
  77       ret = re_string_realloc_buffers (pstr, len + 1);
  78       if (BE (ret != REG_NOERROR, 0))
  79         return ret;
  80     }
  81   pstr->mbs = pstr->mbs_allocated ? pstr->mbs : (unsigned char *) str;
  82
  83   if (icase)
  84     {
  85 #ifdef RE_ENABLE_I18N
  86       if (dfa->mb_cur_max > 1)
  87         {
  88           while (1)
  89             {
  90               ret = build_wcs_upper_buffer (pstr);
  91               if (BE (ret != REG_NOERROR, 0))
  92                 return ret;
  93               if (pstr->valid_raw_len >= len)
  94                 break;
  95               if (pstr->bufs_len > pstr->valid_len + dfa->mb_cur_max)
  96                 break;
  97               ret = re_string_realloc_buffers (pstr, pstr->bufs_len * 2);
  98               if (BE (ret != REG_NOERROR, 0))
  99                 return ret;
 100             }
 101         }
 102       else
 103 #endif /* RE_ENABLE_I18N  */
 104         build_upper_buffer (pstr);
 105     }
 106   else
 107     {
 108 #ifdef RE_ENABLE_I18N
 109       if (dfa->mb_cur_max > 1)
 110         build_wcs_buffer (pstr);
 111       else
 112 #endif /* RE_ENABLE_I18N  */
 113         {
 114           if (trans != NULL)
 115             re_string_translate_buffer (pstr);
 116           else
 117             {
 118               pstr->valid_len = pstr->bufs_len;
 119               pstr->valid_raw_len = pstr->bufs_len;
 120             }
 121         }
 122     }
 123
 124   return REG_NOERROR;
 125 }
 126
 127 /* Helper functions for re_string_allocate, and re_string_construct.  */
 128
 129 static reg_errcode_t
 130 re_string_realloc_buffers (re_string_t *pstr, int new_buf_len)
 131 {
 132 #ifdef RE_ENABLE_I18N
 133   if (pstr->mb_cur_max > 1)
 134     {
 135       wint_t *new_wcs = re_realloc (pstr->wcs, wint_t, new_buf_len);
 136       if (BE (new_wcs == NULL, 0))
 137         return REG_ESPACE;
 138       pstr->wcs = new_wcs;
 139       if (pstr->offsets != NULL)
 140         {
 141           int *new_offsets = re_realloc (pstr->offsets, int, new_buf_len);
 142           if (BE (new_offsets == NULL, 0))
 143             return REG_ESPACE;
 144           pstr->offsets = new_offsets;
 145         }
 146     }
 147 #endif /* RE_ENABLE_I18N  */
 148   if (pstr->mbs_allocated)
 149     {
 150       unsigned char *new_mbs = re_realloc (pstr->mbs, unsigned char,
 151                                            new_buf_len);
 152       if (BE (new_mbs == NULL, 0))
 153         return REG_ESPACE;
 154       pstr->mbs = new_mbs;
 155     }
 156   pstr->bufs_len = new_buf_len;
 157   return REG_NOERROR;
 158 }
 159
 160
 161 static void
 162 re_string_construct_common (const char *str, int len, re_string_t *pstr,
 163                             RE_TRANSLATE_TYPE trans, int icase,
 164                             const re_dfa_t *dfa)
 165 {
 166   pstr->raw_mbs = (const unsigned char *) str;
 167   pstr->len = len;
 168   pstr->raw_len = len;
 169   pstr->trans = trans;
 170   pstr->icase = icase ? 1 : 0;
 171   pstr->mbs_allocated = (trans != NULL || icase);
 172   pstr->mb_cur_max = dfa->mb_cur_max;
 173   pstr->is_utf8 = dfa->is_utf8;
 174   pstr->map_notascii = dfa->map_notascii;
 175   pstr->stop = pstr->len;
 176   pstr->raw_stop = pstr->stop;
 177 }
 178
 179 #ifdef RE_ENABLE_I18N
 180
 181 /* Build wide character buffer PSTR->WCS.
 182    If the byte sequence of the string are:
 183      <mb1>(0), <mb1>(1), <mb2>(0), <mb2>(1), <sb3>
 184    Then wide character buffer will be:
 185      <wc1>   , WEOF    , <wc2>   , WEOF    , <wc3>
 186    We use WEOF for padding, they indicate that the position isn't
 187    a first byte of a multibyte character.
 188
 189    Note that this function assumes PSTR->VALID_LEN elements are already
 190    built and starts from PSTR->VALID_LEN.  */
 191
 192 static void
 193 build_wcs_buffer (re_string_t *pstr)
 194 {
 195 #ifdef _LIBC
 196   unsigned char buf[MB_LEN_MAX];
 197   assert (MB_LEN_MAX >= pstr->mb_cur_max);
 198 #else
 199   unsigned char buf[64];
 200 #endif
 201   mbstate_t prev_st;
 202   int byte_idx, end_idx, remain_len;
 203   size_t mbclen;
 204
 205   /* Build the buffers from pstr->valid_len to either pstr->len or
 206      pstr->bufs_len.  */
 207   end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
 208   for (byte_idx = pstr->valid_len; byte_idx < end_idx;)
 209     {
 210       wchar_t wc;
 211       const char *p;
 212
 213       remain_len = end_idx - byte_idx;
 214       prev_st = pstr->cur_state;
 215       /* Apply the translation if we need.  */
 216       if (BE (pstr->trans != NULL, 0))
 217         {
 218           int i, ch;
 219
 220           for (i = 0; i < pstr->mb_cur_max && i < remain_len; ++i)
 221             {
 222               ch = pstr->raw_mbs [pstr->raw_mbs_idx + byte_idx + i];
 223               buf[i] = pstr->mbs[byte_idx + i] = pstr->trans[ch];
 224             }
 225           p = (const char *) buf;
 226         }
 227       else
 228         p = (const char *) pstr->raw_mbs + pstr->raw_mbs_idx + byte_idx;
 229       mbclen = mbrtowc (&wc, p, remain_len, &pstr->cur_state);
 230       if (BE (mbclen == (size_t) -2, 0))
 231         {
 232           /* The buffer doesn't have enough space, finish to build.  */
 233           pstr->cur_state = prev_st;
 234           break;
 235         }
 236       else if (BE (mbclen == (size_t) -1 || mbclen == 0, 0))
 237         {
 238           /* We treat these cases as a singlebyte character.  */
 239           mbclen = 1;
 240           wc = (wchar_t) pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx];
 241           if (BE (pstr->trans != NULL, 0))
 242             wc = pstr->trans[wc];
 243           pstr->cur_state = prev_st;
 244         }
 245
 246       /* Write wide character and padding.  */
 247       pstr->wcs[byte_idx++] = wc;
 248       /* Write paddings.  */
 249       for (remain_len = byte_idx + mbclen - 1; byte_idx < remain_len ;)
 250         pstr->wcs[byte_idx++] = WEOF;
 251     }
 252   pstr->valid_len = byte_idx;
 253   pstr->valid_raw_len = byte_idx;
 254 }
 255
 256 /* Build wide character buffer PSTR->WCS like build_wcs_buffer,
 257    but for REG_ICASE.  */
 258
 259 static int
 260 build_wcs_upper_buffer (re_string_t *pstr)
 261 {
 262   mbstate_t prev_st;
 263   int src_idx, byte_idx, end_idx, remain_len;
 264   size_t mbclen;
 265 #ifdef _LIBC
 266   char buf[MB_LEN_MAX];
 267   assert (MB_LEN_MAX >= pstr->mb_cur_max);
 268 #else
 269   char buf[64];
 270 #endif
 271
 272   byte_idx = pstr->valid_len;
 273   end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
 274
 275   /* The following optimization assumes that ASCII characters can be
 276      mapped to wide characters with a simple cast.  */
 277   if (! pstr->map_notascii && pstr->trans == NULL && !pstr->offsets_needed)
 278     {
 279       while (byte_idx < end_idx)
 280         {
 281           wchar_t wc;
 282
 283           if (isascii (pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx])
 284               && mbsinit (&pstr->cur_state))
 285             {
 286               /* In case of a singlebyte character.  */
 287               pstr->mbs[byte_idx]
 288                 = toupper (pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx]);
 289               /* The next step uses the assumption that wchar_t is encoded
 290                  ASCII-safe: all ASCII values can be converted like this.  */
 291               pstr->wcs[byte_idx] = (wchar_t) pstr->mbs[byte_idx];
 292               ++byte_idx;
 293               continue;
 294             }
 295
 296           remain_len = end_idx - byte_idx;
 297           prev_st = pstr->cur_state;
 298           mbclen = mbrtowc (&wc,
 299                             ((const char *) pstr->raw_mbs + pstr->raw_mbs_idx
 300                              + byte_idx), remain_len, &pstr->cur_state);
 301           if (BE (mbclen + 2 > 2, 1))
 302             {
 303               wchar_t wcu = wc;
 304               if (iswlower (wc))
 305                 {
 306                   size_t mbcdlen;
 307
 308                   wcu = towupper (wc);
 309                   mbcdlen = wcrtomb (buf, wcu, &prev_st);
 310                   if (BE (mbclen == mbcdlen, 1))
 311                     memcpy (pstr->mbs + byte_idx, buf, mbclen);
 312                   else
 313                     {
 314                       src_idx = byte_idx;
 315                       goto offsets_needed;
 316                     }
 317                 }
 318               else
 319                 memcpy (pstr->mbs + byte_idx,
 320                         pstr->raw_mbs + pstr->raw_mbs_idx + byte_idx, mbclen);
 321               pstr->wcs[byte_idx++] = wcu;
 322               /* Write paddings.  */
 323               for (remain_len = byte_idx + mbclen - 1; byte_idx < remain_len ;)
 324                 pstr->wcs[byte_idx++] = WEOF;
 325             }
 326           else if (mbclen == (size_t) -1 || mbclen == 0)
 327             {
 328               /* It is an invalid character or '\0'.  Just use the byte.  */
 329               int ch = pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx];
 330               pstr->mbs[byte_idx] = ch;
 331               /* And also cast it to wide char.  */
 332               pstr->wcs[byte_idx++] = (wchar_t) ch;
 333               if (BE (mbclen == (size_t) -1, 0))
 334                 pstr->cur_state = prev_st;
 335             }
 336           else
 337             {
 338               /* The buffer doesn't have enough space, finish to build.  */
 339               pstr->cur_state = prev_st;
 340               break;
 341             }
 342         }
 343       pstr->valid_len = byte_idx;
 344       pstr->valid_raw_len = byte_idx;
 345       return REG_NOERROR;
 346     }
 347   else
 348     for (src_idx = pstr->valid_raw_len; byte_idx < end_idx;)
 349       {
 350         wchar_t wc;
 351         const char *p;
 352       offsets_needed:
 353         remain_len = end_idx - byte_idx;
 354         prev_st = pstr->cur_state;
 355         if (BE (pstr->trans != NULL, 0))
 356           {
 357             int i, ch;
 358
 359             for (i = 0; i < pstr->mb_cur_max && i < remain_len; ++i)
 360               {
 361                 ch = pstr->raw_mbs [pstr->raw_mbs_idx + src_idx + i];
 362                 buf[i] = pstr->trans[ch];
 363               }
 364             p = (const char *) buf;
 365           }
 366         else
 367           p = (const char *) pstr->raw_mbs + pstr->raw_mbs_idx + src_idx;
 368         mbclen = mbrtowc (&wc, p, remain_len, &pstr->cur_state);
 369         if (BE (mbclen + 2 > 2, 1))
 370           {
 371             wchar_t wcu = wc;
 372             if (iswlower (wc))
 373               {
 374                 size_t mbcdlen;
 375
 376                 wcu = towupper (wc);
 377                 mbcdlen = wcrtomb ((char *) buf, wcu, &prev_st);
 378                 if (BE (mbclen == mbcdlen, 1))
 379                   memcpy (pstr->mbs + byte_idx, buf, mbclen);
 380                 else if (mbcdlen != (size_t) -1)
 381                   {
 382                     size_t i;
 383
 384                     if (byte_idx + mbcdlen > pstr->bufs_len)
 385                       {
 386                         pstr->cur_state = prev_st;
 387                         break;
 388                       }
 389
 390                     if (pstr->offsets == NULL)
 391                       {
 392                         pstr->offsets = re_malloc (int, pstr->bufs_len);
 393
 394                         if (pstr->offsets == NULL)
 395                           return REG_ESPACE;
 396                       }
 397                     if (!pstr->offsets_needed)
 398                       {
 399                         for (i = 0; i < (size_t) byte_idx; ++i)
 400                           pstr->offsets[i] = i;
 401                         pstr->offsets_needed = 1;
 402                       }
 403
 404                     memcpy (pstr->mbs + byte_idx, buf, mbcdlen);
 405                     pstr->wcs[byte_idx] = wcu;
 406                     pstr->offsets[byte_idx] = src_idx;
 407                     for (i = 1; i < mbcdlen; ++i)
 408                       {
 409                         pstr->offsets[byte_idx + i]
 410                           = src_idx + (i < mbclen ? i : mbclen - 1);
 411                         pstr->wcs[byte_idx + i] = WEOF;
 412                       }
 413                     pstr->len += mbcdlen - mbclen;
 414                     if (pstr->raw_stop > src_idx)
 415                       pstr->stop += mbcdlen - mbclen;
 416                     end_idx = (pstr->bufs_len > pstr->len)
 417                               ? pstr->len : pstr->bufs_len;
 418                     byte_idx += mbcdlen;
 419                     src_idx += mbclen;
 420                     continue;
 421                   }
 422                 else
 423                   memcpy (pstr->mbs + byte_idx, p, mbclen);
 424               }
 425             else
 426               memcpy (pstr->mbs + byte_idx, p, mbclen);
 427
 428             if (BE (pstr->offsets_needed != 0, 0))
 429               {
 430                 size_t i;
 431                 for (i = 0; i < mbclen; ++i)
 432                   pstr->offsets[byte_idx + i] = src_idx + i;
 433               }
 434             src_idx += mbclen;
 435
 436             pstr->wcs[byte_idx++] = wcu;
 437             /* Write paddings.  */
 438             for (remain_len = byte_idx + mbclen - 1; byte_idx < remain_len ;)
 439               pstr->wcs[byte_idx++] = WEOF;
 440           }
 441         else if (mbclen == (size_t) -1 || mbclen == 0)
 442           {
 443             /* It is an invalid character or '\0'.  Just use the byte.  */
 444             int ch = pstr->raw_mbs[pstr->raw_mbs_idx + src_idx];
 445
 446             if (BE (pstr->trans != NULL, 0))
 447               ch = pstr->trans [ch];
 448             pstr->mbs[byte_idx] = ch;
 449
 450             if (BE (pstr->offsets_needed != 0, 0))
 451               pstr->offsets[byte_idx] = src_idx;
 452             ++src_idx;
 453
 454             /* And also cast it to wide char.  */
 455             pstr->wcs[byte_idx++] = (wchar_t) ch;
 456             if (BE (mbclen == (size_t) -1, 0))
 457               pstr->cur_state = prev_st;
 458           }
 459         else
 460           {
 461             /* The buffer doesn't have enough space, finish to build.  */
 462             pstr->cur_state = prev_st;
 463             break;
 464           }
 465       }
 466   pstr->valid_len = byte_idx;
 467   pstr->valid_raw_len = src_idx;
 468   return REG_NOERROR;
 469 }
 470
 471 /* Skip characters until the index becomes greater than NEW_RAW_IDX.
 472    Return the index.  */
 473
 474 static int
 475 internal_function
 476 re_string_skip_chars (re_string_t *pstr, int new_raw_idx, wint_t *last_wc)
 477 {
 478   mbstate_t prev_st;
 479   int rawbuf_idx;
 480   size_t mbclen;
 481   wchar_t wc = 0;
 482
 483   /* Skip the characters which are not necessary to check.  */
 484   for (rawbuf_idx = pstr->raw_mbs_idx + pstr->valid_raw_len;
 485        rawbuf_idx < new_raw_idx;)
 486     {
 487       int remain_len;
 488       remain_len = pstr->len - rawbuf_idx;
 489       prev_st = pstr->cur_state;
 490       mbclen = mbrtowc (&wc, (const char *) pstr->raw_mbs + rawbuf_idx,
 491                         remain_len, &pstr->cur_state);
 492       if (BE (mbclen == (size_t) -2 || mbclen == (size_t) -1 || mbclen == 0, 0))
 493         {
 494           /* We treat these cases as a singlebyte character.  */
 495           mbclen = 1;
 496           pstr->cur_state = prev_st;
 497         }
 498       /* Then proceed the next character.  */
 499       rawbuf_idx += mbclen;
 500     }
 501   *last_wc = (wint_t) wc;
 502   return rawbuf_idx;
 503 }
 504 #endif /* RE_ENABLE_I18N  */
 505
 506 /* Build the buffer PSTR->MBS, and apply the translation if we need.
 507    This function is used in case of REG_ICASE.  */
 508
 509 static void
 510 build_upper_buffer (re_string_t *pstr)
 511 {
 512   int char_idx, end_idx;
 513   end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
 514
 515   for (char_idx = pstr->valid_len; char_idx < end_idx; ++char_idx)
 516     {
 517       int ch = pstr->raw_mbs[pstr->raw_mbs_idx + char_idx];
 518       if (BE (pstr->trans != NULL, 0))
 519         ch = pstr->trans[ch];
 520       if (islower (ch))
 521         pstr->mbs[char_idx] = toupper (ch);
 522       else
 523         pstr->mbs[char_idx] = ch;
 524     }
 525   pstr->valid_len = char_idx;
 526   pstr->valid_raw_len = char_idx;
 527 }
 528
 529 /* Apply TRANS to the buffer in PSTR.  */
 530
 531 static void
 532 re_string_translate_buffer (re_string_t *pstr)
 533 {
 534   int buf_idx, end_idx;
 535   end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
 536
 537   for (buf_idx = pstr->valid_len; buf_idx < end_idx; ++buf_idx)
 538     {
 539       int ch = pstr->raw_mbs[pstr->raw_mbs_idx + buf_idx];
 540       pstr->mbs[buf_idx] = pstr->trans[ch];
 541     }
 542
 543   pstr->valid_len = buf_idx;
 544   pstr->valid_raw_len = buf_idx;
 545 }
 546
 547 /* This function re-construct the buffers.
 548    Concretely, convert to wide character in case of pstr->mb_cur_max > 1,
 549    convert to upper case in case of REG_ICASE, apply translation.  */
 550
 551 static reg_errcode_t
 552 internal_function
 553 re_string_reconstruct (re_string_t *pstr, int idx, int eflags)
 554 {
 555   int offset = idx - pstr->raw_mbs_idx;
 556   if (BE (offset < 0, 0))
 557     {
 558       /* Reset buffer.  */
 559 #ifdef RE_ENABLE_I18N
 560       if (pstr->mb_cur_max > 1)
 561         memset (&pstr->cur_state, '\0', sizeof (mbstate_t));
 562 #endif /* RE_ENABLE_I18N */
 563       pstr->len = pstr->raw_len;
 564       pstr->stop = pstr->raw_stop;
 565       pstr->valid_len = 0;
 566       pstr->raw_mbs_idx = 0;
 567       pstr->valid_raw_len = 0;
 568       pstr->offsets_needed = 0;
 569       pstr->tip_context = ((eflags & REG_NOTBOL) ? CONTEXT_BEGBUF
 570                            : CONTEXT_NEWLINE | CONTEXT_BEGBUF);
 571       if (!pstr->mbs_allocated)
 572         pstr->mbs = (unsigned char *) pstr->raw_mbs;
 573       offset = idx;
 574     }
 575
 576   if (BE (offset != 0, 1))
 577     {
 578       /* Are the characters which are already checked remain?  */
 579       if (BE (offset < pstr->valid_raw_len, 1)
 580 #ifdef RE_ENABLE_I18N
 581           /* Handling this would enlarge the code too much.
 582              Accept a slowdown in that case.  */
 583           && pstr->offsets_needed == 0
 584 #endif
 585          )
 586         {
 587           /* Yes, move them to the front of the buffer.  */
 588           pstr->tip_context = re_string_context_at (pstr, offset - 1, eflags);
 589 #ifdef RE_ENABLE_I18N
 590           if (pstr->mb_cur_max > 1)
 591             memmove (pstr->wcs, pstr->wcs + offset,
 592                      (pstr->valid_len - offset) * sizeof (wint_t));
 593 #endif /* RE_ENABLE_I18N */
 594           if (BE (pstr->mbs_allocated, 0))
 595             memmove (pstr->mbs, pstr->mbs + offset,
 596                      pstr->valid_len - offset);
 597           pstr->valid_len -= offset;
 598           pstr->valid_raw_len -= offset;
 599 #if DEBUG
 600           assert (pstr->valid_len > 0);
 601 #endif
 602         }
 603       else
 604         {
 605           /* No, skip all characters until IDX.  */
 606 #ifdef RE_ENABLE_I18N
 607           if (BE (pstr->offsets_needed, 0))
 608             {
 609               pstr->len = pstr->raw_len - idx + offset;
 610               pstr->stop = pstr->raw_stop - idx + offset;
 611               pstr->offsets_needed = 0;
 612             }
 613 #endif
 614           pstr->valid_len = 0;
 615           pstr->valid_raw_len = 0;
 616 #ifdef RE_ENABLE_I18N
 617           if (pstr->mb_cur_max > 1)
 618             {
 619               int wcs_idx;
 620               wint_t wc = WEOF;
 621
 622               if (pstr->is_utf8)
 623                 {
 624                   const unsigned char *raw, *p, *q, *end;
 625
 626                   /* Special case UTF-8.  Multi-byte chars start with any
 627                      byte other than 0x80 - 0xbf.  */
 628                   raw = pstr->raw_mbs + pstr->raw_mbs_idx;
 629                   end = raw + (offset - pstr->mb_cur_max);
 630                   p = raw + offset - 1;
 631 #ifdef _LIBC
 632                   /* We know the wchar_t encoding is UCS4, so for the simple
 633                      case, ASCII characters, skip the conversion step.  */
 634                   if (isascii (*p) && BE (pstr->trans == NULL, 1))
 635                     {
 636                       memset (&pstr->cur_state, '\0', sizeof (mbstate_t));
 637                       pstr->valid_len = 0;
 638                       wc = (wchar_t) *p;
 639                     }
 640                   else
 641 #endif
 642                     for (; p >= end; --p)
 643                       if ((*p & 0xc0) != 0x80)
 644                         {
 645                           mbstate_t cur_state;
 646                           wchar_t wc2;
 647                           int mlen = raw + pstr->len - p;
 648                           unsigned char buf[6];
 649                           size_t mbclen;
 650
 651                           q = p;
 652                           if (BE (pstr->trans != NULL, 0))
 653                             {
 654                               int i = mlen < 6 ? mlen : 6;
 655                               while (--i >= 0)
 656                                 buf[i] = pstr->trans[p[i]];
 657                               q = buf;
 658                             }
 659                           /* XXX Don't use mbrtowc, we know which conversion
 660                              to use (UTF-8 -> UCS4).  */
 661                           memset (&cur_state, 0, sizeof (cur_state));
 662                           mbclen = mbrtowc (&wc2, (const char *) p, mlen,
 663                                             &cur_state);
 664                           if (raw + offset - p <= mbclen
 665                               && mbclen < (size_t) -2)
 666                             {
 667                               memset (&pstr->cur_state, '\0',
 668                                       sizeof (mbstate_t));
 669                               pstr->valid_len = mbclen - (raw + offset - p);
 670                               wc = wc2;
 671                             }
 672                           break;
 673                         }
 674                 }
 675
 676               if (wc == WEOF)
 677                 pstr->valid_len = re_string_skip_chars (pstr, idx, &wc) - idx;
 678               if (BE (pstr->valid_len, 0))
 679                 {
 680                   for (wcs_idx = 0; wcs_idx < pstr->valid_len; ++wcs_idx)
 681                     pstr->wcs[wcs_idx] = WEOF;
 682                   if (pstr->mbs_allocated)
 683                     memset (pstr->mbs, 255, pstr->valid_len);
 684                 }
 685               pstr->valid_raw_len = pstr->valid_len;
 686               pstr->tip_context = ((BE (pstr->word_ops_used != 0, 0)
 687                                     && IS_WIDE_WORD_CHAR (wc))
 688                                    ? CONTEXT_WORD
 689                                    : ((IS_WIDE_NEWLINE (wc)
 690                                        && pstr->newline_anchor)
 691                                       ? CONTEXT_NEWLINE : 0));
 692             }
 693           else
 694 #endif /* RE_ENABLE_I18N */
 695             {
 696               int c = pstr->raw_mbs[pstr->raw_mbs_idx + offset - 1];
 697               if (pstr->trans)
 698                 c = pstr->trans[c];
 699               pstr->tip_context = (bitset_contain (pstr->word_char, c)
 700                                    ? CONTEXT_WORD
 701                                    : ((IS_NEWLINE (c) && pstr->newline_anchor)
 702                                       ? CONTEXT_NEWLINE : 0));
 703             }
 704         }
 705       if (!BE (pstr->mbs_allocated, 0))
 706         pstr->mbs += offset;
 707     }
 708   pstr->raw_mbs_idx = idx;
 709   pstr->len -= offset;
 710   pstr->stop -= offset;
 711
 712   /* Then build the buffers.  */
 713 #ifdef RE_ENABLE_I18N
 714   if (pstr->mb_cur_max > 1)
 715     {
 716       if (pstr->icase)
 717         {
 718           int ret = build_wcs_upper_buffer (pstr);
 719           if (BE (ret != REG_NOERROR, 0))
 720             return ret;
 721         }
 722       else
 723         build_wcs_buffer (pstr);
 724     }
 725   else
 726 #endif /* RE_ENABLE_I18N */
 727     if (BE (pstr->mbs_allocated, 0))
 728       {
 729         if (pstr->icase)
 730           build_upper_buffer (pstr);
 731         else if (pstr->trans != NULL)
 732           re_string_translate_buffer (pstr);
 733       }
 734     else
 735       pstr->valid_len = pstr->len;
 736
 737   pstr->cur_idx = 0;
 738   return REG_NOERROR;
 739 }
 740
 741 static unsigned char
 742 internal_function __attribute ((pure))
 743 re_string_peek_byte_case (const re_string_t *pstr, int idx)
 744 {
 745   int ch, off;
 746
 747   /* Handle the common (easiest) cases first.  */
 748   if (BE (!pstr->mbs_allocated, 1))
 749     return re_string_peek_byte (pstr, idx);
 750
 751 #ifdef RE_ENABLE_I18N
 752   if (pstr->mb_cur_max > 1
 753       && ! re_string_is_single_byte_char (pstr, pstr->cur_idx + idx))
 754     return re_string_peek_byte (pstr, idx);
 755 #endif
 756
 757   off = pstr->cur_idx + idx;
 758 #ifdef RE_ENABLE_I18N
 759   if (pstr->offsets_needed)
 760     off = pstr->offsets[off];
 761 #endif
 762
 763   ch = pstr->raw_mbs[pstr->raw_mbs_idx + off];
 764
 765 #ifdef RE_ENABLE_I18N
 766   /* Ensure that e.g. for tr_TR.UTF-8 BACKSLASH DOTLESS SMALL LETTER I
 767      this function returns CAPITAL LETTER I instead of first byte of
 768      DOTLESS SMALL LETTER I.  The latter would confuse the parser,
 769      since peek_byte_case doesn't advance cur_idx in any way.  */
 770   if (pstr->offsets_needed && !isascii (ch))
 771     return re_string_peek_byte (pstr, idx);
 772 #endif
 773
 774   return ch;
 775 }
 776
 777 static unsigned char
 778 internal_function __attribute ((pure))
 779 re_string_fetch_byte_case (re_string_t *pstr)
 780 {
 781   if (BE (!pstr->mbs_allocated, 1))
 782     return re_string_fetch_byte (pstr);
 783
 784 #ifdef RE_ENABLE_I18N
 785   if (pstr->offsets_needed)
 786     {
 787       int off, ch;
 788
 789       /* For tr_TR.UTF-8 [[:islower:]] there is
 790          [[: CAPITAL LETTER I WITH DOT lower:]] in mbs.  Skip
 791          in that case the whole multi-byte character and return
 792          the original letter.  On the other side, with
 793          [[: DOTLESS SMALL LETTER I return [[:I, as doing
 794          anything else would complicate things too much.  */
 795
 796       if (!re_string_first_byte (pstr, pstr->cur_idx))
 797         return re_string_fetch_byte (pstr);
 798
 799       off = pstr->offsets[pstr->cur_idx];
 800       ch = pstr->raw_mbs[pstr->raw_mbs_idx + off];
 801
 802       if (! isascii (ch))
 803         return re_string_fetch_byte (pstr);
 804
 805       re_string_skip_bytes (pstr,
 806                             re_string_char_size_at (pstr, pstr->cur_idx));
 807       return ch;
 808     }
 809 #endif
 810
 811   return pstr->raw_mbs[pstr->raw_mbs_idx + pstr->cur_idx++];
 812 }
 813
 814 static void
 815 internal_function
 816 re_string_destruct (re_string_t *pstr)
 817 {
 818 #ifdef RE_ENABLE_I18N
 819   re_free (pstr->wcs);
 820   re_free (pstr->offsets);
 821 #endif /* RE_ENABLE_I18N  */
 822   if (pstr->mbs_allocated)
 823     re_free (pstr->mbs);
 824 }
 825
 826 /* Return the context at IDX in INPUT.  */
 827
 828 static unsigned int
 829 re_string_context_at (const re_string_t *input, int idx, int eflags)
 830 {
 831   int c;
 832   if (BE (idx < 0, 0))
 833     /* In this case, we use the value stored in input->tip_context,
 834        since we can't know the character in input->mbs[-1] here.  */
 835     return input->tip_context;
 836   if (BE (idx == input->len, 0))
 837     return ((eflags & REG_NOTEOL) ? CONTEXT_ENDBUF
 838             : CONTEXT_NEWLINE | CONTEXT_ENDBUF);
 839 #ifdef RE_ENABLE_I18N
 840   if (input->mb_cur_max > 1)
 841     {
 842       wint_t wc;
 843       int wc_idx = idx;
 844       while(input->wcs[wc_idx] == WEOF)
 845         {
 846 #ifdef DEBUG
 847           /* It must not happen.  */
 848           assert (wc_idx >= 0);
 849 #endif
 850           --wc_idx;
 851           if (wc_idx < 0)
 852             return input->tip_context;
 853         }
 854       wc = input->wcs[wc_idx];
 855       if (BE (input->word_ops_used != 0, 0) && IS_WIDE_WORD_CHAR (wc))
 856         return CONTEXT_WORD;
 857       return (IS_WIDE_NEWLINE (wc) && input->newline_anchor
 858               ? CONTEXT_NEWLINE : 0);
 859     }
 860   else
 861 #endif
 862     {
 863       c = re_string_byte_at (input, idx);
 864       if (bitset_contain (input->word_char, c))
 865         return CONTEXT_WORD;
 866       return IS_NEWLINE (c) && input->newline_anchor ? CONTEXT_NEWLINE : 0;
 867     }
 868 }
 869 \f
 870 /* Functions for set operation.  */
 871
 872 static reg_errcode_t
 873 internal_function
 874 re_node_set_alloc (re_node_set *set, int size)
 875 {
 876   set->alloc = size;
 877   set->nelem = 0;
 878   set->elems = re_malloc (int, size);
 879   if (BE (set->elems == NULL, 0))
 880     return REG_ESPACE;
 881   return REG_NOERROR;
 882 }
 883
 884 static reg_errcode_t
 885 internal_function
 886 re_node_set_init_1 (re_node_set *set, int elem)
 887 {
 888   set->alloc = 1;
 889   set->nelem = 1;
 890   set->elems = re_malloc (int, 1);
 891   if (BE (set->elems == NULL, 0))
 892     {
 893       set->alloc = set->nelem = 0;
 894       return REG_ESPACE;
 895     }
 896   set->elems[0] = elem;
 897   return REG_NOERROR;
 898 }
 899
 900 static reg_errcode_t
 901 internal_function
 902 re_node_set_init_2 (re_node_set *set, int elem1, int elem2)
 903 {
 904   set->alloc = 2;
 905   set->elems = re_malloc (int, 2);
 906   if (BE (set->elems == NULL, 0))
 907     return REG_ESPACE;
 908   if (elem1 == elem2)
 909     {
 910       set->nelem = 1;
 911       set->elems[0] = elem1;
 912     }
 913   else
 914     {
 915       set->nelem = 2;
 916       if (elem1 < elem2)
 917         {
 918           set->elems[0] = elem1;
 919           set->elems[1] = elem2;
 920         }
 921       else
 922         {
 923           set->elems[0] = elem2;
 924           set->elems[1] = elem1;
 925         }
 926     }
 927   return REG_NOERROR;
 928 }
 929
 930 static reg_errcode_t
 931 internal_function
 932 re_node_set_init_copy (re_node_set *dest, const re_node_set *src)
 933 {
 934   dest->nelem = src->nelem;
 935   if (src->nelem > 0)
 936     {
 937       dest->alloc = dest->nelem;
 938       dest->elems = re_malloc (int, dest->alloc);
 939       if (BE (dest->elems == NULL, 0))
 940         {
 941           dest->alloc = dest->nelem = 0;
 942           return REG_ESPACE;
 943         }
 944       memcpy (dest->elems, src->elems, src->nelem * sizeof (int));
 945     }
 946   else
 947     re_node_set_init_empty (dest);
 948   return REG_NOERROR;
 949 }
 950
 951 /* Calculate the intersection of the sets SRC1 and SRC2. And merge it to
 952    DEST. Return value indicate the error code or REG_NOERROR if succeeded.
 953    Note: We assume dest->elems is NULL, when dest->alloc is 0.  */
 954
 955 static reg_errcode_t
 956 internal_function
 957 re_node_set_add_intersect (re_node_set *dest, const re_node_set *src1,
 958                            const re_node_set *src2)
 959 {
 960   int i1, i2, is, id, delta, sbase;
 961   if (src1->nelem == 0 || src2->nelem == 0)
 962     return REG_NOERROR;
 963
 964   /* We need dest->nelem + 2 * elems_in_intersection; this is a
 965      conservative estimate.  */
 966   if (src1->nelem + src2->nelem + dest->nelem > dest->alloc)
 967     {
 968       int new_alloc = src1->nelem + src2->nelem + dest->alloc;
 969       int *new_elems = re_realloc (dest->elems, int, new_alloc);
 970       if (BE (new_elems == NULL, 0))
 971         return REG_ESPACE;
 972       dest->elems = new_elems;
 973       dest->alloc = new_alloc;
 974     }
 975
 976   /* Find the items in the intersection of SRC1 and SRC2, and copy
 977      into the top of DEST those that are not already in DEST itself.  */
 978   sbase = dest->nelem + src1->nelem + src2->nelem;
 979   i1 = src1->nelem - 1;
 980   i2 = src2->nelem - 1;
 981   id = dest->nelem - 1;
 982   for (;;)
 983     {
 984       if (src1->elems[i1] == src2->elems[i2])
 985         {
 986           /* Try to find the item in DEST.  Maybe we could binary search?  */
 987           while (id >= 0 && dest->elems[id] > src1->elems[i1])
 988             --id;
 989
 990           if (id < 0 || dest->elems[id] != src1->elems[i1])
 991             dest->elems[--sbase] = src1->elems[i1];
 992
 993           if (--i1 < 0 || --i2 < 0)
 994             break;
 995         }
 996
 997       /* Lower the highest of the two items.  */
 998       else if (src1->elems[i1] < src2->elems[i2])
 999         {
1000           if (--i2 < 0)
1001             break;
1002         }
1003       else
1004         {
1005           if (--i1 < 0)
1006             break;
1007         }
1008     }
1009
1010   id = dest->nelem - 1;
1011   is = dest->nelem + src1->nelem + src2->nelem - 1;
1012   delta = is - sbase + 1;
1013
1014   /* Now copy.  When DELTA becomes zero, the remaining
1015      DEST elements are already in place; this is more or
1016      less the same loop that is in re_node_set_merge.  */
1017   dest->nelem += delta;
1018   if (delta > 0 && id >= 0)
1019     for (;;)
1020       {
1021         if (dest->elems[is] > dest->elems[id])
1022           {
1023             /* Copy from the top.  */
1024             dest->elems[id + delta--] = dest->elems[is--];
1025             if (delta == 0)
1026               break;
1027           }
1028         else
1029           {
1030             /* Slide from the bottom.  */
1031             dest->elems[id + delta] = dest->elems[id];
1032             if (--id < 0)
1033               break;
1034           }
1035       }
1036
1037   /* Copy remaining SRC elements.  */
1038   memcpy (dest->elems, dest->elems + sbase, delta * sizeof (int));
1039
1040   return REG_NOERROR;
1041 }
1042
1043 /* Calculate the union set of the sets SRC1 and SRC2. And store it to
1044    DEST. Return value indicate the error code or REG_NOERROR if succeeded.  */
1045
1046 static reg_errcode_t
1047 internal_function
1048 re_node_set_init_union (re_node_set *dest, const re_node_set *src1,
1049                         const re_node_set *src2)
1050 {
1051   int i1, i2, id;
1052   if (src1 != NULL && src1->nelem > 0 && src2 != NULL && src2->nelem > 0)
1053     {
1054       dest->alloc = src1->nelem + src2->nelem;
1055       dest->elems = re_malloc (int, dest->alloc);
1056       if (BE (dest->elems == NULL, 0))
1057         return REG_ESPACE;
1058     }
1059   else
1060     {
1061       if (src1 != NULL && src1->nelem > 0)
1062         return re_node_set_init_copy (dest, src1);
1063       else if (src2 != NULL && src2->nelem > 0)
1064         return re_node_set_init_copy (dest, src2);
1065       else
1066         re_node_set_init_empty (dest);
1067       return REG_NOERROR;
1068     }
1069   for (i1 = i2 = id = 0 ; i1 < src1->nelem && i2 < src2->nelem ;)
1070     {
1071       if (src1->elems[i1] > src2->elems[i2])
1072         {
1073           dest->elems[id++] = src2->elems[i2++];
1074           continue;
1075         }
1076       if (src1->elems[i1] == src2->elems[i2])
1077         ++i2;
1078       dest->elems[id++] = src1->elems[i1++];
1079     }
1080   if (i1 < src1->nelem)
1081     {
1082       memcpy (dest->elems + id, src1->elems + i1,
1083              (src1->nelem - i1) * sizeof (int));
1084       id += src1->nelem - i1;
1085     }
1086   else if (i2 < src2->nelem)
1087     {
1088       memcpy (dest->elems + id, src2->elems + i2,
1089              (src2->nelem - i2) * sizeof (int));
1090       id += src2->nelem - i2;
1091     }
1092   dest->nelem = id;
1093   return REG_NOERROR;
1094 }
1095
1096 /* Calculate the union set of the sets DEST and SRC. And store it to
1097    DEST. Return value indicate the error code or REG_NOERROR if succeeded.  */
1098
1099 static reg_errcode_t
1100 internal_function
1101 re_node_set_merge (re_node_set *dest, const re_node_set *src)
1102 {
1103   int is, id, sbase, delta;
1104   if (src == NULL || src->nelem == 0)
1105     return REG_NOERROR;
1106   if (dest->alloc < 2 * src->nelem + dest->nelem)
1107     {
1108       int new_alloc = 2 * (src->nelem + dest->alloc);
1109       int *new_buffer = re_realloc (dest->elems, int, new_alloc);
1110       if (BE (new_buffer == NULL, 0))
1111         return REG_ESPACE;
1112       dest->elems = new_buffer;
1113       dest->alloc = new_alloc;
1114     }
1115
1116   if (BE (dest->nelem == 0, 0))
1117     {
1118       dest->nelem = src->nelem;
1119       memcpy (dest->elems, src->elems, src->nelem * sizeof (int));
1120       return REG_NOERROR;
1121     }
1122
1123   /* Copy into the top of DEST the items of SRC that are not
1124      found in DEST.  Maybe we could binary search in DEST?  */
1125   for (sbase = dest->nelem + 2 * src->nelem,
1126        is = src->nelem - 1, id = dest->nelem - 1; is >= 0 && id >= 0; )
1127     {
1128       if (dest->elems[id] == src->elems[is])
1129         is--, id--;
1130       else if (dest->elems[id] < src->elems[is])
1131         dest->elems[--sbase] = src->elems[is--];
1132       else /* if (dest->elems[id] > src->elems[is]) */
1133         --id;
1134     }
1135
1136   if (is >= 0)
1137     {
1138       /* If DEST is exhausted, the remaining items of SRC must be unique.  */
1139       sbase -= is + 1;
1140       memcpy (dest->elems + sbase, src->elems, (is + 1) * sizeof (int));
1141     }
1142
1143   id = dest->nelem - 1;
1144   is = dest->nelem + 2 * src->nelem - 1;
1145   delta = is - sbase + 1;
1146   if (delta == 0)
1147     return REG_NOERROR;
1148
1149   /* Now copy.  When DELTA becomes zero, the remaining
1150      DEST elements are already in place.  */
1151   dest->nelem += delta;
1152   for (;;)
1153     {
1154       if (dest->elems[is] > dest->elems[id])
1155         {
1156           /* Copy from the top.  */
1157           dest->elems[id + delta--] = dest->elems[is--];
1158           if (delta == 0)
1159             break;
1160         }
1161       else
1162         {
1163           /* Slide from the bottom.  */
1164           dest->elems[id + delta] = dest->elems[id];
1165           if (--id < 0)
1166             {
1167               /* Copy remaining SRC elements.  */
1168               memcpy (dest->elems, dest->elems + sbase,
1169                       delta * sizeof (int));
1170               break;
1171             }
1172         }
1173     }
1174
1175   return REG_NOERROR;
1176 }
1177
1178 /* Insert the new element ELEM to the re_node_set* SET.
1179    SET should not already have ELEM.
1180    return -1 if an error is occured, return 1 otherwise.  */
1181
1182 static int
1183 internal_function
1184 re_node_set_insert (re_node_set *set, int elem)
1185 {
1186   int idx;
1187   /* In case the set is empty.  */
1188   if (set->alloc == 0)
1189     {
1190       if (BE (re_node_set_init_1 (set, elem) == REG_NOERROR, 1))
1191         return 1;
1192       else
1193         return -1;
1194     }
1195
1196   if (BE (set->nelem, 0) == 0)
1197     {
1198       /* We already guaranteed above that set->alloc != 0.  */
1199       set->elems[0] = elem;
1200       ++set->nelem;
1201       return 1;
1202     }
1203
1204   /* Realloc if we need.  */
1205   if (set->alloc == set->nelem)
1206     {
1207       int *new_elems;
1208       set->alloc = set->alloc * 2;
1209       new_elems = re_realloc (set->elems, int, set->alloc);
1210       if (BE (new_elems == NULL, 0))
1211         return -1;
1212       set->elems = new_elems;
1213     }
1214
1215   /* Move the elements which follows the new element.  Test the
1216      first element separately to skip a check in the inner loop.  */
1217   if (elem < set->elems[0])
1218     {
1219       idx = 0;
1220       for (idx = set->nelem; idx > 0; idx--)
1221         set->elems[idx] = set->elems[idx - 1];
1222     }
1223   else
1224     {
1225       for (idx = set->nelem; set->elems[idx - 1] > elem; idx--)
1226         set->elems[idx] = set->elems[idx - 1];
1227     }
1228
1229   /* Insert the new element.  */
1230   set->elems[idx] = elem;
1231   ++set->nelem;
1232   return 1;
1233 }
1234
1235 /* Insert the new element ELEM to the re_node_set* SET.
1236    SET should not already have any element greater than or equal to ELEM.
1237    Return -1 if an error is occured, return 1 otherwise.  */
1238
1239 static int
1240 internal_function
1241 re_node_set_insert_last (re_node_set *set, int elem)
1242 {
1243   /* Realloc if we need.  */
1244   if (set->alloc == set->nelem)
1245     {
1246       int *new_elems;
1247       set->alloc = (set->alloc + 1) * 2;
1248       new_elems = re_realloc (set->elems, int, set->alloc);
1249       if (BE (new_elems == NULL, 0))
1250         return -1;
1251       set->elems = new_elems;
1252     }
1253
1254   /* Insert the new element.  */
1255   set->elems[set->nelem++] = elem;
1256   return 1;
1257 }
1258
1259 /* Compare two node sets SET1 and SET2.
1260    return 1 if SET1 and SET2 are equivalent, return 0 otherwise.  */
1261
1262 static int
1263 internal_function __attribute ((pure))
1264 re_node_set_compare (const re_node_set *set1, const re_node_set *set2)
1265 {
1266   int i;
1267   if (set1 == NULL || set2 == NULL || set1->nelem != set2->nelem)
1268     return 0;
1269   for (i = set1->nelem ; --i >= 0 ; )
1270     if (set1->elems[i] != set2->elems[i])
1271       return 0;
1272   return 1;
1273 }
1274
1275 /* Return (idx + 1) if SET contains the element ELEM, return 0 otherwise.  */
1276
1277 static int
1278 internal_function __attribute ((pure))
1279 re_node_set_contains (const re_node_set *set, int elem)
1280 {
1281   unsigned int idx, right, mid;
1282   if (set->nelem <= 0)
1283     return 0;
1284
1285   /* Binary search the element.  */
1286   idx = 0;
1287   right = set->nelem - 1;
1288   while (idx < right)
1289     {
1290       mid = (idx + right) / 2;
1291       if (set->elems[mid] < elem)
1292         idx = mid + 1;
1293       else
1294         right = mid;
1295     }
1296   return set->elems[idx] == elem ? idx + 1 : 0;
1297 }
1298
1299 static void
1300 internal_function
1301 re_node_set_remove_at (re_node_set *set, int idx)
1302 {
1303   if (idx < 0 || idx >= set->nelem)
1304     return;
1305   --set->nelem;
1306   for (; idx < set->nelem; idx++)
1307     set->elems[idx] = set->elems[idx + 1];
1308 }
1309 \f
1310
1311 /* Add the token TOKEN to dfa->nodes, and return the index of the token.
1312    Or return -1, if an error will be occured.  */
1313
1314 static int
1315 internal_function
1316 re_dfa_add_node (re_dfa_t *dfa, re_token_t token)
1317 {
1318   int type = token.type;
1319   if (BE (dfa->nodes_len >= dfa->nodes_alloc, 0))
1320     {
1321       size_t new_nodes_alloc = dfa->nodes_alloc * 2;
1322       int *new_nexts, *new_indices;
1323       re_node_set *new_edests, *new_eclosures;
1324       re_token_t *new_nodes;
1325
1326       /* Avoid overflows.  */
1327       if (BE (new_nodes_alloc < dfa->nodes_alloc, 0))
1328         return -1;
1329
1330       new_nodes = re_realloc (dfa->nodes, re_token_t, new_nodes_alloc);
1331       if (BE (new_nodes == NULL, 0))
1332         return -1;
1333       dfa->nodes = new_nodes;
1334       new_nexts = re_realloc (dfa->nexts, int, new_nodes_alloc);
1335       new_indices = re_realloc (dfa->org_indices, int, new_nodes_alloc);
1336       new_edests = re_realloc (dfa->edests, re_node_set, new_nodes_alloc);
1337       new_eclosures = re_realloc (dfa->eclosures, re_node_set, new_nodes_alloc);
1338       if (BE (new_nexts == NULL || new_indices == NULL
1339               || new_edests == NULL || new_eclosures == NULL, 0))
1340         return -1;
1341       dfa->nexts = new_nexts;
1342       dfa->org_indices = new_indices;
1343       dfa->edests = new_edests;
1344       dfa->eclosures = new_eclosures;
1345       dfa->nodes_alloc = new_nodes_alloc;
1346     }
1347   dfa->nodes[dfa->nodes_len] = token;
1348   dfa->nodes[dfa->nodes_len].constraint = 0;
1349 #ifdef RE_ENABLE_I18N
1350   dfa->nodes[dfa->nodes_len].accept_mb =
1351     (type == OP_PERIOD && dfa->mb_cur_max > 1) || type == COMPLEX_BRACKET;
1352 #endif
1353   dfa->nexts[dfa->nodes_len] = -1;
1354   re_node_set_init_empty (dfa->edests + dfa->nodes_len);
1355   re_node_set_init_empty (dfa->eclosures + dfa->nodes_len);
1356   return dfa->nodes_len++;
1357 }
1358
1359 static inline unsigned int
1360 internal_function
1361 calc_state_hash (const re_node_set *nodes, unsigned int context)
1362 {
1363   unsigned int hash = nodes->nelem + context;
1364   int i;
1365   for (i = 0 ; i < nodes->nelem ; i++)
1366     hash += nodes->elems[i];
1367   return hash;
1368 }
1369
1370 /* Search for the state whose node_set is equivalent to NODES.
1371    Return the pointer to the state, if we found it in the DFA.
1372    Otherwise create the new one and return it.  In case of an error
1373    return NULL and set the error code in ERR.
1374    Note: - We assume NULL as the invalid state, then it is possible that
1375            return value is NULL and ERR is REG_NOERROR.
1376          - We never return non-NULL value in case of any errors, it is for
1377            optimization.  */
1378
1379 static re_dfastate_t *
1380 internal_function
1381 re_acquire_state (reg_errcode_t *err, const re_dfa_t *dfa,
1382                   const re_node_set *nodes)
1383 {
1384   unsigned int hash;
1385   re_dfastate_t *new_state;
1386   struct re_state_table_entry *spot;
1387   int i;
1388   if (BE (nodes->nelem == 0, 0))
1389     {
1390       *err = REG_NOERROR;
1391       return NULL;
1392     }
1393   hash = calc_state_hash (nodes, 0);
1394   spot = dfa->state_table + (hash & dfa->state_hash_mask);
1395
1396   for (i = 0 ; i < spot->num ; i++)
1397     {
1398       re_dfastate_t *state = spot->array[i];
1399       if (hash != state->hash)
1400         continue;
1401       if (re_node_set_compare (&state->nodes, nodes))
1402         return state;
1403     }
1404
1405   /* There are no appropriate state in the dfa, create the new one.  */
1406   new_state = create_ci_newstate (dfa, nodes, hash);
1407   if (BE (new_state == NULL, 0))
1408     *err = REG_ESPACE;
1409
1410   return new_state;
1411 }
1412
1413 /* Search for the state whose node_set is equivalent to NODES and
1414    whose context is equivalent to CONTEXT.
1415    Return the pointer to the state, if we found it in the DFA.
1416    Otherwise create the new one and return it.  In case of an error
1417    return NULL and set the error code in ERR.
1418    Note: - We assume NULL as the invalid state, then it is possible that
1419            return value is NULL and ERR is REG_NOERROR.
1420          - We never return non-NULL value in case of any errors, it is for
1421            optimization.  */
1422
1423 static re_dfastate_t *
1424 internal_function
1425 re_acquire_state_context (reg_errcode_t *err, const re_dfa_t *dfa,
1426                           const re_node_set *nodes, unsigned int context)
1427 {
1428   unsigned int hash;
1429   re_dfastate_t *new_state;
1430   struct re_state_table_entry *spot;
1431   int i;
1432   if (nodes->nelem == 0)
1433     {
1434       *err = REG_NOERROR;
1435       return NULL;
1436     }
1437   hash = calc_state_hash (nodes, context);
1438   spot = dfa->state_table + (hash & dfa->state_hash_mask);
1439
1440   for (i = 0 ; i < spot->num ; i++)
1441     {
1442       re_dfastate_t *state = spot->array[i];
1443       if (state->hash == hash
1444           && state->context == context
1445           && re_node_set_compare (state->entrance_nodes, nodes))
1446         return state;
1447     }
1448   /* There are no appropriate state in `dfa', create the new one.  */
1449   new_state = create_cd_newstate (dfa, nodes, context, hash);
1450   if (BE (new_state == NULL, 0))
1451     *err = REG_ESPACE;
1452
1453   return new_state;
1454 }
1455
1456 /* Finish initialization of the new state NEWSTATE, and using its hash value
1457    HASH put in the appropriate bucket of DFA's state table.  Return value
1458    indicates the error code if failed.  */
1459
1460 static reg_errcode_t
1461 register_state (const re_dfa_t *dfa, re_dfastate_t *newstate,
1462                 unsigned int hash)
1463 {
1464   struct re_state_table_entry *spot;
1465   reg_errcode_t err;
1466   int i;
1467
1468   newstate->hash = hash;
1469   err = re_node_set_alloc (&newstate->non_eps_nodes, newstate->nodes.nelem);
1470   if (BE (err != REG_NOERROR, 0))
1471     return REG_ESPACE;
1472   for (i = 0; i < newstate->nodes.nelem; i++)
1473     {
1474       int elem = newstate->nodes.elems[i];
1475       if (!IS_EPSILON_NODE (dfa->nodes[elem].type))
1476         re_node_set_insert_last (&newstate->non_eps_nodes, elem);
1477     }
1478
1479   spot = dfa->state_table + (hash & dfa->state_hash_mask);
1480   if (BE (spot->alloc <= spot->num, 0))
1481     {
1482       int new_alloc = 2 * spot->num + 2;
1483       re_dfastate_t **new_array = re_realloc (spot->array, re_dfastate_t *,
1484                                               new_alloc);
1485       if (BE (new_array == NULL, 0))
1486         return REG_ESPACE;
1487       spot->array = new_array;
1488       spot->alloc = new_alloc;
1489     }
1490   spot->array[spot->num++] = newstate;
1491   return REG_NOERROR;
1492 }
1493
1494 static void
1495 free_state (re_dfastate_t *state)
1496 {
1497   re_node_set_free (&state->non_eps_nodes);
1498   re_node_set_free (&state->inveclosure);
1499   if (state->entrance_nodes != &state->nodes)
1500     {
1501       re_node_set_free (state->entrance_nodes);
1502       re_free (state->entrance_nodes);
1503     }
1504   re_node_set_free (&state->nodes);
1505   re_free (state->word_trtable);
1506   re_free (state->trtable);
1507   re_free (state);
1508 }
1509
1510 /* Create the new state which is independ of contexts.
1511    Return the new state if succeeded, otherwise return NULL.  */
1512
1513 static re_dfastate_t *
1514 create_ci_newstate (const re_dfa_t *dfa, const re_node_set *nodes,
1515                     unsigned int hash)
1516 {
1517   int i;
1518   reg_errcode_t err;
1519   re_dfastate_t *newstate;
1520
1521   newstate = (re_dfastate_t *) calloc (sizeof (re_dfastate_t), 1);
1522   if (BE (newstate == NULL, 0))
1523     return NULL;
1524   err = re_node_set_init_copy (&newstate->nodes, nodes);
1525   if (BE (err != REG_NOERROR, 0))
1526     {
1527       re_free (newstate);
1528       return NULL;
1529     }
1530
1531   newstate->entrance_nodes = &newstate->nodes;
1532   for (i = 0 ; i < nodes->nelem ; i++)
1533     {
1534       re_token_t *node = dfa->nodes + nodes->elems[i];
1535       re_token_type_t type = node->type;
1536       if (type == CHARACTER && !node->constraint)
1537         continue;
1538 #ifdef RE_ENABLE_I18N
1539       newstate->accept_mb |= node->accept_mb;
1540 #endif /* RE_ENABLE_I18N */
1541
1542       /* If the state has the halt node, the state is a halt state.  */
1543       if (type == END_OF_RE)
1544         newstate->halt = 1;
1545       else if (type == OP_BACK_REF)
1546         newstate->has_backref = 1;
1547       else if (type == ANCHOR || node->constraint)
1548         newstate->has_constraint = 1;
1549     }
1550   err = register_state (dfa, newstate, hash);
1551   if (BE (err != REG_NOERROR, 0))
1552     {
1553       free_state (newstate);
1554       newstate = NULL;
1555     }
1556   return newstate;
1557 }
1558
1559 /* Create the new state which is depend on the context CONTEXT.
1560    Return the new state if succeeded, otherwise return NULL.  */
1561
1562 static re_dfastate_t *
1563 create_cd_newstate (const re_dfa_t *dfa, const re_node_set *nodes,
1564                     unsigned int context, unsigned int hash)
1565 {
1566   int i, nctx_nodes = 0;
1567   reg_errcode_t err;
1568   re_dfastate_t *newstate;
1569
1570   newstate = (re_dfastate_t *) calloc (sizeof (re_dfastate_t), 1);
1571   if (BE (newstate == NULL, 0))
1572     return NULL;
1573   err = re_node_set_init_copy (&newstate->nodes, nodes);
1574   if (BE (err != REG_NOERROR, 0))
1575     {
1576       re_free (newstate);
1577       return NULL;
1578     }
1579
1580   newstate->context = context;
1581   newstate->entrance_nodes = &newstate->nodes;
1582
1583   for (i = 0 ; i < nodes->nelem ; i++)
1584     {
1585       unsigned int constraint = 0;
1586       re_token_t *node = dfa->nodes + nodes->elems[i];
1587       re_token_type_t type = node->type;
1588       if (node->constraint)
1589         constraint = node->constraint;
1590
1591       if (type == CHARACTER && !constraint)
1592         continue;
1593 #ifdef RE_ENABLE_I18N
1594       newstate->accept_mb |= node->accept_mb;
1595 #endif /* RE_ENABLE_I18N */
1596
1597       /* If the state has the halt node, the state is a halt state.  */
1598       if (type == END_OF_RE)
1599         newstate->halt = 1;
1600       else if (type == OP_BACK_REF)
1601         newstate->has_backref = 1;
1602       else if (type == ANCHOR)
1603         constraint = node->opr.ctx_type;
1604
1605       if (constraint)
1606         {
1607           if (newstate->entrance_nodes == &newstate->nodes)
1608             {
1609               newstate->entrance_nodes = re_malloc (re_node_set, 1);
1610               if (BE (newstate->entrance_nodes == NULL, 0))
1611                 {
1612                   free_state (newstate);
1613                   return NULL;
1614                 }
1615               re_node_set_init_copy (newstate->entrance_nodes, nodes);
1616               nctx_nodes = 0;
1617               newstate->has_constraint = 1;
1618             }
1619
1620           if (NOT_SATISFY_PREV_CONSTRAINT (constraint,context))
1621             {
1622               re_node_set_remove_at (&newstate->nodes, i - nctx_nodes);
1623               ++nctx_nodes;
1624             }
1625         }
1626     }
1627   err = register_state (dfa, newstate, hash);
1628   if (BE (err != REG_NOERROR, 0))
1629     {
1630       free_state (newstate);
1631       newstate = NULL;
1632     }
1633   return  newstate;
1634 }