compat/regex/regexec.c

   1 /* Extended regular expression matching and search library.
   2    Copyright (C) 2002-2005, 2007, 2009, 2010 Free Software Foundation, Inc.
   3    This file is part of the GNU C Library.
   4    Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
   5
   6    The GNU C Library is free software; you can redistribute it and/or
   7    modify it under the terms of the GNU Lesser General Public
   8    License as published by the Free Software Foundation; either
   9    version 2.1 of the License, or (at your option) any later version.
  10
  11    The GNU C Library is distributed in the hope that it will be useful,
  12    but WITHOUT ANY WARRANTY; without even the implied warranty of
  13    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  14    Lesser General Public License for more details.
  15
  16    You should have received a copy of the GNU Lesser General Public
  17    License along with the GNU C Library; if not, write to the Free
  18    Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
  19    02110-1301 USA.  */
  20
  21 static reg_errcode_t match_ctx_init (re_match_context_t *cache, int eflags,
  22                                      int n) internal_function;
  23 static void match_ctx_clean (re_match_context_t *mctx) internal_function;
  24 static void match_ctx_free (re_match_context_t *cache) internal_function;
  25 static reg_errcode_t match_ctx_add_entry (re_match_context_t *cache, int node,
  26                                           int str_idx, int from, int to)
  27      internal_function;
  28 static int search_cur_bkref_entry (const re_match_context_t *mctx, int str_idx)
  29      internal_function;
  30 static reg_errcode_t match_ctx_add_subtop (re_match_context_t *mctx, int node,
  31                                            int str_idx) internal_function;
  32 static re_sub_match_last_t * match_ctx_add_sublast (re_sub_match_top_t *subtop,
  33                                                    int node, int str_idx)
  34      internal_function;
  35 static void sift_ctx_init (re_sift_context_t *sctx, re_dfastate_t **sifted_sts,
  36                            re_dfastate_t **limited_sts, int last_node,
  37                            int last_str_idx)
  38      internal_function;
  39 static reg_errcode_t re_search_internal (const regex_t *preg,
  40                                          const char *string, int length,
  41                                          int start, int range, int stop,
  42                                          size_t nmatch, regmatch_t pmatch[],
  43                                          int eflags) internal_function;
  44 static int re_search_2_stub (struct re_pattern_buffer *bufp,
  45                              const char *string1, int length1,
  46                              const char *string2, int length2,
  47                              int start, int range, struct re_registers *regs,
  48                              int stop, int ret_len) internal_function;
  49 static int re_search_stub (struct re_pattern_buffer *bufp,
  50                            const char *string, int length, int start,
  51                            int range, int stop, struct re_registers *regs,
  52                            int ret_len) internal_function;
  53 static unsigned re_copy_regs (struct re_registers *regs, regmatch_t *pmatch,
  54                               int nregs, int regs_allocated) internal_function;
  55 static reg_errcode_t prune_impossible_nodes (re_match_context_t *mctx)
  56      internal_function;
  57 static int check_matching (re_match_context_t *mctx, int fl_longest_match,
  58                            int *p_match_first) internal_function;
  59 static int check_halt_state_context (const re_match_context_t *mctx,
  60                                      const re_dfastate_t *state, int idx)
  61      internal_function;
  62 static void update_regs (const re_dfa_t *dfa, regmatch_t *pmatch,
  63                          regmatch_t *prev_idx_match, int cur_node,
  64                          int cur_idx, int nmatch) internal_function;
  65 static reg_errcode_t push_fail_stack (struct re_fail_stack_t *fs,
  66                                       int str_idx, int dest_node, int nregs,
  67                                       regmatch_t *regs,
  68                                       re_node_set *eps_via_nodes)
  69      internal_function;
  70 static reg_errcode_t set_regs (const regex_t *preg,
  71                                const re_match_context_t *mctx,
  72                                size_t nmatch, regmatch_t *pmatch,
  73                                int fl_backtrack) internal_function;
  74 static reg_errcode_t free_fail_stack_return (struct re_fail_stack_t *fs)
  75      internal_function;
  76
  77 #ifdef RE_ENABLE_I18N
  78 static int sift_states_iter_mb (const re_match_context_t *mctx,
  79                                 re_sift_context_t *sctx,
  80                                 int node_idx, int str_idx, int max_str_idx)
  81      internal_function;
  82 #endif /* RE_ENABLE_I18N */
  83 static reg_errcode_t sift_states_backward (const re_match_context_t *mctx,
  84                                            re_sift_context_t *sctx)
  85      internal_function;
  86 static reg_errcode_t build_sifted_states (const re_match_context_t *mctx,
  87                                           re_sift_context_t *sctx, int str_idx,
  88                                           re_node_set *cur_dest)
  89      internal_function;
  90 static reg_errcode_t update_cur_sifted_state (const re_match_context_t *mctx,
  91                                               re_sift_context_t *sctx,
  92                                               int str_idx,
  93                                               re_node_set *dest_nodes)
  94      internal_function;
  95 static reg_errcode_t add_epsilon_src_nodes (const re_dfa_t *dfa,
  96                                             re_node_set *dest_nodes,
  97                                             const re_node_set *candidates)
  98      internal_function;
  99 static int check_dst_limits (const re_match_context_t *mctx,
 100                              re_node_set *limits,
 101                              int dst_node, int dst_idx, int src_node,
 102                              int src_idx) internal_function;
 103 static int check_dst_limits_calc_pos_1 (const re_match_context_t *mctx,
 104                                         int boundaries, int subexp_idx,
 105                                         int from_node, int bkref_idx)
 106      internal_function;
 107 static int check_dst_limits_calc_pos (const re_match_context_t *mctx,
 108                                       int limit, int subexp_idx,
 109                                       int node, int str_idx,
 110                                       int bkref_idx) internal_function;
 111 static reg_errcode_t check_subexp_limits (const re_dfa_t *dfa,
 112                                           re_node_set *dest_nodes,
 113                                           const re_node_set *candidates,
 114                                           re_node_set *limits,
 115                                           struct re_backref_cache_entry *bkref_ents,
 116                                           int str_idx) internal_function;
 117 static reg_errcode_t sift_states_bkref (const re_match_context_t *mctx,
 118                                         re_sift_context_t *sctx,
 119                                         int str_idx, const re_node_set *candidates)
 120      internal_function;
 121 static reg_errcode_t merge_state_array (const re_dfa_t *dfa,
 122                                         re_dfastate_t **dst,
 123                                         re_dfastate_t **src, int num)
 124      internal_function;
 125 static re_dfastate_t *find_recover_state (reg_errcode_t *err,
 126                                          re_match_context_t *mctx) internal_function;
 127 static re_dfastate_t *transit_state (reg_errcode_t *err,
 128                                      re_match_context_t *mctx,
 129                                      re_dfastate_t *state) internal_function;
 130 static re_dfastate_t *merge_state_with_log (reg_errcode_t *err,
 131                                             re_match_context_t *mctx,
 132                                             re_dfastate_t *next_state)
 133      internal_function;
 134 static reg_errcode_t check_subexp_matching_top (re_match_context_t *mctx,
 135                                                 re_node_set *cur_nodes,
 136                                                 int str_idx) internal_function;
 137 #if 0
 138 static re_dfastate_t *transit_state_sb (reg_errcode_t *err,
 139                                         re_match_context_t *mctx,
 140                                         re_dfastate_t *pstate)
 141      internal_function;
 142 #endif
 143 #ifdef RE_ENABLE_I18N
 144 static reg_errcode_t transit_state_mb (re_match_context_t *mctx,
 145                                        re_dfastate_t *pstate)
 146      internal_function;
 147 #endif /* RE_ENABLE_I18N */
 148 static reg_errcode_t transit_state_bkref (re_match_context_t *mctx,
 149                                           const re_node_set *nodes)
 150      internal_function;
 151 static reg_errcode_t get_subexp (re_match_context_t *mctx,
 152                                  int bkref_node, int bkref_str_idx)
 153      internal_function;
 154 static reg_errcode_t get_subexp_sub (re_match_context_t *mctx,
 155                                      const re_sub_match_top_t *sub_top,
 156                                      re_sub_match_last_t *sub_last,
 157                                      int bkref_node, int bkref_str)
 158      internal_function;
 159 static int find_subexp_node (const re_dfa_t *dfa, const re_node_set *nodes,
 160                              int subexp_idx, int type) internal_function;
 161 static reg_errcode_t check_arrival (re_match_context_t *mctx,
 162                                     state_array_t *path, int top_node,
 163                                     int top_str, int last_node, int last_str,
 164                                     int type) internal_function;
 165 static reg_errcode_t check_arrival_add_next_nodes (re_match_context_t *mctx,
 166                                                    int str_idx,
 167                                                    re_node_set *cur_nodes,
 168                                                    re_node_set *next_nodes)
 169      internal_function;
 170 static reg_errcode_t check_arrival_expand_ecl (const re_dfa_t *dfa,
 171                                                re_node_set *cur_nodes,
 172                                                int ex_subexp, int type)
 173      internal_function;
 174 static reg_errcode_t check_arrival_expand_ecl_sub (const re_dfa_t *dfa,
 175                                                    re_node_set *dst_nodes,
 176                                                    int target, int ex_subexp,
 177                                                    int type) internal_function;
 178 static reg_errcode_t expand_bkref_cache (re_match_context_t *mctx,
 179                                          re_node_set *cur_nodes, int cur_str,
 180                                          int subexp_num, int type)
 181      internal_function;
 182 static int build_trtable (const re_dfa_t *dfa,
 183                           re_dfastate_t *state) internal_function;
 184 #ifdef RE_ENABLE_I18N
 185 static int check_node_accept_bytes (const re_dfa_t *dfa, int node_idx,
 186                                     const re_string_t *input, int idx)
 187      internal_function;
 188 # ifdef _LIBC
 189 static unsigned int find_collation_sequence_value (const unsigned char *mbs,
 190                                                    size_t name_len)
 191      internal_function;
 192 # endif /* _LIBC */
 193 #endif /* RE_ENABLE_I18N */
 194 static int group_nodes_into_DFAstates (const re_dfa_t *dfa,
 195                                        const re_dfastate_t *state,
 196                                        re_node_set *states_node,
 197                                        bitset_t *states_ch) internal_function;
 198 static int check_node_accept (const re_match_context_t *mctx,
 199                               const re_token_t *node, int idx)
 200      internal_function;
 201 static reg_errcode_t extend_buffers (re_match_context_t *mctx)
 202      internal_function;
 203 \f
 204 /* Entry point for POSIX code.  */
 205
 206 /* regexec searches for a given pattern, specified by PREG, in the
 207    string STRING.
 208
 209    If NMATCH is zero or REG_NOSUB was set in the cflags argument to
 210    `regcomp', we ignore PMATCH.  Otherwise, we assume PMATCH has at
 211    least NMATCH elements, and we set them to the offsets of the
 212    corresponding matched substrings.
 213
 214    EFLAGS specifies `execution flags' which affect matching: if
 215    REG_NOTBOL is set, then ^ does not match at the beginning of the
 216    string; if REG_NOTEOL is set, then $ does not match at the end.
 217
 218    We return 0 if we find a match and REG_NOMATCH if not.  */
 219
 220 int
 221 regexec (preg, string, nmatch, pmatch, eflags)
 222     const regex_t *__restrict preg;
 223     const char *__restrict string;
 224     size_t nmatch;
 225     regmatch_t pmatch[];
 226     int eflags;
 227 {
 228   reg_errcode_t err;
 229   int start, length;
 230
 231   if (eflags & ~(REG_NOTBOL | REG_NOTEOL | REG_STARTEND))
 232     return REG_BADPAT;
 233
 234   if (eflags & REG_STARTEND)
 235     {
 236       start = pmatch[0].rm_so;
 237       length = pmatch[0].rm_eo;
 238     }
 239   else
 240     {
 241       start = 0;
 242       length = strlen (string);
 243     }
 244
 245   __libc_lock_lock (dfa->lock);
 246   if (preg->no_sub)
 247     err = re_search_internal (preg, string, length, start, length - start,
 248                               length, 0, NULL, eflags);
 249   else
 250     err = re_search_internal (preg, string, length, start, length - start,
 251                               length, nmatch, pmatch, eflags);
 252   __libc_lock_unlock (dfa->lock);
 253   return err != REG_NOERROR;
 254 }
 255
 256 #ifdef _LIBC
 257 # include <shlib-compat.h>
 258 versioned_symbol (libc, __regexec, regexec, GLIBC_2_3_4);
 259
 260 # if SHLIB_COMPAT (libc, GLIBC_2_0, GLIBC_2_3_4)
 261 __typeof__ (__regexec) __compat_regexec;
 262
 263 int
 264 attribute_compat_text_section
 265 __compat_regexec (const regex_t *__restrict preg,
 266                   const char *__restrict string, size_t nmatch,
 267                   regmatch_t pmatch[], int eflags)
 268 {
 269   return regexec (preg, string, nmatch, pmatch,
 270                   eflags & (REG_NOTBOL | REG_NOTEOL));
 271 }
 272 compat_symbol (libc, __compat_regexec, regexec, GLIBC_2_0);
 273 # endif
 274 #endif
 275
 276 /* Entry points for GNU code.  */
 277
 278 /* re_match, re_search, re_match_2, re_search_2
 279
 280    The former two functions operate on STRING with length LENGTH,
 281    while the later two operate on concatenation of STRING1 and STRING2
 282    with lengths LENGTH1 and LENGTH2, respectively.
 283
 284    re_match() matches the compiled pattern in BUFP against the string,
 285    starting at index START.
 286
 287    re_search() first tries matching at index START, then it tries to match
 288    starting from index START + 1, and so on.  The last start position tried
 289    is START + RANGE.  (Thus RANGE = 0 forces re_search to operate the same
 290    way as re_match().)
 291
 292    The parameter STOP of re_{match,search}_2 specifies that no match exceeding
 293    the first STOP characters of the concatenation of the strings should be
 294    concerned.
 295
 296    If REGS is not NULL, and BUFP->no_sub is not set, the offsets of the match
 297    and all groups is stroed in REGS.  (For the "_2" variants, the offsets are
 298    computed relative to the concatenation, not relative to the individual
 299    strings.)
 300
 301    On success, re_match* functions return the length of the match, re_search*
 302    return the position of the start of the match.  Return value -1 means no
 303    match was found and -2 indicates an internal error.  */
 304
 305 int
 306 re_match (bufp, string, length, start, regs)
 307     struct re_pattern_buffer *bufp;
 308     const char *string;
 309     int length, start;
 310     struct re_registers *regs;
 311 {
 312   return re_search_stub (bufp, string, length, start, 0, length, regs, 1);
 313 }
 314 #ifdef _LIBC
 315 weak_alias (__re_match, re_match)
 316 #endif
 317
 318 int
 319 re_search (bufp, string, length, start, range, regs)
 320     struct re_pattern_buffer *bufp;
 321     const char *string;
 322     int length, start, range;
 323     struct re_registers *regs;
 324 {
 325   return re_search_stub (bufp, string, length, start, range, length, regs, 0);
 326 }
 327 #ifdef _LIBC
 328 weak_alias (__re_search, re_search)
 329 #endif
 330
 331 int
 332 re_match_2 (bufp, string1, length1, string2, length2, start, regs, stop)
 333     struct re_pattern_buffer *bufp;
 334     const char *string1, *string2;
 335     int length1, length2, start, stop;
 336     struct re_registers *regs;
 337 {
 338   return re_search_2_stub (bufp, string1, length1, string2, length2,
 339                            start, 0, regs, stop, 1);
 340 }
 341 #ifdef _LIBC
 342 weak_alias (__re_match_2, re_match_2)
 343 #endif
 344
 345 int
 346 re_search_2 (bufp, string1, length1, string2, length2, start, range, regs, stop)
 347     struct re_pattern_buffer *bufp;
 348     const char *string1, *string2;
 349     int length1, length2, start, range, stop;
 350     struct re_registers *regs;
 351 {
 352   return re_search_2_stub (bufp, string1, length1, string2, length2,
 353                            start, range, regs, stop, 0);
 354 }
 355 #ifdef _LIBC
 356 weak_alias (__re_search_2, re_search_2)
 357 #endif
 358
 359 static int
 360 re_search_2_stub (bufp, string1, length1, string2, length2, start, range, regs,
 361                   stop, ret_len)
 362     struct re_pattern_buffer *bufp;
 363     const char *string1, *string2;
 364     int length1, length2, start, range, stop, ret_len;
 365     struct re_registers *regs;
 366 {
 367   const char *str;
 368   int rval;
 369   int len = length1 + length2;
 370   int free_str = 0;
 371
 372   if (BE (length1 < 0 || length2 < 0 || stop < 0, 0))
 373     return -2;
 374
 375   /* Concatenate the strings.  */
 376   if (length2 > 0)
 377     if (length1 > 0)
 378       {
 379         char *s = re_malloc (char, len);
 380
 381         if (BE (s == NULL, 0))
 382           return -2;
 383         memcpy (s, string1, length1);
 384         memcpy (s + length1, string2, length2);
 385         str = s;
 386         free_str = 1;
 387       }
 388     else
 389       str = string2;
 390   else
 391     str = string1;
 392
 393   rval = re_search_stub (bufp, str, len, start, range, stop, regs, ret_len);
 394   if (free_str)
 395     re_free ((char *) str);
 396   return rval;
 397 }
 398
 399 /* The parameters have the same meaning as those of re_search.
 400    Additional parameters:
 401    If RET_LEN is nonzero the length of the match is returned (re_match style);
 402    otherwise the position of the match is returned.  */
 403
 404 static int
 405 re_search_stub (bufp, string, length, start, range, stop, regs, ret_len)
 406     struct re_pattern_buffer *bufp;
 407     const char *string;
 408     int length, start, range, stop, ret_len;
 409     struct re_registers *regs;
 410 {
 411   reg_errcode_t result;
 412   regmatch_t *pmatch;
 413   int nregs, rval;
 414   int eflags = 0;
 415
 416   /* Check for out-of-range.  */
 417   if (BE (start < 0 || start > length, 0))
 418     return -1;
 419   if (BE (start + range > length, 0))
 420     range = length - start;
 421   else if (BE (start + range < 0, 0))
 422     range = -start;
 423
 424   __libc_lock_lock (dfa->lock);
 425
 426   eflags |= (bufp->not_bol) ? REG_NOTBOL : 0;
 427   eflags |= (bufp->not_eol) ? REG_NOTEOL : 0;
 428
 429   /* Compile fastmap if we haven't yet.  */
 430   if (range > 0 && bufp->fastmap != NULL && !bufp->fastmap_accurate)
 431     re_compile_fastmap (bufp);
 432
 433   if (BE (bufp->no_sub, 0))
 434     regs = NULL;
 435
 436   /* We need at least 1 register.  */
 437   if (regs == NULL)
 438     nregs = 1;
 439   else if (BE (bufp->regs_allocated == REGS_FIXED &&
 440                regs->num_regs < bufp->re_nsub + 1, 0))
 441     {
 442       nregs = regs->num_regs;
 443       if (BE (nregs < 1, 0))
 444         {
 445           /* Nothing can be copied to regs.  */
 446           regs = NULL;
 447           nregs = 1;
 448         }
 449     }
 450   else
 451     nregs = bufp->re_nsub + 1;
 452   pmatch = re_malloc (regmatch_t, nregs);
 453   if (BE (pmatch == NULL, 0))
 454     {
 455       rval = -2;
 456       goto out;
 457     }
 458
 459   result = re_search_internal (bufp, string, length, start, range, stop,
 460                                nregs, pmatch, eflags);
 461
 462   rval = 0;
 463
 464   /* I hope we needn't fill ther regs with -1's when no match was found.  */
 465   if (result != REG_NOERROR)
 466     rval = -1;
 467   else if (regs != NULL)
 468     {
 469       /* If caller wants register contents data back, copy them.  */
 470       bufp->regs_allocated = re_copy_regs (regs, pmatch, nregs,
 471                                            bufp->regs_allocated);
 472       if (BE (bufp->regs_allocated == REGS_UNALLOCATED, 0))
 473         rval = -2;
 474     }
 475
 476   if (BE (rval == 0, 1))
 477     {
 478       if (ret_len)
 479         {
 480           assert (pmatch[0].rm_so == start);
 481           rval = pmatch[0].rm_eo - start;
 482         }
 483       else
 484         rval = pmatch[0].rm_so;
 485     }
 486   re_free (pmatch);
 487  out:
 488   __libc_lock_unlock (dfa->lock);
 489   return rval;
 490 }
 491
 492 static unsigned
 493 re_copy_regs (regs, pmatch, nregs, regs_allocated)
 494     struct re_registers *regs;
 495     regmatch_t *pmatch;
 496     int nregs, regs_allocated;
 497 {
 498   int rval = REGS_REALLOCATE;
 499   int i;
 500   int need_regs = nregs + 1;
 501   /* We need one extra element beyond `num_regs' for the `-1' marker GNU code
 502      uses.  */
 503
 504   /* Have the register data arrays been allocated?  */
 505   if (regs_allocated == REGS_UNALLOCATED)
 506     { /* No.  So allocate them with malloc.  */
 507       regs->start = re_malloc (regoff_t, need_regs);
 508       if (BE (regs->start == NULL, 0))
 509         return REGS_UNALLOCATED;
 510       regs->end = re_malloc (regoff_t, need_regs);
 511       if (BE (regs->end == NULL, 0))
 512         {
 513           re_free (regs->start);
 514           return REGS_UNALLOCATED;
 515         }
 516       regs->num_regs = need_regs;
 517     }
 518   else if (regs_allocated == REGS_REALLOCATE)
 519     { /* Yes.  If we need more elements than were already
 520          allocated, reallocate them.  If we need fewer, just
 521          leave it alone.  */
 522       if (BE (need_regs > regs->num_regs, 0))
 523         {
 524           regoff_t *new_start = re_realloc (regs->start, regoff_t, need_regs);
 525           regoff_t *new_end;
 526           if (BE (new_start == NULL, 0))
 527             return REGS_UNALLOCATED;
 528           new_end = re_realloc (regs->end, regoff_t, need_regs);
 529           if (BE (new_end == NULL, 0))
 530             {
 531               re_free (new_start);
 532               return REGS_UNALLOCATED;
 533             }
 534           regs->start = new_start;
 535           regs->end = new_end;
 536           regs->num_regs = need_regs;
 537         }
 538     }
 539   else
 540     {
 541       assert (regs_allocated == REGS_FIXED);
 542       /* This function may not be called with REGS_FIXED and nregs too big.  */
 543       assert (regs->num_regs >= nregs);
 544       rval = REGS_FIXED;
 545     }
 546
 547   /* Copy the regs.  */
 548   for (i = 0; i < nregs; ++i)
 549     {
 550       regs->start[i] = pmatch[i].rm_so;
 551       regs->end[i] = pmatch[i].rm_eo;
 552     }
 553   for ( ; i < regs->num_regs; ++i)
 554     regs->start[i] = regs->end[i] = -1;
 555
 556   return rval;
 557 }
 558
 559 /* Set REGS to hold NUM_REGS registers, storing them in STARTS and
 560    ENDS.  Subsequent matches using PATTERN_BUFFER and REGS will use
 561    this memory for recording register information.  STARTS and ENDS
 562    must be allocated using the malloc library routine, and must each
 563    be at least NUM_REGS * sizeof (regoff_t) bytes long.
 564
 565    If NUM_REGS == 0, then subsequent matches should allocate their own
 566    register data.
 567
 568    Unless this function is called, the first search or match using
 569    PATTERN_BUFFER will allocate its own register data, without
 570    freeing the old data.  */
 571
 572 void
 573 re_set_registers (bufp, regs, num_regs, starts, ends)
 574     struct re_pattern_buffer *bufp;
 575     struct re_registers *regs;
 576     unsigned num_regs;
 577     regoff_t *starts, *ends;
 578 {
 579   if (num_regs)
 580     {
 581       bufp->regs_allocated = REGS_REALLOCATE;
 582       regs->num_regs = num_regs;
 583       regs->start = starts;
 584       regs->end = ends;
 585     }
 586   else
 587     {
 588       bufp->regs_allocated = REGS_UNALLOCATED;
 589       regs->num_regs = 0;
 590       regs->start = regs->end = (regoff_t *) 0;
 591     }
 592 }
 593 #ifdef _LIBC
 594 weak_alias (__re_set_registers, re_set_registers)
 595 #endif
 596 \f
 597 /* Entry points compatible with 4.2 BSD regex library.  We don't define
 598    them unless specifically requested.  */
 599
 600 #if defined _REGEX_RE_COMP || defined _LIBC
 601 int
 602 # ifdef _LIBC
 603 weak_function
 604 # endif
 605 re_exec (s)
 606      const char *s;
 607 {
 608   return 0 == regexec (&re_comp_buf, s, 0, NULL, 0);
 609 }
 610 #endif /* _REGEX_RE_COMP */
 611 \f
 612 /* Internal entry point.  */
 613
 614 /* Searches for a compiled pattern PREG in the string STRING, whose
 615    length is LENGTH.  NMATCH, PMATCH, and EFLAGS have the same
 616    mingings with regexec.  START, and RANGE have the same meanings
 617    with re_search.
 618    Return REG_NOERROR if we find a match, and REG_NOMATCH if not,
 619    otherwise return the error code.
 620    Note: We assume front end functions already check ranges.
 621    (START + RANGE >= 0 && START + RANGE <= LENGTH)  */
 622
 623 static reg_errcode_t
 624 re_search_internal (preg, string, length, start, range, stop, nmatch, pmatch,
 625                     eflags)
 626     const regex_t *preg;
 627     const char *string;
 628     int length, start, range, stop, eflags;
 629     size_t nmatch;
 630     regmatch_t pmatch[];
 631 {
 632   reg_errcode_t err;
 633   const re_dfa_t *dfa = (const re_dfa_t *) preg->buffer;
 634   int left_lim, right_lim, incr;
 635   int fl_longest_match, match_first, match_kind, match_last = -1;
 636   int extra_nmatch;
 637   int sb, ch;
 638 #if defined _LIBC || (defined __STDC_VERSION__ && __STDC_VERSION__ >= 199901L)
 639   re_match_context_t mctx = { .dfa = dfa };
 640 #else
 641   re_match_context_t mctx;
 642 #endif
 643   char *fastmap = (preg->fastmap != NULL && preg->fastmap_accurate
 644                    && range && !preg->can_be_null) ? preg->fastmap : NULL;
 645   RE_TRANSLATE_TYPE t = preg->translate;
 646
 647 #if !(defined _LIBC || (defined __STDC_VERSION__ && __STDC_VERSION__ >= 199901L))
 648   memset (&mctx, '\0', sizeof (re_match_context_t));
 649   mctx.dfa = dfa;
 650 #endif
 651
 652   extra_nmatch = (nmatch > preg->re_nsub) ? nmatch - (preg->re_nsub + 1) : 0;
 653   nmatch -= extra_nmatch;
 654
 655   /* Check if the DFA haven't been compiled.  */
 656   if (BE (preg->used == 0 || dfa->init_state == NULL
 657           || dfa->init_state_word == NULL || dfa->init_state_nl == NULL
 658           || dfa->init_state_begbuf == NULL, 0))
 659     return REG_NOMATCH;
 660
 661 #ifdef DEBUG
 662   /* We assume front-end functions already check them.  */
 663   assert (start + range >= 0 && start + range <= length);
 664 #endif
 665
 666   /* If initial states with non-begbuf contexts have no elements,
 667      the regex must be anchored.  If preg->newline_anchor is set,
 668      we'll never use init_state_nl, so do not check it.  */
 669   if (dfa->init_state->nodes.nelem == 0
 670       && dfa->init_state_word->nodes.nelem == 0
 671       && (dfa->init_state_nl->nodes.nelem == 0
 672           || !preg->newline_anchor))
 673     {
 674       if (start != 0 && start + range != 0)
 675         return REG_NOMATCH;
 676       start = range = 0;
 677     }
 678
 679   /* We must check the longest matching, if nmatch > 0.  */
 680   fl_longest_match = (nmatch != 0 || dfa->nbackref);
 681
 682   err = re_string_allocate (&mctx.input, string, length, dfa->nodes_len + 1,
 683                             preg->translate, preg->syntax & RE_ICASE, dfa);
 684   if (BE (err != REG_NOERROR, 0))
 685     goto free_return;
 686   mctx.input.stop = stop;
 687   mctx.input.raw_stop = stop;
 688   mctx.input.newline_anchor = preg->newline_anchor;
 689
 690   err = match_ctx_init (&mctx, eflags, dfa->nbackref * 2);
 691   if (BE (err != REG_NOERROR, 0))
 692     goto free_return;
 693
 694   /* We will log all the DFA states through which the dfa pass,
 695      if nmatch > 1, or this dfa has "multibyte node", which is a
 696      back-reference or a node which can accept multibyte character or
 697      multi character collating element.  */
 698   if (nmatch > 1 || dfa->has_mb_node)
 699     {
 700       /* Avoid overflow.  */
 701       if (BE (SIZE_MAX / sizeof (re_dfastate_t *) <= mctx.input.bufs_len, 0))
 702         {
 703           err = REG_ESPACE;
 704           goto free_return;
 705         }
 706
 707       mctx.state_log = re_malloc (re_dfastate_t *, mctx.input.bufs_len + 1);
 708       if (BE (mctx.state_log == NULL, 0))
 709         {
 710           err = REG_ESPACE;
 711           goto free_return;
 712         }
 713     }
 714   else
 715     mctx.state_log = NULL;
 716
 717   match_first = start;
 718   mctx.input.tip_context = (eflags & REG_NOTBOL) ? CONTEXT_BEGBUF
 719                            : CONTEXT_NEWLINE | CONTEXT_BEGBUF;
 720
 721   /* Check incrementally whether of not the input string match.  */
 722   incr = (range < 0) ? -1 : 1;
 723   left_lim = (range < 0) ? start + range : start;
 724   right_lim = (range < 0) ? start : start + range;
 725   sb = dfa->mb_cur_max == 1;
 726   match_kind =
 727     (fastmap
 728      ? ((sb || !(preg->syntax & RE_ICASE || t) ? 4 : 0)
 729         | (range >= 0 ? 2 : 0)
 730         | (t != NULL ? 1 : 0))
 731      : 8);
 732
 733   for (;; match_first += incr)
 734     {
 735       err = REG_NOMATCH;
 736       if (match_first < left_lim || right_lim < match_first)
 737         goto free_return;
 738
 739       /* Advance as rapidly as possible through the string, until we
 740          find a plausible place to start matching.  This may be done
 741          with varying efficiency, so there are various possibilities:
 742          only the most common of them are specialized, in order to
 743          save on code size.  We use a switch statement for speed.  */
 744       switch (match_kind)
 745         {
 746         case 8:
 747           /* No fastmap.  */
 748           break;
 749
 750         case 7:
 751           /* Fastmap with single-byte translation, match forward.  */
 752           while (BE (match_first < right_lim, 1)
 753                  && !fastmap[t[(unsigned char) string[match_first]]])
 754             ++match_first;
 755           goto forward_match_found_start_or_reached_end;
 756
 757         case 6:
 758           /* Fastmap without translation, match forward.  */
 759           while (BE (match_first < right_lim, 1)
 760                  && !fastmap[(unsigned char) string[match_first]])
 761             ++match_first;
 762
 763         forward_match_found_start_or_reached_end:
 764           if (BE (match_first == right_lim, 0))
 765             {
 766               ch = match_first >= length
 767                        ? 0 : (unsigned char) string[match_first];
 768               if (!fastmap[t ? t[ch] : ch])
 769                 goto free_return;
 770             }
 771           break;
 772
 773         case 4:
 774         case 5:
 775           /* Fastmap without multi-byte translation, match backwards.  */
 776           while (match_first >= left_lim)
 777             {
 778               ch = match_first >= length
 779                        ? 0 : (unsigned char) string[match_first];
 780               if (fastmap[t ? t[ch] : ch])
 781                 break;
 782               --match_first;
 783             }
 784           if (match_first < left_lim)
 785             goto free_return;
 786           break;
 787
 788         default:
 789           /* In this case, we can't determine easily the current byte,
 790              since it might be a component byte of a multibyte
 791              character.  Then we use the constructed buffer instead.  */
 792           for (;;)
 793             {
 794               /* If MATCH_FIRST is out of the valid range, reconstruct the
 795                  buffers.  */
 796               unsigned int offset = match_first - mctx.input.raw_mbs_idx;
 797               if (BE (offset >= (unsigned int) mctx.input.valid_raw_len, 0))
 798                 {
 799                   err = re_string_reconstruct (&mctx.input, match_first,
 800                                                eflags);
 801                   if (BE (err != REG_NOERROR, 0))
 802                     goto free_return;
 803
 804                   offset = match_first - mctx.input.raw_mbs_idx;
 805                 }
 806               /* If MATCH_FIRST is out of the buffer, leave it as '\0'.
 807                  Note that MATCH_FIRST must not be smaller than 0.  */
 808               ch = (match_first >= length
 809                     ? 0 : re_string_byte_at (&mctx.input, offset));
 810               if (fastmap[ch])
 811                 break;
 812               match_first += incr;
 813               if (match_first < left_lim || match_first > right_lim)
 814                 {
 815                   err = REG_NOMATCH;
 816                   goto free_return;
 817                 }
 818             }
 819           break;
 820         }
 821
 822       /* Reconstruct the buffers so that the matcher can assume that
 823          the matching starts from the beginning of the buffer.  */
 824       err = re_string_reconstruct (&mctx.input, match_first, eflags);
 825       if (BE (err != REG_NOERROR, 0))
 826         goto free_return;
 827
 828 #ifdef RE_ENABLE_I18N
 829      /* Don't consider this char as a possible match start if it part,
 830         yet isn't the head, of a multibyte character.  */
 831       if (!sb && !re_string_first_byte (&mctx.input, 0))
 832         continue;
 833 #endif
 834
 835       /* It seems to be appropriate one, then use the matcher.  */
 836       /* We assume that the matching starts from 0.  */
 837       mctx.state_log_top = mctx.nbkref_ents = mctx.max_mb_elem_len = 0;
 838       match_last = check_matching (&mctx, fl_longest_match,
 839                                    range >= 0 ? &match_first : NULL);
 840       if (match_last != -1)
 841         {
 842           if (BE (match_last == -2, 0))
 843             {
 844               err = REG_ESPACE;
 845               goto free_return;
 846             }
 847           else
 848             {
 849               mctx.match_last = match_last;
 850               if ((!preg->no_sub && nmatch > 1) || dfa->nbackref)
 851                 {
 852                   re_dfastate_t *pstate = mctx.state_log[match_last];
 853                   mctx.last_node = check_halt_state_context (&mctx, pstate,
 854                                                              match_last);
 855                 }
 856               if ((!preg->no_sub && nmatch > 1 && dfa->has_plural_match)
 857                   || dfa->nbackref)
 858                 {
 859                   err = prune_impossible_nodes (&mctx);
 860                   if (err == REG_NOERROR)
 861                     break;
 862                   if (BE (err != REG_NOMATCH, 0))
 863                     goto free_return;
 864                   match_last = -1;
 865                 }
 866               else
 867                 break; /* We found a match.  */
 868             }
 869         }
 870
 871       match_ctx_clean (&mctx);
 872     }
 873
 874 #ifdef DEBUG
 875   assert (match_last != -1);
 876   assert (err == REG_NOERROR);
 877 #endif
 878
 879   /* Set pmatch[] if we need.  */
 880   if (nmatch > 0)
 881     {
 882       int reg_idx;
 883
 884       /* Initialize registers.  */
 885       for (reg_idx = 1; reg_idx < nmatch; ++reg_idx)
 886         pmatch[reg_idx].rm_so = pmatch[reg_idx].rm_eo = -1;
 887
 888       /* Set the points where matching start/end.  */
 889       pmatch[0].rm_so = 0;
 890       pmatch[0].rm_eo = mctx.match_last;
 891
 892       if (!preg->no_sub && nmatch > 1)
 893         {
 894           err = set_regs (preg, &mctx, nmatch, pmatch,
 895                           dfa->has_plural_match && dfa->nbackref > 0);
 896           if (BE (err != REG_NOERROR, 0))
 897             goto free_return;
 898         }
 899
 900       /* At last, add the offset to the each registers, since we slided
 901          the buffers so that we could assume that the matching starts
 902          from 0.  */
 903       for (reg_idx = 0; reg_idx < nmatch; ++reg_idx)
 904         if (pmatch[reg_idx].rm_so != -1)
 905           {
 906 #ifdef RE_ENABLE_I18N
 907             if (BE (mctx.input.offsets_needed != 0, 0))
 908               {
 909                 pmatch[reg_idx].rm_so =
 910                   (pmatch[reg_idx].rm_so == mctx.input.valid_len
 911                    ? mctx.input.valid_raw_len
 912                    : mctx.input.offsets[pmatch[reg_idx].rm_so]);
 913                 pmatch[reg_idx].rm_eo =
 914                   (pmatch[reg_idx].rm_eo == mctx.input.valid_len
 915                    ? mctx.input.valid_raw_len
 916                    : mctx.input.offsets[pmatch[reg_idx].rm_eo]);
 917               }
 918 #else
 919             assert (mctx.input.offsets_needed == 0);
 920 #endif
 921             pmatch[reg_idx].rm_so += match_first;
 922             pmatch[reg_idx].rm_eo += match_first;
 923           }
 924       for (reg_idx = 0; reg_idx < extra_nmatch; ++reg_idx)
 925         {
 926           pmatch[nmatch + reg_idx].rm_so = -1;
 927           pmatch[nmatch + reg_idx].rm_eo = -1;
 928         }
 929
 930       if (dfa->subexp_map)
 931         for (reg_idx = 0; reg_idx + 1 < nmatch; reg_idx++)
 932           if (dfa->subexp_map[reg_idx] != reg_idx)
 933             {
 934               pmatch[reg_idx + 1].rm_so
 935                 = pmatch[dfa->subexp_map[reg_idx] + 1].rm_so;
 936               pmatch[reg_idx + 1].rm_eo
 937                 = pmatch[dfa->subexp_map[reg_idx] + 1].rm_eo;
 938             }
 939     }
 940
 941  free_return:
 942   re_free (mctx.state_log);
 943   if (dfa->nbackref)
 944     match_ctx_free (&mctx);
 945   re_string_destruct (&mctx.input);
 946   return err;
 947 }
 948
 949 static reg_errcode_t
 950 prune_impossible_nodes (mctx)
 951      re_match_context_t *mctx;
 952 {
 953   const re_dfa_t *const dfa = mctx->dfa;
 954   int halt_node, match_last;
 955   reg_errcode_t ret;
 956   re_dfastate_t **sifted_states;
 957   re_dfastate_t **lim_states = NULL;
 958   re_sift_context_t sctx;
 959 #ifdef DEBUG
 960   assert (mctx->state_log != NULL);
 961 #endif
 962   match_last = mctx->match_last;
 963   halt_node = mctx->last_node;
 964
 965   /* Avoid overflow.  */
 966   if (BE (SIZE_MAX / sizeof (re_dfastate_t *) <= match_last, 0))
 967     return REG_ESPACE;
 968
 969   sifted_states = re_malloc (re_dfastate_t *, match_last + 1);
 970   if (BE (sifted_states == NULL, 0))
 971     {
 972       ret = REG_ESPACE;
 973       goto free_return;
 974     }
 975   if (dfa->nbackref)
 976     {
 977       lim_states = re_malloc (re_dfastate_t *, match_last + 1);
 978       if (BE (lim_states == NULL, 0))
 979         {
 980           ret = REG_ESPACE;
 981           goto free_return;
 982         }
 983       while (1)
 984         {
 985           memset (lim_states, '\0',
 986                   sizeof (re_dfastate_t *) * (match_last + 1));
 987           sift_ctx_init (&sctx, sifted_states, lim_states, halt_node,
 988                          match_last);
 989           ret = sift_states_backward (mctx, &sctx);
 990           re_node_set_free (&sctx.limits);
 991           if (BE (ret != REG_NOERROR, 0))
 992               goto free_return;
 993           if (sifted_states[0] != NULL || lim_states[0] != NULL)
 994             break;
 995           do
 996             {
 997               --match_last;
 998               if (match_last < 0)
 999                 {
1000                   ret = REG_NOMATCH;
1001                   goto free_return;
1002                 }
1003             } while (mctx->state_log[match_last] == NULL
1004                      || !mctx->state_log[match_last]->halt);
1005           halt_node = check_halt_state_context (mctx,
1006                                                 mctx->state_log[match_last],
1007                                                 match_last);
1008         }
1009       ret = merge_state_array (dfa, sifted_states, lim_states,
1010                                match_last + 1);
1011       re_free (lim_states);
1012       lim_states = NULL;
1013       if (BE (ret != REG_NOERROR, 0))
1014         goto free_return;
1015     }
1016   else
1017     {
1018       sift_ctx_init (&sctx, sifted_states, lim_states, halt_node, match_last);
1019       ret = sift_states_backward (mctx, &sctx);
1020       re_node_set_free (&sctx.limits);
1021       if (BE (ret != REG_NOERROR, 0))
1022         goto free_return;
1023       if (sifted_states[0] == NULL)
1024         {
1025           ret = REG_NOMATCH;
1026           goto free_return;
1027         }
1028     }
1029   re_free (mctx->state_log);
1030   mctx->state_log = sifted_states;
1031   sifted_states = NULL;
1032   mctx->last_node = halt_node;
1033   mctx->match_last = match_last;
1034   ret = REG_NOERROR;
1035  free_return:
1036   re_free (sifted_states);
1037   re_free (lim_states);
1038   return ret;
1039 }
1040
1041 /* Acquire an initial state and return it.
1042    We must select appropriate initial state depending on the context,
1043    since initial states may have constraints like "\<", "^", etc..  */
1044
1045 static inline re_dfastate_t *
1046 __attribute ((always_inline)) internal_function
1047 acquire_init_state_context (reg_errcode_t *err, const re_match_context_t *mctx,
1048                             int idx)
1049 {
1050   const re_dfa_t *const dfa = mctx->dfa;
1051   if (dfa->init_state->has_constraint)
1052     {
1053       unsigned int context;
1054       context = re_string_context_at (&mctx->input, idx - 1, mctx->eflags);
1055       if (IS_WORD_CONTEXT (context))
1056         return dfa->init_state_word;
1057       else if (IS_ORDINARY_CONTEXT (context))
1058         return dfa->init_state;
1059       else if (IS_BEGBUF_CONTEXT (context) && IS_NEWLINE_CONTEXT (context))
1060         return dfa->init_state_begbuf;
1061       else if (IS_NEWLINE_CONTEXT (context))
1062         return dfa->init_state_nl;
1063       else if (IS_BEGBUF_CONTEXT (context))
1064         {
1065           /* It is relatively rare case, then calculate on demand.  */
1066           return re_acquire_state_context (err, dfa,
1067                                            dfa->init_state->entrance_nodes,
1068                                            context);
1069         }
1070       else
1071         /* Must not happen?  */
1072         return dfa->init_state;
1073     }
1074   else
1075     return dfa->init_state;
1076 }
1077
1078 /* Check whether the regular expression match input string INPUT or not,
1079    and return the index where the matching end, return -1 if not match,
1080    or return -2 in case of an error.
1081    FL_LONGEST_MATCH means we want the POSIX longest matching.
1082    If P_MATCH_FIRST is not NULL, and the match fails, it is set to the
1083    next place where we may want to try matching.
1084    Note that the matcher assume that the maching starts from the current
1085    index of the buffer.  */
1086
1087 static int
1088 internal_function
1089 check_matching (re_match_context_t *mctx, int fl_longest_match,
1090                 int *p_match_first)
1091 {
1092   const re_dfa_t *const dfa = mctx->dfa;
1093   reg_errcode_t err;
1094   int match = 0;
1095   int match_last = -1;
1096   int cur_str_idx = re_string_cur_idx (&mctx->input);
1097   re_dfastate_t *cur_state;
1098   int at_init_state = p_match_first != NULL;
1099   int next_start_idx = cur_str_idx;
1100
1101   err = REG_NOERROR;
1102   cur_state = acquire_init_state_context (&err, mctx, cur_str_idx);
1103   /* An initial state must not be NULL (invalid).  */
1104   if (BE (cur_state == NULL, 0))
1105     {
1106       assert (err == REG_ESPACE);
1107       return -2;
1108     }
1109
1110   if (mctx->state_log != NULL)
1111     {
1112       mctx->state_log[cur_str_idx] = cur_state;
1113
1114       /* Check OP_OPEN_SUBEXP in the initial state in case that we use them
1115          later.  E.g. Processing back references.  */
1116       if (BE (dfa->nbackref, 0))
1117         {
1118           at_init_state = 0;
1119           err = check_subexp_matching_top (mctx, &cur_state->nodes, 0);
1120           if (BE (err != REG_NOERROR, 0))
1121             return err;
1122
1123           if (cur_state->has_backref)
1124             {
1125               err = transit_state_bkref (mctx, &cur_state->nodes);
1126               if (BE (err != REG_NOERROR, 0))
1127                 return err;
1128             }
1129         }
1130     }
1131
1132   /* If the RE accepts NULL string.  */
1133   if (BE (cur_state->halt, 0))
1134     {
1135       if (!cur_state->has_constraint
1136           || check_halt_state_context (mctx, cur_state, cur_str_idx))
1137         {
1138           if (!fl_longest_match)
1139             return cur_str_idx;
1140           else
1141             {
1142               match_last = cur_str_idx;
1143               match = 1;
1144             }
1145         }
1146     }
1147
1148   while (!re_string_eoi (&mctx->input))
1149     {
1150       re_dfastate_t *old_state = cur_state;
1151       int next_char_idx = re_string_cur_idx (&mctx->input) + 1;
1152
1153       if (BE (next_char_idx >= mctx->input.bufs_len, 0)
1154           || (BE (next_char_idx >= mctx->input.valid_len, 0)
1155               && mctx->input.valid_len < mctx->input.len))
1156         {
1157           err = extend_buffers (mctx);
1158           if (BE (err != REG_NOERROR, 0))
1159             {
1160               assert (err == REG_ESPACE);
1161               return -2;
1162             }
1163         }
1164
1165       cur_state = transit_state (&err, mctx, cur_state);
1166       if (mctx->state_log != NULL)
1167         cur_state = merge_state_with_log (&err, mctx, cur_state);
1168
1169       if (cur_state == NULL)
1170         {
1171           /* Reached the invalid state or an error.  Try to recover a valid
1172              state using the state log, if available and if we have not
1173              already found a valid (even if not the longest) match.  */
1174           if (BE (err != REG_NOERROR, 0))
1175             return -2;
1176
1177           if (mctx->state_log == NULL
1178               || (match && !fl_longest_match)
1179               || (cur_state = find_recover_state (&err, mctx)) == NULL)
1180             break;
1181         }
1182
1183       if (BE (at_init_state, 0))
1184         {
1185           if (old_state == cur_state)
1186             next_start_idx = next_char_idx;
1187           else
1188             at_init_state = 0;
1189         }
1190
1191       if (cur_state->halt)
1192         {
1193           /* Reached a halt state.
1194              Check the halt state can satisfy the current context.  */
1195           if (!cur_state->has_constraint
1196               || check_halt_state_context (mctx, cur_state,
1197                                            re_string_cur_idx (&mctx->input)))
1198             {
1199               /* We found an appropriate halt state.  */
1200               match_last = re_string_cur_idx (&mctx->input);
1201               match = 1;
1202
1203               /* We found a match, do not modify match_first below.  */
1204               p_match_first = NULL;
1205               if (!fl_longest_match)
1206                 break;
1207             }
1208         }
1209     }
1210
1211   if (p_match_first)
1212     *p_match_first += next_start_idx;
1213
1214   return match_last;
1215 }
1216
1217 /* Check NODE match the current context.  */
1218
1219 static int
1220 internal_function
1221 check_halt_node_context (const re_dfa_t *dfa, int node, unsigned int context)
1222 {
1223   re_token_type_t type = dfa->nodes[node].type;
1224   unsigned int constraint = dfa->nodes[node].constraint;
1225   if (type != END_OF_RE)
1226     return 0;
1227   if (!constraint)
1228     return 1;
1229   if (NOT_SATISFY_NEXT_CONSTRAINT (constraint, context))
1230     return 0;
1231   return 1;
1232 }
1233
1234 /* Check the halt state STATE match the current context.
1235    Return 0 if not match, if the node, STATE has, is a halt node and
1236    match the context, return the node.  */
1237
1238 static int
1239 internal_function
1240 check_halt_state_context (const re_match_context_t *mctx,
1241                           const re_dfastate_t *state, int idx)
1242 {
1243   int i;
1244   unsigned int context;
1245 #ifdef DEBUG
1246   assert (state->halt);
1247 #endif
1248   context = re_string_context_at (&mctx->input, idx, mctx->eflags);
1249   for (i = 0; i < state->nodes.nelem; ++i)
1250     if (check_halt_node_context (mctx->dfa, state->nodes.elems[i], context))
1251       return state->nodes.elems[i];
1252   return 0;
1253 }
1254
1255 /* Compute the next node to which "NFA" transit from NODE("NFA" is a NFA
1256    corresponding to the DFA).
1257    Return the destination node, and update EPS_VIA_NODES, return -1 in case
1258    of errors.  */
1259
1260 static int
1261 internal_function
1262 proceed_next_node (const re_match_context_t *mctx, int nregs, regmatch_t *regs,
1263                    int *pidx, int node, re_node_set *eps_via_nodes,
1264                    struct re_fail_stack_t *fs)
1265 {
1266   const re_dfa_t *const dfa = mctx->dfa;
1267   int i, err;
1268   if (IS_EPSILON_NODE (dfa->nodes[node].type))
1269     {
1270       re_node_set *cur_nodes = &mctx->state_log[*pidx]->nodes;
1271       re_node_set *edests = &dfa->edests[node];
1272       int dest_node;
1273       err = re_node_set_insert (eps_via_nodes, node);
1274       if (BE (err < 0, 0))
1275         return -2;
1276       /* Pick up a valid destination, or return -1 if none is found.  */
1277       for (dest_node = -1, i = 0; i < edests->nelem; ++i)
1278         {
1279           int candidate = edests->elems[i];
1280           if (!re_node_set_contains (cur_nodes, candidate))
1281             continue;
1282           if (dest_node == -1)
1283             dest_node = candidate;
1284
1285           else
1286             {
1287               /* In order to avoid infinite loop like "(a*)*", return the second
1288                  epsilon-transition if the first was already considered.  */
1289               if (re_node_set_contains (eps_via_nodes, dest_node))
1290                 return candidate;
1291
1292               /* Otherwise, push the second epsilon-transition on the fail stack.  */
1293               else if (fs != NULL
1294                        && push_fail_stack (fs, *pidx, candidate, nregs, regs,
1295                                            eps_via_nodes))
1296                 return -2;
1297
1298               /* We know we are going to exit.  */
1299               break;
1300             }
1301         }
1302       return dest_node;
1303     }
1304   else
1305     {
1306       int naccepted = 0;
1307       re_token_type_t type = dfa->nodes[node].type;
1308
1309 #ifdef RE_ENABLE_I18N
1310       if (dfa->nodes[node].accept_mb)
1311         naccepted = check_node_accept_bytes (dfa, node, &mctx->input, *pidx);
1312       else
1313 #endif /* RE_ENABLE_I18N */
1314       if (type == OP_BACK_REF)
1315         {
1316           int subexp_idx = dfa->nodes[node].opr.idx + 1;
1317           naccepted = regs[subexp_idx].rm_eo - regs[subexp_idx].rm_so;
1318           if (fs != NULL)
1319             {
1320               if (regs[subexp_idx].rm_so == -1 || regs[subexp_idx].rm_eo == -1)
1321                 return -1;
1322               else if (naccepted)
1323                 {
1324                   char *buf = (char *) re_string_get_buffer (&mctx->input);
1325                   if (memcmp (buf + regs[subexp_idx].rm_so, buf + *pidx,
1326                               naccepted) != 0)
1327                     return -1;
1328                 }
1329             }
1330
1331           if (naccepted == 0)
1332             {
1333               int dest_node;
1334               err = re_node_set_insert (eps_via_nodes, node);
1335               if (BE (err < 0, 0))
1336                 return -2;
1337               dest_node = dfa->edests[node].elems[0];
1338               if (re_node_set_contains (&mctx->state_log[*pidx]->nodes,
1339                                         dest_node))
1340                 return dest_node;
1341             }
1342         }
1343
1344       if (naccepted != 0
1345           || check_node_accept (mctx, dfa->nodes + node, *pidx))
1346         {
1347           int dest_node = dfa->nexts[node];
1348           *pidx = (naccepted == 0) ? *pidx + 1 : *pidx + naccepted;
1349           if (fs && (*pidx > mctx->match_last || mctx->state_log[*pidx] == NULL
1350                      || !re_node_set_contains (&mctx->state_log[*pidx]->nodes,
1351                                                dest_node)))
1352             return -1;
1353           re_node_set_empty (eps_via_nodes);
1354           return dest_node;
1355         }
1356     }
1357   return -1;
1358 }
1359
1360 static reg_errcode_t
1361 internal_function
1362 push_fail_stack (struct re_fail_stack_t *fs, int str_idx, int dest_node,
1363                  int nregs, regmatch_t *regs, re_node_set *eps_via_nodes)
1364 {
1365   reg_errcode_t err;
1366   int num = fs->num++;
1367   if (fs->num == fs->alloc)
1368     {
1369       struct re_fail_stack_ent_t *new_array;
1370       new_array = realloc (fs->stack, (sizeof (struct re_fail_stack_ent_t)
1371                                        * fs->alloc * 2));
1372       if (new_array == NULL)
1373         return REG_ESPACE;
1374       fs->alloc *= 2;
1375       fs->stack = new_array;
1376     }
1377   fs->stack[num].idx = str_idx;
1378   fs->stack[num].node = dest_node;
1379   fs->stack[num].regs = re_malloc (regmatch_t, nregs);
1380   if (fs->stack[num].regs == NULL)
1381     return REG_ESPACE;
1382   memcpy (fs->stack[num].regs, regs, sizeof (regmatch_t) * nregs);
1383   err = re_node_set_init_copy (&fs->stack[num].eps_via_nodes, eps_via_nodes);
1384   return err;
1385 }
1386
1387 static int
1388 internal_function
1389 pop_fail_stack (struct re_fail_stack_t *fs, int *pidx, int nregs,
1390                 regmatch_t *regs, re_node_set *eps_via_nodes)
1391 {
1392   int num = --fs->num;
1393   assert (num >= 0);
1394   *pidx = fs->stack[num].idx;
1395   memcpy (regs, fs->stack[num].regs, sizeof (regmatch_t) * nregs);
1396   re_node_set_free (eps_via_nodes);
1397   re_free (fs->stack[num].regs);
1398   *eps_via_nodes = fs->stack[num].eps_via_nodes;
1399   return fs->stack[num].node;
1400 }
1401
1402 /* Set the positions where the subexpressions are starts/ends to registers
1403    PMATCH.
1404    Note: We assume that pmatch[0] is already set, and
1405    pmatch[i].rm_so == pmatch[i].rm_eo == -1 for 0 < i < nmatch.  */
1406
1407 static reg_errcode_t
1408 internal_function
1409 set_regs (const regex_t *preg, const re_match_context_t *mctx, size_t nmatch,
1410           regmatch_t *pmatch, int fl_backtrack)
1411 {
1412   const re_dfa_t *dfa = (const re_dfa_t *) preg->buffer;
1413   int idx, cur_node;
1414   re_node_set eps_via_nodes;
1415   struct re_fail_stack_t *fs;
1416   struct re_fail_stack_t fs_body = { 0, 2, NULL };
1417   regmatch_t *prev_idx_match;
1418   int prev_idx_match_malloced = 0;
1419
1420 #ifdef DEBUG
1421   assert (nmatch > 1);
1422   assert (mctx->state_log != NULL);
1423 #endif
1424   if (fl_backtrack)
1425     {
1426       fs = &fs_body;
1427       fs->stack = re_malloc (struct re_fail_stack_ent_t, fs->alloc);
1428       if (fs->stack == NULL)
1429         return REG_ESPACE;
1430     }
1431   else
1432     fs = NULL;
1433
1434   cur_node = dfa->init_node;
1435   re_node_set_init_empty (&eps_via_nodes);
1436
1437 #ifdef HAVE_ALLOCA
1438   if (__libc_use_alloca (nmatch * sizeof (regmatch_t)))
1439     prev_idx_match = (regmatch_t *) alloca (nmatch * sizeof (regmatch_t));
1440   else
1441 #endif
1442     {
1443       prev_idx_match = re_malloc (regmatch_t, nmatch);
1444       if (prev_idx_match == NULL)
1445         {
1446           free_fail_stack_return (fs);
1447           return REG_ESPACE;
1448         }
1449       prev_idx_match_malloced = 1;
1450     }
1451   memcpy (prev_idx_match, pmatch, sizeof (regmatch_t) * nmatch);
1452
1453   for (idx = pmatch[0].rm_so; idx <= pmatch[0].rm_eo ;)
1454     {
1455       update_regs (dfa, pmatch, prev_idx_match, cur_node, idx, nmatch);
1456
1457       if (idx == pmatch[0].rm_eo && cur_node == mctx->last_node)
1458         {
1459           int reg_idx;
1460           if (fs)
1461             {
1462               for (reg_idx = 0; reg_idx < nmatch; ++reg_idx)
1463                 if (pmatch[reg_idx].rm_so > -1 && pmatch[reg_idx].rm_eo == -1)
1464                   break;
1465               if (reg_idx == nmatch)
1466                 {
1467                   re_node_set_free (&eps_via_nodes);
1468                   if (prev_idx_match_malloced)
1469                     re_free (prev_idx_match);
1470                   return free_fail_stack_return (fs);
1471                 }
1472               cur_node = pop_fail_stack (fs, &idx, nmatch, pmatch,
1473                                          &eps_via_nodes);
1474             }
1475           else
1476             {
1477               re_node_set_free (&eps_via_nodes);
1478               if (prev_idx_match_malloced)
1479                 re_free (prev_idx_match);
1480               return REG_NOERROR;
1481             }
1482         }
1483
1484       /* Proceed to next node.  */
1485       cur_node = proceed_next_node (mctx, nmatch, pmatch, &idx, cur_node,
1486                                     &eps_via_nodes, fs);
1487
1488       if (BE (cur_node < 0, 0))
1489         {
1490           if (BE (cur_node == -2, 0))
1491             {
1492               re_node_set_free (&eps_via_nodes);
1493               if (prev_idx_match_malloced)
1494                 re_free (prev_idx_match);
1495               free_fail_stack_return (fs);
1496               return REG_ESPACE;
1497             }
1498           if (fs)
1499             cur_node = pop_fail_stack (fs, &idx, nmatch, pmatch,
1500                                        &eps_via_nodes);
1501           else
1502             {
1503               re_node_set_free (&eps_via_nodes);
1504               if (prev_idx_match_malloced)
1505                 re_free (prev_idx_match);
1506               return REG_NOMATCH;
1507             }
1508         }
1509     }
1510   re_node_set_free (&eps_via_nodes);
1511   if (prev_idx_match_malloced)
1512     re_free (prev_idx_match);
1513   return free_fail_stack_return (fs);
1514 }
1515
1516 static reg_errcode_t
1517 internal_function
1518 free_fail_stack_return (struct re_fail_stack_t *fs)
1519 {
1520   if (fs)
1521     {
1522       int fs_idx;
1523       for (fs_idx = 0; fs_idx < fs->num; ++fs_idx)
1524         {
1525           re_node_set_free (&fs->stack[fs_idx].eps_via_nodes);
1526           re_free (fs->stack[fs_idx].regs);
1527         }
1528       re_free (fs->stack);
1529     }
1530   return REG_NOERROR;
1531 }
1532
1533 static void
1534 internal_function
1535 update_regs (const re_dfa_t *dfa, regmatch_t *pmatch,
1536              regmatch_t *prev_idx_match, int cur_node, int cur_idx, int nmatch)
1537 {
1538   int type = dfa->nodes[cur_node].type;
1539   if (type == OP_OPEN_SUBEXP)
1540     {
1541       int reg_num = dfa->nodes[cur_node].opr.idx + 1;
1542
1543       /* We are at the first node of this sub expression.  */
1544       if (reg_num < nmatch)
1545         {
1546           pmatch[reg_num].rm_so = cur_idx;
1547           pmatch[reg_num].rm_eo = -1;
1548         }
1549     }
1550   else if (type == OP_CLOSE_SUBEXP)
1551     {
1552       int reg_num = dfa->nodes[cur_node].opr.idx + 1;
1553       if (reg_num < nmatch)
1554         {
1555           /* We are at the last node of this sub expression.  */
1556           if (pmatch[reg_num].rm_so < cur_idx)
1557             {
1558               pmatch[reg_num].rm_eo = cur_idx;
1559               /* This is a non-empty match or we are not inside an optional
1560                  subexpression.  Accept this right away.  */
1561               memcpy (prev_idx_match, pmatch, sizeof (regmatch_t) * nmatch);
1562             }
1563           else
1564             {
1565               if (dfa->nodes[cur_node].opt_subexp
1566                   && prev_idx_match[reg_num].rm_so != -1)
1567                 /* We transited through an empty match for an optional
1568                    subexpression, like (a?)*, and this is not the subexp's
1569                    first match.  Copy back the old content of the registers
1570                    so that matches of an inner subexpression are undone as
1571                    well, like in ((a?))*.  */
1572                 memcpy (pmatch, prev_idx_match, sizeof (regmatch_t) * nmatch);
1573               else
1574                 /* We completed a subexpression, but it may be part of
1575                    an optional one, so do not update PREV_IDX_MATCH.  */
1576                 pmatch[reg_num].rm_eo = cur_idx;
1577             }
1578         }
1579     }
1580 }
1581
1582 /* This function checks the STATE_LOG from the SCTX->last_str_idx to 0
1583    and sift the nodes in each states according to the following rules.
1584    Updated state_log will be wrote to STATE_LOG.
1585
1586    Rules: We throw away the Node `a' in the STATE_LOG[STR_IDX] if...
1587      1. When STR_IDX == MATCH_LAST(the last index in the state_log):
1588         If `a' isn't the LAST_NODE and `a' can't epsilon transit to
1589         the LAST_NODE, we throw away the node `a'.
1590      2. When 0 <= STR_IDX < MATCH_LAST and `a' accepts
1591         string `s' and transit to `b':
1592         i. If 'b' isn't in the STATE_LOG[STR_IDX+strlen('s')], we throw
1593            away the node `a'.
1594         ii. If 'b' is in the STATE_LOG[STR_IDX+strlen('s')] but 'b' is
1595             thrown away, we throw away the node `a'.
1596      3. When 0 <= STR_IDX < MATCH_LAST and 'a' epsilon transit to 'b':
1597         i. If 'b' isn't in the STATE_LOG[STR_IDX], we throw away the
1598            node `a'.
1599         ii. If 'b' is in the STATE_LOG[STR_IDX] but 'b' is thrown away,
1600             we throw away the node `a'.  */
1601
1602 #define STATE_NODE_CONTAINS(state,node) \
1603   ((state) != NULL && re_node_set_contains (&(state)->nodes, node))
1604
1605 static reg_errcode_t
1606 internal_function
1607 sift_states_backward (const re_match_context_t *mctx, re_sift_context_t *sctx)
1608 {
1609   reg_errcode_t err;
1610   int null_cnt = 0;
1611   int str_idx = sctx->last_str_idx;
1612   re_node_set cur_dest;
1613
1614 #ifdef DEBUG
1615   assert (mctx->state_log != NULL && mctx->state_log[str_idx] != NULL);
1616 #endif
1617
1618   /* Build sifted state_log[str_idx].  It has the nodes which can epsilon
1619      transit to the last_node and the last_node itself.  */
1620   err = re_node_set_init_1 (&cur_dest, sctx->last_node);
1621   if (BE (err != REG_NOERROR, 0))
1622     return err;
1623   err = update_cur_sifted_state (mctx, sctx, str_idx, &cur_dest);
1624   if (BE (err != REG_NOERROR, 0))
1625     goto free_return;
1626
1627   /* Then check each states in the state_log.  */
1628   while (str_idx > 0)
1629     {
1630       /* Update counters.  */
1631       null_cnt = (sctx->sifted_states[str_idx] == NULL) ? null_cnt + 1 : 0;
1632       if (null_cnt > mctx->max_mb_elem_len)
1633         {
1634           memset (sctx->sifted_states, '\0',
1635                   sizeof (re_dfastate_t *) * str_idx);
1636           re_node_set_free (&cur_dest);
1637           return REG_NOERROR;
1638         }
1639       re_node_set_empty (&cur_dest);
1640       --str_idx;
1641
1642       if (mctx->state_log[str_idx])
1643         {
1644           err = build_sifted_states (mctx, sctx, str_idx, &cur_dest);
1645           if (BE (err != REG_NOERROR, 0))
1646             goto free_return;
1647         }
1648
1649       /* Add all the nodes which satisfy the following conditions:
1650          - It can epsilon transit to a node in CUR_DEST.
1651          - It is in CUR_SRC.
1652          And update state_log.  */
1653       err = update_cur_sifted_state (mctx, sctx, str_idx, &cur_dest);
1654       if (BE (err != REG_NOERROR, 0))
1655         goto free_return;
1656     }
1657   err = REG_NOERROR;
1658  free_return:
1659   re_node_set_free (&cur_dest);
1660   return err;
1661 }
1662
1663 static reg_errcode_t
1664 internal_function
1665 build_sifted_states (const re_match_context_t *mctx, re_sift_context_t *sctx,
1666                      int str_idx, re_node_set *cur_dest)
1667 {
1668   const re_dfa_t *const dfa = mctx->dfa;
1669   const re_node_set *cur_src = &mctx->state_log[str_idx]->non_eps_nodes;
1670   int i;
1671
1672   /* Then build the next sifted state.
1673      We build the next sifted state on `cur_dest', and update
1674      `sifted_states[str_idx]' with `cur_dest'.
1675      Note:
1676      `cur_dest' is the sifted state from `state_log[str_idx + 1]'.
1677      `cur_src' points the node_set of the old `state_log[str_idx]'
1678      (with the epsilon nodes pre-filtered out).  */
1679   for (i = 0; i < cur_src->nelem; i++)
1680     {
1681       int prev_node = cur_src->elems[i];
1682       int naccepted = 0;
1683       int ret;
1684
1685 #ifdef DEBUG
1686       re_token_type_t type = dfa->nodes[prev_node].type;
1687       assert (!IS_EPSILON_NODE (type));
1688 #endif
1689 #ifdef RE_ENABLE_I18N
1690       /* If the node may accept `multi byte'.  */
1691       if (dfa->nodes[prev_node].accept_mb)
1692         naccepted = sift_states_iter_mb (mctx, sctx, prev_node,
1693                                          str_idx, sctx->last_str_idx);
1694 #endif /* RE_ENABLE_I18N */
1695
1696       /* We don't check backreferences here.
1697          See update_cur_sifted_state().  */
1698       if (!naccepted
1699           && check_node_accept (mctx, dfa->nodes + prev_node, str_idx)
1700           && STATE_NODE_CONTAINS (sctx->sifted_states[str_idx + 1],
1701                                   dfa->nexts[prev_node]))
1702         naccepted = 1;
1703
1704       if (naccepted == 0)
1705         continue;
1706
1707       if (sctx->limits.nelem)
1708         {
1709           int to_idx = str_idx + naccepted;
1710           if (check_dst_limits (mctx, &sctx->limits,
1711                                 dfa->nexts[prev_node], to_idx,
1712                                 prev_node, str_idx))
1713             continue;
1714         }
1715       ret = re_node_set_insert (cur_dest, prev_node);
1716       if (BE (ret == -1, 0))
1717         return REG_ESPACE;
1718     }
1719
1720   return REG_NOERROR;
1721 }
1722
1723 /* Helper functions.  */
1724
1725 static reg_errcode_t
1726 internal_function
1727 clean_state_log_if_needed (re_match_context_t *mctx, int next_state_log_idx)
1728 {
1729   int top = mctx->state_log_top;
1730
1731   if (next_state_log_idx >= mctx->input.bufs_len
1732       || (next_state_log_idx >= mctx->input.valid_len
1733           && mctx->input.valid_len < mctx->input.len))
1734     {
1735       reg_errcode_t err;
1736       err = extend_buffers (mctx);
1737       if (BE (err != REG_NOERROR, 0))
1738         return err;
1739     }
1740
1741   if (top < next_state_log_idx)
1742     {
1743       memset (mctx->state_log + top + 1, '\0',
1744               sizeof (re_dfastate_t *) * (next_state_log_idx - top));
1745       mctx->state_log_top = next_state_log_idx;
1746     }
1747   return REG_NOERROR;
1748 }
1749
1750 static reg_errcode_t
1751 internal_function
1752 merge_state_array (const re_dfa_t *dfa, re_dfastate_t **dst,
1753                    re_dfastate_t **src, int num)
1754 {
1755   int st_idx;
1756   reg_errcode_t err;
1757   for (st_idx = 0; st_idx < num; ++st_idx)
1758     {
1759       if (dst[st_idx] == NULL)
1760         dst[st_idx] = src[st_idx];
1761       else if (src[st_idx] != NULL)
1762         {
1763           re_node_set merged_set;
1764           err = re_node_set_init_union (&merged_set, &dst[st_idx]->nodes,
1765                                         &src[st_idx]->nodes);
1766           if (BE (err != REG_NOERROR, 0))
1767             return err;
1768           dst[st_idx] = re_acquire_state (&err, dfa, &merged_set);
1769           re_node_set_free (&merged_set);
1770           if (BE (err != REG_NOERROR, 0))
1771             return err;
1772         }
1773     }
1774   return REG_NOERROR;
1775 }
1776
1777 static reg_errcode_t
1778 internal_function
1779 update_cur_sifted_state (const re_match_context_t *mctx,
1780                          re_sift_context_t *sctx, int str_idx,
1781                          re_node_set *dest_nodes)
1782 {
1783   const re_dfa_t *const dfa = mctx->dfa;
1784   reg_errcode_t err = REG_NOERROR;
1785   const re_node_set *candidates;
1786   candidates = ((mctx->state_log[str_idx] == NULL) ? NULL
1787                 : &mctx->state_log[str_idx]->nodes);
1788
1789   if (dest_nodes->nelem == 0)
1790     sctx->sifted_states[str_idx] = NULL;
1791   else
1792     {
1793       if (candidates)
1794         {
1795           /* At first, add the nodes which can epsilon transit to a node in
1796              DEST_NODE.  */
1797           err = add_epsilon_src_nodes (dfa, dest_nodes, candidates);
1798           if (BE (err != REG_NOERROR, 0))
1799             return err;
1800
1801           /* Then, check the limitations in the current sift_context.  */
1802           if (sctx->limits.nelem)
1803             {
1804               err = check_subexp_limits (dfa, dest_nodes, candidates, &sctx->limits,
1805                                          mctx->bkref_ents, str_idx);
1806               if (BE (err != REG_NOERROR, 0))
1807                 return err;
1808             }
1809         }
1810
1811       sctx->sifted_states[str_idx] = re_acquire_state (&err, dfa, dest_nodes);
1812       if (BE (err != REG_NOERROR, 0))
1813         return err;
1814     }
1815
1816   if (candidates && mctx->state_log[str_idx]->has_backref)
1817     {
1818       err = sift_states_bkref (mctx, sctx, str_idx, candidates);
1819       if (BE (err != REG_NOERROR, 0))
1820         return err;
1821     }
1822   return REG_NOERROR;
1823 }
1824
1825 static reg_errcode_t
1826 internal_function
1827 add_epsilon_src_nodes (const re_dfa_t *dfa, re_node_set *dest_nodes,
1828                        const re_node_set *candidates)
1829 {
1830   reg_errcode_t err = REG_NOERROR;
1831   int i;
1832
1833   re_dfastate_t *state = re_acquire_state (&err, dfa, dest_nodes);
1834   if (BE (err != REG_NOERROR, 0))
1835     return err;
1836
1837   if (!state->inveclosure.alloc)
1838     {
1839       err = re_node_set_alloc (&state->inveclosure, dest_nodes->nelem);
1840       if (BE (err != REG_NOERROR, 0))
1841         return REG_ESPACE;
1842       for (i = 0; i < dest_nodes->nelem; i++)
1843         {
1844           err = re_node_set_merge (&state->inveclosure,
1845                                    dfa->inveclosures + dest_nodes->elems[i]);
1846           if (BE (err != REG_NOERROR, 0))
1847             return REG_ESPACE;
1848         }
1849     }
1850   return re_node_set_add_intersect (dest_nodes, candidates,
1851                                     &state->inveclosure);
1852 }
1853
1854 static reg_errcode_t
1855 internal_function
1856 sub_epsilon_src_nodes (const re_dfa_t *dfa, int node, re_node_set *dest_nodes,
1857                        const re_node_set *candidates)
1858 {
1859     int ecl_idx;
1860     reg_errcode_t err;
1861     re_node_set *inv_eclosure = dfa->inveclosures + node;
1862     re_node_set except_nodes;
1863     re_node_set_init_empty (&except_nodes);
1864     for (ecl_idx = 0; ecl_idx < inv_eclosure->nelem; ++ecl_idx)
1865       {
1866         int cur_node = inv_eclosure->elems[ecl_idx];
1867         if (cur_node == node)
1868           continue;
1869         if (IS_EPSILON_NODE (dfa->nodes[cur_node].type))
1870           {
1871             int edst1 = dfa->edests[cur_node].elems[0];
1872             int edst2 = ((dfa->edests[cur_node].nelem > 1)
1873                          ? dfa->edests[cur_node].elems[1] : -1);
1874             if ((!re_node_set_contains (inv_eclosure, edst1)
1875                  && re_node_set_contains (dest_nodes, edst1))
1876                 || (edst2 > 0
1877                     && !re_node_set_contains (inv_eclosure, edst2)
1878                     && re_node_set_contains (dest_nodes, edst2)))
1879               {
1880                 err = re_node_set_add_intersect (&except_nodes, candidates,
1881                                                  dfa->inveclosures + cur_node);
1882                 if (BE (err != REG_NOERROR, 0))
1883                   {
1884                     re_node_set_free (&except_nodes);
1885                     return err;
1886                   }
1887               }
1888           }
1889       }
1890     for (ecl_idx = 0; ecl_idx < inv_eclosure->nelem; ++ecl_idx)
1891       {
1892         int cur_node = inv_eclosure->elems[ecl_idx];
1893         if (!re_node_set_contains (&except_nodes, cur_node))
1894           {
1895             int idx = re_node_set_contains (dest_nodes, cur_node) - 1;
1896             re_node_set_remove_at (dest_nodes, idx);
1897           }
1898       }
1899     re_node_set_free (&except_nodes);
1900     return REG_NOERROR;
1901 }
1902
1903 static int
1904 internal_function
1905 check_dst_limits (const re_match_context_t *mctx, re_node_set *limits,
1906                   int dst_node, int dst_idx, int src_node, int src_idx)
1907 {
1908   const re_dfa_t *const dfa = mctx->dfa;
1909   int lim_idx, src_pos, dst_pos;
1910
1911   int dst_bkref_idx = search_cur_bkref_entry (mctx, dst_idx);
1912   int src_bkref_idx = search_cur_bkref_entry (mctx, src_idx);
1913   for (lim_idx = 0; lim_idx < limits->nelem; ++lim_idx)
1914     {
1915       int subexp_idx;
1916       struct re_backref_cache_entry *ent;
1917       ent = mctx->bkref_ents + limits->elems[lim_idx];
1918       subexp_idx = dfa->nodes[ent->node].opr.idx;
1919
1920       dst_pos = check_dst_limits_calc_pos (mctx, limits->elems[lim_idx],
1921                                            subexp_idx, dst_node, dst_idx,
1922                                            dst_bkref_idx);
1923       src_pos = check_dst_limits_calc_pos (mctx, limits->elems[lim_idx],
1924                                            subexp_idx, src_node, src_idx,
1925                                            src_bkref_idx);
1926
1927       /* In case of:
1928          <src> <dst> ( <subexp> )
1929          ( <subexp> ) <src> <dst>
1930          ( <subexp1> <src> <subexp2> <dst> <subexp3> )  */
1931       if (src_pos == dst_pos)
1932         continue; /* This is unrelated limitation.  */
1933       else
1934         return 1;
1935     }
1936   return 0;
1937 }
1938
1939 static int
1940 internal_function
1941 check_dst_limits_calc_pos_1 (const re_match_context_t *mctx, int boundaries,
1942                              int subexp_idx, int from_node, int bkref_idx)
1943 {
1944   const re_dfa_t *const dfa = mctx->dfa;
1945   const re_node_set *eclosures = dfa->eclosures + from_node;
1946   int node_idx;
1947
1948   /* Else, we are on the boundary: examine the nodes on the epsilon
1949      closure.  */
1950   for (node_idx = 0; node_idx < eclosures->nelem; ++node_idx)
1951     {
1952       int node = eclosures->elems[node_idx];
1953       switch (dfa->nodes[node].type)
1954         {
1955         case OP_BACK_REF:
1956           if (bkref_idx != -1)
1957             {
1958               struct re_backref_cache_entry *ent = mctx->bkref_ents + bkref_idx;
1959               do
1960                 {
1961                   int dst, cpos;
1962
1963                   if (ent->node != node)
1964                     continue;
1965
1966                   if (subexp_idx < BITSET_WORD_BITS
1967                       && !(ent->eps_reachable_subexps_map
1968                            & ((bitset_word_t) 1 << subexp_idx)))
1969                     continue;
1970
1971                   /* Recurse trying to reach the OP_OPEN_SUBEXP and
1972                      OP_CLOSE_SUBEXP cases below.  But, if the
1973                      destination node is the same node as the source
1974                      node, don't recurse because it would cause an
1975                      infinite loop: a regex that exhibits this behavior
1976                      is ()\1*\1*  */
1977                   dst = dfa->edests[node].elems[0];
1978                   if (dst == from_node)
1979                     {
1980                       if (boundaries & 1)
1981                         return -1;
1982                       else /* if (boundaries & 2) */
1983                         return 0;
1984                     }
1985
1986                   cpos =
1987                     check_dst_limits_calc_pos_1 (mctx, boundaries, subexp_idx,
1988                                                  dst, bkref_idx);
1989                   if (cpos == -1 /* && (boundaries & 1) */)
1990                     return -1;
1991                   if (cpos == 0 && (boundaries & 2))
1992                     return 0;
1993
1994                   if (subexp_idx < BITSET_WORD_BITS)
1995                     ent->eps_reachable_subexps_map
1996                       &= ~((bitset_word_t) 1 << subexp_idx);
1997                 }
1998               while (ent++->more);
1999             }
2000           break;
2001
2002         case OP_OPEN_SUBEXP:
2003           if ((boundaries & 1) && subexp_idx == dfa->nodes[node].opr.idx)
2004             return -1;
2005           break;
2006
2007         case OP_CLOSE_SUBEXP:
2008           if ((boundaries & 2) && subexp_idx == dfa->nodes[node].opr.idx)
2009             return 0;
2010           break;
2011
2012         default:
2013             break;
2014         }
2015     }
2016
2017   return (boundaries & 2) ? 1 : 0;
2018 }
2019
2020 static int
2021 internal_function
2022 check_dst_limits_calc_pos (const re_match_context_t *mctx, int limit,
2023                            int subexp_idx, int from_node, int str_idx,
2024                            int bkref_idx)
2025 {
2026   struct re_backref_cache_entry *lim = mctx->bkref_ents + limit;
2027   int boundaries;
2028
2029   /* If we are outside the range of the subexpression, return -1 or 1.  */
2030   if (str_idx < lim->subexp_from)
2031     return -1;
2032
2033   if (lim->subexp_to < str_idx)
2034     return 1;
2035
2036   /* If we are within the subexpression, return 0.  */
2037   boundaries = (str_idx == lim->subexp_from);
2038   boundaries |= (str_idx == lim->subexp_to) << 1;
2039   if (boundaries == 0)
2040     return 0;
2041
2042   /* Else, examine epsilon closure.  */
2043   return check_dst_limits_calc_pos_1 (mctx, boundaries, subexp_idx,
2044                                       from_node, bkref_idx);
2045 }
2046
2047 /* Check the limitations of sub expressions LIMITS, and remove the nodes
2048    which are against limitations from DEST_NODES. */
2049
2050 static reg_errcode_t
2051 internal_function
2052 check_subexp_limits (const re_dfa_t *dfa, re_node_set *dest_nodes,
2053                      const re_node_set *candidates, re_node_set *limits,
2054                      struct re_backref_cache_entry *bkref_ents, int str_idx)
2055 {
2056   reg_errcode_t err;
2057   int node_idx, lim_idx;
2058
2059   for (lim_idx = 0; lim_idx < limits->nelem; ++lim_idx)
2060     {
2061       int subexp_idx;
2062       struct re_backref_cache_entry *ent;
2063       ent = bkref_ents + limits->elems[lim_idx];
2064
2065       if (str_idx <= ent->subexp_from || ent->str_idx < str_idx)
2066         continue; /* This is unrelated limitation.  */
2067
2068       subexp_idx = dfa->nodes[ent->node].opr.idx;
2069       if (ent->subexp_to == str_idx)
2070         {
2071           int ops_node = -1;
2072           int cls_node = -1;
2073           for (node_idx = 0; node_idx < dest_nodes->nelem; ++node_idx)
2074             {
2075               int node = dest_nodes->elems[node_idx];
2076               re_token_type_t type = dfa->nodes[node].type;
2077               if (type == OP_OPEN_SUBEXP
2078                   && subexp_idx == dfa->nodes[node].opr.idx)
2079                 ops_node = node;
2080               else if (type == OP_CLOSE_SUBEXP
2081                        && subexp_idx == dfa->nodes[node].opr.idx)
2082                 cls_node = node;
2083             }
2084
2085           /* Check the limitation of the open subexpression.  */
2086           /* Note that (ent->subexp_to = str_idx != ent->subexp_from).  */
2087           if (ops_node >= 0)
2088             {
2089               err = sub_epsilon_src_nodes (dfa, ops_node, dest_nodes,
2090                                            candidates);
2091               if (BE (err != REG_NOERROR, 0))
2092                 return err;
2093             }
2094
2095           /* Check the limitation of the close subexpression.  */
2096           if (cls_node >= 0)
2097             for (node_idx = 0; node_idx < dest_nodes->nelem; ++node_idx)
2098               {
2099                 int node = dest_nodes->elems[node_idx];
2100                 if (!re_node_set_contains (dfa->inveclosures + node,
2101                                            cls_node)
2102                     && !re_node_set_contains (dfa->eclosures + node,
2103                                               cls_node))
2104                   {
2105                     /* It is against this limitation.
2106                        Remove it form the current sifted state.  */
2107                     err = sub_epsilon_src_nodes (dfa, node, dest_nodes,
2108                                                  candidates);
2109                     if (BE (err != REG_NOERROR, 0))
2110                       return err;
2111                     --node_idx;
2112                   }
2113               }
2114         }
2115       else /* (ent->subexp_to != str_idx)  */
2116         {
2117           for (node_idx = 0; node_idx < dest_nodes->nelem; ++node_idx)
2118             {
2119               int node = dest_nodes->elems[node_idx];
2120               re_token_type_t type = dfa->nodes[node].type;
2121               if (type == OP_CLOSE_SUBEXP || type == OP_OPEN_SUBEXP)
2122                 {
2123                   if (subexp_idx != dfa->nodes[node].opr.idx)
2124                     continue;
2125                   /* It is against this limitation.
2126                      Remove it form the current sifted state.  */
2127                   err = sub_epsilon_src_nodes (dfa, node, dest_nodes,
2128                                                candidates);
2129                   if (BE (err != REG_NOERROR, 0))
2130                     return err;
2131                 }
2132             }
2133         }
2134     }
2135   return REG_NOERROR;
2136 }
2137
2138 static reg_errcode_t
2139 internal_function
2140 sift_states_bkref (const re_match_context_t *mctx, re_sift_context_t *sctx,
2141                    int str_idx, const re_node_set *candidates)
2142 {
2143   const re_dfa_t *const dfa = mctx->dfa;
2144   reg_errcode_t err;
2145   int node_idx, node;
2146   re_sift_context_t local_sctx;
2147   int first_idx = search_cur_bkref_entry (mctx, str_idx);
2148
2149   if (first_idx == -1)
2150     return REG_NOERROR;
2151
2152   local_sctx.sifted_states = NULL; /* Mark that it hasn't been initialized.  */
2153
2154   for (node_idx = 0; node_idx < candidates->nelem; ++node_idx)
2155     {
2156       int enabled_idx;
2157       re_token_type_t type;
2158       struct re_backref_cache_entry *entry;
2159       node = candidates->elems[node_idx];
2160       type = dfa->nodes[node].type;
2161       /* Avoid infinite loop for the REs like "()\1+".  */
2162       if (node == sctx->last_node && str_idx == sctx->last_str_idx)
2163         continue;
2164       if (type != OP_BACK_REF)
2165         continue;
2166
2167       entry = mctx->bkref_ents + first_idx;
2168       enabled_idx = first_idx;
2169       do
2170         {
2171           int subexp_len;
2172           int to_idx;
2173           int dst_node;
2174           int ret;
2175           re_dfastate_t *cur_state;
2176
2177           if (entry->node != node)
2178             continue;
2179           subexp_len = entry->subexp_to - entry->subexp_from;
2180           to_idx = str_idx + subexp_len;
2181           dst_node = (subexp_len ? dfa->nexts[node]
2182                       : dfa->edests[node].elems[0]);
2183
2184           if (to_idx > sctx->last_str_idx
2185               || sctx->sifted_states[to_idx] == NULL
2186               || !STATE_NODE_CONTAINS (sctx->sifted_states[to_idx], dst_node)
2187               || check_dst_limits (mctx, &sctx->limits, node,
2188                                    str_idx, dst_node, to_idx))
2189             continue;
2190
2191           if (local_sctx.sifted_states == NULL)
2192             {
2193               local_sctx = *sctx;
2194               err = re_node_set_init_copy (&local_sctx.limits, &sctx->limits);
2195               if (BE (err != REG_NOERROR, 0))
2196                 goto free_return;
2197             }
2198           local_sctx.last_node = node;
2199           local_sctx.last_str_idx = str_idx;
2200           ret = re_node_set_insert (&local_sctx.limits, enabled_idx);
2201           if (BE (ret < 0, 0))
2202             {
2203               err = REG_ESPACE;
2204               goto free_return;
2205             }
2206           cur_state = local_sctx.sifted_states[str_idx];
2207           err = sift_states_backward (mctx, &local_sctx);
2208           if (BE (err != REG_NOERROR, 0))
2209             goto free_return;
2210           if (sctx->limited_states != NULL)
2211             {
2212               err = merge_state_array (dfa, sctx->limited_states,
2213                                        local_sctx.sifted_states,
2214                                        str_idx + 1);
2215               if (BE (err != REG_NOERROR, 0))
2216                 goto free_return;
2217             }
2218           local_sctx.sifted_states[str_idx] = cur_state;
2219           re_node_set_remove (&local_sctx.limits, enabled_idx);
2220
2221           /* mctx->bkref_ents may have changed, reload the pointer.  */
2222           entry = mctx->bkref_ents + enabled_idx;
2223         }
2224       while (enabled_idx++, entry++->more);
2225     }
2226   err = REG_NOERROR;
2227  free_return:
2228   if (local_sctx.sifted_states != NULL)
2229     {
2230       re_node_set_free (&local_sctx.limits);
2231     }
2232
2233   return err;
2234 }
2235
2236
2237 #ifdef RE_ENABLE_I18N
2238 static int
2239 internal_function
2240 sift_states_iter_mb (const re_match_context_t *mctx, re_sift_context_t *sctx,
2241                      int node_idx, int str_idx, int max_str_idx)
2242 {
2243   const re_dfa_t *const dfa = mctx->dfa;
2244   int naccepted;
2245   /* Check the node can accept `multi byte'.  */
2246   naccepted = check_node_accept_bytes (dfa, node_idx, &mctx->input, str_idx);
2247   if (naccepted > 0 && str_idx + naccepted <= max_str_idx &&
2248       !STATE_NODE_CONTAINS (sctx->sifted_states[str_idx + naccepted],
2249                             dfa->nexts[node_idx]))
2250     /* The node can't accept the `multi byte', or the
2251        destination was already thrown away, then the node
2252        could't accept the current input `multi byte'.   */
2253     naccepted = 0;
2254   /* Otherwise, it is sure that the node could accept
2255      `naccepted' bytes input.  */
2256   return naccepted;
2257 }
2258 #endif /* RE_ENABLE_I18N */
2259
2260 \f
2261 /* Functions for state transition.  */
2262
2263 /* Return the next state to which the current state STATE will transit by
2264    accepting the current input byte, and update STATE_LOG if necessary.
2265    If STATE can accept a multibyte char/collating element/back reference
2266    update the destination of STATE_LOG.  */
2267
2268 static re_dfastate_t *
2269 internal_function
2270 transit_state (reg_errcode_t *err, re_match_context_t *mctx,
2271                re_dfastate_t *state)
2272 {
2273   re_dfastate_t **trtable;
2274   unsigned char ch;
2275
2276 #ifdef RE_ENABLE_I18N
2277   /* If the current state can accept multibyte.  */
2278   if (BE (state->accept_mb, 0))
2279     {
2280       *err = transit_state_mb (mctx, state);
2281       if (BE (*err != REG_NOERROR, 0))
2282         return NULL;
2283     }
2284 #endif /* RE_ENABLE_I18N */
2285
2286   /* Then decide the next state with the single byte.  */
2287 #if 0
2288   if (0)
2289     /* don't use transition table  */
2290     return transit_state_sb (err, mctx, state);
2291 #endif
2292
2293   /* Use transition table  */
2294   ch = re_string_fetch_byte (&mctx->input);
2295   for (;;)
2296     {
2297       trtable = state->trtable;
2298       if (BE (trtable != NULL, 1))
2299         return trtable[ch];
2300
2301       trtable = state->word_trtable;
2302       if (BE (trtable != NULL, 1))
2303         {
2304           unsigned int context;
2305           context
2306             = re_string_context_at (&mctx->input,
2307                                     re_string_cur_idx (&mctx->input) - 1,
2308                                     mctx->eflags);
2309           if (IS_WORD_CONTEXT (context))
2310             return trtable[ch + SBC_MAX];
2311           else
2312             return trtable[ch];
2313         }
2314
2315       if (!build_trtable (mctx->dfa, state))
2316         {
2317           *err = REG_ESPACE;
2318           return NULL;
2319         }
2320
2321       /* Retry, we now have a transition table.  */
2322     }
2323 }
2324
2325 /* Update the state_log if we need */
2326 re_dfastate_t *
2327 internal_function
2328 merge_state_with_log (reg_errcode_t *err, re_match_context_t *mctx,
2329                       re_dfastate_t *next_state)
2330 {
2331   const re_dfa_t *const dfa = mctx->dfa;
2332   int cur_idx = re_string_cur_idx (&mctx->input);
2333
2334   if (cur_idx > mctx->state_log_top)
2335     {
2336       mctx->state_log[cur_idx] = next_state;
2337       mctx->state_log_top = cur_idx;
2338     }
2339   else if (mctx->state_log[cur_idx] == 0)
2340     {
2341       mctx->state_log[cur_idx] = next_state;
2342     }
2343   else
2344     {
2345       re_dfastate_t *pstate;
2346       unsigned int context;
2347       re_node_set next_nodes, *log_nodes, *table_nodes = NULL;
2348       /* If (state_log[cur_idx] != 0), it implies that cur_idx is
2349          the destination of a multibyte char/collating element/
2350          back reference.  Then the next state is the union set of
2351          these destinations and the results of the transition table.  */
2352       pstate = mctx->state_log[cur_idx];
2353       log_nodes = pstate->entrance_nodes;
2354       if (next_state != NULL)
2355         {
2356           table_nodes = next_state->entrance_nodes;
2357           *err = re_node_set_init_union (&next_nodes, table_nodes,
2358                                              log_nodes);
2359           if (BE (*err != REG_NOERROR, 0))
2360             return NULL;
2361         }
2362       else
2363         next_nodes = *log_nodes;
2364       /* Note: We already add the nodes of the initial state,
2365          then we don't need to add them here.  */
2366
2367       context = re_string_context_at (&mctx->input,
2368                                       re_string_cur_idx (&mctx->input) - 1,
2369                                       mctx->eflags);
2370       next_state = mctx->state_log[cur_idx]
2371         = re_acquire_state_context (err, dfa, &next_nodes, context);
2372       /* We don't need to check errors here, since the return value of
2373          this function is next_state and ERR is already set.  */
2374
2375       if (table_nodes != NULL)
2376         re_node_set_free (&next_nodes);
2377     }
2378
2379   if (BE (dfa->nbackref, 0) && next_state != NULL)
2380     {
2381       /* Check OP_OPEN_SUBEXP in the current state in case that we use them
2382          later.  We must check them here, since the back references in the
2383          next state might use them.  */
2384       *err = check_subexp_matching_top (mctx, &next_state->nodes,
2385                                         cur_idx);
2386       if (BE (*err != REG_NOERROR, 0))
2387         return NULL;
2388
2389       /* If the next state has back references.  */
2390       if (next_state->has_backref)
2391         {
2392           *err = transit_state_bkref (mctx, &next_state->nodes);
2393           if (BE (*err != REG_NOERROR, 0))
2394             return NULL;
2395           next_state = mctx->state_log[cur_idx];
2396         }
2397     }
2398
2399   return next_state;
2400 }
2401
2402 /* Skip bytes in the input that correspond to part of a
2403    multi-byte match, then look in the log for a state
2404    from which to restart matching.  */
2405 re_dfastate_t *
2406 internal_function
2407 find_recover_state (reg_errcode_t *err, re_match_context_t *mctx)
2408 {
2409   re_dfastate_t *cur_state;
2410   do
2411     {
2412       int max = mctx->state_log_top;
2413       int cur_str_idx = re_string_cur_idx (&mctx->input);
2414
2415       do
2416         {
2417           if (++cur_str_idx > max)
2418             return NULL;
2419           re_string_skip_bytes (&mctx->input, 1);
2420         }
2421       while (mctx->state_log[cur_str_idx] == NULL);
2422
2423       cur_state = merge_state_with_log (err, mctx, NULL);
2424     }
2425   while (*err == REG_NOERROR && cur_state == NULL);
2426   return cur_state;
2427 }
2428
2429 /* Helper functions for transit_state.  */
2430
2431 /* From the node set CUR_NODES, pick up the nodes whose types are
2432    OP_OPEN_SUBEXP and which have corresponding back references in the regular
2433    expression. And register them to use them later for evaluating the
2434    correspoding back references.  */
2435
2436 static reg_errcode_t
2437 internal_function
2438 check_subexp_matching_top (re_match_context_t *mctx, re_node_set *cur_nodes,
2439                            int str_idx)
2440 {
2441   const re_dfa_t *const dfa = mctx->dfa;
2442   int node_idx;
2443   reg_errcode_t err;
2444
2445   /* TODO: This isn't efficient.
2446            Because there might be more than one nodes whose types are
2447            OP_OPEN_SUBEXP and whose index is SUBEXP_IDX, we must check all
2448            nodes.
2449            E.g. RE: (a){2}  */
2450   for (node_idx = 0; node_idx < cur_nodes->nelem; ++node_idx)
2451     {
2452       int node = cur_nodes->elems[node_idx];
2453       if (dfa->nodes[node].type == OP_OPEN_SUBEXP
2454           && dfa->nodes[node].opr.idx < BITSET_WORD_BITS
2455           && (dfa->used_bkref_map
2456               & ((bitset_word_t) 1 << dfa->nodes[node].opr.idx)))
2457         {
2458           err = match_ctx_add_subtop (mctx, node, str_idx);
2459           if (BE (err != REG_NOERROR, 0))
2460             return err;
2461         }
2462     }
2463   return REG_NOERROR;
2464 }
2465
2466 #if 0
2467 /* Return the next state to which the current state STATE will transit by
2468    accepting the current input byte.  */
2469
2470 static re_dfastate_t *
2471 transit_state_sb (reg_errcode_t *err, re_match_context_t *mctx,
2472                   re_dfastate_t *state)
2473 {
2474   const re_dfa_t *const dfa = mctx->dfa;
2475   re_node_set next_nodes;
2476   re_dfastate_t *next_state;
2477   int node_cnt, cur_str_idx = re_string_cur_idx (&mctx->input);
2478   unsigned int context;
2479
2480   *err = re_node_set_alloc (&next_nodes, state->nodes.nelem + 1);
2481   if (BE (*err != REG_NOERROR, 0))
2482     return NULL;
2483   for (node_cnt = 0; node_cnt < state->nodes.nelem; ++node_cnt)
2484     {
2485       int cur_node = state->nodes.elems[node_cnt];
2486       if (check_node_accept (mctx, dfa->nodes + cur_node, cur_str_idx))
2487         {
2488           *err = re_node_set_merge (&next_nodes,
2489                                     dfa->eclosures + dfa->nexts[cur_node]);
2490           if (BE (*err != REG_NOERROR, 0))
2491             {
2492               re_node_set_free (&next_nodes);
2493               return NULL;
2494             }
2495         }
2496     }
2497   context = re_string_context_at (&mctx->input, cur_str_idx, mctx->eflags);
2498   next_state = re_acquire_state_context (err, dfa, &next_nodes, context);
2499   /* We don't need to check errors here, since the return value of
2500      this function is next_state and ERR is already set.  */
2501
2502   re_node_set_free (&next_nodes);
2503   re_string_skip_bytes (&mctx->input, 1);
2504   return next_state;
2505 }
2506 #endif
2507
2508 #ifdef RE_ENABLE_I18N
2509 static reg_errcode_t
2510 internal_function
2511 transit_state_mb (re_match_context_t *mctx, re_dfastate_t *pstate)
2512 {
2513   const re_dfa_t *const dfa = mctx->dfa;
2514   reg_errcode_t err;
2515   int i;
2516
2517   for (i = 0; i < pstate->nodes.nelem; ++i)
2518     {
2519       re_node_set dest_nodes, *new_nodes;
2520       int cur_node_idx = pstate->nodes.elems[i];
2521       int naccepted, dest_idx;
2522       unsigned int context;
2523       re_dfastate_t *dest_state;
2524
2525       if (!dfa->nodes[cur_node_idx].accept_mb)
2526         continue;
2527
2528       if (dfa->nodes[cur_node_idx].constraint)
2529         {
2530           context = re_string_context_at (&mctx->input,
2531                                           re_string_cur_idx (&mctx->input),
2532                                           mctx->eflags);
2533           if (NOT_SATISFY_NEXT_CONSTRAINT (dfa->nodes[cur_node_idx].constraint,
2534                                            context))
2535             continue;
2536         }
2537
2538       /* How many bytes the node can accept?  */
2539       naccepted = check_node_accept_bytes (dfa, cur_node_idx, &mctx->input,
2540                                            re_string_cur_idx (&mctx->input));
2541       if (naccepted == 0)
2542         continue;
2543
2544       /* The node can accepts `naccepted' bytes.  */
2545       dest_idx = re_string_cur_idx (&mctx->input) + naccepted;
2546       mctx->max_mb_elem_len = ((mctx->max_mb_elem_len < naccepted) ? naccepted
2547                                : mctx->max_mb_elem_len);
2548       err = clean_state_log_if_needed (mctx, dest_idx);
2549       if (BE (err != REG_NOERROR, 0))
2550         return err;
2551 #ifdef DEBUG
2552       assert (dfa->nexts[cur_node_idx] != -1);
2553 #endif
2554       new_nodes = dfa->eclosures + dfa->nexts[cur_node_idx];
2555
2556       dest_state = mctx->state_log[dest_idx];
2557       if (dest_state == NULL)
2558         dest_nodes = *new_nodes;
2559       else
2560         {
2561           err = re_node_set_init_union (&dest_nodes,
2562                                         dest_state->entrance_nodes, new_nodes);
2563           if (BE (err != REG_NOERROR, 0))
2564             return err;
2565         }
2566       context = re_string_context_at (&mctx->input, dest_idx - 1,
2567                                       mctx->eflags);
2568       mctx->state_log[dest_idx]
2569         = re_acquire_state_context (&err, dfa, &dest_nodes, context);
2570       if (dest_state != NULL)
2571         re_node_set_free (&dest_nodes);
2572       if (BE (mctx->state_log[dest_idx] == NULL && err != REG_NOERROR, 0))
2573         return err;
2574     }
2575   return REG_NOERROR;
2576 }
2577 #endif /* RE_ENABLE_I18N */
2578
2579 static reg_errcode_t
2580 internal_function
2581 transit_state_bkref (re_match_context_t *mctx, const re_node_set *nodes)
2582 {
2583   const re_dfa_t *const dfa = mctx->dfa;
2584   reg_errcode_t err;
2585   int i;
2586   int cur_str_idx = re_string_cur_idx (&mctx->input);
2587
2588   for (i = 0; i < nodes->nelem; ++i)
2589     {
2590       int dest_str_idx, prev_nelem, bkc_idx;
2591       int node_idx = nodes->elems[i];
2592       unsigned int context;
2593       const re_token_t *node = dfa->nodes + node_idx;
2594       re_node_set *new_dest_nodes;
2595
2596       /* Check whether `node' is a backreference or not.  */
2597       if (node->type != OP_BACK_REF)
2598         continue;
2599
2600       if (node->constraint)
2601         {
2602           context = re_string_context_at (&mctx->input, cur_str_idx,
2603                                           mctx->eflags);
2604           if (NOT_SATISFY_NEXT_CONSTRAINT (node->constraint, context))
2605             continue;
2606         }
2607
2608       /* `node' is a backreference.
2609          Check the substring which the substring matched.  */
2610       bkc_idx = mctx->nbkref_ents;
2611       err = get_subexp (mctx, node_idx, cur_str_idx);
2612       if (BE (err != REG_NOERROR, 0))
2613         goto free_return;
2614
2615       /* And add the epsilon closures (which is `new_dest_nodes') of
2616          the backreference to appropriate state_log.  */
2617 #ifdef DEBUG
2618       assert (dfa->nexts[node_idx] != -1);
2619 #endif
2620       for (; bkc_idx < mctx->nbkref_ents; ++bkc_idx)
2621         {
2622           int subexp_len;
2623           re_dfastate_t *dest_state;
2624           struct re_backref_cache_entry *bkref_ent;
2625           bkref_ent = mctx->bkref_ents + bkc_idx;
2626           if (bkref_ent->node != node_idx || bkref_ent->str_idx != cur_str_idx)
2627             continue;
2628           subexp_len = bkref_ent->subexp_to - bkref_ent->subexp_from;
2629           new_dest_nodes = (subexp_len == 0
2630                             ? dfa->eclosures + dfa->edests[node_idx].elems[0]
2631                             : dfa->eclosures + dfa->nexts[node_idx]);
2632           dest_str_idx = (cur_str_idx + bkref_ent->subexp_to
2633                           - bkref_ent->subexp_from);
2634           context = re_string_context_at (&mctx->input, dest_str_idx - 1,
2635                                           mctx->eflags);
2636           dest_state = mctx->state_log[dest_str_idx];
2637           prev_nelem = ((mctx->state_log[cur_str_idx] == NULL) ? 0
2638                         : mctx->state_log[cur_str_idx]->nodes.nelem);
2639           /* Add `new_dest_node' to state_log.  */
2640           if (dest_state == NULL)
2641             {
2642               mctx->state_log[dest_str_idx]
2643                 = re_acquire_state_context (&err, dfa, new_dest_nodes,
2644                                             context);
2645               if (BE (mctx->state_log[dest_str_idx] == NULL
2646                       && err != REG_NOERROR, 0))
2647                 goto free_return;
2648             }
2649           else
2650             {
2651               re_node_set dest_nodes;
2652               err = re_node_set_init_union (&dest_nodes,
2653                                             dest_state->entrance_nodes,
2654                                             new_dest_nodes);
2655               if (BE (err != REG_NOERROR, 0))
2656                 {
2657                   re_node_set_free (&dest_nodes);
2658                   goto free_return;
2659                 }
2660               mctx->state_log[dest_str_idx]
2661                 = re_acquire_state_context (&err, dfa, &dest_nodes, context);
2662               re_node_set_free (&dest_nodes);
2663               if (BE (mctx->state_log[dest_str_idx] == NULL
2664                       && err != REG_NOERROR, 0))
2665                 goto free_return;
2666             }
2667           /* We need to check recursively if the backreference can epsilon
2668              transit.  */
2669           if (subexp_len == 0
2670               && mctx->state_log[cur_str_idx]->nodes.nelem > prev_nelem)
2671             {
2672               err = check_subexp_matching_top (mctx, new_dest_nodes,
2673                                                cur_str_idx);
2674               if (BE (err != REG_NOERROR, 0))
2675                 goto free_return;
2676               err = transit_state_bkref (mctx, new_dest_nodes);
2677               if (BE (err != REG_NOERROR, 0))
2678                 goto free_return;
2679             }
2680         }
2681     }
2682   err = REG_NOERROR;
2683  free_return:
2684   return err;
2685 }
2686
2687 /* Enumerate all the candidates which the backreference BKREF_NODE can match
2688    at BKREF_STR_IDX, and register them by match_ctx_add_entry().
2689    Note that we might collect inappropriate candidates here.
2690    However, the cost of checking them strictly here is too high, then we
2691    delay these checking for prune_impossible_nodes().  */
2692
2693 static reg_errcode_t
2694 internal_function
2695 get_subexp (re_match_context_t *mctx, int bkref_node, int bkref_str_idx)
2696 {
2697   const re_dfa_t *const dfa = mctx->dfa;
2698   int subexp_num, sub_top_idx;
2699   const char *buf = (const char *) re_string_get_buffer (&mctx->input);
2700   /* Return if we have already checked BKREF_NODE at BKREF_STR_IDX.  */
2701   int cache_idx = search_cur_bkref_entry (mctx, bkref_str_idx);
2702   if (cache_idx != -1)
2703     {
2704       const struct re_backref_cache_entry *entry
2705         = mctx->bkref_ents + cache_idx;
2706       do
2707         if (entry->node == bkref_node)
2708           return REG_NOERROR; /* We already checked it.  */
2709       while (entry++->more);
2710     }
2711
2712   subexp_num = dfa->nodes[bkref_node].opr.idx;
2713
2714   /* For each sub expression  */
2715   for (sub_top_idx = 0; sub_top_idx < mctx->nsub_tops; ++sub_top_idx)
2716     {
2717       reg_errcode_t err;
2718       re_sub_match_top_t *sub_top = mctx->sub_tops[sub_top_idx];
2719       re_sub_match_last_t *sub_last;
2720       int sub_last_idx, sl_str, bkref_str_off;
2721
2722       if (dfa->nodes[sub_top->node].opr.idx != subexp_num)
2723         continue; /* It isn't related.  */
2724
2725       sl_str = sub_top->str_idx;
2726       bkref_str_off = bkref_str_idx;
2727       /* At first, check the last node of sub expressions we already
2728          evaluated.  */
2729       for (sub_last_idx = 0; sub_last_idx < sub_top->nlasts; ++sub_last_idx)
2730         {
2731           int sl_str_diff;
2732           sub_last = sub_top->lasts[sub_last_idx];
2733           sl_str_diff = sub_last->str_idx - sl_str;
2734           /* The matched string by the sub expression match with the substring
2735              at the back reference?  */
2736           if (sl_str_diff > 0)
2737             {
2738               if (BE (bkref_str_off + sl_str_diff > mctx->input.valid_len, 0))
2739                 {
2740                   /* Not enough chars for a successful match.  */
2741                   if (bkref_str_off + sl_str_diff > mctx->input.len)
2742                     break;
2743
2744                   err = clean_state_log_if_needed (mctx,
2745                                                    bkref_str_off
2746                                                    + sl_str_diff);
2747                   if (BE (err != REG_NOERROR, 0))
2748                     return err;
2749                   buf = (const char *) re_string_get_buffer (&mctx->input);
2750                 }
2751               if (memcmp (buf + bkref_str_off, buf + sl_str, sl_str_diff) != 0)
2752                 /* We don't need to search this sub expression any more.  */
2753                 break;
2754             }
2755           bkref_str_off += sl_str_diff;
2756           sl_str += sl_str_diff;
2757           err = get_subexp_sub (mctx, sub_top, sub_last, bkref_node,
2758                                 bkref_str_idx);
2759
2760           /* Reload buf, since the preceding call might have reallocated
2761              the buffer.  */
2762           buf = (const char *) re_string_get_buffer (&mctx->input);
2763
2764           if (err == REG_NOMATCH)
2765             continue;
2766           if (BE (err != REG_NOERROR, 0))
2767             return err;
2768         }
2769
2770       if (sub_last_idx < sub_top->nlasts)
2771         continue;
2772       if (sub_last_idx > 0)
2773         ++sl_str;
2774       /* Then, search for the other last nodes of the sub expression.  */
2775       for (; sl_str <= bkref_str_idx; ++sl_str)
2776         {
2777           int cls_node, sl_str_off;
2778           const re_node_set *nodes;
2779           sl_str_off = sl_str - sub_top->str_idx;
2780           /* The matched string by the sub expression match with the substring
2781              at the back reference?  */
2782           if (sl_str_off > 0)
2783             {
2784               if (BE (bkref_str_off >= mctx->input.valid_len, 0))
2785                 {
2786                   /* If we are at the end of the input, we cannot match.  */
2787                   if (bkref_str_off >= mctx->input.len)
2788                     break;
2789
2790                   err = extend_buffers (mctx);
2791                   if (BE (err != REG_NOERROR, 0))
2792                     return err;
2793
2794                   buf = (const char *) re_string_get_buffer (&mctx->input);
2795                 }
2796               if (buf [bkref_str_off++] != buf[sl_str - 1])
2797                 break; /* We don't need to search this sub expression
2798                           any more.  */
2799             }
2800           if (mctx->state_log[sl_str] == NULL)
2801             continue;
2802           /* Does this state have a ')' of the sub expression?  */
2803           nodes = &mctx->state_log[sl_str]->nodes;
2804           cls_node = find_subexp_node (dfa, nodes, subexp_num,
2805                                        OP_CLOSE_SUBEXP);
2806           if (cls_node == -1)
2807             continue; /* No.  */
2808           if (sub_top->path == NULL)
2809             {
2810               sub_top->path = calloc (sizeof (state_array_t),
2811                                       sl_str - sub_top->str_idx + 1);
2812               if (sub_top->path == NULL)
2813                 return REG_ESPACE;
2814             }
2815           /* Can the OP_OPEN_SUBEXP node arrive the OP_CLOSE_SUBEXP node
2816              in the current context?  */
2817           err = check_arrival (mctx, sub_top->path, sub_top->node,
2818                                sub_top->str_idx, cls_node, sl_str,
2819                                OP_CLOSE_SUBEXP);
2820           if (err == REG_NOMATCH)
2821               continue;
2822           if (BE (err != REG_NOERROR, 0))
2823               return err;
2824           sub_last = match_ctx_add_sublast (sub_top, cls_node, sl_str);
2825           if (BE (sub_last == NULL, 0))
2826             return REG_ESPACE;
2827           err = get_subexp_sub (mctx, sub_top, sub_last, bkref_node,
2828                                 bkref_str_idx);
2829           if (err == REG_NOMATCH)
2830             continue;
2831         }
2832     }
2833   return REG_NOERROR;
2834 }
2835
2836 /* Helper functions for get_subexp().  */
2837
2838 /* Check SUB_LAST can arrive to the back reference BKREF_NODE at BKREF_STR.
2839    If it can arrive, register the sub expression expressed with SUB_TOP
2840    and SUB_LAST.  */
2841
2842 static reg_errcode_t
2843 internal_function
2844 get_subexp_sub (re_match_context_t *mctx, const re_sub_match_top_t *sub_top,
2845                 re_sub_match_last_t *sub_last, int bkref_node, int bkref_str)
2846 {
2847   reg_errcode_t err;
2848   int to_idx;
2849   /* Can the subexpression arrive the back reference?  */
2850   err = check_arrival (mctx, &sub_last->path, sub_last->node,
2851                        sub_last->str_idx, bkref_node, bkref_str,
2852                        OP_OPEN_SUBEXP);
2853   if (err != REG_NOERROR)
2854     return err;
2855   err = match_ctx_add_entry (mctx, bkref_node, bkref_str, sub_top->str_idx,
2856                              sub_last->str_idx);
2857   if (BE (err != REG_NOERROR, 0))
2858     return err;
2859   to_idx = bkref_str + sub_last->str_idx - sub_top->str_idx;
2860   return clean_state_log_if_needed (mctx, to_idx);
2861 }
2862
2863 /* Find the first node which is '(' or ')' and whose index is SUBEXP_IDX.
2864    Search '(' if FL_OPEN, or search ')' otherwise.
2865    TODO: This function isn't efficient...
2866          Because there might be more than one nodes whose types are
2867          OP_OPEN_SUBEXP and whose index is SUBEXP_IDX, we must check all
2868          nodes.
2869          E.g. RE: (a){2}  */
2870
2871 static int
2872 internal_function
2873 find_subexp_node (const re_dfa_t *dfa, const re_node_set *nodes,
2874                   int subexp_idx, int type)
2875 {
2876   int cls_idx;
2877   for (cls_idx = 0; cls_idx < nodes->nelem; ++cls_idx)
2878     {
2879       int cls_node = nodes->elems[cls_idx];
2880       const re_token_t *node = dfa->nodes + cls_node;
2881       if (node->type == type
2882           && node->opr.idx == subexp_idx)
2883         return cls_node;
2884     }
2885   return -1;
2886 }
2887
2888 /* Check whether the node TOP_NODE at TOP_STR can arrive to the node
2889    LAST_NODE at LAST_STR.  We record the path onto PATH since it will be
2890    heavily reused.
2891    Return REG_NOERROR if it can arrive, or REG_NOMATCH otherwise.  */
2892
2893 static reg_errcode_t
2894 internal_function
2895 check_arrival (re_match_context_t *mctx, state_array_t *path, int top_node,
2896                int top_str, int last_node, int last_str, int type)
2897 {
2898   const re_dfa_t *const dfa = mctx->dfa;
2899   reg_errcode_t err = REG_NOERROR;
2900   int subexp_num, backup_cur_idx, str_idx, null_cnt;
2901   re_dfastate_t *cur_state = NULL;
2902   re_node_set *cur_nodes, next_nodes;
2903   re_dfastate_t **backup_state_log;
2904   unsigned int context;
2905
2906   subexp_num = dfa->nodes[top_node].opr.idx;
2907   /* Extend the buffer if we need.  */
2908   if (BE (path->alloc < last_str + mctx->max_mb_elem_len + 1, 0))
2909     {
2910       re_dfastate_t **new_array;
2911       int old_alloc = path->alloc;
2912       path->alloc += last_str + mctx->max_mb_elem_len + 1;
2913       new_array = re_realloc (path->array, re_dfastate_t *, path->alloc);
2914       if (BE (new_array == NULL, 0))
2915         {
2916           path->alloc = old_alloc;
2917           return REG_ESPACE;
2918         }
2919       path->array = new_array;
2920       memset (new_array + old_alloc, '\0',
2921               sizeof (re_dfastate_t *) * (path->alloc - old_alloc));
2922     }
2923
2924   str_idx = path->next_idx ? path->next_idx : top_str;
2925
2926   /* Temporary modify MCTX.  */
2927   backup_state_log = mctx->state_log;
2928   backup_cur_idx = mctx->input.cur_idx;
2929   mctx->state_log = path->array;
2930   mctx->input.cur_idx = str_idx;
2931
2932   /* Setup initial node set.  */
2933   context = re_string_context_at (&mctx->input, str_idx - 1, mctx->eflags);
2934   if (str_idx == top_str)
2935     {
2936       err = re_node_set_init_1 (&next_nodes, top_node);
2937       if (BE (err != REG_NOERROR, 0))
2938         return err;
2939       err = check_arrival_expand_ecl (dfa, &next_nodes, subexp_num, type);
2940       if (BE (err != REG_NOERROR, 0))
2941         {
2942           re_node_set_free (&next_nodes);
2943           return err;
2944         }
2945     }
2946   else
2947     {
2948       cur_state = mctx->state_log[str_idx];
2949       if (cur_state && cur_state->has_backref)
2950         {
2951           err = re_node_set_init_copy (&next_nodes, &cur_state->nodes);
2952           if (BE (err != REG_NOERROR, 0))
2953             return err;
2954         }
2955       else
2956         re_node_set_init_empty (&next_nodes);
2957     }
2958   if (str_idx == top_str || (cur_state && cur_state->has_backref))
2959     {
2960       if (next_nodes.nelem)
2961         {
2962           err = expand_bkref_cache (mctx, &next_nodes, str_idx,
2963                                     subexp_num, type);
2964           if (BE (err != REG_NOERROR, 0))
2965             {
2966               re_node_set_free (&next_nodes);
2967               return err;
2968             }
2969         }
2970       cur_state = re_acquire_state_context (&err, dfa, &next_nodes, context);
2971       if (BE (cur_state == NULL && err != REG_NOERROR, 0))
2972         {
2973           re_node_set_free (&next_nodes);
2974           return err;
2975         }
2976       mctx->state_log[str_idx] = cur_state;
2977     }
2978
2979   for (null_cnt = 0; str_idx < last_str && null_cnt <= mctx->max_mb_elem_len;)
2980     {
2981       re_node_set_empty (&next_nodes);
2982       if (mctx->state_log[str_idx + 1])
2983         {
2984           err = re_node_set_merge (&next_nodes,
2985                                    &mctx->state_log[str_idx + 1]->nodes);
2986           if (BE (err != REG_NOERROR, 0))
2987             {
2988               re_node_set_free (&next_nodes);
2989               return err;
2990             }
2991         }
2992       if (cur_state)
2993         {
2994           err = check_arrival_add_next_nodes (mctx, str_idx,
2995                                               &cur_state->non_eps_nodes,
2996                                               &next_nodes);
2997           if (BE (err != REG_NOERROR, 0))
2998             {
2999               re_node_set_free (&next_nodes);
3000               return err;
3001             }
3002         }
3003       ++str_idx;
3004       if (next_nodes.nelem)
3005         {
3006           err = check_arrival_expand_ecl (dfa, &next_nodes, subexp_num, type);
3007           if (BE (err != REG_NOERROR, 0))
3008             {
3009               re_node_set_free (&next_nodes);
3010               return err;
3011             }
3012           err = expand_bkref_cache (mctx, &next_nodes, str_idx,
3013                                     subexp_num, type);
3014           if (BE (err != REG_NOERROR, 0))
3015             {
3016               re_node_set_free (&next_nodes);
3017               return err;
3018             }
3019         }
3020       context = re_string_context_at (&mctx->input, str_idx - 1, mctx->eflags);
3021       cur_state = re_acquire_state_context (&err, dfa, &next_nodes, context);
3022       if (BE (cur_state == NULL && err != REG_NOERROR, 0))
3023         {
3024           re_node_set_free (&next_nodes);
3025           return err;
3026         }
3027       mctx->state_log[str_idx] = cur_state;
3028       null_cnt = cur_state == NULL ? null_cnt + 1 : 0;
3029     }
3030   re_node_set_free (&next_nodes);
3031   cur_nodes = (mctx->state_log[last_str] == NULL ? NULL
3032                : &mctx->state_log[last_str]->nodes);
3033   path->next_idx = str_idx;
3034
3035   /* Fix MCTX.  */
3036   mctx->state_log = backup_state_log;
3037   mctx->input.cur_idx = backup_cur_idx;
3038
3039   /* Then check the current node set has the node LAST_NODE.  */
3040   if (cur_nodes != NULL && re_node_set_contains (cur_nodes, last_node))
3041     return REG_NOERROR;
3042
3043   return REG_NOMATCH;
3044 }
3045
3046 /* Helper functions for check_arrival.  */
3047
3048 /* Calculate the destination nodes of CUR_NODES at STR_IDX, and append them
3049    to NEXT_NODES.
3050    TODO: This function is similar to the functions transit_state*(),
3051          however this function has many additional works.
3052          Can't we unify them?  */
3053
3054 static reg_errcode_t
3055 internal_function
3056 check_arrival_add_next_nodes (re_match_context_t *mctx, int str_idx,
3057                               re_node_set *cur_nodes, re_node_set *next_nodes)
3058 {
3059   const re_dfa_t *const dfa = mctx->dfa;
3060   int result;
3061   int cur_idx;
3062   reg_errcode_t err = REG_NOERROR;
3063   re_node_set union_set;
3064   re_node_set_init_empty (&union_set);
3065   for (cur_idx = 0; cur_idx < cur_nodes->nelem; ++cur_idx)
3066     {
3067       int naccepted = 0;
3068       int cur_node = cur_nodes->elems[cur_idx];
3069 #ifdef DEBUG
3070       re_token_type_t type = dfa->nodes[cur_node].type;
3071       assert (!IS_EPSILON_NODE (type));
3072 #endif
3073 #ifdef RE_ENABLE_I18N
3074       /* If the node may accept `multi byte'.  */
3075       if (dfa->nodes[cur_node].accept_mb)
3076         {
3077           naccepted = check_node_accept_bytes (dfa, cur_node, &mctx->input,
3078                                                str_idx);
3079           if (naccepted > 1)
3080             {
3081               re_dfastate_t *dest_state;
3082               int next_node = dfa->nexts[cur_node];
3083               int next_idx = str_idx + naccepted;
3084               dest_state = mctx->state_log[next_idx];
3085               re_node_set_empty (&union_set);
3086               if (dest_state)
3087                 {
3088                   err = re_node_set_merge (&union_set, &dest_state->nodes);
3089                   if (BE (err != REG_NOERROR, 0))
3090                     {
3091                       re_node_set_free (&union_set);
3092                       return err;
3093                     }
3094                 }
3095               result = re_node_set_insert (&union_set, next_node);
3096               if (BE (result < 0, 0))
3097                 {
3098                   re_node_set_free (&union_set);
3099                   return REG_ESPACE;
3100                 }
3101               mctx->state_log[next_idx] = re_acquire_state (&err, dfa,
3102                                                             &union_set);
3103               if (BE (mctx->state_log[next_idx] == NULL
3104                       && err != REG_NOERROR, 0))
3105                 {
3106                   re_node_set_free (&union_set);
3107                   return err;
3108                 }
3109             }
3110         }
3111 #endif /* RE_ENABLE_I18N */
3112       if (naccepted
3113           || check_node_accept (mctx, dfa->nodes + cur_node, str_idx))
3114         {
3115           result = re_node_set_insert (next_nodes, dfa->nexts[cur_node]);
3116           if (BE (result < 0, 0))
3117             {
3118               re_node_set_free (&union_set);
3119               return REG_ESPACE;
3120             }
3121         }
3122     }
3123   re_node_set_free (&union_set);
3124   return REG_NOERROR;
3125 }
3126
3127 /* For all the nodes in CUR_NODES, add the epsilon closures of them to
3128    CUR_NODES, however exclude the nodes which are:
3129     - inside the sub expression whose number is EX_SUBEXP, if FL_OPEN.
3130     - out of the sub expression whose number is EX_SUBEXP, if !FL_OPEN.
3131 */
3132
3133 static reg_errcode_t
3134 internal_function
3135 check_arrival_expand_ecl (const re_dfa_t *dfa, re_node_set *cur_nodes,
3136                           int ex_subexp, int type)
3137 {
3138   reg_errcode_t err;
3139   int idx, outside_node;
3140   re_node_set new_nodes;
3141 #ifdef DEBUG
3142   assert (cur_nodes->nelem);
3143 #endif
3144   err = re_node_set_alloc (&new_nodes, cur_nodes->nelem);
3145   if (BE (err != REG_NOERROR, 0))
3146     return err;
3147   /* Create a new node set NEW_NODES with the nodes which are epsilon
3148      closures of the node in CUR_NODES.  */
3149
3150   for (idx = 0; idx < cur_nodes->nelem; ++idx)
3151     {
3152       int cur_node = cur_nodes->elems[idx];
3153       const re_node_set *eclosure = dfa->eclosures + cur_node;
3154       outside_node = find_subexp_node (dfa, eclosure, ex_subexp, type);
3155       if (outside_node == -1)
3156         {
3157           /* There are no problematic nodes, just merge them.  */
3158           err = re_node_set_merge (&new_nodes, eclosure);
3159           if (BE (err != REG_NOERROR, 0))
3160             {
3161               re_node_set_free (&new_nodes);
3162               return err;
3163             }
3164         }
3165       else
3166         {
3167           /* There are problematic nodes, re-calculate incrementally.  */
3168           err = check_arrival_expand_ecl_sub (dfa, &new_nodes, cur_node,
3169                                               ex_subexp, type);
3170           if (BE (err != REG_NOERROR, 0))
3171             {
3172               re_node_set_free (&new_nodes);
3173               return err;
3174             }
3175         }
3176     }
3177   re_node_set_free (cur_nodes);
3178   *cur_nodes = new_nodes;
3179   return REG_NOERROR;
3180 }
3181
3182 /* Helper function for check_arrival_expand_ecl.
3183    Check incrementally the epsilon closure of TARGET, and if it isn't
3184    problematic append it to DST_NODES.  */
3185
3186 static reg_errcode_t
3187 internal_function
3188 check_arrival_expand_ecl_sub (const re_dfa_t *dfa, re_node_set *dst_nodes,
3189                               int target, int ex_subexp, int type)
3190 {
3191   int cur_node;
3192   for (cur_node = target; !re_node_set_contains (dst_nodes, cur_node);)
3193     {
3194       int err;
3195
3196       if (dfa->nodes[cur_node].type == type
3197           && dfa->nodes[cur_node].opr.idx == ex_subexp)
3198         {
3199           if (type == OP_CLOSE_SUBEXP)
3200             {
3201               err = re_node_set_insert (dst_nodes, cur_node);
3202               if (BE (err == -1, 0))
3203                 return REG_ESPACE;
3204             }
3205           break;
3206         }
3207       err = re_node_set_insert (dst_nodes, cur_node);
3208       if (BE (err == -1, 0))
3209         return REG_ESPACE;
3210       if (dfa->edests[cur_node].nelem == 0)
3211         break;
3212       if (dfa->edests[cur_node].nelem == 2)
3213         {
3214           err = check_arrival_expand_ecl_sub (dfa, dst_nodes,
3215                                               dfa->edests[cur_node].elems[1],
3216                                               ex_subexp, type);
3217           if (BE (err != REG_NOERROR, 0))
3218             return err;
3219         }
3220       cur_node = dfa->edests[cur_node].elems[0];
3221     }
3222   return REG_NOERROR;
3223 }
3224
3225
3226 /* For all the back references in the current state, calculate the
3227    destination of the back references by the appropriate entry
3228    in MCTX->BKREF_ENTS.  */
3229
3230 static reg_errcode_t
3231 internal_function
3232 expand_bkref_cache (re_match_context_t *mctx, re_node_set *cur_nodes,
3233                     int cur_str, int subexp_num, int type)
3234 {
3235   const re_dfa_t *const dfa = mctx->dfa;
3236   reg_errcode_t err;
3237   int cache_idx_start = search_cur_bkref_entry (mctx, cur_str);
3238   struct re_backref_cache_entry *ent;
3239
3240   if (cache_idx_start == -1)
3241     return REG_NOERROR;
3242
3243  restart:
3244   ent = mctx->bkref_ents + cache_idx_start;
3245   do
3246     {
3247       int to_idx, next_node;
3248
3249       /* Is this entry ENT is appropriate?  */
3250       if (!re_node_set_contains (cur_nodes, ent->node))
3251         continue; /* No.  */
3252
3253       to_idx = cur_str + ent->subexp_to - ent->subexp_from;
3254       /* Calculate the destination of the back reference, and append it
3255          to MCTX->STATE_LOG.  */
3256       if (to_idx == cur_str)
3257         {
3258           /* The backreference did epsilon transit, we must re-check all the
3259              node in the current state.  */
3260           re_node_set new_dests;
3261           reg_errcode_t err2, err3;
3262           next_node = dfa->edests[ent->node].elems[0];
3263           if (re_node_set_contains (cur_nodes, next_node))
3264             continue;
3265           err = re_node_set_init_1 (&new_dests, next_node);
3266           err2 = check_arrival_expand_ecl (dfa, &new_dests, subexp_num, type);
3267           err3 = re_node_set_merge (cur_nodes, &new_dests);
3268           re_node_set_free (&new_dests);
3269           if (BE (err != REG_NOERROR || err2 != REG_NOERROR
3270                   || err3 != REG_NOERROR, 0))
3271             {
3272               err = (err != REG_NOERROR ? err
3273                      : (err2 != REG_NOERROR ? err2 : err3));
3274               return err;
3275             }
3276           /* TODO: It is still inefficient...  */
3277           goto restart;
3278         }
3279       else
3280         {
3281           re_node_set union_set;
3282           next_node = dfa->nexts[ent->node];
3283           if (mctx->state_log[to_idx])
3284             {
3285               int ret;
3286               if (re_node_set_contains (&mctx->state_log[to_idx]->nodes,
3287                                         next_node))
3288                 continue;
3289               err = re_node_set_init_copy (&union_set,
3290                                            &mctx->state_log[to_idx]->nodes);
3291               ret = re_node_set_insert (&union_set, next_node);
3292               if (BE (err != REG_NOERROR || ret < 0, 0))
3293                 {
3294                   re_node_set_free (&union_set);
3295                   err = err != REG_NOERROR ? err : REG_ESPACE;
3296                   return err;
3297                 }
3298             }
3299           else
3300             {
3301               err = re_node_set_init_1 (&union_set, next_node);
3302               if (BE (err != REG_NOERROR, 0))
3303                 return err;
3304             }
3305           mctx->state_log[to_idx] = re_acquire_state (&err, dfa, &union_set);
3306           re_node_set_free (&union_set);
3307           if (BE (mctx->state_log[to_idx] == NULL
3308                   && err != REG_NOERROR, 0))
3309             return err;
3310         }
3311     }
3312   while (ent++->more);
3313   return REG_NOERROR;
3314 }
3315
3316 /* Build transition table for the state.
3317    Return 1 if succeeded, otherwise return NULL.  */
3318
3319 static int
3320 internal_function
3321 build_trtable (const re_dfa_t *dfa, re_dfastate_t *state)
3322 {
3323   reg_errcode_t err;
3324   int i, j, ch, need_word_trtable = 0;
3325   bitset_word_t elem, mask;
3326   bool dests_node_malloced = false;
3327   bool dest_states_malloced = false;
3328   int ndests; /* Number of the destination states from `state'.  */
3329   re_dfastate_t **trtable;
3330   re_dfastate_t **dest_states = NULL, **dest_states_word, **dest_states_nl;
3331   re_node_set follows, *dests_node;
3332   bitset_t *dests_ch;
3333   bitset_t acceptable;
3334
3335   struct dests_alloc
3336   {
3337     re_node_set dests_node[SBC_MAX];
3338     bitset_t dests_ch[SBC_MAX];
3339   } *dests_alloc;
3340
3341   /* We build DFA states which corresponds to the destination nodes
3342      from `state'.  `dests_node[i]' represents the nodes which i-th
3343      destination state contains, and `dests_ch[i]' represents the
3344      characters which i-th destination state accepts.  */
3345 #ifdef HAVE_ALLOCA
3346   if (__libc_use_alloca (sizeof (struct dests_alloc)))
3347     dests_alloc = (struct dests_alloc *) alloca (sizeof (struct dests_alloc));
3348   else
3349 #endif
3350     {
3351       dests_alloc = re_malloc (struct dests_alloc, 1);
3352       if (BE (dests_alloc == NULL, 0))
3353         return 0;
3354       dests_node_malloced = true;
3355     }
3356   dests_node = dests_alloc->dests_node;
3357   dests_ch = dests_alloc->dests_ch;
3358
3359   /* Initialize transiton table.  */
3360   state->word_trtable = state->trtable = NULL;
3361
3362   /* At first, group all nodes belonging to `state' into several
3363      destinations.  */
3364   ndests = group_nodes_into_DFAstates (dfa, state, dests_node, dests_ch);
3365   if (BE (ndests <= 0, 0))
3366     {
3367       if (dests_node_malloced)
3368         free (dests_alloc);
3369       /* Return 0 in case of an error, 1 otherwise.  */
3370       if (ndests == 0)
3371         {
3372           state->trtable = (re_dfastate_t **)
3373             calloc (sizeof (re_dfastate_t *), SBC_MAX);
3374           return 1;
3375         }
3376       return 0;
3377     }
3378
3379   err = re_node_set_alloc (&follows, ndests + 1);
3380   if (BE (err != REG_NOERROR, 0))
3381     goto out_free;
3382
3383   /* Avoid arithmetic overflow in size calculation.  */
3384   if (BE ((((SIZE_MAX - (sizeof (re_node_set) + sizeof (bitset_t)) * SBC_MAX)
3385             / (3 * sizeof (re_dfastate_t *)))
3386            < ndests),
3387           0))
3388     goto out_free;
3389
3390 #ifdef HAVE_ALLOCA
3391   if (__libc_use_alloca ((sizeof (re_node_set) + sizeof (bitset_t)) * SBC_MAX
3392                          + ndests * 3 * sizeof (re_dfastate_t *)))
3393     dest_states = (re_dfastate_t **)
3394       alloca (ndests * 3 * sizeof (re_dfastate_t *));
3395   else
3396 #endif
3397     {
3398       dest_states = (re_dfastate_t **)
3399         malloc (ndests * 3 * sizeof (re_dfastate_t *));
3400       if (BE (dest_states == NULL, 0))
3401         {
3402 out_free:
3403           if (dest_states_malloced)
3404             free (dest_states);
3405           re_node_set_free (&follows);
3406           for (i = 0; i < ndests; ++i)
3407             re_node_set_free (dests_node + i);
3408           if (dests_node_malloced)
3409             free (dests_alloc);
3410           return 0;
3411         }
3412       dest_states_malloced = true;
3413     }
3414   dest_states_word = dest_states + ndests;
3415   dest_states_nl = dest_states_word + ndests;
3416   bitset_empty (acceptable);
3417
3418   /* Then build the states for all destinations.  */
3419   for (i = 0; i < ndests; ++i)
3420     {
3421       int next_node;
3422       re_node_set_empty (&follows);
3423       /* Merge the follows of this destination states.  */
3424       for (j = 0; j < dests_node[i].nelem; ++j)
3425         {
3426           next_node = dfa->nexts[dests_node[i].elems[j]];
3427           if (next_node != -1)
3428             {
3429               err = re_node_set_merge (&follows, dfa->eclosures + next_node);
3430               if (BE (err != REG_NOERROR, 0))
3431                 goto out_free;
3432             }
3433         }
3434       dest_states[i] = re_acquire_state_context (&err, dfa, &follows, 0);
3435       if (BE (dest_states[i] == NULL && err != REG_NOERROR, 0))
3436         goto out_free;
3437       /* If the new state has context constraint,
3438          build appropriate states for these contexts.  */
3439       if (dest_states[i]->has_constraint)
3440         {
3441           dest_states_word[i] = re_acquire_state_context (&err, dfa, &follows,
3442                                                           CONTEXT_WORD);
3443           if (BE (dest_states_word[i] == NULL && err != REG_NOERROR, 0))
3444             goto out_free;
3445
3446           if (dest_states[i] != dest_states_word[i] && dfa->mb_cur_max > 1)
3447             need_word_trtable = 1;
3448
3449           dest_states_nl[i] = re_acquire_state_context (&err, dfa, &follows,
3450                                                         CONTEXT_NEWLINE);
3451           if (BE (dest_states_nl[i] == NULL && err != REG_NOERROR, 0))
3452             goto out_free;
3453         }
3454       else
3455         {
3456           dest_states_word[i] = dest_states[i];
3457           dest_states_nl[i] = dest_states[i];
3458         }
3459       bitset_merge (acceptable, dests_ch[i]);
3460     }
3461
3462   if (!BE (need_word_trtable, 0))
3463     {
3464       /* We don't care about whether the following character is a word
3465          character, or we are in a single-byte character set so we can
3466          discern by looking at the character code: allocate a
3467          256-entry transition table.  */
3468       trtable = state->trtable =
3469         (re_dfastate_t **) calloc (sizeof (re_dfastate_t *), SBC_MAX);
3470       if (BE (trtable == NULL, 0))
3471         goto out_free;
3472
3473       /* For all characters ch...:  */
3474       for (i = 0; i < BITSET_WORDS; ++i)
3475         for (ch = i * BITSET_WORD_BITS, elem = acceptable[i], mask = 1;
3476              elem;
3477              mask <<= 1, elem >>= 1, ++ch)
3478           if (BE (elem & 1, 0))
3479             {
3480               /* There must be exactly one destination which accepts
3481                  character ch.  See group_nodes_into_DFAstates.  */
3482               for (j = 0; (dests_ch[j][i] & mask) == 0; ++j)
3483                 ;
3484
3485               /* j-th destination accepts the word character ch.  */
3486               if (dfa->word_char[i] & mask)
3487                 trtable[ch] = dest_states_word[j];
3488               else
3489                 trtable[ch] = dest_states[j];
3490             }
3491     }
3492   else
3493     {
3494       /* We care about whether the following character is a word
3495          character, and we are in a multi-byte character set: discern
3496          by looking at the character code: build two 256-entry
3497          transition tables, one starting at trtable[0] and one
3498          starting at trtable[SBC_MAX].  */
3499       trtable = state->word_trtable =
3500         (re_dfastate_t **) calloc (sizeof (re_dfastate_t *), 2 * SBC_MAX);
3501       if (BE (trtable == NULL, 0))
3502         goto out_free;
3503
3504       /* For all characters ch...:  */
3505       for (i = 0; i < BITSET_WORDS; ++i)
3506         for (ch = i * BITSET_WORD_BITS, elem = acceptable[i], mask = 1;
3507              elem;
3508              mask <<= 1, elem >>= 1, ++ch)
3509           if (BE (elem & 1, 0))
3510             {
3511               /* There must be exactly one destination which accepts
3512                  character ch.  See group_nodes_into_DFAstates.  */
3513               for (j = 0; (dests_ch[j][i] & mask) == 0; ++j)
3514                 ;
3515
3516               /* j-th destination accepts the word character ch.  */
3517               trtable[ch] = dest_states[j];
3518               trtable[ch + SBC_MAX] = dest_states_word[j];
3519             }
3520     }
3521
3522   /* new line */
3523   if (bitset_contain (acceptable, NEWLINE_CHAR))
3524     {
3525       /* The current state accepts newline character.  */
3526       for (j = 0; j < ndests; ++j)
3527         if (bitset_contain (dests_ch[j], NEWLINE_CHAR))
3528           {
3529             /* k-th destination accepts newline character.  */
3530             trtable[NEWLINE_CHAR] = dest_states_nl[j];
3531             if (need_word_trtable)
3532               trtable[NEWLINE_CHAR + SBC_MAX] = dest_states_nl[j];
3533             /* There must be only one destination which accepts
3534                newline.  See group_nodes_into_DFAstates.  */
3535             break;
3536           }
3537     }
3538
3539   if (dest_states_malloced)
3540     free (dest_states);
3541
3542   re_node_set_free (&follows);
3543   for (i = 0; i < ndests; ++i)
3544     re_node_set_free (dests_node + i);
3545
3546   if (dests_node_malloced)
3547     free (dests_alloc);
3548
3549   return 1;
3550 }
3551
3552 /* Group all nodes belonging to STATE into several destinations.
3553    Then for all destinations, set the nodes belonging to the destination
3554    to DESTS_NODE[i] and set the characters accepted by the destination
3555    to DEST_CH[i].  This function return the number of destinations.  */
3556
3557 static int
3558 internal_function
3559 group_nodes_into_DFAstates (const re_dfa_t *dfa, const re_dfastate_t *state,
3560                             re_node_set *dests_node, bitset_t *dests_ch)
3561 {
3562   reg_errcode_t err;
3563   int result;
3564   int i, j, k;
3565   int ndests; /* Number of the destinations from `state'.  */
3566   bitset_t accepts; /* Characters a node can accept.  */
3567   const re_node_set *cur_nodes = &state->nodes;
3568   bitset_empty (accepts);
3569   ndests = 0;
3570
3571   /* For all the nodes belonging to `state',  */
3572   for (i = 0; i < cur_nodes->nelem; ++i)
3573     {
3574       re_token_t *node = &dfa->nodes[cur_nodes->elems[i]];
3575       re_token_type_t type = node->type;
3576       unsigned int constraint = node->constraint;
3577
3578       /* Enumerate all single byte character this node can accept.  */
3579       if (type == CHARACTER)
3580         bitset_set (accepts, node->opr.c);
3581       else if (type == SIMPLE_BRACKET)
3582         {
3583           bitset_merge (accepts, node->opr.sbcset);
3584         }
3585       else if (type == OP_PERIOD)
3586         {
3587 #ifdef RE_ENABLE_I18N
3588           if (dfa->mb_cur_max > 1)
3589             bitset_merge (accepts, dfa->sb_char);
3590           else
3591 #endif
3592             bitset_set_all (accepts);
3593           if (!(dfa->syntax & RE_DOT_NEWLINE))
3594             bitset_clear (accepts, '\n');
3595           if (dfa->syntax & RE_DOT_NOT_NULL)
3596             bitset_clear (accepts, '\0');
3597         }
3598 #ifdef RE_ENABLE_I18N
3599       else if (type == OP_UTF8_PERIOD)
3600         {
3601           memset (accepts, '\xff', sizeof (bitset_t) / 2);
3602           if (!(dfa->syntax & RE_DOT_NEWLINE))
3603             bitset_clear (accepts, '\n');
3604           if (dfa->syntax & RE_DOT_NOT_NULL)
3605             bitset_clear (accepts, '\0');
3606         }
3607 #endif
3608       else
3609         continue;
3610
3611       /* Check the `accepts' and sift the characters which are not
3612          match it the context.  */
3613       if (constraint)
3614         {
3615           if (constraint & NEXT_NEWLINE_CONSTRAINT)
3616             {
3617               bool accepts_newline = bitset_contain (accepts, NEWLINE_CHAR);
3618               bitset_empty (accepts);
3619               if (accepts_newline)
3620                 bitset_set (accepts, NEWLINE_CHAR);
3621               else
3622                 continue;
3623             }
3624           if (constraint & NEXT_ENDBUF_CONSTRAINT)
3625             {
3626               bitset_empty (accepts);
3627               continue;
3628             }
3629
3630           if (constraint & NEXT_WORD_CONSTRAINT)
3631             {
3632               bitset_word_t any_set = 0;
3633               if (type == CHARACTER && !node->word_char)
3634                 {
3635                   bitset_empty (accepts);
3636                   continue;
3637                 }
3638 #ifdef RE_ENABLE_I18N
3639               if (dfa->mb_cur_max > 1)
3640                 for (j = 0; j < BITSET_WORDS; ++j)
3641                   any_set |= (accepts[j] &= (dfa->word_char[j] | ~dfa->sb_char[j]));
3642               else
3643 #endif
3644                 for (j = 0; j < BITSET_WORDS; ++j)
3645                   any_set |= (accepts[j] &= dfa->word_char[j]);
3646               if (!any_set)
3647                 continue;
3648             }
3649           if (constraint & NEXT_NOTWORD_CONSTRAINT)
3650             {
3651               bitset_word_t any_set = 0;
3652               if (type == CHARACTER && node->word_char)
3653                 {
3654                   bitset_empty (accepts);
3655                   continue;
3656                 }
3657 #ifdef RE_ENABLE_I18N
3658               if (dfa->mb_cur_max > 1)
3659                 for (j = 0; j < BITSET_WORDS; ++j)
3660                   any_set |= (accepts[j] &= ~(dfa->word_char[j] & dfa->sb_char[j]));
3661               else
3662 #endif
3663                 for (j = 0; j < BITSET_WORDS; ++j)
3664                   any_set |= (accepts[j] &= ~dfa->word_char[j]);
3665               if (!any_set)
3666                 continue;
3667             }
3668         }
3669
3670       /* Then divide `accepts' into DFA states, or create a new
3671          state.  Above, we make sure that accepts is not empty.  */
3672       for (j = 0; j < ndests; ++j)
3673         {
3674           bitset_t intersec; /* Intersection sets, see below.  */
3675           bitset_t remains;
3676           /* Flags, see below.  */
3677           bitset_word_t has_intersec, not_subset, not_consumed;
3678
3679           /* Optimization, skip if this state doesn't accept the character.  */
3680           if (type == CHARACTER && !bitset_contain (dests_ch[j], node->opr.c))
3681             continue;
3682
3683           /* Enumerate the intersection set of this state and `accepts'.  */
3684           has_intersec = 0;
3685           for (k = 0; k < BITSET_WORDS; ++k)
3686             has_intersec |= intersec[k] = accepts[k] & dests_ch[j][k];
3687           /* And skip if the intersection set is empty.  */
3688           if (!has_intersec)
3689             continue;
3690
3691           /* Then check if this state is a subset of `accepts'.  */
3692           not_subset = not_consumed = 0;
3693           for (k = 0; k < BITSET_WORDS; ++k)
3694             {
3695               not_subset |= remains[k] = ~accepts[k] & dests_ch[j][k];
3696               not_consumed |= accepts[k] = accepts[k] & ~dests_ch[j][k];
3697             }
3698
3699           /* If this state isn't a subset of `accepts', create a
3700              new group state, which has the `remains'. */
3701           if (not_subset)
3702             {
3703               bitset_copy (dests_ch[ndests], remains);
3704               bitset_copy (dests_ch[j], intersec);
3705               err = re_node_set_init_copy (dests_node + ndests, &dests_node[j]);
3706               if (BE (err != REG_NOERROR, 0))
3707                 goto error_return;
3708               ++ndests;
3709             }
3710
3711           /* Put the position in the current group. */
3712           result = re_node_set_insert (&dests_node[j], cur_nodes->elems[i]);
3713           if (BE (result < 0, 0))
3714             goto error_return;
3715
3716           /* If all characters are consumed, go to next node. */
3717           if (!not_consumed)
3718             break;
3719         }
3720       /* Some characters remain, create a new group. */
3721       if (j == ndests)
3722         {
3723           bitset_copy (dests_ch[ndests], accepts);
3724           err = re_node_set_init_1 (dests_node + ndests, cur_nodes->elems[i]);
3725           if (BE (err != REG_NOERROR, 0))
3726             goto error_return;
3727           ++ndests;
3728           bitset_empty (accepts);
3729         }
3730     }
3731   return ndests;
3732  error_return:
3733   for (j = 0; j < ndests; ++j)
3734     re_node_set_free (dests_node + j);
3735   return -1;
3736 }
3737
3738 #ifdef RE_ENABLE_I18N
3739 /* Check how many bytes the node `dfa->nodes[node_idx]' accepts.
3740    Return the number of the bytes the node accepts.
3741    STR_IDX is the current index of the input string.
3742
3743    This function handles the nodes which can accept one character, or
3744    one collating element like '.', '[a-z]', opposite to the other nodes
3745    can only accept one byte.  */
3746
3747 static int
3748 internal_function
3749 check_node_accept_bytes (const re_dfa_t *dfa, int node_idx,
3750                          const re_string_t *input, int str_idx)
3751 {
3752   const re_token_t *node = dfa->nodes + node_idx;
3753   int char_len, elem_len;
3754   int i;
3755   wint_t wc;
3756
3757   if (BE (node->type == OP_UTF8_PERIOD, 0))
3758     {
3759       unsigned char c = re_string_byte_at (input, str_idx), d;
3760       if (BE (c < 0xc2, 1))
3761         return 0;
3762
3763       if (str_idx + 2 > input->len)
3764         return 0;
3765
3766       d = re_string_byte_at (input, str_idx + 1);
3767       if (c < 0xe0)
3768         return (d < 0x80 || d > 0xbf) ? 0 : 2;
3769       else if (c < 0xf0)
3770         {
3771           char_len = 3;
3772           if (c == 0xe0 && d < 0xa0)
3773             return 0;
3774         }
3775       else if (c < 0xf8)
3776         {
3777           char_len = 4;
3778           if (c == 0xf0 && d < 0x90)
3779             return 0;
3780         }
3781       else if (c < 0xfc)
3782         {
3783           char_len = 5;
3784           if (c == 0xf8 && d < 0x88)
3785             return 0;
3786         }
3787       else if (c < 0xfe)
3788         {
3789           char_len = 6;
3790           if (c == 0xfc && d < 0x84)
3791             return 0;
3792         }
3793       else
3794         return 0;
3795
3796       if (str_idx + char_len > input->len)
3797         return 0;
3798
3799       for (i = 1; i < char_len; ++i)
3800         {
3801           d = re_string_byte_at (input, str_idx + i);
3802           if (d < 0x80 || d > 0xbf)
3803             return 0;
3804         }
3805       return char_len;
3806     }
3807
3808   char_len = re_string_char_size_at (input, str_idx);
3809   if (node->type == OP_PERIOD)
3810     {
3811       if (char_len <= 1)
3812         return 0;
3813       /* FIXME: I don't think this if is needed, as both '\n'
3814          and '\0' are char_len == 1.  */
3815       /* '.' accepts any one character except the following two cases.  */
3816       if ((!(dfa->syntax & RE_DOT_NEWLINE) &&
3817            re_string_byte_at (input, str_idx) == '\n') ||
3818           ((dfa->syntax & RE_DOT_NOT_NULL) &&
3819            re_string_byte_at (input, str_idx) == '\0'))
3820         return 0;
3821       return char_len;
3822     }
3823
3824   elem_len = re_string_elem_size_at (input, str_idx);
3825   wc = __btowc(*(input->mbs+str_idx));
3826   if (((elem_len <= 1 && char_len <= 1) || char_len == 0) && (wc != WEOF && wc < SBC_MAX))
3827     return 0;
3828
3829   if (node->type == COMPLEX_BRACKET)
3830     {
3831       const re_charset_t *cset = node->opr.mbcset;
3832 # ifdef _LIBC
3833       const unsigned char *pin
3834         = ((const unsigned char *) re_string_get_buffer (input) + str_idx);
3835       int j;
3836       uint32_t nrules;
3837 # endif /* _LIBC */
3838       int match_len = 0;
3839       wchar_t wc = ((cset->nranges || cset->nchar_classes || cset->nmbchars)
3840                     ? re_string_wchar_at (input, str_idx) : 0);
3841
3842       /* match with multibyte character?  */
3843       for (i = 0; i < cset->nmbchars; ++i)
3844         if (wc == cset->mbchars[i])
3845           {
3846             match_len = char_len;
3847             goto check_node_accept_bytes_match;
3848           }
3849       /* match with character_class?  */
3850       for (i = 0; i < cset->nchar_classes; ++i)
3851         {
3852           wctype_t wt = cset->char_classes[i];
3853           if (__iswctype (wc, wt))
3854             {
3855               match_len = char_len;
3856               goto check_node_accept_bytes_match;
3857             }
3858         }
3859
3860 # ifdef _LIBC
3861       nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3862       if (nrules != 0)
3863         {
3864           unsigned int in_collseq = 0;
3865           const int32_t *table, *indirect;
3866           const unsigned char *weights, *extra;
3867           const char *collseqwc;
3868           /* This #include defines a local function!  */
3869 #  include <locale/weight.h>
3870
3871           /* match with collating_symbol?  */
3872           if (cset->ncoll_syms)
3873             extra = (const unsigned char *)
3874               _NL_CURRENT (LC_COLLATE, _NL_COLLATE_SYMB_EXTRAMB);
3875           for (i = 0; i < cset->ncoll_syms; ++i)
3876             {
3877               const unsigned char *coll_sym = extra + cset->coll_syms[i];
3878               /* Compare the length of input collating element and
3879                  the length of current collating element.  */
3880               if (*coll_sym != elem_len)
3881                 continue;
3882               /* Compare each bytes.  */
3883               for (j = 0; j < *coll_sym; j++)
3884                 if (pin[j] != coll_sym[1 + j])
3885                   break;
3886               if (j == *coll_sym)
3887                 {
3888                   /* Match if every bytes is equal.  */
3889                   match_len = j;
3890                   goto check_node_accept_bytes_match;
3891                 }
3892             }
3893
3894           if (cset->nranges)
3895             {
3896               if (elem_len <= char_len)
3897                 {
3898                   collseqwc = _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQWC);
3899                   in_collseq = __collseq_table_lookup (collseqwc, wc);
3900                 }
3901               else
3902                 in_collseq = find_collation_sequence_value (pin, elem_len);
3903             }
3904           /* match with range expression?  */
3905           for (i = 0; i < cset->nranges; ++i)
3906             if (cset->range_starts[i] <= in_collseq
3907                 && in_collseq <= cset->range_ends[i])
3908               {
3909                 match_len = elem_len;
3910                 goto check_node_accept_bytes_match;
3911               }
3912
3913           /* match with equivalence_class?  */
3914           if (cset->nequiv_classes)
3915             {
3916               const unsigned char *cp = pin;
3917               table = (const int32_t *)
3918                 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
3919               weights = (const unsigned char *)
3920                 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_WEIGHTMB);
3921               extra = (const unsigned char *)
3922                 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_EXTRAMB);
3923               indirect = (const int32_t *)
3924                 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_INDIRECTMB);
3925               int32_t idx = findidx (&cp);
3926               if (idx > 0)
3927                 for (i = 0; i < cset->nequiv_classes; ++i)
3928                   {
3929                     int32_t equiv_class_idx = cset->equiv_classes[i];
3930                     size_t weight_len = weights[idx & 0xffffff];
3931                     if (weight_len == weights[equiv_class_idx & 0xffffff]
3932                         && (idx >> 24) == (equiv_class_idx >> 24))
3933                       {
3934                         int cnt = 0;
3935
3936                         idx &= 0xffffff;
3937                         equiv_class_idx &= 0xffffff;
3938
3939                         while (cnt <= weight_len
3940                                && (weights[equiv_class_idx + 1 + cnt]
3941                                    == weights[idx + 1 + cnt]))
3942                           ++cnt;
3943                         if (cnt > weight_len)
3944                           {
3945                             match_len = elem_len;
3946                             goto check_node_accept_bytes_match;
3947                           }
3948                       }
3949                   }
3950             }
3951         }
3952       else
3953 # endif /* _LIBC */
3954         {
3955           /* match with range expression?  */
3956 #if __GNUC__ >= 2
3957           wchar_t cmp_buf[] = {L'\0', L'\0', wc, L'\0', L'\0', L'\0'};
3958 #else
3959           wchar_t cmp_buf[] = {L'\0', L'\0', L'\0', L'\0', L'\0', L'\0'};
3960           cmp_buf[2] = wc;
3961 #endif
3962           for (i = 0; i < cset->nranges; ++i)
3963             {
3964               cmp_buf[0] = cset->range_starts[i];
3965               cmp_buf[4] = cset->range_ends[i];
3966               if (wcscoll (cmp_buf, cmp_buf + 2) <= 0
3967                   && wcscoll (cmp_buf + 2, cmp_buf + 4) <= 0)
3968                 {
3969                   match_len = char_len;
3970                   goto check_node_accept_bytes_match;
3971                 }
3972             }
3973         }
3974     check_node_accept_bytes_match:
3975       if (!cset->non_match)
3976         return match_len;
3977       else
3978         {
3979           if (match_len > 0)
3980             return 0;
3981           else
3982             return (elem_len > char_len) ? elem_len : char_len;
3983         }
3984     }
3985   return 0;
3986 }
3987
3988 # ifdef _LIBC
3989 static unsigned int
3990 internal_function
3991 find_collation_sequence_value (const unsigned char *mbs, size_t mbs_len)
3992 {
3993   uint32_t nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3994   if (nrules == 0)
3995     {
3996       if (mbs_len == 1)
3997         {
3998           /* No valid character.  Match it as a single byte character.  */
3999           const unsigned char *collseq = (const unsigned char *)
4000             _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQMB);
4001           return collseq[mbs[0]];
4002         }
4003       return UINT_MAX;
4004     }
4005   else
4006     {
4007       int32_t idx;
4008       const unsigned char *extra = (const unsigned char *)
4009         _NL_CURRENT (LC_COLLATE, _NL_COLLATE_SYMB_EXTRAMB);
4010       int32_t extrasize = (const unsigned char *)
4011         _NL_CURRENT (LC_COLLATE, _NL_COLLATE_SYMB_EXTRAMB + 1) - extra;
4012
4013       for (idx = 0; idx < extrasize;)
4014         {
4015           int mbs_cnt, found = 0;
4016           int32_t elem_mbs_len;
4017           /* Skip the name of collating element name.  */
4018           idx = idx + extra[idx] + 1;
4019           elem_mbs_len = extra[idx++];
4020           if (mbs_len == elem_mbs_len)
4021             {
4022               for (mbs_cnt = 0; mbs_cnt < elem_mbs_len; ++mbs_cnt)
4023                 if (extra[idx + mbs_cnt] != mbs[mbs_cnt])
4024                   break;
4025               if (mbs_cnt == elem_mbs_len)
4026                 /* Found the entry.  */
4027                 found = 1;
4028             }
4029           /* Skip the byte sequence of the collating element.  */
4030           idx += elem_mbs_len;
4031           /* Adjust for the alignment.  */
4032           idx = (idx + 3) & ~3;
4033           /* Skip the collation sequence value.  */
4034           idx += sizeof (uint32_t);
4035           /* Skip the wide char sequence of the collating element.  */
4036           idx = idx + sizeof (uint32_t) * (extra[idx] + 1);
4037           /* If we found the entry, return the sequence value.  */
4038           if (found)
4039             return *(uint32_t *) (extra + idx);
4040           /* Skip the collation sequence value.  */
4041           idx += sizeof (uint32_t);
4042         }
4043       return UINT_MAX;
4044     }
4045 }
4046 # endif /* _LIBC */
4047 #endif /* RE_ENABLE_I18N */
4048
4049 /* Check whether the node accepts the byte which is IDX-th
4050    byte of the INPUT.  */
4051
4052 static int
4053 internal_function
4054 check_node_accept (const re_match_context_t *mctx, const re_token_t *node,
4055                    int idx)
4056 {
4057   unsigned char ch;
4058   ch = re_string_byte_at (&mctx->input, idx);
4059   switch (node->type)
4060     {
4061     case CHARACTER:
4062       if (node->opr.c != ch)
4063         return 0;
4064       break;
4065
4066     case SIMPLE_BRACKET:
4067       if (!bitset_contain (node->opr.sbcset, ch))
4068         return 0;
4069       break;
4070
4071 #ifdef RE_ENABLE_I18N
4072     case OP_UTF8_PERIOD:
4073       if (ch >= 0x80)
4074         return 0;
4075       /* FALLTHROUGH */
4076 #endif
4077     case OP_PERIOD:
4078       if ((ch == '\n' && !(mctx->dfa->syntax & RE_DOT_NEWLINE))
4079           || (ch == '\0' && (mctx->dfa->syntax & RE_DOT_NOT_NULL)))
4080         return 0;
4081       break;
4082
4083     default:
4084       return 0;
4085     }
4086
4087   if (node->constraint)
4088     {
4089       /* The node has constraints.  Check whether the current context
4090          satisfies the constraints.  */
4091       unsigned int context = re_string_context_at (&mctx->input, idx,
4092                                                    mctx->eflags);
4093       if (NOT_SATISFY_NEXT_CONSTRAINT (node->constraint, context))
4094         return 0;
4095     }
4096
4097   return 1;
4098 }
4099
4100 /* Extend the buffers, if the buffers have run out.  */
4101
4102 static reg_errcode_t
4103 internal_function
4104 extend_buffers (re_match_context_t *mctx)
4105 {
4106   reg_errcode_t ret;
4107   re_string_t *pstr = &mctx->input;
4108
4109   /* Avoid overflow.  */
4110   if (BE (INT_MAX / 2 / sizeof (re_dfastate_t *) <= pstr->bufs_len, 0))
4111     return REG_ESPACE;
4112
4113   /* Double the lengthes of the buffers.  */
4114   ret = re_string_realloc_buffers (pstr, pstr->bufs_len * 2);
4115   if (BE (ret != REG_NOERROR, 0))
4116     return ret;
4117
4118   if (mctx->state_log != NULL)
4119     {
4120       /* And double the length of state_log.  */
4121       /* XXX We have no indication of the size of this buffer.  If this
4122          allocation fail we have no indication that the state_log array
4123          does not have the right size.  */
4124       re_dfastate_t **new_array = re_realloc (mctx->state_log, re_dfastate_t *,
4125                                               pstr->bufs_len + 1);
4126       if (BE (new_array == NULL, 0))
4127         return REG_ESPACE;
4128       mctx->state_log = new_array;
4129     }
4130
4131   /* Then reconstruct the buffers.  */
4132   if (pstr->icase)
4133     {
4134 #ifdef RE_ENABLE_I18N
4135       if (pstr->mb_cur_max > 1)
4136         {
4137           ret = build_wcs_upper_buffer (pstr);
4138           if (BE (ret != REG_NOERROR, 0))
4139             return ret;
4140         }
4141       else
4142 #endif /* RE_ENABLE_I18N  */
4143         build_upper_buffer (pstr);
4144     }
4145   else
4146     {
4147 #ifdef RE_ENABLE_I18N
4148       if (pstr->mb_cur_max > 1)
4149         build_wcs_buffer (pstr);
4150       else
4151 #endif /* RE_ENABLE_I18N  */
4152         {
4153           if (pstr->trans != NULL)
4154             re_string_translate_buffer (pstr);
4155         }
4156     }
4157   return REG_NOERROR;
4158 }
4159
4160 \f
4161 /* Functions for matching context.  */
4162
4163 /* Initialize MCTX.  */
4164
4165 static reg_errcode_t
4166 internal_function
4167 match_ctx_init (re_match_context_t *mctx, int eflags, int n)
4168 {
4169   mctx->eflags = eflags;
4170   mctx->match_last = -1;
4171   if (n > 0)
4172     {
4173       mctx->bkref_ents = re_malloc (struct re_backref_cache_entry, n);
4174       mctx->sub_tops = re_malloc (re_sub_match_top_t *, n);
4175       if (BE (mctx->bkref_ents == NULL || mctx->sub_tops == NULL, 0))
4176         return REG_ESPACE;
4177     }
4178   /* Already zero-ed by the caller.
4179      else
4180        mctx->bkref_ents = NULL;
4181      mctx->nbkref_ents = 0;
4182      mctx->nsub_tops = 0;  */
4183   mctx->abkref_ents = n;
4184   mctx->max_mb_elem_len = 1;
4185   mctx->asub_tops = n;
4186   return REG_NOERROR;
4187 }
4188
4189 /* Clean the entries which depend on the current input in MCTX.
4190    This function must be invoked when the matcher changes the start index
4191    of the input, or changes the input string.  */
4192
4193 static void
4194 internal_function
4195 match_ctx_clean (re_match_context_t *mctx)
4196 {
4197   int st_idx;
4198   for (st_idx = 0; st_idx < mctx->nsub_tops; ++st_idx)
4199     {
4200       int sl_idx;
4201       re_sub_match_top_t *top = mctx->sub_tops[st_idx];
4202       for (sl_idx = 0; sl_idx < top->nlasts; ++sl_idx)
4203         {
4204           re_sub_match_last_t *last = top->lasts[sl_idx];
4205           re_free (last->path.array);
4206           re_free (last);
4207         }
4208       re_free (top->lasts);
4209       if (top->path)
4210         {
4211           re_free (top->path->array);
4212           re_free (top->path);
4213         }
4214       free (top);
4215     }
4216
4217   mctx->nsub_tops = 0;
4218   mctx->nbkref_ents = 0;
4219 }
4220
4221 /* Free all the memory associated with MCTX.  */
4222
4223 static void
4224 internal_function
4225 match_ctx_free (re_match_context_t *mctx)
4226 {
4227   /* First, free all the memory associated with MCTX->SUB_TOPS.  */
4228   match_ctx_clean (mctx);
4229   re_free (mctx->sub_tops);
4230   re_free (mctx->bkref_ents);
4231 }
4232
4233 /* Add a new backreference entry to MCTX.
4234    Note that we assume that caller never call this function with duplicate
4235    entry, and call with STR_IDX which isn't smaller than any existing entry.
4236 */
4237
4238 static reg_errcode_t
4239 internal_function
4240 match_ctx_add_entry (re_match_context_t *mctx, int node, int str_idx, int from,
4241                      int to)
4242 {
4243   if (mctx->nbkref_ents >= mctx->abkref_ents)
4244     {
4245       struct re_backref_cache_entry* new_entry;
4246       new_entry = re_realloc (mctx->bkref_ents, struct re_backref_cache_entry,
4247                               mctx->abkref_ents * 2);
4248       if (BE (new_entry == NULL, 0))
4249         {
4250           re_free (mctx->bkref_ents);
4251           return REG_ESPACE;
4252         }
4253       mctx->bkref_ents = new_entry;
4254       memset (mctx->bkref_ents + mctx->nbkref_ents, '\0',
4255               sizeof (struct re_backref_cache_entry) * mctx->abkref_ents);
4256       mctx->abkref_ents *= 2;
4257     }
4258   if (mctx->nbkref_ents > 0
4259       && mctx->bkref_ents[mctx->nbkref_ents - 1].str_idx == str_idx)
4260     mctx->bkref_ents[mctx->nbkref_ents - 1].more = 1;
4261
4262   mctx->bkref_ents[mctx->nbkref_ents].node = node;
4263   mctx->bkref_ents[mctx->nbkref_ents].str_idx = str_idx;
4264   mctx->bkref_ents[mctx->nbkref_ents].subexp_from = from;
4265   mctx->bkref_ents[mctx->nbkref_ents].subexp_to = to;
4266
4267   /* This is a cache that saves negative results of check_dst_limits_calc_pos.
4268      If bit N is clear, means that this entry won't epsilon-transition to
4269      an OP_OPEN_SUBEXP or OP_CLOSE_SUBEXP for the N+1-th subexpression.  If
4270      it is set, check_dst_limits_calc_pos_1 will recurse and try to find one
4271      such node.
4272
4273      A backreference does not epsilon-transition unless it is empty, so set
4274      to all zeros if FROM != TO.  */
4275   mctx->bkref_ents[mctx->nbkref_ents].eps_reachable_subexps_map
4276     = (from == to ? ~0 : 0);
4277
4278   mctx->bkref_ents[mctx->nbkref_ents++].more = 0;
4279   if (mctx->max_mb_elem_len < to - from)
4280     mctx->max_mb_elem_len = to - from;
4281   return REG_NOERROR;
4282 }
4283
4284 /* Search for the first entry which has the same str_idx, or -1 if none is
4285    found.  Note that MCTX->BKREF_ENTS is already sorted by MCTX->STR_IDX.  */
4286
4287 static int
4288 internal_function
4289 search_cur_bkref_entry (const re_match_context_t *mctx, int str_idx)
4290 {
4291   int left, right, mid, last;
4292   last = right = mctx->nbkref_ents;
4293   for (left = 0; left < right;)
4294     {
4295       mid = (left + right) / 2;
4296       if (mctx->bkref_ents[mid].str_idx < str_idx)
4297         left = mid + 1;
4298       else
4299         right = mid;
4300     }
4301   if (left < last && mctx->bkref_ents[left].str_idx == str_idx)
4302     return left;
4303   else
4304     return -1;
4305 }
4306
4307 /* Register the node NODE, whose type is OP_OPEN_SUBEXP, and which matches
4308    at STR_IDX.  */
4309
4310 static reg_errcode_t
4311 internal_function
4312 match_ctx_add_subtop (re_match_context_t *mctx, int node, int str_idx)
4313 {
4314 #ifdef DEBUG
4315   assert (mctx->sub_tops != NULL);
4316   assert (mctx->asub_tops > 0);
4317 #endif
4318   if (BE (mctx->nsub_tops == mctx->asub_tops, 0))
4319     {
4320       int new_asub_tops = mctx->asub_tops * 2;
4321       re_sub_match_top_t **new_array = re_realloc (mctx->sub_tops,
4322                                                    re_sub_match_top_t *,
4323                                                    new_asub_tops);
4324       if (BE (new_array == NULL, 0))
4325         return REG_ESPACE;
4326       mctx->sub_tops = new_array;
4327       mctx->asub_tops = new_asub_tops;
4328     }
4329   mctx->sub_tops[mctx->nsub_tops] = calloc (1, sizeof (re_sub_match_top_t));
4330   if (BE (mctx->sub_tops[mctx->nsub_tops] == NULL, 0))
4331     return REG_ESPACE;
4332   mctx->sub_tops[mctx->nsub_tops]->node = node;
4333   mctx->sub_tops[mctx->nsub_tops++]->str_idx = str_idx;
4334   return REG_NOERROR;
4335 }
4336
4337 /* Register the node NODE, whose type is OP_CLOSE_SUBEXP, and which matches
4338    at STR_IDX, whose corresponding OP_OPEN_SUBEXP is SUB_TOP.  */
4339
4340 static re_sub_match_last_t *
4341 internal_function
4342 match_ctx_add_sublast (re_sub_match_top_t *subtop, int node, int str_idx)
4343 {
4344   re_sub_match_last_t *new_entry;
4345   if (BE (subtop->nlasts == subtop->alasts, 0))
4346     {
4347       int new_alasts = 2 * subtop->alasts + 1;
4348       re_sub_match_last_t **new_array = re_realloc (subtop->lasts,
4349                                                     re_sub_match_last_t *,
4350                                                     new_alasts);
4351       if (BE (new_array == NULL, 0))
4352         return NULL;
4353       subtop->lasts = new_array;
4354       subtop->alasts = new_alasts;
4355     }
4356   new_entry = calloc (1, sizeof (re_sub_match_last_t));
4357   if (BE (new_entry != NULL, 1))
4358     {
4359       subtop->lasts[subtop->nlasts] = new_entry;
4360       new_entry->node = node;
4361       new_entry->str_idx = str_idx;
4362       ++subtop->nlasts;
4363     }
4364   return new_entry;
4365 }
4366
4367 static void
4368 internal_function
4369 sift_ctx_init (re_sift_context_t *sctx, re_dfastate_t **sifted_sts,
4370                re_dfastate_t **limited_sts, int last_node, int last_str_idx)
4371 {
4372   sctx->sifted_states = sifted_sts;
4373   sctx->limited_states = limited_sts;
4374   sctx->last_node = last_node;
4375   sctx->last_str_idx = last_str_idx;
4376   re_node_set_init_empty (&sctx->limits);
4377 }