compat/regex/regexec.c

   1 /* Extended regular expression matching and search library.
   2    Copyright (C) 2002-2005, 2007, 2009, 2010 Free Software Foundation, Inc.
   3    This file is part of the GNU C Library.
   4    Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
   5
   6    The GNU C Library is free software; you can redistribute it and/or
   7    modify it under the terms of the GNU Lesser General Public
   8    License as published by the Free Software Foundation; either
   9    version 2.1 of the License, or (at your option) any later version.
  10
  11    The GNU C Library is distributed in the hope that it will be useful,
  12    but WITHOUT ANY WARRANTY; without even the implied warranty of
  13    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  14    Lesser General Public License for more details.
  15
  16    You should have received a copy of the GNU Lesser General Public
  17    License along with the GNU C Library; if not, write to the Free
  18    Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
  19    02110-1301 USA.  */
  20
  21 static reg_errcode_t match_ctx_init (re_match_context_t *cache, int eflags,
  22                                      int n) internal_function;
  23 static void match_ctx_clean (re_match_context_t *mctx) internal_function;
  24 static void match_ctx_free (re_match_context_t *cache) internal_function;
  25 static reg_errcode_t match_ctx_add_entry (re_match_context_t *cache, int node,
  26                                           int str_idx, int from, int to)
  27      internal_function;
  28 static int search_cur_bkref_entry (const re_match_context_t *mctx, int str_idx)
  29      internal_function;
  30 static reg_errcode_t match_ctx_add_subtop (re_match_context_t *mctx, int node,
  31                                            int str_idx) internal_function;
  32 static re_sub_match_last_t * match_ctx_add_sublast (re_sub_match_top_t *subtop,
  33                                                    int node, int str_idx)
  34      internal_function;
  35 static void sift_ctx_init (re_sift_context_t *sctx, re_dfastate_t **sifted_sts,
  36                            re_dfastate_t **limited_sts, int last_node,
  37                            int last_str_idx)
  38      internal_function;
  39 static reg_errcode_t re_search_internal (const regex_t *preg,
  40                                          const char *string, int length,
  41                                          int start, int range, int stop,
  42                                          size_t nmatch, regmatch_t pmatch[],
  43                                          int eflags) internal_function;
  44 static int re_search_2_stub (struct re_pattern_buffer *bufp,
  45                              const char *string1, int length1,
  46                              const char *string2, int length2,
  47                              int start, int range, struct re_registers *regs,
  48                              int stop, int ret_len) internal_function;
  49 static int re_search_stub (struct re_pattern_buffer *bufp,
  50                            const char *string, int length, int start,
  51                            int range, int stop, struct re_registers *regs,
  52                            int ret_len) internal_function;
  53 static unsigned re_copy_regs (struct re_registers *regs, regmatch_t *pmatch,
  54                               int nregs, int regs_allocated) internal_function;
  55 static reg_errcode_t prune_impossible_nodes (re_match_context_t *mctx)
  56      internal_function;
  57 static int check_matching (re_match_context_t *mctx, int fl_longest_match,
  58                            int *p_match_first) internal_function;
  59 static int check_halt_state_context (const re_match_context_t *mctx,
  60                                      const re_dfastate_t *state, int idx)
  61      internal_function;
  62 static void update_regs (const re_dfa_t *dfa, regmatch_t *pmatch,
  63                          regmatch_t *prev_idx_match, int cur_node,
  64                          int cur_idx, int nmatch) internal_function;
  65 static reg_errcode_t push_fail_stack (struct re_fail_stack_t *fs,
  66                                       int str_idx, int dest_node, int nregs,
  67                                       regmatch_t *regs,
  68                                       re_node_set *eps_via_nodes)
  69      internal_function;
  70 static reg_errcode_t set_regs (const regex_t *preg,
  71                                const re_match_context_t *mctx,
  72                                size_t nmatch, regmatch_t *pmatch,
  73                                int fl_backtrack) internal_function;
  74 static reg_errcode_t free_fail_stack_return (struct re_fail_stack_t *fs)
  75      internal_function;
  76
  77 #ifdef RE_ENABLE_I18N
  78 static int sift_states_iter_mb (const re_match_context_t *mctx,
  79                                 re_sift_context_t *sctx,
  80                                 int node_idx, int str_idx, int max_str_idx)
  81      internal_function;
  82 #endif /* RE_ENABLE_I18N */
  83 static reg_errcode_t sift_states_backward (const re_match_context_t *mctx,
  84                                            re_sift_context_t *sctx)
  85      internal_function;
  86 static reg_errcode_t build_sifted_states (const re_match_context_t *mctx,
  87                                           re_sift_context_t *sctx, int str_idx,
  88                                           re_node_set *cur_dest)
  89      internal_function;
  90 static reg_errcode_t update_cur_sifted_state (const re_match_context_t *mctx,
  91                                               re_sift_context_t *sctx,
  92                                               int str_idx,
  93                                               re_node_set *dest_nodes)
  94      internal_function;
  95 static reg_errcode_t add_epsilon_src_nodes (const re_dfa_t *dfa,
  96                                             re_node_set *dest_nodes,
  97                                             const re_node_set *candidates)
  98      internal_function;
  99 static int check_dst_limits (const re_match_context_t *mctx,
 100                              re_node_set *limits,
 101                              int dst_node, int dst_idx, int src_node,
 102                              int src_idx) internal_function;
 103 static int check_dst_limits_calc_pos_1 (const re_match_context_t *mctx,
 104                                         int boundaries, int subexp_idx,
 105                                         int from_node, int bkref_idx)
 106      internal_function;
 107 static int check_dst_limits_calc_pos (const re_match_context_t *mctx,
 108                                       int limit, int subexp_idx,
 109                                       int node, int str_idx,
 110                                       int bkref_idx) internal_function;
 111 static reg_errcode_t check_subexp_limits (const re_dfa_t *dfa,
 112                                           re_node_set *dest_nodes,
 113                                           const re_node_set *candidates,
 114                                           re_node_set *limits,
 115                                           struct re_backref_cache_entry *bkref_ents,
 116                                           int str_idx) internal_function;
 117 static reg_errcode_t sift_states_bkref (const re_match_context_t *mctx,
 118                                         re_sift_context_t *sctx,
 119                                         int str_idx, const re_node_set *candidates)
 120      internal_function;
 121 static reg_errcode_t merge_state_array (const re_dfa_t *dfa,
 122                                         re_dfastate_t **dst,
 123                                         re_dfastate_t **src, int num)
 124      internal_function;
 125 static re_dfastate_t *find_recover_state (reg_errcode_t *err,
 126                                          re_match_context_t *mctx) internal_function;
 127 static re_dfastate_t *transit_state (reg_errcode_t *err,
 128                                      re_match_context_t *mctx,
 129                                      re_dfastate_t *state) internal_function;
 130 static re_dfastate_t *merge_state_with_log (reg_errcode_t *err,
 131                                             re_match_context_t *mctx,
 132                                             re_dfastate_t *next_state)
 133      internal_function;
 134 static reg_errcode_t check_subexp_matching_top (re_match_context_t *mctx,
 135                                                 re_node_set *cur_nodes,
 136                                                 int str_idx) internal_function;
 137 #if 0
 138 static re_dfastate_t *transit_state_sb (reg_errcode_t *err,
 139                                         re_match_context_t *mctx,
 140                                         re_dfastate_t *pstate)
 141      internal_function;
 142 #endif
 143 #ifdef RE_ENABLE_I18N
 144 static reg_errcode_t transit_state_mb (re_match_context_t *mctx,
 145                                        re_dfastate_t *pstate)
 146      internal_function;
 147 #endif /* RE_ENABLE_I18N */
 148 static reg_errcode_t transit_state_bkref (re_match_context_t *mctx,
 149                                           const re_node_set *nodes)
 150      internal_function;
 151 static reg_errcode_t get_subexp (re_match_context_t *mctx,
 152                                  int bkref_node, int bkref_str_idx)
 153      internal_function;
 154 static reg_errcode_t get_subexp_sub (re_match_context_t *mctx,
 155                                      const re_sub_match_top_t *sub_top,
 156                                      re_sub_match_last_t *sub_last,
 157                                      int bkref_node, int bkref_str)
 158      internal_function;
 159 static int find_subexp_node (const re_dfa_t *dfa, const re_node_set *nodes,
 160                              int subexp_idx, int type) internal_function;
 161 static reg_errcode_t check_arrival (re_match_context_t *mctx,
 162                                     state_array_t *path, int top_node,
 163                                     int top_str, int last_node, int last_str,
 164                                     int type) internal_function;
 165 static reg_errcode_t check_arrival_add_next_nodes (re_match_context_t *mctx,
 166                                                    int str_idx,
 167                                                    re_node_set *cur_nodes,
 168                                                    re_node_set *next_nodes)
 169      internal_function;
 170 static reg_errcode_t check_arrival_expand_ecl (const re_dfa_t *dfa,
 171                                                re_node_set *cur_nodes,
 172                                                int ex_subexp, int type)
 173      internal_function;
 174 static reg_errcode_t check_arrival_expand_ecl_sub (const re_dfa_t *dfa,
 175                                                    re_node_set *dst_nodes,
 176                                                    int target, int ex_subexp,
 177                                                    int type) internal_function;
 178 static reg_errcode_t expand_bkref_cache (re_match_context_t *mctx,
 179                                          re_node_set *cur_nodes, int cur_str,
 180                                          int subexp_num, int type)
 181      internal_function;
 182 static int build_trtable (const re_dfa_t *dfa,
 183                           re_dfastate_t *state) internal_function;
 184 #ifdef RE_ENABLE_I18N
 185 static int check_node_accept_bytes (const re_dfa_t *dfa, int node_idx,
 186                                     const re_string_t *input, int idx)
 187      internal_function;
 188 # ifdef _LIBC
 189 static unsigned int find_collation_sequence_value (const unsigned char *mbs,
 190                                                    size_t name_len)
 191      internal_function;
 192 # endif /* _LIBC */
 193 #endif /* RE_ENABLE_I18N */
 194 static int group_nodes_into_DFAstates (const re_dfa_t *dfa,
 195                                        const re_dfastate_t *state,
 196                                        re_node_set *states_node,
 197                                        bitset_t *states_ch) internal_function;
 198 static int check_node_accept (const re_match_context_t *mctx,
 199                               const re_token_t *node, int idx)
 200      internal_function;
 201 static reg_errcode_t extend_buffers (re_match_context_t *mctx)
 202      internal_function;
 203 \f
 204 /* Entry point for POSIX code.  */
 205
 206 /* regexec searches for a given pattern, specified by PREG, in the
 207    string STRING.
 208
 209    If NMATCH is zero or REG_NOSUB was set in the cflags argument to
 210    `regcomp', we ignore PMATCH.  Otherwise, we assume PMATCH has at
 211    least NMATCH elements, and we set them to the offsets of the
 212    corresponding matched substrings.
 213
 214    EFLAGS specifies `execution flags' which affect matching: if
 215    REG_NOTBOL is set, then ^ does not match at the beginning of the
 216    string; if REG_NOTEOL is set, then $ does not match at the end.
 217
 218    We return 0 if we find a match and REG_NOMATCH if not.  */
 219
 220 int
 221 regexec (preg, string, nmatch, pmatch, eflags)
 222     const regex_t *__restrict preg;
 223     const char *__restrict string;
 224     size_t nmatch;
 225     regmatch_t pmatch[];
 226     int eflags;
 227 {
 228   reg_errcode_t err;
 229   int start, length;
 230
 231   if (eflags & ~(REG_NOTBOL | REG_NOTEOL | REG_STARTEND))
 232     return REG_BADPAT;
 233
 234   if (eflags & REG_STARTEND)
 235     {
 236       start = pmatch[0].rm_so;
 237       length = pmatch[0].rm_eo;
 238     }
 239   else
 240     {
 241       start = 0;
 242       length = strlen (string);
 243     }
 244
 245   __libc_lock_lock (dfa->lock);
 246   if (preg->no_sub)
 247     err = re_search_internal (preg, string, length, start, length - start,
 248                               length, 0, NULL, eflags);
 249   else
 250     err = re_search_internal (preg, string, length, start, length - start,
 251                               length, nmatch, pmatch, eflags);
 252   __libc_lock_unlock (dfa->lock);
 253   return err != REG_NOERROR;
 254 }
 255
 256 #ifdef _LIBC
 257 # include <shlib-compat.h>
 258 versioned_symbol (libc, __regexec, regexec, GLIBC_2_3_4);
 259
 260 # if SHLIB_COMPAT (libc, GLIBC_2_0, GLIBC_2_3_4)
 261 __typeof__ (__regexec) __compat_regexec;
 262
 263 int
 264 attribute_compat_text_section
 265 __compat_regexec (const regex_t *__restrict preg,
 266                   const char *__restrict string, size_t nmatch,
 267                   regmatch_t pmatch[], int eflags)
 268 {
 269   return regexec (preg, string, nmatch, pmatch,
 270                   eflags & (REG_NOTBOL | REG_NOTEOL));
 271 }
 272 compat_symbol (libc, __compat_regexec, regexec, GLIBC_2_0);
 273 # endif
 274 #endif
 275
 276 /* Entry points for GNU code.  */
 277
 278 /* re_match, re_search, re_match_2, re_search_2
 279
 280    The former two functions operate on STRING with length LENGTH,
 281    while the later two operate on concatenation of STRING1 and STRING2
 282    with lengths LENGTH1 and LENGTH2, respectively.
 283
 284    re_match() matches the compiled pattern in BUFP against the string,
 285    starting at index START.
 286
 287    re_search() first tries matching at index START, then it tries to match
 288    starting from index START + 1, and so on.  The last start position tried
 289    is START + RANGE.  (Thus RANGE = 0 forces re_search to operate the same
 290    way as re_match().)
 291
 292    The parameter STOP of re_{match,search}_2 specifies that no match exceeding
 293    the first STOP characters of the concatenation of the strings should be
 294    concerned.
 295
 296    If REGS is not NULL, and BUFP->no_sub is not set, the offsets of the match
 297    and all groups is stroed in REGS.  (For the "_2" variants, the offsets are
 298    computed relative to the concatenation, not relative to the individual
 299    strings.)
 300
 301    On success, re_match* functions return the length of the match, re_search*
 302    return the position of the start of the match.  Return value -1 means no
 303    match was found and -2 indicates an internal error.  */
 304
 305 int
 306 re_match (bufp, string, length, start, regs)
 307     struct re_pattern_buffer *bufp;
 308     const char *string;
 309     int length, start;
 310     struct re_registers *regs;
 311 {
 312   return re_search_stub (bufp, string, length, start, 0, length, regs, 1);
 313 }
 314 #ifdef _LIBC
 315 weak_alias (__re_match, re_match)
 316 #endif
 317
 318 int
 319 re_search (bufp, string, length, start, range, regs)
 320     struct re_pattern_buffer *bufp;
 321     const char *string;
 322     int length, start, range;
 323     struct re_registers *regs;
 324 {
 325   return re_search_stub (bufp, string, length, start, range, length, regs, 0);
 326 }
 327 #ifdef _LIBC
 328 weak_alias (__re_search, re_search)
 329 #endif
 330
 331 int
 332 re_match_2 (bufp, string1, length1, string2, length2, start, regs, stop)
 333     struct re_pattern_buffer *bufp;
 334     const char *string1, *string2;
 335     int length1, length2, start, stop;
 336     struct re_registers *regs;
 337 {
 338   return re_search_2_stub (bufp, string1, length1, string2, length2,
 339                            start, 0, regs, stop, 1);
 340 }
 341 #ifdef _LIBC
 342 weak_alias (__re_match_2, re_match_2)
 343 #endif
 344
 345 int
 346 re_search_2 (bufp, string1, length1, string2, length2, start, range, regs, stop)
 347     struct re_pattern_buffer *bufp;
 348     const char *string1, *string2;
 349     int length1, length2, start, range, stop;
 350     struct re_registers *regs;
 351 {
 352   return re_search_2_stub (bufp, string1, length1, string2, length2,
 353                            start, range, regs, stop, 0);
 354 }
 355 #ifdef _LIBC
 356 weak_alias (__re_search_2, re_search_2)
 357 #endif
 358
 359 static int
 360 re_search_2_stub (bufp, string1, length1, string2, length2, start, range, regs,
 361                   stop, ret_len)
 362     struct re_pattern_buffer *bufp;
 363     const char *string1, *string2;
 364     int length1, length2, start, range, stop, ret_len;
 365     struct re_registers *regs;
 366 {
 367   const char *str;
 368   int rval;
 369   int len = length1 + length2;
 370   int free_str = 0;
 371
 372   if (BE (length1 < 0 || length2 < 0 || stop < 0, 0))
 373     return -2;
 374
 375   /* Concatenate the strings.  */
 376   if (length2 > 0)
 377     if (length1 > 0)
 378       {
 379         char *s = re_malloc (char, len);
 380
 381         if (BE (s == NULL, 0))
 382           return -2;
 383         memcpy (s, string1, length1);
 384         memcpy (s + length1, string2, length2);
 385         str = s;
 386         free_str = 1;
 387       }
 388     else
 389       str = string2;
 390   else
 391     str = string1;
 392
 393   rval = re_search_stub (bufp, str, len, start, range, stop, regs, ret_len);
 394   if (free_str)
 395     re_free ((char *) str);
 396   return rval;
 397 }
 398
 399 /* The parameters have the same meaning as those of re_search.
 400    Additional parameters:
 401    If RET_LEN is nonzero the length of the match is returned (re_match style);
 402    otherwise the position of the match is returned.  */
 403
 404 static int
 405 re_search_stub (bufp, string, length, start, range, stop, regs, ret_len)
 406     struct re_pattern_buffer *bufp;
 407     const char *string;
 408     int length, start, range, stop, ret_len;
 409     struct re_registers *regs;
 410 {
 411   reg_errcode_t result;
 412   regmatch_t *pmatch;
 413   int nregs, rval;
 414   int eflags = 0;
 415
 416   /* Check for out-of-range.  */
 417   if (BE (start < 0 || start > length, 0))
 418     return -1;
 419   if (BE (start + range > length, 0))
 420     range = length - start;
 421   else if (BE (start + range < 0, 0))
 422     range = -start;
 423
 424   __libc_lock_lock (dfa->lock);
 425
 426   eflags |= (bufp->not_bol) ? REG_NOTBOL : 0;
 427   eflags |= (bufp->not_eol) ? REG_NOTEOL : 0;
 428
 429   /* Compile fastmap if we haven't yet.  */
 430   if (range > 0 && bufp->fastmap != NULL && !bufp->fastmap_accurate)
 431     re_compile_fastmap (bufp);
 432
 433   if (BE (bufp->no_sub, 0))
 434     regs = NULL;
 435
 436   /* We need at least 1 register.  */
 437   if (regs == NULL)
 438     nregs = 1;
 439   else if (BE (bufp->regs_allocated == REGS_FIXED &&
 440                regs->num_regs < bufp->re_nsub + 1, 0))
 441     {
 442       nregs = regs->num_regs;
 443       if (BE (nregs < 1, 0))
 444         {
 445           /* Nothing can be copied to regs.  */
 446           regs = NULL;
 447           nregs = 1;
 448         }
 449     }
 450   else
 451     nregs = bufp->re_nsub + 1;
 452   pmatch = re_malloc (regmatch_t, nregs);
 453   if (BE (pmatch == NULL, 0))
 454     {
 455       rval = -2;
 456       goto out;
 457     }
 458
 459   result = re_search_internal (bufp, string, length, start, range, stop,
 460                                nregs, pmatch, eflags);
 461
 462   rval = 0;
 463
 464   /* I hope we needn't fill ther regs with -1's when no match was found.  */
 465   if (result != REG_NOERROR)
 466     rval = -1;
 467   else if (regs != NULL)
 468     {
 469       /* If caller wants register contents data back, copy them.  */
 470       bufp->regs_allocated = re_copy_regs (regs, pmatch, nregs,
 471                                            bufp->regs_allocated);
 472       if (BE (bufp->regs_allocated == REGS_UNALLOCATED, 0))
 473         rval = -2;
 474     }
 475
 476   if (BE (rval == 0, 1))
 477     {
 478       if (ret_len)
 479         {
 480           assert (pmatch[0].rm_so == start);
 481           rval = pmatch[0].rm_eo - start;
 482         }
 483       else
 484         rval = pmatch[0].rm_so;
 485     }
 486   re_free (pmatch);
 487  out:
 488   __libc_lock_unlock (dfa->lock);
 489   return rval;
 490 }
 491
 492 static unsigned
 493 re_copy_regs (regs, pmatch, nregs, regs_allocated)
 494     struct re_registers *regs;
 495     regmatch_t *pmatch;
 496     int nregs, regs_allocated;
 497 {
 498   int rval = REGS_REALLOCATE;
 499   int i;
 500   int need_regs = nregs + 1;
 501   /* We need one extra element beyond `num_regs' for the `-1' marker GNU code
 502      uses.  */
 503
 504   /* Have the register data arrays been allocated?  */
 505   if (regs_allocated == REGS_UNALLOCATED)
 506     { /* No.  So allocate them with malloc.  */
 507       regs->start = re_malloc (regoff_t, need_regs);
 508       if (BE (regs->start == NULL, 0))
 509         return REGS_UNALLOCATED;
 510       regs->end = re_malloc (regoff_t, need_regs);
 511       if (BE (regs->end == NULL, 0))
 512         {
 513           re_free (regs->start);
 514           return REGS_UNALLOCATED;
 515         }
 516       regs->num_regs = need_regs;
 517     }
 518   else if (regs_allocated == REGS_REALLOCATE)
 519     { /* Yes.  If we need more elements than were already
 520          allocated, reallocate them.  If we need fewer, just
 521          leave it alone.  */
 522       if (BE (need_regs > regs->num_regs, 0))
 523         {
 524           regoff_t *new_start = re_realloc (regs->start, regoff_t, need_regs);
 525           regoff_t *new_end;
 526           if (BE (new_start == NULL, 0))
 527             return REGS_UNALLOCATED;
 528           new_end = re_realloc (regs->end, regoff_t, need_regs);
 529           if (BE (new_end == NULL, 0))
 530             {
 531               re_free (new_start);
 532               return REGS_UNALLOCATED;
 533             }
 534           regs->start = new_start;
 535           regs->end = new_end;
 536           regs->num_regs = need_regs;
 537         }
 538     }
 539   else
 540     {
 541       assert (regs_allocated == REGS_FIXED);
 542       /* This function may not be called with REGS_FIXED and nregs too big.  */
 543       assert (regs->num_regs >= nregs);
 544       rval = REGS_FIXED;
 545     }
 546
 547   /* Copy the regs.  */
 548   for (i = 0; i < nregs; ++i)
 549     {
 550       regs->start[i] = pmatch[i].rm_so;
 551       regs->end[i] = pmatch[i].rm_eo;
 552     }
 553   for ( ; i < regs->num_regs; ++i)
 554     regs->start[i] = regs->end[i] = -1;
 555
 556   return rval;
 557 }
 558
 559 /* Set REGS to hold NUM_REGS registers, storing them in STARTS and
 560    ENDS.  Subsequent matches using PATTERN_BUFFER and REGS will use
 561    this memory for recording register information.  STARTS and ENDS
 562    must be allocated using the malloc library routine, and must each
 563    be at least NUM_REGS * sizeof (regoff_t) bytes long.
 564
 565    If NUM_REGS == 0, then subsequent matches should allocate their own
 566    register data.
 567
 568    Unless this function is called, the first search or match using
 569    PATTERN_BUFFER will allocate its own register data, without
 570    freeing the old data.  */
 571
 572 void
 573 re_set_registers (bufp, regs, num_regs, starts, ends)
 574     struct re_pattern_buffer *bufp;
 575     struct re_registers *regs;
 576     unsigned num_regs;
 577     regoff_t *starts, *ends;
 578 {
 579   if (num_regs)
 580     {
 581       bufp->regs_allocated = REGS_REALLOCATE;
 582       regs->num_regs = num_regs;
 583       regs->start = starts;
 584       regs->end = ends;
 585     }
 586   else
 587     {
 588       bufp->regs_allocated = REGS_UNALLOCATED;
 589       regs->num_regs = 0;
 590       regs->start = regs->end = (regoff_t *) 0;
 591     }
 592 }
 593 #ifdef _LIBC
 594 weak_alias (__re_set_registers, re_set_registers)
 595 #endif
 596 \f
 597 /* Entry points compatible with 4.2 BSD regex library.  We don't define
 598    them unless specifically requested.  */
 599
 600 #if defined _REGEX_RE_COMP || defined _LIBC
 601 int
 602 # ifdef _LIBC
 603 weak_function
 604 # endif
 605 re_exec (s)
 606      const char *s;
 607 {
 608   return 0 == regexec (&re_comp_buf, s, 0, NULL, 0);
 609 }
 610 #endif /* _REGEX_RE_COMP */
 611 \f
 612 /* Internal entry point.  */
 613
 614 /* Searches for a compiled pattern PREG in the string STRING, whose
 615    length is LENGTH.  NMATCH, PMATCH, and EFLAGS have the same
 616    mingings with regexec.  START, and RANGE have the same meanings
 617    with re_search.
 618    Return REG_NOERROR if we find a match, and REG_NOMATCH if not,
 619    otherwise return the error code.
 620    Note: We assume front end functions already check ranges.
 621    (START + RANGE >= 0 && START + RANGE <= LENGTH)  */
 622
 623 static reg_errcode_t
 624 re_search_internal (preg, string, length, start, range, stop, nmatch, pmatch,
 625                     eflags)
 626     const regex_t *preg;
 627     const char *string;
 628     int length, start, range, stop, eflags;
 629     size_t nmatch;
 630     regmatch_t pmatch[];
 631 {
 632   reg_errcode_t err;
 633   const re_dfa_t *dfa = (const re_dfa_t *) preg->buffer;
 634   int left_lim, right_lim, incr;
 635   int fl_longest_match, match_first, match_kind, match_last = -1;
 636   int extra_nmatch;
 637   int sb, ch;
 638 #if defined _LIBC || (defined __STDC_VERSION__ && __STDC_VERSION__ >= 199901L)
 639   re_match_context_t mctx = { .dfa = dfa };
 640 #else
 641   re_match_context_t mctx;
 642 #endif
 643   char *fastmap = (preg->fastmap != NULL && preg->fastmap_accurate
 644                    && range && !preg->can_be_null) ? preg->fastmap : NULL;
 645   RE_TRANSLATE_TYPE t = preg->translate;
 646
 647 #if !(defined _LIBC || (defined __STDC_VERSION__ && __STDC_VERSION__ >= 199901L))
 648   memset (&mctx, '\0', sizeof (re_match_context_t));
 649   mctx.dfa = dfa;
 650 #endif
 651
 652   extra_nmatch = (nmatch > preg->re_nsub) ? nmatch - (preg->re_nsub + 1) : 0;
 653   nmatch -= extra_nmatch;
 654
 655   /* Check if the DFA haven't been compiled.  */
 656   if (BE (preg->used == 0 || dfa->init_state == NULL
 657           || dfa->init_state_word == NULL || dfa->init_state_nl == NULL
 658           || dfa->init_state_begbuf == NULL, 0))
 659     return REG_NOMATCH;
 660
 661 #ifdef DEBUG
 662   /* We assume front-end functions already check them.  */
 663   assert (start + range >= 0 && start + range <= length);
 664 #endif
 665
 666   /* If initial states with non-begbuf contexts have no elements,
 667      the regex must be anchored.  If preg->newline_anchor is set,
 668      we'll never use init_state_nl, so do not check it.  */
 669   if (dfa->init_state->nodes.nelem == 0
 670       && dfa->init_state_word->nodes.nelem == 0
 671       && (dfa->init_state_nl->nodes.nelem == 0
 672           || !preg->newline_anchor))
 673     {
 674       if (start != 0 && start + range != 0)
 675         return REG_NOMATCH;
 676       start = range = 0;
 677     }
 678
 679   /* We must check the longest matching, if nmatch > 0.  */
 680   fl_longest_match = (nmatch != 0 || dfa->nbackref);
 681
 682   err = re_string_allocate (&mctx.input, string, length, dfa->nodes_len + 1,
 683                             preg->translate, preg->syntax & RE_ICASE, dfa);
 684   if (BE (err != REG_NOERROR, 0))
 685     goto free_return;
 686   mctx.input.stop = stop;
 687   mctx.input.raw_stop = stop;
 688   mctx.input.newline_anchor = preg->newline_anchor;
 689
 690   err = match_ctx_init (&mctx, eflags, dfa->nbackref * 2);
 691   if (BE (err != REG_NOERROR, 0))
 692     goto free_return;
 693
 694   /* We will log all the DFA states through which the dfa pass,
 695      if nmatch > 1, or this dfa has "multibyte node", which is a
 696      back-reference or a node which can accept multibyte character or
 697      multi character collating element.  */
 698   if (nmatch > 1 || dfa->has_mb_node)
 699     {
 700       /* Avoid overflow.  */
 701       if (BE (SIZE_MAX / sizeof (re_dfastate_t *) <= mctx.input.bufs_len, 0))
 702         {
 703           err = REG_ESPACE;
 704           goto free_return;
 705         }
 706
 707       mctx.state_log = re_malloc (re_dfastate_t *, mctx.input.bufs_len + 1);
 708       if (BE (mctx.state_log == NULL, 0))
 709         {
 710           err = REG_ESPACE;
 711           goto free_return;
 712         }
 713     }
 714   else
 715     mctx.state_log = NULL;
 716
 717   match_first = start;
 718   mctx.input.tip_context = (eflags & REG_NOTBOL) ? CONTEXT_BEGBUF
 719                            : CONTEXT_NEWLINE | CONTEXT_BEGBUF;
 720
 721   /* Check incrementally whether of not the input string match.  */
 722   incr = (range < 0) ? -1 : 1;
 723   left_lim = (range < 0) ? start + range : start;
 724   right_lim = (range < 0) ? start : start + range;
 725   sb = dfa->mb_cur_max == 1;
 726   match_kind =
 727     (fastmap
 728      ? ((sb || !(preg->syntax & RE_ICASE || t) ? 4 : 0)
 729         | (range >= 0 ? 2 : 0)
 730         | (t != NULL ? 1 : 0))
 731      : 8);
 732
 733   for (;; match_first += incr)
 734     {
 735       err = REG_NOMATCH;
 736       if (match_first < left_lim || right_lim < match_first)
 737         goto free_return;
 738
 739       /* Advance as rapidly as possible through the string, until we
 740          find a plausible place to start matching.  This may be done
 741          with varying efficiency, so there are various possibilities:
 742          only the most common of them are specialized, in order to
 743          save on code size.  We use a switch statement for speed.  */
 744       switch (match_kind)
 745         {
 746         case 8:
 747           /* No fastmap.  */
 748           break;
 749
 750         case 7:
 751           /* Fastmap with single-byte translation, match forward.  */
 752           while (BE (match_first < right_lim, 1)
 753                  && !fastmap[t[(unsigned char) string[match_first]]])
 754             ++match_first;
 755           goto forward_match_found_start_or_reached_end;
 756
 757         case 6:
 758           /* Fastmap without translation, match forward.  */
 759           while (BE (match_first < right_lim, 1)
 760                  && !fastmap[(unsigned char) string[match_first]])
 761             ++match_first;
 762
 763         forward_match_found_start_or_reached_end:
 764           if (BE (match_first == right_lim, 0))
 765             {
 766               ch = match_first >= length
 767                        ? 0 : (unsigned char) string[match_first];
 768               if (!fastmap[t ? t[ch] : ch])
 769                 goto free_return;
 770             }
 771           break;
 772
 773         case 4:
 774         case 5:
 775           /* Fastmap without multi-byte translation, match backwards.  */
 776           while (match_first >= left_lim)
 777             {
 778               ch = match_first >= length
 779                        ? 0 : (unsigned char) string[match_first];
 780               if (fastmap[t ? t[ch] : ch])
 781                 break;
 782               --match_first;
 783             }
 784           if (match_first < left_lim)
 785             goto free_return;
 786           break;
 787
 788         default:
 789           /* In this case, we can't determine easily the current byte,
 790              since it might be a component byte of a multibyte
 791              character.  Then we use the constructed buffer instead.  */
 792           for (;;)
 793             {
 794               /* If MATCH_FIRST is out of the valid range, reconstruct the
 795                  buffers.  */
 796               unsigned int offset = match_first - mctx.input.raw_mbs_idx;
 797               if (BE (offset >= (unsigned int) mctx.input.valid_raw_len, 0))
 798                 {
 799                   err = re_string_reconstruct (&mctx.input, match_first,
 800                                                eflags);
 801                   if (BE (err != REG_NOERROR, 0))
 802                     goto free_return;
 803
 804                   offset = match_first - mctx.input.raw_mbs_idx;
 805                 }
 806               /* If MATCH_FIRST is out of the buffer, leave it as '\0'.
 807                  Note that MATCH_FIRST must not be smaller than 0.  */
 808               ch = (match_first >= length
 809                     ? 0 : re_string_byte_at (&mctx.input, offset));
 810               if (fastmap[ch])
 811                 break;
 812               match_first += incr;
 813               if (match_first < left_lim || match_first > right_lim)
 814                 {
 815                   err = REG_NOMATCH;
 816                   goto free_return;
 817                 }
 818             }
 819           break;
 820         }
 821
 822       /* Reconstruct the buffers so that the matcher can assume that
 823          the matching starts from the beginning of the buffer.  */
 824       err = re_string_reconstruct (&mctx.input, match_first, eflags);
 825       if (BE (err != REG_NOERROR, 0))
 826         goto free_return;
 827
 828 #ifdef RE_ENABLE_I18N
 829      /* Don't consider this char as a possible match start if it part,
 830         yet isn't the head, of a multibyte character.  */
 831       if (!sb && !re_string_first_byte (&mctx.input, 0))
 832         continue;
 833 #endif
 834
 835       /* It seems to be appropriate one, then use the matcher.  */
 836       /* We assume that the matching starts from 0.  */
 837       mctx.state_log_top = mctx.nbkref_ents = mctx.max_mb_elem_len = 0;
 838       match_last = check_matching (&mctx, fl_longest_match,
 839                                    range >= 0 ? &match_first : NULL);
 840       if (match_last != -1)
 841         {
 842           if (BE (match_last == -2, 0))
 843             {
 844               err = REG_ESPACE;
 845               goto free_return;
 846             }
 847           else
 848             {
 849               mctx.match_last = match_last;
 850               if ((!preg->no_sub && nmatch > 1) || dfa->nbackref)
 851                 {
 852                   re_dfastate_t *pstate = mctx.state_log[match_last];
 853                   mctx.last_node = check_halt_state_context (&mctx, pstate,
 854                                                              match_last);
 855                 }
 856               if ((!preg->no_sub && nmatch > 1 && dfa->has_plural_match)
 857                   || dfa->nbackref)
 858                 {
 859                   err = prune_impossible_nodes (&mctx);
 860                   if (err == REG_NOERROR)
 861                     break;
 862                   if (BE (err != REG_NOMATCH, 0))
 863                     goto free_return;
 864                   match_last = -1;
 865                 }
 866               else
 867                 break; /* We found a match.  */
 868             }
 869         }
 870
 871       match_ctx_clean (&mctx);
 872     }
 873
 874 #ifdef DEBUG
 875   assert (match_last != -1);
 876   assert (err == REG_NOERROR);
 877 #endif
 878
 879   /* Set pmatch[] if we need.  */
 880   if (nmatch > 0)
 881     {
 882       int reg_idx;
 883
 884       /* Initialize registers.  */
 885       for (reg_idx = 1; reg_idx < nmatch; ++reg_idx)
 886         pmatch[reg_idx].rm_so = pmatch[reg_idx].rm_eo = -1;
 887
 888       /* Set the points where matching start/end.  */
 889       pmatch[0].rm_so = 0;
 890       pmatch[0].rm_eo = mctx.match_last;
 891
 892       if (!preg->no_sub && nmatch > 1)
 893         {
 894           err = set_regs (preg, &mctx, nmatch, pmatch,
 895                           dfa->has_plural_match && dfa->nbackref > 0);
 896           if (BE (err != REG_NOERROR, 0))
 897             goto free_return;
 898         }
 899
 900       /* At last, add the offset to the each registers, since we slided
 901          the buffers so that we could assume that the matching starts
 902          from 0.  */
 903       for (reg_idx = 0; reg_idx < nmatch; ++reg_idx)
 904         if (pmatch[reg_idx].rm_so != -1)
 905           {
 906 #ifdef RE_ENABLE_I18N
 907             if (BE (mctx.input.offsets_needed != 0, 0))
 908               {
 909                 pmatch[reg_idx].rm_so =
 910                   (pmatch[reg_idx].rm_so == mctx.input.valid_len
 911                    ? mctx.input.valid_raw_len
 912                    : mctx.input.offsets[pmatch[reg_idx].rm_so]);
 913                 pmatch[reg_idx].rm_eo =
 914                   (pmatch[reg_idx].rm_eo == mctx.input.valid_len
 915                    ? mctx.input.valid_raw_len
 916                    : mctx.input.offsets[pmatch[reg_idx].rm_eo]);
 917               }
 918 #else
 919             assert (mctx.input.offsets_needed == 0);
 920 #endif
 921             pmatch[reg_idx].rm_so += match_first;
 922             pmatch[reg_idx].rm_eo += match_first;
 923           }
 924       for (reg_idx = 0; reg_idx < extra_nmatch; ++reg_idx)
 925         {
 926           pmatch[nmatch + reg_idx].rm_so = -1;
 927           pmatch[nmatch + reg_idx].rm_eo = -1;
 928         }
 929
 930       if (dfa->subexp_map)
 931         for (reg_idx = 0; reg_idx + 1 < nmatch; reg_idx++)
 932           if (dfa->subexp_map[reg_idx] != reg_idx)
 933             {
 934               pmatch[reg_idx + 1].rm_so
 935                 = pmatch[dfa->subexp_map[reg_idx] + 1].rm_so;
 936               pmatch[reg_idx + 1].rm_eo
 937                 = pmatch[dfa->subexp_map[reg_idx] + 1].rm_eo;
 938             }
 939     }
 940
 941  free_return:
 942   re_free (mctx.state_log);
 943   if (dfa->nbackref)
 944     match_ctx_free (&mctx);
 945   re_string_destruct (&mctx.input);
 946   return err;
 947 }
 948
 949 static reg_errcode_t
 950 prune_impossible_nodes (mctx)
 951      re_match_context_t *mctx;
 952 {
 953   const re_dfa_t *const dfa = mctx->dfa;
 954   int halt_node, match_last;
 955   reg_errcode_t ret;
 956   re_dfastate_t **sifted_states;
 957   re_dfastate_t **lim_states = NULL;
 958   re_sift_context_t sctx;
 959 #ifdef DEBUG
 960   assert (mctx->state_log != NULL);
 961 #endif
 962   match_last = mctx->match_last;
 963   halt_node = mctx->last_node;
 964
 965   /* Avoid overflow.  */
 966   if (BE (SIZE_MAX / sizeof (re_dfastate_t *) <= match_last, 0))
 967     return REG_ESPACE;
 968
 969   sifted_states = re_malloc (re_dfastate_t *, match_last + 1);
 970   if (BE (sifted_states == NULL, 0))
 971     {
 972       ret = REG_ESPACE;
 973       goto free_return;
 974     }
 975   if (dfa->nbackref)
 976     {
 977       lim_states = re_malloc (re_dfastate_t *, match_last + 1);
 978       if (BE (lim_states == NULL, 0))
 979         {
 980           ret = REG_ESPACE;
 981           goto free_return;
 982         }
 983       while (1)
 984         {
 985           memset (lim_states, '\0',
 986                   sizeof (re_dfastate_t *) * (match_last + 1));
 987           sift_ctx_init (&sctx, sifted_states, lim_states, halt_node,
 988                          match_last);
 989           ret = sift_states_backward (mctx, &sctx);
 990           re_node_set_free (&sctx.limits);
 991           if (BE (ret != REG_NOERROR, 0))
 992               goto free_return;
 993           if (sifted_states[0] != NULL || lim_states[0] != NULL)
 994             break;
 995           do
 996             {
 997               --match_last;
 998               if (match_last < 0)
 999                 {
1000                   ret = REG_NOMATCH;
1001                   goto free_return;
1002                 }
1003             } while (mctx->state_log[match_last] == NULL
1004                      || !mctx->state_log[match_last]->halt);
1005           halt_node = check_halt_state_context (mctx,
1006                                                 mctx->state_log[match_last],
1007                                                 match_last);
1008         }
1009       ret = merge_state_array (dfa, sifted_states, lim_states,
1010                                match_last + 1);
1011       re_free (lim_states);
1012       lim_states = NULL;
1013       if (BE (ret != REG_NOERROR, 0))
1014         goto free_return;
1015     }
1016   else
1017     {
1018       sift_ctx_init (&sctx, sifted_states, lim_states, halt_node, match_last);
1019       ret = sift_states_backward (mctx, &sctx);
1020       re_node_set_free (&sctx.limits);
1021       if (BE (ret != REG_NOERROR, 0))
1022         goto free_return;
1023       if (sifted_states[0] == NULL)
1024         {
1025           ret = REG_NOMATCH;
1026           goto free_return;
1027         }
1028     }
1029   re_free (mctx->state_log);
1030   mctx->state_log = sifted_states;
1031   sifted_states = NULL;
1032   mctx->last_node = halt_node;
1033   mctx->match_last = match_last;
1034   ret = REG_NOERROR;
1035  free_return:
1036   re_free (sifted_states);
1037   re_free (lim_states);
1038   return ret;
1039 }
1040
1041 /* Acquire an initial state and return it.
1042    We must select appropriate initial state depending on the context,
1043    since initial states may have constraints like "\<", "^", etc..  */
1044
1045 static inline re_dfastate_t *
1046 __attribute ((always_inline)) internal_function
1047 acquire_init_state_context (reg_errcode_t *err, const re_match_context_t *mctx,
1048                             int idx)
1049 {
1050   const re_dfa_t *const dfa = mctx->dfa;
1051   if (dfa->init_state->has_constraint)
1052     {
1053       unsigned int context;
1054       context = re_string_context_at (&mctx->input, idx - 1, mctx->eflags);
1055       if (IS_WORD_CONTEXT (context))
1056         return dfa->init_state_word;
1057       else if (IS_ORDINARY_CONTEXT (context))
1058         return dfa->init_state;
1059       else if (IS_BEGBUF_CONTEXT (context) && IS_NEWLINE_CONTEXT (context))
1060         return dfa->init_state_begbuf;
1061       else if (IS_NEWLINE_CONTEXT (context))
1062         return dfa->init_state_nl;
1063       else if (IS_BEGBUF_CONTEXT (context))
1064         {
1065           /* It is relatively rare case, then calculate on demand.  */
1066           return re_acquire_state_context (err, dfa,
1067                                            dfa->init_state->entrance_nodes,
1068                                            context);
1069         }
1070       else
1071         /* Must not happen?  */
1072         return dfa->init_state;
1073     }
1074   else
1075     return dfa->init_state;
1076 }
1077
1078 /* Check whether the regular expression match input string INPUT or not,
1079    and return the index where the matching end, return -1 if not match,
1080    or return -2 in case of an error.
1081    FL_LONGEST_MATCH means we want the POSIX longest matching.
1082    If P_MATCH_FIRST is not NULL, and the match fails, it is set to the
1083    next place where we may want to try matching.
1084    Note that the matcher assume that the maching starts from the current
1085    index of the buffer.  */
1086
1087 static int
1088 internal_function
1089 check_matching (re_match_context_t *mctx, int fl_longest_match,
1090                 int *p_match_first)
1091 {
1092   const re_dfa_t *const dfa = mctx->dfa;
1093   reg_errcode_t err;
1094   int match = 0;
1095   int match_last = -1;
1096   int cur_str_idx = re_string_cur_idx (&mctx->input);
1097   re_dfastate_t *cur_state;
1098   int at_init_state = p_match_first != NULL;
1099   int next_start_idx = cur_str_idx;
1100
1101   err = REG_NOERROR;
1102   cur_state = acquire_init_state_context (&err, mctx, cur_str_idx);
1103   /* An initial state must not be NULL (invalid).  */
1104   if (BE (cur_state == NULL, 0))
1105     {
1106       assert (err == REG_ESPACE);
1107       return -2;
1108     }
1109
1110   if (mctx->state_log != NULL)
1111     {
1112       mctx->state_log[cur_str_idx] = cur_state;
1113
1114       /* Check OP_OPEN_SUBEXP in the initial state in case that we use them
1115          later.  E.g. Processing back references.  */
1116       if (BE (dfa->nbackref, 0))
1117         {
1118           at_init_state = 0;
1119           err = check_subexp_matching_top (mctx, &cur_state->nodes, 0);
1120           if (BE (err != REG_NOERROR, 0))
1121             return err;
1122
1123           if (cur_state->has_backref)
1124             {
1125               err = transit_state_bkref (mctx, &cur_state->nodes);
1126               if (BE (err != REG_NOERROR, 0))
1127                 return err;
1128             }
1129         }
1130     }
1131
1132   /* If the RE accepts NULL string.  */
1133   if (BE (cur_state->halt, 0))
1134     {
1135       if (!cur_state->has_constraint
1136           || check_halt_state_context (mctx, cur_state, cur_str_idx))
1137         {
1138           if (!fl_longest_match)
1139             return cur_str_idx;
1140           else
1141             {
1142               match_last = cur_str_idx;
1143               match = 1;
1144             }
1145         }
1146     }
1147
1148   while (!re_string_eoi (&mctx->input))
1149     {
1150       re_dfastate_t *old_state = cur_state;
1151       int next_char_idx = re_string_cur_idx (&mctx->input) + 1;
1152
1153       if (BE (next_char_idx >= mctx->input.bufs_len, 0)
1154           || (BE (next_char_idx >= mctx->input.valid_len, 0)
1155               && mctx->input.valid_len < mctx->input.len))
1156         {
1157           err = extend_buffers (mctx);
1158           if (BE (err != REG_NOERROR, 0))
1159             {
1160               assert (err == REG_ESPACE);
1161               return -2;
1162             }
1163         }
1164
1165       cur_state = transit_state (&err, mctx, cur_state);
1166       if (mctx->state_log != NULL)
1167         cur_state = merge_state_with_log (&err, mctx, cur_state);
1168
1169       if (cur_state == NULL)
1170         {
1171           /* Reached the invalid state or an error.  Try to recover a valid
1172              state using the state log, if available and if we have not
1173              already found a valid (even if not the longest) match.  */
1174           if (BE (err != REG_NOERROR, 0))
1175             return -2;
1176
1177           if (mctx->state_log == NULL
1178               || (match && !fl_longest_match)
1179               || (cur_state = find_recover_state (&err, mctx)) == NULL)
1180             break;
1181         }
1182
1183       if (BE (at_init_state, 0))
1184         {
1185           if (old_state == cur_state)
1186             next_start_idx = next_char_idx;
1187           else
1188             at_init_state = 0;
1189         }
1190
1191       if (cur_state->halt)
1192         {
1193           /* Reached a halt state.
1194              Check the halt state can satisfy the current context.  */
1195           if (!cur_state->has_constraint
1196               || check_halt_state_context (mctx, cur_state,
1197                                            re_string_cur_idx (&mctx->input)))
1198             {
1199               /* We found an appropriate halt state.  */
1200               match_last = re_string_cur_idx (&mctx->input);
1201               match = 1;
1202
1203               /* We found a match, do not modify match_first below.  */
1204               p_match_first = NULL;
1205               if (!fl_longest_match)
1206                 break;
1207             }
1208         }
1209     }
1210
1211   if (p_match_first)
1212     *p_match_first += next_start_idx;
1213
1214   return match_last;
1215 }
1216
1217 /* Check NODE match the current context.  */
1218
1219 static int
1220 internal_function
1221 check_halt_node_context (const re_dfa_t *dfa, int node, unsigned int context)
1222 {
1223   re_token_type_t type = dfa->nodes[node].type;
1224   unsigned int constraint = dfa->nodes[node].constraint;
1225   if (type != END_OF_RE)
1226     return 0;
1227   if (!constraint)
1228     return 1;
1229   if (NOT_SATISFY_NEXT_CONSTRAINT (constraint, context))
1230     return 0;
1231   return 1;
1232 }
1233
1234 /* Check the halt state STATE match the current context.
1235    Return 0 if not match, if the node, STATE has, is a halt node and
1236    match the context, return the node.  */
1237
1238 static int
1239 internal_function
1240 check_halt_state_context (const re_match_context_t *mctx,
1241                           const re_dfastate_t *state, int idx)
1242 {
1243   int i;
1244   unsigned int context;
1245 #ifdef DEBUG
1246   assert (state->halt);
1247 #endif
1248   context = re_string_context_at (&mctx->input, idx, mctx->eflags);
1249   for (i = 0; i < state->nodes.nelem; ++i)
1250     if (check_halt_node_context (mctx->dfa, state->nodes.elems[i], context))
1251       return state->nodes.elems[i];
1252   return 0;
1253 }
1254
1255 /* Compute the next node to which "NFA" transit from NODE("NFA" is a NFA
1256    corresponding to the DFA).
1257    Return the destination node, and update EPS_VIA_NODES, return -1 in case
1258    of errors.  */
1259
1260 static int
1261 internal_function
1262 proceed_next_node (const re_match_context_t *mctx, int nregs, regmatch_t *regs,
1263                    int *pidx, int node, re_node_set *eps_via_nodes,
1264                    struct re_fail_stack_t *fs)
1265 {
1266   const re_dfa_t *const dfa = mctx->dfa;
1267   int i, err;
1268   if (IS_EPSILON_NODE (dfa->nodes[node].type))
1269     {
1270       re_node_set *cur_nodes = &mctx->state_log[*pidx]->nodes;
1271       re_node_set *edests = &dfa->edests[node];
1272       int dest_node;
1273       err = re_node_set_insert (eps_via_nodes, node);
1274       if (BE (err < 0, 0))
1275         return -2;
1276       /* Pick up a valid destination, or return -1 if none is found.  */
1277       for (dest_node = -1, i = 0; i < edests->nelem; ++i)
1278         {
1279           int candidate = edests->elems[i];
1280           if (!re_node_set_contains (cur_nodes, candidate))
1281             continue;
1282           if (dest_node == -1)
1283             dest_node = candidate;
1284
1285           else
1286             {
1287               /* In order to avoid infinite loop like "(a*)*", return the second
1288                  epsilon-transition if the first was already considered.  */
1289               if (re_node_set_contains (eps_via_nodes, dest_node))
1290                 return candidate;
1291
1292               /* Otherwise, push the second epsilon-transition on the fail stack.  */
1293               else if (fs != NULL
1294                        && push_fail_stack (fs, *pidx, candidate, nregs, regs,
1295                                            eps_via_nodes))
1296                 return -2;
1297
1298               /* We know we are going to exit.  */
1299               break;
1300             }
1301         }
1302       return dest_node;
1303     }
1304   else
1305     {
1306       int naccepted = 0;
1307       re_token_type_t type = dfa->nodes[node].type;
1308
1309 #ifdef RE_ENABLE_I18N
1310       if (dfa->nodes[node].accept_mb)
1311         naccepted = check_node_accept_bytes (dfa, node, &mctx->input, *pidx);
1312       else
1313 #endif /* RE_ENABLE_I18N */
1314       if (type == OP_BACK_REF)
1315         {
1316           int subexp_idx = dfa->nodes[node].opr.idx + 1;
1317           naccepted = regs[subexp_idx].rm_eo - regs[subexp_idx].rm_so;
1318           if (fs != NULL)
1319             {
1320               if (regs[subexp_idx].rm_so == -1 || regs[subexp_idx].rm_eo == -1)
1321                 return -1;
1322               else if (naccepted)
1323                 {
1324                   char *buf = (char *) re_string_get_buffer (&mctx->input);
1325                   if (memcmp (buf + regs[subexp_idx].rm_so, buf + *pidx,
1326                               naccepted) != 0)
1327                     return -1;
1328                 }
1329             }
1330
1331           if (naccepted == 0)
1332             {
1333               int dest_node;
1334               err = re_node_set_insert (eps_via_nodes, node);
1335               if (BE (err < 0, 0))
1336                 return -2;
1337               dest_node = dfa->edests[node].elems[0];
1338               if (re_node_set_contains (&mctx->state_log[*pidx]->nodes,
1339                                         dest_node))
1340                 return dest_node;
1341             }
1342         }
1343
1344       if (naccepted != 0
1345           || check_node_accept (mctx, dfa->nodes + node, *pidx))
1346         {
1347           int dest_node = dfa->nexts[node];
1348           *pidx = (naccepted == 0) ? *pidx + 1 : *pidx + naccepted;
1349           if (fs && (*pidx > mctx->match_last || mctx->state_log[*pidx] == NULL
1350                      || !re_node_set_contains (&mctx->state_log[*pidx]->nodes,
1351                                                dest_node)))
1352             return -1;
1353           re_node_set_empty (eps_via_nodes);
1354           return dest_node;
1355         }
1356     }
1357   return -1;
1358 }
1359
1360 static reg_errcode_t
1361 internal_function
1362 push_fail_stack (struct re_fail_stack_t *fs, int str_idx, int dest_node,
1363                  int nregs, regmatch_t *regs, re_node_set *eps_via_nodes)
1364 {
1365   reg_errcode_t err;
1366   int num = fs->num++;
1367   if (fs->num == fs->alloc)
1368     {
1369       struct re_fail_stack_ent_t *new_array;
1370       new_array = realloc (fs->stack, (sizeof (struct re_fail_stack_ent_t)
1371                                        * fs->alloc * 2));
1372       if (new_array == NULL)
1373         return REG_ESPACE;
1374       fs->alloc *= 2;
1375       fs->stack = new_array;
1376     }
1377   fs->stack[num].idx = str_idx;
1378   fs->stack[num].node = dest_node;
1379   fs->stack[num].regs = re_malloc (regmatch_t, nregs);
1380   if (fs->stack[num].regs == NULL)
1381     return REG_ESPACE;
1382   memcpy (fs->stack[num].regs, regs, sizeof (regmatch_t) * nregs);
1383   err = re_node_set_init_copy (&fs->stack[num].eps_via_nodes, eps_via_nodes);
1384   return err;
1385 }
1386
1387 static int
1388 internal_function
1389 pop_fail_stack (struct re_fail_stack_t *fs, int *pidx, int nregs,
1390                 regmatch_t *regs, re_node_set *eps_via_nodes)
1391 {
1392   int num = --fs->num;
1393   assert (num >= 0);
1394   *pidx = fs->stack[num].idx;
1395   memcpy (regs, fs->stack[num].regs, sizeof (regmatch_t) * nregs);
1396   re_node_set_free (eps_via_nodes);
1397   re_free (fs->stack[num].regs);
1398   *eps_via_nodes = fs->stack[num].eps_via_nodes;
1399   return fs->stack[num].node;
1400 }
1401
1402 /* Set the positions where the subexpressions are starts/ends to registers
1403    PMATCH.
1404    Note: We assume that pmatch[0] is already set, and
1405    pmatch[i].rm_so == pmatch[i].rm_eo == -1 for 0 < i < nmatch.  */
1406
1407 static reg_errcode_t
1408 internal_function
1409 set_regs (const regex_t *preg, const re_match_context_t *mctx, size_t nmatch,
1410           regmatch_t *pmatch, int fl_backtrack)
1411 {
1412   const re_dfa_t *dfa = (const re_dfa_t *) preg->buffer;
1413   int idx, cur_node;
1414   re_node_set eps_via_nodes;
1415   struct re_fail_stack_t *fs;
1416   struct re_fail_stack_t fs_body = { 0, 2, NULL };
1417   regmatch_t *prev_idx_match;
1418   int prev_idx_match_malloced = 0;
1419
1420 #ifdef DEBUG
1421   assert (nmatch > 1);
1422   assert (mctx->state_log != NULL);
1423 #endif
1424   if (fl_backtrack)
1425     {
1426       fs = &fs_body;
1427       fs->stack = re_malloc (struct re_fail_stack_ent_t, fs->alloc);
1428       if (fs->stack == NULL)
1429         return REG_ESPACE;
1430     }
1431   else
1432     fs = NULL;
1433
1434   cur_node = dfa->init_node;
1435   re_node_set_init_empty (&eps_via_nodes);
1436
1437 #ifdef HAVE_ALLOCA
1438   if (__libc_use_alloca (nmatch * sizeof (regmatch_t)))
1439     prev_idx_match = (regmatch_t *) alloca (nmatch * sizeof (regmatch_t));
1440   else
1441 #endif
1442     {
1443       prev_idx_match = re_malloc (regmatch_t, nmatch);
1444       if (prev_idx_match == NULL)
1445         {
1446           free_fail_stack_return (fs);
1447           return REG_ESPACE;
1448         }
1449       prev_idx_match_malloced = 1;
1450     }
1451   memcpy (prev_idx_match, pmatch, sizeof (regmatch_t) * nmatch);
1452
1453   for (idx = pmatch[0].rm_so; idx <= pmatch[0].rm_eo ;)
1454     {
1455       update_regs (dfa, pmatch, prev_idx_match, cur_node, idx, nmatch);
1456
1457       if (idx == pmatch[0].rm_eo && cur_node == mctx->last_node)
1458         {
1459           int reg_idx;
1460           if (fs)
1461             {
1462               for (reg_idx = 0; reg_idx < nmatch; ++reg_idx)
1463                 if (pmatch[reg_idx].rm_so > -1 && pmatch[reg_idx].rm_eo == -1)
1464                   break;
1465               if (reg_idx == nmatch)
1466                 {
1467                   re_node_set_free (&eps_via_nodes);
1468                   if (prev_idx_match_malloced)
1469                     re_free (prev_idx_match);
1470                   return free_fail_stack_return (fs);
1471                 }
1472               cur_node = pop_fail_stack (fs, &idx, nmatch, pmatch,
1473                                          &eps_via_nodes);
1474             }
1475           else
1476             {
1477               re_node_set_free (&eps_via_nodes);
1478               if (prev_idx_match_malloced)
1479                 re_free (prev_idx_match);
1480               return REG_NOERROR;
1481             }
1482         }
1483
1484       /* Proceed to next node.  */
1485       cur_node = proceed_next_node (mctx, nmatch, pmatch, &idx, cur_node,
1486                                     &eps_via_nodes, fs);
1487
1488       if (BE (cur_node < 0, 0))
1489         {
1490           if (BE (cur_node == -2, 0))
1491             {
1492               re_node_set_free (&eps_via_nodes);
1493               if (prev_idx_match_malloced)
1494                 re_free (prev_idx_match);
1495               free_fail_stack_return (fs);
1496               return REG_ESPACE;
1497             }
1498           if (fs)
1499             cur_node = pop_fail_stack (fs, &idx, nmatch, pmatch,
1500                                        &eps_via_nodes);
1501           else
1502             {
1503               re_node_set_free (&eps_via_nodes);
1504               if (prev_idx_match_malloced)
1505                 re_free (prev_idx_match);
1506               return REG_NOMATCH;
1507             }
1508         }
1509     }
1510   re_node_set_free (&eps_via_nodes);
1511   if (prev_idx_match_malloced)
1512     re_free (prev_idx_match);
1513   return free_fail_stack_return (fs);
1514 }
1515
1516 static reg_errcode_t
1517 internal_function
1518 free_fail_stack_return (struct re_fail_stack_t *fs)
1519 {
1520   if (fs)
1521     {
1522       int fs_idx;
1523       for (fs_idx = 0; fs_idx < fs->num; ++fs_idx)
1524         {
1525           re_node_set_free (&fs->stack[fs_idx].eps_via_nodes);
1526           re_free (fs->stack[fs_idx].regs);
1527         }
1528       re_free (fs->stack);
1529     }
1530   return REG_NOERROR;
1531 }
1532
1533 static void
1534 internal_function
1535 update_regs (const re_dfa_t *dfa, regmatch_t *pmatch,
1536              regmatch_t *prev_idx_match, int cur_node, int cur_idx, int nmatch)
1537 {
1538   int type = dfa->nodes[cur_node].type;
1539   if (type == OP_OPEN_SUBEXP)
1540     {
1541       int reg_num = dfa->nodes[cur_node].opr.idx + 1;
1542
1543       /* We are at the first node of this sub expression.  */
1544       if (reg_num < nmatch)
1545         {
1546           pmatch[reg_num].rm_so = cur_idx;
1547           pmatch[reg_num].rm_eo = -1;
1548         }
1549     }
1550   else if (type == OP_CLOSE_SUBEXP)
1551     {
1552       int reg_num = dfa->nodes[cur_node].opr.idx + 1;
1553       if (reg_num < nmatch)
1554         {
1555           /* We are at the last node of this sub expression.  */
1556           if (pmatch[reg_num].rm_so < cur_idx)
1557             {
1558               pmatch[reg_num].rm_eo = cur_idx;
1559               /* This is a non-empty match or we are not inside an optional
1560                  subexpression.  Accept this right away.  */
1561               memcpy (prev_idx_match, pmatch, sizeof (regmatch_t) * nmatch);
1562             }
1563           else
1564             {
1565               if (dfa->nodes[cur_node].opt_subexp
1566                   && prev_idx_match[reg_num].rm_so != -1)
1567                 /* We transited through an empty match for an optional
1568                    subexpression, like (a?)*, and this is not the subexp's
1569                    first match.  Copy back the old content of the registers
1570                    so that matches of an inner subexpression are undone as
1571                    well, like in ((a?))*.  */
1572                 memcpy (pmatch, prev_idx_match, sizeof (regmatch_t) * nmatch);
1573               else
1574                 /* We completed a subexpression, but it may be part of
1575                    an optional one, so do not update PREV_IDX_MATCH.  */
1576                 pmatch[reg_num].rm_eo = cur_idx;
1577             }
1578         }
1579     }
1580 }
1581
1582 /* This function checks the STATE_LOG from the SCTX->last_str_idx to 0
1583    and sift the nodes in each states according to the following rules.
1584    Updated state_log will be wrote to STATE_LOG.
1585
1586    Rules: We throw away the Node `a' in the STATE_LOG[STR_IDX] if...
1587      1. When STR_IDX == MATCH_LAST(the last index in the state_log):
1588         If `a' isn't the LAST_NODE and `a' can't epsilon transit to
1589         the LAST_NODE, we throw away the node `a'.
1590      2. When 0 <= STR_IDX < MATCH_LAST and `a' accepts
1591         string `s' and transit to `b':
1592         i. If 'b' isn't in the STATE_LOG[STR_IDX+strlen('s')], we throw
1593            away the node `a'.
1594         ii. If 'b' is in the STATE_LOG[STR_IDX+strlen('s')] but 'b' is
1595             thrown away, we throw away the node `a'.
1596      3. When 0 <= STR_IDX < MATCH_LAST and 'a' epsilon transit to 'b':
1597         i. If 'b' isn't in the STATE_LOG[STR_IDX], we throw away the
1598            node `a'.
1599         ii. If 'b' is in the STATE_LOG[STR_IDX] but 'b' is thrown away,
1600             we throw away the node `a'.  */
1601
1602 #define STATE_NODE_CONTAINS(state,node) \
1603   ((state) != NULL && re_node_set_contains (&(state)->nodes, node))
1604
1605 static reg_errcode_t
1606 internal_function
1607 sift_states_backward (const re_match_context_t *mctx, re_sift_context_t *sctx)
1608 {
1609   reg_errcode_t err;
1610   int null_cnt = 0;
1611   int str_idx = sctx->last_str_idx;
1612   re_node_set cur_dest;
1613
1614 #ifdef DEBUG
1615   assert (mctx->state_log != NULL && mctx->state_log[str_idx] != NULL);
1616 #endif
1617
1618   /* Build sifted state_log[str_idx].  It has the nodes which can epsilon
1619      transit to the last_node and the last_node itself.  */
1620   err = re_node_set_init_1 (&cur_dest, sctx->last_node);
1621   if (BE (err != REG_NOERROR, 0))
1622     return err;
1623   err = update_cur_sifted_state (mctx, sctx, str_idx, &cur_dest);
1624   if (BE (err != REG_NOERROR, 0))
1625     goto free_return;
1626
1627   /* Then check each states in the state_log.  */
1628   while (str_idx > 0)
1629     {
1630       /* Update counters.  */
1631       null_cnt = (sctx->sifted_states[str_idx] == NULL) ? null_cnt + 1 : 0;
1632       if (null_cnt > mctx->max_mb_elem_len)
1633         {
1634           memset (sctx->sifted_states, '\0',
1635                   sizeof (re_dfastate_t *) * str_idx);
1636           re_node_set_free (&cur_dest);
1637           return REG_NOERROR;
1638         }
1639       re_node_set_empty (&cur_dest);
1640       --str_idx;
1641
1642       if (mctx->state_log[str_idx])
1643         {
1644           err = build_sifted_states (mctx, sctx, str_idx, &cur_dest);
1645           if (BE (err != REG_NOERROR, 0))
1646             goto free_return;
1647         }
1648
1649       /* Add all the nodes which satisfy the following conditions:
1650          - It can epsilon transit to a node in CUR_DEST.
1651          - It is in CUR_SRC.
1652          And update state_log.  */
1653       err = update_cur_sifted_state (mctx, sctx, str_idx, &cur_dest);
1654       if (BE (err != REG_NOERROR, 0))
1655         goto free_return;
1656     }
1657   err = REG_NOERROR;
1658  free_return:
1659   re_node_set_free (&cur_dest);
1660   return err;
1661 }
1662
1663 static reg_errcode_t
1664 internal_function
1665 build_sifted_states (const re_match_context_t *mctx, re_sift_context_t *sctx,
1666                      int str_idx, re_node_set *cur_dest)
1667 {
1668   const re_dfa_t *const dfa = mctx->dfa;
1669   const re_node_set *cur_src = &mctx->state_log[str_idx]->non_eps_nodes;
1670   int i;
1671
1672   /* Then build the next sifted state.
1673      We build the next sifted state on `cur_dest', and update
1674      `sifted_states[str_idx]' with `cur_dest'.
1675      Note:
1676      `cur_dest' is the sifted state from `state_log[str_idx + 1]'.
1677      `cur_src' points the node_set of the old `state_log[str_idx]'
1678      (with the epsilon nodes pre-filtered out).  */
1679   for (i = 0; i < cur_src->nelem; i++)
1680     {
1681       int prev_node = cur_src->elems[i];
1682       int naccepted = 0;
1683       int ret;
1684
1685 #ifdef DEBUG
1686       re_token_type_t type = dfa->nodes[prev_node].type;
1687       assert (!IS_EPSILON_NODE (type));
1688 #endif
1689 #ifdef RE_ENABLE_I18N
1690       /* If the node may accept `multi byte'.  */
1691       if (dfa->nodes[prev_node].accept_mb)
1692         naccepted = sift_states_iter_mb (mctx, sctx, prev_node,
1693                                          str_idx, sctx->last_str_idx);
1694 #endif /* RE_ENABLE_I18N */
1695
1696       /* We don't check backreferences here.
1697          See update_cur_sifted_state().  */
1698       if (!naccepted
1699           && check_node_accept (mctx, dfa->nodes + prev_node, str_idx)
1700           && STATE_NODE_CONTAINS (sctx->sifted_states[str_idx + 1],
1701                                   dfa->nexts[prev_node]))
1702         naccepted = 1;
1703
1704       if (naccepted == 0)
1705         continue;
1706
1707       if (sctx->limits.nelem)
1708         {
1709           int to_idx = str_idx + naccepted;
1710           if (check_dst_limits (mctx, &sctx->limits,
1711                                 dfa->nexts[prev_node], to_idx,
1712                                 prev_node, str_idx))
1713             continue;
1714         }
1715       ret = re_node_set_insert (cur_dest, prev_node);
1716       if (BE (ret == -1, 0))
1717         return REG_ESPACE;
1718     }
1719
1720   return REG_NOERROR;
1721 }
1722
1723 /* Helper functions.  */
1724
1725 static reg_errcode_t
1726 internal_function
1727 clean_state_log_if_needed (re_match_context_t *mctx, int next_state_log_idx)
1728 {
1729   int top = mctx->state_log_top;
1730
1731   if (next_state_log_idx >= mctx->input.bufs_len
1732       || (next_state_log_idx >= mctx->input.valid_len
1733           && mctx->input.valid_len < mctx->input.len))
1734     {
1735       reg_errcode_t err;
1736       err = extend_buffers (mctx);
1737       if (BE (err != REG_NOERROR, 0))
1738         return err;
1739     }
1740
1741   if (top < next_state_log_idx)
1742     {
1743       memset (mctx->state_log + top + 1, '\0',
1744               sizeof (re_dfastate_t *) * (next_state_log_idx - top));
1745       mctx->state_log_top = next_state_log_idx;
1746     }
1747   return REG_NOERROR;
1748 }
1749
1750 static reg_errcode_t
1751 internal_function
1752 merge_state_array (const re_dfa_t *dfa, re_dfastate_t **dst,
1753                    re_dfastate_t **src, int num)
1754 {
1755   int st_idx;
1756   reg_errcode_t err;
1757   for (st_idx = 0; st_idx < num; ++st_idx)
1758     {
1759       if (dst[st_idx] == NULL)
1760         dst[st_idx] = src[st_idx];
1761       else if (src[st_idx] != NULL)
1762         {
1763           re_node_set merged_set;
1764           err = re_node_set_init_union (&merged_set, &dst[st_idx]->nodes,
1765                                         &src[st_idx]->nodes);
1766           if (BE (err != REG_NOERROR, 0))
1767             return err;
1768           dst[st_idx] = re_acquire_state (&err, dfa, &merged_set);
1769           re_node_set_free (&merged_set);
1770           if (BE (err != REG_NOERROR, 0))
1771             return err;
1772         }
1773     }
1774   return REG_NOERROR;
1775 }
1776
1777 static reg_errcode_t
1778 internal_function
1779 update_cur_sifted_state (const re_match_context_t *mctx,
1780                          re_sift_context_t *sctx, int str_idx,
1781                          re_node_set *dest_nodes)
1782 {
1783   const re_dfa_t *const dfa = mctx->dfa;
1784   reg_errcode_t err = REG_NOERROR;
1785   const re_node_set *candidates;
1786   candidates = ((mctx->state_log[str_idx] == NULL) ? NULL
1787                 : &mctx->state_log[str_idx]->nodes);
1788
1789   if (dest_nodes->nelem == 0)
1790     sctx->sifted_states[str_idx] = NULL;
1791   else
1792     {
1793       if (candidates)
1794         {
1795           /* At first, add the nodes which can epsilon transit to a node in
1796              DEST_NODE.  */
1797           err = add_epsilon_src_nodes (dfa, dest_nodes, candidates);
1798           if (BE (err != REG_NOERROR, 0))
1799             return err;
1800
1801           /* Then, check the limitations in the current sift_context.  */
1802           if (sctx->limits.nelem)
1803             {
1804               err = check_subexp_limits (dfa, dest_nodes, candidates, &sctx->limits,
1805                                          mctx->bkref_ents, str_idx);
1806               if (BE (err != REG_NOERROR, 0))
1807                 return err;
1808             }
1809         }
1810
1811       sctx->sifted_states[str_idx] = re_acquire_state (&err, dfa, dest_nodes);
1812       if (BE (err != REG_NOERROR, 0))
1813         return err;
1814     }
1815
1816   if (candidates && mctx->state_log[str_idx]->has_backref)
1817     {
1818       err = sift_states_bkref (mctx, sctx, str_idx, candidates);
1819       if (BE (err != REG_NOERROR, 0))
1820         return err;
1821     }
1822   return REG_NOERROR;
1823 }
1824
1825 static reg_errcode_t
1826 internal_function
1827 add_epsilon_src_nodes (const re_dfa_t *dfa, re_node_set *dest_nodes,
1828                        const re_node_set *candidates)
1829 {
1830   reg_errcode_t err = REG_NOERROR;
1831   int i;
1832
1833   re_dfastate_t *state = re_acquire_state (&err, dfa, dest_nodes);
1834   if (BE (err != REG_NOERROR, 0))
1835     return err;
1836
1837   if (!state->inveclosure.alloc)
1838     {
1839       err = re_node_set_alloc (&state->inveclosure, dest_nodes->nelem);
1840       if (BE (err != REG_NOERROR, 0))
1841         return REG_ESPACE;
1842       for (i = 0; i < dest_nodes->nelem; i++)
1843         {
1844           err = re_node_set_merge (&state->inveclosure,
1845                                    dfa->inveclosures + dest_nodes->elems[i]);
1846           if (BE (err != REG_NOERROR, 0))
1847             return REG_ESPACE;
1848         }
1849     }
1850   return re_node_set_add_intersect (dest_nodes, candidates,
1851                                     &state->inveclosure);
1852 }
1853
1854 static reg_errcode_t
1855 internal_function
1856 sub_epsilon_src_nodes (const re_dfa_t *dfa, int node, re_node_set *dest_nodes,
1857                        const re_node_set *candidates)
1858 {
1859     int ecl_idx;
1860     reg_errcode_t err;
1861     re_node_set *inv_eclosure = dfa->inveclosures + node;
1862     re_node_set except_nodes;
1863     re_node_set_init_empty (&except_nodes);
1864     for (ecl_idx = 0; ecl_idx < inv_eclosure->nelem; ++ecl_idx)
1865       {
1866         int cur_node = inv_eclosure->elems[ecl_idx];
1867         if (cur_node == node)
1868           continue;
1869         if (IS_EPSILON_NODE (dfa->nodes[cur_node].type))
1870           {
1871             int edst1 = dfa->edests[cur_node].elems[0];
1872             int edst2 = ((dfa->edests[cur_node].nelem > 1)
1873                          ? dfa->edests[cur_node].elems[1] : -1);
1874             if ((!re_node_set_contains (inv_eclosure, edst1)
1875                  && re_node_set_contains (dest_nodes, edst1))
1876                 || (edst2 > 0
1877                     && !re_node_set_contains (inv_eclosure, edst2)
1878                     && re_node_set_contains (dest_nodes, edst2)))
1879               {
1880                 err = re_node_set_add_intersect (&except_nodes, candidates,
1881                                                  dfa->inveclosures + cur_node);
1882                 if (BE (err != REG_NOERROR, 0))
1883                   {
1884                     re_node_set_free (&except_nodes);
1885                     return err;
1886                   }
1887               }
1888           }
1889       }
1890     for (ecl_idx = 0; ecl_idx < inv_eclosure->nelem; ++ecl_idx)
1891       {
1892         int cur_node = inv_eclosure->elems[ecl_idx];
1893         if (!re_node_set_contains (&except_nodes, cur_node))
1894           {
1895             int idx = re_node_set_contains (dest_nodes, cur_node) - 1;
1896             re_node_set_remove_at (dest_nodes, idx);
1897           }
1898       }
1899     re_node_set_free (&except_nodes);
1900     return REG_NOERROR;
1901 }
1902
1903 static int
1904 internal_function
1905 check_dst_limits (const re_match_context_t *mctx, re_node_set *limits,
1906                   int dst_node, int dst_idx, int src_node, int src_idx)
1907 {
1908   const re_dfa_t *const dfa = mctx->dfa;
1909   int lim_idx, src_pos, dst_pos;
1910
1911   int dst_bkref_idx = search_cur_bkref_entry (mctx, dst_idx);
1912   int src_bkref_idx = search_cur_bkref_entry (mctx, src_idx);
1913   for (lim_idx = 0; lim_idx < limits->nelem; ++lim_idx)
1914     {
1915       int subexp_idx;
1916       struct re_backref_cache_entry *ent;
1917       ent = mctx->bkref_ents + limits->elems[lim_idx];
1918       subexp_idx = dfa->nodes[ent->node].opr.idx;
1919
1920       dst_pos = check_dst_limits_calc_pos (mctx, limits->elems[lim_idx],
1921                                            subexp_idx, dst_node, dst_idx,
1922                                            dst_bkref_idx);
1923       src_pos = check_dst_limits_calc_pos (mctx, limits->elems[lim_idx],
1924                                            subexp_idx, src_node, src_idx,
1925                                            src_bkref_idx);
1926
1927       /* In case of:
1928          <src> <dst> ( <subexp> )
1929          ( <subexp> ) <src> <dst>
1930          ( <subexp1> <src> <subexp2> <dst> <subexp3> )  */
1931       if (src_pos == dst_pos)
1932         continue; /* This is unrelated limitation.  */
1933       else
1934         return 1;
1935     }
1936   return 0;
1937 }
1938
1939 static int
1940 internal_function
1941 check_dst_limits_calc_pos_1 (const re_match_context_t *mctx, int boundaries,
1942                              int subexp_idx, int from_node, int bkref_idx)
1943 {
1944   const re_dfa_t *const dfa = mctx->dfa;
1945   const re_node_set *eclosures = dfa->eclosures + from_node;
1946   int node_idx;
1947
1948   /* Else, we are on the boundary: examine the nodes on the epsilon
1949      closure.  */
1950   for (node_idx = 0; node_idx < eclosures->nelem; ++node_idx)
1951     {
1952       int node = eclosures->elems[node_idx];
1953       switch (dfa->nodes[node].type)
1954         {
1955         case OP_BACK_REF:
1956           if (bkref_idx != -1)
1957             {
1958               struct re_backref_cache_entry *ent = mctx->bkref_ents + bkref_idx;
1959               do
1960                 {
1961                   int dst, cpos;
1962
1963                   if (ent->node != node)
1964                     continue;
1965
1966                   if (subexp_idx < BITSET_WORD_BITS
1967                       && !(ent->eps_reachable_subexps_map
1968                            & ((bitset_word_t) 1 << subexp_idx)))
1969                     continue;
1970
1971                   /* Recurse trying to reach the OP_OPEN_SUBEXP and
1972                      OP_CLOSE_SUBEXP cases below.  But, if the
1973                      destination node is the same node as the source
1974                      node, don't recurse because it would cause an
1975                      infinite loop: a regex that exhibits this behavior
1976                      is ()\1*\1*  */
1977                   dst = dfa->edests[node].elems[0];
1978                   if (dst == from_node)
1979                     {
1980                       if (boundaries & 1)
1981                         return -1;
1982                       else /* if (boundaries & 2) */
1983                         return 0;
1984                     }
1985
1986                   cpos =
1987                     check_dst_limits_calc_pos_1 (mctx, boundaries, subexp_idx,
1988                                                  dst, bkref_idx);
1989                   if (cpos == -1 /* && (boundaries & 1) */)
1990                     return -1;
1991                   if (cpos == 0 && (boundaries & 2))
1992                     return 0;
1993
1994                   if (subexp_idx < BITSET_WORD_BITS)
1995                     ent->eps_reachable_subexps_map
1996                       &= ~((bitset_word_t) 1 << subexp_idx);
1997                 }
1998               while (ent++->more);
1999             }
2000           break;
2001
2002         case OP_OPEN_SUBEXP:
2003           if ((boundaries & 1) && subexp_idx == dfa->nodes[node].opr.idx)
2004             return -1;
2005           break;
2006
2007         case OP_CLOSE_SUBEXP:
2008           if ((boundaries & 2) && subexp_idx == dfa->nodes[node].opr.idx)
2009             return 0;
2010           break;
2011
2012         default:
2013             break;
2014         }
2015     }
2016
2017   return (boundaries & 2) ? 1 : 0;
2018 }
2019
2020 static int
2021 internal_function
2022 check_dst_limits_calc_pos (const re_match_context_t *mctx, int limit,
2023                            int subexp_idx, int from_node, int str_idx,
2024                            int bkref_idx)
2025 {
2026   struct re_backref_cache_entry *lim = mctx->bkref_ents + limit;
2027   int boundaries;
2028
2029   /* If we are outside the range of the subexpression, return -1 or 1.  */
2030   if (str_idx < lim->subexp_from)
2031     return -1;
2032
2033   if (lim->subexp_to < str_idx)
2034     return 1;
2035
2036   /* If we are within the subexpression, return 0.  */
2037   boundaries = (str_idx == lim->subexp_from);
2038   boundaries |= (str_idx == lim->subexp_to) << 1;
2039   if (boundaries == 0)
2040     return 0;
2041
2042   /* Else, examine epsilon closure.  */
2043   return check_dst_limits_calc_pos_1 (mctx, boundaries, subexp_idx,
2044                                       from_node, bkref_idx);
2045 }
2046
2047 /* Check the limitations of sub expressions LIMITS, and remove the nodes
2048    which are against limitations from DEST_NODES. */
2049
2050 static reg_errcode_t
2051 internal_function
2052 check_subexp_limits (const re_dfa_t *dfa, re_node_set *dest_nodes,
2053                      const re_node_set *candidates, re_node_set *limits,
2054                      struct re_backref_cache_entry *bkref_ents, int str_idx)
2055 {
2056   reg_errcode_t err;
2057   int node_idx, lim_idx;
2058
2059   for (lim_idx = 0; lim_idx < limits->nelem; ++lim_idx)
2060     {
2061       int subexp_idx;
2062       struct re_backref_cache_entry *ent;
2063       ent = bkref_ents + limits->elems[lim_idx];
2064
2065       if (str_idx <= ent->subexp_from || ent->str_idx < str_idx)
2066         continue; /* This is unrelated limitation.  */
2067
2068       subexp_idx = dfa->nodes[ent->node].opr.idx;
2069       if (ent->subexp_to == str_idx)
2070         {
2071           int ops_node = -1;
2072           int cls_node = -1;
2073           for (node_idx = 0; node_idx < dest_nodes->nelem; ++node_idx)
2074             {
2075               int node = dest_nodes->elems[node_idx];
2076               re_token_type_t type = dfa->nodes[node].type;
2077               if (type == OP_OPEN_SUBEXP
2078                   && subexp_idx == dfa->nodes[node].opr.idx)
2079                 ops_node = node;
2080               else if (type == OP_CLOSE_SUBEXP
2081                        && subexp_idx == dfa->nodes[node].opr.idx)
2082                 cls_node = node;
2083             }
2084
2085           /* Check the limitation of the open subexpression.  */
2086           /* Note that (ent->subexp_to = str_idx != ent->subexp_from).  */
2087           if (ops_node >= 0)
2088             {
2089               err = sub_epsilon_src_nodes (dfa, ops_node, dest_nodes,
2090                                            candidates);
2091               if (BE (err != REG_NOERROR, 0))
2092                 return err;
2093             }
2094
2095           /* Check the limitation of the close subexpression.  */
2096           if (cls_node >= 0)
2097             for (node_idx = 0; node_idx < dest_nodes->nelem; ++node_idx)
2098               {
2099                 int node = dest_nodes->elems[node_idx];
2100                 if (!re_node_set_contains (dfa->inveclosures + node,
2101                                            cls_node)
2102                     && !re_node_set_contains (dfa->eclosures + node,
2103                                               cls_node))
2104                   {
2105                     /* It is against this limitation.
2106                        Remove it form the current sifted state.  */
2107                     err = sub_epsilon_src_nodes (dfa, node, dest_nodes,
2108                                                  candidates);
2109                     if (BE (err != REG_NOERROR, 0))
2110                       return err;
2111                     --node_idx;
2112                   }
2113               }
2114         }
2115       else /* (ent->subexp_to != str_idx)  */
2116         {
2117           for (node_idx = 0; node_idx < dest_nodes->nelem; ++node_idx)
2118             {
2119               int node = dest_nodes->elems[node_idx];
2120               re_token_type_t type = dfa->nodes[node].type;
2121               if (type == OP_CLOSE_SUBEXP || type == OP_OPEN_SUBEXP)
2122                 {
2123                   if (subexp_idx != dfa->nodes[node].opr.idx)
2124                     continue;
2125                   /* It is against this limitation.
2126                      Remove it form the current sifted state.  */
2127                   err = sub_epsilon_src_nodes (dfa, node, dest_nodes,
2128                                                candidates);
2129                   if (BE (err != REG_NOERROR, 0))
2130                     return err;
2131                 }
2132             }
2133         }
2134     }
2135   return REG_NOERROR;
2136 }
2137
2138 static reg_errcode_t
2139 internal_function
2140 sift_states_bkref (const re_match_context_t *mctx, re_sift_context_t *sctx,
2141                    int str_idx, const re_node_set *candidates)
2142 {
2143   const re_dfa_t *const dfa = mctx->dfa;
2144   reg_errcode_t err;
2145   int node_idx, node;
2146   re_sift_context_t local_sctx;
2147   int first_idx = search_cur_bkref_entry (mctx, str_idx);
2148
2149   if (first_idx == -1)
2150     return REG_NOERROR;
2151
2152   local_sctx.sifted_states = NULL; /* Mark that it hasn't been initialized.  */
2153
2154   for (node_idx = 0; node_idx < candidates->nelem; ++node_idx)
2155     {
2156       int enabled_idx;
2157       re_token_type_t type;
2158       struct re_backref_cache_entry *entry;
2159       node = candidates->elems[node_idx];
2160       type = dfa->nodes[node].type;
2161       /* Avoid infinite loop for the REs like "()\1+".  */
2162       if (node == sctx->last_node && str_idx == sctx->last_str_idx)
2163         continue;
2164       if (type != OP_BACK_REF)
2165         continue;
2166
2167       entry = mctx->bkref_ents + first_idx;
2168       enabled_idx = first_idx;
2169       do
2170         {
2171           int subexp_len;
2172           int to_idx;
2173           int dst_node;
2174           int ret;
2175           re_dfastate_t *cur_state;
2176
2177           if (entry->node != node)
2178             continue;
2179           subexp_len = entry->subexp_to - entry->subexp_from;
2180           to_idx = str_idx + subexp_len;
2181           dst_node = (subexp_len ? dfa->nexts[node]
2182                       : dfa->edests[node].elems[0]);
2183
2184           if (to_idx > sctx->last_str_idx
2185               || sctx->sifted_states[to_idx] == NULL
2186               || !STATE_NODE_CONTAINS (sctx->sifted_states[to_idx], dst_node)
2187               || check_dst_limits (mctx, &sctx->limits, node,
2188                                    str_idx, dst_node, to_idx))
2189             continue;
2190
2191           if (local_sctx.sifted_states == NULL)
2192             {
2193               local_sctx = *sctx;
2194               err = re_node_set_init_copy (&local_sctx.limits, &sctx->limits);
2195               if (BE (err != REG_NOERROR, 0))
2196                 goto free_return;
2197             }
2198           local_sctx.last_node = node;
2199           local_sctx.last_str_idx = str_idx;
2200           ret = re_node_set_insert (&local_sctx.limits, enabled_idx);
2201           if (BE (ret < 0, 0))
2202             {
2203               err = REG_ESPACE;
2204               goto free_return;
2205             }
2206           cur_state = local_sctx.sifted_states[str_idx];
2207           err = sift_states_backward (mctx, &local_sctx);
2208           if (BE (err != REG_NOERROR, 0))
2209             goto free_return;
2210           if (sctx->limited_states != NULL)
2211             {
2212               err = merge_state_array (dfa, sctx->limited_states,
2213                                        local_sctx.sifted_states,
2214                                        str_idx + 1);
2215               if (BE (err != REG_NOERROR, 0))
2216                 goto free_return;
2217             }
2218           local_sctx.sifted_states[str_idx] = cur_state;
2219           re_node_set_remove (&local_sctx.limits, enabled_idx);
2220
2221           /* mctx->bkref_ents may have changed, reload the pointer.  */
2222           entry = mctx->bkref_ents + enabled_idx;
2223         }
2224       while (enabled_idx++, entry++->more);
2225     }
2226   err = REG_NOERROR;
2227  free_return:
2228   if (local_sctx.sifted_states != NULL)
2229     {
2230       re_node_set_free (&local_sctx.limits);
2231     }
2232
2233   return err;
2234 }
2235
2236
2237 #ifdef RE_ENABLE_I18N
2238 static int
2239 internal_function
2240 sift_states_iter_mb (const re_match_context_t *mctx, re_sift_context_t *sctx,
2241                      int node_idx, int str_idx, int max_str_idx)
2242 {
2243   const re_dfa_t *const dfa = mctx->dfa;
2244   int naccepted;
2245   /* Check the node can accept `multi byte'.  */
2246   naccepted = check_node_accept_bytes (dfa, node_idx, &mctx->input, str_idx);
2247   if (naccepted > 0 && str_idx + naccepted <= max_str_idx &&
2248       !STATE_NODE_CONTAINS (sctx->sifted_states[str_idx + naccepted],
2249                             dfa->nexts[node_idx]))
2250     /* The node can't accept the `multi byte', or the
2251        destination was already thrown away, then the node
2252        could't accept the current input `multi byte'.   */
2253     naccepted = 0;
2254   /* Otherwise, it is sure that the node could accept
2255      `naccepted' bytes input.  */
2256   return naccepted;
2257 }
2258 #endif /* RE_ENABLE_I18N */
2259
2260 \f
2261 /* Functions for state transition.  */
2262
2263 /* Return the next state to which the current state STATE will transit by
2264    accepting the current input byte, and update STATE_LOG if necessary.
2265    If STATE can accept a multibyte char/collating element/back reference
2266    update the destination of STATE_LOG.  */
2267
2268 static re_dfastate_t *
2269 internal_function
2270 transit_state (reg_errcode_t *err, re_match_context_t *mctx,
2271                re_dfastate_t *state)
2272 {
2273   re_dfastate_t **trtable;
2274   unsigned char ch;
2275
2276 #ifdef RE_ENABLE_I18N
2277   /* If the current state can accept multibyte.  */
2278   if (BE (state->accept_mb, 0))
2279     {
2280       *err = transit_state_mb (mctx, state);
2281       if (BE (*err != REG_NOERROR, 0))
2282         return NULL;
2283     }
2284 #endif /* RE_ENABLE_I18N */
2285
2286   /* Then decide the next state with the single byte.  */
2287 #if 0
2288   if (0)
2289     /* don't use transition table  */
2290     return transit_state_sb (err, mctx, state);
2291 #endif
2292
2293   /* Use transition table  */
2294   ch = re_string_fetch_byte (&mctx->input);
2295   for (;;)
2296     {
2297       trtable = state->trtable;
2298       if (BE (trtable != NULL, 1))
2299         return trtable[ch];
2300
2301       trtable = state->word_trtable;
2302       if (BE (trtable != NULL, 1))
2303         {
2304           unsigned int context;
2305           context
2306             = re_string_context_at (&mctx->input,
2307                                     re_string_cur_idx (&mctx->input) - 1,
2308                                     mctx->eflags);
2309           if (IS_WORD_CONTEXT (context))
2310             return trtable[ch + SBC_MAX];
2311           else
2312             return trtable[ch];
2313         }
2314
2315       if (!build_trtable (mctx->dfa, state))
2316         {
2317           *err = REG_ESPACE;
2318           return NULL;
2319         }
2320
2321       /* Retry, we now have a transition table.  */
2322     }
2323 }
2324
2325 /* Update the state_log if we need */
2326 re_dfastate_t *
2327 internal_function
2328 merge_state_with_log (reg_errcode_t *err, re_match_context_t *mctx,
2329                       re_dfastate_t *next_state)
2330 {
2331   const re_dfa_t *const dfa = mctx->dfa;
2332   int cur_idx = re_string_cur_idx (&mctx->input);
2333
2334   if (cur_idx > mctx->state_log_top)
2335     {
2336       mctx->state_log[cur_idx] = next_state;
2337       mctx->state_log_top = cur_idx;
2338     }
2339   else if (mctx->state_log[cur_idx] == 0)
2340     {
2341       mctx->state_log[cur_idx] = next_state;
2342     }
2343   else
2344     {
2345       re_dfastate_t *pstate;
2346       unsigned int context;
2347       re_node_set next_nodes, *log_nodes, *table_nodes = NULL;
2348       /* If (state_log[cur_idx] != 0), it implies that cur_idx is
2349          the destination of a multibyte char/collating element/
2350          back reference.  Then the next state is the union set of
2351          these destinations and the results of the transition table.  */
2352       pstate = mctx->state_log[cur_idx];
2353       log_nodes = pstate->entrance_nodes;
2354       if (next_state != NULL)
2355         {
2356           table_nodes = next_state->entrance_nodes;
2357           *err = re_node_set_init_union (&next_nodes, table_nodes,
2358                                              log_nodes);
2359           if (BE (*err != REG_NOERROR, 0))
2360             return NULL;
2361         }
2362       else
2363         next_nodes = *log_nodes;
2364       /* Note: We already add the nodes of the initial state,
2365          then we don't need to add them here.  */
2366
2367       context = re_string_context_at (&mctx->input,
2368                                       re_string_cur_idx (&mctx->input) - 1,
2369                                       mctx->eflags);
2370       next_state = mctx->state_log[cur_idx]
2371         = re_acquire_state_context (err, dfa, &next_nodes, context);
2372       /* We don't need to check errors here, since the return value of
2373          this function is next_state and ERR is already set.  */
2374
2375       if (table_nodes != NULL)
2376         re_node_set_free (&next_nodes);
2377     }
2378
2379   if (BE (dfa->nbackref, 0) && next_state != NULL)
2380     {
2381       /* Check OP_OPEN_SUBEXP in the current state in case that we use them
2382          later.  We must check them here, since the back references in the
2383          next state might use them.  */
2384       *err = check_subexp_matching_top (mctx, &next_state->nodes,
2385                                         cur_idx);
2386       if (BE (*err != REG_NOERROR, 0))
2387         return NULL;
2388
2389       /* If the next state has back references.  */
2390       if (next_state->has_backref)
2391         {
2392           *err = transit_state_bkref (mctx, &next_state->nodes);
2393           if (BE (*err != REG_NOERROR, 0))
2394             return NULL;
2395           next_state = mctx->state_log[cur_idx];
2396         }
2397     }
2398
2399   return next_state;
2400 }
2401
2402 /* Skip bytes in the input that correspond to part of a
2403    multi-byte match, then look in the log for a state
2404    from which to restart matching.  */
2405 re_dfastate_t *
2406 internal_function
2407 find_recover_state (reg_errcode_t *err, re_match_context_t *mctx)
2408 {
2409   re_dfastate_t *cur_state;
2410   do
2411     {
2412       int max = mctx->state_log_top;
2413       int cur_str_idx = re_string_cur_idx (&mctx->input);
2414
2415       do
2416         {
2417           if (++cur_str_idx > max)
2418             return NULL;
2419           re_string_skip_bytes (&mctx->input, 1);
2420         }
2421       while (mctx->state_log[cur_str_idx] == NULL);
2422
2423       cur_state = merge_state_with_log (err, mctx, NULL);
2424     }
2425   while (*err == REG_NOERROR && cur_state == NULL);
2426   return cur_state;
2427 }
2428
2429 /* Helper functions for transit_state.  */
2430
2431 /* From the node set CUR_NODES, pick up the nodes whose types are
2432    OP_OPEN_SUBEXP and which have corresponding back references in the regular
2433    expression. And register them to use them later for evaluating the
2434    correspoding back references.  */
2435
2436 static reg_errcode_t
2437 internal_function
2438 check_subexp_matching_top (re_match_context_t *mctx, re_node_set *cur_nodes,
2439                            int str_idx)
2440 {
2441   const re_dfa_t *const dfa = mctx->dfa;
2442   int node_idx;
2443   reg_errcode_t err;
2444
2445   /* TODO: This isn't efficient.
2446            Because there might be more than one nodes whose types are
2447            OP_OPEN_SUBEXP and whose index is SUBEXP_IDX, we must check all
2448            nodes.
2449            E.g. RE: (a){2}  */
2450   for (node_idx = 0; node_idx < cur_nodes->nelem; ++node_idx)
2451     {
2452       int node = cur_nodes->elems[node_idx];
2453       if (dfa->nodes[node].type == OP_OPEN_SUBEXP
2454           && dfa->nodes[node].opr.idx < BITSET_WORD_BITS
2455           && (dfa->used_bkref_map
2456               & ((bitset_word_t) 1 << dfa->nodes[node].opr.idx)))
2457         {
2458           err = match_ctx_add_subtop (mctx, node, str_idx);
2459           if (BE (err != REG_NOERROR, 0))
2460             return err;
2461         }
2462     }
2463   return REG_NOERROR;
2464 }
2465
2466 #if 0
2467 /* Return the next state to which the current state STATE will transit by
2468    accepting the current input byte.  */
2469
2470 static re_dfastate_t *
2471 transit_state_sb (reg_errcode_t *err, re_match_context_t *mctx,
2472                   re_dfastate_t *state)
2473 {
2474   const re_dfa_t *const dfa = mctx->dfa;
2475   re_node_set next_nodes;
2476   re_dfastate_t *next_state;
2477   int node_cnt, cur_str_idx = re_string_cur_idx (&mctx->input);
2478   unsigned int context;
2479
2480   *err = re_node_set_alloc (&next_nodes, state->nodes.nelem + 1);
2481   if (BE (*err != REG_NOERROR, 0))
2482     return NULL;
2483   for (node_cnt = 0; node_cnt < state->nodes.nelem; ++node_cnt)
2484     {
2485       int cur_node = state->nodes.elems[node_cnt];
2486       if (check_node_accept (mctx, dfa->nodes + cur_node, cur_str_idx))
2487         {
2488           *err = re_node_set_merge (&next_nodes,
2489                                     dfa->eclosures + dfa->nexts[cur_node]);
2490           if (BE (*err != REG_NOERROR, 0))
2491             {
2492               re_node_set_free (&next_nodes);
2493               return NULL;
2494             }
2495         }
2496     }
2497   context = re_string_context_at (&mctx->input, cur_str_idx, mctx->eflags);
2498   next_state = re_acquire_state_context (err, dfa, &next_nodes, context);
2499   /* We don't need to check errors here, since the return value of
2500      this function is next_state and ERR is already set.  */
2501
2502   re_node_set_free (&next_nodes);
2503   re_string_skip_bytes (&mctx->input, 1);
2504   return next_state;
2505 }
2506 #endif
2507
2508 #ifdef RE_ENABLE_I18N
2509 static reg_errcode_t
2510 internal_function
2511 transit_state_mb (re_match_context_t *mctx, re_dfastate_t *pstate)
2512 {
2513   const re_dfa_t *const dfa = mctx->dfa;
2514   reg_errcode_t err;
2515   int i;
2516
2517   for (i = 0; i < pstate->nodes.nelem; ++i)
2518     {
2519       re_node_set dest_nodes, *new_nodes;
2520       int cur_node_idx = pstate->nodes.elems[i];
2521       int naccepted, dest_idx;
2522       unsigned int context;
2523       re_dfastate_t *dest_state;
2524
2525       if (!dfa->nodes[cur_node_idx].accept_mb)
2526         continue;
2527
2528       if (dfa->nodes[cur_node_idx].constraint)
2529         {
2530           context = re_string_context_at (&mctx->input,
2531                                           re_string_cur_idx (&mctx->input),
2532                                           mctx->eflags);
2533           if (NOT_SATISFY_NEXT_CONSTRAINT (dfa->nodes[cur_node_idx].constraint,
2534                                            context))
2535             continue;
2536         }
2537
2538       /* How many bytes the node can accept?  */
2539       naccepted = check_node_accept_bytes (dfa, cur_node_idx, &mctx->input,
2540                                            re_string_cur_idx (&mctx->input));
2541       if (naccepted == 0)
2542         continue;
2543
2544       /* The node can accepts `naccepted' bytes.  */
2545       dest_idx = re_string_cur_idx (&mctx->input) + naccepted;
2546       mctx->max_mb_elem_len = ((mctx->max_mb_elem_len < naccepted) ? naccepted
2547                                : mctx->max_mb_elem_len);
2548       err = clean_state_log_if_needed (mctx, dest_idx);
2549       if (BE (err != REG_NOERROR, 0))
2550         return err;
2551 #ifdef DEBUG
2552       assert (dfa->nexts[cur_node_idx] != -1);
2553 #endif
2554       new_nodes = dfa->eclosures + dfa->nexts[cur_node_idx];
2555
2556       dest_state = mctx->state_log[dest_idx];
2557       if (dest_state == NULL)
2558         dest_nodes = *new_nodes;
2559       else
2560         {
2561           err = re_node_set_init_union (&dest_nodes,
2562                                         dest_state->entrance_nodes, new_nodes);
2563           if (BE (err != REG_NOERROR, 0))
2564             return err;
2565         }
2566       context = re_string_context_at (&mctx->input, dest_idx - 1,
2567                                       mctx->eflags);
2568       mctx->state_log[dest_idx]
2569         = re_acquire_state_context (&err, dfa, &dest_nodes, context);
2570       if (dest_state != NULL)
2571         re_node_set_free (&dest_nodes);
2572       if (BE (mctx->state_log[dest_idx] == NULL && err != REG_NOERROR, 0))
2573         return err;
2574     }
2575   return REG_NOERROR;
2576 }
2577 #endif /* RE_ENABLE_I18N */
2578
2579 static reg_errcode_t
2580 internal_function
2581 transit_state_bkref (re_match_context_t *mctx, const re_node_set *nodes)
2582 {
2583   const re_dfa_t *const dfa = mctx->dfa;
2584   reg_errcode_t err;
2585   int i;
2586   int cur_str_idx = re_string_cur_idx (&mctx->input);
2587
2588   for (i = 0; i < nodes->nelem; ++i)
2589     {
2590       int dest_str_idx, prev_nelem, bkc_idx;
2591       int node_idx = nodes->elems[i];
2592       unsigned int context;
2593       const re_token_t *node = dfa->nodes + node_idx;
2594       re_node_set *new_dest_nodes;
2595
2596       /* Check whether `node' is a backreference or not.  */
2597       if (node->type != OP_BACK_REF)
2598         continue;
2599
2600       if (node->constraint)
2601         {
2602           context = re_string_context_at (&mctx->input, cur_str_idx,
2603                                           mctx->eflags);
2604           if (NOT_SATISFY_NEXT_CONSTRAINT (node->constraint, context))
2605             continue;
2606         }
2607
2608       /* `node' is a backreference.
2609          Check the substring which the substring matched.  */
2610       bkc_idx = mctx->nbkref_ents;
2611       err = get_subexp (mctx, node_idx, cur_str_idx);
2612       if (BE (err != REG_NOERROR, 0))
2613         goto free_return;
2614
2615       /* And add the epsilon closures (which is `new_dest_nodes') of
2616          the backreference to appropriate state_log.  */
2617 #ifdef DEBUG
2618       assert (dfa->nexts[node_idx] != -1);
2619 #endif
2620       for (; bkc_idx < mctx->nbkref_ents; ++bkc_idx)
2621         {
2622           int subexp_len;
2623           re_dfastate_t *dest_state;
2624           struct re_backref_cache_entry *bkref_ent;
2625           bkref_ent = mctx->bkref_ents + bkc_idx;
2626           if (bkref_ent->node != node_idx || bkref_ent->str_idx != cur_str_idx)
2627             continue;
2628           subexp_len = bkref_ent->subexp_to - bkref_ent->subexp_from;
2629           new_dest_nodes = (subexp_len == 0
2630                             ? dfa->eclosures + dfa->edests[node_idx].elems[0]
2631                             : dfa->eclosures + dfa->nexts[node_idx]);
2632           dest_str_idx = (cur_str_idx + bkref_ent->subexp_to
2633                           - bkref_ent->subexp_from);
2634           context = re_string_context_at (&mctx->input, dest_str_idx - 1,
2635                                           mctx->eflags);
2636           dest_state = mctx->state_log[dest_str_idx];
2637           prev_nelem = ((mctx->state_log[cur_str_idx] == NULL) ? 0
2638                         : mctx->state_log[cur_str_idx]->nodes.nelem);
2639           /* Add `new_dest_node' to state_log.  */
2640           if (dest_state == NULL)
2641             {
2642               mctx->state_log[dest_str_idx]
2643                 = re_acquire_state_context (&err, dfa, new_dest_nodes,
2644                                             context);
2645               if (BE (mctx->state_log[dest_str_idx] == NULL
2646                       && err != REG_NOERROR, 0))
2647                 goto free_return;
2648             }
2649           else
2650             {
2651               re_node_set dest_nodes;
2652               err = re_node_set_init_union (&dest_nodes,
2653                                             dest_state->entrance_nodes,
2654                                             new_dest_nodes);
2655               if (BE (err != REG_NOERROR, 0))
2656                 {
2657                   re_node_set_free (&dest_nodes);
2658                   goto free_return;
2659                 }
2660               mctx->state_log[dest_str_idx]
2661                 = re_acquire_state_context (&err, dfa, &dest_nodes, context);
2662               re_node_set_free (&dest_nodes);
2663               if (BE (mctx->state_log[dest_str_idx] == NULL
2664                       && err != REG_NOERROR, 0))
2665                 goto free_return;
2666             }
2667           /* We need to check recursively if the backreference can epsilon
2668              transit.  */
2669           if (subexp_len == 0
2670               && mctx->state_log[cur_str_idx]->nodes.nelem > prev_nelem)
2671             {
2672               err = check_subexp_matching_top (mctx, new_dest_nodes,
2673                                                cur_str_idx);
2674               if (BE (err != REG_NOERROR, 0))
2675                 goto free_return;
2676               err = transit_state_bkref (mctx, new_dest_nodes);
2677               if (BE (err != REG_NOERROR, 0))
2678                 goto free_return;
2679             }
2680         }
2681     }
2682   err = REG_NOERROR;
2683  free_return:
2684   return err;
2685 }
2686
2687 /* Enumerate all the candidates which the backreference BKREF_NODE can match
2688    at BKREF_STR_IDX, and register them by match_ctx_add_entry().
2689    Note that we might collect inappropriate candidates here.
2690    However, the cost of checking them strictly here is too high, then we
2691    delay these checking for prune_impossible_nodes().  */
2692
2693 static reg_errcode_t
2694 internal_function
2695 get_subexp (re_match_context_t *mctx, int bkref_node, int bkref_str_idx)
2696 {
2697   const re_dfa_t *const dfa = mctx->dfa;
2698   int subexp_num, sub_top_idx;
2699   const char *buf = (const char *) re_string_get_buffer (&mctx->input);
2700   /* Return if we have already checked BKREF_NODE at BKREF_STR_IDX.  */
2701   int cache_idx = search_cur_bkref_entry (mctx, bkref_str_idx);
2702   if (cache_idx != -1)
2703     {
2704       const struct re_backref_cache_entry *entry
2705         = mctx->bkref_ents + cache_idx;
2706       do
2707         if (entry->node == bkref_node)
2708           return REG_NOERROR; /* We already checked it.  */
2709       while (entry++->more);
2710     }
2711
2712   subexp_num = dfa->nodes[bkref_node].opr.idx;
2713
2714   /* For each sub expression  */
2715   for (sub_top_idx = 0; sub_top_idx < mctx->nsub_tops; ++sub_top_idx)
2716     {
2717       reg_errcode_t err;
2718       re_sub_match_top_t *sub_top = mctx->sub_tops[sub_top_idx];
2719       re_sub_match_last_t *sub_last;
2720       int sub_last_idx, sl_str, bkref_str_off;
2721
2722       if (dfa->nodes[sub_top->node].opr.idx != subexp_num)
2723         continue; /* It isn't related.  */
2724
2725       sl_str = sub_top->str_idx;
2726       bkref_str_off = bkref_str_idx;
2727       /* At first, check the last node of sub expressions we already
2728          evaluated.  */
2729       for (sub_last_idx = 0; sub_last_idx < sub_top->nlasts; ++sub_last_idx)
2730         {
2731           int sl_str_diff;
2732           sub_last = sub_top->lasts[sub_last_idx];
2733           sl_str_diff = sub_last->str_idx - sl_str;
2734           /* The matched string by the sub expression match with the substring
2735              at the back reference?  */
2736           if (sl_str_diff > 0)
2737             {
2738               if (BE (bkref_str_off + sl_str_diff > mctx->input.valid_len, 0))
2739                 {
2740                   /* Not enough chars for a successful match.  */
2741                   if (bkref_str_off + sl_str_diff > mctx->input.len)
2742                     break;
2743
2744                   err = clean_state_log_if_needed (mctx,
2745                                                    bkref_str_off
2746                                                    + sl_str_diff);
2747                   if (BE (err != REG_NOERROR, 0))
2748                     return err;
2749                   buf = (const char *) re_string_get_buffer (&mctx->input);
2750                 }
2751               if (memcmp (buf + bkref_str_off, buf + sl_str, sl_str_diff) != 0)
2752                 /* We don't need to search this sub expression any more.  */
2753                 break;
2754             }
2755           bkref_str_off += sl_str_diff;
2756           sl_str += sl_str_diff;
2757           err = get_subexp_sub (mctx, sub_top, sub_last, bkref_node,
2758                                 bkref_str_idx);
2759
2760           /* Reload buf, since the preceding call might have reallocated
2761              the buffer.  */
2762           buf = (const char *) re_string_get_buffer (&mctx->input);
2763
2764           if (err == REG_NOMATCH)
2765             continue;
2766           if (BE (err != REG_NOERROR, 0))
2767             return err;
2768         }
2769
2770       if (sub_last_idx < sub_top->nlasts)
2771         continue;
2772       if (sub_last_idx > 0)
2773         ++sl_str;
2774       /* Then, search for the other last nodes of the sub expression.  */
2775       for (; sl_str <= bkref_str_idx; ++sl_str)
2776         {
2777           int cls_node, sl_str_off;
2778           const re_node_set *nodes;
2779           sl_str_off = sl_str - sub_top->str_idx;
2780           /* The matched string by the sub expression match with the substring
2781              at the back reference?  */
2782           if (sl_str_off > 0)
2783             {
2784               if (BE (bkref_str_off >= mctx->input.valid_len, 0))
2785                 {
2786                   /* If we are at the end of the input, we cannot match.  */
2787                   if (bkref_str_off >= mctx->input.len)
2788                     break;
2789
2790                   err = extend_buffers (mctx);
2791                   if (BE (err != REG_NOERROR, 0))
2792                     return err;
2793
2794                   buf = (const char *) re_string_get_buffer (&mctx->input);
2795                 }
2796               if (buf [bkref_str_off++] != buf[sl_str - 1])
2797                 break; /* We don't need to search this sub expression
2798                           any more.  */
2799             }
2800           if (mctx->state_log[sl_str] == NULL)
2801             continue;
2802           /* Does this state have a ')' of the sub expression?  */
2803           nodes = &mctx->state_log[sl_str]->nodes;
2804           cls_node = find_subexp_node (dfa, nodes, subexp_num,
2805                                        OP_CLOSE_SUBEXP);
2806           if (cls_node == -1)
2807             continue; /* No.  */
2808           if (sub_top->path == NULL)
2809             {
2810               sub_top->path = calloc (sizeof (state_array_t),
2811                                       sl_str - sub_top->str_idx + 1);
2812               if (sub_top->path == NULL)
2813                 return REG_ESPACE;
2814             }
2815           /* Can the OP_OPEN_SUBEXP node arrive the OP_CLOSE_SUBEXP node
2816              in the current context?  */
2817           err = check_arrival (mctx, sub_top->path, sub_top->node,
2818                                sub_top->str_idx, cls_node, sl_str,
2819                                OP_CLOSE_SUBEXP);
2820           if (err == REG_NOMATCH)
2821               continue;
2822           if (BE (err != REG_NOERROR, 0))
2823               return err;
2824           sub_last = match_ctx_add_sublast (sub_top, cls_node, sl_str);
2825           if (BE (sub_last == NULL, 0))
2826             return REG_ESPACE;
2827           err = get_subexp_sub (mctx, sub_top, sub_last, bkref_node,
2828                                 bkref_str_idx);
2829           if (err == REG_NOMATCH)
2830             continue;
2831         }
2832     }
2833   return REG_NOERROR;
2834 }
2835
2836 /* Helper functions for get_subexp().  */
2837
2838 /* Check SUB_LAST can arrive to the back reference BKREF_NODE at BKREF_STR.
2839    If it can arrive, register the sub expression expressed with SUB_TOP
2840    and SUB_LAST.  */
2841
2842 static reg_errcode_t
2843 internal_function
2844 get_subexp_sub (re_match_context_t *mctx, const re_sub_match_top_t *sub_top,
2845                 re_sub_match_last_t *sub_last, int bkref_node, int bkref_str)
2846 {
2847   reg_errcode_t err;
2848   int to_idx;
2849   /* Can the subexpression arrive the back reference?  */
2850   err = check_arrival (mctx, &sub_last->path, sub_last->node,
2851                        sub_last->str_idx, bkref_node, bkref_str,
2852                        OP_OPEN_SUBEXP);
2853   if (err != REG_NOERROR)
2854     return err;
2855   err = match_ctx_add_entry (mctx, bkref_node, bkref_str, sub_top->str_idx,
2856                              sub_last->str_idx);
2857   if (BE (err != REG_NOERROR, 0))
2858     return err;
2859   to_idx = bkref_str + sub_last->str_idx - sub_top->str_idx;
2860   return clean_state_log_if_needed (mctx, to_idx);
2861 }
2862
2863 /* Find the first node which is '(' or ')' and whose index is SUBEXP_IDX.
2864    Search '(' if FL_OPEN, or search ')' otherwise.
2865    TODO: This function isn't efficient...
2866          Because there might be more than one nodes whose types are
2867          OP_OPEN_SUBEXP and whose index is SUBEXP_IDX, we must check all
2868          nodes.
2869          E.g. RE: (a){2}  */
2870
2871 static int
2872 internal_function
2873 find_subexp_node (const re_dfa_t *dfa, const re_node_set *nodes,
2874                   int subexp_idx, int type)
2875 {
2876   int cls_idx;
2877   for (cls_idx = 0; cls_idx < nodes->nelem; ++cls_idx)
2878     {
2879       int cls_node = nodes->elems[cls_idx];
2880       const re_token_t *node = dfa->nodes + cls_node;
2881       if (node->type == type
2882           && node->opr.idx == subexp_idx)
2883         return cls_node;
2884     }
2885   return -1;
2886 }
2887
2888 /* Check whether the node TOP_NODE at TOP_STR can arrive to the node
2889    LAST_NODE at LAST_STR.  We record the path onto PATH since it will be
2890    heavily reused.
2891    Return REG_NOERROR if it can arrive, or REG_NOMATCH otherwise.  */
2892
2893 static reg_errcode_t
2894 internal_function
2895 check_arrival (re_match_context_t *mctx, state_array_t *path, int top_node,
2896                int top_str, int last_node, int last_str, int type)
2897 {
2898   const re_dfa_t *const dfa = mctx->dfa;
2899   reg_errcode_t err = REG_NOERROR;
2900   int subexp_num, backup_cur_idx, str_idx, null_cnt;
2901   re_dfastate_t *cur_state = NULL;
2902   re_node_set *cur_nodes, next_nodes;
2903   re_dfastate_t **backup_state_log;
2904   unsigned int context;
2905
2906   subexp_num = dfa->nodes[top_node].opr.idx;
2907   /* Extend the buffer if we need.  */
2908   if (BE (path->alloc < last_str + mctx->max_mb_elem_len + 1, 0))
2909     {
2910       re_dfastate_t **new_array;
2911       int old_alloc = path->alloc;
2912       path->alloc += last_str + mctx->max_mb_elem_len + 1;
2913       new_array = re_realloc (path->array, re_dfastate_t *, path->alloc);
2914       if (BE (new_array == NULL, 0))
2915         {
2916           path->alloc = old_alloc;
2917           return REG_ESPACE;
2918         }
2919       path->array = new_array;
2920       memset (new_array + old_alloc, '\0',
2921               sizeof (re_dfastate_t *) * (path->alloc - old_alloc));
2922     }
2923
2924   str_idx = path->next_idx ? path->next_idx : top_str;
2925
2926   /* Temporary modify MCTX.  */
2927   backup_state_log = mctx->state_log;
2928   backup_cur_idx = mctx->input.cur_idx;
2929   mctx->state_log = path->array;
2930   mctx->input.cur_idx = str_idx;
2931
2932   /* Setup initial node set.  */
2933   context = re_string_context_at (&mctx->input, str_idx - 1, mctx->eflags);
2934   if (str_idx == top_str)
2935     {
2936       err = re_node_set_init_1 (&next_nodes, top_node);
2937       if (BE (err != REG_NOERROR, 0))
2938         return err;
2939       err = check_arrival_expand_ecl (dfa, &next_nodes, subexp_num, type);
2940       if (BE (err != REG_NOERROR, 0))
2941         {
2942           re_node_set_free (&next_nodes);
2943           return err;
2944         }
2945     }
2946   else
2947     {
2948       cur_state = mctx->state_log[str_idx];
2949       if (cur_state && cur_state->has_backref)
2950         {
2951           err = re_node_set_init_copy (&next_nodes, &cur_state->nodes);
2952           if (BE (err != REG_NOERROR, 0))
2953             return err;
2954         }
2955       else
2956         re_node_set_init_empty (&next_nodes);
2957     }
2958   if (str_idx == top_str || (cur_state && cur_state->has_backref))
2959     {
2960       if (next_nodes.nelem)
2961         {
2962           err = expand_bkref_cache (mctx, &next_nodes, str_idx,
2963                                     subexp_num, type);
2964           if (BE (err != REG_NOERROR, 0))
2965             {
2966               re_node_set_free (&next_nodes);
2967               return err;
2968             }
2969         }
2970       cur_state = re_acquire_state_context (&err, dfa, &next_nodes, context);
2971       if (BE (cur_state == NULL && err != REG_NOERROR, 0))
2972         {
2973           re_node_set_free (&next_nodes);
2974           return err;
2975         }
2976       mctx->state_log[str_idx] = cur_state;
2977     }
2978
2979   for (null_cnt = 0; str_idx < last_str && null_cnt <= mctx->max_mb_elem_len;)
2980     {
2981       re_node_set_empty (&next_nodes);
2982       if (mctx->state_log[str_idx + 1])
2983         {
2984           err = re_node_set_merge (&next_nodes,
2985                                    &mctx->state_log[str_idx + 1]->nodes);
2986           if (BE (err != REG_NOERROR, 0))
2987             {
2988               re_node_set_free (&next_nodes);
2989               return err;
2990             }
2991         }
2992       if (cur_state)
2993         {
2994           err = check_arrival_add_next_nodes (mctx, str_idx,
2995                                               &cur_state->non_eps_nodes,
2996                                               &next_nodes);
2997           if (BE (err != REG_NOERROR, 0))
2998             {
2999               re_node_set_free (&next_nodes);
3000               return err;
3001             }
3002         }
3003       ++str_idx;
3004       if (next_nodes.nelem)
3005         {
3006           err = check_arrival_expand_ecl (dfa, &next_nodes, subexp_num, type);
3007           if (BE (err != REG_NOERROR, 0))
3008             {
3009               re_node_set_free (&next_nodes);
3010               return err;
3011             }
3012           err = expand_bkref_cache (mctx, &next_nodes, str_idx,
3013                                     subexp_num, type);
3014           if (BE (err != REG_NOERROR, 0))
3015             {
3016               re_node_set_free (&next_nodes);
3017               return err;
3018             }
3019         }
3020       context = re_string_context_at (&mctx->input, str_idx - 1, mctx->eflags);
3021       cur_state = re_acquire_state_context (&err, dfa, &next_nodes, context);
3022       if (BE (cur_state == NULL && err != REG_NOERROR, 0))
3023         {
3024           re_node_set_free (&next_nodes);
3025           return err;
3026         }
3027       mctx->state_log[str_idx] = cur_state;
3028       null_cnt = cur_state == NULL ? null_cnt + 1 : 0;
3029     }
3030   re_node_set_free (&next_nodes);
3031   cur_nodes = (mctx->state_log[last_str] == NULL ? NULL
3032                : &mctx->state_log[last_str]->nodes);
3033   path->next_idx = str_idx;
3034
3035   /* Fix MCTX.  */
3036   mctx->state_log = backup_state_log;
3037   mctx->input.cur_idx = backup_cur_idx;
3038
3039   /* Then check the current node set has the node LAST_NODE.  */
3040   if (cur_nodes != NULL && re_node_set_contains (cur_nodes, last_node))
3041     return REG_NOERROR;
3042
3043   return REG_NOMATCH;
3044 }
3045
3046 /* Helper functions for check_arrival.  */
3047
3048 /* Calculate the destination nodes of CUR_NODES at STR_IDX, and append them
3049    to NEXT_NODES.
3050    TODO: This function is similar to the functions transit_state*(),
3051          however this function has many additional works.
3052          Can't we unify them?  */
3053
3054 static reg_errcode_t
3055 internal_function
3056 check_arrival_add_next_nodes (re_match_context_t *mctx, int str_idx,
3057                               re_node_set *cur_nodes, re_node_set *next_nodes)
3058 {
3059   const re_dfa_t *const dfa = mctx->dfa;
3060   int result;
3061   int cur_idx;
3062 #ifdef RE_ENABLE_I18N
3063   reg_errcode_t err = REG_NOERROR;
3064 #endif
3065   re_node_set union_set;
3066   re_node_set_init_empty (&union_set);
3067   for (cur_idx = 0; cur_idx < cur_nodes->nelem; ++cur_idx)
3068     {
3069       int naccepted = 0;
3070       int cur_node = cur_nodes->elems[cur_idx];
3071 #ifdef DEBUG
3072       re_token_type_t type = dfa->nodes[cur_node].type;
3073       assert (!IS_EPSILON_NODE (type));
3074 #endif
3075 #ifdef RE_ENABLE_I18N
3076       /* If the node may accept `multi byte'.  */
3077       if (dfa->nodes[cur_node].accept_mb)
3078         {
3079           naccepted = check_node_accept_bytes (dfa, cur_node, &mctx->input,
3080                                                str_idx);
3081           if (naccepted > 1)
3082             {
3083               re_dfastate_t *dest_state;
3084               int next_node = dfa->nexts[cur_node];
3085               int next_idx = str_idx + naccepted;
3086               dest_state = mctx->state_log[next_idx];
3087               re_node_set_empty (&union_set);
3088               if (dest_state)
3089                 {
3090                   err = re_node_set_merge (&union_set, &dest_state->nodes);
3091                   if (BE (err != REG_NOERROR, 0))
3092                     {
3093                       re_node_set_free (&union_set);
3094                       return err;
3095                     }
3096                 }
3097               result = re_node_set_insert (&union_set, next_node);
3098               if (BE (result < 0, 0))
3099                 {
3100                   re_node_set_free (&union_set);
3101                   return REG_ESPACE;
3102                 }
3103               mctx->state_log[next_idx] = re_acquire_state (&err, dfa,
3104                                                             &union_set);
3105               if (BE (mctx->state_log[next_idx] == NULL
3106                       && err != REG_NOERROR, 0))
3107                 {
3108                   re_node_set_free (&union_set);
3109                   return err;
3110                 }
3111             }
3112         }
3113 #endif /* RE_ENABLE_I18N */
3114       if (naccepted
3115           || check_node_accept (mctx, dfa->nodes + cur_node, str_idx))
3116         {
3117           result = re_node_set_insert (next_nodes, dfa->nexts[cur_node]);
3118           if (BE (result < 0, 0))
3119             {
3120               re_node_set_free (&union_set);
3121               return REG_ESPACE;
3122             }
3123         }
3124     }
3125   re_node_set_free (&union_set);
3126   return REG_NOERROR;
3127 }
3128
3129 /* For all the nodes in CUR_NODES, add the epsilon closures of them to
3130    CUR_NODES, however exclude the nodes which are:
3131     - inside the sub expression whose number is EX_SUBEXP, if FL_OPEN.
3132     - out of the sub expression whose number is EX_SUBEXP, if !FL_OPEN.
3133 */
3134
3135 static reg_errcode_t
3136 internal_function
3137 check_arrival_expand_ecl (const re_dfa_t *dfa, re_node_set *cur_nodes,
3138                           int ex_subexp, int type)
3139 {
3140   reg_errcode_t err;
3141   int idx, outside_node;
3142   re_node_set new_nodes;
3143 #ifdef DEBUG
3144   assert (cur_nodes->nelem);
3145 #endif
3146   err = re_node_set_alloc (&new_nodes, cur_nodes->nelem);
3147   if (BE (err != REG_NOERROR, 0))
3148     return err;
3149   /* Create a new node set NEW_NODES with the nodes which are epsilon
3150      closures of the node in CUR_NODES.  */
3151
3152   for (idx = 0; idx < cur_nodes->nelem; ++idx)
3153     {
3154       int cur_node = cur_nodes->elems[idx];
3155       const re_node_set *eclosure = dfa->eclosures + cur_node;
3156       outside_node = find_subexp_node (dfa, eclosure, ex_subexp, type);
3157       if (outside_node == -1)
3158         {
3159           /* There are no problematic nodes, just merge them.  */
3160           err = re_node_set_merge (&new_nodes, eclosure);
3161           if (BE (err != REG_NOERROR, 0))
3162             {
3163               re_node_set_free (&new_nodes);
3164               return err;
3165             }
3166         }
3167       else
3168         {
3169           /* There are problematic nodes, re-calculate incrementally.  */
3170           err = check_arrival_expand_ecl_sub (dfa, &new_nodes, cur_node,
3171                                               ex_subexp, type);
3172           if (BE (err != REG_NOERROR, 0))
3173             {
3174               re_node_set_free (&new_nodes);
3175               return err;
3176             }
3177         }
3178     }
3179   re_node_set_free (cur_nodes);
3180   *cur_nodes = new_nodes;
3181   return REG_NOERROR;
3182 }
3183
3184 /* Helper function for check_arrival_expand_ecl.
3185    Check incrementally the epsilon closure of TARGET, and if it isn't
3186    problematic append it to DST_NODES.  */
3187
3188 static reg_errcode_t
3189 internal_function
3190 check_arrival_expand_ecl_sub (const re_dfa_t *dfa, re_node_set *dst_nodes,
3191                               int target, int ex_subexp, int type)
3192 {
3193   int cur_node;
3194   for (cur_node = target; !re_node_set_contains (dst_nodes, cur_node);)
3195     {
3196       int err;
3197
3198       if (dfa->nodes[cur_node].type == type
3199           && dfa->nodes[cur_node].opr.idx == ex_subexp)
3200         {
3201           if (type == OP_CLOSE_SUBEXP)
3202             {
3203               err = re_node_set_insert (dst_nodes, cur_node);
3204               if (BE (err == -1, 0))
3205                 return REG_ESPACE;
3206             }
3207           break;
3208         }
3209       err = re_node_set_insert (dst_nodes, cur_node);
3210       if (BE (err == -1, 0))
3211         return REG_ESPACE;
3212       if (dfa->edests[cur_node].nelem == 0)
3213         break;
3214       if (dfa->edests[cur_node].nelem == 2)
3215         {
3216           err = check_arrival_expand_ecl_sub (dfa, dst_nodes,
3217                                               dfa->edests[cur_node].elems[1],
3218                                               ex_subexp, type);
3219           if (BE (err != REG_NOERROR, 0))
3220             return err;
3221         }
3222       cur_node = dfa->edests[cur_node].elems[0];
3223     }
3224   return REG_NOERROR;
3225 }
3226
3227
3228 /* For all the back references in the current state, calculate the
3229    destination of the back references by the appropriate entry
3230    in MCTX->BKREF_ENTS.  */
3231
3232 static reg_errcode_t
3233 internal_function
3234 expand_bkref_cache (re_match_context_t *mctx, re_node_set *cur_nodes,
3235                     int cur_str, int subexp_num, int type)
3236 {
3237   const re_dfa_t *const dfa = mctx->dfa;
3238   reg_errcode_t err;
3239   int cache_idx_start = search_cur_bkref_entry (mctx, cur_str);
3240   struct re_backref_cache_entry *ent;
3241
3242   if (cache_idx_start == -1)
3243     return REG_NOERROR;
3244
3245  restart:
3246   ent = mctx->bkref_ents + cache_idx_start;
3247   do
3248     {
3249       int to_idx, next_node;
3250
3251       /* Is this entry ENT is appropriate?  */
3252       if (!re_node_set_contains (cur_nodes, ent->node))
3253         continue; /* No.  */
3254
3255       to_idx = cur_str + ent->subexp_to - ent->subexp_from;
3256       /* Calculate the destination of the back reference, and append it
3257          to MCTX->STATE_LOG.  */
3258       if (to_idx == cur_str)
3259         {
3260           /* The backreference did epsilon transit, we must re-check all the
3261              node in the current state.  */
3262           re_node_set new_dests;
3263           reg_errcode_t err2, err3;
3264           next_node = dfa->edests[ent->node].elems[0];
3265           if (re_node_set_contains (cur_nodes, next_node))
3266             continue;
3267           err = re_node_set_init_1 (&new_dests, next_node);
3268           err2 = check_arrival_expand_ecl (dfa, &new_dests, subexp_num, type);
3269           err3 = re_node_set_merge (cur_nodes, &new_dests);
3270           re_node_set_free (&new_dests);
3271           if (BE (err != REG_NOERROR || err2 != REG_NOERROR
3272                   || err3 != REG_NOERROR, 0))
3273             {
3274               err = (err != REG_NOERROR ? err
3275                      : (err2 != REG_NOERROR ? err2 : err3));
3276               return err;
3277             }
3278           /* TODO: It is still inefficient...  */
3279           goto restart;
3280         }
3281       else
3282         {
3283           re_node_set union_set;
3284           next_node = dfa->nexts[ent->node];
3285           if (mctx->state_log[to_idx])
3286             {
3287               int ret;
3288               if (re_node_set_contains (&mctx->state_log[to_idx]->nodes,
3289                                         next_node))
3290                 continue;
3291               err = re_node_set_init_copy (&union_set,
3292                                            &mctx->state_log[to_idx]->nodes);
3293               ret = re_node_set_insert (&union_set, next_node);
3294               if (BE (err != REG_NOERROR || ret < 0, 0))
3295                 {
3296                   re_node_set_free (&union_set);
3297                   err = err != REG_NOERROR ? err : REG_ESPACE;
3298                   return err;
3299                 }
3300             }
3301           else
3302             {
3303               err = re_node_set_init_1 (&union_set, next_node);
3304               if (BE (err != REG_NOERROR, 0))
3305                 return err;
3306             }
3307           mctx->state_log[to_idx] = re_acquire_state (&err, dfa, &union_set);
3308           re_node_set_free (&union_set);
3309           if (BE (mctx->state_log[to_idx] == NULL
3310                   && err != REG_NOERROR, 0))
3311             return err;
3312         }
3313     }
3314   while (ent++->more);
3315   return REG_NOERROR;
3316 }
3317
3318 /* Build transition table for the state.
3319    Return 1 if succeeded, otherwise return NULL.  */
3320
3321 static int
3322 internal_function
3323 build_trtable (const re_dfa_t *dfa, re_dfastate_t *state)
3324 {
3325   reg_errcode_t err;
3326   int i, j, ch, need_word_trtable = 0;
3327   bitset_word_t elem, mask;
3328   bool dests_node_malloced = false;
3329   bool dest_states_malloced = false;
3330   int ndests; /* Number of the destination states from `state'.  */
3331   re_dfastate_t **trtable;
3332   re_dfastate_t **dest_states = NULL, **dest_states_word, **dest_states_nl;
3333   re_node_set follows, *dests_node;
3334   bitset_t *dests_ch;
3335   bitset_t acceptable;
3336
3337   struct dests_alloc
3338   {
3339     re_node_set dests_node[SBC_MAX];
3340     bitset_t dests_ch[SBC_MAX];
3341   } *dests_alloc;
3342
3343   /* We build DFA states which corresponds to the destination nodes
3344      from `state'.  `dests_node[i]' represents the nodes which i-th
3345      destination state contains, and `dests_ch[i]' represents the
3346      characters which i-th destination state accepts.  */
3347 #ifdef HAVE_ALLOCA
3348   if (__libc_use_alloca (sizeof (struct dests_alloc)))
3349     dests_alloc = (struct dests_alloc *) alloca (sizeof (struct dests_alloc));
3350   else
3351 #endif
3352     {
3353       dests_alloc = re_malloc (struct dests_alloc, 1);
3354       if (BE (dests_alloc == NULL, 0))
3355         return 0;
3356       dests_node_malloced = true;
3357     }
3358   dests_node = dests_alloc->dests_node;
3359   dests_ch = dests_alloc->dests_ch;
3360
3361   /* Initialize transiton table.  */
3362   state->word_trtable = state->trtable = NULL;
3363
3364   /* At first, group all nodes belonging to `state' into several
3365      destinations.  */
3366   ndests = group_nodes_into_DFAstates (dfa, state, dests_node, dests_ch);
3367   if (BE (ndests <= 0, 0))
3368     {
3369       if (dests_node_malloced)
3370         free (dests_alloc);
3371       /* Return 0 in case of an error, 1 otherwise.  */
3372       if (ndests == 0)
3373         {
3374           state->trtable = (re_dfastate_t **)
3375             calloc (sizeof (re_dfastate_t *), SBC_MAX);
3376           return 1;
3377         }
3378       return 0;
3379     }
3380
3381   err = re_node_set_alloc (&follows, ndests + 1);
3382   if (BE (err != REG_NOERROR, 0))
3383     goto out_free;
3384
3385   /* Avoid arithmetic overflow in size calculation.  */
3386   if (BE ((((SIZE_MAX - (sizeof (re_node_set) + sizeof (bitset_t)) * SBC_MAX)
3387             / (3 * sizeof (re_dfastate_t *)))
3388            < ndests),
3389           0))
3390     goto out_free;
3391
3392 #ifdef HAVE_ALLOCA
3393   if (__libc_use_alloca ((sizeof (re_node_set) + sizeof (bitset_t)) * SBC_MAX
3394                          + ndests * 3 * sizeof (re_dfastate_t *)))
3395     dest_states = (re_dfastate_t **)
3396       alloca (ndests * 3 * sizeof (re_dfastate_t *));
3397   else
3398 #endif
3399     {
3400       dest_states = (re_dfastate_t **)
3401         malloc (ndests * 3 * sizeof (re_dfastate_t *));
3402       if (BE (dest_states == NULL, 0))
3403         {
3404 out_free:
3405           if (dest_states_malloced)
3406             free (dest_states);
3407           re_node_set_free (&follows);
3408           for (i = 0; i < ndests; ++i)
3409             re_node_set_free (dests_node + i);
3410           if (dests_node_malloced)
3411             free (dests_alloc);
3412           return 0;
3413         }
3414       dest_states_malloced = true;
3415     }
3416   dest_states_word = dest_states + ndests;
3417   dest_states_nl = dest_states_word + ndests;
3418   bitset_empty (acceptable);
3419
3420   /* Then build the states for all destinations.  */
3421   for (i = 0; i < ndests; ++i)
3422     {
3423       int next_node;
3424       re_node_set_empty (&follows);
3425       /* Merge the follows of this destination states.  */
3426       for (j = 0; j < dests_node[i].nelem; ++j)
3427         {
3428           next_node = dfa->nexts[dests_node[i].elems[j]];
3429           if (next_node != -1)
3430             {
3431               err = re_node_set_merge (&follows, dfa->eclosures + next_node);
3432               if (BE (err != REG_NOERROR, 0))
3433                 goto out_free;
3434             }
3435         }
3436       dest_states[i] = re_acquire_state_context (&err, dfa, &follows, 0);
3437       if (BE (dest_states[i] == NULL && err != REG_NOERROR, 0))
3438         goto out_free;
3439       /* If the new state has context constraint,
3440          build appropriate states for these contexts.  */
3441       if (dest_states[i]->has_constraint)
3442         {
3443           dest_states_word[i] = re_acquire_state_context (&err, dfa, &follows,
3444                                                           CONTEXT_WORD);
3445           if (BE (dest_states_word[i] == NULL && err != REG_NOERROR, 0))
3446             goto out_free;
3447
3448           if (dest_states[i] != dest_states_word[i] && dfa->mb_cur_max > 1)
3449             need_word_trtable = 1;
3450
3451           dest_states_nl[i] = re_acquire_state_context (&err, dfa, &follows,
3452                                                         CONTEXT_NEWLINE);
3453           if (BE (dest_states_nl[i] == NULL && err != REG_NOERROR, 0))
3454             goto out_free;
3455         }
3456       else
3457         {
3458           dest_states_word[i] = dest_states[i];
3459           dest_states_nl[i] = dest_states[i];
3460         }
3461       bitset_merge (acceptable, dests_ch[i]);
3462     }
3463
3464   if (!BE (need_word_trtable, 0))
3465     {
3466       /* We don't care about whether the following character is a word
3467          character, or we are in a single-byte character set so we can
3468          discern by looking at the character code: allocate a
3469          256-entry transition table.  */
3470       trtable = state->trtable =
3471         (re_dfastate_t **) calloc (sizeof (re_dfastate_t *), SBC_MAX);
3472       if (BE (trtable == NULL, 0))
3473         goto out_free;
3474
3475       /* For all characters ch...:  */
3476       for (i = 0; i < BITSET_WORDS; ++i)
3477         for (ch = i * BITSET_WORD_BITS, elem = acceptable[i], mask = 1;
3478              elem;
3479              mask <<= 1, elem >>= 1, ++ch)
3480           if (BE (elem & 1, 0))
3481             {
3482               /* There must be exactly one destination which accepts
3483                  character ch.  See group_nodes_into_DFAstates.  */
3484               for (j = 0; (dests_ch[j][i] & mask) == 0; ++j)
3485                 ;
3486
3487               /* j-th destination accepts the word character ch.  */
3488               if (dfa->word_char[i] & mask)
3489                 trtable[ch] = dest_states_word[j];
3490               else
3491                 trtable[ch] = dest_states[j];
3492             }
3493     }
3494   else
3495     {
3496       /* We care about whether the following character is a word
3497          character, and we are in a multi-byte character set: discern
3498          by looking at the character code: build two 256-entry
3499          transition tables, one starting at trtable[0] and one
3500          starting at trtable[SBC_MAX].  */
3501       trtable = state->word_trtable =
3502         (re_dfastate_t **) calloc (sizeof (re_dfastate_t *), 2 * SBC_MAX);
3503       if (BE (trtable == NULL, 0))
3504         goto out_free;
3505
3506       /* For all characters ch...:  */
3507       for (i = 0; i < BITSET_WORDS; ++i)
3508         for (ch = i * BITSET_WORD_BITS, elem = acceptable[i], mask = 1;
3509              elem;
3510              mask <<= 1, elem >>= 1, ++ch)
3511           if (BE (elem & 1, 0))
3512             {
3513               /* There must be exactly one destination which accepts
3514                  character ch.  See group_nodes_into_DFAstates.  */
3515               for (j = 0; (dests_ch[j][i] & mask) == 0; ++j)
3516                 ;
3517
3518               /* j-th destination accepts the word character ch.  */
3519               trtable[ch] = dest_states[j];
3520               trtable[ch + SBC_MAX] = dest_states_word[j];
3521             }
3522     }
3523
3524   /* new line */
3525   if (bitset_contain (acceptable, NEWLINE_CHAR))
3526     {
3527       /* The current state accepts newline character.  */
3528       for (j = 0; j < ndests; ++j)
3529         if (bitset_contain (dests_ch[j], NEWLINE_CHAR))
3530           {
3531             /* k-th destination accepts newline character.  */
3532             trtable[NEWLINE_CHAR] = dest_states_nl[j];
3533             if (need_word_trtable)
3534               trtable[NEWLINE_CHAR + SBC_MAX] = dest_states_nl[j];
3535             /* There must be only one destination which accepts
3536                newline.  See group_nodes_into_DFAstates.  */
3537             break;
3538           }
3539     }
3540
3541   if (dest_states_malloced)
3542     free (dest_states);
3543
3544   re_node_set_free (&follows);
3545   for (i = 0; i < ndests; ++i)
3546     re_node_set_free (dests_node + i);
3547
3548   if (dests_node_malloced)
3549     free (dests_alloc);
3550
3551   return 1;
3552 }
3553
3554 /* Group all nodes belonging to STATE into several destinations.
3555    Then for all destinations, set the nodes belonging to the destination
3556    to DESTS_NODE[i] and set the characters accepted by the destination
3557    to DEST_CH[i].  This function return the number of destinations.  */
3558
3559 static int
3560 internal_function
3561 group_nodes_into_DFAstates (const re_dfa_t *dfa, const re_dfastate_t *state,
3562                             re_node_set *dests_node, bitset_t *dests_ch)
3563 {
3564   reg_errcode_t err;
3565   int result;
3566   int i, j, k;
3567   int ndests; /* Number of the destinations from `state'.  */
3568   bitset_t accepts; /* Characters a node can accept.  */
3569   const re_node_set *cur_nodes = &state->nodes;
3570   bitset_empty (accepts);
3571   ndests = 0;
3572
3573   /* For all the nodes belonging to `state',  */
3574   for (i = 0; i < cur_nodes->nelem; ++i)
3575     {
3576       re_token_t *node = &dfa->nodes[cur_nodes->elems[i]];
3577       re_token_type_t type = node->type;
3578       unsigned int constraint = node->constraint;
3579
3580       /* Enumerate all single byte character this node can accept.  */
3581       if (type == CHARACTER)
3582         bitset_set (accepts, node->opr.c);
3583       else if (type == SIMPLE_BRACKET)
3584         {
3585           bitset_merge (accepts, node->opr.sbcset);
3586         }
3587       else if (type == OP_PERIOD)
3588         {
3589 #ifdef RE_ENABLE_I18N
3590           if (dfa->mb_cur_max > 1)
3591             bitset_merge (accepts, dfa->sb_char);
3592           else
3593 #endif
3594             bitset_set_all (accepts);
3595           if (!(dfa->syntax & RE_DOT_NEWLINE))
3596             bitset_clear (accepts, '\n');
3597           if (dfa->syntax & RE_DOT_NOT_NULL)
3598             bitset_clear (accepts, '\0');
3599         }
3600 #ifdef RE_ENABLE_I18N
3601       else if (type == OP_UTF8_PERIOD)
3602         {
3603           memset (accepts, '\xff', sizeof (bitset_t) / 2);
3604           if (!(dfa->syntax & RE_DOT_NEWLINE))
3605             bitset_clear (accepts, '\n');
3606           if (dfa->syntax & RE_DOT_NOT_NULL)
3607             bitset_clear (accepts, '\0');
3608         }
3609 #endif
3610       else
3611         continue;
3612
3613       /* Check the `accepts' and sift the characters which are not
3614          match it the context.  */
3615       if (constraint)
3616         {
3617           if (constraint & NEXT_NEWLINE_CONSTRAINT)
3618             {
3619               bool accepts_newline = bitset_contain (accepts, NEWLINE_CHAR);
3620               bitset_empty (accepts);
3621               if (accepts_newline)
3622                 bitset_set (accepts, NEWLINE_CHAR);
3623               else
3624                 continue;
3625             }
3626           if (constraint & NEXT_ENDBUF_CONSTRAINT)
3627             {
3628               bitset_empty (accepts);
3629               continue;
3630             }
3631
3632           if (constraint & NEXT_WORD_CONSTRAINT)
3633             {
3634               bitset_word_t any_set = 0;
3635               if (type == CHARACTER && !node->word_char)
3636                 {
3637                   bitset_empty (accepts);
3638                   continue;
3639                 }
3640 #ifdef RE_ENABLE_I18N
3641               if (dfa->mb_cur_max > 1)
3642                 for (j = 0; j < BITSET_WORDS; ++j)
3643                   any_set |= (accepts[j] &= (dfa->word_char[j] | ~dfa->sb_char[j]));
3644               else
3645 #endif
3646                 for (j = 0; j < BITSET_WORDS; ++j)
3647                   any_set |= (accepts[j] &= dfa->word_char[j]);
3648               if (!any_set)
3649                 continue;
3650             }
3651           if (constraint & NEXT_NOTWORD_CONSTRAINT)
3652             {
3653               bitset_word_t any_set = 0;
3654               if (type == CHARACTER && node->word_char)
3655                 {
3656                   bitset_empty (accepts);
3657                   continue;
3658                 }
3659 #ifdef RE_ENABLE_I18N
3660               if (dfa->mb_cur_max > 1)
3661                 for (j = 0; j < BITSET_WORDS; ++j)
3662                   any_set |= (accepts[j] &= ~(dfa->word_char[j] & dfa->sb_char[j]));
3663               else
3664 #endif
3665                 for (j = 0; j < BITSET_WORDS; ++j)
3666                   any_set |= (accepts[j] &= ~dfa->word_char[j]);
3667               if (!any_set)
3668                 continue;
3669             }
3670         }
3671
3672       /* Then divide `accepts' into DFA states, or create a new
3673          state.  Above, we make sure that accepts is not empty.  */
3674       for (j = 0; j < ndests; ++j)
3675         {
3676           bitset_t intersec; /* Intersection sets, see below.  */
3677           bitset_t remains;
3678           /* Flags, see below.  */
3679           bitset_word_t has_intersec, not_subset, not_consumed;
3680
3681           /* Optimization, skip if this state doesn't accept the character.  */
3682           if (type == CHARACTER && !bitset_contain (dests_ch[j], node->opr.c))
3683             continue;
3684
3685           /* Enumerate the intersection set of this state and `accepts'.  */
3686           has_intersec = 0;
3687           for (k = 0; k < BITSET_WORDS; ++k)
3688             has_intersec |= intersec[k] = accepts[k] & dests_ch[j][k];
3689           /* And skip if the intersection set is empty.  */
3690           if (!has_intersec)
3691             continue;
3692
3693           /* Then check if this state is a subset of `accepts'.  */
3694           not_subset = not_consumed = 0;
3695           for (k = 0; k < BITSET_WORDS; ++k)
3696             {
3697               not_subset |= remains[k] = ~accepts[k] & dests_ch[j][k];
3698               not_consumed |= accepts[k] = accepts[k] & ~dests_ch[j][k];
3699             }
3700
3701           /* If this state isn't a subset of `accepts', create a
3702              new group state, which has the `remains'. */
3703           if (not_subset)
3704             {
3705               bitset_copy (dests_ch[ndests], remains);
3706               bitset_copy (dests_ch[j], intersec);
3707               err = re_node_set_init_copy (dests_node + ndests, &dests_node[j]);
3708               if (BE (err != REG_NOERROR, 0))
3709                 goto error_return;
3710               ++ndests;
3711             }
3712
3713           /* Put the position in the current group. */
3714           result = re_node_set_insert (&dests_node[j], cur_nodes->elems[i]);
3715           if (BE (result < 0, 0))
3716             goto error_return;
3717
3718           /* If all characters are consumed, go to next node. */
3719           if (!not_consumed)
3720             break;
3721         }
3722       /* Some characters remain, create a new group. */
3723       if (j == ndests)
3724         {
3725           bitset_copy (dests_ch[ndests], accepts);
3726           err = re_node_set_init_1 (dests_node + ndests, cur_nodes->elems[i]);
3727           if (BE (err != REG_NOERROR, 0))
3728             goto error_return;
3729           ++ndests;
3730           bitset_empty (accepts);
3731         }
3732     }
3733   return ndests;
3734  error_return:
3735   for (j = 0; j < ndests; ++j)
3736     re_node_set_free (dests_node + j);
3737   return -1;
3738 }
3739
3740 #ifdef RE_ENABLE_I18N
3741 /* Check how many bytes the node `dfa->nodes[node_idx]' accepts.
3742    Return the number of the bytes the node accepts.
3743    STR_IDX is the current index of the input string.
3744
3745    This function handles the nodes which can accept one character, or
3746    one collating element like '.', '[a-z]', opposite to the other nodes
3747    can only accept one byte.  */
3748
3749 static int
3750 internal_function
3751 check_node_accept_bytes (const re_dfa_t *dfa, int node_idx,
3752                          const re_string_t *input, int str_idx)
3753 {
3754   const re_token_t *node = dfa->nodes + node_idx;
3755   int char_len, elem_len;
3756   int i;
3757   wint_t wc;
3758
3759   if (BE (node->type == OP_UTF8_PERIOD, 0))
3760     {
3761       unsigned char c = re_string_byte_at (input, str_idx), d;
3762       if (BE (c < 0xc2, 1))
3763         return 0;
3764
3765       if (str_idx + 2 > input->len)
3766         return 0;
3767
3768       d = re_string_byte_at (input, str_idx + 1);
3769       if (c < 0xe0)
3770         return (d < 0x80 || d > 0xbf) ? 0 : 2;
3771       else if (c < 0xf0)
3772         {
3773           char_len = 3;
3774           if (c == 0xe0 && d < 0xa0)
3775             return 0;
3776         }
3777       else if (c < 0xf8)
3778         {
3779           char_len = 4;
3780           if (c == 0xf0 && d < 0x90)
3781             return 0;
3782         }
3783       else if (c < 0xfc)
3784         {
3785           char_len = 5;
3786           if (c == 0xf8 && d < 0x88)
3787             return 0;
3788         }
3789       else if (c < 0xfe)
3790         {
3791           char_len = 6;
3792           if (c == 0xfc && d < 0x84)
3793             return 0;
3794         }
3795       else
3796         return 0;
3797
3798       if (str_idx + char_len > input->len)
3799         return 0;
3800
3801       for (i = 1; i < char_len; ++i)
3802         {
3803           d = re_string_byte_at (input, str_idx + i);
3804           if (d < 0x80 || d > 0xbf)
3805             return 0;
3806         }
3807       return char_len;
3808     }
3809
3810   char_len = re_string_char_size_at (input, str_idx);
3811   if (node->type == OP_PERIOD)
3812     {
3813       if (char_len <= 1)
3814         return 0;
3815       /* FIXME: I don't think this if is needed, as both '\n'
3816          and '\0' are char_len == 1.  */
3817       /* '.' accepts any one character except the following two cases.  */
3818       if ((!(dfa->syntax & RE_DOT_NEWLINE) &&
3819            re_string_byte_at (input, str_idx) == '\n') ||
3820           ((dfa->syntax & RE_DOT_NOT_NULL) &&
3821            re_string_byte_at (input, str_idx) == '\0'))
3822         return 0;
3823       return char_len;
3824     }
3825
3826   elem_len = re_string_elem_size_at (input, str_idx);
3827   wc = __btowc(*(input->mbs+str_idx));
3828   if (((elem_len <= 1 && char_len <= 1) || char_len == 0) && (wc != WEOF && wc < SBC_MAX))
3829     return 0;
3830
3831   if (node->type == COMPLEX_BRACKET)
3832     {
3833       const re_charset_t *cset = node->opr.mbcset;
3834 # ifdef _LIBC
3835       const unsigned char *pin
3836         = ((const unsigned char *) re_string_get_buffer (input) + str_idx);
3837       int j;
3838       uint32_t nrules;
3839 # endif /* _LIBC */
3840       int match_len = 0;
3841       wchar_t wc = ((cset->nranges || cset->nchar_classes || cset->nmbchars)
3842                     ? re_string_wchar_at (input, str_idx) : 0);
3843
3844       /* match with multibyte character?  */
3845       for (i = 0; i < cset->nmbchars; ++i)
3846         if (wc == cset->mbchars[i])
3847           {
3848             match_len = char_len;
3849             goto check_node_accept_bytes_match;
3850           }
3851       /* match with character_class?  */
3852       for (i = 0; i < cset->nchar_classes; ++i)
3853         {
3854           wctype_t wt = cset->char_classes[i];
3855           if (__iswctype (wc, wt))
3856             {
3857               match_len = char_len;
3858               goto check_node_accept_bytes_match;
3859             }
3860         }
3861
3862 # ifdef _LIBC
3863       nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3864       if (nrules != 0)
3865         {
3866           unsigned int in_collseq = 0;
3867           const int32_t *table, *indirect;
3868           const unsigned char *weights, *extra;
3869           const char *collseqwc;
3870           /* This #include defines a local function!  */
3871 #  include <locale/weight.h>
3872
3873           /* match with collating_symbol?  */
3874           if (cset->ncoll_syms)
3875             extra = (const unsigned char *)
3876               _NL_CURRENT (LC_COLLATE, _NL_COLLATE_SYMB_EXTRAMB);
3877           for (i = 0; i < cset->ncoll_syms; ++i)
3878             {
3879               const unsigned char *coll_sym = extra + cset->coll_syms[i];
3880               /* Compare the length of input collating element and
3881                  the length of current collating element.  */
3882               if (*coll_sym != elem_len)
3883                 continue;
3884               /* Compare each bytes.  */
3885               for (j = 0; j < *coll_sym; j++)
3886                 if (pin[j] != coll_sym[1 + j])
3887                   break;
3888               if (j == *coll_sym)
3889                 {
3890                   /* Match if every bytes is equal.  */
3891                   match_len = j;
3892                   goto check_node_accept_bytes_match;
3893                 }
3894             }
3895
3896           if (cset->nranges)
3897             {
3898               if (elem_len <= char_len)
3899                 {
3900                   collseqwc = _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQWC);
3901                   in_collseq = __collseq_table_lookup (collseqwc, wc);
3902                 }
3903               else
3904                 in_collseq = find_collation_sequence_value (pin, elem_len);
3905             }
3906           /* match with range expression?  */
3907           for (i = 0; i < cset->nranges; ++i)
3908             if (cset->range_starts[i] <= in_collseq
3909                 && in_collseq <= cset->range_ends[i])
3910               {
3911                 match_len = elem_len;
3912                 goto check_node_accept_bytes_match;
3913               }
3914
3915           /* match with equivalence_class?  */
3916           if (cset->nequiv_classes)
3917             {
3918               const unsigned char *cp = pin;
3919               table = (const int32_t *)
3920                 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
3921               weights = (const unsigned char *)
3922                 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_WEIGHTMB);
3923               extra = (const unsigned char *)
3924                 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_EXTRAMB);
3925               indirect = (const int32_t *)
3926                 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_INDIRECTMB);
3927               int32_t idx = findidx (&cp);
3928               if (idx > 0)
3929                 for (i = 0; i < cset->nequiv_classes; ++i)
3930                   {
3931                     int32_t equiv_class_idx = cset->equiv_classes[i];
3932                     size_t weight_len = weights[idx & 0xffffff];
3933                     if (weight_len == weights[equiv_class_idx & 0xffffff]
3934                         && (idx >> 24) == (equiv_class_idx >> 24))
3935                       {
3936                         int cnt = 0;
3937
3938                         idx &= 0xffffff;
3939                         equiv_class_idx &= 0xffffff;
3940
3941                         while (cnt <= weight_len
3942                                && (weights[equiv_class_idx + 1 + cnt]
3943                                    == weights[idx + 1 + cnt]))
3944                           ++cnt;
3945                         if (cnt > weight_len)
3946                           {
3947                             match_len = elem_len;
3948                             goto check_node_accept_bytes_match;
3949                           }
3950                       }
3951                   }
3952             }
3953         }
3954       else
3955 # endif /* _LIBC */
3956         {
3957           /* match with range expression?  */
3958 #if __GNUC__ >= 2
3959           wchar_t cmp_buf[] = {L'\0', L'\0', wc, L'\0', L'\0', L'\0'};
3960 #else
3961           wchar_t cmp_buf[] = {L'\0', L'\0', L'\0', L'\0', L'\0', L'\0'};
3962           cmp_buf[2] = wc;
3963 #endif
3964           for (i = 0; i < cset->nranges; ++i)
3965             {
3966               cmp_buf[0] = cset->range_starts[i];
3967               cmp_buf[4] = cset->range_ends[i];
3968               if (wcscoll (cmp_buf, cmp_buf + 2) <= 0
3969                   && wcscoll (cmp_buf + 2, cmp_buf + 4) <= 0)
3970                 {
3971                   match_len = char_len;
3972                   goto check_node_accept_bytes_match;
3973                 }
3974             }
3975         }
3976     check_node_accept_bytes_match:
3977       if (!cset->non_match)
3978         return match_len;
3979       else
3980         {
3981           if (match_len > 0)
3982             return 0;
3983           else
3984             return (elem_len > char_len) ? elem_len : char_len;
3985         }
3986     }
3987   return 0;
3988 }
3989
3990 # ifdef _LIBC
3991 static unsigned int
3992 internal_function
3993 find_collation_sequence_value (const unsigned char *mbs, size_t mbs_len)
3994 {
3995   uint32_t nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3996   if (nrules == 0)
3997     {
3998       if (mbs_len == 1)
3999         {
4000           /* No valid character.  Match it as a single byte character.  */
4001           const unsigned char *collseq = (const unsigned char *)
4002             _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQMB);
4003           return collseq[mbs[0]];
4004         }
4005       return UINT_MAX;
4006     }
4007   else
4008     {
4009       int32_t idx;
4010       const unsigned char *extra = (const unsigned char *)
4011         _NL_CURRENT (LC_COLLATE, _NL_COLLATE_SYMB_EXTRAMB);
4012       int32_t extrasize = (const unsigned char *)
4013         _NL_CURRENT (LC_COLLATE, _NL_COLLATE_SYMB_EXTRAMB + 1) - extra;
4014
4015       for (idx = 0; idx < extrasize;)
4016         {
4017           int mbs_cnt, found = 0;
4018           int32_t elem_mbs_len;
4019           /* Skip the name of collating element name.  */
4020           idx = idx + extra[idx] + 1;
4021           elem_mbs_len = extra[idx++];
4022           if (mbs_len == elem_mbs_len)
4023             {
4024               for (mbs_cnt = 0; mbs_cnt < elem_mbs_len; ++mbs_cnt)
4025                 if (extra[idx + mbs_cnt] != mbs[mbs_cnt])
4026                   break;
4027               if (mbs_cnt == elem_mbs_len)
4028                 /* Found the entry.  */
4029                 found = 1;
4030             }
4031           /* Skip the byte sequence of the collating element.  */
4032           idx += elem_mbs_len;
4033           /* Adjust for the alignment.  */
4034           idx = (idx + 3) & ~3;
4035           /* Skip the collation sequence value.  */
4036           idx += sizeof (uint32_t);
4037           /* Skip the wide char sequence of the collating element.  */
4038           idx = idx + sizeof (uint32_t) * (extra[idx] + 1);
4039           /* If we found the entry, return the sequence value.  */
4040           if (found)
4041             return *(uint32_t *) (extra + idx);
4042           /* Skip the collation sequence value.  */
4043           idx += sizeof (uint32_t);
4044         }
4045       return UINT_MAX;
4046     }
4047 }
4048 # endif /* _LIBC */
4049 #endif /* RE_ENABLE_I18N */
4050
4051 /* Check whether the node accepts the byte which is IDX-th
4052    byte of the INPUT.  */
4053
4054 static int
4055 internal_function
4056 check_node_accept (const re_match_context_t *mctx, const re_token_t *node,
4057                    int idx)
4058 {
4059   unsigned char ch;
4060   ch = re_string_byte_at (&mctx->input, idx);
4061   switch (node->type)
4062     {
4063     case CHARACTER:
4064       if (node->opr.c != ch)
4065         return 0;
4066       break;
4067
4068     case SIMPLE_BRACKET:
4069       if (!bitset_contain (node->opr.sbcset, ch))
4070         return 0;
4071       break;
4072
4073 #ifdef RE_ENABLE_I18N
4074     case OP_UTF8_PERIOD:
4075       if (ch >= 0x80)
4076         return 0;
4077       /* FALLTHROUGH */
4078 #endif
4079     case OP_PERIOD:
4080       if ((ch == '\n' && !(mctx->dfa->syntax & RE_DOT_NEWLINE))
4081           || (ch == '\0' && (mctx->dfa->syntax & RE_DOT_NOT_NULL)))
4082         return 0;
4083       break;
4084
4085     default:
4086       return 0;
4087     }
4088
4089   if (node->constraint)
4090     {
4091       /* The node has constraints.  Check whether the current context
4092          satisfies the constraints.  */
4093       unsigned int context = re_string_context_at (&mctx->input, idx,
4094                                                    mctx->eflags);
4095       if (NOT_SATISFY_NEXT_CONSTRAINT (node->constraint, context))
4096         return 0;
4097     }
4098
4099   return 1;
4100 }
4101
4102 /* Extend the buffers, if the buffers have run out.  */
4103
4104 static reg_errcode_t
4105 internal_function
4106 extend_buffers (re_match_context_t *mctx)
4107 {
4108   reg_errcode_t ret;
4109   re_string_t *pstr = &mctx->input;
4110
4111   /* Avoid overflow.  */
4112   if (BE (INT_MAX / 2 / sizeof (re_dfastate_t *) <= pstr->bufs_len, 0))
4113     return REG_ESPACE;
4114
4115   /* Double the lengthes of the buffers.  */
4116   ret = re_string_realloc_buffers (pstr, pstr->bufs_len * 2);
4117   if (BE (ret != REG_NOERROR, 0))
4118     return ret;
4119
4120   if (mctx->state_log != NULL)
4121     {
4122       /* And double the length of state_log.  */
4123       /* XXX We have no indication of the size of this buffer.  If this
4124          allocation fail we have no indication that the state_log array
4125          does not have the right size.  */
4126       re_dfastate_t **new_array = re_realloc (mctx->state_log, re_dfastate_t *,
4127                                               pstr->bufs_len + 1);
4128       if (BE (new_array == NULL, 0))
4129         return REG_ESPACE;
4130       mctx->state_log = new_array;
4131     }
4132
4133   /* Then reconstruct the buffers.  */
4134   if (pstr->icase)
4135     {
4136 #ifdef RE_ENABLE_I18N
4137       if (pstr->mb_cur_max > 1)
4138         {
4139           ret = build_wcs_upper_buffer (pstr);
4140           if (BE (ret != REG_NOERROR, 0))
4141             return ret;
4142         }
4143       else
4144 #endif /* RE_ENABLE_I18N  */
4145         build_upper_buffer (pstr);
4146     }
4147   else
4148     {
4149 #ifdef RE_ENABLE_I18N
4150       if (pstr->mb_cur_max > 1)
4151         build_wcs_buffer (pstr);
4152       else
4153 #endif /* RE_ENABLE_I18N  */
4154         {
4155           if (pstr->trans != NULL)
4156             re_string_translate_buffer (pstr);
4157         }
4158     }
4159   return REG_NOERROR;
4160 }
4161
4162 \f
4163 /* Functions for matching context.  */
4164
4165 /* Initialize MCTX.  */
4166
4167 static reg_errcode_t
4168 internal_function
4169 match_ctx_init (re_match_context_t *mctx, int eflags, int n)
4170 {
4171   mctx->eflags = eflags;
4172   mctx->match_last = -1;
4173   if (n > 0)
4174     {
4175       mctx->bkref_ents = re_malloc (struct re_backref_cache_entry, n);
4176       mctx->sub_tops = re_malloc (re_sub_match_top_t *, n);
4177       if (BE (mctx->bkref_ents == NULL || mctx->sub_tops == NULL, 0))
4178         return REG_ESPACE;
4179     }
4180   /* Already zero-ed by the caller.
4181      else
4182        mctx->bkref_ents = NULL;
4183      mctx->nbkref_ents = 0;
4184      mctx->nsub_tops = 0;  */
4185   mctx->abkref_ents = n;
4186   mctx->max_mb_elem_len = 1;
4187   mctx->asub_tops = n;
4188   return REG_NOERROR;
4189 }
4190
4191 /* Clean the entries which depend on the current input in MCTX.
4192    This function must be invoked when the matcher changes the start index
4193    of the input, or changes the input string.  */
4194
4195 static void
4196 internal_function
4197 match_ctx_clean (re_match_context_t *mctx)
4198 {
4199   int st_idx;
4200   for (st_idx = 0; st_idx < mctx->nsub_tops; ++st_idx)
4201     {
4202       int sl_idx;
4203       re_sub_match_top_t *top = mctx->sub_tops[st_idx];
4204       for (sl_idx = 0; sl_idx < top->nlasts; ++sl_idx)
4205         {
4206           re_sub_match_last_t *last = top->lasts[sl_idx];
4207           re_free (last->path.array);
4208           re_free (last);
4209         }
4210       re_free (top->lasts);
4211       if (top->path)
4212         {
4213           re_free (top->path->array);
4214           re_free (top->path);
4215         }
4216       free (top);
4217     }
4218
4219   mctx->nsub_tops = 0;
4220   mctx->nbkref_ents = 0;
4221 }
4222
4223 /* Free all the memory associated with MCTX.  */
4224
4225 static void
4226 internal_function
4227 match_ctx_free (re_match_context_t *mctx)
4228 {
4229   /* First, free all the memory associated with MCTX->SUB_TOPS.  */
4230   match_ctx_clean (mctx);
4231   re_free (mctx->sub_tops);
4232   re_free (mctx->bkref_ents);
4233 }
4234
4235 /* Add a new backreference entry to MCTX.
4236    Note that we assume that caller never call this function with duplicate
4237    entry, and call with STR_IDX which isn't smaller than any existing entry.
4238 */
4239
4240 static reg_errcode_t
4241 internal_function
4242 match_ctx_add_entry (re_match_context_t *mctx, int node, int str_idx, int from,
4243                      int to)
4244 {
4245   if (mctx->nbkref_ents >= mctx->abkref_ents)
4246     {
4247       struct re_backref_cache_entry* new_entry;
4248       new_entry = re_realloc (mctx->bkref_ents, struct re_backref_cache_entry,
4249                               mctx->abkref_ents * 2);
4250       if (BE (new_entry == NULL, 0))
4251         {
4252           re_free (mctx->bkref_ents);
4253           return REG_ESPACE;
4254         }
4255       mctx->bkref_ents = new_entry;
4256       memset (mctx->bkref_ents + mctx->nbkref_ents, '\0',
4257               sizeof (struct re_backref_cache_entry) * mctx->abkref_ents);
4258       mctx->abkref_ents *= 2;
4259     }
4260   if (mctx->nbkref_ents > 0
4261       && mctx->bkref_ents[mctx->nbkref_ents - 1].str_idx == str_idx)
4262     mctx->bkref_ents[mctx->nbkref_ents - 1].more = 1;
4263
4264   mctx->bkref_ents[mctx->nbkref_ents].node = node;
4265   mctx->bkref_ents[mctx->nbkref_ents].str_idx = str_idx;
4266   mctx->bkref_ents[mctx->nbkref_ents].subexp_from = from;
4267   mctx->bkref_ents[mctx->nbkref_ents].subexp_to = to;
4268
4269   /* This is a cache that saves negative results of check_dst_limits_calc_pos.
4270      If bit N is clear, means that this entry won't epsilon-transition to
4271      an OP_OPEN_SUBEXP or OP_CLOSE_SUBEXP for the N+1-th subexpression.  If
4272      it is set, check_dst_limits_calc_pos_1 will recurse and try to find one
4273      such node.
4274
4275      A backreference does not epsilon-transition unless it is empty, so set
4276      to all zeros if FROM != TO.  */
4277   mctx->bkref_ents[mctx->nbkref_ents].eps_reachable_subexps_map
4278     = (from == to ? ~0 : 0);
4279
4280   mctx->bkref_ents[mctx->nbkref_ents++].more = 0;
4281   if (mctx->max_mb_elem_len < to - from)
4282     mctx->max_mb_elem_len = to - from;
4283   return REG_NOERROR;
4284 }
4285
4286 /* Search for the first entry which has the same str_idx, or -1 if none is
4287    found.  Note that MCTX->BKREF_ENTS is already sorted by MCTX->STR_IDX.  */
4288
4289 static int
4290 internal_function
4291 search_cur_bkref_entry (const re_match_context_t *mctx, int str_idx)
4292 {
4293   int left, right, mid, last;
4294   last = right = mctx->nbkref_ents;
4295   for (left = 0; left < right;)
4296     {
4297       mid = (left + right) / 2;
4298       if (mctx->bkref_ents[mid].str_idx < str_idx)
4299         left = mid + 1;
4300       else
4301         right = mid;
4302     }
4303   if (left < last && mctx->bkref_ents[left].str_idx == str_idx)
4304     return left;
4305   else
4306     return -1;
4307 }
4308
4309 /* Register the node NODE, whose type is OP_OPEN_SUBEXP, and which matches
4310    at STR_IDX.  */
4311
4312 static reg_errcode_t
4313 internal_function
4314 match_ctx_add_subtop (re_match_context_t *mctx, int node, int str_idx)
4315 {
4316 #ifdef DEBUG
4317   assert (mctx->sub_tops != NULL);
4318   assert (mctx->asub_tops > 0);
4319 #endif
4320   if (BE (mctx->nsub_tops == mctx->asub_tops, 0))
4321     {
4322       int new_asub_tops = mctx->asub_tops * 2;
4323       re_sub_match_top_t **new_array = re_realloc (mctx->sub_tops,
4324                                                    re_sub_match_top_t *,
4325                                                    new_asub_tops);
4326       if (BE (new_array == NULL, 0))
4327         return REG_ESPACE;
4328       mctx->sub_tops = new_array;
4329       mctx->asub_tops = new_asub_tops;
4330     }
4331   mctx->sub_tops[mctx->nsub_tops] = calloc (1, sizeof (re_sub_match_top_t));
4332   if (BE (mctx->sub_tops[mctx->nsub_tops] == NULL, 0))
4333     return REG_ESPACE;
4334   mctx->sub_tops[mctx->nsub_tops]->node = node;
4335   mctx->sub_tops[mctx->nsub_tops++]->str_idx = str_idx;
4336   return REG_NOERROR;
4337 }
4338
4339 /* Register the node NODE, whose type is OP_CLOSE_SUBEXP, and which matches
4340    at STR_IDX, whose corresponding OP_OPEN_SUBEXP is SUB_TOP.  */
4341
4342 static re_sub_match_last_t *
4343 internal_function
4344 match_ctx_add_sublast (re_sub_match_top_t *subtop, int node, int str_idx)
4345 {
4346   re_sub_match_last_t *new_entry;
4347   if (BE (subtop->nlasts == subtop->alasts, 0))
4348     {
4349       int new_alasts = 2 * subtop->alasts + 1;
4350       re_sub_match_last_t **new_array = re_realloc (subtop->lasts,
4351                                                     re_sub_match_last_t *,
4352                                                     new_alasts);
4353       if (BE (new_array == NULL, 0))
4354         return NULL;
4355       subtop->lasts = new_array;
4356       subtop->alasts = new_alasts;
4357     }
4358   new_entry = calloc (1, sizeof (re_sub_match_last_t));
4359   if (BE (new_entry != NULL, 1))
4360     {
4361       subtop->lasts[subtop->nlasts] = new_entry;
4362       new_entry->node = node;
4363       new_entry->str_idx = str_idx;
4364       ++subtop->nlasts;
4365     }
4366   return new_entry;
4367 }
4368
4369 static void
4370 internal_function
4371 sift_ctx_init (re_sift_context_t *sctx, re_dfastate_t **sifted_sts,
4372                re_dfastate_t **limited_sts, int last_node, int last_str_idx)
4373 {
4374   sctx->sifted_states = sifted_sts;
4375   sctx->limited_states = limited_sts;
4376   sctx->last_node = last_node;
4377   sctx->last_str_idx = last_str_idx;
4378   re_node_set_init_empty (&sctx->limits);
4379 }